Deduplicate objects efficiently when dumping iseq to binary

We were inefficient in cases where there are a lot of duplicates due to
the use of linear search. Use a hash table instead.

These cases are not that rare in the wild.

[Feature #16505]
This commit is contained in:
NagayamaRyoga 2019-12-18 19:26:02 +09:00 committed by Alan Wu
parent 2079f436c7
commit 6e5e6a40c4
Notes: git 2020-02-10 01:33:59 +09:00

View File

@ -9520,7 +9520,8 @@ struct ibf_header {
struct ibf_dump_buffer {
VALUE str;
VALUE obj_list; /* [objs] */
VALUE obj_list; /* [objs] */
st_table *obj_table; /* obj -> obj number */
};
struct ibf_dump {
@ -9666,25 +9667,26 @@ static VALUE ibf_load_object(const struct ibf_load *load, VALUE object_index);
static rb_iseq_t *ibf_load_iseq(const struct ibf_load *load, const rb_iseq_t *index_iseq);
static VALUE
ibf_dump_object_list_new(void)
ibf_dump_object_list_new(st_table **obj_table)
{
VALUE obj_list = rb_ary_tmp_new(1);
rb_ary_push(obj_list, Qnil); /* 0th is nil */
*obj_table = st_init_numtable(); /* need free */
rb_st_insert(*obj_table, (st_data_t)Qnil, (st_data_t)0); /* 0th is nil */
return obj_list;
}
static VALUE
ibf_dump_object(struct ibf_dump *dump, VALUE obj)
{
VALUE obj_list = dump->current_buffer->obj_list;
long index = RARRAY_LEN(obj_list);
long i;
for (i=0; i<index; i++) {
if (RARRAY_AREF(obj_list, i) == obj) return (VALUE)i; /* dedup */
int obj_index = ibf_table_lookup(dump->current_buffer->obj_table, (st_data_t)obj);
if (obj_index < 0) {
obj_index = ibf_table_index(dump->current_buffer->obj_table, (st_data_t)obj);
rb_ary_push(dump->current_buffer->obj_list, obj);
}
rb_ary_push(obj_list, obj);
return (VALUE)index;
return obj_index;
}
static VALUE
@ -10372,7 +10374,7 @@ ibf_dump_iseq_each(struct ibf_dump *dump, const rb_iseq_t *iseq)
struct ibf_dump_buffer *saved_buffer = dump->current_buffer;
struct ibf_dump_buffer buffer;
buffer.str = rb_str_new(0, 0);
buffer.obj_list = ibf_dump_object_list_new();
buffer.obj_list = ibf_dump_object_list_new(&buffer.obj_table);
dump->current_buffer = &buffer;
#endif
@ -10477,6 +10479,8 @@ ibf_dump_iseq_each(struct ibf_dump *dump, const rb_iseq_t *iseq)
ibf_dump_write_small_value(dump, local_obj_list_offset);
ibf_dump_write_small_value(dump, local_obj_list_size);
rb_st_free_table(buffer.obj_table);
return offset;
#else
return body_offset;
@ -10504,15 +10508,15 @@ ibf_load_iseq_each(struct ibf_load *load, rb_iseq_t *iseq, ibf_offset_t offset)
struct ibf_load_buffer *saved_buffer = load->current_buffer;
load->current_buffer = &load->global_buffer;
const ibf_offset_t iseq_start = ibf_load_small_value(load, &reading_pos);
const ibf_offset_t iseq_length_bytes = ibf_load_small_value(load, &reading_pos);
const ibf_offset_t body_offset = ibf_load_small_value(load, &reading_pos);
const ibf_offset_t iseq_start = (ibf_offset_t)ibf_load_small_value(load, &reading_pos);
const ibf_offset_t iseq_length_bytes = (ibf_offset_t)ibf_load_small_value(load, &reading_pos);
const ibf_offset_t body_offset = (ibf_offset_t)ibf_load_small_value(load, &reading_pos);
struct ibf_load_buffer buffer;
buffer.buff = load->global_buffer.buff + iseq_start;
buffer.size = iseq_length_bytes;
buffer.obj_list_offset = ibf_load_small_value(load, &reading_pos);
buffer.obj_list_size = ibf_load_small_value(load, &reading_pos);
buffer.obj_list_offset = (ibf_offset_t)ibf_load_small_value(load, &reading_pos);
buffer.obj_list_size = (ibf_offset_t)ibf_load_small_value(load, &reading_pos);
buffer.obj_list = rb_ary_tmp_new(buffer.obj_list_size);
rb_ary_resize(buffer.obj_list, buffer.obj_list_size);
@ -11338,6 +11342,10 @@ static void
ibf_dump_free(void *ptr)
{
struct ibf_dump *dump = (struct ibf_dump *)ptr;
if (dump->global_buffer.obj_table) {
st_free_table(dump->global_buffer.obj_table);
dump->global_buffer.obj_table = 0;
}
if (dump->iseq_table) {
st_free_table(dump->iseq_table);
dump->iseq_table = 0;
@ -11351,6 +11359,7 @@ ibf_dump_memsize(const void *ptr)
struct ibf_dump *dump = (struct ibf_dump *)ptr;
size_t size = sizeof(*dump);
if (dump->iseq_table) size += st_memsize(dump->iseq_table);
if (dump->global_buffer.obj_table) size += st_memsize(dump->global_buffer.obj_table);
return size;
}
@ -11364,7 +11373,7 @@ static void
ibf_dump_setup(struct ibf_dump *dump, VALUE dumper_obj)
{
RB_OBJ_WRITE(dumper_obj, &dump->iseq_list, rb_ary_tmp_new(0));
RB_OBJ_WRITE(dumper_obj, &dump->global_buffer.obj_list, ibf_dump_object_list_new());
RB_OBJ_WRITE(dumper_obj, &dump->global_buffer.obj_list, ibf_dump_object_list_new(&dump->global_buffer.obj_table));
RB_OBJ_WRITE(dumper_obj, &dump->global_buffer.str, rb_str_new(0, 0));
dump->iseq_table = st_init_numtable(); /* need free */