diff --git a/benchmark/hash_aref_str_lit.yml b/benchmark/hash_aref_str_lit.yml new file mode 100644 index 0000000000..ed8142bcf1 --- /dev/null +++ b/benchmark/hash_aref_str_lit.yml @@ -0,0 +1,20 @@ +prelude: | + # frozen_string_literal: true + hash = 10.times.to_h do |i| + [i, i] + end + dyn_sym = "dynamic_symbol".to_sym + binary = RubyVM::InstructionSequence.compile("# frozen_string_literal: true\n'iseq_load'").to_binary + iseq_literal_string = RubyVM::InstructionSequence.load_from_binary(binary).eval + + hash[:some_symbol] = 1 + hash[dyn_sym] = 2 + hash["small"] = 3 + hash["frozen_string_literal"] = 4 + hash[iseq_literal_string] = 5 +benchmark: + symbol: hash[:some_symbol] + dyn_symbol: hash[dyn_sym] + small_lit: hash["small"] + frozen_lit: hash["frozen_string_literal"] + iseq_lit: hash[iseq_literal_string] diff --git a/compile.c b/compile.c index 0113d0e0eb..5fa35512c2 100644 --- a/compile.c +++ b/compile.c @@ -13546,7 +13546,7 @@ ibf_load_object_string(const struct ibf_load *load, const struct ibf_object_head VALUE str; if (header->frozen && !header->internal) { - str = rb_enc_interned_str(ptr, len, rb_enc_from_index(encindex)); + str = rb_enc_literal_str(ptr, len, rb_enc_from_index(encindex)); } else { str = rb_enc_str_new(ptr, len, rb_enc_from_index(encindex)); diff --git a/internal/string.h b/internal/string.h index 3333b3afc3..3533766ffb 100644 --- a/internal/string.h +++ b/internal/string.h @@ -80,6 +80,7 @@ VALUE rb_str_concat_literals(size_t num, const VALUE *strary); VALUE rb_str_eql(VALUE str1, VALUE str2); VALUE rb_id_quote_unprintable(ID); VALUE rb_sym_proc_call(ID mid, int argc, const VALUE *argv, int kw_splat, VALUE passed_proc); +VALUE rb_enc_literal_str(const char *ptr, long len, rb_encoding *enc); struct rb_execution_context_struct; VALUE rb_ec_str_resurrect(struct rb_execution_context_struct *ec, VALUE str, bool chilled); diff --git a/prism_compile.c b/prism_compile.c index ea4c4fa054..e1e16a00a7 100644 --- a/prism_compile.c +++ 
b/prism_compile.c @@ -292,7 +292,7 @@ parse_static_literal_string(rb_iseq_t *iseq, const pm_scope_node_t *scope_node, encoding = scope_node->encoding; } - VALUE value = rb_enc_interned_str((const char *) pm_string_source(string), pm_string_length(string), encoding); + VALUE value = rb_enc_literal_str((const char *) pm_string_source(string), pm_string_length(string), encoding); rb_enc_str_coderange(value); if (ISEQ_COMPILE_DATA(iseq)->option->debug_frozen_string_literal || RTEST(ruby_debug)) { diff --git a/ruby_parser.c b/ruby_parser.c index 1dcdfd8e79..98541fc597 100644 --- a/ruby_parser.c +++ b/ruby_parser.c @@ -921,7 +921,7 @@ rb_parser_build_script_lines_from(rb_parser_ary_t *lines) VALUE rb_str_new_parser_string(rb_parser_string_t *str) { - VALUE string = rb_enc_interned_str(str->ptr, str->len, str->enc); + VALUE string = rb_enc_literal_str(str->ptr, str->len, str->enc); rb_enc_str_coderange(string); return string; } diff --git a/string.c b/string.c index d43a6391be..98dbf1b706 100644 --- a/string.c +++ b/string.c @@ -89,6 +89,9 @@ VALUE rb_cSymbol; * another string (the shared root). * 3: STR_CHILLED (will be frozen in a future version) * The string appears frozen but can be mutated with a warning. + * 4: STR_PRECOMPUTED_HASH + * The string is embedded and has its precomputed hash code stored + * after the terminator. * 5: STR_SHARED_ROOT * Other strings may point to the contents of this string. When this * flag is set, STR_SHARED must not be set. 
@@ -116,6 +119,7 @@ VALUE rb_cSymbol; */ #define RUBY_MAX_CHAR_LEN 16 +#define STR_PRECOMPUTED_HASH FL_USER4 #define STR_SHARED_ROOT FL_USER5 #define STR_BORROWED FL_USER6 #define STR_TMPLOCK FL_USER7 @@ -240,6 +244,11 @@ rb_str_size_as_embedded(VALUE str) else { real_size = sizeof(struct RString); } + + if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) { + real_size += sizeof(st_index_t); + } + return real_size; } @@ -257,6 +266,7 @@ static VALUE str_new(VALUE klass, const char *ptr, long len); static void str_make_independent_expand(VALUE str, long len, long expand, const int termlen); static inline void str_modifiable(VALUE str); static VALUE rb_str_downcase(int argc, VALUE *argv, VALUE str); +static inline VALUE str_alloc_embed(VALUE klass, size_t capa); static inline void str_make_independent(VALUE str) @@ -334,7 +344,7 @@ mustnot_wchar(VALUE str) static int fstring_cmp(VALUE a, VALUE b); -static VALUE register_fstring(VALUE str, bool copy); +static VALUE register_fstring(VALUE str, bool copy, bool precompute_hash); const struct st_hash_type rb_fstring_hash_type = { fstring_cmp, @@ -343,9 +353,42 @@ const struct st_hash_type rb_fstring_hash_type = { #define BARE_STRING_P(str) (!FL_ANY_RAW(str, FL_EXIVAR) && RBASIC_CLASS(str) == rb_cString) +static inline st_index_t +str_do_hash(VALUE str) +{ + st_index_t h = rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str)); + int e = RSTRING_LEN(str) ? 
ENCODING_GET(str) : 0; + if (e && !is_ascii_string(str)) { + h = rb_hash_end(rb_hash_uint32(h, (uint32_t)e)); + } + return h; +} + +static VALUE +str_precompute_hash(VALUE str) +{ + RUBY_ASSERT(!FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)); + RUBY_ASSERT(STR_EMBED_P(str)); + +#if RUBY_DEBUG + size_t used_bytes = (RSTRING_LEN(str) + TERM_LEN(str)); + size_t free_bytes = str_embed_capa(str) - used_bytes; + RUBY_ASSERT(free_bytes >= sizeof(st_index_t)); +#endif + + typedef struct {char bytes[sizeof(st_index_t)];} unaligned_index; + union {st_index_t i; unaligned_index b;} u = {.i = str_do_hash(str)}; + *(unaligned_index *)(RSTRING_END(str) + TERM_LEN(str)) = u.b; + + FL_SET(str, STR_PRECOMPUTED_HASH); + + return str; +} + struct fstr_update_arg { VALUE fstr; bool copy; + bool precompute_hash; }; static int @@ -370,8 +413,23 @@ fstr_update_callback(st_data_t *key, st_data_t *value, st_data_t data, int exist else { if (FL_TEST_RAW(str, STR_FAKESTR)) { if (arg->copy) { - VALUE new_str = str_new(rb_cString, RSTRING(str)->as.heap.ptr, RSTRING(str)->len); - rb_enc_copy(new_str, str); + VALUE new_str; + long len = RSTRING_LEN(str); + long capa = len + sizeof(st_index_t); + int term_len = TERM_LEN(str); + + if (arg->precompute_hash && STR_EMBEDDABLE_P(capa, term_len)) { + new_str = str_alloc_embed(rb_cString, capa + term_len); + memcpy(RSTRING_PTR(new_str), RSTRING_PTR(str), len); + STR_SET_LEN(new_str, RSTRING_LEN(str)); + TERM_FILL(RSTRING_END(new_str), TERM_LEN(str)); + rb_enc_copy(new_str, str); + str_precompute_hash(new_str); + } + else { + new_str = str_new(rb_cString, RSTRING(str)->as.heap.ptr, RSTRING(str)->len); + rb_enc_copy(new_str, str); + } str = new_str; } else { @@ -428,7 +486,7 @@ rb_fstring(VALUE str) if (!FL_TEST_RAW(str, FL_FREEZE | STR_NOFREE | STR_CHILLED)) rb_str_resize(str, RSTRING_LEN(str)); - fstr = register_fstring(str, FALSE); + fstr = register_fstring(str, false, false); if (!bare) { str_replace_shared_without_enc(str, fstr); @@ -439,10 +497,12 @@ 
rb_fstring(VALUE str) } static VALUE -register_fstring(VALUE str, bool copy) +register_fstring(VALUE str, bool copy, bool precompute_hash) { - struct fstr_update_arg args; - args.copy = copy; + struct fstr_update_arg args = { + .copy = copy, + .precompute_hash = precompute_hash + }; RB_VM_LOCK_ENTER(); { @@ -500,14 +560,14 @@ VALUE rb_fstring_new(const char *ptr, long len) { struct RString fake_str; - return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), FALSE); + return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), false, false); } VALUE rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc) { struct RString fake_str; - return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), FALSE); + return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), false, false); } VALUE @@ -3658,12 +3718,15 @@ rb_str_prepend_multi(int argc, VALUE *argv, VALUE str) st_index_t rb_str_hash(VALUE str) { - st_index_t h = rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str)); - int e = RSTRING_LEN(str) ? 
ENCODING_GET(str) : 0; - if (e && !is_ascii_string(str)) { - h = rb_hash_end(rb_hash_uint32(h, (uint32_t)e)); + if (FL_TEST_RAW(str, STR_PRECOMPUTED_HASH)) { + st_index_t precomputed_hash; /* stored after the terminator at an arbitrary (possibly unaligned) offset: copy it out, never dereference in place */ + memcpy(&precomputed_hash, RSTRING_END(str) + TERM_LEN(str), sizeof(precomputed_hash)); + + RUBY_ASSERT(precomputed_hash == str_do_hash(str)); + return precomputed_hash; } - return h; + + return str_do_hash(str); } int @@ -12133,7 +12196,7 @@ VALUE rb_interned_str(const char *ptr, long len) { struct RString fake_str; - return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), TRUE); + return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), true, false); } VALUE @@ -12150,7 +12213,18 @@ rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc) } struct RString fake_str; - return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), TRUE); + return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, false); +} + +VALUE +rb_enc_literal_str(const char *ptr, long len, rb_encoding *enc) +{ + if (enc != NULL && UNLIKELY(rb_enc_autoload_p(enc))) { + rb_enc_autoload(enc); + } + + struct RString fake_str; + return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), true, true); } VALUE