From f32d5071b7b01f258eb45cf533496d82d5c0f6a1 Mon Sep 17 00:00:00 2001 From: Jean Boussier Date: Mon, 10 Feb 2025 17:30:34 +0100 Subject: [PATCH] Elide string allocation when using `String#gsub` in MAP mode If the provided Hash doesn't have a default proc, we know for sure that we'll never call into user provided code, hence the string we allocate to access the Hash can't possibly escape. So we don't actually have to allocate it, we can use a fake_str, AKA a stack allocated string. ``` compare-ruby: ruby 3.5.0dev (2025-02-10T13:47:44Z master 3fb455adab) +PRISM [arm64-darwin23] built-ruby: ruby 3.5.0dev (2025-02-10T17:09:52Z opt-gsub-alloc ea5c28958f) +PRISM [arm64-darwin23] warming up.... | |compare-ruby|built-ruby| |:----------------|-----------:|---------:| |escape | 3.374k| 3.722k| | | -| 1.10x| |escape_bin | 5.469k| 6.587k| | | -| 1.20x| |escape_utf8 | 3.465k| 3.734k| | | -| 1.08x| |escape_utf8_bin | 5.752k| 7.283k| | | -| 1.27x| ``` --- benchmark/string_gsub.yml | 43 +++++++++++++++++++++++++++++++++++++++ common.mk | 1 + hash.c | 2 +- internal/hash.h | 1 + string.c | 19 +++++++++++++++-- 5 files changed, 63 insertions(+), 3 deletions(-) create mode 100644 benchmark/string_gsub.yml diff --git a/benchmark/string_gsub.yml b/benchmark/string_gsub.yml new file mode 100644 index 0000000000..0f964337dd --- /dev/null +++ b/benchmark/string_gsub.yml @@ -0,0 +1,43 @@ +prelude: | + # frozen_string_literal: true + STR = ((("a" * 31) + "<") * 1000).freeze + STR_UNICODE = ((("a" * 30) + "\u2028") * 1000).freeze + ESCAPED_CHARS_BINARY = { + "\u2028".b => '\u2028'.b, + "\u2029".b => '\u2029'.b, + ">".b => '\u003e'.b.freeze, + "<".b => '\u003c'.b.freeze, + "&".b => '\u0026'.b.freeze, + } + BINARY_PATTERN = Regexp.union(ESCAPED_CHARS_BINARY.keys) + + ESCAPED_CHARS = { + "\u2028" => '\u2028', + "\u2029" => '\u2029', + ">" => '\u003e', + "<" => '\u003c', + "&" => '\u0026', + } + ESCAPE_PATTERN = Regexp.union(ESCAPED_CHARS.keys) + + +benchmark: + escape: | + str = STR.dup + str.gsub!(ESCAPE_PATTERN, ESCAPED_CHARS) + str + + escape_bin: | + str = STR.b + str.gsub!(BINARY_PATTERN, ESCAPED_CHARS_BINARY) + str.force_encoding(Encoding::UTF_8) + + escape_utf8: | + str = STR_UNICODE.dup + str.gsub!(ESCAPE_PATTERN, ESCAPED_CHARS) + str + + escape_utf8_bin: | + str = STR_UNICODE.b + str.gsub!(BINARY_PATTERN, ESCAPED_CHARS_BINARY) + str.force_encoding(Encoding::UTF_8) diff --git a/common.mk b/common.mk index a82e9bc49f..f050ab5f3d 100644 --- a/common.mk +++ b/common.mk @@ -17878,6 +17878,7 @@ string.$(OBJEXT): $(top_srcdir)/internal/encoding.h string.$(OBJEXT): $(top_srcdir)/internal/error.h string.$(OBJEXT): $(top_srcdir)/internal/fixnum.h string.$(OBJEXT): $(top_srcdir)/internal/gc.h +string.$(OBJEXT): $(top_srcdir)/internal/hash.h string.$(OBJEXT): $(top_srcdir)/internal/imemo.h string.$(OBJEXT): $(top_srcdir)/internal/numeric.h string.$(OBJEXT): $(top_srcdir)/internal/object.h diff --git a/hash.c b/hash.c index 9e1fe4c8fe..0e3d5606af 100644 --- a/hash.c +++ b/hash.c @@ -2037,7 +2037,7 @@ call_default_proc(VALUE proc, VALUE hash, VALUE key) return rb_proc_call_with_block(proc, 2, args, Qnil); } -static bool +bool rb_hash_default_unredefined(VALUE hash) { VALUE klass = RBASIC_CLASS(hash); diff --git a/internal/hash.h b/internal/hash.h index d66b5b2d04..676f140496 100644 --- a/internal/hash.h +++ b/internal/hash.h @@ -86,6 +86,7 @@ VALUE rb_hash_set_pair(VALUE hash, VALUE pair); int rb_hash_stlike_delete(VALUE hash, st_data_t *pkey, st_data_t *pval); int rb_hash_stlike_foreach_with_replace(VALUE hash, st_foreach_check_callback_func *func, st_update_callback_func *replace, st_data_t arg); int rb_hash_stlike_update(VALUE hash, st_data_t key, st_update_callback_func *func, st_data_t arg); +bool rb_hash_default_unredefined(VALUE hash); VALUE rb_ident_hash_new_with_size(st_index_t size); void rb_hash_free(VALUE hash); RUBY_EXTERN VALUE rb_cHash_empty_frozen; diff --git a/string.c b/string.c index 6faeb5d00e..2c055bfd25 100644 --- a/string.c +++ b/string.c @@ -31,6 +31,7 @@ #include "internal/encoding.h" #include "internal/error.h" #include "internal/gc.h" +#include "internal/hash.h" #include "internal/numeric.h" #include "internal/object.h" #include "internal/proc.h" @@ -6295,7 +6296,7 @@ str_gsub(int argc, VALUE *argv, VALUE str, int bang) VALUE pat, val = Qnil, repl, match0 = Qnil, dest, hash = Qnil; long beg, beg0, end0; long offset, blen, slen, len, last; - enum {STR, ITER, MAP} mode = STR; + enum {STR, ITER, FAST_MAP, MAP} mode = STR; char *sp, *cp; int need_backref = -1; rb_encoding *str_enc; @@ -6311,6 +6312,9 @@ str_gsub(int argc, VALUE *argv, VALUE str, int bang) if (NIL_P(hash)) { StringValue(repl); } + else if (rb_hash_default_unredefined(hash) && !FL_TEST_RAW(hash, RHASH_PROC_DEFAULT)) { + mode = FAST_MAP; + } else { mode = MAP; } @@ -6355,7 +6359,18 @@ str_gsub(int argc, VALUE *argv, VALUE str, int bang) val = rb_obj_as_string(rb_yield(match0)); } else { - val = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0)); + struct RString fake_str; + VALUE key; + if (mode == FAST_MAP) { + // It is safe to use a fake_str here because we established that it won't escape, + // as it's only used for `rb_hash_aref` and we checked the hash doesn't have a + // default proc. + key = setup_fake_str(&fake_str, sp + beg0, end0 - beg0, ENCODING_GET_INLINED(str)); + } + else { + key = rb_str_subseq(str, beg0, end0 - beg0); + } + val = rb_hash_aref(hash, key); val = rb_obj_as_string(val); } str_mod_check(str, sp, slen);