Elide string allocation when using String#gsub in MAP mode

If the provided Hash doesn't have a default proc (and passes the
`rb_hash_default_unredefined` check), we know for sure that we'll never
call into user-provided code, hence the string we allocate to access
the Hash can't possibly escape.

So we don't actually have to allocate it; we can use a fake_str,
i.e. a stack-allocated string.

```
compare-ruby: ruby 3.5.0dev (2025-02-10T13:47:44Z master 3fb455adab) +PRISM [arm64-darwin23]
built-ruby: ruby 3.5.0dev (2025-02-10T17:09:52Z opt-gsub-alloc ea5c28958f) +PRISM [arm64-darwin23]
warming up....

|                 |compare-ruby|built-ruby|
|:----------------|-----------:|---------:|
|escape           |      3.374k|    3.722k|
|                 |           -|     1.10x|
|escape_bin       |      5.469k|    6.587k|
|                 |           -|     1.20x|
|escape_utf8      |      3.465k|    3.734k|
|                 |           -|     1.08x|
|escape_utf8_bin  |      5.752k|    7.283k|
|                 |           -|     1.27x|
```
This commit is contained in:
Jean Boussier 2025-02-10 17:30:34 +01:00
parent b8db606d2c
commit f32d5071b7
Notes: git 2025-02-12 09:24:07 +00:00
5 changed files with 63 additions and 3 deletions

43
benchmark/string_gsub.yml Normal file
View File

@ -0,0 +1,43 @@
# Benchmark for String#gsub! called with a Hash as the replacement argument.
#
# The prelude builds two frozen inputs of 1000 segments each:
#   STR         - 31 "a" characters followed by "<"
#   STR_UNICODE - 30 "a" characters followed by U+2028
# and two replacement tables with a Regexp.union of their keys:
#   ESCAPED_CHARS        / ESCAPE_PATTERN - plain string literals
#   ESCAPED_CHARS_BINARY / BINARY_PATTERN - keys and values are `.b` copies
# The single-quoted values are the literal backslash-u escape text, not the
# characters themselves (Ruby does not interpret \u inside single quotes).
#
# Cases:
#   escape          - copy of STR, regular pattern and table
#   escape_bin      - `.b` copy of STR, binary pattern and table, then
#                     relabelled as UTF-8 with force_encoding
#   escape_utf8     - copy of STR_UNICODE, regular pattern and table
#   escape_utf8_bin - `.b` copy of STR_UNICODE, binary pattern and table, then
#                     relabelled as UTF-8 with force_encoding
prelude: |
# frozen_string_literal: true
STR = ((("a" * 31) + "<") * 1000).freeze
STR_UNICODE = ((("a" * 30) + "\u2028") * 1000).freeze
ESCAPED_CHARS_BINARY = {
"\u2028".b => '\u2028'.b,
"\u2029".b => '\u2029'.b,
">".b => '\u003e'.b.freeze,
"<".b => '\u003c'.b.freeze,
"&".b => '\u0026'.b.freeze,
}
BINARY_PATTERN = Regexp.union(ESCAPED_CHARS_BINARY.keys)
ESCAPED_CHARS = {
"\u2028" => '\u2028',
"\u2029" => '\u2029',
">" => '\u003e',
"<" => '\u003c',
"&" => '\u0026',
}
ESCAPE_PATTERN = Regexp.union(ESCAPED_CHARS.keys)
benchmark:
escape: |
str = STR.dup
str.gsub!(ESCAPE_PATTERN, ESCAPED_CHARS)
str
escape_bin: |
str = STR.b
str.gsub!(BINARY_PATTERN, ESCAPED_CHARS_BINARY)
str.force_encoding(Encoding::UTF_8)
escape_utf8: |
str = STR_UNICODE.dup
str.gsub!(ESCAPE_PATTERN, ESCAPED_CHARS)
str
escape_utf8_bin: |
str = STR_UNICODE.b
str.gsub!(BINARY_PATTERN, ESCAPED_CHARS_BINARY)
str.force_encoding(Encoding::UTF_8)

View File

@ -17878,6 +17878,7 @@ string.$(OBJEXT): $(top_srcdir)/internal/encoding.h
string.$(OBJEXT): $(top_srcdir)/internal/error.h
string.$(OBJEXT): $(top_srcdir)/internal/fixnum.h
string.$(OBJEXT): $(top_srcdir)/internal/gc.h
string.$(OBJEXT): $(top_srcdir)/internal/hash.h
string.$(OBJEXT): $(top_srcdir)/internal/imemo.h
string.$(OBJEXT): $(top_srcdir)/internal/numeric.h
string.$(OBJEXT): $(top_srcdir)/internal/object.h

2
hash.c
View File

@ -2037,7 +2037,7 @@ call_default_proc(VALUE proc, VALUE hash, VALUE key)
return rb_proc_call_with_block(proc, 2, args, Qnil);
}
static bool
bool
rb_hash_default_unredefined(VALUE hash)
{
VALUE klass = RBASIC_CLASS(hash);

View File

@ -86,6 +86,7 @@ VALUE rb_hash_set_pair(VALUE hash, VALUE pair);
int rb_hash_stlike_delete(VALUE hash, st_data_t *pkey, st_data_t *pval);
int rb_hash_stlike_foreach_with_replace(VALUE hash, st_foreach_check_callback_func *func, st_update_callback_func *replace, st_data_t arg);
int rb_hash_stlike_update(VALUE hash, st_data_t key, st_update_callback_func *func, st_data_t arg);
bool rb_hash_default_unredefined(VALUE hash);
VALUE rb_ident_hash_new_with_size(st_index_t size);
void rb_hash_free(VALUE hash);
RUBY_EXTERN VALUE rb_cHash_empty_frozen;

View File

@ -31,6 +31,7 @@
#include "internal/encoding.h"
#include "internal/error.h"
#include "internal/gc.h"
#include "internal/hash.h"
#include "internal/numeric.h"
#include "internal/object.h"
#include "internal/proc.h"
@ -6295,7 +6296,7 @@ str_gsub(int argc, VALUE *argv, VALUE str, int bang)
VALUE pat, val = Qnil, repl, match0 = Qnil, dest, hash = Qnil;
long beg, beg0, end0;
long offset, blen, slen, len, last;
enum {STR, ITER, MAP} mode = STR;
enum {STR, ITER, FAST_MAP, MAP} mode = STR;
char *sp, *cp;
int need_backref = -1;
rb_encoding *str_enc;
@ -6311,6 +6312,9 @@ str_gsub(int argc, VALUE *argv, VALUE str, int bang)
if (NIL_P(hash)) {
StringValue(repl);
}
else if (rb_hash_default_unredefined(hash) && !FL_TEST_RAW(hash, RHASH_PROC_DEFAULT)) {
mode = FAST_MAP;
}
else {
mode = MAP;
}
@ -6355,7 +6359,18 @@ str_gsub(int argc, VALUE *argv, VALUE str, int bang)
val = rb_obj_as_string(rb_yield(match0));
}
else {
val = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
struct RString fake_str;
VALUE key;
if (mode == FAST_MAP) {
// It is safe to use a fake_str here because we established that it won't escape,
// as it's only used for `rb_hash_aref` and we checked the hash doesn't have a
// default proc.
key = setup_fake_str(&fake_str, sp + beg0, end0 - beg0, ENCODING_GET_INLINED(str));
}
else {
key = rb_str_subseq(str, beg0, end0 - beg0);
}
val = rb_hash_aref(hash, key);
val = rb_obj_as_string(val);
}
str_mod_check(str, sp, slen);