Elide string allocation when using `String#gsub` in MAP mode
If the provided Hash doesn't have a default proc, we know for sure that we'll never call into user-provided code, hence the string we allocate to access the Hash can't possibly escape.

So we don't actually have to allocate it; we can use a `fake_str`, a.k.a. a stack-allocated string.

```
compare-ruby: ruby 3.5.0dev (2025-02-10T13:47:44Z master 3fb455adab) +PRISM [arm64-darwin23]
built-ruby: ruby 3.5.0dev (2025-02-10T17:09:52Z opt-gsub-alloc ea5c28958f) +PRISM [arm64-darwin23]
warming up....

|                 |compare-ruby|built-ruby|
|:----------------|-----------:|---------:|
|escape           |      3.374k|    3.722k|
|                 |           -|     1.10x|
|escape_bin       |      5.469k|    6.587k|
|                 |           -|     1.20x|
|escape_utf8      |      3.465k|    3.734k|
|                 |           -|     1.08x|
|escape_utf8_bin  |      5.752k|    7.283k|
|                 |           -|     1.27x|
```
This commit is contained in:
parent
b8db606d2c
commit
f32d5071b7
Notes:
git
2025-02-12 09:24:07 +00:00
43
benchmark/string_gsub.yml
Normal file
43
benchmark/string_gsub.yml
Normal file
@ -0,0 +1,43 @@
|
||||
prelude: |
|
||||
# frozen_string_literal: true
|
||||
STR = ((("a" * 31) + "<") * 1000).freeze
|
||||
STR_UNICODE = ((("a" * 30) + "\u2028") * 1000).freeze
|
||||
ESCAPED_CHARS_BINARY = {
|
||||
"\u2028".b => '\u2028'.b,
|
||||
"\u2029".b => '\u2029'.b,
|
||||
">".b => '\u003e'.b.freeze,
|
||||
"<".b => '\u003c'.b.freeze,
|
||||
"&".b => '\u0026'.b.freeze,
|
||||
}
|
||||
BINARY_PATTERN = Regexp.union(ESCAPED_CHARS_BINARY.keys)
|
||||
|
||||
ESCAPED_CHARS = {
|
||||
"\u2028" => '\u2028',
|
||||
"\u2029" => '\u2029',
|
||||
">" => '\u003e',
|
||||
"<" => '\u003c',
|
||||
"&" => '\u0026',
|
||||
}
|
||||
ESCAPE_PATTERN = Regexp.union(ESCAPED_CHARS.keys)
|
||||
|
||||
|
||||
benchmark:
|
||||
escape: |
|
||||
str = STR.dup
|
||||
str.gsub!(ESCAPE_PATTERN, ESCAPED_CHARS)
|
||||
str
|
||||
|
||||
escape_bin: |
|
||||
str = STR.b
|
||||
str.gsub!(BINARY_PATTERN, ESCAPED_CHARS_BINARY)
|
||||
str.force_encoding(Encoding::UTF_8)
|
||||
|
||||
escape_utf8: |
|
||||
str = STR_UNICODE.dup
|
||||
str.gsub!(ESCAPE_PATTERN, ESCAPED_CHARS)
|
||||
str
|
||||
|
||||
escape_utf8_bin: |
|
||||
str = STR_UNICODE.b
|
||||
str.gsub!(BINARY_PATTERN, ESCAPED_CHARS_BINARY)
|
||||
str.force_encoding(Encoding::UTF_8)
|
@ -17878,6 +17878,7 @@ string.$(OBJEXT): $(top_srcdir)/internal/encoding.h
|
||||
string.$(OBJEXT): $(top_srcdir)/internal/error.h
|
||||
string.$(OBJEXT): $(top_srcdir)/internal/fixnum.h
|
||||
string.$(OBJEXT): $(top_srcdir)/internal/gc.h
|
||||
string.$(OBJEXT): $(top_srcdir)/internal/hash.h
|
||||
string.$(OBJEXT): $(top_srcdir)/internal/imemo.h
|
||||
string.$(OBJEXT): $(top_srcdir)/internal/numeric.h
|
||||
string.$(OBJEXT): $(top_srcdir)/internal/object.h
|
||||
|
2
hash.c
2
hash.c
@ -2037,7 +2037,7 @@ call_default_proc(VALUE proc, VALUE hash, VALUE key)
|
||||
return rb_proc_call_with_block(proc, 2, args, Qnil);
|
||||
}
|
||||
|
||||
static bool
|
||||
bool
|
||||
rb_hash_default_unredefined(VALUE hash)
|
||||
{
|
||||
VALUE klass = RBASIC_CLASS(hash);
|
||||
|
@ -86,6 +86,7 @@ VALUE rb_hash_set_pair(VALUE hash, VALUE pair);
|
||||
int rb_hash_stlike_delete(VALUE hash, st_data_t *pkey, st_data_t *pval);
|
||||
int rb_hash_stlike_foreach_with_replace(VALUE hash, st_foreach_check_callback_func *func, st_update_callback_func *replace, st_data_t arg);
|
||||
int rb_hash_stlike_update(VALUE hash, st_data_t key, st_update_callback_func *func, st_data_t arg);
|
||||
bool rb_hash_default_unredefined(VALUE hash);
|
||||
VALUE rb_ident_hash_new_with_size(st_index_t size);
|
||||
void rb_hash_free(VALUE hash);
|
||||
RUBY_EXTERN VALUE rb_cHash_empty_frozen;
|
||||
|
19
string.c
19
string.c
@ -31,6 +31,7 @@
|
||||
#include "internal/encoding.h"
|
||||
#include "internal/error.h"
|
||||
#include "internal/gc.h"
|
||||
#include "internal/hash.h"
|
||||
#include "internal/numeric.h"
|
||||
#include "internal/object.h"
|
||||
#include "internal/proc.h"
|
||||
@ -6295,7 +6296,7 @@ str_gsub(int argc, VALUE *argv, VALUE str, int bang)
|
||||
VALUE pat, val = Qnil, repl, match0 = Qnil, dest, hash = Qnil;
|
||||
long beg, beg0, end0;
|
||||
long offset, blen, slen, len, last;
|
||||
enum {STR, ITER, MAP} mode = STR;
|
||||
enum {STR, ITER, FAST_MAP, MAP} mode = STR;
|
||||
char *sp, *cp;
|
||||
int need_backref = -1;
|
||||
rb_encoding *str_enc;
|
||||
@ -6311,6 +6312,9 @@ str_gsub(int argc, VALUE *argv, VALUE str, int bang)
|
||||
if (NIL_P(hash)) {
|
||||
StringValue(repl);
|
||||
}
|
||||
else if (rb_hash_default_unredefined(hash) && !FL_TEST_RAW(hash, RHASH_PROC_DEFAULT)) {
|
||||
mode = FAST_MAP;
|
||||
}
|
||||
else {
|
||||
mode = MAP;
|
||||
}
|
||||
@ -6355,7 +6359,18 @@ str_gsub(int argc, VALUE *argv, VALUE str, int bang)
|
||||
val = rb_obj_as_string(rb_yield(match0));
|
||||
}
|
||||
else {
|
||||
val = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
|
||||
struct RString fake_str;
|
||||
VALUE key;
|
||||
if (mode == FAST_MAP) {
|
||||
// It is safe to use a fake_str here because we established that it won't escape,
|
||||
// as it's only used for `rb_hash_aref` and we checked the hash doesn't have a
|
||||
// default proc.
|
||||
key = setup_fake_str(&fake_str, sp + beg0, end0 - beg0, ENCODING_GET_INLINED(str));
|
||||
}
|
||||
else {
|
||||
key = rb_str_subseq(str, beg0, end0 - beg0);
|
||||
}
|
||||
val = rb_hash_aref(hash, key);
|
||||
val = rb_obj_as_string(val);
|
||||
}
|
||||
str_mod_check(str, sp, slen);
|
||||
|
Loading…
x
Reference in New Issue
Block a user