String#gsub! Elide MatchData allocation when we know it can't escape
If gsub is used with a string replacement or a map that doesn't have a default proc, we know for sure no code can cause the MatchData to escape the `gsub` call.

In such cases, we still have to allocate a new MatchData for the first match because we don't know the lifetime of the existing backref, but for any subsequent match we can re-use the MatchData we allocated ourselves, reducing allocations significantly.

This partially fixes [Misc #20652], except when a block is used, and partially reduces the performance impact of abc0304cb28cb9dcc3476993bc487884c139fd11 / [Bug #17507].

```
compare-ruby: ruby 3.5.0dev (2025-02-24T09:44:57Z master 5cf146399f) +PRISM [arm64-darwin24]
built-ruby: ruby 3.5.0dev (2025-02-24T10:58:27Z gsub-elude-match da966636e9) +PRISM [arm64-darwin24]
warming up....

|                 |compare-ruby|built-ruby|
|:----------------|-----------:|---------:|
|escape           |      3.577k|    3.697k|
|                 |           -|     1.03x|
|escape_bin       |      5.869k|    6.743k|
|                 |           -|     1.15x|
|escape_utf8      |      3.448k|    3.738k|
|                 |           -|     1.08x|
|escape_utf8_bin  |      6.361k|    7.267k|
|                 |           -|     1.14x|
```

Co-Authored-By: Étienne Barrié <etienne.barrie@gmail.com>
This commit is contained in:
parent
21ac0a3a64
commit
87f9c3c65e
@ -14,12 +14,12 @@
|
||||
/* re.c */
|
||||
VALUE rb_reg_compile(VALUE str, int options, const char *sourcefile, int sourceline);
|
||||
VALUE rb_reg_check_preprocess(VALUE);
|
||||
long rb_reg_search0(VALUE, VALUE, long, int, int);
|
||||
long rb_reg_search0(VALUE, VALUE, long, int, int, VALUE *);
|
||||
VALUE rb_reg_match_p(VALUE re, VALUE str, long pos);
|
||||
bool rb_reg_start_with_p(VALUE re, VALUE str);
|
||||
VALUE rb_reg_hash(VALUE re);
|
||||
VALUE rb_reg_equal(VALUE re1, VALUE re2);
|
||||
void rb_backref_set_string(VALUE string, long pos, long len);
|
||||
VALUE rb_backref_set_string(VALUE string, long pos, long len);
|
||||
void rb_match_unbusy(VALUE);
|
||||
int rb_match_count(VALUE match);
|
||||
VALUE rb_reg_new_ary(VALUE ary, int options);
|
||||
|
19
re.c
19
re.c
@ -1521,7 +1521,7 @@ match_set_string(VALUE m, VALUE string, long pos, long len)
|
||||
rmatch->regs.end[0] = pos + len;
|
||||
}
|
||||
|
||||
void
|
||||
VALUE
|
||||
rb_backref_set_string(VALUE string, long pos, long len)
|
||||
{
|
||||
VALUE match = rb_backref_get();
|
||||
@ -1530,6 +1530,7 @@ rb_backref_set_string(VALUE string, long pos, long len)
|
||||
}
|
||||
match_set_string(match, string, pos, len);
|
||||
rb_backref_set(match);
|
||||
return match;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -1812,12 +1813,20 @@ rb_reg_search_set_match(VALUE re, VALUE str, long pos, int reverse, int set_back
|
||||
return ONIG_MISMATCH;
|
||||
}
|
||||
|
||||
VALUE match = match_alloc(rb_cMatch);
|
||||
VALUE match;
|
||||
if (set_match && RTEST(*set_match)) {
|
||||
match = *set_match;
|
||||
}
|
||||
else {
|
||||
match = match_alloc(rb_cMatch);
|
||||
}
|
||||
|
||||
rb_matchext_t *rm = RMATCH_EXT(match);
|
||||
rm->regs = regs;
|
||||
|
||||
if (set_backref_str) {
|
||||
RB_OBJ_WRITE(match, &RMATCH(match)->str, rb_str_new4(str));
|
||||
rb_obj_reveal(match, rb_cMatch);
|
||||
}
|
||||
else {
|
||||
/* Note that a MatchData object with RMATCH(match)->str == 0 is incomplete!
|
||||
@ -1835,15 +1844,15 @@ rb_reg_search_set_match(VALUE re, VALUE str, long pos, int reverse, int set_back
|
||||
}
|
||||
|
||||
long
|
||||
rb_reg_search0(VALUE re, VALUE str, long pos, int reverse, int set_backref_str)
|
||||
rb_reg_search0(VALUE re, VALUE str, long pos, int reverse, int set_backref_str, VALUE *match)
|
||||
{
|
||||
return rb_reg_search_set_match(re, str, pos, reverse, set_backref_str, NULL);
|
||||
return rb_reg_search_set_match(re, str, pos, reverse, set_backref_str, match);
|
||||
}
|
||||
|
||||
long
|
||||
rb_reg_search(VALUE re, VALUE str, long pos, int reverse)
|
||||
{
|
||||
return rb_reg_search0(re, str, pos, reverse, 1);
|
||||
return rb_reg_search_set_match(re, str, pos, reverse, 1, NULL);
|
||||
}
|
||||
|
||||
static OnigPosition
|
||||
|
41
string.c
41
string.c
@ -6122,14 +6122,17 @@ get_pat_quoted(VALUE pat, int check)
|
||||
}
|
||||
|
||||
static long
|
||||
rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str)
|
||||
rb_pat_search0(VALUE pat, VALUE str, long pos, int set_backref_str, VALUE *match)
|
||||
{
|
||||
if (BUILTIN_TYPE(pat) == T_STRING) {
|
||||
pos = rb_str_byteindex(str, pat, pos);
|
||||
if (set_backref_str) {
|
||||
if (pos >= 0) {
|
||||
str = rb_str_new_frozen_String(str);
|
||||
rb_backref_set_string(str, pos, RSTRING_LEN(pat));
|
||||
VALUE match_data = rb_backref_set_string(str, pos, RSTRING_LEN(pat));
|
||||
if (match) {
|
||||
*match = match_data;
|
||||
}
|
||||
}
|
||||
else {
|
||||
rb_backref_set(Qnil);
|
||||
@ -6138,10 +6141,16 @@ rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str)
|
||||
return pos;
|
||||
}
|
||||
else {
|
||||
return rb_reg_search0(pat, str, pos, 0, set_backref_str);
|
||||
return rb_reg_search0(pat, str, pos, 0, set_backref_str, match);
|
||||
}
|
||||
}
|
||||
|
||||
static long
|
||||
rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str)
|
||||
{
|
||||
return rb_pat_search0(pat, str, pos, set_backref_str, NULL);
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* call-seq:
|
||||
@ -6293,12 +6302,12 @@ rb_str_sub(int argc, VALUE *argv, VALUE str)
|
||||
static VALUE
|
||||
str_gsub(int argc, VALUE *argv, VALUE str, int bang)
|
||||
{
|
||||
VALUE pat, val = Qnil, repl, match0 = Qnil, dest, hash = Qnil;
|
||||
VALUE pat, val = Qnil, repl, match0 = Qnil, dest, hash = Qnil, match = Qnil;
|
||||
long beg, beg0, end0;
|
||||
long offset, blen, slen, len, last;
|
||||
enum {STR, ITER, FAST_MAP, MAP} mode = STR;
|
||||
char *sp, *cp;
|
||||
int need_backref = -1;
|
||||
int need_backref_str = -1;
|
||||
rb_encoding *str_enc;
|
||||
|
||||
switch (argc) {
|
||||
@ -6324,7 +6333,8 @@ str_gsub(int argc, VALUE *argv, VALUE str, int bang)
|
||||
}
|
||||
|
||||
pat = get_pat_quoted(argv[0], 1);
|
||||
beg = rb_pat_search(pat, str, 0, need_backref);
|
||||
beg = rb_pat_search0(pat, str, 0, need_backref_str, &match);
|
||||
|
||||
if (beg < 0) {
|
||||
if (bang) return Qnil; /* no match, no substitution */
|
||||
return str_duplicate(rb_cString, str);
|
||||
@ -6341,7 +6351,6 @@ str_gsub(int argc, VALUE *argv, VALUE str, int bang)
|
||||
ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
|
||||
|
||||
do {
|
||||
VALUE match = rb_backref_get();
|
||||
struct re_registers *regs = RMATCH_REGS(match);
|
||||
if (RB_TYPE_P(pat, T_STRING)) {
|
||||
beg0 = beg;
|
||||
@ -6354,7 +6363,7 @@ str_gsub(int argc, VALUE *argv, VALUE str, int bang)
|
||||
if (mode == ITER) match0 = rb_reg_nth_match(0, match);
|
||||
}
|
||||
|
||||
if (mode) {
|
||||
if (mode != STR) {
|
||||
if (mode == ITER) {
|
||||
val = rb_obj_as_string(rb_yield(match0));
|
||||
}
|
||||
@ -6378,10 +6387,10 @@ str_gsub(int argc, VALUE *argv, VALUE str, int bang)
|
||||
rb_raise(rb_eRuntimeError, "block should not cheat");
|
||||
}
|
||||
}
|
||||
else if (need_backref) {
|
||||
else if (need_backref_str) {
|
||||
val = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
|
||||
if (need_backref < 0) {
|
||||
need_backref = val != repl;
|
||||
if (need_backref_str < 0) {
|
||||
need_backref_str = val != repl;
|
||||
}
|
||||
}
|
||||
else {
|
||||
@ -6409,14 +6418,20 @@ str_gsub(int argc, VALUE *argv, VALUE str, int bang)
|
||||
}
|
||||
cp = RSTRING_PTR(str) + offset;
|
||||
if (offset > RSTRING_LEN(str)) break;
|
||||
beg = rb_pat_search(pat, str, offset, need_backref);
|
||||
|
||||
// In FAST_MAP and STR mode the backref can't escape so we can re-use the MatchData safely.
|
||||
if (mode != FAST_MAP && mode != STR) {
|
||||
match = Qnil;
|
||||
}
|
||||
beg = rb_pat_search0(pat, str, offset, need_backref_str, &match);
|
||||
|
||||
RB_GC_GUARD(match);
|
||||
} while (beg >= 0);
|
||||
|
||||
if (RSTRING_LEN(str) > offset) {
|
||||
rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
|
||||
}
|
||||
rb_pat_search(pat, str, last, 1);
|
||||
rb_pat_search0(pat, str, last, 1, &match);
|
||||
if (bang) {
|
||||
str_shared_replace(str, dest);
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user