diff --git a/internal/re.h b/internal/re.h
index 3e20114665..2788f8b42a 100644
--- a/internal/re.h
+++ b/internal/re.h
@@ -14,12 +14,12 @@
 /* re.c */
 VALUE rb_reg_compile(VALUE str, int options, const char *sourcefile, int sourceline);
 VALUE rb_reg_check_preprocess(VALUE);
-long rb_reg_search0(VALUE, VALUE, long, int, int);
+long rb_reg_search0(VALUE, VALUE, long, int, int, VALUE *);
 VALUE rb_reg_match_p(VALUE re, VALUE str, long pos);
 bool rb_reg_start_with_p(VALUE re, VALUE str);
 VALUE rb_reg_hash(VALUE re);
 VALUE rb_reg_equal(VALUE re1, VALUE re2);
-void rb_backref_set_string(VALUE string, long pos, long len);
+VALUE rb_backref_set_string(VALUE string, long pos, long len);
 void rb_match_unbusy(VALUE);
 int rb_match_count(VALUE match);
 VALUE rb_reg_new_ary(VALUE ary, int options);
diff --git a/re.c b/re.c
index dc0e0e6432..4f7afb6b0b 100644
--- a/re.c
+++ b/re.c
@@ -1521,7 +1521,7 @@ match_set_string(VALUE m, VALUE string, long pos, long len)
     rmatch->regs.end[0] = pos + len;
 }
 
-void
+VALUE
 rb_backref_set_string(VALUE string, long pos, long len)
 {
     VALUE match = rb_backref_get();
@@ -1530,6 +1530,7 @@ rb_backref_set_string(VALUE string, long pos, long len)
     }
     match_set_string(match, string, pos, len);
     rb_backref_set(match);
+    return match;
 }
 
 /*
@@ -1812,12 +1813,20 @@ rb_reg_search_set_match(VALUE re, VALUE str, long pos, int reverse, int set_back
         return ONIG_MISMATCH;
     }
 
-    VALUE match = match_alloc(rb_cMatch);
+    VALUE match;
+    if (set_match && RTEST(*set_match)) {
+        match = *set_match;
+    }
+    else {
+        match = match_alloc(rb_cMatch);
+    }
+
     rb_matchext_t *rm = RMATCH_EXT(match);
     rm->regs = regs;
 
     if (set_backref_str) {
         RB_OBJ_WRITE(match, &RMATCH(match)->str, rb_str_new4(str));
+        rb_obj_reveal(match, rb_cMatch);
     }
     else {
         /* Note that a MatchData object with RMATCH(match)->str == 0 is incomplete!
@@ -1835,15 +1844,15 @@ rb_reg_search_set_match(VALUE re, VALUE str, long pos, int reverse, int set_back
 }
 
 long
-rb_reg_search0(VALUE re, VALUE str, long pos, int reverse, int set_backref_str)
+rb_reg_search0(VALUE re, VALUE str, long pos, int reverse, int set_backref_str, VALUE *match)
 {
-    return rb_reg_search_set_match(re, str, pos, reverse, set_backref_str, NULL);
+    return rb_reg_search_set_match(re, str, pos, reverse, set_backref_str, match);
 }
 
 long
 rb_reg_search(VALUE re, VALUE str, long pos, int reverse)
 {
-    return rb_reg_search0(re, str, pos, reverse, 1);
+    return rb_reg_search_set_match(re, str, pos, reverse, 1, NULL);
 }
 
 static OnigPosition
diff --git a/string.c b/string.c
index 2c055bfd25..c1b43f800b 100644
--- a/string.c
+++ b/string.c
@@ -6122,14 +6122,17 @@ get_pat_quoted(VALUE pat, int check)
 }
 
 static long
-rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str)
+rb_pat_search0(VALUE pat, VALUE str, long pos, int set_backref_str, VALUE *match)
 {
     if (BUILTIN_TYPE(pat) == T_STRING) {
         pos = rb_str_byteindex(str, pat, pos);
         if (set_backref_str) {
             if (pos >= 0) {
                 str = rb_str_new_frozen_String(str);
-                rb_backref_set_string(str, pos, RSTRING_LEN(pat));
+                VALUE match_data = rb_backref_set_string(str, pos, RSTRING_LEN(pat));
+                if (match) {
+                    *match = match_data;
+                }
             }
             else {
                 rb_backref_set(Qnil);
@@ -6138,10 +6141,16 @@ rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str)
         return pos;
     }
     else {
-        return rb_reg_search0(pat, str, pos, 0, set_backref_str);
+        return rb_reg_search0(pat, str, pos, 0, set_backref_str, match);
     }
 }
 
+static long
+rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str)
+{
+    return rb_pat_search0(pat, str, pos, set_backref_str, NULL);
+}
+
 
 /*
  * call-seq:
@@ -6293,12 +6302,12 @@ rb_str_sub(int argc, VALUE *argv, VALUE str)
 static VALUE
 str_gsub(int argc, VALUE *argv, VALUE str, int bang)
 {
-    VALUE pat, val = Qnil, repl, match0 = Qnil, dest, hash = Qnil;
+    VALUE pat, val = Qnil, repl, match0 = Qnil, dest, hash = Qnil, match = Qnil;
     long beg, beg0, end0;
     long offset, blen, slen, len, last;
     enum {STR, ITER, FAST_MAP, MAP} mode = STR;
     char *sp, *cp;
-    int need_backref = -1;
+    int need_backref_str = -1;
     rb_encoding *str_enc;
 
     switch (argc) {
@@ -6324,7 +6333,8 @@ str_gsub(int argc, VALUE *argv, VALUE str, int bang)
     }
 
     pat = get_pat_quoted(argv[0], 1);
-    beg = rb_pat_search(pat, str, 0, need_backref);
+    beg = rb_pat_search0(pat, str, 0, need_backref_str, &match);
+
     if (beg < 0) {
         if (bang) return Qnil; /* no match, no substitution */
         return str_duplicate(rb_cString, str);
@@ -6341,7 +6351,6 @@ str_gsub(int argc, VALUE *argv, VALUE str, int bang)
     ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
 
     do {
-        VALUE match = rb_backref_get();
         struct re_registers *regs = RMATCH_REGS(match);
         if (RB_TYPE_P(pat, T_STRING)) {
             beg0 = beg;
@@ -6354,7 +6363,7 @@ str_gsub(int argc, VALUE *argv, VALUE str, int bang)
             if (mode == ITER) match0 = rb_reg_nth_match(0, match);
         }
 
-        if (mode) {
+        if (mode != STR) {
             if (mode == ITER) {
                 val = rb_obj_as_string(rb_yield(match0));
             }
@@ -6378,10 +6387,10 @@ str_gsub(int argc, VALUE *argv, VALUE str, int bang)
                 rb_raise(rb_eRuntimeError, "block should not cheat");
             }
         }
-        else if (need_backref) {
+        else if (need_backref_str) {
             val = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
-            if (need_backref < 0) {
-                need_backref = val != repl;
+            if (need_backref_str < 0) {
+                need_backref_str = val != repl;
             }
         }
         else {
@@ -6409,14 +6418,20 @@ str_gsub(int argc, VALUE *argv, VALUE str, int bang)
         }
         cp = RSTRING_PTR(str) + offset;
         if (offset > RSTRING_LEN(str)) break;
-        beg = rb_pat_search(pat, str, offset, need_backref);
+
+        // In FAST_MAP and STR mode the backref can't escape so we can re-use the MatchData safely.
+        if (mode != FAST_MAP && mode != STR) {
+            match = Qnil;
+        }
+        beg = rb_pat_search0(pat, str, offset, need_backref_str, &match);
 
         RB_GC_GUARD(match);
     } while (beg >= 0);
+
     if (RSTRING_LEN(str) > offset) {
         rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
     }
-    rb_pat_search(pat, str, last, 1);
+    rb_pat_search0(pat, str, last, 1, &match);
     if (bang) {
         str_shared_replace(str, dest);
     }