Reuse Regexp ptr when recompiling

When a Regexp is matched against a string whose encoding is incompatible
with the one it was compiled for, the Regexp has to be recompiled.
If `usecnt == 0`, then nothing else is using the Regexp's `ptr`, so we
can recompile into it in place. This avoids allocating another `regex_t`.

This speeds up matches that switch to incompatible encodings by about 15%
(1.248M i/s -> 1.431M i/s in the benchmark below).

Branch:

```
Regex#match? with different encoding
                          1.431M (± 1.3%) i/s -      7.264M in   5.076153s
Regex#match? with same encoding
                         16.858M (± 1.1%) i/s -     85.347M in   5.063279s
```

Base:

```
Regex#match? with different encoding
                          1.248M (± 2.0%) i/s -      6.342M in   5.083151s
Regex#match? with same encoding
                         16.377M (± 1.1%) i/s -     82.519M in   5.039504s
```

Script:

```
regex = /foo/
str1 = "日本語"
str2 = "English".force_encoding("ASCII-8BIT")

Benchmark.ips do |x|
  x.report("Regex#match? with different encoding") do |times|
    i = 0
    while i < times
      regex.match?(str1)
      regex.match?(str2)
      i += 1
    end
  end

  x.report("Regex#match? with same encoding") do |times|
    i = 0
    while i < times
      regex.match?(str1)
      i += 1
    end
  end
end
```
This commit is contained in:
Peter Zhu 2023-07-28 11:28:44 -04:00
parent a542512b7c
commit d42b9ffb20

35
re.c
View File

@@ -1606,9 +1606,30 @@ rb_reg_prepare_re(VALUE re, VALUE str)
const char *ptr;
long len;
RSTRING_GETMEM(unescaped, ptr, len);
r = onig_new(&reg, (UChar *)ptr, (UChar *)(ptr + len),
reg->options, enc,
OnigDefaultSyntax, &einfo);
/* If there are no other users of this regex, then we can directly overwrite it. */
if (RREGEXP(re)->usecnt == 0) {
regex_t tmp_reg;
r = onig_new_without_alloc(&tmp_reg, (UChar *)ptr, (UChar *)(ptr + len),
reg->options, enc,
OnigDefaultSyntax, &einfo);
if (r) {
/* There was an error so perform cleanups. */
onig_free_body(&tmp_reg);
}
else {
onig_free_body(reg);
/* There are no errors so set reg to tmp_reg. */
*reg = tmp_reg;
}
}
else {
r = onig_new(&reg, (UChar *)ptr, (UChar *)(ptr + len),
reg->options, enc,
OnigDefaultSyntax, &einfo);
}
if (r) {
onig_error_code_to_str((UChar*)err, r, &einfo);
rb_reg_raise(pattern, RREGEXP_SRC_LEN(re), err, re);
@@ -1634,13 +1655,7 @@ rb_reg_onig_match(VALUE re, VALUE str,
if (!tmpreg) RREGEXP(re)->usecnt--;
if (tmpreg) {
if (RREGEXP(re)->usecnt) {
onig_free(reg);
}
else {
onig_free(RREGEXP_PTR(re));
RREGEXP_PTR(re) = reg;
}
onig_free(reg);
}
if (result < 0) {