Use UTF-8 encoding for literal extended regexps with UTF-8 characters in comments

Fixes [Bug #19455]
This commit is contained in:
Jeremy Evans 2023-03-24 11:53:53 -07:00
parent ec211ad54d
commit a8ba1ddd78
Notes: git 2023-04-24 02:28:24 +00:00
2 changed files with 15 additions and 1 deletions

9
re.c
View File

@@ -2948,7 +2948,11 @@ escape_asis:
case '#':
if (extended_mode && !in_char_class) {
/* consume and ignore comment in extended regexp */
while ((p < end) && ((c = *p++) != '\n'));
while ((p < end) && ((c = *p++) != '\n')) {
if ((c & 0x80) && !*encp && enc == rb_utf8_encoding()) {
*encp = enc;
}
}
break;
}
rb_str_buf_cat(buf, (char *)&c, 1);
@@ -2983,6 +2987,9 @@ escape_asis:
switch (c = *p++) {
default:
if (!(c & 0x80)) break;
if (!*encp && enc == rb_utf8_encoding()) {
*encp = enc;
}
--p;
/* fallthrough */
case '\\':

View File

@@ -187,6 +187,13 @@ class TestRegexp < Test::Unit::TestCase
RUBY
end
# Regression test for [Bug #19455]: an extended-mode (/x) regexp literal whose
# only non-ASCII character sits inside a comment must report UTF-8 as its
# encoding. Both comment forms are covered: the (?#...) group comment and the
# "#"-to-end-of-line comment.
def test_utf8_comment_in_usascii_extended_regexp_bug_19455
# The heredoc is unquoted, so \u1000 is expanded to the real U+1000 character
# before the script text is passed on; the regexp literals in the script thus
# contain a raw multibyte character rather than a \u escape.
# NOTE(review): assert_separately presumably evaluates the script in a
# separate Ruby process -- confirm against the test helper.
assert_separately([], <<-RUBY)
assert_equal(Encoding::UTF_8, /(?#\u1000)/x.encoding)
assert_equal(Encoding::UTF_8, /#\u1000/x.encoding)
RUBY
end
def test_union
assert_equal :ok, begin
Regexp.union(