From a8ba1ddd78544b4bda749051d44f7b2a8a0ec5ff Mon Sep 17 00:00:00 2001 From: Jeremy Evans Date: Fri, 24 Mar 2023 11:53:53 -0700 Subject: [PATCH] Use UTF-8 encoding for literal extended regexps with UTF-8 characters in comments Fixes [Bug #19455] --- re.c | 9 ++++++++- test/ruby/test_regexp.rb | 7 +++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/re.c b/re.c index d7490bbbbf..f6abf46131 100644 --- a/re.c +++ b/re.c @@ -2948,7 +2948,11 @@ escape_asis: case '#': if (extended_mode && !in_char_class) { /* consume and ignore comment in extended regexp */ - while ((p < end) && ((c = *p++) != '\n')); + while ((p < end) && ((c = *p++) != '\n')) { + if ((c & 0x80) && !*encp && enc == rb_utf8_encoding()) { + *encp = enc; + } + } break; } rb_str_buf_cat(buf, (char *)&c, 1); @@ -2983,6 +2987,9 @@ escape_asis: switch (c = *p++) { default: if (!(c & 0x80)) break; + if (!*encp && enc == rb_utf8_encoding()) { + *encp = enc; + } --p; /* fallthrough */ case '\\': diff --git a/test/ruby/test_regexp.rb b/test/ruby/test_regexp.rb index f10d5f3077..d04b9cf766 100644 --- a/test/ruby/test_regexp.rb +++ b/test/ruby/test_regexp.rb @@ -187,6 +187,13 @@ class TestRegexp < Test::Unit::TestCase RUBY end + def test_utf8_comment_in_usascii_extended_regexp_bug_19455 + assert_separately([], <<-RUBY) + assert_equal(Encoding::UTF_8, /(?#\u1000)/x.encoding) + assert_equal(Encoding::UTF_8, /#\u1000/x.encoding) + RUBY + end + def test_union assert_equal :ok, begin Regexp.union(