Use UTF-8 encoding for literal extended regexps with UTF-8 characters in comments

Fixes [Bug #19455]
Merged: https://github.com/ruby/ruby/pull/7592
2023-03-24 11:53:53 -07:00 · 2023-03-24 11:53:53 -07:00 · a8ba1ddd78 · 2023-04-24 02:28:24 +00:00
commit a8ba1ddd78
parent ec211ad54d
2 changed files with 15 additions and 1 deletions
--- a/re.c
+++ b/re.c
@@ -2948,7 +2948,11 @@ escape_asis:
          case '#':
            if (extended_mode && !in_char_class) {
                /* consume and ignore comment in extended regexp */
-                while ((p < end) && ((c = *p++) != '\n'));
+                while ((p < end) && ((c = *p++) != '\n')) {
+                    if ((c & 0x80) && !*encp && enc == rb_utf8_encoding()) {
+                        *encp = enc;
+                    }
+                }
                break;
            }
            rb_str_buf_cat(buf, (char *)&c, 1);
@@ -2983,6 +2987,9 @@ escape_asis:
                        switch (c = *p++) {
                          default:
                            if (!(c & 0x80)) break;
+                            if (!*encp && enc == rb_utf8_encoding()) {
+                                *encp = enc;
+                            }
                            --p;
                            /* fallthrough */
                          case '\\':
--- a/test/ruby/test_regexp.rb
+++ b/test/ruby/test_regexp.rb
@@ -187,6 +187,13 @@ class TestRegexp < Test::Unit::TestCase
    RUBY
  end

+  def test_utf8_comment_in_usascii_extended_regexp_bug_19455
+    assert_separately([], <<-RUBY)
+      assert_equal(Encoding::UTF_8, /(?#\u1000)/x.encoding)
+      assert_equal(Encoding::UTF_8, /#\u1000/x.encoding)
+    RUBY
+  end
+
  def test_union
    assert_equal :ok, begin
      Regexp.union(