From a8ba1ddd78544b4bda749051d44f7b2a8a0ec5ff Mon Sep 17 00:00:00 2001
From: Jeremy Evans <code@jeremyevans.net>
Date: Fri, 24 Mar 2023 11:53:53 -0700
Subject: [PATCH] Use UTF-8 encoding for literal extended regexps with UTF-8
 characters in comments

Fixes [Bug #19455]
---
 re.c                     | 9 ++++++++-
 test/ruby/test_regexp.rb | 7 +++++++
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/re.c b/re.c
index d7490bbbbf..f6abf46131 100644
--- a/re.c
+++ b/re.c
@@ -2948,7 +2948,11 @@ escape_asis:
           case '#':
             if (extended_mode && !in_char_class) {
                 /* consume and ignore comment in extended regexp */
-                while ((p < end) && ((c = *p++) != '\n'));
+                while ((p < end) && ((c = *p++) != '\n')) {
+                    if ((c & 0x80) && !*encp && enc == rb_utf8_encoding()) {
+                        *encp = enc;
+                    }
+                }
                 break;
             }
             rb_str_buf_cat(buf, (char *)&c, 1);
@@ -2983,6 +2987,9 @@ escape_asis:
                         switch (c = *p++) {
                           default:
                             if (!(c & 0x80)) break;
+                            if (!*encp && enc == rb_utf8_encoding()) {
+                                *encp = enc;
+                            }
                             --p;
                             /* fallthrough */
                           case '\\':
diff --git a/test/ruby/test_regexp.rb b/test/ruby/test_regexp.rb
index f10d5f3077..d04b9cf766 100644
--- a/test/ruby/test_regexp.rb
+++ b/test/ruby/test_regexp.rb
@@ -187,6 +187,13 @@ class TestRegexp < Test::Unit::TestCase
     RUBY
   end
 
+  def test_utf8_comment_in_usascii_extended_regexp_bug_19455 # [Bug #19455]: non-ASCII text appearing only in a regexp comment must still make the literal UTF-8
+    assert_separately([], <<-RUBY) # unquoted heredoc expands \u1000, so the evaluated code holds the literal character; covers (?#...) and /x line comments
+      assert_equal(Encoding::UTF_8, /(?#\u1000)/x.encoding)
+      assert_equal(Encoding::UTF_8, /#\u1000/x.encoding)
+    RUBY
+  end
+
   def test_union
     assert_equal :ok, begin
       Regexp.union(