Use mbuf instead of bitset for case-folded character class code points >= 0x80 (small UTF). Fixes #16145

Merged: https://github.com/ruby/ruby/pull/12787
2025-02-21 00:42:05 +01:00 · 2025-02-21 00:42:05 +01:00 · a50fbc56a3 · 2025-02-28 03:34:55 +00:00
commit a50fbc56a3
parent 025832c385
2 changed files with 15 additions and 2 deletions
--- a/regparse.c
+++ b/regparse.c
@ -5669,7 +5669,7 @@ i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[],
    if ((is_in != 0 && !IS_NCCLASS_NOT(cc)) ||
 	(is_in == 0 &&  IS_NCCLASS_NOT(cc))) {
      if (add_flag) {
-	if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= SINGLE_BYTE_SIZE) {
+	if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= 0x80) {
 	  r = add_code_range0(&(cc->mbuf), env, *to, *to, 0);
 	  if (r < 0) return r;
 	}
@ -5681,7 +5681,7 @@ i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[],
 #else
    if (is_in != 0) {
      if (add_flag) {
-	if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= SINGLE_BYTE_SIZE) {
+	if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= 0x80) {
 	  if (IS_NCCLASS_NOT(cc)) clear_not_flag_cclass(cc, env->enc);
 	  r = add_code_range0(&(cc->mbuf), env, *to, *to, 0);
 	  if (r < 0) return r;
--- a/test/ruby/test_regexp.rb
+++ b/test/ruby/test_regexp.rb
@ -2114,4 +2114,17 @@ class TestRegexp < Test::Unit::TestCase
      re =~ s
    end
  end
+
+  def test_bug_16145_caseinsensitive_small_utf # [Bug#16145] /i character class holding a non-ASCII code point in 0x80..0xFF
+    o_acute_lower = 243.chr('UTF-8')
+    o_acute_upper = 211.chr('UTF-8')
+    # /[xó]/i (ó is U+00F3) must match the upper-case Ó (U+00D3) that ends "abcÓ"
+    assert(/[x#{o_acute_lower}]/i.match?("abc#{o_acute_upper}"), "should match o acute case insensitive")
+
+
+    e_acute_lower = 233.chr('UTF-8')
+    e_acute_upper = 201.chr('UTF-8')
+    # /[xé]/i (é is U+00E9) must match the upper-case É (U+00C9) that ends "CAFÉ"
+    assert(/[x#{e_acute_lower}]/i.match?("CAF#{e_acute_upper}"), "should match e acute case insensitive")
+  end
 end