Fix case folding in single byte encoding

Merged: https://github.com/ruby/ruby/pull/12889
2025-03-09 15:22:16 +09:00 · 2025-03-09 15:22:16 +09:00 · 75844889eb · 2025-03-18 12:04:20 +00:00
commit 75844889eb
parent c7f31c88ae
2 changed files with 43 additions and 27 deletions
--- a/regparse.c
+++ b/regparse.c
@ -5626,6 +5626,22 @@ clear_not_flag_cclass(CClassNode* cc, OnigEncoding enc)
 }
 #endif /* CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS */

+static inline bool
+is_singlebyte_range(OnigCodePoint code, OnigEncoding enc)
+{
+  /* single byte encoding: max char length is 1, so every code qualifies */
+  if (ONIGENC_MBC_MAXLEN(enc) == 1) {
+    return true;
+  }
+
+  /* wide char encoding: min char length is above 1, so no code qualifies */
+  if (ONIGENC_MBC_MINLEN(enc) > 1) {
+    return false;
+  }
+
+  return (code < 0x80); /* remaining encodings: only codes below 0x80 qualify */
+}
+
 typedef struct {
  ScanEnv*    env;
  CClassNode* cc;
@ -5669,31 +5685,28 @@ i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[],
    if ((is_in != 0 && !IS_NCCLASS_NOT(cc)) ||
 	(is_in == 0 &&  IS_NCCLASS_NOT(cc))) {
      if (add_flag) {
-	if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= 0x80) {
-	  r = add_code_range0(&(cc->mbuf), env, *to, *to, 0);
-	  if (r < 0) return r;
-	}
-	else {
-	  BITSET_SET_BIT(bs, *to);
-	}
+        if (is_singlebyte_range(*to, env->enc)) {
+          BITSET_SET_BIT(bs, *to);
+        } else {
+          r = add_code_range0(&(cc->mbuf), env, *to, *to, 0);
+          if (r < 0) return r;
+        }
      }
    }
 #else
    if (is_in != 0) {
      if (add_flag) {
-	if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= 0x80) {
-	  if (IS_NCCLASS_NOT(cc)) clear_not_flag_cclass(cc, env->enc);
-	  r = add_code_range0(&(cc->mbuf), env, *to, *to, 0);
-	  if (r < 0) return r;
-	}
-	else {
-	  if (IS_NCCLASS_NOT(cc)) {
-	    BITSET_CLEAR_BIT(bs, *to);
-	  }
-	  else {
-	    BITSET_SET_BIT(bs, *to);
-	  }
-	}
+        if (is_singlebyte_range(*to, env->enc)) {
+          if (IS_NCCLASS_NOT(cc)) {
+            BITSET_CLEAR_BIT(bs, *to);
+          } else {
+            BITSET_SET_BIT(bs, *to);
+          }
+        } else {
+          if (IS_NCCLASS_NOT(cc)) clear_not_flag_cclass(cc, env->enc);
+          r = add_code_range0(&(cc->mbuf), env, *to, *to, 0);
+          if (r < 0) return r;
+        }
      }
    }
 #endif /* CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS */
--- a/test/ruby/test_regexp.rb
+++ b/test/ruby/test_regexp.rb
@ -2127,13 +2127,16 @@ class TestRegexp < Test::Unit::TestCase
    end
  end

-  def test_bug_16145_caseinsensitive_small_utf # [Bug#16145]
-    o_acute_lower = 243.chr('UTF-8')
-    o_acute_upper = 211.chr('UTF-8')
-    assert_match(/[x#{o_acute_lower}]/i, "abc#{o_acute_upper}", "should match o acute case insensitive")
+  def test_bug_16145_and_bug_21176_caseinsensitive_small # [Bug#16145] [Bug#21176]
+    encodings = [Encoding::UTF_8, Encoding::ISO_8859_1]
+    encodings.each do |enc|
+      o_acute_lower = "\u00F3".encode(enc)
+      o_acute_upper = "\u00D3".encode(enc)
+      assert_match(/[x#{o_acute_lower}]/i, "abc#{o_acute_upper}", "should match o acute case insensitive")

-    e_acute_lower = 233.chr('UTF-8')
-    e_acute_upper = 201.chr('UTF-8')
-    assert_match(/[x#{e_acute_lower}]/i, "CAF#{e_acute_upper}", "should match e acute case insensitive")
+      e_acute_lower = "\u00E9".encode(enc)
+      e_acute_upper = "\u00C9".encode(enc)
+      assert_match(/[x#{e_acute_lower}]/i, "CAF#{e_acute_upper}", "should match e acute case insensitive")
+    end
  end
 end