Fix case folding in single byte encoding

This commit is contained in:
Mari Imaizumi 2025-03-09 15:22:16 +09:00
parent c7f31c88ae
commit 75844889eb
Notes: git 2025-03-18 12:04:20 +00:00
2 changed files with 43 additions and 27 deletions

View File

@ -5626,6 +5626,22 @@ clear_not_flag_cclass(CClassNode* cc, OnigEncoding enc)
}
#endif /* CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS */
/*
 * Tell whether `code` is treated as a single-byte code point under `enc`.
 *
 *   - ONIGENC_MBC_MAXLEN(enc) == 1 : always true, whatever `code` is.
 *   - ONIGENC_MBC_MINLEN(enc) >  1 : always false.
 *   - any other encoding           : true only when code < 0x80.
 */
static inline bool
is_singlebyte_range(OnigCodePoint code, OnigEncoding enc)
{
  /* Encodings whose longest character is one byte: every code qualifies. */
  if (ONIGENC_MBC_MAXLEN(enc) == 1) return true;

  /* A minimum length above one byte rules the code out; otherwise only
   * codes below 0x80 qualify. */
  return ONIGENC_MBC_MINLEN(enc) <= 1 && code < 0x80;
}
typedef struct {
ScanEnv* env;
CClassNode* cc;
@ -5669,31 +5685,28 @@ i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[],
if ((is_in != 0 && !IS_NCCLASS_NOT(cc)) ||
(is_in == 0 && IS_NCCLASS_NOT(cc))) {
if (add_flag) {
if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= 0x80) {
r = add_code_range0(&(cc->mbuf), env, *to, *to, 0);
if (r < 0) return r;
}
else {
BITSET_SET_BIT(bs, *to);
}
if (is_singlebyte_range(*to, env->enc)) {
BITSET_SET_BIT(bs, *to);
} else {
r = add_code_range0(&(cc->mbuf), env, *to, *to, 0);
if (r < 0) return r;
}
}
}
#else
if (is_in != 0) {
if (add_flag) {
if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= 0x80) {
if (IS_NCCLASS_NOT(cc)) clear_not_flag_cclass(cc, env->enc);
r = add_code_range0(&(cc->mbuf), env, *to, *to, 0);
if (r < 0) return r;
}
else {
if (IS_NCCLASS_NOT(cc)) {
BITSET_CLEAR_BIT(bs, *to);
}
else {
BITSET_SET_BIT(bs, *to);
}
}
if (is_singlebyte_range(*to, env->enc)) {
if (IS_NCCLASS_NOT(cc)) {
BITSET_CLEAR_BIT(bs, *to);
} else {
BITSET_SET_BIT(bs, *to);
}
} else {
if (IS_NCCLASS_NOT(cc)) clear_not_flag_cclass(cc, env->enc);
r = add_code_range0(&(cc->mbuf), env, *to, *to, 0);
if (r < 0) return r;
}
}
}
#endif /* CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS */

View File

@ -2127,13 +2127,16 @@ class TestRegexp < Test::Unit::TestCase
end
end
def test_bug_16145_caseinsensitive_small_utf # [Bug#16145]
o_acute_lower = 243.chr('UTF-8')
o_acute_upper = 211.chr('UTF-8')
assert_match(/[x#{o_acute_lower}]/i, "abc#{o_acute_upper}", "should match o acute case insensitive")
# Checks that a case-insensitive character class containing a lowercase
# accented letter matches the corresponding uppercase letter, once per
# encoding listed in `encodings`.
def test_bug_16145_and_bug_21176_caseinsensitive_small # [Bug#16145] [Bug#21176]
# Run the same assertions for UTF-8 and for ISO-8859-1.
encodings = [Encoding::UTF_8, Encoding::ISO_8859_1]
encodings.each do |enc|
# U+00F3 / U+00D3: lowercase and uppercase o-acute, transcoded to enc.
o_acute_lower = "\u00F3".encode(enc)
o_acute_upper = "\u00D3".encode(enc)
assert_match(/[x#{o_acute_lower}]/i, "abc#{o_acute_upper}", "should match o acute case insensitive")
# NOTE(review): the next three lines build the e-acute pair with
# 233.chr/201.chr in UTF-8 regardless of enc, and the variables are
# reassigned right after. They look like the pre-change lines of this
# diff left interleaved with the new ones; confirm against the real file.
e_acute_lower = 233.chr('UTF-8')
e_acute_upper = 201.chr('UTF-8')
assert_match(/[x#{e_acute_lower}]/i, "CAF#{e_acute_upper}", "should match e acute case insensitive")
# U+00E9 / U+00C9: lowercase and uppercase e-acute, transcoded to enc.
e_acute_lower = "\u00E9".encode(enc)
e_acute_upper = "\u00C9".encode(enc)
assert_match(/[x#{e_acute_lower}]/i, "CAF#{e_acute_upper}", "should match e acute case insensitive")
end
end
end