Use mbuf instead of bitset in character classes for small non-ASCII code points (>= 0x80). Fixes #16145

This commit is contained in:
Maciej Rzasa 2025-02-21 00:42:05 +01:00 committed by Nobuyoshi Nakada
parent 025832c385
commit a50fbc56a3
Notes: git 2025-02-28 03:34:55 +00:00
2 changed files with 15 additions and 2 deletions

View File

@ -5669,7 +5669,7 @@ i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[],
if ((is_in != 0 && !IS_NCCLASS_NOT(cc)) ||
(is_in == 0 && IS_NCCLASS_NOT(cc))) {
if (add_flag) {
if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= SINGLE_BYTE_SIZE) {
if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= 0x80) {
r = add_code_range0(&(cc->mbuf), env, *to, *to, 0);
if (r < 0) return r;
}
@ -5681,7 +5681,7 @@ i_apply_case_fold(OnigCodePoint from, OnigCodePoint to[],
#else
if (is_in != 0) {
if (add_flag) {
if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= SINGLE_BYTE_SIZE) {
if (ONIGENC_MBC_MINLEN(env->enc) > 1 || *to >= 0x80) {
if (IS_NCCLASS_NOT(cc)) clear_not_flag_cclass(cc, env->enc);
r = add_code_range0(&(cc->mbuf), env, *to, *to, 0);
if (r < 0) return r;

View File

@ -2114,4 +2114,17 @@ class TestRegexp < Test::Unit::TestCase
re =~ s
end
end
# [Bug#16145] A case-insensitive character class that contains a lowercase
# accented letter must also match the uppercase form of that letter.
def test_bug_16145_caseinsensitive_small_utf
  [
    # [lowercase code point, uppercase code point, subject prefix, failure message]
    [0xF3, 0xD3, "abc", "should match o acute case insensitive"], # U+00F3 / U+00D3
    [0xE9, 0xC9, "CAF", "should match e acute case insensitive"], # U+00E9 / U+00C9
  ].each do |lower_cp, upper_cp, prefix, message|
    lower = lower_cp.chr('UTF-8')
    upper = upper_cp.chr('UTF-8')
    # The class holds only "x" and the lowercase letter; the subject ends
    # with the uppercase letter, so a match relies on case folding.
    assert(/[x#{lower}]/i.match?("#{prefix}#{upper}"), message)
  end
end
end