Correctly handle consecutive lookarounds (#9738)

Fix [Bug #20207]
Fix [Bug #20212]

Handling consecutive lookarounds in init_cache_opcodes is buggy, so it
causes invalid memory access reported in [Bug #20207] and [Bug #20212].
This fixes it by using recursive functions to detected lookarounds
nesting correctly.
This commit is contained in:
Hiroya Fujinami 2024-01-29 23:51:26 +09:00 committed by GitHub
parent 0d4de0f4b1
commit 3e6e3ca262
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 150 additions and 85 deletions

221
regexec.c
View File

@ -258,18 +258,19 @@ We encode a match cache point to an integer value by the following equation:
A bit-array for memoizing (recording) match cache points once backtracked. A bit-array for memoizing (recording) match cache points once backtracked.
*/ */
/* count the total number of cache opcodes for allocating a match cache buffer. */ static OnigPosition count_num_cache_opcodes_inner(
static OnigPosition const regex_t* reg,
count_num_cache_opcodes(const regex_t* reg, long* num_cache_opcodes_ptr) MemNumType current_repeat_mem, int lookaround_nesting,
UChar** pp, long* num_cache_opcodes_ptr
)
{ {
UChar* p = reg->p; UChar* p = *pp;
UChar* pend = p + reg->used; UChar* pend = reg->p + reg->used;
LengthType len; LengthType len;
MemNumType repeat_mem; MemNumType repeat_mem;
OnigEncoding enc = reg->enc; OnigEncoding enc = reg->enc;
MemNumType current_repeat_mem = -1; long num_cache_opcodes = *num_cache_opcodes_ptr;
long num_cache_opcodes = 0; OnigPosition result;
int lookaround_nesting = 0;
while (p < pend) { while (p < pend) {
switch (*p++) { switch (*p++) {
@ -402,7 +403,16 @@ count_num_cache_opcodes(const regex_t* reg, long* num_cache_opcodes_ptr)
if (reg->repeat_range[repeat_mem].lower == 0) { if (reg->repeat_range[repeat_mem].lower == 0) {
num_cache_opcodes++; num_cache_opcodes++;
} }
current_repeat_mem = repeat_mem; result = count_num_cache_opcodes_inner(reg, repeat_mem, lookaround_nesting, &p, &num_cache_opcodes);
if (result < 0 || num_cache_opcodes < 0) {
goto fail;
}
{
OnigRepeatRange *repeat_range = &reg->repeat_range[repeat_mem];
if (repeat_range->lower < repeat_range->upper) {
num_cache_opcodes++;
}
}
break; break;
case OP_REPEAT_INC: case OP_REPEAT_INC:
case OP_REPEAT_INC_NG: case OP_REPEAT_INC_NG:
@ -411,14 +421,7 @@ count_num_cache_opcodes(const regex_t* reg, long* num_cache_opcodes_ptr)
// A lone or invalid OP_REPEAT_INC is found. // A lone or invalid OP_REPEAT_INC is found.
goto impossible; goto impossible;
} }
{ goto exit;
OnigRepeatRange *repeat_range = &reg->repeat_range[repeat_mem];
if (repeat_range->lower < repeat_range->upper) {
num_cache_opcodes++;
}
current_repeat_mem = -1;
}
break;
case OP_REPEAT_INC_SG: case OP_REPEAT_INC_SG:
case OP_REPEAT_INC_NG_SG: case OP_REPEAT_INC_NG_SG:
goto impossible; goto impossible;
@ -438,7 +441,10 @@ count_num_cache_opcodes(const regex_t* reg, long* num_cache_opcodes_ptr)
// A look-around nested in a atomic grouping is found. // A look-around nested in a atomic grouping is found.
goto impossible; goto impossible;
} }
lookaround_nesting++; result = count_num_cache_opcodes_inner(reg, current_repeat_mem, lookaround_nesting + 1, &p, &num_cache_opcodes);
if (result < 0 || num_cache_opcodes < 0) {
goto fail;
}
break; break;
case OP_PUSH_POS_NOT: case OP_PUSH_POS_NOT:
if (lookaround_nesting < 0) { if (lookaround_nesting < 0) {
@ -446,7 +452,10 @@ count_num_cache_opcodes(const regex_t* reg, long* num_cache_opcodes_ptr)
goto impossible; goto impossible;
} }
p += SIZE_RELADDR; p += SIZE_RELADDR;
lookaround_nesting++; result = count_num_cache_opcodes_inner(reg, current_repeat_mem, lookaround_nesting + 1, &p, &num_cache_opcodes);
if (result < 0 || num_cache_opcodes < 0) {
goto fail;
}
break; break;
case OP_PUSH_LOOK_BEHIND_NOT: case OP_PUSH_LOOK_BEHIND_NOT:
if (lookaround_nesting < 0) { if (lookaround_nesting < 0) {
@ -455,25 +464,26 @@ count_num_cache_opcodes(const regex_t* reg, long* num_cache_opcodes_ptr)
} }
p += SIZE_RELADDR; p += SIZE_RELADDR;
p += SIZE_LENGTH; p += SIZE_LENGTH;
lookaround_nesting++; result = count_num_cache_opcodes_inner(reg, current_repeat_mem, lookaround_nesting + 1, &p, &num_cache_opcodes);
break; if (result < 0 || num_cache_opcodes < 0) {
case OP_POP_POS: goto fail;
case OP_FAIL_POS: }
case OP_FAIL_LOOK_BEHIND_NOT:
lookaround_nesting--;
break; break;
case OP_PUSH_STOP_BT: case OP_PUSH_STOP_BT:
if (lookaround_nesting != 0) { if (lookaround_nesting != 0) {
// A nested atomic grouping is found. // A nested atomic grouping is found.
goto impossible; goto impossible;
} }
lookaround_nesting++; result = count_num_cache_opcodes_inner(reg, current_repeat_mem, -1, &p, &num_cache_opcodes);
lookaround_nesting *= -1; if (result < 0 || num_cache_opcodes < 0) {
goto fail;
}
break; break;
case OP_POP_POS:
case OP_FAIL_POS:
case OP_FAIL_LOOK_BEHIND_NOT:
case OP_POP_STOP_BT: case OP_POP_STOP_BT:
lookaround_nesting *= -1; goto exit;
lookaround_nesting--;
break;
case OP_LOOK_BEHIND: case OP_LOOK_BEHIND:
p += SIZE_LENGTH; p += SIZE_LENGTH;
break; break;
@ -507,9 +517,15 @@ count_num_cache_opcodes(const regex_t* reg, long* num_cache_opcodes_ptr)
} }
} }
exit:
*pp = p;
*num_cache_opcodes_ptr = num_cache_opcodes; *num_cache_opcodes_ptr = num_cache_opcodes;
return 0; return 0;
fail:
*num_cache_opcodes_ptr = num_cache_opcodes;
return result;
impossible: impossible:
*num_cache_opcodes_ptr = NUM_CACHE_OPCODES_IMPOSSIBLE; *num_cache_opcodes_ptr = NUM_CACHE_OPCODES_IMPOSSIBLE;
return 0; return 0;
@ -518,48 +534,49 @@ bytecode_error:
return ONIGERR_UNDEFINED_BYTECODE; return ONIGERR_UNDEFINED_BYTECODE;
} }
/* collect cache opcodes from the given regex program, and compute the total number of cache points. */ /* count the total number of cache opcodes for allocating a match cache buffer. */
static OnigPosition static OnigPosition
init_cache_opcodes(const regex_t* reg, OnigCacheOpcode* cache_opcodes, long* num_cache_points_ptr) count_num_cache_opcodes(const regex_t* reg, long* num_cache_opcodes_ptr)
{ {
UChar* pbegin;
UChar* p = reg->p; UChar* p = reg->p;
UChar* pend = p + reg->used; *num_cache_opcodes_ptr = 0;
OnigPosition result = count_num_cache_opcodes_inner(reg, -1, 0, &p, num_cache_opcodes_ptr);
if (result == 0 && *num_cache_opcodes_ptr >= 0 && p != reg->p + reg->used) {
return ONIGERR_UNDEFINED_BYTECODE;
}
return result;
}
static OnigPosition
init_cache_opcodes_inner(
const regex_t* reg,
MemNumType current_repeat_mem, int lookaround_nesting,
OnigCacheOpcode** cache_opcodes_ptr, UChar** pp, long* num_cache_points_ptr
)
{
UChar* p = *pp;
UChar* pend = reg->p + reg->used;
UChar* pbegin;
LengthType len; LengthType len;
MemNumType repeat_mem; MemNumType repeat_mem;
OnigEncoding enc = reg->enc; OnigEncoding enc = reg->enc;
MemNumType current_repeat_mem = -1; long cache_point = *num_cache_points_ptr;
long cache_point = 0; OnigCacheOpcode *cache_opcodes = *cache_opcodes_ptr;
long num_cache_points_at_repeat = 0; OnigPosition result;
int lookaround_nesting = 0;
const OnigCacheOpcode* cache_opcodes_begin = cache_opcodes;
OnigCacheOpcode* cache_opcodes_at_repeat = NULL;
# define INC_CACHE_OPCODES do {\ # define INC_CACHE_OPCODES do {\
cache_opcodes->addr = pbegin;\ cache_opcodes->addr = pbegin;\
cache_opcodes->cache_point = cache_point - num_cache_points_at_repeat;\ cache_opcodes->cache_point = cache_point;\
cache_opcodes->outer_repeat_mem = current_repeat_mem;\ cache_opcodes->outer_repeat_mem = current_repeat_mem;\
cache_opcodes->num_cache_points_at_outer_repeat = num_cache_points_at_repeat;\ cache_opcodes->num_cache_points_at_outer_repeat = 0;\
cache_opcodes->num_cache_points_in_outer_repeat = 0;\ cache_opcodes->num_cache_points_in_outer_repeat = 0;\
cache_opcodes->lookaround_nesting = lookaround_nesting;\ cache_opcodes->lookaround_nesting = lookaround_nesting;\
cache_opcodes->match_addr = NULL;\
cache_point += lookaround_nesting != 0 ? 2 : 1;\ cache_point += lookaround_nesting != 0 ? 2 : 1;\
cache_opcodes++;\ cache_opcodes++;\
} while (0) } while (0)
# define INC_LOOKAROUND_NESTING lookaround_nesting++
# define DEC_LOOKAROUND_NESTING do {\
UChar* match_addr = p - 1;\
OnigCacheOpcode* cache_opcodes_in_lookaround = cache_opcodes - 1;\
while (\
cache_opcodes_in_lookaround >= cache_opcodes_begin &&\
cache_opcodes_in_lookaround->lookaround_nesting == lookaround_nesting\
) {\
cache_opcodes_in_lookaround->match_addr = match_addr;\
cache_opcodes_in_lookaround--;\
}\
lookaround_nesting--;\
} while (0)
while (p < pend) { while (p < pend) {
pbegin = p; pbegin = p;
switch (*p++) { switch (*p++) {
@ -692,33 +709,31 @@ init_cache_opcodes(const regex_t* reg, OnigCacheOpcode* cache_opcodes, long* num
if (reg->repeat_range[repeat_mem].lower == 0) { if (reg->repeat_range[repeat_mem].lower == 0) {
INC_CACHE_OPCODES; INC_CACHE_OPCODES;
} }
current_repeat_mem = repeat_mem;
num_cache_points_at_repeat = cache_point;
cache_opcodes_at_repeat = cache_opcodes;
break;
case OP_REPEAT_INC:
case OP_REPEAT_INC_NG:
GET_MEMNUM_INC(repeat_mem, p);
{ {
long num_cache_points_in_repeat = cache_point - num_cache_points_at_repeat; long num_cache_points_in_repeat = 0;
long num_cache_points_at_repeat = cache_point;
OnigCacheOpcode* cache_opcodes_in_repeat = cache_opcodes;
result = init_cache_opcodes_inner(reg, repeat_mem, lookaround_nesting, &cache_opcodes, &p, &num_cache_points_in_repeat);
if (result != 0) {
goto fail;
}
OnigRepeatRange *repeat_range = &reg->repeat_range[repeat_mem]; OnigRepeatRange *repeat_range = &reg->repeat_range[repeat_mem];
if (repeat_range->lower < repeat_range->upper) { if (repeat_range->lower < repeat_range->upper) {
INC_CACHE_OPCODES; INC_CACHE_OPCODES;
cache_point -= lookaround_nesting != 0 ? 2 : 1; cache_point -= lookaround_nesting != 0 ? 2 : 1;
} }
cache_point -= num_cache_points_in_repeat;
int repeat_bounds = repeat_range->upper == 0x7fffffff ? 1 : repeat_range->upper - repeat_range->lower; int repeat_bounds = repeat_range->upper == 0x7fffffff ? 1 : repeat_range->upper - repeat_range->lower;
cache_point += num_cache_points_in_repeat * repeat_range->lower + (num_cache_points_in_repeat + (lookaround_nesting != 0 ? 2 : 1)) * repeat_bounds; cache_point += num_cache_points_in_repeat * repeat_range->lower + (num_cache_points_in_repeat + (lookaround_nesting != 0 ? 2 : 1)) * repeat_bounds;
OnigCacheOpcode* cache_opcodes_in_repeat = cache_opcodes - 1; for (; cache_opcodes_in_repeat < cache_opcodes; cache_opcodes_in_repeat++) {
while (cache_opcodes_at_repeat <= cache_opcodes_in_repeat) { cache_opcodes_in_repeat->num_cache_points_at_outer_repeat = num_cache_points_at_repeat;
cache_opcodes_in_repeat->num_cache_points_in_outer_repeat = num_cache_points_in_repeat; cache_opcodes_in_repeat->num_cache_points_in_outer_repeat = num_cache_points_in_repeat;
cache_opcodes_in_repeat--;
} }
current_repeat_mem = -1;
num_cache_points_at_repeat = 0;
cache_opcodes_at_repeat = NULL;
} }
break; break;
case OP_REPEAT_INC:
case OP_REPEAT_INC_NG:
p += SIZE_MEMNUM;
goto exit;
case OP_REPEAT_INC_SG: case OP_REPEAT_INC_SG:
case OP_REPEAT_INC_NG_SG: case OP_REPEAT_INC_NG_SG:
goto unexpected_bytecode_error; goto unexpected_bytecode_error;
@ -734,30 +749,48 @@ init_cache_opcodes(const regex_t* reg, OnigCacheOpcode* cache_opcodes, long* num
break; break;
case OP_PUSH_POS: case OP_PUSH_POS:
INC_LOOKAROUND_NESTING; lookaround:
{
OnigCacheOpcode* cache_opcodes_in_lookaround = cache_opcodes;
result = init_cache_opcodes_inner(reg, current_repeat_mem, lookaround_nesting + 1, &cache_opcodes, &p, &cache_point);
if (result != 0) {
goto fail;
}
UChar* match_addr = p - 1;
for (; cache_opcodes_in_lookaround < cache_opcodes; cache_opcodes_in_lookaround++) {
if (cache_opcodes_in_lookaround->match_addr == NULL) {
cache_opcodes_in_lookaround->match_addr = match_addr;
}
}
}
break; break;
case OP_PUSH_POS_NOT: case OP_PUSH_POS_NOT:
p += SIZE_RELADDR; p += SIZE_RELADDR;
INC_LOOKAROUND_NESTING; goto lookaround;
break;
case OP_PUSH_LOOK_BEHIND_NOT: case OP_PUSH_LOOK_BEHIND_NOT:
p += SIZE_RELADDR; p += SIZE_RELADDR;
p += SIZE_LENGTH; p += SIZE_LENGTH;
INC_LOOKAROUND_NESTING; goto lookaround;
case OP_PUSH_STOP_BT:
{
OnigCacheOpcode* cache_opcodes_in_atomic = cache_opcodes;
result = init_cache_opcodes_inner(reg, current_repeat_mem, -1, &cache_opcodes, &p, &cache_point);
if (result != 0) {
goto fail;
}
UChar* match_addr = p - 1;
for (; cache_opcodes_in_atomic < cache_opcodes; cache_opcodes_in_atomic++) {
if (cache_opcodes_in_atomic->match_addr == NULL) {
cache_opcodes_in_atomic->match_addr = match_addr;
}
}
}
break; break;
case OP_POP_POS: case OP_POP_POS:
case OP_FAIL_POS: case OP_FAIL_POS:
case OP_FAIL_LOOK_BEHIND_NOT: case OP_FAIL_LOOK_BEHIND_NOT:
DEC_LOOKAROUND_NESTING;
break;
case OP_PUSH_STOP_BT:
INC_LOOKAROUND_NESTING;
lookaround_nesting *= -1;
break;
case OP_POP_STOP_BT: case OP_POP_STOP_BT:
lookaround_nesting *= -1; goto exit;
DEC_LOOKAROUND_NESTING;
break;
case OP_LOOK_BEHIND: case OP_LOOK_BEHIND:
p += SIZE_LENGTH; p += SIZE_LENGTH;
break; break;
@ -790,15 +823,35 @@ init_cache_opcodes(const regex_t* reg, OnigCacheOpcode* cache_opcodes, long* num
} }
} }
exit:
*cache_opcodes_ptr = cache_opcodes;
*pp = p;
*num_cache_points_ptr = cache_point; *num_cache_points_ptr = cache_point;
return 0; return 0;
fail:
return result;
unexpected_bytecode_error: unexpected_bytecode_error:
return ONIGERR_UNEXPECTED_BYTECODE; return ONIGERR_UNEXPECTED_BYTECODE;
bytecode_error: bytecode_error:
return ONIGERR_UNDEFINED_BYTECODE; return ONIGERR_UNDEFINED_BYTECODE;
} }
/* collect cache opcodes from the given regex program, and compute the total number of cache points. */
static OnigPosition
init_cache_opcodes(const regex_t* reg, OnigCacheOpcode* cache_opcodes_ptr, long* num_cache_points_ptr)
{
UChar* p = reg->p;
*num_cache_points_ptr = 0;
OnigPosition result = init_cache_opcodes_inner(reg, -1, 0, &cache_opcodes_ptr, &p, num_cache_points_ptr);
if (result == 0 && p != reg->p + reg->used) {
return ONIGERR_UNDEFINED_BYTECODE;
}
return result;
}
#else #else
static OnigPosition static OnigPosition
count_num_cache_opcodes(regex_t* reg, long* num_cache_opcodes) count_num_cache_opcodes(regex_t* reg, long* num_cache_opcodes)

View File

@ -2016,6 +2016,18 @@ class TestRegexp < Test::Unit::TestCase
assert(/^(?:.+){2,4}?b|b/.match? "aaaabaa") assert(/^(?:.+){2,4}?b|b/.match? "aaaabaa")
end end
def test_bug_20207 # [Bug #20207]
assert(!'clan'.match?(/(?=.*a)(?!.*n)/))
end
def test_bug_20212 # [Bug #20212]
regex = Regexp.new(
/\A((?=.*?[a-z])(?!.*--)[a-z\d]+[a-z\d-]*[a-z\d]+).((?=.*?[a-z])(?!.*--)[a-z\d]+[a-z\d-]*[a-z\d]+).((?=.*?[a-z])(?!.*--)[a-zd]+[a-zd-]*[a-zd]+).((?=.*?[a-z])(?!.*--)[a-zd]+[a-zd-]*[a-zd]+)\Z/x
)
string = "www.google.com"
100.times.each { assert(regex.match?(string)) }
end
def test_linear_time_p def test_linear_time_p
assert_send [Regexp, :linear_time?, /a/] assert_send [Regexp, :linear_time?, /a/]
assert_send [Regexp, :linear_time?, 'a'] assert_send [Regexp, :linear_time?, 'a']