Handle unterminated unicode escapes in regexps
This fixes an infinite loop that became possible after commit ec3542229b29ec93062e9d90e877ea29d3c19472.

For \u{} escapes in regexps, skip validation in the parser and rely on the regexp code to handle validation. This is necessary so that invalid Unicode escapes inside comments in extended regexps are allowed.

Fixes [Bug #19750]

Co-authored-by: Nobuyoshi Nakada <nobu@ruby-lang.org>
This commit is contained in:
parent
41779fede0
commit
1bc8838d60
Notes:
git
2023-07-01 02:38:12 +00:00
Merged: https://github.com/ruby/ruby/pull/8003
Merged-By: jeremyevans <code@jeremyevans.net>
97
parse.y
97
parse.y
@ -7279,6 +7279,8 @@ tokadd_codepoint(struct parser_params *p, rb_encoding **encp,
|
|||||||
return TRUE;
|
return TRUE;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static int tokadd_mbchar(struct parser_params *p, int c);
|
||||||
|
|
||||||
/* return value is for ?\u3042 */
|
/* return value is for ?\u3042 */
|
||||||
static void
|
static void
|
||||||
tokadd_utf8(struct parser_params *p, rb_encoding **encp,
|
tokadd_utf8(struct parser_params *p, rb_encoding **encp,
|
||||||
@ -7296,44 +7298,71 @@ tokadd_utf8(struct parser_params *p, rb_encoding **encp,
|
|||||||
if (regexp_literal) { tokadd(p, '\\'); tokadd(p, 'u'); }
|
if (regexp_literal) { tokadd(p, '\\'); tokadd(p, 'u'); }
|
||||||
|
|
||||||
if (peek(p, open_brace)) { /* handle \u{...} form */
|
if (peek(p, open_brace)) { /* handle \u{...} form */
|
||||||
const char *second = NULL;
|
if (regexp_literal && p->lex.strterm->u.literal.u1.func == str_regexp) {
|
||||||
int c, last = nextc(p);
|
/*
|
||||||
if (p->lex.pcur >= p->lex.pend) goto unterminated;
|
* Skip parsing validation code and copy bytes as-is until term or
|
||||||
while (ISSPACE(c = *p->lex.pcur) && ++p->lex.pcur < p->lex.pend);
|
* closing brace, in order to correctly handle extended regexps where
|
||||||
while (c != close_brace) {
|
* invalid unicode escapes are allowed in comments. The regexp parser
|
||||||
if (c == term) goto unterminated;
|
* does its own validation and will catch any issues.
|
||||||
if (second == multiple_codepoints)
|
*/
|
||||||
second = p->lex.pcur;
|
int c = *p->lex.pcur;
|
||||||
if (regexp_literal) tokadd(p, last);
|
tokadd(p, c);
|
||||||
if (!tokadd_codepoint(p, encp, regexp_literal, TRUE)) {
|
for (c = *++p->lex.pcur; p->lex.pcur < p->lex.pend; c = *++p->lex.pcur) {
|
||||||
break;
|
if (c == close_brace) {
|
||||||
|
tokadd(p, c);
|
||||||
|
++p->lex.pcur;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
else if (c == term) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
if (c == '\\' && p->lex.pcur + 1 < p->lex.pend) {
|
||||||
|
tokadd(p, c);
|
||||||
|
c = *++p->lex.pcur;
|
||||||
|
}
|
||||||
|
tokadd_mbchar(p, c);
|
||||||
}
|
}
|
||||||
while (ISSPACE(c = *p->lex.pcur)) {
|
}
|
||||||
if (++p->lex.pcur >= p->lex.pend) goto unterminated;
|
else {
|
||||||
last = c;
|
const char *second = NULL;
|
||||||
|
int c, last = nextc(p);
|
||||||
|
if (p->lex.pcur >= p->lex.pend) goto unterminated;
|
||||||
|
while (ISSPACE(c = *p->lex.pcur) && ++p->lex.pcur < p->lex.pend);
|
||||||
|
while (c != close_brace) {
|
||||||
|
if (c == term) goto unterminated;
|
||||||
|
if (second == multiple_codepoints)
|
||||||
|
second = p->lex.pcur;
|
||||||
|
if (regexp_literal) tokadd(p, last);
|
||||||
|
if (!tokadd_codepoint(p, encp, regexp_literal, TRUE)) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
while (ISSPACE(c = *p->lex.pcur)) {
|
||||||
|
if (++p->lex.pcur >= p->lex.pend) goto unterminated;
|
||||||
|
last = c;
|
||||||
|
}
|
||||||
|
if (term == -1 && !second)
|
||||||
|
second = multiple_codepoints;
|
||||||
}
|
}
|
||||||
if (term == -1 && !second)
|
|
||||||
second = multiple_codepoints;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (c != close_brace) {
|
if (c != close_brace) {
|
||||||
unterminated:
|
unterminated:
|
||||||
token_flush(p);
|
token_flush(p);
|
||||||
yyerror0("unterminated Unicode escape");
|
yyerror0("unterminated Unicode escape");
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (second && second != multiple_codepoints) {
|
if (second && second != multiple_codepoints) {
|
||||||
const char *pcur = p->lex.pcur;
|
const char *pcur = p->lex.pcur;
|
||||||
p->lex.pcur = second;
|
p->lex.pcur = second;
|
||||||
dispatch_scan_event(p, tSTRING_CONTENT);
|
dispatch_scan_event(p, tSTRING_CONTENT);
|
||||||
token_flush(p);
|
token_flush(p);
|
||||||
p->lex.pcur = pcur;
|
p->lex.pcur = pcur;
|
||||||
yyerror0(multiple_codepoints);
|
yyerror0(multiple_codepoints);
|
||||||
token_flush(p);
|
token_flush(p);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (regexp_literal) tokadd(p, close_brace);
|
if (regexp_literal) tokadd(p, close_brace);
|
||||||
nextc(p);
|
nextc(p);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
else { /* handle \uxxxx form */
|
else { /* handle \uxxxx form */
|
||||||
if (!tokadd_codepoint(p, encp, regexp_literal, FALSE)) {
|
if (!tokadd_codepoint(p, encp, regexp_literal, FALSE)) {
|
||||||
|
@ -1052,6 +1052,22 @@ x = __ENCODING__
|
|||||||
assert_syntax_error(" 0b\n", /\^/)
|
assert_syntax_error(" 0b\n", /\^/)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
def test_unclosed_unicode_escape_at_eol_bug_19750
|
||||||
|
assert_separately([], "#{<<-"begin;"}\n#{<<~'end;'}")
|
||||||
|
begin;
|
||||||
|
assert_syntax_error("/\\u", /too short escape sequence/)
|
||||||
|
assert_syntax_error("/\\u{", /unterminated regexp meets end of file/)
|
||||||
|
assert_syntax_error("/\\u{\\n", /invalid Unicode list/)
|
||||||
|
assert_syntax_error("/a#\\u{\\n/", /invalid Unicode list/)
|
||||||
|
re = eval("/a#\\u{\n$/x")
|
||||||
|
assert_match(re, 'a')
|
||||||
|
assert_not_match(re, 'a#')
|
||||||
|
re = eval("/a#\\u\n$/x")
|
||||||
|
assert_match(re, 'a')
|
||||||
|
assert_not_match(re, 'a#')
|
||||||
|
end;
|
||||||
|
end
|
||||||
|
|
||||||
def test_error_def_in_argument
|
def test_error_def_in_argument
|
||||||
assert_separately([], "#{<<-"begin;"}\n#{<<~"end;"}")
|
assert_separately([], "#{<<-"begin;"}\n#{<<~"end;"}")
|
||||||
begin;
|
begin;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user