diff --git a/parse.y b/parse.y index 13cb3842d5..abc980d6fc 100644 --- a/parse.y +++ b/parse.y @@ -7279,6 +7279,8 @@ tokadd_codepoint(struct parser_params *p, rb_encoding **encp, return TRUE; } +static int tokadd_mbchar(struct parser_params *p, int c); + /* return value is for ?\u3042 */ static void tokadd_utf8(struct parser_params *p, rb_encoding **encp, @@ -7296,44 +7298,71 @@ tokadd_utf8(struct parser_params *p, rb_encoding **encp, if (regexp_literal) { tokadd(p, '\\'); tokadd(p, 'u'); } if (peek(p, open_brace)) { /* handle \u{...} form */ - const char *second = NULL; - int c, last = nextc(p); - if (p->lex.pcur >= p->lex.pend) goto unterminated; - while (ISSPACE(c = *p->lex.pcur) && ++p->lex.pcur < p->lex.pend); - while (c != close_brace) { - if (c == term) goto unterminated; - if (second == multiple_codepoints) - second = p->lex.pcur; - if (regexp_literal) tokadd(p, last); - if (!tokadd_codepoint(p, encp, regexp_literal, TRUE)) { - break; + if (regexp_literal && p->lex.strterm->u.literal.u1.func == str_regexp) { + /* + * Skip parsing validation code and copy bytes as-is until term or + * closing brace, in order to correctly handle extended regexps where + * invalid unicode escapes are allowed in comments. The regexp parser + * does its own validation and will catch any issues. + */ + int c = *p->lex.pcur; + tokadd(p, c); + for (c = *++p->lex.pcur; p->lex.pcur < p->lex.pend; c = *++p->lex.pcur) { + if (c == close_brace) { + tokadd(p, c); + ++p->lex.pcur; + break; + } + else if (c == term) { + break; + } + if (c == '\\' && p->lex.pcur + 1 < p->lex.pend) { + tokadd(p, c); + c = *++p->lex.pcur; + } + tokadd_mbchar(p, c); } - while (ISSPACE(c = *p->lex.pcur)) { - if (++p->lex.pcur >= p->lex.pend) goto unterminated; - last = c; + } + else { + const char *second = NULL; + int c, last = nextc(p); + if (p->lex.pcur >= p->lex.pend) goto unterminated; + while (ISSPACE(c = *p->lex.pcur) && ++p->lex.pcur < p->lex.pend); + while (c != close_brace) { + if (c == term) goto unterminated; + if (second == multiple_codepoints) + second = p->lex.pcur; + if (regexp_literal) tokadd(p, last); + if (!tokadd_codepoint(p, encp, regexp_literal, TRUE)) { + break; + } + while (ISSPACE(c = *p->lex.pcur)) { + if (++p->lex.pcur >= p->lex.pend) goto unterminated; + last = c; + } + if (term == -1 && !second) + second = multiple_codepoints; } - if (term == -1 && !second) - second = multiple_codepoints; - } - if (c != close_brace) { - unterminated: - token_flush(p); - yyerror0("unterminated Unicode escape"); - return; - } - if (second && second != multiple_codepoints) { - const char *pcur = p->lex.pcur; - p->lex.pcur = second; - dispatch_scan_event(p, tSTRING_CONTENT); - token_flush(p); - p->lex.pcur = pcur; - yyerror0(multiple_codepoints); - token_flush(p); - } + if (c != close_brace) { + unterminated: + token_flush(p); + yyerror0("unterminated Unicode escape"); + return; + } + if (second && second != multiple_codepoints) { + const char *pcur = p->lex.pcur; + p->lex.pcur = second; + dispatch_scan_event(p, tSTRING_CONTENT); + token_flush(p); + p->lex.pcur = pcur; + yyerror0(multiple_codepoints); + token_flush(p); + } - if (regexp_literal) tokadd(p, close_brace); - nextc(p); + if (regexp_literal) tokadd(p, close_brace); + nextc(p); + } } else { /* handle \uxxxx form */ if (!tokadd_codepoint(p, encp, regexp_literal, FALSE)) { diff --git a/test/ruby/test_parse.rb b/test/ruby/test_parse.rb index bf0d9f1bd5..cf989d190b 100644 --- a/test/ruby/test_parse.rb +++ b/test/ruby/test_parse.rb @@ -1052,6 +1052,22 @@ x = __ENCODING__ assert_syntax_error(" 0b\n", /\^/) end + def test_unclosed_unicode_escape_at_eol_bug_19750 + assert_separately([], "#{<<-"begin;"}\n#{<<~'end;'}") + begin; + assert_syntax_error("/\\u", /too short escape sequence/) + assert_syntax_error("/\\u{", /unterminated regexp meets end of file/) + assert_syntax_error("/\\u{\\n", /invalid Unicode list/) + assert_syntax_error("/a#\\u{\\n/", /invalid Unicode list/) + re = eval("/a#\\u{\n$/x") + assert_match(re, 'a') + assert_not_match(re, 'a#') + re = eval("/a#\\u\n$/x") + assert_match(re, 'a') + assert_not_match(re, 'a#') + end; + end + def test_error_def_in_argument assert_separately([], "#{<<-"begin;"}\n#{<<~"end;"}") begin;