[ruby/prism] Pass Unicode escapes on to Onigmo

When we encounter an invalid Unicode escape within a regular
expression, we no longer report it in the parser. Instead we pass the
escape through to Onigmo unchanged and let it report the error (which
matches the upstream parser's behavior).

We do this because there are tests specifying that invalid Unicode
escapes are allowed when they appear inside a comment in an
extended-mode regular expression. That looks like:

    /# \u /x

Note that this _only_ applies to Unicode escapes (as opposed to
applying to hex or meta/control escapes as well). Importantly it
also only applies if the regular expression is terminated. An
unterminated regular expression will still get error handling done
in the parser. That would look like:

    /# \u

that would result in the same error handling we have today.

https://github.com/ruby/prism/commit/fb98034806
This commit is contained in:
Kevin Newton 2024-08-23 15:03:39 -04:00 committed by git
parent 3f6be01bfc
commit 3eb42054d9
6 changed files with 66 additions and 12 deletions

View File

@ -81,6 +81,7 @@ errors:
- ESCAPE_INVALID_META_REPEAT
- ESCAPE_INVALID_UNICODE
- ESCAPE_INVALID_UNICODE_CM_FLAGS
- ESCAPE_INVALID_UNICODE_LIST
- ESCAPE_INVALID_UNICODE_LITERAL
- ESCAPE_INVALID_UNICODE_LONG
- ESCAPE_INVALID_UNICODE_SHORT

View File

@ -9718,11 +9718,27 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expre
const uint8_t *start = parser->current.end - 1;
parser->current.end++;
if (peek(parser) == '{') {
if (parser->current.end == parser->end) {
const uint8_t *start = parser->current.end - 2;
PM_PARSER_ERR_FORMAT(parser, start, parser->current.end, PM_ERR_ESCAPE_INVALID_UNICODE_SHORT, 2, start);
} else if (peek(parser) == '{') {
const uint8_t *unicode_codepoints_start = parser->current.end - 2;
parser->current.end++;
parser->current.end += pm_strspn_whitespace(parser->current.end, parser->end - parser->current.end);
size_t whitespace;
while (true) {
if ((whitespace = pm_strspn_whitespace(parser->current.end, parser->end - parser->current.end)) > 0) {
parser->current.end += whitespace;
} else if (peek(parser) == '\\' && peek_offset(parser, 1) == 'n') {
// This is super hacky, but it gets us nicer error
// messages because we can still pass it off to the
// regular expression engine even if we hit an
// unterminated regular expression.
parser->current.end += 2;
} else {
break;
}
}
const uint8_t *extra_codepoints_start = NULL;
int codepoints_count = 0;
@ -9736,8 +9752,17 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expre
pm_parser_err(parser, unicode_start, unicode_start + hexadecimal_length, PM_ERR_ESCAPE_INVALID_UNICODE_LONG);
} else if (hexadecimal_length == 0) {
// there are not hexadecimal characters
pm_parser_err(parser, parser->current.end, parser->current.end, PM_ERR_ESCAPE_INVALID_UNICODE);
pm_parser_err(parser, parser->current.end, parser->current.end, PM_ERR_ESCAPE_INVALID_UNICODE_TERM);
if (flags & PM_ESCAPE_FLAG_REGEXP) {
// If this is a regular expression, we are going to
// let the regular expression engine handle this
// error instead of us.
pm_buffer_append_bytes(regular_expression_buffer, start, (size_t) (parser->current.end - start));
} else {
pm_parser_err(parser, parser->current.end, parser->current.end, PM_ERR_ESCAPE_INVALID_UNICODE);
pm_parser_err(parser, parser->current.end, parser->current.end, PM_ERR_ESCAPE_INVALID_UNICODE_TERM);
}
return;
}
@ -9759,10 +9784,19 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expre
pm_parser_err(parser, extra_codepoints_start, parser->current.end - 1, PM_ERR_ESCAPE_INVALID_UNICODE_LITERAL);
}
if (peek(parser) == '}') {
if (parser->current.end == parser->end) {
PM_PARSER_ERR_FORMAT(parser, start, parser->current.end, PM_ERR_ESCAPE_INVALID_UNICODE_LIST, (int) (parser->current.end - start), start);
} else if (peek(parser) == '}') {
parser->current.end++;
} else {
pm_parser_err(parser, unicode_codepoints_start, parser->current.end, PM_ERR_ESCAPE_INVALID_UNICODE_TERM);
if (flags & PM_ESCAPE_FLAG_REGEXP) {
// If this is a regular expression, we are going to let
// the regular expression engine handle this error
// instead of us.
pm_buffer_append_bytes(regular_expression_buffer, start, (size_t) (parser->current.end - start));
} else {
pm_parser_err(parser, unicode_codepoints_start, parser->current.end, PM_ERR_ESCAPE_INVALID_UNICODE_TERM);
}
}
if (flags & PM_ESCAPE_FLAG_REGEXP) {
@ -9772,8 +9806,12 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expre
size_t length = pm_strspn_hexadecimal_digit(parser->current.end, MIN(parser->end - parser->current.end, 4));
if (length == 0) {
const uint8_t *start = parser->current.end - 2;
PM_PARSER_ERR_FORMAT(parser, start, parser->current.end, PM_ERR_ESCAPE_INVALID_UNICODE_SHORT, 2, start);
if (flags & PM_ESCAPE_FLAG_REGEXP) {
pm_buffer_append_bytes(regular_expression_buffer, start, (size_t) (parser->current.end - start));
} else {
const uint8_t *start = parser->current.end - 2;
PM_PARSER_ERR_FORMAT(parser, start, parser->current.end, PM_ERR_ESCAPE_INVALID_UNICODE_SHORT, 2, start);
}
} else if (length == 4) {
uint32_t value = escape_unicode(parser, parser->current.end, 4);
@ -9785,7 +9823,15 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expre
parser->current.end += 4;
} else {
parser->current.end += length;
pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_UNICODE);
if (flags & PM_ESCAPE_FLAG_REGEXP) {
// If this is a regular expression, we are going to let
// the regular expression engine handle this error
// instead of us.
pm_buffer_append_bytes(regular_expression_buffer, start, (size_t) (parser->current.end - start));
} else {
pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_UNICODE);
}
}
}

View File

@ -165,6 +165,7 @@ static const pm_diagnostic_data_t diagnostic_messages[PM_DIAGNOSTIC_ID_MAX] = {
[PM_ERR_ESCAPE_INVALID_META_REPEAT] = { "invalid meta escape sequence; meta cannot be repeated", PM_ERROR_LEVEL_SYNTAX },
[PM_ERR_ESCAPE_INVALID_UNICODE] = { "invalid Unicode escape sequence", PM_ERROR_LEVEL_SYNTAX },
[PM_ERR_ESCAPE_INVALID_UNICODE_CM_FLAGS] = { "invalid Unicode escape sequence; Unicode cannot be combined with control or meta flags", PM_ERROR_LEVEL_SYNTAX },
[PM_ERR_ESCAPE_INVALID_UNICODE_LIST] = { "invalid Unicode list: %.*s", PM_ERROR_LEVEL_SYNTAX },
[PM_ERR_ESCAPE_INVALID_UNICODE_LITERAL] = { "invalid Unicode escape sequence; Multiple codepoints at single character literal are disallowed", PM_ERROR_LEVEL_SYNTAX },
[PM_ERR_ESCAPE_INVALID_UNICODE_LONG] = { "invalid Unicode escape sequence; maximum length is 6 digits", PM_ERROR_LEVEL_SYNTAX },
[PM_ERR_ESCAPE_INVALID_UNICODE_SHORT] = { "too short escape sequence: %.*s", PM_ERROR_LEVEL_SYNTAX },

View File

@ -1,3 +1,4 @@
/\u/
/\u
^~ too short escape sequence: \u
^ unterminated regexp meets end of file; expected a closing delimiter

View File

@ -1,3 +1,3 @@
?\u{3
^~~~ unterminated Unicode escape
^~~~ invalid Unicode list: \u{3

View File

@ -159,6 +159,11 @@ module Prism
# to validate backreferences so these are all going to fail.
next if (context.name == "//" || context.name.start_with?("%r")) && ord.chr.start_with?(/\d/)
# \u is passed directly on to the regular expression engine and it is
# responsible for handling syntax errors. In this case we do not check
# it because it would require going through the compiler.
next if context.is_a?(Context::RegExp) && ord.chr == "u"
# \a \b \c ...
assert_unescape(context, ord.chr)
end