diff --git a/prism/config.yml b/prism/config.yml index 6ead7a9d95..d26af5c66b 100644 --- a/prism/config.yml +++ b/prism/config.yml @@ -81,6 +81,7 @@ errors: - ESCAPE_INVALID_META_REPEAT - ESCAPE_INVALID_UNICODE - ESCAPE_INVALID_UNICODE_CM_FLAGS + - ESCAPE_INVALID_UNICODE_LIST - ESCAPE_INVALID_UNICODE_LITERAL - ESCAPE_INVALID_UNICODE_LONG - ESCAPE_INVALID_UNICODE_SHORT diff --git a/prism/prism.c b/prism/prism.c index a706881e3e..fa153333bf 100644 --- a/prism/prism.c +++ b/prism/prism.c @@ -9718,11 +9718,27 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expre const uint8_t *start = parser->current.end - 1; parser->current.end++; - if (peek(parser) == '{') { + if (parser->current.end == parser->end) { + const uint8_t *start = parser->current.end - 2; + PM_PARSER_ERR_FORMAT(parser, start, parser->current.end, PM_ERR_ESCAPE_INVALID_UNICODE_SHORT, 2, start); + } else if (peek(parser) == '{') { const uint8_t *unicode_codepoints_start = parser->current.end - 2; - parser->current.end++; - parser->current.end += pm_strspn_whitespace(parser->current.end, parser->end - parser->current.end); + + size_t whitespace; + while (true) { + if ((whitespace = pm_strspn_whitespace(parser->current.end, parser->end - parser->current.end)) > 0) { + parser->current.end += whitespace; + } else if (peek(parser) == '\\' && peek_offset(parser, 1) == 'n') { + // This is super hacky, but it gets us nicer error + // messages because we can still pass it off to the + // regular expression engine even if we hit an + // unterminated regular expression. + parser->current.end += 2; + } else { + break; + } + } const uint8_t *extra_codepoints_start = NULL; int codepoints_count = 0; @@ -9736,8 +9752,17 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expre pm_parser_err(parser, unicode_start, unicode_start + hexadecimal_length, PM_ERR_ESCAPE_INVALID_UNICODE_LONG); } else if (hexadecimal_length == 0) { // there are not hexadecimal characters - pm_parser_err(parser, parser->current.end, parser->current.end, PM_ERR_ESCAPE_INVALID_UNICODE); - pm_parser_err(parser, parser->current.end, parser->current.end, PM_ERR_ESCAPE_INVALID_UNICODE_TERM); + + if (flags & PM_ESCAPE_FLAG_REGEXP) { + // If this is a regular expression, we are going to + // let the regular expression engine handle this + // error instead of us. + pm_buffer_append_bytes(regular_expression_buffer, start, (size_t) (parser->current.end - start)); + } else { + pm_parser_err(parser, parser->current.end, parser->current.end, PM_ERR_ESCAPE_INVALID_UNICODE); + pm_parser_err(parser, parser->current.end, parser->current.end, PM_ERR_ESCAPE_INVALID_UNICODE_TERM); + } + return; } @@ -9759,10 +9784,19 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expre pm_parser_err(parser, extra_codepoints_start, parser->current.end - 1, PM_ERR_ESCAPE_INVALID_UNICODE_LITERAL); } - if (peek(parser) == '}') { + if (parser->current.end == parser->end) { + PM_PARSER_ERR_FORMAT(parser, start, parser->current.end, PM_ERR_ESCAPE_INVALID_UNICODE_LIST, (int) (parser->current.end - start), start); + } else if (peek(parser) == '}') { parser->current.end++; } else { - pm_parser_err(parser, unicode_codepoints_start, parser->current.end, PM_ERR_ESCAPE_INVALID_UNICODE_TERM); + if (flags & PM_ESCAPE_FLAG_REGEXP) { + // If this is a regular expression, we are going to let + // the regular expression engine handle this error + // instead of us. + pm_buffer_append_bytes(regular_expression_buffer, start, (size_t) (parser->current.end - start)); + } else { + pm_parser_err(parser, unicode_codepoints_start, parser->current.end, PM_ERR_ESCAPE_INVALID_UNICODE_TERM); + } } if (flags & PM_ESCAPE_FLAG_REGEXP) { @@ -9772,8 +9806,12 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expre size_t length = pm_strspn_hexadecimal_digit(parser->current.end, MIN(parser->end - parser->current.end, 4)); if (length == 0) { - const uint8_t *start = parser->current.end - 2; - PM_PARSER_ERR_FORMAT(parser, start, parser->current.end, PM_ERR_ESCAPE_INVALID_UNICODE_SHORT, 2, start); + if (flags & PM_ESCAPE_FLAG_REGEXP) { + pm_buffer_append_bytes(regular_expression_buffer, start, (size_t) (parser->current.end - start)); + } else { + const uint8_t *start = parser->current.end - 2; + PM_PARSER_ERR_FORMAT(parser, start, parser->current.end, PM_ERR_ESCAPE_INVALID_UNICODE_SHORT, 2, start); + } } else if (length == 4) { uint32_t value = escape_unicode(parser, parser->current.end, 4); @@ -9785,7 +9823,15 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expre parser->current.end += 4; } else { parser->current.end += length; - pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_UNICODE); + + if (flags & PM_ESCAPE_FLAG_REGEXP) { + // If this is a regular expression, we are going to let + // the regular expression engine handle this error + // instead of us. + pm_buffer_append_bytes(regular_expression_buffer, start, (size_t) (parser->current.end - start)); + } else { + pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_UNICODE); + } } } diff --git a/prism/templates/src/diagnostic.c.erb b/prism/templates/src/diagnostic.c.erb index d3fabb8449..7b63d2dd0f 100644 --- a/prism/templates/src/diagnostic.c.erb +++ b/prism/templates/src/diagnostic.c.erb @@ -165,6 +165,7 @@ static const pm_diagnostic_data_t diagnostic_messages[PM_DIAGNOSTIC_ID_MAX] = { [PM_ERR_ESCAPE_INVALID_META_REPEAT] = { "invalid meta escape sequence; meta cannot be repeated", PM_ERROR_LEVEL_SYNTAX }, [PM_ERR_ESCAPE_INVALID_UNICODE] = { "invalid Unicode escape sequence", PM_ERROR_LEVEL_SYNTAX }, [PM_ERR_ESCAPE_INVALID_UNICODE_CM_FLAGS] = { "invalid Unicode escape sequence; Unicode cannot be combined with control or meta flags", PM_ERROR_LEVEL_SYNTAX }, + [PM_ERR_ESCAPE_INVALID_UNICODE_LIST] = { "invalid Unicode list: %.*s", PM_ERROR_LEVEL_SYNTAX }, [PM_ERR_ESCAPE_INVALID_UNICODE_LITERAL] = { "invalid Unicode escape sequence; Multiple codepoints at single character literal are disallowed", PM_ERROR_LEVEL_SYNTAX }, [PM_ERR_ESCAPE_INVALID_UNICODE_LONG] = { "invalid Unicode escape sequence; maximum length is 6 digits", PM_ERROR_LEVEL_SYNTAX }, [PM_ERR_ESCAPE_INVALID_UNICODE_SHORT] = { "too short escape sequence: %.*s", PM_ERROR_LEVEL_SYNTAX }, diff --git a/test/prism/errors/regexp_unicode_too_short.txt b/test/prism/errors/regexp_unicode_too_short.txt index a7638b2712..b6504ec9f9 100644 --- a/test/prism/errors/regexp_unicode_too_short.txt +++ b/test/prism/errors/regexp_unicode_too_short.txt @@ -1,3 +1,4 @@ -/\u/ +/\u ^~ too short escape sequence: \u +^ unterminated regexp meets end of file; expected a closing delimiter diff --git a/test/prism/errors/unterminated_unicode_brackets_should_be_a_syntax_error.txt b/test/prism/errors/unterminated_unicode_brackets_should_be_a_syntax_error.txt index 1a65c6149a..f3ac9f29ab 100644 --- a/test/prism/errors/unterminated_unicode_brackets_should_be_a_syntax_error.txt +++ b/test/prism/errors/unterminated_unicode_brackets_should_be_a_syntax_error.txt @@ -1,3 +1,3 @@ ?\u{3 - ^~~~ unterminated Unicode escape + ^~~~ invalid Unicode list: \u{3 diff --git a/test/prism/unescape_test.rb b/test/prism/unescape_test.rb index 24a9e3f6bc..f9e5a60e45 100644 --- a/test/prism/unescape_test.rb +++ b/test/prism/unescape_test.rb @@ -159,6 +159,11 @@ module Prism # to validate backreferences so these are all going to fail. next if (context.name == "//" || context.name.start_with?("%r")) && ord.chr.start_with?(/\d/) + # \u is passed directly on to the regular expression engine and it is + # responsible for handling syntax errors. In this case we do not check + # it because it would require going through the compiler. + next if context.is_a?(Context::RegExp) && ord.chr == "u" + # \a \b \c ... assert_unescape(context, ord.chr) end