[ruby/prism] Pass Unicode escapes on to onigmo

When we encounter an invalid Unicode escape within a regular expression, we now pass that error on to Onigmo as if it didn't exist in the parser (which matches the upstream parser's behavior). We do this because there are tests that specify that you are allowed to have invalid Unicode escapes if they appear inside a regular expression comment in a regular expression in extended mode. That looks like: `/# \u /x`. Note that this _only_ applies to Unicode escapes (as opposed to applying to hex or meta/control escapes as well). Importantly, it also only applies if the regular expression is terminated. An unterminated regular expression will still get error handling done in the parser. That would look like: `/# \u`, which would result in the same error handling we have today. https://github.com/ruby/prism/commit/fb98034806
2024-08-23 15:03:39 -04:00 · 2024-08-23 15:03:39 -04:00 · 3eb42054d9
commit 3eb42054d9
parent 3f6be01bfc
6 changed files with 66 additions and 12 deletions
--- a/prism/config.yml
+++ b/prism/config.yml
@ -81,6 +81,7 @@ errors:
  - ESCAPE_INVALID_META_REPEAT
  - ESCAPE_INVALID_UNICODE
  - ESCAPE_INVALID_UNICODE_CM_FLAGS
+  - ESCAPE_INVALID_UNICODE_LIST
  - ESCAPE_INVALID_UNICODE_LITERAL
  - ESCAPE_INVALID_UNICODE_LONG
  - ESCAPE_INVALID_UNICODE_SHORT
--- a/prism/prism.c
+++ b/prism/prism.c
@ -9718,11 +9718,27 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expre
            const uint8_t *start = parser->current.end - 1;
            parser->current.end++;

-            if (peek(parser) == '{') {
+            if (parser->current.end == parser->end) {
+                const uint8_t *start = parser->current.end - 2;
+                PM_PARSER_ERR_FORMAT(parser, start, parser->current.end, PM_ERR_ESCAPE_INVALID_UNICODE_SHORT, 2, start);
+            } else if (peek(parser) == '{') {
                const uint8_t *unicode_codepoints_start = parser->current.end - 2;
-
                parser->current.end++;
-                parser->current.end += pm_strspn_whitespace(parser->current.end, parser->end - parser->current.end);
+
+                size_t whitespace;
+                while (true) {
+                    if ((whitespace = pm_strspn_whitespace(parser->current.end, parser->end - parser->current.end)) > 0) {
+                        parser->current.end += whitespace;
+                    } else if (peek(parser) == '\\' && peek_offset(parser, 1) == 'n') {
+                        // This is super hacky, but it gets us nicer error
+                        // messages because we can still pass it off to the
+                        // regular expression engine even if we hit an
+                        // unterminated regular expression.
+                        parser->current.end += 2;
+                    } else {
+                        break;
+                    }
+                }

                const uint8_t *extra_codepoints_start = NULL;
                int codepoints_count = 0;
@ -9736,8 +9752,17 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expre
                        pm_parser_err(parser, unicode_start, unicode_start + hexadecimal_length, PM_ERR_ESCAPE_INVALID_UNICODE_LONG);
                    } else if (hexadecimal_length == 0) {
                        // there are not hexadecimal characters
-                        pm_parser_err(parser, parser->current.end, parser->current.end, PM_ERR_ESCAPE_INVALID_UNICODE);
-                        pm_parser_err(parser, parser->current.end, parser->current.end, PM_ERR_ESCAPE_INVALID_UNICODE_TERM);
+
+                        if (flags & PM_ESCAPE_FLAG_REGEXP) {
+                            // If this is a regular expression, we are going to
+                            // let the regular expression engine handle this
+                            // error instead of us.
+                            pm_buffer_append_bytes(regular_expression_buffer, start, (size_t) (parser->current.end - start));
+                        } else {
+                            pm_parser_err(parser, parser->current.end, parser->current.end, PM_ERR_ESCAPE_INVALID_UNICODE);
+                            pm_parser_err(parser, parser->current.end, parser->current.end, PM_ERR_ESCAPE_INVALID_UNICODE_TERM);
+                        }
+
                        return;
                    }

@ -9759,10 +9784,19 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expre
                    pm_parser_err(parser, extra_codepoints_start, parser->current.end - 1, PM_ERR_ESCAPE_INVALID_UNICODE_LITERAL);
                }

-                if (peek(parser) == '}') {
+                if (parser->current.end == parser->end) {
+                    PM_PARSER_ERR_FORMAT(parser, start, parser->current.end, PM_ERR_ESCAPE_INVALID_UNICODE_LIST, (int) (parser->current.end - start), start);
+                } else if (peek(parser) == '}') {
                    parser->current.end++;
                } else {
-                    pm_parser_err(parser, unicode_codepoints_start, parser->current.end, PM_ERR_ESCAPE_INVALID_UNICODE_TERM);
+                    if (flags & PM_ESCAPE_FLAG_REGEXP) {
+                        // If this is a regular expression, we are going to let
+                        // the regular expression engine handle this error
+                        // instead of us.
+                        pm_buffer_append_bytes(regular_expression_buffer, start, (size_t) (parser->current.end - start));
+                    } else {
+                        pm_parser_err(parser, unicode_codepoints_start, parser->current.end, PM_ERR_ESCAPE_INVALID_UNICODE_TERM);
+                    }
                }

                if (flags & PM_ESCAPE_FLAG_REGEXP) {
@ -9772,8 +9806,12 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expre
                size_t length = pm_strspn_hexadecimal_digit(parser->current.end, MIN(parser->end - parser->current.end, 4));

                if (length == 0) {
-                    const uint8_t *start = parser->current.end - 2;
-                    PM_PARSER_ERR_FORMAT(parser, start, parser->current.end, PM_ERR_ESCAPE_INVALID_UNICODE_SHORT, 2, start);
+                    if (flags & PM_ESCAPE_FLAG_REGEXP) {
+                        pm_buffer_append_bytes(regular_expression_buffer, start, (size_t) (parser->current.end - start));
+                    } else {
+                        const uint8_t *start = parser->current.end - 2;
+                        PM_PARSER_ERR_FORMAT(parser, start, parser->current.end, PM_ERR_ESCAPE_INVALID_UNICODE_SHORT, 2, start);
+                    }
                } else if (length == 4) {
                    uint32_t value = escape_unicode(parser, parser->current.end, 4);

@ -9785,7 +9823,15 @@ escape_read(pm_parser_t *parser, pm_buffer_t *buffer, pm_buffer_t *regular_expre
                    parser->current.end += 4;
                } else {
                    parser->current.end += length;
-                    pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_UNICODE);
+
+                    if (flags & PM_ESCAPE_FLAG_REGEXP) {
+                        // If this is a regular expression, we are going to let
+                        // the regular expression engine handle this error
+                        // instead of us.
+                        pm_buffer_append_bytes(regular_expression_buffer, start, (size_t) (parser->current.end - start));
+                    } else {
+                        pm_parser_err_current(parser, PM_ERR_ESCAPE_INVALID_UNICODE);
+                    }
                }
            }

--- a/prism/templates/src/diagnostic.c.erb
+++ b/prism/templates/src/diagnostic.c.erb
@ -165,6 +165,7 @@ static const pm_diagnostic_data_t diagnostic_messages[PM_DIAGNOSTIC_ID_MAX] = {
    [PM_ERR_ESCAPE_INVALID_META_REPEAT]         = { "invalid meta escape sequence; meta cannot be repeated", PM_ERROR_LEVEL_SYNTAX },
    [PM_ERR_ESCAPE_INVALID_UNICODE]             = { "invalid Unicode escape sequence", PM_ERROR_LEVEL_SYNTAX },
    [PM_ERR_ESCAPE_INVALID_UNICODE_CM_FLAGS]    = { "invalid Unicode escape sequence; Unicode cannot be combined with control or meta flags", PM_ERROR_LEVEL_SYNTAX },
+    [PM_ERR_ESCAPE_INVALID_UNICODE_LIST]        = { "invalid Unicode list: %.*s", PM_ERROR_LEVEL_SYNTAX },
    [PM_ERR_ESCAPE_INVALID_UNICODE_LITERAL]     = { "invalid Unicode escape sequence; Multiple codepoints at single character literal are disallowed", PM_ERROR_LEVEL_SYNTAX },
    [PM_ERR_ESCAPE_INVALID_UNICODE_LONG]        = { "invalid Unicode escape sequence; maximum length is 6 digits", PM_ERROR_LEVEL_SYNTAX },
    [PM_ERR_ESCAPE_INVALID_UNICODE_SHORT]       = { "too short escape sequence: %.*s", PM_ERROR_LEVEL_SYNTAX },
--- a/test/prism/errors/regexp_unicode_too_short.txt
+++ b/test/prism/errors/regexp_unicode_too_short.txt
@ -1,3 +1,4 @@
-/\u/
+/\u
 ^~ too short escape sequence: \u
+^ unterminated regexp meets end of file; expected a closing delimiter

--- a/test/prism/errors/unterminated_unicode_brackets_should_be_a_syntax_error.txt
+++ b/test/prism/errors/unterminated_unicode_brackets_should_be_a_syntax_error.txt
@ -1,3 +1,3 @@
 ?\u{3
- ^~~~ unterminated Unicode escape
+ ^~~~ invalid Unicode list: \u{3

--- a/test/prism/unescape_test.rb
+++ b/test/prism/unescape_test.rb
@ -159,6 +159,11 @@ module Prism
        # to validate backreferences so these are all going to fail.
        next if (context.name == "//" || context.name.start_with?("%r")) && ord.chr.start_with?(/\d/)

+        # \u is passed directly on to the regular expression engine and it is
+        # responsible for handling syntax errors. In this case we do not check
+        # it because it would require going through the compiler.
+        next if context.is_a?(Context::RegExp) && ord.chr == "u"
+
        # \a \b \c ...
        assert_unescape(context, ord.chr)
      end