[ruby/prism] Fix parsing heredoc ends

https://github.com/ruby/prism/commit/aa8c702271
2023-12-01 19:59:50 +01:00 · 2023-12-01 19:59:50 +01:00 · 562d949e02
commit 562d949e02
parent 39238888bc
5 changed files with 148 additions and 29 deletions
--- a/prism/prism.c
+++ b/prism/prism.c
@ -9761,24 +9761,43 @@ parser_lex(pm_parser_t *parser) {
            // terminator, then we need to return the ending of the heredoc.
            if (current_token_starts_line(parser)) {
                const uint8_t *start = parser->current.start;
-                size_t whitespace = pm_heredoc_strspn_inline_whitespace(parser, &start, lex_mode->as.heredoc.indent);
+                if (start + ident_length <= parser->end) {
                if ((start + ident_length <= parser->end) && (memcmp(start, ident_start, ident_length) == 0)) {
                    bool matched = true;
                    bool at_end = false;
                    const uint8_t *newline = next_newline(start, parser->end - start);
                    const uint8_t *ident_end = newline;
                    const uint8_t *terminator_end = newline;
-                    size_t eol_length = match_eol_at(parser, start + ident_length);
+                    if (newline == NULL) {
-                    if (eol_length) {
+                        terminator_end = parser->end;
-                        parser->current.end = start + ident_length + eol_length;
+                        ident_end = parser->end;
                        pm_newline_list_append(&parser->newline_list, parser->current.end - 1);
                    } else if (parser->end == (start + ident_length)) {
                        parser->current.end = start + ident_length;
                        at_end = true;
                    } else {
-                        matched = false;
+                        terminator_end++;
                        if (newline[-1] == '\r') {
                            ident_end--; // Remove \r
                        }
                    }
-                    if (matched) {
+                    const uint8_t *terminator_start = ident_end - ident_length;
                    const uint8_t *cursor = start;
                    if (
                        lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_DASH ||
                        lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_TILDE
                    ) {
                        while (cursor < terminator_start && pm_char_is_inline_whitespace(*cursor)) {
                            cursor++;
                        }
                    }
                    if (
                        (cursor == terminator_start) &&
                        (memcmp(terminator_start, ident_start, ident_length) == 0)
                    ) {
                        if (newline != NULL) {
                            pm_newline_list_append(&parser->newline_list, newline);
                        }
                        parser->current.end = terminator_end;
                        if (*lex_mode->as.heredoc.next_start == '\\') {
                            parser->next_start = NULL;
                        } else {
@ -9794,7 +9813,7 @@ parser_lex(pm_parser_t *parser) {
                        LEX(PM_TOKEN_HEREDOC_END);
                    }
                }
-
+                size_t whitespace = pm_heredoc_strspn_inline_whitespace(parser, &start, lex_mode->as.heredoc.indent);
                if (
                    lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_TILDE &&
                    (lex_mode->as.heredoc.common_whitespace > whitespace) &&
@ -9838,6 +9857,42 @@ parser_lex(pm_parser_t *parser) {
                        // If we have a - or ~ heredoc, then we can match after
                        // some leading whitespace.
                        const uint8_t *start = breakpoint + 1;
                        if (!was_escaped_newline && (start + ident_length <= parser->end)) {
                            // We want to match the terminator starting from the end of the line in case
                            // there is whitespace in the ident such as <<-'   DOC' or <<~'   DOC'.
                            const uint8_t *newline = next_newline(start, parser->end - start);
                            if (newline == NULL) {
                                newline = parser->end;
                            } else if (newline[-1] == '\r') {
                                newline--; // Remove \r
                            }
                            // Start of a possible terminator.
                            const uint8_t *terminator_start = newline - ident_length;
                            // Cursor to check for the leading whitespace. We skip the
                            // leading whitespace if we have a - or ~ heredoc.
                            const uint8_t *cursor = start;
                            if (lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_DASH ||
                                lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_TILDE) {
                                while (cursor < terminator_start && pm_char_is_inline_whitespace(*cursor)) {
                                    cursor++;
                                }
                            }
                            if (
                                cursor == terminator_start &&
                                (memcmp(terminator_start, ident_start, ident_length) == 0)
                            ) {
                                parser->current.end = breakpoint + 1;
                                pm_token_buffer_flush(parser, &token_buffer);
                                LEX(PM_TOKEN_STRING_CONTENT);
                            }
                        }
                        size_t whitespace = pm_heredoc_strspn_inline_whitespace(parser, &start, lex_mode->as.heredoc.indent);
                        // If we have hit a newline that is followed by a valid
@ -9845,22 +9900,6 @@ parser_lex(pm_parser_t *parser) {
                        // heredoc here as string content. Then, the next time a
                        // token is lexed, it will match again and return the
                        // end of the heredoc.
                        if (
                            !was_escaped_newline &&
                            (start + ident_length <= parser->end) &&
                            (memcmp(start, ident_start, ident_length) == 0)
                        ) {
                            // Heredoc terminators must be followed by a
                            // newline, CRLF, or EOF to be valid.
                            if (
                                start + ident_length == parser->end ||
                                match_eol_at(parser, start + ident_length)
                            ) {
                                parser->current.end = breakpoint + 1;
                                pm_token_buffer_flush(parser, &token_buffer);
                                LEX(PM_TOKEN_STRING_CONTENT);
                            }
                        }
                        if (lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_TILDE) {
                            if ((lex_mode->as.heredoc.common_whitespace > whitespace) && peek_at(parser, start) != '\n') {
--- a/test/prism/fixtures/heredocs_leading_whitespace.txt
+++ b/test/prism/fixtures/heredocs_leading_whitespace.txt
@ -0,0 +1,19 @@
 <<-'  FOO'
 a
 b
     FOO
 <<-'  FOO'
 a
 b
  FOO
 <<~'  FOO'
 a
 b
     FOO
 <<~'  FOO'
 a
 b
  FOO
--- a/test/prism/locals_test.rb
+++ b/test/prism/locals_test.rb
@ -68,6 +68,13 @@ module Prism
    # HERE
    todos << "seattlerb/heredoc_nested.txt"
    # Ruby < 3.3.0 fails to parse:
    #
    # <<-'  HERE'
    #  foo
    #   HERE
    invalid << "heredocs_leading_whitespace.txt" if RUBY_VERSION < "3.3.0"
    base = File.join(__dir__, "fixtures")
    skips = invalid | todos
--- a/test/prism/parse_test.rb
+++ b/test/prism/parse_test.rb
@ -111,6 +111,11 @@ module Prism
      # Additionally, Ripper cannot parse the %w[] fixture in this file, so set ripper_should_parse to false.
      ripper_should_parse = false if relative == "spanning_heredoc.txt"
      # Ruby < 3.3.0 cannot parse heredocs where there are leading whitespace characters in the heredoc start.
      # Example: <<~'   EOF' or <<-'  EOF'
      # https://bugs.ruby-lang.org/issues/19539
      ripper_should_parse = false if relative == "heredocs_leading_whitespace.txt" && RUBY_VERSION < "3.3.0"
      define_method "test_filepath_#{relative}" do
        # First, read the source from the filepath. Use binmode to avoid converting CRLF on Windows,
        # and explicitly set the external encoding to UTF-8 to override the binmode default.
--- a/test/prism/snapshots/heredocs_leading_whitespace.txt
+++ b/test/prism/snapshots/heredocs_leading_whitespace.txt
@ -0,0 +1,49 @@
@ ProgramNode (location: (1,0)-(16,10))
 ├── locals: []
 └── statements:
    @ StatementsNode (location: (1,0)-(16,10))
    └── body: (length: 4)
        ├── @ StringNode (location: (1,0)-(1,10))
        │   ├── flags: ∅
        │   ├── opening_loc: (1,0)-(1,10) = "<<-'  FOO'"
        │   ├── content_loc: (2,0)-(4,0) = "a\nb\n"
        │   ├── closing_loc: (4,0)-(5,0) = "     FOO\n"
        │   └── unescaped: "a\nb\n"
        ├── @ StringNode (location: (6,0)-(6,10))
        │   ├── flags: ∅
        │   ├── opening_loc: (6,0)-(6,10) = "<<-'  FOO'"
        │   ├── content_loc: (7,0)-(9,0) = "a\nb\n"
        │   ├── closing_loc: (9,0)-(10,0) = "  FOO\n"
        │   └── unescaped: "a\nb\n"
        ├── @ InterpolatedStringNode (location: (11,0)-(11,10))
        │   ├── opening_loc: (11,0)-(11,10) = "<<~'  FOO'"
        │   ├── parts: (length: 2)
        │   │   ├── @ StringNode (location: (12,0)-(13,0))
        │   │   │   ├── flags: ∅
        │   │   │   ├── opening_loc: ∅
        │   │   │   ├── content_loc: (12,0)-(13,0) = "a\n"
        │   │   │   ├── closing_loc: ∅
        │   │   │   └── unescaped: "a\n"
        │   │   └── @ StringNode (location: (13,0)-(14,0))
        │   │       ├── flags: ∅
        │   │       ├── opening_loc: ∅
        │   │       ├── content_loc: (13,0)-(14,0) = "b\n"
        │   │       ├── closing_loc: ∅
        │   │       └── unescaped: "b\n"
        │   └── closing_loc: (14,0)-(15,0) = "     FOO\n"
        └── @ InterpolatedStringNode (location: (16,0)-(16,10))
            ├── opening_loc: (16,0)-(16,10) = "<<~'  FOO'"
            ├── parts: (length: 2)
            │   ├── @ StringNode (location: (17,0)-(18,0))
            │   │   ├── flags: ∅
            │   │   ├── opening_loc: ∅
            │   │   ├── content_loc: (17,0)-(18,0) = "a\n"
            │   │   ├── closing_loc: ∅
            │   │   └── unescaped: "a\n"
            │   └── @ StringNode (location: (18,0)-(19,0))
            │       ├── flags: ∅
            │       ├── opening_loc: ∅
            │       ├── content_loc: (18,0)-(19,0) = "b\n"
            │       ├── closing_loc: ∅
            │       └── unescaped: "b\n"
            └── closing_loc: (19,0)-(20,0) = "  FOO\n"