[ruby/yarp] fix: regexes and strings with escaped newline around a heredoc

Found via the fuzzer. https://github.com/ruby/yarp/commit/501757135a Co-authored-by: Kevin Newton <kddnewton@gmail.com>
2023-08-25 10:12:13 -04:00 · 2023-08-25 10:12:13 -04:00 · 3525c460f9
commit 3525c460f9
parent bf3d48e182
4 changed files with 130 additions and 7 deletions
--- a/test/yarp/fixtures/wrapping_heredoc.txt
+++ b/test/yarp/fixtures/wrapping_heredoc.txt
@ -0,0 +1,13 @@
 # test regex, string, and lists that wrap a heredoc thanks to an escaped newline
 # ripper incorrectly creates a "b\nc" string instead of two separate string tokens
 pp <<-A.gsub(/b\
 a
 A
 c/, "")
 # ripper incorrectly creates a "e\nf" string instead of two separate string tokens
 pp <<-A + "e\
 d
 A
 f"
--- a/test/yarp/parse_test.rb
+++ b/test/yarp/parse_test.rb
@ -112,6 +112,10 @@ class ParseTest < Test::Unit::TestCase
      # Waiting for feedback on https://bugs.ruby-lang.org/issues/19838.
      return if relative == "seattlerb/heredoc_nested.txt"
      # Ripper seems to have a bug where the regex (or string) portions before and
      # after the heredoc are combined into a single token.
      return if relative == "wrapping_heredoc.txt"
      # Finally, assert that we can lex the source and get the same tokens as
      # Ripper.
      lex_result = YARP.lex_compat(source)
--- a/test/yarp/snapshots/wrapping_heredoc.txt
+++ b/test/yarp/snapshots/wrapping_heredoc.txt
@ -0,0 +1,80 @@
 ProgramNode(165...298)(
  [],
  StatementsNode(165...298)(
    [CallNode(165...193)(
       nil,
       nil,
       (165...167),
       nil,
       ArgumentsNode(168...193)(
         [CallNode(168...193)(
            InterpolatedStringNode(168...172)(
              (168...172),
              [StringNode(182...184)(nil, (182...184), nil, "a\n")],
              (184...186)
            ),
            (172...173),
            (173...177),
            (177...178),
            ArgumentsNode(178...192)(
              [InterpolatedRegularExpressionNode(178...188)(
                 (178...179),
                 [StringNode(179...182)(nil, (179...182), nil, "b"),
                  StringNode(186...187)(nil, (186...187), nil, "c")],
                 (187...188),
                 0
               ),
               StringNode(190...192)(
                 (190...191),
                 (191...191),
                 (191...192),
                 ""
               )]
            ),
            (192...193),
            nil,
            0,
            "gsub"
          )]
       ),
       nil,
       nil,
       0,
       "pp"
     ),
     CallNode(278...298)(
       nil,
       nil,
       (278...280),
       nil,
       ArgumentsNode(281...298)(
         [CallNode(281...298)(
            InterpolatedStringNode(281...285)(
              (281...285),
              [StringNode(292...294)(nil, (292...294), nil, "d\n")],
              (294...296)
            ),
            nil,
            (286...287),
            nil,
            ArgumentsNode(288...298)(
              [InterpolatedStringNode(288...298)(
                 (288...289),
                 [StringNode(289...292)(nil, (289...292), nil, "e"),
                  StringNode(296...297)(nil, (296...297), nil, "f")],
                 (297...298)
               )]
            ),
            nil,
            nil,
            0,
            "+"
          )]
       ),
       nil,
       nil,
       0,
       "pp"
     )]
  )
 )
--- a/yarp/yarp.c
+++ b/yarp/yarp.c
@ -6614,7 +6614,13 @@ parser_lex(yp_parser_t *parser) {
        case YP_LEX_REGEXP: {
            // First, we'll set the start of this token to be the current end.
            if (parser->next_start == NULL) {
                parser->current.start = parser->current.end;
            } else {
                parser->current.start = parser->next_start;
                parser->current.end = parser->next_start;
                parser->next_start = NULL;
            }
            // We'll check if we're at the end of the file. If we are, then we need to
            // return the EOF token.
@ -6693,9 +6699,19 @@ parser_lex(yp_parser_t *parser) {
                if (*breakpoint == '\\') {
                    size_t difference = yp_unescape_calculate_difference(parser, breakpoint, YP_UNESCAPE_ALL, false);
-                    // If the result is an escaped newline, then we need to
+                    // If the result is an escaped newline ...
-                    // track that newline.
+                    if (*(breakpoint + difference - 1) == '\n') {
-                    yp_newline_list_check_append(&parser->newline_list, breakpoint + difference - 1);
+                        if (parser->heredoc_end) {
                            // ... if we are on the same line as a heredoc, flush the heredoc and
                            // continue parsing after heredoc_end.
                            parser->current.end = breakpoint + difference;
                            parser_flush_heredoc_end(parser);
                            LEX(YP_TOKEN_STRING_CONTENT);
                        } else {
                            // ... else track the newline.
                            yp_newline_list_append(&parser->newline_list, breakpoint + difference - 1);
                        }
                    }
                    breakpoint = yp_strpbrk(parser, breakpoint + difference, breakpoints, parser->end - (breakpoint + difference));
                    continue;
@ -6833,9 +6849,19 @@ parser_lex(yp_parser_t *parser) {
                        yp_unescape_type_t unescape_type = parser->lex_modes.current->as.string.interpolation ? YP_UNESCAPE_ALL : YP_UNESCAPE_MINIMAL;
                        size_t difference = yp_unescape_calculate_difference(parser, breakpoint, unescape_type, false);
-                        // If the result is an escaped newline, then we need to
+                        // If the result is an escaped newline ...
-                        // track that newline.
+                        if (*(breakpoint + difference - 1) == '\n') {
-                        yp_newline_list_check_append(&parser->newline_list, breakpoint + difference - 1);
+                            if (parser->heredoc_end) {
                                // ... if we are on the same line as a heredoc, flush the heredoc and
                                // continue parsing after heredoc_end.
                                parser->current.end = breakpoint + difference;
                                parser_flush_heredoc_end(parser);
                                LEX(YP_TOKEN_STRING_CONTENT);
                            } else {
                                // ... else track the newline.
                                yp_newline_list_append(&parser->newline_list, breakpoint + difference - 1);
                            }
                        }
                        breakpoint = yp_strpbrk(parser, breakpoint + difference, breakpoints, parser->end - (breakpoint + difference));
                        break;