[ruby/prism] Fix percent delimiter strings with crlfs

parse.y treats CRLF as an LF and effectively "normalizes" it before parsing. That means a string like `%\nfoo\r\n` is actually treated as `%\nfoo\n` for the purposes of parsing. This happens on both the opening side and the closing side of the percent string. So, for example, `%\r\nfoo\n` must be treated as `%\nfoo\n`. To handle this in Prism, when we start a % string, we check whether it starts with `\r\n`, and if so, we consider the terminator to actually be `\n`. Then, as we lex the string, we check for `\r\n` and treat it as `\n`, but only when the opening delimiter was a `\n`. Fixes: #3230 [Bug #20938] https://github.com/ruby/prism/commit/e573ceaad6 Co-authored-by: John Hawthorn <jhawthorn@github.com> Co-authored-by: eileencodes <eileencodes@gmail.com> Co-authored-by: Kevin Newton <kddnewton@gmail.com>
2024-12-05 13:56:03 -05:00 · 2024-12-05 13:56:03 -05:00 · 9fe6fd8693
commit 9fe6fd8693
parent d53e4545f4
2 changed files with 86 additions and 3 deletions
--- a/prism/prism.c
+++ b/prism/prism.c
@ -10503,6 +10503,7 @@ pm_token_buffer_escape(pm_parser_t *parser, pm_token_buffer_t *token_buffer) {
    }

    const uint8_t *end = parser->current.end - 1;
+    assert(end >= start);
    pm_buffer_append_bytes(&token_buffer->buffer, start, (size_t) (end - start));

    token_buffer->cursor = end;
@ -10583,9 +10584,15 @@ pm_lex_percent_delimiter(pm_parser_t *parser) {
            pm_newline_list_append(&parser->newline_list, parser->current.end + eol_length - 1);
        }

-        const uint8_t delimiter = *parser->current.end;
-        parser->current.end += eol_length;
+        uint8_t delimiter = *parser->current.end;

+        // If our delimiter is \r\n, we want to treat it as if it's \n.
+        // For example, %\r\nfoo\r\n should be "foo"
+        if (eol_length == 2) {
+            delimiter = *(parser->current.end + 1);
+        }
+
+        parser->current.end += eol_length;
        return delimiter;
    }

@ -12335,10 +12342,28 @@ parser_lex(pm_parser_t *parser) {
                    continue;
                }

+                bool is_terminator = (*breakpoint == lex_mode->as.string.terminator);
+
+                // If the terminator is newline, we need to consider \r\n _also_ a newline
+                // For example: `%\nfoo\r\n`
+                // The string should be "foo", not "foo\r"
+                if (*breakpoint == '\r' && peek_at(parser, breakpoint + 1) == '\n') {
+                    if (lex_mode->as.string.terminator == '\n') {
+                        is_terminator = true;
+                    }
+
+                    // If the terminator is a CR, but we see a CRLF, we need to
+                    // treat the CRLF as a newline, meaning this is _not_ the
+                    // terminator
+                    if (lex_mode->as.string.terminator == '\r') {
+                        is_terminator = false;
+                    }
+                }
+
                // Note that we have to check the terminator here first because we could
                // potentially be parsing a % string that has a # character as the
                // terminator.
-                if (*breakpoint == lex_mode->as.string.terminator) {
+                if (is_terminator) {
                    // If this terminator doesn't actually close the string, then we need
                    // to continue on past it.
                    if (lex_mode->as.string.nesting > 0) {
--- a/test/prism/percent_delimiter_string_test.rb
+++ b/test/prism/percent_delimiter_string_test.rb
@ -0,0 +1,58 @@
+# frozen_string_literal: true
+
+require_relative "test_helper"
+
+module Prism
+  class PercentDelimiterStringTest < TestCase
+    def test_newline_terminator_with_lf_crlf
+      str = "%\n123456\r\n"
+      assert_parse "123456", str
+    end
+
+    def test_newline_terminator_with_lf_crlf_with_extra_cr
+      str = "%\n123456\r\r\n"
+      assert_parse "123456\r", str
+    end
+
+    def test_newline_terminator_with_crlf_pair
+      str = "%\r\n123456\r\n"
+      assert_parse "123456", str
+    end
+
+    def test_newline_terminator_with_crlf_crlf_with_extra_cr
+      str = "%\r\n123456\r\r\n"
+      assert_parse "123456\r", str
+    end
+
+    def test_newline_terminator_with_cr_cr
+      str = "%\r123456\r;\n"
+      assert_parse "123456", str
+    end
+
+    def test_newline_terminator_with_crlf_lf
+      str = "%\r\n123456\n;\n"
+      assert_parse "123456", str
+    end
+
+    def test_cr_crlf
+      str = "%\r1\r\n \r"
+      assert_parse "1\n ", str
+    end
+
+    def test_lf_crlf
+      str = "%\n1\r\n \n"
+      assert_parse "1", str
+    end
+
+    def test_lf_lf
+      str = "%\n1\n \n"
+      assert_parse "1", str
+    end
+
+    def assert_parse(expected, str)
+      tree = Prism.parse str
+      node = tree.value.breadth_first_search { |x| Prism::StringNode === x }
+      assert_equal expected, node.unescaped
+    end
+  end
+end