[ruby/prism] Fix parsing heredoc ends
https://github.com/ruby/prism/commit/aa8c702271
This commit is contained in:
parent
39238888bc
commit
562d949e02
@ -9761,24 +9761,43 @@ parser_lex(pm_parser_t *parser) {
|
|||||||
// terminator, then we need to return the ending of the heredoc.
|
// terminator, then we need to return the ending of the heredoc.
|
||||||
if (current_token_starts_line(parser)) {
|
if (current_token_starts_line(parser)) {
|
||||||
const uint8_t *start = parser->current.start;
|
const uint8_t *start = parser->current.start;
|
||||||
size_t whitespace = pm_heredoc_strspn_inline_whitespace(parser, &start, lex_mode->as.heredoc.indent);
|
if (start + ident_length <= parser->end) {
|
||||||
|
|
||||||
if ((start + ident_length <= parser->end) && (memcmp(start, ident_start, ident_length) == 0)) {
|
|
||||||
bool matched = true;
|
|
||||||
bool at_end = false;
|
bool at_end = false;
|
||||||
|
const uint8_t *newline = next_newline(start, parser->end - start);
|
||||||
|
const uint8_t *ident_end = newline;
|
||||||
|
const uint8_t *terminator_end = newline;
|
||||||
|
|
||||||
size_t eol_length = match_eol_at(parser, start + ident_length);
|
if (newline == NULL) {
|
||||||
if (eol_length) {
|
terminator_end = parser->end;
|
||||||
parser->current.end = start + ident_length + eol_length;
|
ident_end = parser->end;
|
||||||
pm_newline_list_append(&parser->newline_list, parser->current.end - 1);
|
|
||||||
} else if (parser->end == (start + ident_length)) {
|
|
||||||
parser->current.end = start + ident_length;
|
|
||||||
at_end = true;
|
at_end = true;
|
||||||
} else {
|
} else {
|
||||||
matched = false;
|
terminator_end++;
|
||||||
|
if (newline[-1] == '\r') {
|
||||||
|
ident_end--; // Remove \r
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (matched) {
|
const uint8_t *terminator_start = ident_end - ident_length;
|
||||||
|
const uint8_t *cursor = start;
|
||||||
|
|
||||||
|
if (
|
||||||
|
lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_DASH ||
|
||||||
|
lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_TILDE
|
||||||
|
) {
|
||||||
|
while (cursor < terminator_start && pm_char_is_inline_whitespace(*cursor)) {
|
||||||
|
cursor++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (
|
||||||
|
(cursor == terminator_start) &&
|
||||||
|
(memcmp(terminator_start, ident_start, ident_length) == 0)
|
||||||
|
) {
|
||||||
|
if (newline != NULL) {
|
||||||
|
pm_newline_list_append(&parser->newline_list, newline);
|
||||||
|
}
|
||||||
|
parser->current.end = terminator_end;
|
||||||
if (*lex_mode->as.heredoc.next_start == '\\') {
|
if (*lex_mode->as.heredoc.next_start == '\\') {
|
||||||
parser->next_start = NULL;
|
parser->next_start = NULL;
|
||||||
} else {
|
} else {
|
||||||
@ -9794,7 +9813,7 @@ parser_lex(pm_parser_t *parser) {
|
|||||||
LEX(PM_TOKEN_HEREDOC_END);
|
LEX(PM_TOKEN_HEREDOC_END);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
size_t whitespace = pm_heredoc_strspn_inline_whitespace(parser, &start, lex_mode->as.heredoc.indent);
|
||||||
if (
|
if (
|
||||||
lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_TILDE &&
|
lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_TILDE &&
|
||||||
(lex_mode->as.heredoc.common_whitespace > whitespace) &&
|
(lex_mode->as.heredoc.common_whitespace > whitespace) &&
|
||||||
@ -9838,6 +9857,42 @@ parser_lex(pm_parser_t *parser) {
|
|||||||
// If we have a - or ~ heredoc, then we can match after
|
// If we have a - or ~ heredoc, then we can match after
|
||||||
// some leading whitespace.
|
// some leading whitespace.
|
||||||
const uint8_t *start = breakpoint + 1;
|
const uint8_t *start = breakpoint + 1;
|
||||||
|
|
||||||
|
if (!was_escaped_newline && (start + ident_length <= parser->end)) {
|
||||||
|
// We want to match the terminator starting from the end of the line in case
|
||||||
|
// there is whitespace in the ident such as <<-' DOC' or <<~' DOC'.
|
||||||
|
const uint8_t *newline = next_newline(start, parser->end - start);
|
||||||
|
|
||||||
|
if (newline == NULL) {
|
||||||
|
newline = parser->end;
|
||||||
|
} else if (newline[-1] == '\r') {
|
||||||
|
newline--; // Remove \r
|
||||||
|
}
|
||||||
|
|
||||||
|
// Start of a possible terminator.
|
||||||
|
const uint8_t *terminator_start = newline - ident_length;
|
||||||
|
|
||||||
|
// Cursor to check for the leading whitespace. We skip the
|
||||||
|
// leading whitespace if we have a - or ~ heredoc.
|
||||||
|
const uint8_t *cursor = start;
|
||||||
|
|
||||||
|
if (lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_DASH ||
|
||||||
|
lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_TILDE) {
|
||||||
|
while (cursor < terminator_start && pm_char_is_inline_whitespace(*cursor)) {
|
||||||
|
cursor++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (
|
||||||
|
cursor == terminator_start &&
|
||||||
|
(memcmp(terminator_start, ident_start, ident_length) == 0)
|
||||||
|
) {
|
||||||
|
parser->current.end = breakpoint + 1;
|
||||||
|
pm_token_buffer_flush(parser, &token_buffer);
|
||||||
|
LEX(PM_TOKEN_STRING_CONTENT);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
size_t whitespace = pm_heredoc_strspn_inline_whitespace(parser, &start, lex_mode->as.heredoc.indent);
|
size_t whitespace = pm_heredoc_strspn_inline_whitespace(parser, &start, lex_mode->as.heredoc.indent);
|
||||||
|
|
||||||
// If we have hit a newline that is followed by a valid
|
// If we have hit a newline that is followed by a valid
|
||||||
@ -9845,22 +9900,6 @@ parser_lex(pm_parser_t *parser) {
|
|||||||
// heredoc here as string content. Then, the next time a
|
// heredoc here as string content. Then, the next time a
|
||||||
// token is lexed, it will match again and return the
|
// token is lexed, it will match again and return the
|
||||||
// end of the heredoc.
|
// end of the heredoc.
|
||||||
if (
|
|
||||||
!was_escaped_newline &&
|
|
||||||
(start + ident_length <= parser->end) &&
|
|
||||||
(memcmp(start, ident_start, ident_length) == 0)
|
|
||||||
) {
|
|
||||||
// Heredoc terminators must be followed by a
|
|
||||||
// newline, CRLF, or EOF to be valid.
|
|
||||||
if (
|
|
||||||
start + ident_length == parser->end ||
|
|
||||||
match_eol_at(parser, start + ident_length)
|
|
||||||
) {
|
|
||||||
parser->current.end = breakpoint + 1;
|
|
||||||
pm_token_buffer_flush(parser, &token_buffer);
|
|
||||||
LEX(PM_TOKEN_STRING_CONTENT);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_TILDE) {
|
if (lex_mode->as.heredoc.indent == PM_HEREDOC_INDENT_TILDE) {
|
||||||
if ((lex_mode->as.heredoc.common_whitespace > whitespace) && peek_at(parser, start) != '\n') {
|
if ((lex_mode->as.heredoc.common_whitespace > whitespace) && peek_at(parser, start) != '\n') {
|
||||||
|
19
test/prism/fixtures/heredocs_leading_whitespace.txt
Normal file
19
test/prism/fixtures/heredocs_leading_whitespace.txt
Normal file
@ -0,0 +1,19 @@
|
|||||||
|
<<-' FOO'
|
||||||
|
a
|
||||||
|
b
|
||||||
|
FOO
|
||||||
|
|
||||||
|
<<-' FOO'
|
||||||
|
a
|
||||||
|
b
|
||||||
|
FOO
|
||||||
|
|
||||||
|
<<~' FOO'
|
||||||
|
a
|
||||||
|
b
|
||||||
|
FOO
|
||||||
|
|
||||||
|
<<~' FOO'
|
||||||
|
a
|
||||||
|
b
|
||||||
|
FOO
|
@ -68,6 +68,13 @@ module Prism
|
|||||||
# HERE
|
# HERE
|
||||||
todos << "seattlerb/heredoc_nested.txt"
|
todos << "seattlerb/heredoc_nested.txt"
|
||||||
|
|
||||||
|
# Ruby < 3.3.0 fails to parse:
|
||||||
|
#
|
||||||
|
# <<-' HERE'
|
||||||
|
# foo
|
||||||
|
# HERE
|
||||||
|
invalid << "heredocs_leading_whitespace.txt" if RUBY_VERSION < "3.3.0"
|
||||||
|
|
||||||
base = File.join(__dir__, "fixtures")
|
base = File.join(__dir__, "fixtures")
|
||||||
skips = invalid | todos
|
skips = invalid | todos
|
||||||
|
|
||||||
|
@ -111,6 +111,11 @@ module Prism
|
|||||||
# Additionally, Ripper cannot parse the %w[] fixture in this file, so set ripper_should_parse to false.
|
# Additionally, Ripper cannot parse the %w[] fixture in this file, so set ripper_should_parse to false.
|
||||||
ripper_should_parse = false if relative == "spanning_heredoc.txt"
|
ripper_should_parse = false if relative == "spanning_heredoc.txt"
|
||||||
|
|
||||||
|
# Ruby < 3.3.0 cannot parse heredocs where there are leading whitespace characters in the heredoc start.
|
||||||
|
# Example: <<~' EOF' or <<-' EOF'
|
||||||
|
# https://bugs.ruby-lang.org/issues/19539
|
||||||
|
ripper_should_parse = false if relative == "heredocs_leading_whitespace.txt" && RUBY_VERSION < "3.3.0"
|
||||||
|
|
||||||
define_method "test_filepath_#{relative}" do
|
define_method "test_filepath_#{relative}" do
|
||||||
# First, read the source from the filepath. Use binmode to avoid converting CRLF on Windows,
|
# First, read the source from the filepath. Use binmode to avoid converting CRLF on Windows,
|
||||||
# and explicitly set the external encoding to UTF-8 to override the binmode default.
|
# and explicitly set the external encoding to UTF-8 to override the binmode default.
|
||||||
|
49
test/prism/snapshots/heredocs_leading_whitespace.txt
Normal file
49
test/prism/snapshots/heredocs_leading_whitespace.txt
Normal file
@ -0,0 +1,49 @@
|
|||||||
|
@ ProgramNode (location: (1,0)-(16,10))
|
||||||
|
├── locals: []
|
||||||
|
└── statements:
|
||||||
|
@ StatementsNode (location: (1,0)-(16,10))
|
||||||
|
└── body: (length: 4)
|
||||||
|
├── @ StringNode (location: (1,0)-(1,10))
|
||||||
|
│ ├── flags: ∅
|
||||||
|
│ ├── opening_loc: (1,0)-(1,10) = "<<-' FOO'"
|
||||||
|
│ ├── content_loc: (2,0)-(4,0) = "a\nb\n"
|
||||||
|
│ ├── closing_loc: (4,0)-(5,0) = " FOO\n"
|
||||||
|
│ └── unescaped: "a\nb\n"
|
||||||
|
├── @ StringNode (location: (6,0)-(6,10))
|
||||||
|
│ ├── flags: ∅
|
||||||
|
│ ├── opening_loc: (6,0)-(6,10) = "<<-' FOO'"
|
||||||
|
│ ├── content_loc: (7,0)-(9,0) = "a\nb\n"
|
||||||
|
│ ├── closing_loc: (9,0)-(10,0) = " FOO\n"
|
||||||
|
│ └── unescaped: "a\nb\n"
|
||||||
|
├── @ InterpolatedStringNode (location: (11,0)-(11,10))
|
||||||
|
│ ├── opening_loc: (11,0)-(11,10) = "<<~' FOO'"
|
||||||
|
│ ├── parts: (length: 2)
|
||||||
|
│ │ ├── @ StringNode (location: (12,0)-(13,0))
|
||||||
|
│ │ │ ├── flags: ∅
|
||||||
|
│ │ │ ├── opening_loc: ∅
|
||||||
|
│ │ │ ├── content_loc: (12,0)-(13,0) = "a\n"
|
||||||
|
│ │ │ ├── closing_loc: ∅
|
||||||
|
│ │ │ └── unescaped: "a\n"
|
||||||
|
│ │ └── @ StringNode (location: (13,0)-(14,0))
|
||||||
|
│ │ ├── flags: ∅
|
||||||
|
│ │ ├── opening_loc: ∅
|
||||||
|
│ │ ├── content_loc: (13,0)-(14,0) = "b\n"
|
||||||
|
│ │ ├── closing_loc: ∅
|
||||||
|
│ │ └── unescaped: "b\n"
|
||||||
|
│ └── closing_loc: (14,0)-(15,0) = " FOO\n"
|
||||||
|
└── @ InterpolatedStringNode (location: (16,0)-(16,10))
|
||||||
|
├── opening_loc: (16,0)-(16,10) = "<<~' FOO'"
|
||||||
|
├── parts: (length: 2)
|
||||||
|
│ ├── @ StringNode (location: (17,0)-(18,0))
|
||||||
|
│ │ ├── flags: ∅
|
||||||
|
│ │ ├── opening_loc: ∅
|
||||||
|
│ │ ├── content_loc: (17,0)-(18,0) = "a\n"
|
||||||
|
│ │ ├── closing_loc: ∅
|
||||||
|
│ │ └── unescaped: "a\n"
|
||||||
|
│ └── @ StringNode (location: (18,0)-(19,0))
|
||||||
|
│ ├── flags: ∅
|
||||||
|
│ ├── opening_loc: ∅
|
||||||
|
│ ├── content_loc: (18,0)-(19,0) = "b\n"
|
||||||
|
│ ├── closing_loc: ∅
|
||||||
|
│ └── unescaped: "b\n"
|
||||||
|
└── closing_loc: (19,0)-(20,0) = " FOO\n"
|
Loading…
x
Reference in New Issue
Block a user