[ruby/yarp] Improve handling of line endings
Introduce three new inline helper functions: - `match_line_ending` - `match_line_ending_at` - `match_line_ending_addr` These functions are similar in signature to the `peek*` functions, but return the length of the line ending being inspected (or 0 if no line ending was found). These functions are then used to simplify how we're detecting line endings throughout "src/yarp.c". Also: - test coverage backfilled for `__END__` comments with CRLF line endings. - error message for invalid `%` tokens updated to not include the potential line endings. - some small refactorings for readability along the way https://github.com/ruby/yarp/commit/a00067386d
This commit is contained in:
parent
5ec1fc52c1
commit
20927a89c2
@ -31,6 +31,12 @@ class CommentsTest < Test::Unit::TestCase
|
|||||||
assert_comment source, :__END__, 0..16
|
assert_comment source, :__END__, 0..16
|
||||||
end
|
end
|
||||||
|
|
||||||
|
def test_comment___END__crlf
|
||||||
|
source = "__END__\r\ncomment\r\n"
|
||||||
|
|
||||||
|
assert_comment source, :__END__, 0..18
|
||||||
|
end
|
||||||
|
|
||||||
def test_comment_embedded_document
|
def test_comment_embedded_document
|
||||||
source = <<~RUBY
|
source = <<~RUBY
|
||||||
=begin
|
=begin
|
||||||
|
@ -164,7 +164,7 @@ class ErrorsTest < Test::Unit::TestCase
|
|||||||
|
|
||||||
def test_cr_without_lf_in_percent_expression
|
def test_cr_without_lf_in_percent_expression
|
||||||
assert_errors expression("%\r"), "%\r", [
|
assert_errors expression("%\r"), "%\r", [
|
||||||
["Invalid %% token", 0..3],
|
["Invalid %% token", 0..2],
|
||||||
]
|
]
|
||||||
end
|
end
|
||||||
|
|
||||||
|
170
yarp/yarp.c
170
yarp/yarp.c
@ -4216,6 +4216,33 @@ next_newline(const char *cursor, ptrdiff_t length) {
|
|||||||
return memchr(cursor, '\n', (size_t) length);
|
return memchr(cursor, '\n', (size_t) length);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Return the length of the line ending string starting at +cursor+, or 0 if it is not a line
|
||||||
|
// ending. This function is intended to be CRLF/LF agnostic.
|
||||||
|
static inline size_t
|
||||||
|
match_line_ending_addr(yp_parser_t *parser, const char *cursor) {
|
||||||
|
if (peek_addr(parser, cursor) == '\n') {
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
if (peek_addr(parser, cursor) == '\r' && peek_addr(parser, cursor + 1) == '\n') {
|
||||||
|
return 2;
|
||||||
|
}
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Return the length of the line ending string starting at parser->current.end + offset, or 0 if it
|
||||||
|
// is not a line ending. This function is intended to be CRLF/LF agnostic.
|
||||||
|
static inline size_t
|
||||||
|
match_line_ending_at(yp_parser_t *parser, ptrdiff_t offset) {
|
||||||
|
return match_line_ending_addr(parser, parser->current.end + offset);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Return the length of the line ending string starting at parser->current.end, or 0 if it is not a
|
||||||
|
// line ending. This function is intended to be CRLF/LF agnostic.
|
||||||
|
static inline size_t
|
||||||
|
match_line_ending(yp_parser_t *parser) {
|
||||||
|
return match_line_ending_addr(parser, parser->current.end);
|
||||||
|
}
|
||||||
|
|
||||||
// Find the start of the encoding comment. This is effectively an inlined
|
// Find the start of the encoding comment. This is effectively an inlined
|
||||||
// version of strnstr with some modifications.
|
// version of strnstr with some modifications.
|
||||||
static inline const char *
|
static inline const char *
|
||||||
@ -5302,7 +5329,7 @@ parser_lex(yp_parser_t *parser) {
|
|||||||
space_seen = true;
|
space_seen = true;
|
||||||
break;
|
break;
|
||||||
case '\r':
|
case '\r':
|
||||||
if (peek_at(parser, 1) == '\n') {
|
if (match_line_ending_at(parser, 1)) {
|
||||||
chomping = false;
|
chomping = false;
|
||||||
} else {
|
} else {
|
||||||
parser->current.end++;
|
parser->current.end++;
|
||||||
@ -5310,28 +5337,22 @@ parser_lex(yp_parser_t *parser) {
|
|||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case '\\':
|
case '\\':
|
||||||
if (peek_at(parser, 1) == '\n') {
|
{
|
||||||
if (parser->heredoc_end) {
|
size_t le_len = match_line_ending_at(parser, 1);
|
||||||
parser->current.end = parser->heredoc_end;
|
if (le_len) {
|
||||||
parser->heredoc_end = NULL;
|
if (parser->heredoc_end) {
|
||||||
} else {
|
parser->current.end = parser->heredoc_end;
|
||||||
yp_newline_list_append(&parser->newline_list, parser->current.end + 1);
|
parser->heredoc_end = NULL;
|
||||||
|
} else {
|
||||||
|
parser->current.end += le_len + 1;
|
||||||
|
yp_newline_list_append(&parser->newline_list, parser->current.end - 1);
|
||||||
|
space_seen = true;
|
||||||
|
}
|
||||||
|
} else if (yp_char_is_inline_whitespace(*parser->current.end)) {
|
||||||
parser->current.end += 2;
|
parser->current.end += 2;
|
||||||
space_seen = true;
|
|
||||||
}
|
|
||||||
} else if (peek_at(parser, 1) == '\r' && peek_at(parser, 2) == '\n') {
|
|
||||||
if (parser->heredoc_end) {
|
|
||||||
parser->current.end = parser->heredoc_end;
|
|
||||||
parser->heredoc_end = NULL;
|
|
||||||
} else {
|
} else {
|
||||||
yp_newline_list_append(&parser->newline_list, parser->current.end + 2);
|
chomping = false;
|
||||||
parser->current.end += 3;
|
|
||||||
space_seen = true;
|
|
||||||
}
|
}
|
||||||
} else if (yp_char_is_inline_whitespace(*parser->current.end)) {
|
|
||||||
parser->current.end += 2;
|
|
||||||
} else {
|
|
||||||
chomping = false;
|
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
@ -5376,21 +5397,26 @@ parser_lex(yp_parser_t *parser) {
|
|||||||
lexed_comment = true;
|
lexed_comment = true;
|
||||||
}
|
}
|
||||||
/* fallthrough */
|
/* fallthrough */
|
||||||
case '\r': {
|
case '\r':
|
||||||
// The only way you can have carriage returns in this particular loop
|
case '\n':
|
||||||
// is if you have a carriage return followed by a newline. In that
|
{
|
||||||
// case we'll just skip over the carriage return and continue lexing,
|
size_t le_len = match_line_ending_addr(parser, parser->current.end - 1);
|
||||||
// in order to make it so that the newline token encapsulates both the
|
if (le_len) {
|
||||||
// carriage return and the newline. Note that we need to check that
|
// The only way you can have carriage returns in this particular loop
|
||||||
// we haven't already lexed a comment here because that falls through
|
// is if you have a carriage return followed by a newline. In that
|
||||||
// into here as well.
|
// case we'll just skip over the carriage return and continue lexing,
|
||||||
if (!lexed_comment) parser->current.end++;
|
// in order to make it so that the newline token encapsulates both the
|
||||||
}
|
// carriage return and the newline. Note that we need to check that
|
||||||
/* fallthrough */
|
// we haven't already lexed a comment here because that falls through
|
||||||
case '\n': {
|
// into here as well.
|
||||||
if (parser->heredoc_end == NULL) {
|
if (!lexed_comment) parser->current.end += le_len - 1 ; // skip CR
|
||||||
yp_newline_list_check_append(&parser->newline_list, parser->current.end - 1);
|
|
||||||
} else {
|
if (parser->heredoc_end == NULL) {
|
||||||
|
yp_newline_list_append(&parser->newline_list, parser->current.end - 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (parser->heredoc_end) {
|
||||||
parser_flush_heredoc_end(parser);
|
parser_flush_heredoc_end(parser);
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -6183,15 +6209,14 @@ parser_lex(yp_parser_t *parser) {
|
|||||||
if (!parser->encoding.alnum_char(parser->current.end, parser->end - parser->current.end)) {
|
if (!parser->encoding.alnum_char(parser->current.end, parser->end - parser->current.end)) {
|
||||||
lex_mode_push_string(parser, true, false, lex_mode_incrementor(*parser->current.end), lex_mode_terminator(*parser->current.end));
|
lex_mode_push_string(parser, true, false, lex_mode_incrementor(*parser->current.end), lex_mode_terminator(*parser->current.end));
|
||||||
|
|
||||||
if (*parser->current.end == '\r') {
|
size_t le_len = match_line_ending(parser);
|
||||||
|
if (le_len) {
|
||||||
|
parser->current.end += le_len;
|
||||||
|
yp_newline_list_append(&parser->newline_list, parser->current.end - 1);
|
||||||
|
} else {
|
||||||
parser->current.end++;
|
parser->current.end++;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (peek(parser) == '\n') {
|
|
||||||
yp_newline_list_append(&parser->newline_list, parser->current.end);
|
|
||||||
}
|
|
||||||
|
|
||||||
parser->current.end++;
|
|
||||||
if (parser->current.end < parser->end) {
|
if (parser->current.end < parser->end) {
|
||||||
LEX(YP_TOKEN_STRING_BEGIN);
|
LEX(YP_TOKEN_STRING_BEGIN);
|
||||||
}
|
}
|
||||||
@ -6345,10 +6370,7 @@ parser_lex(yp_parser_t *parser) {
|
|||||||
((parser->current.end - parser->current.start) == 7) &&
|
((parser->current.end - parser->current.start) == 7) &&
|
||||||
current_token_starts_line(parser) &&
|
current_token_starts_line(parser) &&
|
||||||
(strncmp(parser->current.start, "__END__", 7) == 0) &&
|
(strncmp(parser->current.start, "__END__", 7) == 0) &&
|
||||||
(
|
(parser->current.end == parser->end || match_line_ending(parser))
|
||||||
parser->current.end == parser->end ||
|
|
||||||
peek(parser) == '\n' ||
|
|
||||||
(peek(parser) == '\r' && peek_at(parser, 1) == '\n'))
|
|
||||||
)
|
)
|
||||||
{
|
{
|
||||||
parser->current.end = parser->end;
|
parser->current.end = parser->end;
|
||||||
@ -6675,12 +6697,11 @@ parser_lex(yp_parser_t *parser) {
|
|||||||
|
|
||||||
// Otherwise we need to switch back to the parent lex mode and
|
// Otherwise we need to switch back to the parent lex mode and
|
||||||
// return the end of the string.
|
// return the end of the string.
|
||||||
if (peek(parser) == '\r' && peek_at(parser, 1) == '\n') {
|
size_t le_len = match_line_ending_addr(parser, breakpoint);
|
||||||
parser->current.end = breakpoint + 2;
|
if (le_len) {
|
||||||
yp_newline_list_append(&parser->newline_list, breakpoint + 1);
|
parser->current.end = breakpoint + le_len;
|
||||||
|
yp_newline_list_append(&parser->newline_list, parser->current.end - 1);
|
||||||
} else {
|
} else {
|
||||||
yp_newline_list_check_append(&parser->newline_list, parser->current.end);
|
|
||||||
|
|
||||||
parser->current.end = breakpoint + 1;
|
parser->current.end = breakpoint + 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -6791,13 +6812,10 @@ parser_lex(yp_parser_t *parser) {
|
|||||||
bool matched = true;
|
bool matched = true;
|
||||||
bool at_end = false;
|
bool at_end = false;
|
||||||
|
|
||||||
if (peek_addr(parser, start + ident_length) == '\n') {
|
size_t le_len = match_line_ending_addr(parser, start + ident_length);
|
||||||
parser->current.end = start + ident_length + 1;
|
if (le_len) {
|
||||||
yp_newline_list_append(&parser->newline_list, start + ident_length);
|
parser->current.end = start + ident_length + le_len;
|
||||||
} else if (peek_addr(parser, start + ident_length) == '\r' &&
|
yp_newline_list_append(&parser->newline_list, parser->current.end - 1);
|
||||||
peek_addr(parser, start + ident_length + 1) == '\n') {
|
|
||||||
parser->current.end = start + ident_length + 2;
|
|
||||||
yp_newline_list_append(&parser->newline_list, start + ident_length + 1);
|
|
||||||
} else if (parser->end == (start + ident_length)) {
|
} else if (parser->end == (start + ident_length)) {
|
||||||
parser->current.end = start + ident_length;
|
parser->current.end = start + ident_length;
|
||||||
at_end = true;
|
at_end = true;
|
||||||
@ -6862,20 +6880,11 @@ parser_lex(yp_parser_t *parser) {
|
|||||||
(start + ident_length <= parser->end) &&
|
(start + ident_length <= parser->end) &&
|
||||||
(strncmp(start, ident_start, ident_length) == 0)
|
(strncmp(start, ident_start, ident_length) == 0)
|
||||||
) {
|
) {
|
||||||
// Heredoc terminators must be followed by a newline or EOF to be valid.
|
// Heredoc terminators must be followed by a newline, CRLF, or EOF to be valid.
|
||||||
if (start + ident_length == parser->end ||
|
if (
|
||||||
peek_addr(parser, start + ident_length) == '\n')
|
start + ident_length == parser->end ||
|
||||||
{
|
match_line_ending_addr(parser, start + ident_length)
|
||||||
parser->current.end = breakpoint + 1;
|
) {
|
||||||
LEX(YP_TOKEN_STRING_CONTENT);
|
|
||||||
}
|
|
||||||
|
|
||||||
// They can also be followed by a carriage return and then a
|
|
||||||
// newline. Be sure here that we don't accidentally read off the
|
|
||||||
// end.
|
|
||||||
if (peek_addr(parser, start + ident_length) == '\r' &&
|
|
||||||
peek_addr(parser, start + ident_length + 1) == '\n')
|
|
||||||
{
|
|
||||||
parser->current.end = breakpoint + 1;
|
parser->current.end = breakpoint + 1;
|
||||||
LEX(YP_TOKEN_STRING_CONTENT);
|
LEX(YP_TOKEN_STRING_CONTENT);
|
||||||
}
|
}
|
||||||
@ -6893,11 +6902,9 @@ parser_lex(yp_parser_t *parser) {
|
|||||||
// stop looping before the newline and not after the
|
// stop looping before the newline and not after the
|
||||||
// newline so that we can still potentially find the
|
// newline so that we can still potentially find the
|
||||||
// terminator of the heredoc.
|
// terminator of the heredoc.
|
||||||
if (peek_addr(parser, breakpoint + 1) == '\n') {
|
size_t le_len = match_line_ending_addr(parser, breakpoint + 1);
|
||||||
breakpoint++;
|
if (le_len) {
|
||||||
} else if (peek_addr(parser, breakpoint + 1) == '\r' &&
|
breakpoint += le_len;
|
||||||
peek_addr(parser, breakpoint + 2) == '\n') {
|
|
||||||
breakpoint += 2;
|
|
||||||
} else {
|
} else {
|
||||||
yp_unescape_type_t unescape_type = (quote == YP_HEREDOC_QUOTE_SINGLE) ? YP_UNESCAPE_MINIMAL : YP_UNESCAPE_ALL;
|
yp_unescape_type_t unescape_type = (quote == YP_HEREDOC_QUOTE_SINGLE) ? YP_UNESCAPE_MINIMAL : YP_UNESCAPE_ALL;
|
||||||
size_t difference = yp_unescape_calculate_difference(breakpoint, parser->end, unescape_type, false, &parser->error_list);
|
size_t difference = yp_unescape_calculate_difference(breakpoint, parser->end, unescape_type, false, &parser->error_list);
|
||||||
@ -9183,8 +9190,10 @@ parse_heredoc_common_whitespace(yp_parser_t *parser, yp_node_list_t *nodes) {
|
|||||||
|
|
||||||
while (cur_char && cur_char < content_loc->end) {
|
while (cur_char && cur_char < content_loc->end) {
|
||||||
// Any empty newlines aren't included in the minimum whitespace calculation
|
// Any empty newlines aren't included in the minimum whitespace calculation
|
||||||
while (cur_char < content_loc->end && *cur_char == '\n') cur_char++;
|
size_t le_len;
|
||||||
while (cur_char + 1 < content_loc->end && *cur_char == '\r' && cur_char[1] == '\n') cur_char += 2;
|
while ((le_len = match_line_ending_addr(parser, cur_char))) {
|
||||||
|
cur_char += le_len;
|
||||||
|
}
|
||||||
|
|
||||||
if (cur_char == content_loc->end) break;
|
if (cur_char == content_loc->end) break;
|
||||||
|
|
||||||
@ -9202,8 +9211,9 @@ parse_heredoc_common_whitespace(yp_parser_t *parser, yp_node_list_t *nodes) {
|
|||||||
// If we hit a newline, then we have encountered a line that contains
|
// If we hit a newline, then we have encountered a line that contains
|
||||||
// only whitespace, and it shouldn't be considered in the calculation of
|
// only whitespace, and it shouldn't be considered in the calculation of
|
||||||
// common leading whitespace.
|
// common leading whitespace.
|
||||||
if (*cur_char == '\n') {
|
le_len = match_line_ending_addr(parser, cur_char);
|
||||||
cur_char++;
|
if (le_len) {
|
||||||
|
cur_char += le_len;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user