[ruby/yarp] Improve handling of line endings

Introduce three new inline helper functions:

- `match_line_ending`
- `match_line_ending_at`
- `match_line_ending_addr`

These functions are similar in signature to the `peek*` functions, but
return the length of the line ending being inspected (or 0 if no line
ending was found).

These functions are then used to simplify how we're detecting line
endings throughout "src/yarp.c".

Also:
- test coverage backfilled for `__END__` comments with CRLF line endings.
- error message for invalid `%` tokens updated to not include
  the potential line endings.
- some small refactorings for readability along the way

https://github.com/ruby/yarp/commit/a00067386d
This commit is contained in:
Mike Dalessio 2023-08-23 11:39:15 -04:00 committed by git
parent 5ec1fc52c1
commit 20927a89c2
3 changed files with 97 additions and 81 deletions

View File

@ -31,6 +31,12 @@ class CommentsTest < Test::Unit::TestCase
assert_comment source, :__END__, 0..16
end
def test_comment___END__crlf
source = "__END__\r\ncomment\r\n"
assert_comment source, :__END__, 0..18
end
def test_comment_embedded_document
source = <<~RUBY
=begin

View File

@ -164,7 +164,7 @@ class ErrorsTest < Test::Unit::TestCase
def test_cr_without_lf_in_percent_expression
assert_errors expression("%\r"), "%\r", [
["Invalid %% token", 0..3],
["Invalid %% token", 0..2],
]
end

View File

@ -4216,6 +4216,33 @@ next_newline(const char *cursor, ptrdiff_t length) {
return memchr(cursor, '\n', (size_t) length);
}
// Return the length of the line ending string starting at +cursor+, or 0 if it is not a line
// ending. This function is intended to be CRLF/LF agnostic.
static inline size_t
match_line_ending_addr(yp_parser_t *parser, const char *cursor) {
if (peek_addr(parser, cursor) == '\n') {
return 1;
}
if (peek_addr(parser, cursor) == '\r' && peek_addr(parser, cursor + 1) == '\n') {
return 2;
}
return 0;
}
// Return the length of the line ending string starting at parser->current.end + offset, or 0 if it
// is not a line ending. This function is intended to be CRLF/LF agnostic.
static inline size_t
match_line_ending_at(yp_parser_t *parser, ptrdiff_t offset) {
return match_line_ending_addr(parser, parser->current.end + offset);
}
// Return the length of the line ending string starting at parser->current.end, or 0 if it is not a
// line ending. This function is intended to be CRLF/LF agnostic.
static inline size_t
match_line_ending(yp_parser_t *parser) {
return match_line_ending_addr(parser, parser->current.end);
}
// Find the start of the encoding comment. This is effectively an inlined
// version of strnstr with some modifications.
static inline const char *
@ -5302,7 +5329,7 @@ parser_lex(yp_parser_t *parser) {
space_seen = true;
break;
case '\r':
if (peek_at(parser, 1) == '\n') {
if (match_line_ending_at(parser, 1)) {
chomping = false;
} else {
parser->current.end++;
@ -5310,28 +5337,22 @@ parser_lex(yp_parser_t *parser) {
}
break;
case '\\':
if (peek_at(parser, 1) == '\n') {
if (parser->heredoc_end) {
parser->current.end = parser->heredoc_end;
parser->heredoc_end = NULL;
} else {
yp_newline_list_append(&parser->newline_list, parser->current.end + 1);
{
size_t le_len = match_line_ending_at(parser, 1);
if (le_len) {
if (parser->heredoc_end) {
parser->current.end = parser->heredoc_end;
parser->heredoc_end = NULL;
} else {
parser->current.end += le_len + 1;
yp_newline_list_append(&parser->newline_list, parser->current.end - 1);
space_seen = true;
}
} else if (yp_char_is_inline_whitespace(*parser->current.end)) {
parser->current.end += 2;
space_seen = true;
}
} else if (peek_at(parser, 1) == '\r' && peek_at(parser, 2) == '\n') {
if (parser->heredoc_end) {
parser->current.end = parser->heredoc_end;
parser->heredoc_end = NULL;
} else {
yp_newline_list_append(&parser->newline_list, parser->current.end + 2);
parser->current.end += 3;
space_seen = true;
chomping = false;
}
} else if (yp_char_is_inline_whitespace(*parser->current.end)) {
parser->current.end += 2;
} else {
chomping = false;
}
break;
default:
@ -5376,21 +5397,26 @@ parser_lex(yp_parser_t *parser) {
lexed_comment = true;
}
/* fallthrough */
case '\r': {
// The only way you can have carriage returns in this particular loop
// is if you have a carriage return followed by a newline. In that
// case we'll just skip over the carriage return and continue lexing,
// in order to make it so that the newline token encapsulates both the
// carriage return and the newline. Note that we need to check that
// we haven't already lexed a comment here because that falls through
// into here as well.
if (!lexed_comment) parser->current.end++;
}
/* fallthrough */
case '\n': {
if (parser->heredoc_end == NULL) {
yp_newline_list_check_append(&parser->newline_list, parser->current.end - 1);
} else {
case '\r':
case '\n':
{
size_t le_len = match_line_ending_addr(parser, parser->current.end - 1);
if (le_len) {
// The only way you can have carriage returns in this particular loop
// is if you have a carriage return followed by a newline. In that
// case we'll just skip over the carriage return and continue lexing,
// in order to make it so that the newline token encapsulates both the
// carriage return and the newline. Note that we need to check that
// we haven't already lexed a comment here because that falls through
// into here as well.
if (!lexed_comment) parser->current.end += le_len - 1 ; // skip CR
if (parser->heredoc_end == NULL) {
yp_newline_list_append(&parser->newline_list, parser->current.end - 1);
}
}
if (parser->heredoc_end) {
parser_flush_heredoc_end(parser);
}
@ -6183,15 +6209,14 @@ parser_lex(yp_parser_t *parser) {
if (!parser->encoding.alnum_char(parser->current.end, parser->end - parser->current.end)) {
lex_mode_push_string(parser, true, false, lex_mode_incrementor(*parser->current.end), lex_mode_terminator(*parser->current.end));
if (*parser->current.end == '\r') {
size_t le_len = match_line_ending(parser);
if (le_len) {
parser->current.end += le_len;
yp_newline_list_append(&parser->newline_list, parser->current.end - 1);
} else {
parser->current.end++;
}
if (peek(parser) == '\n') {
yp_newline_list_append(&parser->newline_list, parser->current.end);
}
parser->current.end++;
if (parser->current.end < parser->end) {
LEX(YP_TOKEN_STRING_BEGIN);
}
@ -6345,10 +6370,7 @@ parser_lex(yp_parser_t *parser) {
((parser->current.end - parser->current.start) == 7) &&
current_token_starts_line(parser) &&
(strncmp(parser->current.start, "__END__", 7) == 0) &&
(
parser->current.end == parser->end ||
peek(parser) == '\n' ||
(peek(parser) == '\r' && peek_at(parser, 1) == '\n'))
(parser->current.end == parser->end || match_line_ending(parser))
)
{
parser->current.end = parser->end;
@ -6675,12 +6697,11 @@ parser_lex(yp_parser_t *parser) {
// Otherwise we need to switch back to the parent lex mode and
// return the end of the string.
if (peek(parser) == '\r' && peek_at(parser, 1) == '\n') {
parser->current.end = breakpoint + 2;
yp_newline_list_append(&parser->newline_list, breakpoint + 1);
size_t le_len = match_line_ending_addr(parser, breakpoint);
if (le_len) {
parser->current.end = breakpoint + le_len;
yp_newline_list_append(&parser->newline_list, parser->current.end - 1);
} else {
yp_newline_list_check_append(&parser->newline_list, parser->current.end);
parser->current.end = breakpoint + 1;
}
@ -6791,13 +6812,10 @@ parser_lex(yp_parser_t *parser) {
bool matched = true;
bool at_end = false;
if (peek_addr(parser, start + ident_length) == '\n') {
parser->current.end = start + ident_length + 1;
yp_newline_list_append(&parser->newline_list, start + ident_length);
} else if (peek_addr(parser, start + ident_length) == '\r' &&
peek_addr(parser, start + ident_length + 1) == '\n') {
parser->current.end = start + ident_length + 2;
yp_newline_list_append(&parser->newline_list, start + ident_length + 1);
size_t le_len = match_line_ending_addr(parser, start + ident_length);
if (le_len) {
parser->current.end = start + ident_length + le_len;
yp_newline_list_append(&parser->newline_list, parser->current.end - 1);
} else if (parser->end == (start + ident_length)) {
parser->current.end = start + ident_length;
at_end = true;
@ -6862,20 +6880,11 @@ parser_lex(yp_parser_t *parser) {
(start + ident_length <= parser->end) &&
(strncmp(start, ident_start, ident_length) == 0)
) {
// Heredoc terminators must be followed by a newline or EOF to be valid.
if (start + ident_length == parser->end ||
peek_addr(parser, start + ident_length) == '\n')
{
parser->current.end = breakpoint + 1;
LEX(YP_TOKEN_STRING_CONTENT);
}
// They can also be followed by a carriage return and then a
// newline. Be sure here that we don't accidentally read off the
// end.
if (peek_addr(parser, start + ident_length) == '\r' &&
peek_addr(parser, start + ident_length + 1) == '\n')
{
// Heredoc terminators must be followed by a newline, CRLF, or EOF to be valid.
if (
start + ident_length == parser->end ||
match_line_ending_addr(parser, start + ident_length)
) {
parser->current.end = breakpoint + 1;
LEX(YP_TOKEN_STRING_CONTENT);
}
@ -6893,11 +6902,9 @@ parser_lex(yp_parser_t *parser) {
// stop looping before the newline and not after the
// newline so that we can still potentially find the
// terminator of the heredoc.
if (peek_addr(parser, breakpoint + 1) == '\n') {
breakpoint++;
} else if (peek_addr(parser, breakpoint + 1) == '\r' &&
peek_addr(parser, breakpoint + 2) == '\n') {
breakpoint += 2;
size_t le_len = match_line_ending_addr(parser, breakpoint + 1);
if (le_len) {
breakpoint += le_len;
} else {
yp_unescape_type_t unescape_type = (quote == YP_HEREDOC_QUOTE_SINGLE) ? YP_UNESCAPE_MINIMAL : YP_UNESCAPE_ALL;
size_t difference = yp_unescape_calculate_difference(breakpoint, parser->end, unescape_type, false, &parser->error_list);
@ -9183,8 +9190,10 @@ parse_heredoc_common_whitespace(yp_parser_t *parser, yp_node_list_t *nodes) {
while (cur_char && cur_char < content_loc->end) {
// Any empty newlines aren't included in the minimum whitespace calculation
while (cur_char < content_loc->end && *cur_char == '\n') cur_char++;
while (cur_char + 1 < content_loc->end && *cur_char == '\r' && cur_char[1] == '\n') cur_char += 2;
size_t le_len;
while ((le_len = match_line_ending_addr(parser, cur_char))) {
cur_char += le_len;
}
if (cur_char == content_loc->end) break;
@ -9202,8 +9211,9 @@ parse_heredoc_common_whitespace(yp_parser_t *parser, yp_node_list_t *nodes) {
// If we hit a newline, then we have encountered a line that contains
// only whitespace, and it shouldn't be considered in the calculation of
// common leading whitespace.
if (*cur_char == '\n') {
cur_char++;
le_len = match_line_ending_addr(parser, cur_char);
if (le_len) {
cur_char += le_len;
continue;
}