[ruby/yarp] Improve handling of line endings
Introduce three new inline helper functions: - `match_line_ending` - `match_line_ending_at` - `match_line_ending_addr` These functions are similar in signature to the `peek*` functions, but return the length of the line ending being inspected (or 0 if no line ending was found). These functions are then used to simplify how we're detecting line endings throughout "src/yarp.c". Also: - test coverage backfilled for `__END__` comments with CRLF line endings. - error message for invalid `%` tokens updated to not include the potential line endings. - some small refactorings for readability along the way https://github.com/ruby/yarp/commit/a00067386d
This commit is contained in:
parent
5ec1fc52c1
commit
20927a89c2
@ -31,6 +31,12 @@ class CommentsTest < Test::Unit::TestCase
|
||||
assert_comment source, :__END__, 0..16
|
||||
end
|
||||
|
||||
def test_comment___END__crlf
|
||||
source = "__END__\r\ncomment\r\n"
|
||||
|
||||
assert_comment source, :__END__, 0..18
|
||||
end
|
||||
|
||||
def test_comment_embedded_document
|
||||
source = <<~RUBY
|
||||
=begin
|
||||
|
@ -164,7 +164,7 @@ class ErrorsTest < Test::Unit::TestCase
|
||||
|
||||
def test_cr_without_lf_in_percent_expression
|
||||
assert_errors expression("%\r"), "%\r", [
|
||||
["Invalid %% token", 0..3],
|
||||
["Invalid %% token", 0..2],
|
||||
]
|
||||
end
|
||||
|
||||
|
170
yarp/yarp.c
170
yarp/yarp.c
@ -4216,6 +4216,33 @@ next_newline(const char *cursor, ptrdiff_t length) {
|
||||
return memchr(cursor, '\n', (size_t) length);
|
||||
}
|
||||
|
||||
// Return the length of the line ending string starting at +cursor+, or 0 if it is not a line
|
||||
// ending. This function is intended to be CRLF/LF agnostic.
|
||||
static inline size_t
|
||||
match_line_ending_addr(yp_parser_t *parser, const char *cursor) {
|
||||
if (peek_addr(parser, cursor) == '\n') {
|
||||
return 1;
|
||||
}
|
||||
if (peek_addr(parser, cursor) == '\r' && peek_addr(parser, cursor + 1) == '\n') {
|
||||
return 2;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Return the length of the line ending string starting at parser->current.end + offset, or 0 if it
|
||||
// is not a line ending. This function is intended to be CRLF/LF agnostic.
|
||||
static inline size_t
|
||||
match_line_ending_at(yp_parser_t *parser, ptrdiff_t offset) {
|
||||
return match_line_ending_addr(parser, parser->current.end + offset);
|
||||
}
|
||||
|
||||
// Return the length of the line ending string starting at parser->current.end, or 0 if it is not a
|
||||
// line ending. This function is intended to be CRLF/LF agnostic.
|
||||
static inline size_t
|
||||
match_line_ending(yp_parser_t *parser) {
|
||||
return match_line_ending_addr(parser, parser->current.end);
|
||||
}
|
||||
|
||||
// Find the start of the encoding comment. This is effectively an inlined
|
||||
// version of strnstr with some modifications.
|
||||
static inline const char *
|
||||
@ -5302,7 +5329,7 @@ parser_lex(yp_parser_t *parser) {
|
||||
space_seen = true;
|
||||
break;
|
||||
case '\r':
|
||||
if (peek_at(parser, 1) == '\n') {
|
||||
if (match_line_ending_at(parser, 1)) {
|
||||
chomping = false;
|
||||
} else {
|
||||
parser->current.end++;
|
||||
@ -5310,28 +5337,22 @@ parser_lex(yp_parser_t *parser) {
|
||||
}
|
||||
break;
|
||||
case '\\':
|
||||
if (peek_at(parser, 1) == '\n') {
|
||||
if (parser->heredoc_end) {
|
||||
parser->current.end = parser->heredoc_end;
|
||||
parser->heredoc_end = NULL;
|
||||
} else {
|
||||
yp_newline_list_append(&parser->newline_list, parser->current.end + 1);
|
||||
{
|
||||
size_t le_len = match_line_ending_at(parser, 1);
|
||||
if (le_len) {
|
||||
if (parser->heredoc_end) {
|
||||
parser->current.end = parser->heredoc_end;
|
||||
parser->heredoc_end = NULL;
|
||||
} else {
|
||||
parser->current.end += le_len + 1;
|
||||
yp_newline_list_append(&parser->newline_list, parser->current.end - 1);
|
||||
space_seen = true;
|
||||
}
|
||||
} else if (yp_char_is_inline_whitespace(*parser->current.end)) {
|
||||
parser->current.end += 2;
|
||||
space_seen = true;
|
||||
}
|
||||
} else if (peek_at(parser, 1) == '\r' && peek_at(parser, 2) == '\n') {
|
||||
if (parser->heredoc_end) {
|
||||
parser->current.end = parser->heredoc_end;
|
||||
parser->heredoc_end = NULL;
|
||||
} else {
|
||||
yp_newline_list_append(&parser->newline_list, parser->current.end + 2);
|
||||
parser->current.end += 3;
|
||||
space_seen = true;
|
||||
chomping = false;
|
||||
}
|
||||
} else if (yp_char_is_inline_whitespace(*parser->current.end)) {
|
||||
parser->current.end += 2;
|
||||
} else {
|
||||
chomping = false;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
@ -5376,21 +5397,26 @@ parser_lex(yp_parser_t *parser) {
|
||||
lexed_comment = true;
|
||||
}
|
||||
/* fallthrough */
|
||||
case '\r': {
|
||||
// The only way you can have carriage returns in this particular loop
|
||||
// is if you have a carriage return followed by a newline. In that
|
||||
// case we'll just skip over the carriage return and continue lexing,
|
||||
// in order to make it so that the newline token encapsulates both the
|
||||
// carriage return and the newline. Note that we need to check that
|
||||
// we haven't already lexed a comment here because that falls through
|
||||
// into here as well.
|
||||
if (!lexed_comment) parser->current.end++;
|
||||
}
|
||||
/* fallthrough */
|
||||
case '\n': {
|
||||
if (parser->heredoc_end == NULL) {
|
||||
yp_newline_list_check_append(&parser->newline_list, parser->current.end - 1);
|
||||
} else {
|
||||
case '\r':
|
||||
case '\n':
|
||||
{
|
||||
size_t le_len = match_line_ending_addr(parser, parser->current.end - 1);
|
||||
if (le_len) {
|
||||
// The only way you can have carriage returns in this particular loop
|
||||
// is if you have a carriage return followed by a newline. In that
|
||||
// case we'll just skip over the carriage return and continue lexing,
|
||||
// in order to make it so that the newline token encapsulates both the
|
||||
// carriage return and the newline. Note that we need to check that
|
||||
// we haven't already lexed a comment here because that falls through
|
||||
// into here as well.
|
||||
if (!lexed_comment) parser->current.end += le_len - 1 ; // skip CR
|
||||
|
||||
if (parser->heredoc_end == NULL) {
|
||||
yp_newline_list_append(&parser->newline_list, parser->current.end - 1);
|
||||
}
|
||||
}
|
||||
|
||||
if (parser->heredoc_end) {
|
||||
parser_flush_heredoc_end(parser);
|
||||
}
|
||||
|
||||
@ -6183,15 +6209,14 @@ parser_lex(yp_parser_t *parser) {
|
||||
if (!parser->encoding.alnum_char(parser->current.end, parser->end - parser->current.end)) {
|
||||
lex_mode_push_string(parser, true, false, lex_mode_incrementor(*parser->current.end), lex_mode_terminator(*parser->current.end));
|
||||
|
||||
if (*parser->current.end == '\r') {
|
||||
size_t le_len = match_line_ending(parser);
|
||||
if (le_len) {
|
||||
parser->current.end += le_len;
|
||||
yp_newline_list_append(&parser->newline_list, parser->current.end - 1);
|
||||
} else {
|
||||
parser->current.end++;
|
||||
}
|
||||
|
||||
if (peek(parser) == '\n') {
|
||||
yp_newline_list_append(&parser->newline_list, parser->current.end);
|
||||
}
|
||||
|
||||
parser->current.end++;
|
||||
if (parser->current.end < parser->end) {
|
||||
LEX(YP_TOKEN_STRING_BEGIN);
|
||||
}
|
||||
@ -6345,10 +6370,7 @@ parser_lex(yp_parser_t *parser) {
|
||||
((parser->current.end - parser->current.start) == 7) &&
|
||||
current_token_starts_line(parser) &&
|
||||
(strncmp(parser->current.start, "__END__", 7) == 0) &&
|
||||
(
|
||||
parser->current.end == parser->end ||
|
||||
peek(parser) == '\n' ||
|
||||
(peek(parser) == '\r' && peek_at(parser, 1) == '\n'))
|
||||
(parser->current.end == parser->end || match_line_ending(parser))
|
||||
)
|
||||
{
|
||||
parser->current.end = parser->end;
|
||||
@ -6675,12 +6697,11 @@ parser_lex(yp_parser_t *parser) {
|
||||
|
||||
// Otherwise we need to switch back to the parent lex mode and
|
||||
// return the end of the string.
|
||||
if (peek(parser) == '\r' && peek_at(parser, 1) == '\n') {
|
||||
parser->current.end = breakpoint + 2;
|
||||
yp_newline_list_append(&parser->newline_list, breakpoint + 1);
|
||||
size_t le_len = match_line_ending_addr(parser, breakpoint);
|
||||
if (le_len) {
|
||||
parser->current.end = breakpoint + le_len;
|
||||
yp_newline_list_append(&parser->newline_list, parser->current.end - 1);
|
||||
} else {
|
||||
yp_newline_list_check_append(&parser->newline_list, parser->current.end);
|
||||
|
||||
parser->current.end = breakpoint + 1;
|
||||
}
|
||||
|
||||
@ -6791,13 +6812,10 @@ parser_lex(yp_parser_t *parser) {
|
||||
bool matched = true;
|
||||
bool at_end = false;
|
||||
|
||||
if (peek_addr(parser, start + ident_length) == '\n') {
|
||||
parser->current.end = start + ident_length + 1;
|
||||
yp_newline_list_append(&parser->newline_list, start + ident_length);
|
||||
} else if (peek_addr(parser, start + ident_length) == '\r' &&
|
||||
peek_addr(parser, start + ident_length + 1) == '\n') {
|
||||
parser->current.end = start + ident_length + 2;
|
||||
yp_newline_list_append(&parser->newline_list, start + ident_length + 1);
|
||||
size_t le_len = match_line_ending_addr(parser, start + ident_length);
|
||||
if (le_len) {
|
||||
parser->current.end = start + ident_length + le_len;
|
||||
yp_newline_list_append(&parser->newline_list, parser->current.end - 1);
|
||||
} else if (parser->end == (start + ident_length)) {
|
||||
parser->current.end = start + ident_length;
|
||||
at_end = true;
|
||||
@ -6862,20 +6880,11 @@ parser_lex(yp_parser_t *parser) {
|
||||
(start + ident_length <= parser->end) &&
|
||||
(strncmp(start, ident_start, ident_length) == 0)
|
||||
) {
|
||||
// Heredoc terminators must be followed by a newline or EOF to be valid.
|
||||
if (start + ident_length == parser->end ||
|
||||
peek_addr(parser, start + ident_length) == '\n')
|
||||
{
|
||||
parser->current.end = breakpoint + 1;
|
||||
LEX(YP_TOKEN_STRING_CONTENT);
|
||||
}
|
||||
|
||||
// They can also be followed by a carriage return and then a
|
||||
// newline. Be sure here that we don't accidentally read off the
|
||||
// end.
|
||||
if (peek_addr(parser, start + ident_length) == '\r' &&
|
||||
peek_addr(parser, start + ident_length + 1) == '\n')
|
||||
{
|
||||
// Heredoc terminators must be followed by a newline, CRLF, or EOF to be valid.
|
||||
if (
|
||||
start + ident_length == parser->end ||
|
||||
match_line_ending_addr(parser, start + ident_length)
|
||||
) {
|
||||
parser->current.end = breakpoint + 1;
|
||||
LEX(YP_TOKEN_STRING_CONTENT);
|
||||
}
|
||||
@ -6893,11 +6902,9 @@ parser_lex(yp_parser_t *parser) {
|
||||
// stop looping before the newline and not after the
|
||||
// newline so that we can still potentially find the
|
||||
// terminator of the heredoc.
|
||||
if (peek_addr(parser, breakpoint + 1) == '\n') {
|
||||
breakpoint++;
|
||||
} else if (peek_addr(parser, breakpoint + 1) == '\r' &&
|
||||
peek_addr(parser, breakpoint + 2) == '\n') {
|
||||
breakpoint += 2;
|
||||
size_t le_len = match_line_ending_addr(parser, breakpoint + 1);
|
||||
if (le_len) {
|
||||
breakpoint += le_len;
|
||||
} else {
|
||||
yp_unescape_type_t unescape_type = (quote == YP_HEREDOC_QUOTE_SINGLE) ? YP_UNESCAPE_MINIMAL : YP_UNESCAPE_ALL;
|
||||
size_t difference = yp_unescape_calculate_difference(breakpoint, parser->end, unescape_type, false, &parser->error_list);
|
||||
@ -9183,8 +9190,10 @@ parse_heredoc_common_whitespace(yp_parser_t *parser, yp_node_list_t *nodes) {
|
||||
|
||||
while (cur_char && cur_char < content_loc->end) {
|
||||
// Any empty newlines aren't included in the minimum whitespace calculation
|
||||
while (cur_char < content_loc->end && *cur_char == '\n') cur_char++;
|
||||
while (cur_char + 1 < content_loc->end && *cur_char == '\r' && cur_char[1] == '\n') cur_char += 2;
|
||||
size_t le_len;
|
||||
while ((le_len = match_line_ending_addr(parser, cur_char))) {
|
||||
cur_char += le_len;
|
||||
}
|
||||
|
||||
if (cur_char == content_loc->end) break;
|
||||
|
||||
@ -9202,8 +9211,9 @@ parse_heredoc_common_whitespace(yp_parser_t *parser, yp_node_list_t *nodes) {
|
||||
// If we hit a newline, then we have encountered a line that contains
|
||||
// only whitespace, and it shouldn't be considered in the calculation of
|
||||
// common leading whitespace.
|
||||
if (*cur_char == '\n') {
|
||||
cur_char++;
|
||||
le_len = match_line_ending_addr(parser, cur_char);
|
||||
if (le_len) {
|
||||
cur_char += le_len;
|
||||
continue;
|
||||
}
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user