[ruby/yarp] refactor: extract peek_addr()

In many places in the code we use the idiom:

    x < parser->end && *x == 'y'

which is essentially an extension of an existing pattern:

- `peek()` looks at `parser->current.end`
- `peek_at()` looks at `(parser->current.end + offset)`

This commit introduces a new inline function, `peek_addr`, which
accepts a pointer, checks it against `parser->end`, and dereferences
it only when it is in bounds (returning '\0' otherwise). The result is
more readable code and more consistent bounds checks on pointer
values, allowing us to rewrite the above as:

    peek_addr(parser, x) == 'y'

Also:

- change the type of `peek_at()`'s offset argument from `size_t` to
  `ptrdiff_t` so that it can accept negative offsets.
- use `current_token_starts_line` in one place where the equivalent
  code is inline.
- use `peek` or `peek_at` to replace inline code in a few places.

These changes simplify the code and make it easier to visually spot
patterns, particularly around line-endings (which will be a subject of
a future pull request).

https://github.com/ruby/yarp/commit/4c608d53ea
This commit is contained in:
Mike Dalessio 2023-08-22 17:59:06 -04:00 committed by git
parent 58c1ebb634
commit f902df128d

View File

@ -4157,27 +4157,30 @@ yp_do_loop_stack_p(yp_parser_t *parser) {
/* Lexer check helpers */ /* Lexer check helpers */
/******************************************************************************/ /******************************************************************************/
// Read the character that +cursor+ points at. When +cursor+ is at or past
// parser->end, yield '\0' instead of dereferencing it.
static inline char
peek_addr(yp_parser_t *parser, const char *cursor) {
    return (cursor < parser->end) ? *cursor : '\0';
}
// Get the next character in the source starting from parser->current.end and // Get the next character in the source starting from parser->current.end and
// adding the given offset. If that position is beyond the end of the source // adding the given offset. If that position is beyond the end of the source
// then return '\0'. // then return '\0'.
static inline char static inline char
peek_at(yp_parser_t *parser, size_t offset) { peek_at(yp_parser_t *parser, ptrdiff_t offset) {
if (parser->current.end + offset < parser->end) { return peek_addr(parser, parser->current.end + offset);
return parser->current.end[offset];
} else {
return '\0';
}
} }
// Get the next character in the source starting from parser->current.end. If // Get the next character in the source starting from parser->current.end. If
// that position is beyond the end of the source then return '\0'. // that position is beyond the end of the source then return '\0'.
static inline char static inline char
peek(yp_parser_t *parser) { peek(yp_parser_t *parser) {
if (parser->current.end < parser->end) { return peek_addr(parser, parser->current.end);
return *parser->current.end;
} else {
return '\0';
}
} }
// Get the next string of length len in the source starting from parser->current.end. // Get the next string of length len in the source starting from parser->current.end.
@ -4518,7 +4521,7 @@ static yp_token_type_t
lex_numeric_prefix(yp_parser_t *parser) { lex_numeric_prefix(yp_parser_t *parser) {
yp_token_type_t type = YP_TOKEN_INTEGER; yp_token_type_t type = YP_TOKEN_INTEGER;
if (parser->current.end[-1] == '0') { if (peek_at(parser, -1) == '0') {
switch (*parser->current.end) { switch (*parser->current.end) {
// 0d1111 is a decimal number // 0d1111 is a decimal number
case 'd': case 'd':
@ -4601,7 +4604,7 @@ lex_numeric_prefix(yp_parser_t *parser) {
// If the last character that we consumed was an underscore, then this is // If the last character that we consumed was an underscore, then this is
// actually an invalid integer value, and we should return an invalid token. // actually an invalid integer value, and we should return an invalid token.
if (parser->current.end[-1] == '_') { if (peek_at(parser, -1) == '_') {
yp_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.end, "Number literal cannot end with a `_`."); yp_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.end, "Number literal cannot end with a `_`.");
} }
@ -5083,7 +5086,7 @@ lex_question_mark(yp_parser_t *parser) {
// an underscore. We check for this case // an underscore. We check for this case
if ( if (
!(parser->encoding.alnum_char(parser->current.end, parser->end - parser->current.end) || !(parser->encoding.alnum_char(parser->current.end, parser->end - parser->current.end) ||
*parser->current.end == '_') || peek(parser) == '_') ||
( (
(parser->current.end + encoding_width >= parser->end) || (parser->current.end + encoding_width >= parser->end) ||
!char_is_identifier(parser, parser->current.end + encoding_width) !char_is_identifier(parser, parser->current.end + encoding_width)
@ -5356,7 +5359,7 @@ parser_lex(yp_parser_t *parser) {
case '#': { // comments case '#': { // comments
const char *ending = next_newline(parser->current.end, parser->end - parser->current.end); const char *ending = next_newline(parser->current.end, parser->end - parser->current.end);
while (ending && ending < parser->end && *ending != '\n') { while (ending && peek_addr(parser, ending) != '\n') {
ending = next_newline(ending + 1, parser->end - ending); ending = next_newline(ending + 1, parser->end - ending);
} }
@ -5446,7 +5449,11 @@ parser_lex(yp_parser_t *parser) {
// If the lex state was ignored, or we hit a '.' or a '&.', // If the lex state was ignored, or we hit a '.' or a '&.',
// we will lex the ignored newline // we will lex the ignored newline
if (lex_state_ignored_p(parser) || (following && ((following[0] == '.') || (following + 1 < parser->end && following[0] == '&' && following[1] == '.')))) { if (lex_state_ignored_p(parser) ||
(following && (
(peek_addr(parser, following) == '.') ||
(peek_addr(parser, following) == '&' && peek_addr(parser, following + 1) == '.')
))) {
if (!lexed_comment) parser_lex_ignored_newline(parser); if (!lexed_comment) parser_lex_ignored_newline(parser);
lexed_comment = false; lexed_comment = false;
goto lex_next_token; goto lex_next_token;
@ -5459,7 +5466,7 @@ parser_lex(yp_parser_t *parser) {
// To match ripper, we need to emit an ignored newline even though // To match ripper, we need to emit an ignored newline even though
// its a real newline in the case that we have a beginless range // its a real newline in the case that we have a beginless range
// on a subsequent line. // on a subsequent line.
if ((next_content + 1 < parser->end) && (next_content[1] == '.')) { if (peek_addr(parser, next_content + 1) == '.') {
if (!lexed_comment) parser_lex_ignored_newline(parser); if (!lexed_comment) parser_lex_ignored_newline(parser);
lex_state_set(parser, YP_LEX_STATE_BEG); lex_state_set(parser, YP_LEX_STATE_BEG);
parser->command_start = true; parser->command_start = true;
@ -5477,7 +5484,7 @@ parser_lex(yp_parser_t *parser) {
// If we hit a &. after a newline, then we're in a call chain and // If we hit a &. after a newline, then we're in a call chain and
// we need to return the call operator. // we need to return the call operator.
if (next_content + 1 < parser->end && next_content[0] == '&' && next_content[1] == '.') { if (peek_addr(parser, next_content) == '&' && peek_addr(parser, next_content + 1) == '.') {
if (!lexed_comment) parser_lex_ignored_newline(parser); if (!lexed_comment) parser_lex_ignored_newline(parser);
lex_state_set(parser, YP_LEX_STATE_DOT); lex_state_set(parser, YP_LEX_STATE_DOT);
parser->current.start = next_content; parser->current.start = next_content;
@ -6098,13 +6105,13 @@ parser_lex(yp_parser_t *parser) {
LEX(YP_TOKEN_COLON_COLON); LEX(YP_TOKEN_COLON_COLON);
} }
if (lex_state_end_p(parser) || yp_char_is_whitespace(*parser->current.end) || (*parser->current.end == '#')) { if (lex_state_end_p(parser) || yp_char_is_whitespace(*parser->current.end) || peek(parser) == '#') {
lex_state_set(parser, YP_LEX_STATE_BEG); lex_state_set(parser, YP_LEX_STATE_BEG);
LEX(YP_TOKEN_COLON); LEX(YP_TOKEN_COLON);
} }
if ((*parser->current.end == '"') || (*parser->current.end == '\'')) { if (peek(parser) == '"' || peek(parser) == '\'') {
lex_mode_push_string(parser, *parser->current.end == '"', false, '\0', *parser->current.end); lex_mode_push_string(parser, peek(parser) == '"', false, '\0', *parser->current.end);
parser->current.end++; parser->current.end++;
} }
@ -6173,7 +6180,7 @@ parser_lex(yp_parser_t *parser) {
} }
else if( else if(
lex_state_beg_p(parser) || lex_state_beg_p(parser) ||
(lex_state_p(parser, YP_LEX_STATE_FITEM) && (*parser->current.end == 's')) || (lex_state_p(parser, YP_LEX_STATE_FITEM) && (peek(parser) == 's')) ||
lex_state_spcarg_p(parser, space_seen) lex_state_spcarg_p(parser, space_seen)
) { ) {
if (!parser->encoding.alnum_char(parser->current.end, parser->end - parser->current.end)) { if (!parser->encoding.alnum_char(parser->current.end, parser->end - parser->current.end)) {
@ -6341,8 +6348,12 @@ parser_lex(yp_parser_t *parser) {
((parser->current.end - parser->current.start) == 7) && ((parser->current.end - parser->current.start) == 7) &&
current_token_starts_line(parser) && current_token_starts_line(parser) &&
(strncmp(parser->current.start, "__END__", 7) == 0) && (strncmp(parser->current.start, "__END__", 7) == 0) &&
(parser->current.end == parser->end || *parser->current.end == '\n' || (*parser->current.end == '\r' && parser->current.end[1] == '\n')) (
) { parser->current.end == parser->end ||
peek(parser) == '\n' ||
(peek(parser) == '\r' && peek_at(parser, 1) == '\n'))
)
{
parser->current.end = parser->end; parser->current.end = parser->end;
parser->current.type = YP_TOKEN___END__; parser->current.type = YP_TOKEN___END__;
parser_lex_callback(parser); parser_lex_callback(parser);
@ -6399,7 +6410,7 @@ parser_lex(yp_parser_t *parser) {
if ((whitespace = yp_strspn_whitespace_newlines(parser->current.end, parser->end - parser->current.end, &parser->newline_list, should_stop)) > 0) { if ((whitespace = yp_strspn_whitespace_newlines(parser->current.end, parser->end - parser->current.end, &parser->newline_list, should_stop)) > 0) {
parser->current.end += whitespace; parser->current.end += whitespace;
if (parser->current.end[-1] == '\n') { if (peek_at(parser, -1) == '\n') {
// mutates next_start // mutates next_start
parser_flush_heredoc_end(parser); parser_flush_heredoc_end(parser);
} }
@ -6667,7 +6678,7 @@ parser_lex(yp_parser_t *parser) {
// Otherwise we need to switch back to the parent lex mode and // Otherwise we need to switch back to the parent lex mode and
// return the end of the string. // return the end of the string.
if (*parser->current.end == '\r' && parser->current.end + 1 < parser->end && parser->current.end[1] == '\n') { if (peek(parser) == '\r' && peek_at(parser, 1) == '\n') {
parser->current.end = breakpoint + 2; parser->current.end = breakpoint + 2;
yp_newline_list_append(&parser->newline_list, breakpoint + 1); yp_newline_list_append(&parser->newline_list, breakpoint + 1);
} else { } else {
@ -6773,7 +6784,7 @@ parser_lex(yp_parser_t *parser) {
// If we are immediately following a newline and we have hit the // If we are immediately following a newline and we have hit the
// terminator, then we need to return the ending of the heredoc. // terminator, then we need to return the ending of the heredoc.
if (parser->current.start[-1] == '\n') { if (current_token_starts_line(parser)) {
const char *start = parser->current.start; const char *start = parser->current.start;
if (parser->lex_modes.current->as.heredoc.indent != YP_HEREDOC_INDENT_NONE) { if (parser->lex_modes.current->as.heredoc.indent != YP_HEREDOC_INDENT_NONE) {
start += yp_strspn_inline_whitespace(start, parser->end - start); start += yp_strspn_inline_whitespace(start, parser->end - start);
@ -6783,10 +6794,11 @@ parser_lex(yp_parser_t *parser) {
bool matched = true; bool matched = true;
bool at_end = false; bool at_end = false;
if ((start + ident_length < parser->end) && (start[ident_length] == '\n')) { if (peek_addr(parser, start + ident_length) == '\n') {
parser->current.end = start + ident_length + 1; parser->current.end = start + ident_length + 1;
yp_newline_list_append(&parser->newline_list, start + ident_length); yp_newline_list_append(&parser->newline_list, start + ident_length);
} else if ((start + ident_length + 1 < parser->end) && (start[ident_length] == '\r') && (start[ident_length + 1] == '\n')) { } else if (peek_addr(parser, start + ident_length) == '\r' &&
peek_addr(parser, start + ident_length + 1) == '\n') {
parser->current.end = start + ident_length + 2; parser->current.end = start + ident_length + 2;
yp_newline_list_append(&parser->newline_list, start + ident_length + 1); yp_newline_list_append(&parser->newline_list, start + ident_length + 1);
} else if (parser->end == (start + ident_length)) { } else if (parser->end == (start + ident_length)) {
@ -6854,7 +6866,9 @@ parser_lex(yp_parser_t *parser) {
(strncmp(start, ident_start, ident_length) == 0) (strncmp(start, ident_start, ident_length) == 0)
) { ) {
// Heredoc terminators must be followed by a newline or EOF to be valid. // Heredoc terminators must be followed by a newline or EOF to be valid.
if (start + ident_length == parser->end || start[ident_length] == '\n') { if (start + ident_length == parser->end ||
peek_addr(parser, start + ident_length) == '\n')
{
parser->current.end = breakpoint + 1; parser->current.end = breakpoint + 1;
LEX(YP_TOKEN_STRING_CONTENT); LEX(YP_TOKEN_STRING_CONTENT);
} }
@ -6862,11 +6876,9 @@ parser_lex(yp_parser_t *parser) {
// They can also be followed by a carriage return and then a // They can also be followed by a carriage return and then a
// newline. Be sure here that we don't accidentally read off the // newline. Be sure here that we don't accidentally read off the
// end. // end.
if ( if (peek_addr(parser, start + ident_length) == '\r' &&
(start + ident_length + 1 < parser->end) && peek_addr(parser, start + ident_length + 1) == '\n')
(start[ident_length] == '\r') && {
(start[ident_length + 1] == '\n')
) {
parser->current.end = breakpoint + 1; parser->current.end = breakpoint + 1;
LEX(YP_TOKEN_STRING_CONTENT); LEX(YP_TOKEN_STRING_CONTENT);
} }
@ -6884,9 +6896,10 @@ parser_lex(yp_parser_t *parser) {
// stop looping before the newline and not after the // stop looping before the newline and not after the
// newline so that we can still potentially find the // newline so that we can still potentially find the
// terminator of the heredoc. // terminator of the heredoc.
if (breakpoint + 1 < parser->end && breakpoint[1] == '\n') { if (peek_addr(parser, breakpoint + 1) == '\n') {
breakpoint++; breakpoint++;
} else if (breakpoint + 2 < parser->end && breakpoint[1] == '\r' && breakpoint[2] == '\n') { } else if (peek_addr(parser, breakpoint + 1) == '\r' &&
peek_addr(parser, breakpoint + 2) == '\n') {
breakpoint += 2; breakpoint += 2;
} else { } else {
yp_unescape_type_t unescape_type = (quote == YP_HEREDOC_QUOTE_SINGLE) ? YP_UNESCAPE_MINIMAL : YP_UNESCAPE_ALL; yp_unescape_type_t unescape_type = (quote == YP_HEREDOC_QUOTE_SINGLE) ? YP_UNESCAPE_MINIMAL : YP_UNESCAPE_ALL;