[ruby/yarp] refactor: extract peek_addr()

In many places in the code we use the idiom:

    x < parser->end && *x == 'y'

which is essentially an extension of an existing pattern:

- `peek()` looks at `parser->current.end`
- `peek_at()` looks at `(parser->current.end + offset)`

This commit introduces a new inline function, `peek_addr`, which
accepts a pointer and encapsulates the address value check and
conditional dereferencing. The result is more readable code, and more
ubiquitous safety checks on pointer values, allowing us to rewrite the
above as:

    peek_addr(parser, x) == 'y'

Also:

- change the type of `peek_at()`'s offset argument from `size_t` to
  `ptrdiff_t` so that it can accept negative offsets.
- use `current_token_starts_line` in one place where the equivalent
  code is inline.
- use `peek` or `peek_at` to replace inline code in a few places

These changes simplify the code and make it easier to visually spot
patterns, particularly around line-endings (which will be a subject of
a future pull request).

https://github.com/ruby/yarp/commit/4c608d53ea
This commit is contained in:
Mike Dalessio 2023-08-22 17:59:06 -04:00 committed by git
parent 58c1ebb634
commit f902df128d

View File

@ -4157,27 +4157,30 @@ yp_do_loop_stack_p(yp_parser_t *parser) {
/* Lexer check helpers */
/******************************************************************************/
// Get the next character in the source starting from +cursor+. If that position is beyond the end
// of the source then return '\0'.
static inline char
peek_addr(yp_parser_t *parser, const char *cursor) {
if (cursor < parser->end) {
return *cursor;
} else {
return '\0';
}
}
// Get the next character in the source starting from parser->current.end and
// adding the given offset. If that position is beyond the end of the source
// then return '\0'.
static inline char
peek_at(yp_parser_t *parser, size_t offset) {
if (parser->current.end + offset < parser->end) {
return parser->current.end[offset];
} else {
return '\0';
}
peek_at(yp_parser_t *parser, ptrdiff_t offset) {
return peek_addr(parser, parser->current.end + offset);
}
// Get the next character in the source starting from parser->current.end. If
// that position is beyond the end of the source then return '\0'.
static inline char
peek(yp_parser_t *parser) {
if (parser->current.end < parser->end) {
return *parser->current.end;
} else {
return '\0';
}
return peek_addr(parser, parser->current.end);
}
// Get the next string of length len in the source starting from parser->current.end.
@ -4518,7 +4521,7 @@ static yp_token_type_t
lex_numeric_prefix(yp_parser_t *parser) {
yp_token_type_t type = YP_TOKEN_INTEGER;
if (parser->current.end[-1] == '0') {
if (peek_at(parser, -1) == '0') {
switch (*parser->current.end) {
// 0d1111 is a decimal number
case 'd':
@ -4601,7 +4604,7 @@ lex_numeric_prefix(yp_parser_t *parser) {
// If the last character that we consumed was an underscore, then this is
// actually an invalid integer value, and we should return an invalid token.
if (parser->current.end[-1] == '_') {
if (peek_at(parser, -1) == '_') {
yp_diagnostic_list_append(&parser->error_list, parser->current.start, parser->current.end, "Number literal cannot end with a `_`.");
}
@ -5083,7 +5086,7 @@ lex_question_mark(yp_parser_t *parser) {
// an underscore. We check for this case
if (
!(parser->encoding.alnum_char(parser->current.end, parser->end - parser->current.end) ||
*parser->current.end == '_') ||
peek(parser) == '_') ||
(
(parser->current.end + encoding_width >= parser->end) ||
!char_is_identifier(parser, parser->current.end + encoding_width)
@ -5356,7 +5359,7 @@ parser_lex(yp_parser_t *parser) {
case '#': { // comments
const char *ending = next_newline(parser->current.end, parser->end - parser->current.end);
while (ending && ending < parser->end && *ending != '\n') {
while (ending && peek_addr(parser, ending) != '\n') {
ending = next_newline(ending + 1, parser->end - ending);
}
@ -5446,7 +5449,11 @@ parser_lex(yp_parser_t *parser) {
// If the lex state was ignored, or we hit a '.' or a '&.',
// we will lex the ignored newline
if (lex_state_ignored_p(parser) || (following && ((following[0] == '.') || (following + 1 < parser->end && following[0] == '&' && following[1] == '.')))) {
if (lex_state_ignored_p(parser) ||
(following && (
(peek_addr(parser, following) == '.') ||
(peek_addr(parser, following) == '&' && peek_addr(parser, following + 1) == '.')
))) {
if (!lexed_comment) parser_lex_ignored_newline(parser);
lexed_comment = false;
goto lex_next_token;
@ -5459,7 +5466,7 @@ parser_lex(yp_parser_t *parser) {
// To match ripper, we need to emit an ignored newline even though
// its a real newline in the case that we have a beginless range
// on a subsequent line.
if ((next_content + 1 < parser->end) && (next_content[1] == '.')) {
if (peek_addr(parser, next_content + 1) == '.') {
if (!lexed_comment) parser_lex_ignored_newline(parser);
lex_state_set(parser, YP_LEX_STATE_BEG);
parser->command_start = true;
@ -5477,7 +5484,7 @@ parser_lex(yp_parser_t *parser) {
// If we hit a &. after a newline, then we're in a call chain and
// we need to return the call operator.
if (next_content + 1 < parser->end && next_content[0] == '&' && next_content[1] == '.') {
if (peek_addr(parser, next_content) == '&' && peek_addr(parser, next_content + 1) == '.') {
if (!lexed_comment) parser_lex_ignored_newline(parser);
lex_state_set(parser, YP_LEX_STATE_DOT);
parser->current.start = next_content;
@ -6098,13 +6105,13 @@ parser_lex(yp_parser_t *parser) {
LEX(YP_TOKEN_COLON_COLON);
}
if (lex_state_end_p(parser) || yp_char_is_whitespace(*parser->current.end) || (*parser->current.end == '#')) {
if (lex_state_end_p(parser) || yp_char_is_whitespace(*parser->current.end) || peek(parser) == '#') {
lex_state_set(parser, YP_LEX_STATE_BEG);
LEX(YP_TOKEN_COLON);
}
if ((*parser->current.end == '"') || (*parser->current.end == '\'')) {
lex_mode_push_string(parser, *parser->current.end == '"', false, '\0', *parser->current.end);
if (peek(parser) == '"' || peek(parser) == '\'') {
lex_mode_push_string(parser, peek(parser) == '"', false, '\0', *parser->current.end);
parser->current.end++;
}
@ -6173,7 +6180,7 @@ parser_lex(yp_parser_t *parser) {
}
else if(
lex_state_beg_p(parser) ||
(lex_state_p(parser, YP_LEX_STATE_FITEM) && (*parser->current.end == 's')) ||
(lex_state_p(parser, YP_LEX_STATE_FITEM) && (peek(parser) == 's')) ||
lex_state_spcarg_p(parser, space_seen)
) {
if (!parser->encoding.alnum_char(parser->current.end, parser->end - parser->current.end)) {
@ -6341,8 +6348,12 @@ parser_lex(yp_parser_t *parser) {
((parser->current.end - parser->current.start) == 7) &&
current_token_starts_line(parser) &&
(strncmp(parser->current.start, "__END__", 7) == 0) &&
(parser->current.end == parser->end || *parser->current.end == '\n' || (*parser->current.end == '\r' && parser->current.end[1] == '\n'))
) {
(
parser->current.end == parser->end ||
peek(parser) == '\n' ||
(peek(parser) == '\r' && peek_at(parser, 1) == '\n'))
)
{
parser->current.end = parser->end;
parser->current.type = YP_TOKEN___END__;
parser_lex_callback(parser);
@ -6399,7 +6410,7 @@ parser_lex(yp_parser_t *parser) {
if ((whitespace = yp_strspn_whitespace_newlines(parser->current.end, parser->end - parser->current.end, &parser->newline_list, should_stop)) > 0) {
parser->current.end += whitespace;
if (parser->current.end[-1] == '\n') {
if (peek_at(parser, -1) == '\n') {
// mutates next_start
parser_flush_heredoc_end(parser);
}
@ -6667,7 +6678,7 @@ parser_lex(yp_parser_t *parser) {
// Otherwise we need to switch back to the parent lex mode and
// return the end of the string.
if (*parser->current.end == '\r' && parser->current.end + 1 < parser->end && parser->current.end[1] == '\n') {
if (peek(parser) == '\r' && peek_at(parser, 1) == '\n') {
parser->current.end = breakpoint + 2;
yp_newline_list_append(&parser->newline_list, breakpoint + 1);
} else {
@ -6773,7 +6784,7 @@ parser_lex(yp_parser_t *parser) {
// If we are immediately following a newline and we have hit the
// terminator, then we need to return the ending of the heredoc.
if (parser->current.start[-1] == '\n') {
if (current_token_starts_line(parser)) {
const char *start = parser->current.start;
if (parser->lex_modes.current->as.heredoc.indent != YP_HEREDOC_INDENT_NONE) {
start += yp_strspn_inline_whitespace(start, parser->end - start);
@ -6783,10 +6794,11 @@ parser_lex(yp_parser_t *parser) {
bool matched = true;
bool at_end = false;
if ((start + ident_length < parser->end) && (start[ident_length] == '\n')) {
if (peek_addr(parser, start + ident_length) == '\n') {
parser->current.end = start + ident_length + 1;
yp_newline_list_append(&parser->newline_list, start + ident_length);
} else if ((start + ident_length + 1 < parser->end) && (start[ident_length] == '\r') && (start[ident_length + 1] == '\n')) {
} else if (peek_addr(parser, start + ident_length) == '\r' &&
peek_addr(parser, start + ident_length + 1) == '\n') {
parser->current.end = start + ident_length + 2;
yp_newline_list_append(&parser->newline_list, start + ident_length + 1);
} else if (parser->end == (start + ident_length)) {
@ -6854,7 +6866,9 @@ parser_lex(yp_parser_t *parser) {
(strncmp(start, ident_start, ident_length) == 0)
) {
// Heredoc terminators must be followed by a newline or EOF to be valid.
if (start + ident_length == parser->end || start[ident_length] == '\n') {
if (start + ident_length == parser->end ||
peek_addr(parser, start + ident_length) == '\n')
{
parser->current.end = breakpoint + 1;
LEX(YP_TOKEN_STRING_CONTENT);
}
@ -6862,11 +6876,9 @@ parser_lex(yp_parser_t *parser) {
// They can also be followed by a carriage return and then a
// newline. Be sure here that we don't accidentally read off the
// end.
if (
(start + ident_length + 1 < parser->end) &&
(start[ident_length] == '\r') &&
(start[ident_length + 1] == '\n')
) {
if (peek_addr(parser, start + ident_length) == '\r' &&
peek_addr(parser, start + ident_length + 1) == '\n')
{
parser->current.end = breakpoint + 1;
LEX(YP_TOKEN_STRING_CONTENT);
}
@ -6884,9 +6896,10 @@ parser_lex(yp_parser_t *parser) {
// stop looping before the newline and not after the
// newline so that we can still potentially find the
// terminator of the heredoc.
if (breakpoint + 1 < parser->end && breakpoint[1] == '\n') {
if (peek_addr(parser, breakpoint + 1) == '\n') {
breakpoint++;
} else if (breakpoint + 2 < parser->end && breakpoint[1] == '\r' && breakpoint[2] == '\n') {
} else if (peek_addr(parser, breakpoint + 1) == '\r' &&
peek_addr(parser, breakpoint + 2) == '\n') {
breakpoint += 2;
} else {
yp_unescape_type_t unescape_type = (quote == YP_HEREDOC_QUOTE_SINGLE) ? YP_UNESCAPE_MINIMAL : YP_UNESCAPE_ALL;