[ruby/prism] Simplify the regular expression ASCII-only check

https://github.com/ruby/prism/commit/b7851f4c2d
This commit is contained in:
Kevin Newton 2024-03-08 14:05:21 -05:00 committed by git
parent cfcbbbd87c
commit 35c20cda70
2 changed files with 22 additions and 32 deletions

View File

@ -663,17 +663,6 @@ struct pm_parser {
*/ */
pm_string_t current_string; pm_string_t current_string;
/**
* This string is used to pass information from the lexer to the parser. When
* processing regular expressions we must track the string source for the expression
* as well as its unescaped representation. In that case, `current_string` will hold
* the unescaped value while this field will hold the translated source value. There
* are some escape sequences in regular expressions that will cause the associated
* source string to have a different value than the content of the expression so we
* must track this state separately.
*/
pm_string_t current_regular_expression_source;
/** /**
* The line number at the start of the parse. This will be used to offset * The line number at the start of the parse. This will be used to offset
* the line numbers of all of the locations. * the line numbers of all of the locations.
@ -753,6 +742,12 @@ struct pm_parser {
* a true value. * a true value.
*/ */
bool frozen_string_literal; bool frozen_string_literal;
/**
* True if the current regular expression being lexed contains only ASCII
* characters.
*/
bool current_regular_expression_ascii_only;
}; };
#endif #endif

View File

@ -8931,6 +8931,15 @@ pm_regexp_token_buffer_push_escaped(pm_regexp_token_buffer_t *token_buffer, pm_p
parser->current.end += width; parser->current.end += width;
} }
/**
 * Check whether a slice of bytes consists solely of 7-bit ASCII values.
 *
 * Scans the `length` bytes starting at `value` and stops at the first byte
 * whose high bit is set. An empty slice (length of 0) is reported as
 * ASCII-only, and `value` is never dereferenced in that case.
 *
 * Returns true if no byte in the slice has its high bit set, false otherwise.
 */
static bool
pm_slice_ascii_only_p(const uint8_t *value, size_t length) {
    const uint8_t *cursor = value;

    // Count down instead of computing an end pointer so that no pointer
    // arithmetic happens at all when the slice is empty.
    for (size_t remaining = length; remaining > 0; remaining--) {
        if (*cursor > 0x7F) {
            return false;
        }
        cursor++;
    }

    return true;
}
/** /**
* When we're about to return from lexing the current token and we know for sure * When we're about to return from lexing the current token and we know for sure
* that we have found an escape sequence, this function is called to copy the * that we have found an escape sequence, this function is called to copy the
@ -8945,7 +8954,8 @@ pm_token_buffer_copy(pm_parser_t *parser, pm_token_buffer_t *token_buffer) {
static inline void static inline void
pm_regexp_token_buffer_copy(pm_parser_t *parser, pm_regexp_token_buffer_t *token_buffer) { pm_regexp_token_buffer_copy(pm_parser_t *parser, pm_regexp_token_buffer_t *token_buffer) {
pm_string_owned_init(&parser->current_string, (uint8_t *) pm_buffer_value(&token_buffer->base.buffer), pm_buffer_length(&token_buffer->base.buffer)); pm_string_owned_init(&parser->current_string, (uint8_t *) pm_buffer_value(&token_buffer->base.buffer), pm_buffer_length(&token_buffer->base.buffer));
pm_string_owned_init(&parser->current_regular_expression_source, (uint8_t *) pm_buffer_value(&token_buffer->regexp_buffer), pm_buffer_length(&token_buffer->regexp_buffer)); parser->current_regular_expression_ascii_only = pm_slice_ascii_only_p((const uint8_t *) pm_buffer_value(&token_buffer->regexp_buffer), pm_buffer_length(&token_buffer->regexp_buffer));
pm_buffer_free(&token_buffer->regexp_buffer);
} }
/** /**
@ -8971,7 +8981,7 @@ static void
pm_regexp_token_buffer_flush(pm_parser_t *parser, pm_regexp_token_buffer_t *token_buffer) { pm_regexp_token_buffer_flush(pm_parser_t *parser, pm_regexp_token_buffer_t *token_buffer) {
if (token_buffer->base.cursor == NULL) { if (token_buffer->base.cursor == NULL) {
pm_string_shared_init(&parser->current_string, parser->current.start, parser->current.end); pm_string_shared_init(&parser->current_string, parser->current.start, parser->current.end);
pm_string_shared_init(&parser->current_regular_expression_source, parser->current.start, parser->current.end); parser->current_regular_expression_ascii_only = pm_slice_ascii_only_p(parser->current.start, (size_t) (parser->current.end - parser->current.start));
} else { } else {
pm_buffer_append_bytes(&token_buffer->base.buffer, token_buffer->base.cursor, (size_t) (parser->current.end - token_buffer->base.cursor)); pm_buffer_append_bytes(&token_buffer->base.buffer, token_buffer->base.cursor, (size_t) (parser->current.end - token_buffer->base.cursor));
pm_buffer_append_bytes(&token_buffer->regexp_buffer, token_buffer->base.cursor, (size_t) (parser->current.end - token_buffer->base.cursor)); pm_buffer_append_bytes(&token_buffer->regexp_buffer, token_buffer->base.cursor, (size_t) (parser->current.end - token_buffer->base.cursor));
@ -17156,10 +17166,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
// regular expression) or if it's not then it has interpolation. // regular expression) or if it's not then it has interpolation.
pm_string_t unescaped = parser->current_string; pm_string_t unescaped = parser->current_string;
pm_token_t content = parser->current; pm_token_t content = parser->current;
bool ascii_only = parser->current_regular_expression_ascii_only;
pm_string_t source = parser->current_regular_expression_source;
bool ascii_only = pm_ascii_only_p(&source);
pm_string_constant_init(&parser->current_regular_expression_source, "", 0);
parser_lex(parser); parser_lex(parser);
// If we hit an end, then we can create a regular expression node // If we hit an end, then we can create a regular expression node
@ -17168,7 +17175,6 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
if (accept1(parser, PM_TOKEN_REGEXP_END)) { if (accept1(parser, PM_TOKEN_REGEXP_END)) {
pm_node_t *node = (pm_node_t *) pm_regular_expression_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped); pm_node_t *node = (pm_node_t *) pm_regular_expression_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped);
pm_node_flag_set(node, parse_and_validate_regular_expression_encoding(parser, &unescaped, ascii_only, node->flags)); pm_node_flag_set(node, parse_and_validate_regular_expression_encoding(parser, &unescaped, ascii_only, node->flags));
pm_string_free(&source);
return node; return node;
} }
@ -17180,7 +17186,6 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
pm_token_t closing = not_provided(parser); pm_token_t closing = not_provided(parser);
pm_node_t *part = (pm_node_t *) pm_string_node_create_unescaped(parser, &opening, &parser->previous, &closing, &unescaped); pm_node_t *part = (pm_node_t *) pm_string_node_create_unescaped(parser, &opening, &parser->previous, &closing, &unescaped);
pm_interpolated_regular_expression_node_append(interpolated, part); pm_interpolated_regular_expression_node_append(interpolated, part);
pm_string_free(&source);
} else { } else {
// If the first part of the body of the regular expression is not a // If the first part of the body of the regular expression is not a
// string content, then we have interpolation and we need to create an // string content, then we have interpolation and we need to create an
@ -17192,17 +17197,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
// parts into the list. // parts into the list.
pm_node_t *part; pm_node_t *part;
while (!match2(parser, PM_TOKEN_REGEXP_END, PM_TOKEN_EOF)) { while (!match2(parser, PM_TOKEN_REGEXP_END, PM_TOKEN_EOF)) {
if (match1(parser, PM_TOKEN_STRING_CONTENT)) { if ((part = parse_string_part(parser)) != NULL) {
pm_token_t opening = not_provided(parser);
pm_token_t closing = not_provided(parser);
pm_node_t *node = (pm_node_t *) pm_string_node_create_current_string(parser, &opening, &parser->current, &closing);
pm_node_flag_set(node, parse_unescaped_encoding(parser));
pm_string_free(&parser->current_regular_expression_source);
parser_lex(parser);
pm_interpolated_regular_expression_node_append(interpolated, node);
} else if ((part = parse_string_part(parser)) != NULL) {
pm_interpolated_regular_expression_node_append(interpolated, part); pm_interpolated_regular_expression_node_append(interpolated, part);
} }
} }
@ -18741,7 +18736,6 @@ pm_parser_init(pm_parser_t *parser, const uint8_t *source, size_t size, const pm
.newline_list = { 0 }, .newline_list = { 0 },
.integer_base = 0, .integer_base = 0,
.current_string = PM_STRING_EMPTY, .current_string = PM_STRING_EMPTY,
.current_regular_expression_source = PM_STRING_EMPTY,
.start_line = 1, .start_line = 1,
.explicit_encoding = NULL, .explicit_encoding = NULL,
.command_line = 0, .command_line = 0,
@ -18752,7 +18746,8 @@ pm_parser_init(pm_parser_t *parser, const uint8_t *source, size_t size, const pm
.in_keyword_arg = false, .in_keyword_arg = false,
.current_param_name = 0, .current_param_name = 0,
.semantic_token_seen = false, .semantic_token_seen = false,
.frozen_string_literal = false .frozen_string_literal = false,
.current_regular_expression_ascii_only = false
}; };
// Initialize the constant pool. We're going to completely guess as to the // Initialize the constant pool. We're going to completely guess as to the