[ruby/prism] Simplify the ASCII-only check in regexp
https://github.com/ruby/prism/commit/b0935be727
This commit is contained in:
parent
dd24d88473
commit
cfcbbbd87c
@ -5950,7 +5950,7 @@ parse_symbol_encoding(const pm_parser_t *parser, const pm_string_t *contents) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
static pm_node_flags_t
|
static pm_node_flags_t
|
||||||
parse_and_validate_regular_expression_encoding_modifier(pm_parser_t *parser, const pm_string_t *source, const pm_string_t *contents, pm_node_flags_t flags, char modifier, const pm_encoding_t *modifier_encoding) {
|
parse_and_validate_regular_expression_encoding_modifier(pm_parser_t *parser, const pm_string_t *source, bool ascii_only, pm_node_flags_t flags, char modifier, const pm_encoding_t *modifier_encoding) {
|
||||||
assert ((modifier == 'n' && modifier_encoding == PM_ENCODING_ASCII_8BIT_ENTRY) ||
|
assert ((modifier == 'n' && modifier_encoding == PM_ENCODING_ASCII_8BIT_ENTRY) ||
|
||||||
(modifier == 'u' && modifier_encoding == PM_ENCODING_UTF_8_ENTRY) ||
|
(modifier == 'u' && modifier_encoding == PM_ENCODING_UTF_8_ENTRY) ||
|
||||||
(modifier == 'e' && modifier_encoding == PM_ENCODING_EUC_JP_ENTRY) ||
|
(modifier == 'e' && modifier_encoding == PM_ENCODING_EUC_JP_ENTRY) ||
|
||||||
@ -5961,7 +5961,6 @@ parse_and_validate_regular_expression_encoding_modifier(pm_parser_t *parser, con
|
|||||||
// If an ASCII-only string without character escapes is used with an encoding modifier, then resulting Regexp
|
// If an ASCII-only string without character escapes is used with an encoding modifier, then resulting Regexp
|
||||||
// has the modifier encoding, unless the ASCII-8BIT modifier is used, in which case the Regexp "downgrades" to
|
// has the modifier encoding, unless the ASCII-8BIT modifier is used, in which case the Regexp "downgrades" to
|
||||||
// the US-ASCII encoding.
|
// the US-ASCII encoding.
|
||||||
bool ascii_only = pm_ascii_only_p(contents);
|
|
||||||
if (ascii_only) {
|
if (ascii_only) {
|
||||||
return modifier == 'n' ? PM_REGULAR_EXPRESSION_FLAGS_FORCED_US_ASCII_ENCODING : flags;
|
return modifier == 'n' ? PM_REGULAR_EXPRESSION_FLAGS_FORCED_US_ASCII_ENCODING : flags;
|
||||||
}
|
}
|
||||||
@ -6011,7 +6010,7 @@ parse_and_validate_regular_expression_encoding_modifier(pm_parser_t *parser, con
|
|||||||
* may be explicitly set with an escape sequence.
|
* may be explicitly set with an escape sequence.
|
||||||
*/
|
*/
|
||||||
static pm_node_flags_t
|
static pm_node_flags_t
|
||||||
parse_and_validate_regular_expression_encoding(pm_parser_t *parser, const pm_string_t *source, const pm_string_t *contents, pm_node_flags_t flags) {
|
parse_and_validate_regular_expression_encoding(pm_parser_t *parser, const pm_string_t *source, bool ascii_only, pm_node_flags_t flags) {
|
||||||
// TODO (nirvdrum 22-Feb-2024): CRuby reports a special Regexp-specific error for invalid Unicode ranges. We either need to scan again or modify the "invalid Unicode escape sequence" message we already report.
|
// TODO (nirvdrum 22-Feb-2024): CRuby reports a special Regexp-specific error for invalid Unicode ranges. We either need to scan again or modify the "invalid Unicode escape sequence" message we already report.
|
||||||
bool valid_unicode_range = true;
|
bool valid_unicode_range = true;
|
||||||
if (parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY && !valid_unicode_range) {
|
if (parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY && !valid_unicode_range) {
|
||||||
@ -6021,7 +6020,7 @@ parse_and_validate_regular_expression_encoding(pm_parser_t *parser, const pm_str
|
|||||||
|
|
||||||
// US-ASCII strings do not admit multi-byte character literals. However, character escape sequences corresponding
|
// US-ASCII strings do not admit multi-byte character literals. However, character escape sequences corresponding
|
||||||
// to multi-byte characters are allowed.
|
// to multi-byte characters are allowed.
|
||||||
if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY && parser->explicit_encoding == NULL && !pm_ascii_only_p(contents)) {
|
if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY && parser->explicit_encoding == NULL && !ascii_only) {
|
||||||
// CRuby will continue processing even though a SyntaxError has already been detected. It may result in the
|
// CRuby will continue processing even though a SyntaxError has already been detected. It may result in the
|
||||||
// following error message appearing twice. We do the same for compatibility.
|
// following error message appearing twice. We do the same for compatibility.
|
||||||
PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_INVALID_MULTIBYTE_CHAR, parser->encoding->name);
|
PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_INVALID_MULTIBYTE_CHAR, parser->encoding->name);
|
||||||
@ -6036,25 +6035,25 @@ parse_and_validate_regular_expression_encoding(pm_parser_t *parser, const pm_str
|
|||||||
*/
|
*/
|
||||||
|
|
||||||
if (flags & PM_REGULAR_EXPRESSION_FLAGS_ASCII_8BIT) {
|
if (flags & PM_REGULAR_EXPRESSION_FLAGS_ASCII_8BIT) {
|
||||||
return parse_and_validate_regular_expression_encoding_modifier(parser, source, contents, flags, 'n', PM_ENCODING_ASCII_8BIT_ENTRY);
|
return parse_and_validate_regular_expression_encoding_modifier(parser, source, ascii_only, flags, 'n', PM_ENCODING_ASCII_8BIT_ENTRY);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (flags & PM_REGULAR_EXPRESSION_FLAGS_UTF_8) {
|
if (flags & PM_REGULAR_EXPRESSION_FLAGS_UTF_8) {
|
||||||
return parse_and_validate_regular_expression_encoding_modifier(parser, source, contents, flags, 'u', PM_ENCODING_UTF_8_ENTRY);
|
return parse_and_validate_regular_expression_encoding_modifier(parser, source, ascii_only, flags, 'u', PM_ENCODING_UTF_8_ENTRY);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (flags & PM_REGULAR_EXPRESSION_FLAGS_EUC_JP) {
|
if (flags & PM_REGULAR_EXPRESSION_FLAGS_EUC_JP) {
|
||||||
return parse_and_validate_regular_expression_encoding_modifier(parser, source, contents, flags, 'e', PM_ENCODING_EUC_JP_ENTRY);
|
return parse_and_validate_regular_expression_encoding_modifier(parser, source, ascii_only, flags, 'e', PM_ENCODING_EUC_JP_ENTRY);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (flags & PM_REGULAR_EXPRESSION_FLAGS_WINDOWS_31J) {
|
if (flags & PM_REGULAR_EXPRESSION_FLAGS_WINDOWS_31J) {
|
||||||
return parse_and_validate_regular_expression_encoding_modifier(parser, source, contents, flags, 's', PM_ENCODING_WINDOWS_31J_ENTRY);
|
return parse_and_validate_regular_expression_encoding_modifier(parser, source, ascii_only, flags, 's', PM_ENCODING_WINDOWS_31J_ENTRY);
|
||||||
}
|
}
|
||||||
|
|
||||||
// At this point no encoding modifiers will be present on the regular expression as they would have already
|
// At this point no encoding modifiers will be present on the regular expression as they would have already
|
||||||
// been processed. Ruby stipulates that all source files must use an ASCII-compatible encoding. Thus, all
|
// been processed. Ruby stipulates that all source files must use an ASCII-compatible encoding. Thus, all
|
||||||
// regular expressions without an encoding modifier appearing in source are eligible for "downgrading" to US-ASCII.
|
// regular expressions without an encoding modifier appearing in source are eligible for "downgrading" to US-ASCII.
|
||||||
if (pm_ascii_only_p(contents)) {
|
if (ascii_only) {
|
||||||
return PM_REGULAR_EXPRESSION_FLAGS_FORCED_US_ASCII_ENCODING;
|
return PM_REGULAR_EXPRESSION_FLAGS_FORCED_US_ASCII_ENCODING;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -10594,10 +10593,10 @@ parser_lex(pm_parser_t *parser) {
|
|||||||
parser->current.end++;
|
parser->current.end++;
|
||||||
if (peek(parser) != '\n') {
|
if (peek(parser) != '\n') {
|
||||||
if (lex_mode->as.regexp.terminator != '\r') {
|
if (lex_mode->as.regexp.terminator != '\r') {
|
||||||
pm_regexp_token_buffer_push_byte(&token_buffer, '\\');
|
pm_token_buffer_push_byte(&token_buffer.base, '\\');
|
||||||
}
|
}
|
||||||
pm_token_buffer_push_byte(&token_buffer.base, '\r');
|
|
||||||
pm_regexp_token_buffer_push_byte(&token_buffer, '\r');
|
pm_regexp_token_buffer_push_byte(&token_buffer, '\r');
|
||||||
|
pm_token_buffer_push_byte(&token_buffer.base, '\r');
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
/* fallthrough */
|
/* fallthrough */
|
||||||
@ -10621,7 +10620,7 @@ parser_lex(pm_parser_t *parser) {
|
|||||||
case 'M':
|
case 'M':
|
||||||
case 'u':
|
case 'u':
|
||||||
case 'x':
|
case 'x':
|
||||||
escape_read(parser, &token_buffer.base.buffer, &token_buffer.regexp_buffer, PM_ESCAPE_FLAG_REGEXP);
|
escape_read(parser, &token_buffer.regexp_buffer, &token_buffer.base.buffer, PM_ESCAPE_FLAG_REGEXP);
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
if (lex_mode->as.regexp.terminator == peeked) {
|
if (lex_mode->as.regexp.terminator == peeked) {
|
||||||
@ -10632,19 +10631,19 @@ parser_lex(pm_parser_t *parser) {
|
|||||||
case '$': case ')': case '*': case '+':
|
case '$': case ')': case '*': case '+':
|
||||||
case '.': case '>': case '?': case ']':
|
case '.': case '>': case '?': case ']':
|
||||||
case '^': case '|': case '}':
|
case '^': case '|': case '}':
|
||||||
pm_regexp_token_buffer_push_byte(&token_buffer, '\\');
|
pm_token_buffer_push_byte(&token_buffer.base, '\\');
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
pm_token_buffer_push_byte(&token_buffer.base, peeked);
|
|
||||||
pm_regexp_token_buffer_push_byte(&token_buffer, peeked);
|
pm_regexp_token_buffer_push_byte(&token_buffer, peeked);
|
||||||
|
pm_token_buffer_push_byte(&token_buffer.base, peeked);
|
||||||
parser->current.end++;
|
parser->current.end++;
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (peeked < 0x80) pm_regexp_token_buffer_push_byte(&token_buffer, '\\');
|
if (peeked < 0x80) pm_token_buffer_push_byte(&token_buffer.base, '\\');
|
||||||
pm_regexp_token_buffer_push_escaped(&token_buffer, parser);
|
pm_regexp_token_buffer_push_escaped(&token_buffer, parser);
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@ -17159,17 +17158,17 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
|
|||||||
pm_token_t content = parser->current;
|
pm_token_t content = parser->current;
|
||||||
|
|
||||||
pm_string_t source = parser->current_regular_expression_source;
|
pm_string_t source = parser->current_regular_expression_source;
|
||||||
|
bool ascii_only = pm_ascii_only_p(&source);
|
||||||
pm_string_constant_init(&parser->current_regular_expression_source, "", 0);
|
pm_string_constant_init(&parser->current_regular_expression_source, "", 0);
|
||||||
|
|
||||||
parser_lex(parser);
|
parser_lex(parser);
|
||||||
|
|
||||||
// If we hit an end, then we can create a regular expression node
|
// If we hit an end, then we can create a regular expression node
|
||||||
// without interpolation, which can be represented more succinctly and
|
// without interpolation, which can be represented more succinctly and
|
||||||
// more easily compiled.
|
// more easily compiled.
|
||||||
if (accept1(parser, PM_TOKEN_REGEXP_END)) {
|
if (accept1(parser, PM_TOKEN_REGEXP_END)) {
|
||||||
pm_node_t *node = (pm_node_t *) pm_regular_expression_node_create_unescaped(parser, &opening, &content, &parser->previous, &source);
|
pm_node_t *node = (pm_node_t *) pm_regular_expression_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped);
|
||||||
pm_node_flag_set(node, parse_and_validate_regular_expression_encoding(parser, &source, &unescaped, node->flags));
|
pm_node_flag_set(node, parse_and_validate_regular_expression_encoding(parser, &unescaped, ascii_only, node->flags));
|
||||||
pm_string_free(&unescaped);
|
pm_string_free(&source);
|
||||||
return node;
|
return node;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -17179,9 +17178,9 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
|
|||||||
|
|
||||||
pm_token_t opening = not_provided(parser);
|
pm_token_t opening = not_provided(parser);
|
||||||
pm_token_t closing = not_provided(parser);
|
pm_token_t closing = not_provided(parser);
|
||||||
pm_node_t *part = (pm_node_t *) pm_string_node_create_unescaped(parser, &opening, &parser->previous, &closing, &source);
|
pm_node_t *part = (pm_node_t *) pm_string_node_create_unescaped(parser, &opening, &parser->previous, &closing, &unescaped);
|
||||||
pm_interpolated_regular_expression_node_append(interpolated, part);
|
pm_interpolated_regular_expression_node_append(interpolated, part);
|
||||||
pm_string_free(&unescaped);
|
pm_string_free(&source);
|
||||||
} else {
|
} else {
|
||||||
// If the first part of the body of the regular expression is not a
|
// If the first part of the body of the regular expression is not a
|
||||||
// string content, then we have interpolation and we need to create an
|
// string content, then we have interpolation and we need to create an
|
||||||
@ -17197,11 +17196,9 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b
|
|||||||
pm_token_t opening = not_provided(parser);
|
pm_token_t opening = not_provided(parser);
|
||||||
pm_token_t closing = not_provided(parser);
|
pm_token_t closing = not_provided(parser);
|
||||||
|
|
||||||
pm_node_t *node = (pm_node_t *) pm_string_node_create_unescaped(parser, &opening, &parser->current, &closing, &parser->current_regular_expression_source);
|
pm_node_t *node = (pm_node_t *) pm_string_node_create_current_string(parser, &opening, &parser->current, &closing);
|
||||||
pm_node_flag_set(node, parse_unescaped_encoding(parser));
|
pm_node_flag_set(node, parse_unescaped_encoding(parser));
|
||||||
|
pm_string_free(&parser->current_regular_expression_source);
|
||||||
pm_string_free(&parser->current_string);
|
|
||||||
pm_string_constant_init(&parser->current_regular_expression_source, "", 0);
|
|
||||||
|
|
||||||
parser_lex(parser);
|
parser_lex(parser);
|
||||||
pm_interpolated_regular_expression_node_append(interpolated, node);
|
pm_interpolated_regular_expression_node_append(interpolated, node);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user