[ruby/prism] Introduce char_is_identifier_utf8

https://github.com/ruby/prism/commit/5f43e57b0f
This commit is contained in:
Haldun Bayhantopcu 2023-11-27 22:57:46 +01:00 committed by git
parent 031e81c8f3
commit 32b5f5be7c

View File

@ -5907,6 +5907,19 @@ char_is_identifier_start(pm_parser_t *parser, const uint8_t *b) {
}
}
/**
 * UTF-8-only counterpart of char_is_identifier, for use when the source
 * encoding has not been changed from the default.
 *
 * Reads the byte at `b` unconditionally, so the caller must guarantee that
 * `b` points at a readable byte (i.e. `b < end`).
 *
 * Returns the number of bytes to consume as part of an identifier, or 0 if
 * the byte at `b` cannot continue an identifier:
 *   - bytes below 0x80 yield 1 when the byte is '_' or is flagged with
 *     PRISM_ENCODING_ALPHANUMERIC_BIT in pm_encoding_unicode_table, else 0;
 *   - bytes at or above 0x80 always yield 1.
 */
static inline size_t
char_is_identifier_utf8(const uint8_t *b, const uint8_t *end) {
    const uint8_t byte = *b;

    if (byte >= 0x80) {
        // The logical OR with the non-zero constant 1u makes this expression
        // evaluate to 1 regardless of what the alnum lookup returns, so the
        // caller advances one byte at a time through non-ASCII sequences.
        // NOTE(review): the lookup result is effectively discarded; confirm
        // whether the character width was meant to be returned instead.
        return (size_t) (pm_encoding_utf_8_alnum_char(b, end - b) || 1u);
    }

    // Underscore is accepted without consulting the table.
    if (byte == '_') {
        return 1;
    }

    return (pm_encoding_unicode_table[byte] & PRISM_ENCODING_ALPHANUMERIC_BIT) ? 1 : 0;
}
/**
* Like the above, this function is also used extremely frequently to lex all of
* the identifiers in a source file once the first character has been found. So
@ -5925,11 +5938,8 @@ char_is_identifier(pm_parser_t *parser, const uint8_t *b) {
} else {
return 0;
}
} else if (*b < 0x80) {
return (pm_encoding_unicode_table[*b] & PRISM_ENCODING_ALPHANUMERIC_BIT ? 1 : 0) || (*b == '_');
} else {
return (size_t) (pm_encoding_utf_8_alnum_char(b, parser->end - b) || 1u);
}
return char_is_identifier_utf8(b, parser->end);
}
// Here we're defining a perfect hash for the characters that are allowed in
@ -7003,9 +7013,16 @@ lex_identifier(pm_parser_t *parser, bool previous_command_start) {
const uint8_t *end = parser->end;
const uint8_t *current_start = parser->current.start;
const uint8_t *current_end = parser->current.end;
bool encoding_changed = parser->encoding_changed;
while (current_end < end && (width = char_is_identifier(parser, current_end)) > 0) {
current_end += width;
if (encoding_changed) {
while (current_end < end && (width = char_is_identifier(parser, current_end)) > 0) {
current_end += width;
}
} else {
while (current_end < end && (width = char_is_identifier_utf8(current_end, end)) > 0) {
current_end += width;
}
}
parser->current.end = current_end;
@ -7123,7 +7140,7 @@ lex_identifier(pm_parser_t *parser, bool previous_command_start) {
}
}
if (parser->encoding_changed) {
if (encoding_changed) {
return parser->encoding.isupper_char(current_start, end - current_start) ? PM_TOKEN_CONSTANT : PM_TOKEN_IDENTIFIER;
}
return pm_encoding_utf_8_isupper_char(current_start, end - current_start) ? PM_TOKEN_CONSTANT : PM_TOKEN_IDENTIFIER;