diff --git a/prism/prism.c b/prism/prism.c
index 96ed3989e2..1751857e1e 100644
--- a/prism/prism.c
+++ b/prism/prism.c
@@ -5907,6 +5907,19 @@ char_is_identifier_start(pm_parser_t *parser, const uint8_t *b) {
     }
 }
 
+/**
+ * Similar to char_is_identifier but this function assumes that the encoding
+ * has not been changed.
+ */
+static inline size_t
+char_is_identifier_utf8(const uint8_t *b, const uint8_t *end) {
+    if (*b < 0x80) {
+        return (*b == '_') || (pm_encoding_unicode_table[*b] & PRISM_ENCODING_ALPHANUMERIC_BIT ? 1 : 0);
+    } else {
+        return (size_t) (pm_encoding_utf_8_alnum_char(b, end - b) || 1u);
+    }
+}
+
 /**
  * Like the above, this function is also used extremely frequently to lex all of
  * the identifiers in a source file once the first character has been found. So
@@ -5925,11 +5938,8 @@ char_is_identifier(pm_parser_t *parser, const uint8_t *b) {
         } else {
             return 0;
         }
-    } else if (*b < 0x80) {
-        return (pm_encoding_unicode_table[*b] & PRISM_ENCODING_ALPHANUMERIC_BIT ? 1 : 0) || (*b == '_');
-    } else {
-        return (size_t) (pm_encoding_utf_8_alnum_char(b, parser->end - b) || 1u);
     }
+    return char_is_identifier_utf8(b, parser->end);
 }
 
 // Here we're defining a perfect hash for the characters that are allowed in
@@ -7003,9 +7013,16 @@ lex_identifier(pm_parser_t *parser, bool previous_command_start) {
     const uint8_t *end = parser->end;
     const uint8_t *current_start = parser->current.start;
     const uint8_t *current_end = parser->current.end;
+    bool encoding_changed = parser->encoding_changed;
 
-    while (current_end < end && (width = char_is_identifier(parser, current_end)) > 0) {
-        current_end += width;
+    if (encoding_changed) {
+        while (current_end < end && (width = char_is_identifier(parser, current_end)) > 0) {
+            current_end += width;
+        }
+    } else {
+        while (current_end < end && (width = char_is_identifier_utf8(current_end, end)) > 0) {
+            current_end += width;
+        }
     }
     parser->current.end = current_end;
 
@@ -7123,7 +7140,7 @@ lex_identifier(pm_parser_t *parser, bool previous_command_start) {
         }
     }
 
-    if (parser->encoding_changed) {
+    if (encoding_changed) {
         return parser->encoding.isupper_char(current_start, end - current_start) ? PM_TOKEN_CONSTANT : PM_TOKEN_IDENTIFIER;
     }
     return pm_encoding_utf_8_isupper_char(current_start, end - current_start) ? PM_TOKEN_CONSTANT : PM_TOKEN_IDENTIFIER;