[ruby/prism] Introduce char_is_identifier_utf8

https://github.com/ruby/prism/commit/5f43e57b0f
2023-11-27 22:57:46 +01:00 · 2023-11-27 22:57:46 +01:00 · 32b5f5be7c
commit 32b5f5be7c
parent 031e81c8f3
1 changed files with 24 additions and 7 deletions
--- a/prism/prism.c
+++ b/prism/prism.c
@ -5907,6 +5907,19 @@ char_is_identifier_start(pm_parser_t *parser, const uint8_t *b) {
    }
 }

+/**
+ * Similar to char_is_identifier but this function assumes that the encoding
+ * has not been changed.
+ *
+ * The byte at `b` is read unconditionally, so the caller must guarantee that
+ * `b < end` (the loop in lex_identifier checks this before every call).
+ *
+ * For a byte below 0x80 the result is 1 when the byte is '_' or when its entry
+ * in pm_encoding_unicode_table has PRISM_ENCODING_ALPHANUMERIC_BIT set, and 0
+ * otherwise.
+ *
+ * For a byte of 0x80 or above the result is always 1: the value returned by
+ * pm_encoding_utf_8_alnum_char(b, end - b) is combined with the non-zero
+ * constant 1u using logical OR, which evaluates to 1 whatever the call
+ * returns.
+ *
+ * NOTE(review): if the intent of the non-ASCII branch was "the width reported
+ * by pm_encoding_utf_8_alnum_char, or 1 as a fallback", the `||` does not
+ * express that -- confirm whether always advancing by one byte is deliberate.
+ */
+static inline size_t
+char_is_identifier_utf8(const uint8_t *b, const uint8_t *end) {
+    if (*b < 0x80) {
+        return (*b == '_') || (pm_encoding_unicode_table[*b] & PRISM_ENCODING_ALPHANUMERIC_BIT ? 1 : 0);
+    } else {
+        return (size_t) (pm_encoding_utf_8_alnum_char(b, end - b) || 1u);
+    }
+}
+
 /**
 * Like the above, this function is also used extremely frequently to lex all of
 * the identifiers in a source file once the first character has been found. So
@ -5925,11 +5938,8 @@ char_is_identifier(pm_parser_t *parser, const uint8_t *b) {
        } else {
            return 0;
        }
-    } else if (*b < 0x80) {
-        return (pm_encoding_unicode_table[*b] & PRISM_ENCODING_ALPHANUMERIC_BIT ? 1 : 0) || (*b == '_');
-    } else {
-        return (size_t) (pm_encoding_utf_8_alnum_char(b, parser->end - b) || 1u);
    }
+    return char_is_identifier_utf8(b, parser->end);
 }

 // Here we're defining a perfect hash for the characters that are allowed in
@ -7003,10 +7013,17 @@ lex_identifier(pm_parser_t *parser, bool previous_command_start) {
    const uint8_t *end = parser->end;
    const uint8_t *current_start = parser->current.start;
    const uint8_t *current_end = parser->current.end;
+    bool encoding_changed = parser->encoding_changed;

+    if (encoding_changed) {
        while (current_end < end && (width = char_is_identifier(parser, current_end)) > 0) {
            current_end += width;
        }
+    } else {
+        while (current_end < end && (width = char_is_identifier_utf8(current_end, end)) > 0) {
+            current_end += width;
+        }
+    }
    parser->current.end = current_end;

    // Now cache the length of the identifier so that we can quickly compare it
@ -7123,7 +7140,7 @@ lex_identifier(pm_parser_t *parser, bool previous_command_start) {
        }
    }

-    if (parser->encoding_changed) {
+    if (encoding_changed) {
        return parser->encoding.isupper_char(current_start, end - current_start) ? PM_TOKEN_CONSTANT : PM_TOKEN_IDENTIFIER;
    }
    return pm_encoding_utf_8_isupper_char(current_start, end - current_start) ? PM_TOKEN_CONSTANT : PM_TOKEN_IDENTIFIER;