[ruby/prism] Provide ability to lock encoding while parsing

https://github.com/ruby/prism/commit/f7faedfb3f
2024-06-10 13:39:16 -04:00 · 2024-06-10 13:39:16 -04:00 · d827d32527
commit d827d32527
parent 0d5eea9957
8 changed files with 54 additions and 7 deletions
--- a/lib/prism/ffi.rb
+++ b/lib/prism/ffi.rb
@ -431,6 +431,9 @@ module Prism
      template << "C"
      values << { nil => 0, "3.3.0" => 1, "3.3.1" => 1, "3.4.0" => 0, "latest" => 0 }.fetch(options[:version])

+      template << "C"
+      values << (options[:encoding] == false ? 1 : 0)
+
      template << "L"
      if (scopes = options[:scopes])
        values << scopes.length
--- a/lib/prism/translation/parser.rb
+++ b/lib/prism/translation/parser.rb
@ -51,7 +51,7 @@ module Prism
        source = source_buffer.source

        offset_cache = build_offset_cache(source)
-        result = unwrap(Prism.parse(source, filepath: source_buffer.name, version: convert_for_prism(version), scopes: [[]]), offset_cache)
+        result = unwrap(Prism.parse(source, filepath: source_buffer.name, version: convert_for_prism(version), scopes: [[]], encoding: false), offset_cache)

        build_ast(result.value, offset_cache)
      ensure
@ -64,7 +64,7 @@ module Prism
        source = source_buffer.source

        offset_cache = build_offset_cache(source)
-        result = unwrap(Prism.parse(source, filepath: source_buffer.name, version: convert_for_prism(version), scopes: [[]]), offset_cache)
+        result = unwrap(Prism.parse(source, filepath: source_buffer.name, version: convert_for_prism(version), scopes: [[]], encoding: false), offset_cache)

        [
          build_ast(result.value, offset_cache),
@ -83,7 +83,7 @@ module Prism
        offset_cache = build_offset_cache(source)
        result =
          begin
-            unwrap(Prism.parse_lex(source, filepath: source_buffer.name, version: convert_for_prism(version), scopes: [[]]), offset_cache)
+            unwrap(Prism.parse_lex(source, filepath: source_buffer.name, version: convert_for_prism(version), scopes: [[]], encoding: false), offset_cache)
          rescue ::Parser::SyntaxError
            raise if !recover
          end
--- a/prism/extension.c
+++ b/prism/extension.c
@ -138,7 +138,13 @@ build_options_i(VALUE key, VALUE value, VALUE argument) {
    if (key_id == rb_id_option_filepath) {
        if (!NIL_P(value)) pm_options_filepath_set(options, check_string(value));
    } else if (key_id == rb_id_option_encoding) {
-        if (!NIL_P(value)) pm_options_encoding_set(options, rb_enc_name(rb_to_encoding(value)));
+        if (!NIL_P(value)) {
+            if (value == Qfalse) {
+                pm_options_encoding_locked_set(options, true);
+            } else {
+                pm_options_encoding_set(options, rb_enc_name(rb_to_encoding(value)));
+            }
+        }
    } else if (key_id == rb_id_option_line) {
        if (!NIL_P(value)) pm_options_line_set(options, NUM2INT(value));
    } else if (key_id == rb_id_option_frozen_string_literal) {
@ -206,6 +212,7 @@ build_options(VALUE argument) {
 static void
 extract_options(pm_options_t *options, VALUE filepath, VALUE keywords) {
    options->line = 1; // default
+
    if (!NIL_P(keywords)) {
        struct build_options_data data = { .options = options, .keywords = keywords };
        struct build_options_data *argument = &data;
--- a/prism/options.c
+++ b/prism/options.c
@ -16,6 +16,14 @@ pm_options_encoding_set(pm_options_t *options, const char *encoding) {
    pm_string_constant_init(&options->encoding, encoding, strlen(encoding));
 }

+/**
+ * Set whether the given options struct ignores encoding magic comments.
+ */
+PRISM_EXPORTED_FUNCTION void
+pm_options_encoding_locked_set(pm_options_t *options, bool encoding_locked) {
+    options->encoding_locked = encoding_locked;
+}
+
 /**
 * Set the line option on the given options struct.
 */
@ -215,6 +223,7 @@ pm_options_read(pm_options_t *options, const char *data) {
    options->frozen_string_literal = (int8_t) *data++;
    options->command_line = (uint8_t) *data++;
    options->version = (pm_options_version_t) *data++;
+    options->encoding_locked = ((uint8_t) *data++) > 0;

    uint32_t scopes_count = pm_options_read_u32(data);
    data += 4;
--- a/prism/options.h
+++ b/prism/options.h
@ -103,6 +103,13 @@ typedef struct {
    *  - PM_OPTIONS_FROZEN_STRING_LITERAL_UNSET
    */
    int8_t frozen_string_literal;
+
+    /**
+     * Whether or not encoding magic comments should be ignored. This is a
+     * niche use-case where you want to parse a file with a specific encoding
+     * but ignore any encoding magic comments at the top of the file.
+     */
+    bool encoding_locked;
 } pm_options_t;

 /**
@ -166,6 +173,14 @@ PRISM_EXPORTED_FUNCTION void pm_options_line_set(pm_options_t *options, int32_t
 */
 PRISM_EXPORTED_FUNCTION void pm_options_encoding_set(pm_options_t *options, const char *encoding);

+/**
+ * Set the encoding_locked option on the given options struct.
+ *
+ * @param options The options struct to set the encoding_locked value on.
+ * @param encoding_locked Whether encoding magic comments should be ignored.
+ */
+PRISM_EXPORTED_FUNCTION void pm_options_encoding_locked_set(pm_options_t *options, bool encoding_locked);
+
 /**
 * Set the frozen string literal option on the given options struct.
 *
--- a/prism/parser.h
+++ b/prism/parser.h
@ -860,6 +860,14 @@ struct pm_parser {
    /** Whether or not we're currently recovering from a syntax error. */
    bool recovering;

+    /**
+     * When true, encoding magic comments are ignored. This is very specialized
+     * behavior whose main use case is translating into the whitequark/parser
+     * AST, which re-encodes source files in UTF-8 before they are parsed and
+     * ignores encoding comments.
+     */
+    bool encoding_locked;
+
    /**
     * Whether or not the encoding has been changed by a magic comment. We use
     * this to provide a fast path for the lexer instead of going through the
--- a/prism/prism.c
+++ b/prism/prism.c
@ -8261,7 +8261,7 @@ parser_lex_magic_comment(pm_parser_t *parser, bool semantic_token_seen) {

        // We only want to attempt to compare against encoding comments if it's
        // the first line in the file (or the second in the case of a shebang).
-        if (parser->current.start == parser->encoding_comment_start) {
+        if (parser->current.start == parser->encoding_comment_start && !parser->encoding_locked) {
            if (
                (key_length == 8 && pm_strncasecmp(key_source, (const uint8_t *) "encoding", 8) == 0) ||
                (key_length == 6 && pm_strncasecmp(key_source, (const uint8_t *) "coding", 6) == 0)
@ -10438,7 +10438,9 @@ parser_lex(pm_parser_t *parser) {
                        // pass and we're at the start of the file, then we need
                        // to do another pass to potentially find other patterns
                        // for encoding comments.
-                        if (length >= 10) parser_lex_magic_comment_encoding(parser);
+                        if (length >= 10 && !parser->encoding_locked) {
+                            parser_lex_magic_comment_encoding(parser);
+                        }
                    }

                    lexed_comment = true;
@ -21244,6 +21246,7 @@ pm_parser_init(pm_parser_t *parser, const uint8_t *source, size_t size, const pm
        .parsing_eval = false,
        .command_start = true,
        .recovering = false,
+        .encoding_locked = false,
        .encoding_changed = false,
        .pattern_matching_newlines = false,
        .in_keyword_arg = false,
@ -21291,6 +21294,9 @@ pm_parser_init(pm_parser_t *parser, const uint8_t *source, size_t size, const pm
            parser_lex_magic_comment_encoding_value(parser, encoding_source, encoding_source + encoding_length);
        }

+        // encoding_locked option
+        parser->encoding_locked = options->encoding_locked;
+
        // frozen_string_literal option
        parser->frozen_string_literal = options->frozen_string_literal;

--- a/test/prism/ruby/parser_test.rb
+++ b/test/prism/ruby/parser_test.rb
@ -60,7 +60,6 @@ module Prism
    # skip them for now.
    skip_all = skip_incorrect | [
      "regex.txt",
-      "regex_char_width.txt",
      "unescaping.txt",
      "seattlerb/bug190.txt",
      "seattlerb/heredoc_with_extra_carriage_returns_windows.txt",