From d827d3252786a5e1153f4e8bfa30f40a2aaafb95 Mon Sep 17 00:00:00 2001
From: Kevin Newton <kddnewton@gmail.com>
Date: Mon, 10 Jun 2024 13:39:16 -0400
Subject: [PATCH] [ruby/prism] Provide ability to lock encoding while parsing

https://github.com/ruby/prism/commit/f7faedfb3f
---
 lib/prism/ffi.rb                |  3 +++
 lib/prism/translation/parser.rb |  6 +++---
 prism/extension.c               |  9 ++++++++-
 prism/options.c                 |  9 +++++++++
 prism/options.h                 | 15 +++++++++++++++
 prism/parser.h                  |  8 ++++++++
 prism/prism.c                   | 10 ++++++++--
 test/prism/ruby/parser_test.rb  |  1 -
 8 files changed, 54 insertions(+), 7 deletions(-)

diff --git a/lib/prism/ffi.rb b/lib/prism/ffi.rb
index 6b48af43cc..46c4a1a755 100644
--- a/lib/prism/ffi.rb
+++ b/lib/prism/ffi.rb
@@ -431,6 +431,9 @@ module Prism
       template << "C"
       values << { nil => 0, "3.3.0" => 1, "3.3.1" => 1, "3.4.0" => 0, "latest" => 0 }.fetch(options[:version])
 
+      template << "C"
+      values << (options[:encoding] == false ? 1 : 0)
+
       template << "L"
       if (scopes = options[:scopes])
         values << scopes.length
diff --git a/lib/prism/translation/parser.rb b/lib/prism/translation/parser.rb
index 3748fc896e..8c7eb3aa75 100644
--- a/lib/prism/translation/parser.rb
+++ b/lib/prism/translation/parser.rb
@@ -51,7 +51,7 @@ module Prism
         source = source_buffer.source
 
         offset_cache = build_offset_cache(source)
-        result = unwrap(Prism.parse(source, filepath: source_buffer.name, version: convert_for_prism(version), scopes: [[]]), offset_cache)
+        result = unwrap(Prism.parse(source, filepath: source_buffer.name, version: convert_for_prism(version), scopes: [[]], encoding: false), offset_cache)
 
         build_ast(result.value, offset_cache)
       ensure
@@ -64,7 +64,7 @@ module Prism
         source = source_buffer.source
 
         offset_cache = build_offset_cache(source)
-        result = unwrap(Prism.parse(source, filepath: source_buffer.name, version: convert_for_prism(version), scopes: [[]]), offset_cache)
+        result = unwrap(Prism.parse(source, filepath: source_buffer.name, version: convert_for_prism(version), scopes: [[]], encoding: false), offset_cache)
 
         [
           build_ast(result.value, offset_cache),
@@ -83,7 +83,7 @@ module Prism
         offset_cache = build_offset_cache(source)
         result =
           begin
-            unwrap(Prism.parse_lex(source, filepath: source_buffer.name, version: convert_for_prism(version), scopes: [[]]), offset_cache)
+            unwrap(Prism.parse_lex(source, filepath: source_buffer.name, version: convert_for_prism(version), scopes: [[]], encoding: false), offset_cache)
           rescue ::Parser::SyntaxError
             raise if !recover
           end
diff --git a/prism/extension.c b/prism/extension.c
index 091cac79ce..1fb858491e 100644
--- a/prism/extension.c
+++ b/prism/extension.c
@@ -138,7 +138,13 @@ build_options_i(VALUE key, VALUE value, VALUE argument) {
     if (key_id == rb_id_option_filepath) {
         if (!NIL_P(value)) pm_options_filepath_set(options, check_string(value));
     } else if (key_id == rb_id_option_encoding) {
-        if (!NIL_P(value)) pm_options_encoding_set(options, rb_enc_name(rb_to_encoding(value)));
+        if (!NIL_P(value)) {
+            if (value == Qfalse) {
+                pm_options_encoding_locked_set(options, true);
+            } else {
+                pm_options_encoding_set(options, rb_enc_name(rb_to_encoding(value)));
+            }
+        }
     } else if (key_id == rb_id_option_line) {
         if (!NIL_P(value)) pm_options_line_set(options, NUM2INT(value));
     } else if (key_id == rb_id_option_frozen_string_literal) {
@@ -206,6 +212,7 @@ build_options(VALUE argument) {
 static void
 extract_options(pm_options_t *options, VALUE filepath, VALUE keywords) {
     options->line = 1; // default
+
     if (!NIL_P(keywords)) {
         struct build_options_data data = { .options = options, .keywords = keywords };
         struct build_options_data *argument = &data;
diff --git a/prism/options.c b/prism/options.c
index 664db4f061..2ab2f260fd 100644
--- a/prism/options.c
+++ b/prism/options.c
@@ -16,6 +16,14 @@ pm_options_encoding_set(pm_options_t *options, const char *encoding) {
     pm_string_constant_init(&options->encoding, encoding, strlen(encoding));
 }
 
+/**
+ * Set whether the encoding is locked (encoding magic comments are ignored).
+ */
+PRISM_EXPORTED_FUNCTION void
+pm_options_encoding_locked_set(pm_options_t *options, bool encoding_locked) {
+    options->encoding_locked = encoding_locked;
+}
+
 /**
  * Set the line option on the given options struct.
  */
@@ -215,6 +223,7 @@ pm_options_read(pm_options_t *options, const char *data) {
     options->frozen_string_literal = (int8_t) *data++;
     options->command_line = (uint8_t) *data++;
     options->version = (pm_options_version_t) *data++;
+    options->encoding_locked = ((uint8_t) *data++) > 0;
 
     uint32_t scopes_count = pm_options_read_u32(data);
     data += 4;
diff --git a/prism/options.h b/prism/options.h
index a623ae0b83..784769f880 100644
--- a/prism/options.h
+++ b/prism/options.h
@@ -103,6 +103,13 @@ typedef struct {
     *  - PM_OPTIONS_FROZEN_STRING_LITERAL_UNSET
     */
     int8_t frozen_string_literal;
+
+    /**
+     * Whether or not encoding magic comments should be ignored. This is a niche
+     * use-case where you want to parse a file with a specific encoding and
+     * disregard any encoding magic comments at the top of the file.
+     */
+    bool encoding_locked;
 } pm_options_t;
 
 /**
@@ -166,6 +173,14 @@ PRISM_EXPORTED_FUNCTION void pm_options_line_set(pm_options_t *options, int32_t
  */
 PRISM_EXPORTED_FUNCTION void pm_options_encoding_set(pm_options_t *options, const char *encoding);
 
+/**
+ * Set the encoding_locked option on the given options struct.
+ *
+ * @param options The options struct to set the encoding_locked value on.
+ * @param encoding_locked True to ignore encoding magic comments while parsing.
+ */
+PRISM_EXPORTED_FUNCTION void pm_options_encoding_locked_set(pm_options_t *options, bool encoding_locked);
+
 /**
  * Set the frozen string literal option on the given options struct.
  *
diff --git a/prism/parser.h b/prism/parser.h
index c5f8ab9df4..048955409b 100644
--- a/prism/parser.h
+++ b/prism/parser.h
@@ -860,6 +860,14 @@ struct pm_parser {
     /** Whether or not we're currently recovering from a syntax error. */
     bool recovering;
 
+    /**
+     * This is very specialized behavior for when you want to parse in a context
+     * that does not respect encoding comments. Its main use case is translating
+     * into the whitequark/parser AST which re-encodes source files in UTF-8
+     * before they are parsed and ignores encoding comments.
+     */
+    bool encoding_locked;
+
     /**
      * Whether or not the encoding has been changed by a magic comment. We use
      * this to provide a fast path for the lexer instead of going through the
diff --git a/prism/prism.c b/prism/prism.c
index 197bebc97b..bba357a5b1 100644
--- a/prism/prism.c
+++ b/prism/prism.c
@@ -8261,7 +8261,7 @@ parser_lex_magic_comment(pm_parser_t *parser, bool semantic_token_seen) {
 
         // We only want to attempt to compare against encoding comments if it's
         // the first line in the file (or the second in the case of a shebang).
-        if (parser->current.start == parser->encoding_comment_start) {
+        if (parser->current.start == parser->encoding_comment_start && !parser->encoding_locked) {
             if (
                 (key_length == 8 && pm_strncasecmp(key_source, (const uint8_t *) "encoding", 8) == 0) ||
                 (key_length == 6 && pm_strncasecmp(key_source, (const uint8_t *) "coding", 6) == 0)
@@ -10438,7 +10438,9 @@ parser_lex(pm_parser_t *parser) {
                         // pass and we're at the start of the file, then we need
                         // to do another pass to potentially find other patterns
                         // for encoding comments.
-                        if (length >= 10) parser_lex_magic_comment_encoding(parser);
+                        if (length >= 10 && !parser->encoding_locked) {
+                            parser_lex_magic_comment_encoding(parser);
+                        }
                     }
 
                     lexed_comment = true;
@@ -21244,6 +21246,7 @@ pm_parser_init(pm_parser_t *parser, const uint8_t *source, size_t size, const pm
         .parsing_eval = false,
         .command_start = true,
         .recovering = false,
+        .encoding_locked = false,
         .encoding_changed = false,
         .pattern_matching_newlines = false,
         .in_keyword_arg = false,
@@ -21291,6 +21294,9 @@ pm_parser_init(pm_parser_t *parser, const uint8_t *source, size_t size, const pm
             parser_lex_magic_comment_encoding_value(parser, encoding_source, encoding_source + encoding_length);
         }
 
+        // encoding_locked option
+        parser->encoding_locked = options->encoding_locked;
+
         // frozen_string_literal option
         parser->frozen_string_literal = options->frozen_string_literal;
 
diff --git a/test/prism/ruby/parser_test.rb b/test/prism/ruby/parser_test.rb
index 65535af0fd..a5cf919ae5 100644
--- a/test/prism/ruby/parser_test.rb
+++ b/test/prism/ruby/parser_test.rb
@@ -60,7 +60,6 @@ module Prism
     # skip them for now.
     skip_all = skip_incorrect | [
       "regex.txt",
-      "regex_char_width.txt",
       "unescaping.txt",
       "seattlerb/bug190.txt",
       "seattlerb/heredoc_with_extra_carriage_returns_windows.txt",