From d827d3252786a5e1153f4e8bfa30f40a2aaafb95 Mon Sep 17 00:00:00 2001 From: Kevin Newton Date: Mon, 10 Jun 2024 13:39:16 -0400 Subject: [PATCH] [ruby/prism] Provide ability to lock encoding while parsing https://github.com/ruby/prism/commit/f7faedfb3f --- lib/prism/ffi.rb | 3 +++ lib/prism/translation/parser.rb | 6 +++--- prism/extension.c | 9 ++++++++- prism/options.c | 9 +++++++++ prism/options.h | 15 +++++++++++++++ prism/parser.h | 8 ++++++++ prism/prism.c | 10 ++++++++-- test/prism/ruby/parser_test.rb | 1 - 8 files changed, 54 insertions(+), 7 deletions(-) diff --git a/lib/prism/ffi.rb b/lib/prism/ffi.rb index 6b48af43cc..46c4a1a755 100644 --- a/lib/prism/ffi.rb +++ b/lib/prism/ffi.rb @@ -431,6 +431,9 @@ module Prism template << "C" values << { nil => 0, "3.3.0" => 1, "3.3.1" => 1, "3.4.0" => 0, "latest" => 0 }.fetch(options[:version]) + template << "C" + values << (options[:encoding] == false ? 1 : 0) + template << "L" if (scopes = options[:scopes]) values << scopes.length diff --git a/lib/prism/translation/parser.rb b/lib/prism/translation/parser.rb index 3748fc896e..8c7eb3aa75 100644 --- a/lib/prism/translation/parser.rb +++ b/lib/prism/translation/parser.rb @@ -51,7 +51,7 @@ module Prism source = source_buffer.source offset_cache = build_offset_cache(source) - result = unwrap(Prism.parse(source, filepath: source_buffer.name, version: convert_for_prism(version), scopes: [[]]), offset_cache) + result = unwrap(Prism.parse(source, filepath: source_buffer.name, version: convert_for_prism(version), scopes: [[]], encoding: false), offset_cache) build_ast(result.value, offset_cache) ensure @@ -64,7 +64,7 @@ module Prism source = source_buffer.source offset_cache = build_offset_cache(source) - result = unwrap(Prism.parse(source, filepath: source_buffer.name, version: convert_for_prism(version), scopes: [[]]), offset_cache) + result = unwrap(Prism.parse(source, filepath: source_buffer.name, version: convert_for_prism(version), scopes: [[]], encoding: false), 
offset_cache) [ build_ast(result.value, offset_cache), @@ -83,7 +83,7 @@ module Prism offset_cache = build_offset_cache(source) result = begin - unwrap(Prism.parse_lex(source, filepath: source_buffer.name, version: convert_for_prism(version), scopes: [[]]), offset_cache) + unwrap(Prism.parse_lex(source, filepath: source_buffer.name, version: convert_for_prism(version), scopes: [[]], encoding: false), offset_cache) rescue ::Parser::SyntaxError raise if !recover end diff --git a/prism/extension.c b/prism/extension.c index 091cac79ce..1fb858491e 100644 --- a/prism/extension.c +++ b/prism/extension.c @@ -138,7 +138,13 @@ build_options_i(VALUE key, VALUE value, VALUE argument) { if (key_id == rb_id_option_filepath) { if (!NIL_P(value)) pm_options_filepath_set(options, check_string(value)); } else if (key_id == rb_id_option_encoding) { - if (!NIL_P(value)) pm_options_encoding_set(options, rb_enc_name(rb_to_encoding(value))); + if (!NIL_P(value)) { + if (value == Qfalse) { + pm_options_encoding_locked_set(options, true); + } else { + pm_options_encoding_set(options, rb_enc_name(rb_to_encoding(value))); + } + } } else if (key_id == rb_id_option_line) { if (!NIL_P(value)) pm_options_line_set(options, NUM2INT(value)); } else if (key_id == rb_id_option_frozen_string_literal) { @@ -206,6 +212,7 @@ build_options(VALUE argument) { static void extract_options(pm_options_t *options, VALUE filepath, VALUE keywords) { options->line = 1; // default + if (!NIL_P(keywords)) { struct build_options_data data = { .options = options, .keywords = keywords }; struct build_options_data *argument = &data; diff --git a/prism/options.c b/prism/options.c index 664db4f061..2ab2f260fd 100644 --- a/prism/options.c +++ b/prism/options.c @@ -16,6 +16,14 @@ pm_options_encoding_set(pm_options_t *options, const char *encoding) { pm_string_constant_init(&options->encoding, encoding, strlen(encoding)); } +/** + * Set the encoding_locked option on the given options struct. 
+ */ +PRISM_EXPORTED_FUNCTION void +pm_options_encoding_locked_set(pm_options_t *options, bool encoding_locked) { + options->encoding_locked = encoding_locked; +} + /** * Set the line option on the given options struct. */ @@ -215,6 +223,7 @@ pm_options_read(pm_options_t *options, const char *data) { options->frozen_string_literal = (int8_t) *data++; options->command_line = (uint8_t) *data++; options->version = (pm_options_version_t) *data++; + options->encoding_locked = ((uint8_t) *data++) > 0; uint32_t scopes_count = pm_options_read_u32(data); data += 4; diff --git a/prism/options.h b/prism/options.h index a623ae0b83..784769f880 100644 --- a/prism/options.h +++ b/prism/options.h @@ -103,6 +103,13 @@ typedef struct { * - PM_OPTIONS_FROZEN_STRING_LITERAL_UNSET */ int8_t frozen_string_literal; + + /** + * When true, encoding magic comments are ignored rather than respected. This + * is a niche use-case where you want to parse a file with a specific encoding + * but ignore any encoding magic comments at the top of the file. + */ + bool encoding_locked; } pm_options_t; /** @@ -166,6 +173,14 @@ PRISM_EXPORTED_FUNCTION void pm_options_line_set(pm_options_t *options, int32_t */ PRISM_EXPORTED_FUNCTION void pm_options_encoding_set(pm_options_t *options, const char *encoding); +/** + * Set the encoding_locked option on the given options struct. + * + * @param options The options struct to set the encoding_locked value on. + * @param encoding_locked The encoding_locked value to set. + */ +PRISM_EXPORTED_FUNCTION void pm_options_encoding_locked_set(pm_options_t *options, bool encoding_locked); + /** * Set the frozen string literal option on the given options struct. * diff --git a/prism/parser.h b/prism/parser.h index c5f8ab9df4..048955409b 100644 --- a/prism/parser.h +++ b/prism/parser.h @@ -860,6 +860,14 @@ struct pm_parser { /** Whether or not we're currently recovering from a syntax error. 
*/ bool recovering; + /** + * This is very specialized behavior for when you want to parse in a context + * that does not respect encoding comments. Its main use case is translating + * into the whitequark/parser AST which re-encodes source files in UTF-8 + * before they are parsed and ignores encoding comments. + */ + bool encoding_locked; + /** * Whether or not the encoding has been changed by a magic comment. We use * this to provide a fast path for the lexer instead of going through the diff --git a/prism/prism.c b/prism/prism.c index 197bebc97b..bba357a5b1 100644 --- a/prism/prism.c +++ b/prism/prism.c @@ -8261,7 +8261,7 @@ parser_lex_magic_comment(pm_parser_t *parser, bool semantic_token_seen) { // We only want to attempt to compare against encoding comments if it's // the first line in the file (or the second in the case of a shebang). - if (parser->current.start == parser->encoding_comment_start) { + if (parser->current.start == parser->encoding_comment_start && !parser->encoding_locked) { if ( (key_length == 8 && pm_strncasecmp(key_source, (const uint8_t *) "encoding", 8) == 0) || (key_length == 6 && pm_strncasecmp(key_source, (const uint8_t *) "coding", 6) == 0) @@ -10438,7 +10438,9 @@ parser_lex(pm_parser_t *parser) { // pass and we're at the start of the file, then we need // to do another pass to potentially find other patterns // for encoding comments. 
- if (length >= 10) parser_lex_magic_comment_encoding(parser); + if (length >= 10 && !parser->encoding_locked) { + parser_lex_magic_comment_encoding(parser); + } } lexed_comment = true; @@ -21244,6 +21246,7 @@ pm_parser_init(pm_parser_t *parser, const uint8_t *source, size_t size, const pm .parsing_eval = false, .command_start = true, .recovering = false, + .encoding_locked = false, .encoding_changed = false, .pattern_matching_newlines = false, .in_keyword_arg = false, @@ -21291,6 +21294,9 @@ pm_parser_init(pm_parser_t *parser, const uint8_t *source, size_t size, const pm parser_lex_magic_comment_encoding_value(parser, encoding_source, encoding_source + encoding_length); } + // encoding_locked option + parser->encoding_locked = options->encoding_locked; + // frozen_string_literal option parser->frozen_string_literal = options->frozen_string_literal; diff --git a/test/prism/ruby/parser_test.rb b/test/prism/ruby/parser_test.rb index 65535af0fd..a5cf919ae5 100644 --- a/test/prism/ruby/parser_test.rb +++ b/test/prism/ruby/parser_test.rb @@ -60,7 +60,6 @@ module Prism # skip them for now. skip_all = skip_incorrect | [ "regex.txt", - "regex_char_width.txt", "unescaping.txt", "seattlerb/bug190.txt", "seattlerb/heredoc_with_extra_carriage_returns_windows.txt",