[ruby/prism] Provide ability to lock encoding while parsing

https://github.com/ruby/prism/commit/f7faedfb3f
This commit is contained in:
Kevin Newton 2024-06-10 13:39:16 -04:00
parent 0d5eea9957
commit d827d32527
8 changed files with 54 additions and 7 deletions

View File

@ -431,6 +431,9 @@ module Prism
template << "C" template << "C"
values << { nil => 0, "3.3.0" => 1, "3.3.1" => 1, "3.4.0" => 0, "latest" => 0 }.fetch(options[:version]) values << { nil => 0, "3.3.0" => 1, "3.3.1" => 1, "3.4.0" => 0, "latest" => 0 }.fetch(options[:version])
template << "C"
values << (options[:encoding] == false ? 1 : 0)
template << "L" template << "L"
if (scopes = options[:scopes]) if (scopes = options[:scopes])
values << scopes.length values << scopes.length

View File

@ -51,7 +51,7 @@ module Prism
source = source_buffer.source source = source_buffer.source
offset_cache = build_offset_cache(source) offset_cache = build_offset_cache(source)
result = unwrap(Prism.parse(source, filepath: source_buffer.name, version: convert_for_prism(version), scopes: [[]]), offset_cache) result = unwrap(Prism.parse(source, filepath: source_buffer.name, version: convert_for_prism(version), scopes: [[]], encoding: false), offset_cache)
build_ast(result.value, offset_cache) build_ast(result.value, offset_cache)
ensure ensure
@ -64,7 +64,7 @@ module Prism
source = source_buffer.source source = source_buffer.source
offset_cache = build_offset_cache(source) offset_cache = build_offset_cache(source)
result = unwrap(Prism.parse(source, filepath: source_buffer.name, version: convert_for_prism(version), scopes: [[]]), offset_cache) result = unwrap(Prism.parse(source, filepath: source_buffer.name, version: convert_for_prism(version), scopes: [[]], encoding: false), offset_cache)
[ [
build_ast(result.value, offset_cache), build_ast(result.value, offset_cache),
@ -83,7 +83,7 @@ module Prism
offset_cache = build_offset_cache(source) offset_cache = build_offset_cache(source)
result = result =
begin begin
unwrap(Prism.parse_lex(source, filepath: source_buffer.name, version: convert_for_prism(version), scopes: [[]]), offset_cache) unwrap(Prism.parse_lex(source, filepath: source_buffer.name, version: convert_for_prism(version), scopes: [[]], encoding: false), offset_cache)
rescue ::Parser::SyntaxError rescue ::Parser::SyntaxError
raise if !recover raise if !recover
end end

View File

@ -138,7 +138,13 @@ build_options_i(VALUE key, VALUE value, VALUE argument) {
if (key_id == rb_id_option_filepath) { if (key_id == rb_id_option_filepath) {
if (!NIL_P(value)) pm_options_filepath_set(options, check_string(value)); if (!NIL_P(value)) pm_options_filepath_set(options, check_string(value));
} else if (key_id == rb_id_option_encoding) { } else if (key_id == rb_id_option_encoding) {
if (!NIL_P(value)) pm_options_encoding_set(options, rb_enc_name(rb_to_encoding(value))); if (!NIL_P(value)) {
if (value == Qfalse) {
pm_options_encoding_locked_set(options, true);
} else {
pm_options_encoding_set(options, rb_enc_name(rb_to_encoding(value)));
}
}
} else if (key_id == rb_id_option_line) { } else if (key_id == rb_id_option_line) {
if (!NIL_P(value)) pm_options_line_set(options, NUM2INT(value)); if (!NIL_P(value)) pm_options_line_set(options, NUM2INT(value));
} else if (key_id == rb_id_option_frozen_string_literal) { } else if (key_id == rb_id_option_frozen_string_literal) {
@ -206,6 +212,7 @@ build_options(VALUE argument) {
static void static void
extract_options(pm_options_t *options, VALUE filepath, VALUE keywords) { extract_options(pm_options_t *options, VALUE filepath, VALUE keywords) {
options->line = 1; // default options->line = 1; // default
if (!NIL_P(keywords)) { if (!NIL_P(keywords)) {
struct build_options_data data = { .options = options, .keywords = keywords }; struct build_options_data data = { .options = options, .keywords = keywords };
struct build_options_data *argument = &data; struct build_options_data *argument = &data;

View File

@ -16,6 +16,14 @@ pm_options_encoding_set(pm_options_t *options, const char *encoding) {
pm_string_constant_init(&options->encoding, encoding, strlen(encoding)); pm_string_constant_init(&options->encoding, encoding, strlen(encoding));
} }
/**
 * Record on the given options struct whether the encoding should be locked.
 * A locked encoding means the parser skips encoding magic comments instead of
 * switching encodings when it encounters one.
 */
PRISM_EXPORTED_FUNCTION void
pm_options_encoding_locked_set(pm_options_t *options, bool encoding_locked) {
    options->encoding_locked = encoding_locked;
}
/** /**
* Set the line option on the given options struct. * Set the line option on the given options struct.
*/ */
@ -215,6 +223,7 @@ pm_options_read(pm_options_t *options, const char *data) {
options->frozen_string_literal = (int8_t) *data++; options->frozen_string_literal = (int8_t) *data++;
options->command_line = (uint8_t) *data++; options->command_line = (uint8_t) *data++;
options->version = (pm_options_version_t) *data++; options->version = (pm_options_version_t) *data++;
options->encoding_locked = ((uint8_t) *data++) > 0;
uint32_t scopes_count = pm_options_read_u32(data); uint32_t scopes_count = pm_options_read_u32(data);
data += 4; data += 4;

View File

@ -103,6 +103,13 @@ typedef struct {
* - PM_OPTIONS_FROZEN_STRING_LITERAL_UNSET * - PM_OPTIONS_FROZEN_STRING_LITERAL_UNSET
*/ */
int8_t frozen_string_literal; int8_t frozen_string_literal;
/**
* Whether or not the encoding is locked. When this is true, encoding magic
* comments are ignored. This is a niche use-case where you want to parse a
* file with a specific encoding but ignore any encoding magic comments at
* the top of the file.
*/
bool encoding_locked;
} pm_options_t; } pm_options_t;
/** /**
@ -166,6 +173,14 @@ PRISM_EXPORTED_FUNCTION void pm_options_line_set(pm_options_t *options, int32_t
*/ */
PRISM_EXPORTED_FUNCTION void pm_options_encoding_set(pm_options_t *options, const char *encoding); PRISM_EXPORTED_FUNCTION void pm_options_encoding_set(pm_options_t *options, const char *encoding);
/**
 * Set the encoding_locked option on the given options struct. When the option
 * is true, encoding magic comments in the source are ignored while parsing.
 *
 * @param options The options struct to set the encoding_locked value on.
 * @param encoding_locked The encoding_locked value to set: true to ignore
 *     encoding magic comments, false to respect them.
 */
PRISM_EXPORTED_FUNCTION void pm_options_encoding_locked_set(pm_options_t *options, bool encoding_locked);
/** /**
* Set the frozen string literal option on the given options struct. * Set the frozen string literal option on the given options struct.
* *

View File

@ -860,6 +860,14 @@ struct pm_parser {
/** Whether or not we're currently recovering from a syntax error. */ /** Whether or not we're currently recovering from a syntax error. */
bool recovering; bool recovering;
/**
* This is very specialized behavior for when you want to parse in a context
* that does not respect encoding comments. Its main use case is translating
* into the whitequark/parser AST which re-encodes source files in UTF-8
* before they are parsed and ignores encoding comments.
*/
bool encoding_locked;
/** /**
* Whether or not the encoding has been changed by a magic comment. We use * Whether or not the encoding has been changed by a magic comment. We use
* this to provide a fast path for the lexer instead of going through the * this to provide a fast path for the lexer instead of going through the

View File

@ -8261,7 +8261,7 @@ parser_lex_magic_comment(pm_parser_t *parser, bool semantic_token_seen) {
// We only want to attempt to compare against encoding comments if it's // We only want to attempt to compare against encoding comments if it's
// the first line in the file (or the second in the case of a shebang). // the first line in the file (or the second in the case of a shebang).
if (parser->current.start == parser->encoding_comment_start) { if (parser->current.start == parser->encoding_comment_start && !parser->encoding_locked) {
if ( if (
(key_length == 8 && pm_strncasecmp(key_source, (const uint8_t *) "encoding", 8) == 0) || (key_length == 8 && pm_strncasecmp(key_source, (const uint8_t *) "encoding", 8) == 0) ||
(key_length == 6 && pm_strncasecmp(key_source, (const uint8_t *) "coding", 6) == 0) (key_length == 6 && pm_strncasecmp(key_source, (const uint8_t *) "coding", 6) == 0)
@ -10438,7 +10438,9 @@ parser_lex(pm_parser_t *parser) {
// pass and we're at the start of the file, then we need // pass and we're at the start of the file, then we need
// to do another pass to potentially find other patterns // to do another pass to potentially find other patterns
// for encoding comments. // for encoding comments.
if (length >= 10) parser_lex_magic_comment_encoding(parser); if (length >= 10 && !parser->encoding_locked) {
parser_lex_magic_comment_encoding(parser);
}
} }
lexed_comment = true; lexed_comment = true;
@ -21244,6 +21246,7 @@ pm_parser_init(pm_parser_t *parser, const uint8_t *source, size_t size, const pm
.parsing_eval = false, .parsing_eval = false,
.command_start = true, .command_start = true,
.recovering = false, .recovering = false,
.encoding_locked = false,
.encoding_changed = false, .encoding_changed = false,
.pattern_matching_newlines = false, .pattern_matching_newlines = false,
.in_keyword_arg = false, .in_keyword_arg = false,
@ -21291,6 +21294,9 @@ pm_parser_init(pm_parser_t *parser, const uint8_t *source, size_t size, const pm
parser_lex_magic_comment_encoding_value(parser, encoding_source, encoding_source + encoding_length); parser_lex_magic_comment_encoding_value(parser, encoding_source, encoding_source + encoding_length);
} }
// encoding_locked option
parser->encoding_locked = options->encoding_locked;
// frozen_string_literal option // frozen_string_literal option
parser->frozen_string_literal = options->frozen_string_literal; parser->frozen_string_literal = options->frozen_string_literal;

View File

@ -60,7 +60,6 @@ module Prism
# skip them for now. # skip them for now.
skip_all = skip_incorrect | [ skip_all = skip_incorrect | [
"regex.txt", "regex.txt",
"regex_char_width.txt",
"unescaping.txt", "unescaping.txt",
"seattlerb/bug190.txt", "seattlerb/bug190.txt",
"seattlerb/heredoc_with_extra_carriage_returns_windows.txt", "seattlerb/heredoc_with_extra_carriage_returns_windows.txt",