diff --git a/prism/config.yml b/prism/config.yml index 34086cd8cf..08d4f78c47 100644 --- a/prism/config.yml +++ b/prism/config.yml @@ -1,5 +1,4 @@ errors: - - CANNOT_PARSE_EXPRESSION - ALIAS_ARGUMENT - AMPAMPEQ_MULTI_ASSIGN - ARGUMENT_AFTER_BLOCK @@ -34,6 +33,7 @@ errors: - BLOCK_PARAM_PIPE_TERM - BLOCK_TERM_BRACE - BLOCK_TERM_END + - CANNOT_PARSE_EXPRESSION - CANNOT_PARSE_STRING_PART - CASE_EXPRESSION_AFTER_CASE - CASE_EXPRESSION_AFTER_WHEN @@ -82,13 +82,13 @@ errors: - EXPECT_ARGUMENT - EXPECT_EOL_AFTER_STATEMENT - EXPECT_EXPRESSION_AFTER_AMPAMPEQ - - EXPECT_EXPRESSION_AFTER_PIPEPIPEEQ - EXPECT_EXPRESSION_AFTER_COMMA - EXPECT_EXPRESSION_AFTER_EQUAL - EXPECT_EXPRESSION_AFTER_LESS_LESS - EXPECT_EXPRESSION_AFTER_LPAREN - - EXPECT_EXPRESSION_AFTER_QUESTION - EXPECT_EXPRESSION_AFTER_OPERATOR + - EXPECT_EXPRESSION_AFTER_PIPEPIPEEQ + - EXPECT_EXPRESSION_AFTER_QUESTION - EXPECT_EXPRESSION_AFTER_SPLAT - EXPECT_EXPRESSION_AFTER_SPLAT_HASH - EXPECT_EXPRESSION_AFTER_STAR @@ -113,23 +113,25 @@ errors: - HASH_VALUE - HEREDOC_TERM - INCOMPLETE_QUESTION_MARK - - INCOMPLETE_VARIABLE_CLASS_3_3_0 - INCOMPLETE_VARIABLE_CLASS - - INCOMPLETE_VARIABLE_INSTANCE_3_3_0 + - INCOMPLETE_VARIABLE_CLASS_3_3_0 - INCOMPLETE_VARIABLE_INSTANCE + - INCOMPLETE_VARIABLE_INSTANCE_3_3_0 + - INVALID_CHARACTER - INVALID_ENCODING_MAGIC_COMMENT - INVALID_FLOAT_EXPONENT + - INVALID_MULTIBYTE_CHAR + - INVALID_MULTIBYTE_CHARACTER + - INVALID_MULTIBYTE_ESCAPE - INVALID_NUMBER_BINARY - INVALID_NUMBER_DECIMAL - INVALID_NUMBER_HEXADECIMAL - INVALID_NUMBER_OCTAL - INVALID_NUMBER_UNDERSCORE - - INVALID_CHARACTER - - INVALID_MULTIBYTE_CHARACTER - - INVALID_PRINTABLE_CHARACTER - INVALID_PERCENT - - INVALID_VARIABLE_GLOBAL_3_3_0 + - INVALID_PRINTABLE_CHARACTER - INVALID_VARIABLE_GLOBAL + - INVALID_VARIABLE_GLOBAL_3_3_0 - IT_NOT_ALLOWED_NUMBERED - IT_NOT_ALLOWED_ORDINARY - LAMBDA_OPEN @@ -150,8 +152,8 @@ errors: - MODULE_TERM - MULTI_ASSIGN_MULTI_SPLATS - MULTI_ASSIGN_UNEXPECTED_REST - - NOT_EXPRESSION - 
NO_LOCAL_VARIABLE + - NOT_EXPRESSION - NUMBER_LITERAL_UNDERSCORE - NUMBERED_PARAMETER_IT - NUMBERED_PARAMETER_ORDINARY @@ -173,8 +175,8 @@ errors: - PARAMETER_UNEXPECTED_FWD - PARAMETER_WILD_LOOSE_COMMA - PATTERN_EXPRESSION_AFTER_BRACKET - - PATTERN_EXPRESSION_AFTER_HROCKET - PATTERN_EXPRESSION_AFTER_COMMA + - PATTERN_EXPRESSION_AFTER_HROCKET - PATTERN_EXPRESSION_AFTER_IN - PATTERN_EXPRESSION_AFTER_KEY - PATTERN_EXPRESSION_AFTER_PAREN @@ -191,7 +193,12 @@ errors: - PATTERN_TERM_BRACKET - PATTERN_TERM_PAREN - PIPEPIPEEQ_MULTI_ASSIGN + - REGEXP_ENCODING_OPTION_MISMATCH + - REGEXP_INCOMPAT_CHAR_ENCODING + - REGEXP_INVALID_UNICODE_RANGE + - REGEXP_NON_ESCAPED_MBC - REGEXP_TERM + - REGEXP_UTF8_CHAR_NON_UTF8_REGEXP - RESCUE_EXPRESSION - RESCUE_MODIFIER_VALUE - RESCUE_TERM @@ -213,9 +220,9 @@ errors: - TERNARY_EXPRESSION_FALSE - TERNARY_EXPRESSION_TRUE - UNARY_RECEIVER + - UNDEF_ARGUMENT - UNEXPECTED_TOKEN_CLOSE_CONTEXT - UNEXPECTED_TOKEN_IGNORE - - UNDEF_ARGUMENT - UNTIL_TERM - VOID_EXPRESSION - WHILE_TERM diff --git a/prism/encoding.h b/prism/encoding.h index a1af1298e0..0850e291d8 100644 --- a/prism/encoding.h +++ b/prism/encoding.h @@ -252,6 +252,18 @@ extern const pm_encoding_t pm_encodings[PM_ENCODING_MAXIMUM]; */ #define PM_ENCODING_ASCII_8BIT_ENTRY (&pm_encodings[PM_ENCODING_ASCII_8BIT]) +/** + * This is the EUC-JP encoding. We need a reference to it to quickly process + * regular expression modifiers. + */ +#define PM_ENCODING_EUC_JP_ENTRY (&pm_encodings[PM_ENCODING_EUC_JP]) + +/** + * This is the Windows-31J encoding. We need a reference to it to quickly + * process regular expression modifiers. + */ +#define PM_ENCODING_WINDOWS_31J_ENTRY (&pm_encodings[PM_ENCODING_WINDOWS_31J]) + /** * Parse the given name of an encoding and return a pointer to the corresponding * encoding struct if one can be found, otherwise return NULL. 
diff --git a/prism/prism.c b/prism/prism.c index 2b1d35588d..d263afab9c 100644 --- a/prism/prism.c +++ b/prism/prism.c @@ -5951,6 +5951,61 @@ parse_symbol_encoding(const pm_parser_t *parser, const pm_string_t *contents) { return 0; } +static inline pm_node_flags_t +parse_and_validate_regular_expression_encoding_modifier(pm_parser_t *parser, const pm_string_t *source, const pm_string_t *contents, pm_node_flags_t flags, char modifier, const pm_encoding_t *modifier_encoding) { + assert ((modifier == 'n' && modifier_encoding == PM_ENCODING_ASCII_8BIT_ENTRY) || + (modifier == 'u' && modifier_encoding == PM_ENCODING_UTF_8_ENTRY) || + (modifier == 'e' && modifier_encoding == PM_ENCODING_EUC_JP_ENTRY) || + (modifier == 's' && modifier_encoding == PM_ENCODING_WINDOWS_31J_ENTRY)); + + // There's special validation logic used if a string does not contain any character escape sequences. + if (parser->explicit_encoding == NULL) { + // If an ASCII-only string without character escapes is used with an encoding modifier, then the resulting Regexp + // has the modifier encoding, unless the ASCII-8BIT modifier is used, in which case the Regexp "downgrades" to + // the US-ASCII encoding. + bool ascii_only = pm_ascii_only_p(contents); + if (ascii_only) { + return modifier == 'n' ? 
PM_REGULAR_EXPRESSION_FLAGS_FORCED_US_ASCII_ENCODING : flags; + } + + if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY) { + if (!ascii_only) { + PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_INVALID_MULTIBYTE_CHAR, parser->encoding->name); + } + } else if (parser->encoding != modifier_encoding) { + PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_REGEXP_ENCODING_OPTION_MISMATCH, modifier, parser->encoding->name); + + if (modifier == 'n' && !ascii_only) { + PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_REGEXP_NON_ESCAPED_MBC, pm_string_source(source)); + } + } + + return flags; + } + + // TODO (nirvdrum 21-Feb-2024): To validate regexp sources with character escape sequences we need to know whether hex or Unicode escape sequences were used and Prism doesn't currently provide that data. We handle a subset of unambiguous cases in the meantime. + bool mixed_encoding = false; + + if (mixed_encoding) { + PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_INVALID_MULTIBYTE_ESCAPE, pm_string_source(source)); + } else if (modifier != 'n' && parser->explicit_encoding == PM_ENCODING_ASCII_8BIT_ENTRY) { + // TODO (nirvdrum 21-Feb-2024): Validate the content is valid in the modifier encoding. Do this on-demand so we don't pay the cost of computation unnecessarily. + bool valid_string_in_modifier_encoding = true; + + if (!valid_string_in_modifier_encoding) { + PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_INVALID_MULTIBYTE_ESCAPE, pm_string_source(source)); + } + } else if (modifier != 'u' && parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY) { + // TODO (nirvdrum 21-Feb-2024): There's currently no way to tell if the source used hex or Unicode character escapes from `explicit_encoding` alone. If the source encoding was already UTF-8, both character escape types would set `explicit_encoding` to UTF-8, but need to be processed differently. Skip for now. 
+ if (parser->encoding != PM_ENCODING_UTF_8_ENTRY) { + PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_REGEXP_INCOMPAT_CHAR_ENCODING, pm_string_source(source)); + } + } + + // The encoding implied by the modifier applies as-is, so the flags are returned unchanged and there is no need to force the encoding to anything else. + return flags; +} + /** * Ruby "downgrades" the encoding of Regexps to US-ASCII if the associated encoding is ASCII-compatible and * the unescaped representation of a Regexp source consists only of US-ASCII code points. This is true even @@ -5958,9 +6013,50 @@ parse_symbol_encoding(const pm_parser_t *parser, const pm_string_t *contents) { * may be explicitly set with an escape sequence. */ static inline pm_node_flags_t -parse_regular_expression_encoding(const pm_parser_t *parser, const pm_string_t *contents) { - // Ruby stipulates that all source files must use an ASCII-compatible encoding. Thus, all regular expressions - // appearing in source are eligible for "downgrading" to US-ASCII. +parse_and_validate_regular_expression_encoding(pm_parser_t *parser, const pm_string_t *source, const pm_string_t *contents, pm_node_flags_t flags) { + // TODO (nirvdrum 22-Feb-2024): CRuby reports a special Regexp-specific error for invalid Unicode ranges. We either need to scan again or modify the "invalid Unicode escape sequence" message we already report. + bool valid_unicode_range = true; + if (parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY && !valid_unicode_range) { + PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_REGEXP_INVALID_UNICODE_RANGE, pm_string_source(source)); + + return flags; + } + + // US-ASCII strings do not admit multi-byte character literals. However, character escape sequences corresponding + // to multi-byte characters are allowed. 
+ if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY && parser->explicit_encoding == NULL && !pm_ascii_only_p(contents)) { + // CRuby will continue processing even though a SyntaxError has already been detected. 
It may result in the + // following error message appearing twice. We do the same for compatibility. + PM_PARSER_ERR_TOKEN_FORMAT(parser, parser->current, PM_ERR_INVALID_MULTIBYTE_CHAR, parser->encoding->name); + } + + /** + * Start checking modifier flags. We need to process these before considering any explicit encodings that may have + * been set by character literals. The order in which the encoding modifiers are checked does not matter. In the + * event that both an encoding modifier and an explicit encoding would result in the same encoding we do not set + * the corresponding "forced_" flag. Instead, the caller should check the encoding modifier flag and + * determine the encoding that way. + */ + + if (flags & PM_REGULAR_EXPRESSION_FLAGS_ASCII_8BIT) { + return parse_and_validate_regular_expression_encoding_modifier(parser, source, contents, flags, 'n', PM_ENCODING_ASCII_8BIT_ENTRY); + } + + if (flags & PM_REGULAR_EXPRESSION_FLAGS_UTF_8) { + return parse_and_validate_regular_expression_encoding_modifier(parser, source, contents, flags, 'u', PM_ENCODING_UTF_8_ENTRY); + } + + if (flags & PM_REGULAR_EXPRESSION_FLAGS_EUC_JP) { + return parse_and_validate_regular_expression_encoding_modifier(parser, source, contents, flags, 'e', PM_ENCODING_EUC_JP_ENTRY); + } + + if (flags & PM_REGULAR_EXPRESSION_FLAGS_WINDOWS_31J) { + return parse_and_validate_regular_expression_encoding_modifier(parser, source, contents, flags, 's', PM_ENCODING_WINDOWS_31J_ENTRY); + } + + // At this point no encoding modifiers will be present on the regular expression as they would have already + // been processed. Ruby stipulates that all source files must use an ASCII-compatible encoding. Thus, all + // regular expressions without an encoding modifier appearing in source are eligible for "downgrading" to US-ASCII. 
if (pm_ascii_only_p(contents)) { return PM_REGULAR_EXPRESSION_FLAGS_FORCED_US_ASCII_ENCODING; } @@ -5976,6 +6072,7 @@ parse_regular_expression_encoding(const pm_parser_t *parser, const pm_string_t * return PM_REGULAR_EXPRESSION_FLAGS_FORCED_BINARY_ENCODING; } } + return 0; } @@ -17030,7 +17127,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b // more easily compiled. if (accept1(parser, PM_TOKEN_REGEXP_END)) { pm_node_t *regular_expression_node = (pm_node_t *) pm_regular_expression_node_create_unescaped(parser, &opening, &content, &parser->previous, &source); - pm_node_flag_set(regular_expression_node, parse_regular_expression_encoding(parser, &unescaped)); + pm_node_flag_set(regular_expression_node, parse_and_validate_regular_expression_encoding(parser, &source, &unescaped, regular_expression_node->flags)); return regular_expression_node; } diff --git a/prism/templates/src/diagnostic.c.erb b/prism/templates/src/diagnostic.c.erb index c8c35bd49b..06681dda91 100644 --- a/prism/templates/src/diagnostic.c.erb +++ b/prism/templates/src/diagnostic.c.erb @@ -204,7 +204,9 @@ static const pm_diagnostic_data_t diagnostic_messages[PM_DIAGNOSTIC_ID_MAX] = { [PM_ERR_INVALID_NUMBER_OCTAL] = { "invalid octal number", PM_ERROR_LEVEL_FATAL }, [PM_ERR_INVALID_NUMBER_UNDERSCORE] = { "invalid underscore placement in number", PM_ERROR_LEVEL_FATAL }, [PM_ERR_INVALID_CHARACTER] = { "invalid character 0x%X", PM_ERROR_LEVEL_FATAL }, + [PM_ERR_INVALID_MULTIBYTE_CHAR] = { "invalid multibyte char (%s)", PM_ERROR_LEVEL_FATAL }, [PM_ERR_INVALID_MULTIBYTE_CHARACTER] = { "invalid multibyte character 0x%X", PM_ERROR_LEVEL_FATAL }, + [PM_ERR_INVALID_MULTIBYTE_ESCAPE] = { "invalid multibyte escape: /%s/", PM_ERROR_LEVEL_FATAL }, [PM_ERR_INVALID_PRINTABLE_CHARACTER] = { "invalid character `%c`", PM_ERROR_LEVEL_FATAL }, [PM_ERR_INVALID_PERCENT] = { "invalid `%` token", PM_ERROR_LEVEL_FATAL }, // TODO WHAT? 
[PM_ERR_INVALID_VARIABLE_GLOBAL_3_3_0] = { "`%.*s' is not allowed as a global variable name", PM_ERROR_LEVEL_FATAL }, @@ -270,7 +272,12 @@ static const pm_diagnostic_data_t diagnostic_messages[PM_DIAGNOSTIC_ID_MAX] = { [PM_ERR_PATTERN_TERM_BRACKET] = { "expected a `]` to close the pattern expression", PM_ERROR_LEVEL_FATAL }, [PM_ERR_PATTERN_TERM_PAREN] = { "expected a `)` to close the pattern expression", PM_ERROR_LEVEL_FATAL }, [PM_ERR_PIPEPIPEEQ_MULTI_ASSIGN] = { "unexpected `||=` in a multiple assignment", PM_ERROR_LEVEL_FATAL }, + [PM_ERR_REGEXP_ENCODING_OPTION_MISMATCH] = { "regexp encoding option '%c' differs from source encoding '%s'", PM_ERROR_LEVEL_FATAL }, + [PM_ERR_REGEXP_INCOMPAT_CHAR_ENCODING] = { "incompatible character encoding: /%s/", PM_ERROR_LEVEL_FATAL }, + [PM_ERR_REGEXP_NON_ESCAPED_MBC] = { "/.../n has a non escaped non ASCII character in non ASCII-8BIT script: /%s/", PM_ERROR_LEVEL_FATAL }, + [PM_ERR_REGEXP_INVALID_UNICODE_RANGE] = { "invalid Unicode range: /%s/", PM_ERROR_LEVEL_FATAL }, [PM_ERR_REGEXP_TERM] = { "expected a closing delimiter for the regular expression", PM_ERROR_LEVEL_FATAL }, + [PM_ERR_REGEXP_UTF8_CHAR_NON_UTF8_REGEXP] = { "UTF-8 character in non UTF-8 regexp: /%s/", PM_ERROR_LEVEL_FATAL }, [PM_ERR_RESCUE_EXPRESSION] = { "expected a rescued expression", PM_ERROR_LEVEL_FATAL }, [PM_ERR_RESCUE_MODIFIER_VALUE] = { "expected a value after the `rescue` modifier", PM_ERROR_LEVEL_FATAL }, [PM_ERR_RESCUE_TERM] = { "expected a closing delimiter for the `rescue` clause", PM_ERROR_LEVEL_FATAL }, diff --git a/test/prism/encoding_test.rb b/test/prism/encoding_test.rb index 762dcde717..0d65200b90 100644 --- a/test/prism/encoding_test.rb +++ b/test/prism/encoding_test.rb @@ -181,6 +181,18 @@ module Prism end end + encoding_modifiers = { ascii_8bit: "n", utf_8: "u", euc_jp: "e", windows_31j: "s" } + regexp_sources = ["abc", "garçon", "\\x80", "gar\\xC3\\xA7on", "gar\\u{E7}on", "abc\\u{FFFFFF}", "\\x80\\u{80}" ] + + 
encoding_modifiers.each_value do |modifier| + encodings.each_key do |encoding| + define_method(:"test_regular_expression_encoding_modifiers_/#{modifier}_#{encoding.name}") do + regexps = regexp_sources.map { |regexp_source| "/#{regexp_source}/#{modifier}" } + assert_regular_expression_encoding_flags(encoding, regexps) + end + end + end + def test_coding result = Prism.parse("# coding: utf-8\n'string'") actual = result.value.statements.body.first.unescaped.encoding @@ -470,16 +482,25 @@ module Prism def assert_regular_expression_encoding_flags(encoding, regexps) regexps.each do |regexp| + regexp_modifier_used = regexp.end_with?("/u") || regexp.end_with?("/e") || regexp.end_with?("/s") || regexp.end_with?("/n") source = "# encoding: #{encoding.name}\n#{regexp}" + encoding_errors = ["invalid multibyte char", "escaped non ASCII character in UTF-8 regexp", "differs from source encoding"] + skipped_errors = ["invalid multibyte escape", "incompatible character encoding", "UTF-8 character in non UTF-8 regexp", "invalid Unicode range", "invalid Unicode list"] + + # TODO (nirvdrum 21-Feb-2024): Prism currently does not handle Regexp validation unless modifiers are used. So, skip processing those errors for now: https://github.com/ruby/prism/issues/2104 + unless regexp_modifier_used + skipped_errors += encoding_errors + encoding_errors.clear + end + expected = begin eval(source).encoding rescue SyntaxError => error - if error.message.include?("UTF-8 character in non UTF-8 regexp") || error.message.include?("escaped non ASCII character in UTF-8 regexp") - error.message[/: (.+?)\n/, 1] - elsif error.message.include?("invalid multibyte char") - # TODO (nirvdrum 26-Jan-2024): Bail out early of the rest of the test due to https://github.com/ruby/prism/issues/2104. 
+ if encoding_errors.find { |e| error.message.include?(e) } + messages = error.message.split("\n").map { |m| m[/: (.+?)$/, 1] } + elsif skipped_errors.find { |e| error.message.include?(e) } next else raise @@ -491,24 +512,65 @@ module Prism if result.success? regexp = result.value.statements.body.first - if regexp.forced_utf8_encoding? + actual_encoding = if regexp.forced_utf8_encoding? Encoding::UTF_8 elsif regexp.forced_binary_encoding? Encoding::ASCII_8BIT elsif regexp.forced_us_ascii_encoding? Encoding::US_ASCII + elsif regexp.ascii_8bit? + Encoding::ASCII_8BIT + elsif regexp.utf_8? + Encoding::UTF_8 + elsif regexp.euc_jp? + Encoding::EUC_JP + elsif regexp.windows_31j? + Encoding::Windows_31J else encoding end - else - error = result.errors.last - unless error.message.include?("UTF-8 mixed within") - raise error.message + if regexp.utf_8? && actual_encoding != Encoding::UTF_8 + raise "expected regexp encoding to be UTF-8 due to '/u' modifier, but got #{actual_encoding.name}" + elsif regexp.ascii_8bit? && (actual_encoding != Encoding::ASCII_8BIT && actual_encoding != Encoding::US_ASCII) + raise "expected regexp encoding to be ASCII-8BIT or US-ASCII due to '/n' modifier, but got #{actual_encoding.name}" + elsif regexp.euc_jp? && actual_encoding != Encoding::EUC_JP + raise "expected regexp encoding to be EUC-JP due to '/e' modifier, but got #{actual_encoding.name}" + elsif regexp.windows_31j? && actual_encoding != Encoding::Windows_31J + raise "expected regexp encoding to be Windows-31J due to '/s' modifier, but got #{actual_encoding.name}" + end + + if regexp.utf_8? && regexp.forced_utf8_encoding? + raise "the forced_utf8 flag should not be set when the UTF-8 modifier (/u) is used" + elsif regexp.ascii_8bit? && regexp.forced_binary_encoding? 
+ raise "the forced_ascii_8bit flag should not be set when the UTF-8 modifier (/u) is used" + end + + actual_encoding + else + errors = result.errors.map(&:message) + + if errors.last&.include?("UTF-8 mixed within") + nil + else + errors end end end + # TODO (nirvdrum 22-Feb-2024): Remove this workaround once Prism better maps CRuby's error messages. + # This class of error message is tricky. The part not being compared is a representation of the regexp. + # Depending on the source encoding and any encoding modifiers being used, CRuby alters how the regexp is represented. + # Sometimes it's an MBC string. Other times it uses hexadecimal character escapes. And in other cases it uses + # the long-form Unicode escape sequences. This short-circuit checks that the error message is mostly correct. + if expected.is_a?(Array) && actual.is_a?(Array) + if expected.last.start_with?("/.../n has a non escaped non ASCII character in non ASCII-8BIT script:") && + actual.last.start_with?("/.../n has a non escaped non ASCII character in non ASCII-8BIT script:") + expected.last.clear + actual.last.clear + end + end + assert_equal expected, actual end end