[ruby/prism] Fix binary encoding for the parser translator
Skipping encoding detection is almost always right; only for binary sources should it actually happen. A symbol containing escapes that are invalid in UTF-8 would fail to parse, since symbols must be valid in the script encoding. Additionally, the parser gem would raise an exception somewhere during string handling. https://github.com/ruby/prism/commit/fa0154d9e4
This commit is contained in:
parent
8e56d9e415
commit
723f31cf6b
@ -51,7 +51,7 @@ module Prism
|
||||
source = source_buffer.source
|
||||
|
||||
offset_cache = build_offset_cache(source)
|
||||
result = unwrap(Prism.parse(source, filepath: source_buffer.name, version: convert_for_prism(version), partial_script: true, encoding: false), offset_cache)
|
||||
result = unwrap(Prism.parse(source, **prism_options), offset_cache)
|
||||
|
||||
build_ast(result.value, offset_cache)
|
||||
ensure
|
||||
@ -64,7 +64,7 @@ module Prism
|
||||
source = source_buffer.source
|
||||
|
||||
offset_cache = build_offset_cache(source)
|
||||
result = unwrap(Prism.parse(source, filepath: source_buffer.name, version: convert_for_prism(version), partial_script: true, encoding: false), offset_cache)
|
||||
result = unwrap(Prism.parse(source, **prism_options), offset_cache)
|
||||
|
||||
[
|
||||
build_ast(result.value, offset_cache),
|
||||
@ -83,7 +83,7 @@ module Prism
|
||||
offset_cache = build_offset_cache(source)
|
||||
result =
|
||||
begin
|
||||
unwrap(Prism.parse_lex(source, filepath: source_buffer.name, version: convert_for_prism(version), partial_script: true, encoding: false), offset_cache)
|
||||
unwrap(Prism.parse_lex(source, **prism_options), offset_cache)
|
||||
rescue ::Parser::SyntaxError
|
||||
raise if !recover
|
||||
end
|
||||
@ -285,6 +285,20 @@ module Prism
|
||||
)
|
||||
end
|
||||
|
||||
# Builds the keyword options handed to Prism.parse / Prism.parse_lex.
#
# Returns a Hash with :filepath, :version and :partial_script. For every
# source that is not binary-encoded, :encoding is additionally set to false;
# for binary sources the :encoding key is left out.
def prism_options
  base = {
    filepath: @source_buffer.name,
    version: convert_for_prism(version),
    partial_script: true,
  }

  # The parser gem always encodes to UTF-8, unless it is binary.
  # https://github.com/whitequark/parser/blob/v3.3.6.0/lib/parser/source/buffer.rb#L80-L107
  return base if @source_buffer.source.encoding == Encoding::BINARY

  base.merge(encoding: false)
end
|
||||
|
||||
# Converts the version format handled by Parser to the format handled by Prism.
|
||||
def convert_for_prism(version)
|
||||
case version
|
||||
|
9
test/prism/fixtures/encoding_binary.txt
Normal file
9
test/prism/fixtures/encoding_binary.txt
Normal file
@ -0,0 +1,9 @@
|
||||
# encoding: binary
|
||||
|
||||
"\xcd"
|
||||
|
||||
:"\xcd"
|
||||
|
||||
/#{"\xcd"}/
|
||||
|
||||
%W[\xC0]
|
6
test/prism/fixtures/encoding_euc_jp.txt
Normal file
6
test/prism/fixtures/encoding_euc_jp.txt
Normal file
@ -0,0 +1,6 @@
|
||||
# encoding: euc-jp
|
||||
|
||||
# \x8E indicates a double-byte character, \x01 is not a valid second byte in euc-jp
|
||||
"\x8E\x01"
|
||||
|
||||
%W["\x8E\x01"]
|
@ -17,6 +17,18 @@ end
|
||||
# First, opt in to every AST feature.
|
||||
Parser::Builders::Default.modernize
|
||||
|
||||
# The parser gem rejects some strings that would most likely lead to errors
|
||||
# in consumers due to encoding problems. RuboCop however monkey-patches this
|
||||
# method out in order to accept such code.
|
||||
# https://github.com/whitequark/parser/blob/v3.3.6.0/lib/parser/builders/default.rb#L2289-L2295
|
||||
Parser::Builders::Default.prepend(
  Module.new do
    # Hand back the token's value directly, replacing the builder's own
    # string_value implementation.
    def string_value(token)
      value(token)
    end
  end
)
|
||||
|
||||
# Modify the source map == check so that it doesn't check against the node
|
||||
# itself so we don't get into a recursive loop.
|
||||
Parser::Source::Map.prepend(
|
||||
|
@ -26,6 +26,7 @@ Sexp.prepend(
|
||||
module Prism
|
||||
class RubyParserTest < TestCase
|
||||
todos = [
|
||||
"encoding_euc_jp.txt",
|
||||
"newline_terminated.txt",
|
||||
"regex_char_width.txt",
|
||||
"seattlerb/bug169.txt",
|
||||
|
49
test/prism/snapshots/encoding_binary.txt
Normal file
49
test/prism/snapshots/encoding_binary.txt
Normal file
@ -0,0 +1,49 @@
|
||||
@ ProgramNode (location: (3,0)-(9,8))
|
||||
├── flags: ∅
|
||||
├── locals: []
|
||||
└── statements:
|
||||
@ StatementsNode (location: (3,0)-(9,8))
|
||||
├── flags: ∅
|
||||
└── body: (length: 4)
|
||||
├── @ StringNode (location: (3,0)-(3,6))
|
||||
│ ├── flags: newline
|
||||
│ ├── opening_loc: (3,0)-(3,1) = "\""
|
||||
│ ├── content_loc: (3,1)-(3,5) = "\\xcd"
|
||||
│ ├── closing_loc: (3,5)-(3,6) = "\""
|
||||
│ └── unescaped: "\xCD"
|
||||
├── @ SymbolNode (location: (5,0)-(5,7))
|
||||
│ ├── flags: newline, static_literal
|
||||
│ ├── opening_loc: (5,0)-(5,2) = ":\""
|
||||
│ ├── value_loc: (5,2)-(5,6) = "\\xcd"
|
||||
│ ├── closing_loc: (5,6)-(5,7) = "\""
|
||||
│ └── unescaped: "\xCD"
|
||||
├── @ InterpolatedRegularExpressionNode (location: (7,0)-(7,11))
|
||||
│ ├── flags: newline, static_literal
|
||||
│ ├── opening_loc: (7,0)-(7,1) = "/"
|
||||
│ ├── parts: (length: 1)
|
||||
│ │ └── @ EmbeddedStatementsNode (location: (7,1)-(7,10))
|
||||
│ │ ├── flags: ∅
|
||||
│ │ ├── opening_loc: (7,1)-(7,3) = "\#{"
|
||||
│ │ ├── statements:
|
||||
│ │ │ @ StatementsNode (location: (7,3)-(7,9))
|
||||
│ │ │ ├── flags: ∅
|
||||
│ │ │ └── body: (length: 1)
|
||||
│ │ │ └── @ StringNode (location: (7,3)-(7,9))
|
||||
│ │ │ ├── flags: static_literal, frozen
|
||||
│ │ │ ├── opening_loc: (7,3)-(7,4) = "\""
|
||||
│ │ │ ├── content_loc: (7,4)-(7,8) = "\\xcd"
|
||||
│ │ │ ├── closing_loc: (7,8)-(7,9) = "\""
|
||||
│ │ │ └── unescaped: "\xCD"
|
||||
│ │ └── closing_loc: (7,9)-(7,10) = "}"
|
||||
│ └── closing_loc: (7,10)-(7,11) = "/"
|
||||
└── @ ArrayNode (location: (9,0)-(9,8))
|
||||
├── flags: newline
|
||||
├── elements: (length: 1)
|
||||
│ └── @ StringNode (location: (9,3)-(9,7))
|
||||
│ ├── flags: ∅
|
||||
│ ├── opening_loc: ∅
|
||||
│ ├── content_loc: (9,3)-(9,7) = "\\xC0"
|
||||
│ ├── closing_loc: ∅
|
||||
│ └── unescaped: "\xC0"
|
||||
├── opening_loc: (9,0)-(9,3) = "%W["
|
||||
└── closing_loc: (9,7)-(9,8) = "]"
|
24
test/prism/snapshots/encoding_euc_jp.txt
Normal file
24
test/prism/snapshots/encoding_euc_jp.txt
Normal file
@ -0,0 +1,24 @@
|
||||
@ ProgramNode (location: (4,0)-(6,14))
|
||||
├── flags: ∅
|
||||
├── locals: []
|
||||
└── statements:
|
||||
@ StatementsNode (location: (4,0)-(6,14))
|
||||
├── flags: ∅
|
||||
└── body: (length: 2)
|
||||
├── @ StringNode (location: (4,0)-(4,10))
|
||||
│ ├── flags: newline
|
||||
│ ├── opening_loc: (4,0)-(4,1) = "\""
|
||||
│ ├── content_loc: (4,1)-(4,9) = "\\x8E\\x01"
|
||||
│ ├── closing_loc: (4,9)-(4,10) = "\""
|
||||
│ └── unescaped: "\x8E\x01"
|
||||
└── @ ArrayNode (location: (6,0)-(6,14))
|
||||
├── flags: newline
|
||||
├── elements: (length: 1)
|
||||
│ └── @ StringNode (location: (6,3)-(6,13))
|
||||
│ ├── flags: ∅
|
||||
│ ├── opening_loc: ∅
|
||||
│ ├── content_loc: (6,3)-(6,13) = "\"\\x8E\\x01\""
|
||||
│ ├── closing_loc: ∅
|
||||
│ └── unescaped: "\"\x8E\x01\""
|
||||
├── opening_loc: (6,0)-(6,3) = "%W["
|
||||
└── closing_loc: (6,13)-(6,14) = "]"
|
@ -5,6 +5,7 @@ require_relative "test_helper"
|
||||
module Prism
|
||||
class SnippetsTest < TestCase
|
||||
except = [
|
||||
"encoding_binary.txt",
|
||||
"newline_terminated.txt",
|
||||
"seattlerb/begin_rescue_else_ensure_no_bodies.txt",
|
||||
"seattlerb/case_in.txt",
|
||||
|
Loading…
x
Reference in New Issue
Block a user