[ruby/prism] Attempt to assume binary is UTF-8
https://github.com/ruby/prism/commit/343197e4ff
This commit is contained in:
parent
2e2a5e4ff9
commit
e39e582594
@ -12,6 +12,21 @@ module Prism
|
|||||||
def self.for(source, start_line = 1, offsets = [])
|
def self.for(source, start_line = 1, offsets = [])
|
||||||
if source.ascii_only?
|
if source.ascii_only?
|
||||||
ASCIISource.new(source, start_line, offsets)
|
ASCIISource.new(source, start_line, offsets)
|
||||||
|
elsif source.encoding == Encoding::BINARY
|
||||||
|
source.force_encoding(Encoding::UTF_8)
|
||||||
|
|
||||||
|
if source.valid_encoding?
|
||||||
|
new(source, start_line, offsets)
|
||||||
|
else
|
||||||
|
# This is an extremely niche use case where the file is marked as
|
||||||
|
# binary, contains multi-byte characters, and those characters are not
|
||||||
|
# valid UTF-8. In this case we'll mark it as binary and fall back to
|
||||||
|
# treating everything as a single-byte character. This _may_ cause
|
||||||
|
# problems when asking for code units, but it appears to be the
|
||||||
|
# cleanest solution at the moment.
|
||||||
|
source.force_encoding(Encoding::BINARY)
|
||||||
|
ASCIISource.new(source, start_line, offsets)
|
||||||
|
end
|
||||||
else
|
else
|
||||||
new(source, start_line, offsets)
|
new(source, start_line, offsets)
|
||||||
end
|
end
|
||||||
@ -89,6 +104,12 @@ module Prism
|
|||||||
# This method is tested with UTF-8, UTF-16, and UTF-32. If there is the
|
# This method is tested with UTF-8, UTF-16, and UTF-32. If there is the
|
||||||
# concept of code units that differs from the number of characters in other
|
# concept of code units that differs from the number of characters in other
|
||||||
# encodings, it is not captured here.
|
# encodings, it is not captured here.
|
||||||
|
#
|
||||||
|
# We purposefully replace invalid and undefined characters with replacement
|
||||||
|
# characters in this conversion. This happens for two reasons. First, it's
|
||||||
|
# possible that the given byte offset will not occur on a character
|
||||||
|
# boundary. Second, it's possible that the source code will contain a
|
||||||
|
# character that has no equivalent in the given encoding.
|
||||||
def code_units_offset(byte_offset, encoding)
|
def code_units_offset(byte_offset, encoding)
|
||||||
byteslice = (source.byteslice(0, byte_offset) or raise).encode(encoding, invalid: :replace, undef: :replace)
|
byteslice = (source.byteslice(0, byte_offset) or raise).encode(encoding, invalid: :replace, undef: :replace)
|
||||||
|
|
||||||
@ -130,8 +151,12 @@ module Prism
|
|||||||
|
|
||||||
# Specialized version of Prism::Source for source code that includes ASCII
|
# Specialized version of Prism::Source for source code that includes ASCII
|
||||||
# characters only. This class is used to apply performance optimizations that
|
# characters only. This class is used to apply performance optimizations that
|
||||||
# cannot be applied to sources that include multibyte characters. Sources that
|
# cannot be applied to sources that include multibyte characters.
|
||||||
# include multibyte characters are represented by the Prism::Source class.
|
#
|
||||||
|
# In the extremely rare case that a source includes multi-byte characters but
|
||||||
|
# is marked as binary because of a magic encoding comment and it cannot be
|
||||||
|
# eagerly converted to UTF-8, this class will be used as well. This is because
|
||||||
|
# at that point we will treat everything as single-byte characters.
|
||||||
class ASCIISource < Source
|
class ASCIISource < Source
|
||||||
# Return the character offset for the given byte offset.
|
# Return the character offset for the given byte offset.
|
||||||
def character_offset(byte_offset)
|
def character_offset(byte_offset)
|
||||||
|
@ -20,10 +20,21 @@ module Prism
|
|||||||
def self.load(input, serialized)
|
def self.load(input, serialized)
|
||||||
input = input.dup
|
input = input.dup
|
||||||
source = Source.for(input)
|
source = Source.for(input)
|
||||||
|
|
||||||
loader = Loader.new(source, serialized)
|
loader = Loader.new(source, serialized)
|
||||||
result = loader.load_result
|
result = loader.load_result
|
||||||
|
|
||||||
input.force_encoding(loader.encoding)
|
input.force_encoding(loader.encoding)
|
||||||
|
|
||||||
|
# This is an extremely niche use case where the file was marked as binary
|
||||||
|
# but it contained UTF-8-encoded characters. In that case we will actually
|
||||||
|
# put it back to UTF-8 to give the location APIs the best chance of being
|
||||||
|
# correct.
|
||||||
|
if !input.ascii_only? && input.encoding == Encoding::BINARY
|
||||||
|
input.force_encoding(Encoding::UTF_8)
|
||||||
|
input.force_encoding(Encoding::BINARY) unless input.valid_encoding?
|
||||||
|
end
|
||||||
|
|
||||||
result
|
result
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@ -140,23 +140,36 @@ module Prism
|
|||||||
assert_equal 7, location.end_code_units_column(Encoding::UTF_32LE)
|
assert_equal 7, location.end_code_units_column(Encoding::UTF_32LE)
|
||||||
end
|
end
|
||||||
|
|
||||||
def test_code_units_handles_binary_encoding_with_multibyte_characters
|
def test_code_units_binary_valid_utf8
|
||||||
# If the encoding is set to binary and the source contains multibyte
|
|
||||||
# characters, we avoid breaking the code unit offsets, but they will
|
|
||||||
# still be incorrect.
|
|
||||||
|
|
||||||
program = Prism.parse(<<~RUBY).value
|
program = Prism.parse(<<~RUBY).value
|
||||||
# -*- encoding: binary -*-
|
# -*- encoding: binary -*-
|
||||||
|
|
||||||
😀 + 😀
|
😀 + 😀
|
||||||
RUBY
|
RUBY
|
||||||
|
|
||||||
# first 😀
|
receiver = program.statements.body.first.receiver
|
||||||
location = program.statements.body.first.receiver.location
|
assert_equal "😀".b.to_sym, receiver.name
|
||||||
|
|
||||||
assert_equal 4, location.end_code_units_column(Encoding::UTF_8)
|
location = receiver.location
|
||||||
assert_equal 4, location.end_code_units_column(Encoding::UTF_16LE)
|
assert_equal 1, location.end_code_units_column(Encoding::UTF_8)
|
||||||
assert_equal 4, location.end_code_units_column(Encoding::UTF_32LE)
|
assert_equal 2, location.end_code_units_column(Encoding::UTF_16LE)
|
||||||
|
assert_equal 1, location.end_code_units_column(Encoding::UTF_32LE)
|
||||||
|
end
|
||||||
|
|
||||||
|
def test_code_units_binary_invalid_utf8
|
||||||
|
program = Prism.parse(<<~RUBY).value
|
||||||
|
# -*- encoding: binary -*-
|
||||||
|
|
||||||
|
\x90 + \x90
|
||||||
|
RUBY
|
||||||
|
|
||||||
|
receiver = program.statements.body.first.receiver
|
||||||
|
assert_equal "\x90".b.to_sym, receiver.name
|
||||||
|
|
||||||
|
location = receiver.location
|
||||||
|
assert_equal 1, location.end_code_units_column(Encoding::UTF_8)
|
||||||
|
assert_equal 1, location.end_code_units_column(Encoding::UTF_16LE)
|
||||||
|
assert_equal 1, location.end_code_units_column(Encoding::UTF_32LE)
|
||||||
end
|
end
|
||||||
|
|
||||||
def test_chop
|
def test_chop
|
||||||
|
Loading…
x
Reference in New Issue
Block a user