[ruby/prism] Avoid breaking code units offset on binary encoding

https://github.com/ruby/prism/commit/25a4cf6794

Co-authored-by: Kevin Newton <kddnewton@users.noreply.github.com>
This commit is contained in:
Vinicius Stock 2024-10-08 10:47:08 -04:00 committed by git
parent 615a087216
commit e50754fcfa
2 changed files with 20 additions and 1 deletions

View File

@ -90,7 +90,7 @@ module Prism
# concept of code units that differs from the number of characters in other
# encodings, it is not captured here.
def code_units_offset(byte_offset, encoding)
byteslice = (source.byteslice(0, byte_offset) or raise).encode(encoding)
byteslice = (source.byteslice(0, byte_offset) or raise).encode(encoding, invalid: :replace, undef: :replace)
if encoding == Encoding::UTF_16LE || encoding == Encoding::UTF_16BE
byteslice.bytesize / 2

View File

@ -140,6 +140,25 @@ module Prism
assert_equal 7, location.end_code_units_column(Encoding::UTF_32LE)
end
def test_code_units_handles_binary_encoding_with_multibyte_characters
  # With a binary magic comment and multibyte characters in the source,
  # asking for code unit columns must not break. The values reported are
  # still not correct for the requested encoding; this test only pins the
  # current (non-raising) behavior.
  source = <<~RUBY
    # -*- encoding: binary -*-
    😀 + 😀
  RUBY
  ast = Prism.parse(source).value

  # first 😀 (the receiver of the first statement)
  receiver_location = ast.statements.body.first.receiver.location

  [Encoding::UTF_8, Encoding::UTF_16LE, Encoding::UTF_32LE].each do |encoding|
    assert_equal 4, receiver_location.end_code_units_column(encoding)
  end
end
def test_chop
location = Prism.parse("foo").value.location