diff --git a/lib/prism/parse_result.rb b/lib/prism/parse_result.rb index e3ba7e7c8e..46bd33d1db 100644 --- a/lib/prism/parse_result.rb +++ b/lib/prism/parse_result.rb @@ -120,6 +120,12 @@ module Prism end end + # Generate a cache that targets a specific encoding for calculating code + # unit offsets. + def code_units_cache(encoding) + CodeUnitsCache.new(source, encoding) + end + # Returns the column number in code units for the given encoding for the # given byte offset. def code_units_column(byte_offset, encoding) @@ -149,6 +155,76 @@ module Prism end end + # A cache that can be used to quickly compute code unit offsets from byte + # offsets. It purposefully provides only a single #[] method to access the + # cache in order to minimize surface area. + # + # Note that there are some known issues here that may or may not be addressed + # in the future: + # + # * The first is that there are issues when the cache computes values that are + # not on character boundaries. This can result in subsequent computations + # being off by one or more code units. + # * The second is that this cache is currently unbounded. In theory we could + # introduce some kind of LRU cache to limit the number of entries, but this + # has not yet been implemented. + # + class CodeUnitsCache + class UTF16Counter # :nodoc: + def initialize(source, encoding) + @source = source + @encoding = encoding + end + + def count(byte_offset, byte_length) + @source.byteslice(byte_offset, byte_length).encode(@encoding, invalid: :replace, undef: :replace).bytesize / 2 + end + end + + class LengthCounter # :nodoc: + def initialize(source, encoding) + @source = source + @encoding = encoding + end + + def count(byte_offset, byte_length) + @source.byteslice(byte_offset, byte_length).encode(@encoding, invalid: :replace, undef: :replace).length + end + end + + private_constant :UTF16Counter, :LengthCounter + + # Initialize a new cache with the given source and encoding. + def initialize(source, encoding) + @source = source + @counter = + if encoding == Encoding::UTF_16LE || encoding == Encoding::UTF_16BE + UTF16Counter.new(source, encoding) + else + LengthCounter.new(source, encoding) + end + + @cache = {} + @offsets = [] + end + + # Retrieve the code units offset from the given byte offset. + def [](byte_offset) + @cache[byte_offset] ||= + if (index = @offsets.bsearch_index { |offset| offset > byte_offset }).nil? + @offsets << byte_offset + @counter.count(0, byte_offset) + elsif index == 0 + @offsets.unshift(byte_offset) + @counter.count(0, byte_offset) + else + @offsets.insert(index, byte_offset) + offset = @offsets[index - 1] + @cache[offset] + @counter.count(offset, byte_offset - offset) + end + end + end + # Specialized version of Prism::Source for source code that includes ASCII # characters only. This class is used to apply performance optimizations that # cannot be applied to sources that include multibyte characters. @@ -178,6 +254,13 @@ module Prism byte_offset end + # Returns a cache that is the identity function in order to maintain the + # same interface. We can do this because code units are always equivalent to + # byte offsets for ASCII-only sources. + def code_units_cache(encoding) + ->(byte_offset) { byte_offset } + end + # Specialized version of `code_units_column` that does not depend on # `code_units_offset`, which is a more expensive operation. This is # essentially the same as `Prism::Source#column`. @@ -287,6 +370,12 @@ module Prism source.code_units_offset(start_offset, encoding) end + # The start offset from the start of the file in code units using the given + # cache to fetch or calculate the value. + def cached_start_code_units_offset(cache) + cache[start_offset] + end + # The byte offset from the beginning of the source where this location ends. def end_offset start_offset + length @@ -303,6 +392,12 @@ module Prism source.code_units_offset(end_offset, encoding) end + # The end offset from the start of the file in code units using the given + # cache to fetch or calculate the value. + def cached_end_code_units_offset(cache) + cache[end_offset] + end + # The line number where this location starts. def start_line source.line(start_offset) @@ -337,6 +432,12 @@ module Prism source.code_units_column(start_offset, encoding) end + # The start column in code units using the given cache to fetch or calculate + # the value. + def cached_start_code_units_column(cache) + cache[start_offset] - cache[source.line_start(start_offset)] + end + # The column number in bytes where this location ends from the start of the # line. def end_column @@ -355,6 +456,12 @@ module Prism source.code_units_column(end_offset, encoding) end + # The end column in code units using the given cache to fetch or calculate + # the value. + def cached_end_code_units_column(cache) + cache[end_offset] - cache[source.line_start(end_offset)] + end + # Implement the hash pattern matching interface for Location. def deconstruct_keys(keys) { start_offset: start_offset, end_offset: end_offset } @@ -604,6 +711,11 @@ module Prism def failure? !success? end + + # Create a code units cache for the given encoding. + def code_units_cache(encoding) + source.code_units_cache(encoding) + end end # This is a result specific to the `parse` and `parse_file` methods. diff --git a/test/prism/ruby/location_test.rb b/test/prism/ruby/location_test.rb index 3d3e7dd562..33f844243c 100644 --- a/test/prism/ruby/location_test.rb +++ b/test/prism/ruby/location_test.rb @@ -140,6 +140,52 @@ module Prism assert_equal 7, location.end_code_units_column(Encoding::UTF_32LE) end + def test_cached_code_units + result = Prism.parse("šŸ˜€ + šŸ˜€\nšŸ˜ ||= šŸ˜") + + utf8_cache = result.code_units_cache(Encoding::UTF_8) + utf16_cache = result.code_units_cache(Encoding::UTF_16LE) + utf32_cache = result.code_units_cache(Encoding::UTF_32LE) + + # first šŸ˜€ + location = result.value.statements.body.first.receiver.location + + assert_equal 0, location.cached_start_code_units_offset(utf8_cache) + assert_equal 0, location.cached_start_code_units_offset(utf16_cache) + assert_equal 0, location.cached_start_code_units_offset(utf32_cache) + + assert_equal 1, location.cached_end_code_units_offset(utf8_cache) + assert_equal 2, location.cached_end_code_units_offset(utf16_cache) + assert_equal 1, location.cached_end_code_units_offset(utf32_cache) + + assert_equal 0, location.cached_start_code_units_column(utf8_cache) + assert_equal 0, location.cached_start_code_units_column(utf16_cache) + assert_equal 0, location.cached_start_code_units_column(utf32_cache) + + assert_equal 1, location.cached_end_code_units_column(utf8_cache) + assert_equal 2, location.cached_end_code_units_column(utf16_cache) + assert_equal 1, location.cached_end_code_units_column(utf32_cache) + + # second šŸ˜€ + location = result.value.statements.body.first.arguments.arguments.first.location + + assert_equal 4, location.cached_start_code_units_offset(utf8_cache) + assert_equal 5, location.cached_start_code_units_offset(utf16_cache) + assert_equal 4, location.cached_start_code_units_offset(utf32_cache) + + assert_equal 5, location.cached_end_code_units_offset(utf8_cache) + assert_equal 7, location.cached_end_code_units_offset(utf16_cache) + assert_equal 5, location.cached_end_code_units_offset(utf32_cache) + + assert_equal 4, location.cached_start_code_units_column(utf8_cache) + assert_equal 5, location.cached_start_code_units_column(utf16_cache) + assert_equal 4, location.cached_start_code_units_column(utf32_cache) + + assert_equal 5, location.cached_end_code_units_column(utf8_cache) + assert_equal 7, location.cached_end_code_units_column(utf16_cache) + assert_equal 5, location.cached_end_code_units_column(utf32_cache) + end + def test_code_units_binary_valid_utf8 program = Prism.parse(<<~RUBY).value # -*- encoding: binary -*-