[ruby/prism] Prism::CodeUnitsCache
Calculating code unit offsets for a source can be very expensive, especially when the source is large. This commit introduces a new class that wraps the source and desired encoding into a cache that reuses pre-computed offsets. It performs quite a bit better. There are still some problems with this approach, namely character boundaries and the fact that the cache is unbounded, but both of these may be addressed in subsequent commits. https://github.com/ruby/prism/commit/2e3e1a4d4d
This commit is contained in:
parent
b77ff342cc
commit
7a198af7cd
@ -120,6 +120,12 @@ module Prism
|
||||
end
|
||||
end
|
||||
|
||||
# Generate a cache that targets a specific encoding for calculating code
|
||||
# unit offsets.
|
||||
def code_units_cache(encoding)
|
||||
CodeUnitsCache.new(source, encoding)
|
||||
end
|
||||
|
||||
# Returns the column number in code units for the given encoding for the
|
||||
# given byte offset.
|
||||
def code_units_column(byte_offset, encoding)
|
||||
@ -149,6 +155,76 @@ module Prism
|
||||
end
|
||||
end
|
||||
|
||||
# A cache that can be used to quickly compute code unit offsets from byte
|
||||
# offsets. It purposefully provides only a single #[] method to access the
|
||||
# cache in order to minimize surface area.
|
||||
#
|
||||
# Note that there are some known issues here that may or may not be addressed
|
||||
# in the future:
|
||||
#
|
||||
# * The first is that there are issues when the cache computes values that are
|
||||
# not on character boundaries. This can result in subsequent computations
|
||||
# being off by one or more code units.
|
||||
# * The second is that this cache is currently unbounded. In theory we could
|
||||
# introduce some kind of LRU cache to limit the number of entries, but this
|
||||
# has not yet been implemented.
|
||||
#
|
||||
class CodeUnitsCache
|
||||
class UTF16Counter # :nodoc:
|
||||
def initialize(source, encoding)
|
||||
@source = source
|
||||
@encoding = encoding
|
||||
end
|
||||
|
||||
def count(byte_offset, byte_length)
|
||||
@source.byteslice(byte_offset, byte_length).encode(@encoding, invalid: :replace, undef: :replace).bytesize / 2
|
||||
end
|
||||
end
|
||||
|
||||
class LengthCounter # :nodoc:
|
||||
def initialize(source, encoding)
|
||||
@source = source
|
||||
@encoding = encoding
|
||||
end
|
||||
|
||||
def count(byte_offset, byte_length)
|
||||
@source.byteslice(byte_offset, byte_length).encode(@encoding, invalid: :replace, undef: :replace).length
|
||||
end
|
||||
end
|
||||
|
||||
private_constant :UTF16Counter, :LengthCounter
|
||||
|
||||
# Initialize a new cache with the given source and encoding.
|
||||
def initialize(source, encoding)
|
||||
@source = source
|
||||
@counter =
|
||||
if encoding == Encoding::UTF_16LE || encoding == Encoding::UTF_16BE
|
||||
UTF16Counter.new(source, encoding)
|
||||
else
|
||||
LengthCounter.new(source, encoding)
|
||||
end
|
||||
|
||||
@cache = {}
|
||||
@offsets = []
|
||||
end
|
||||
|
||||
# Retrieve the code units offset from the given byte offset.
|
||||
def [](byte_offset)
|
||||
@cache[byte_offset] ||=
|
||||
if (index = @offsets.bsearch_index { |offset| offset > byte_offset }).nil?
|
||||
@offsets << byte_offset
|
||||
@counter.count(0, byte_offset)
|
||||
elsif index == 0
|
||||
@offsets.unshift(byte_offset)
|
||||
@counter.count(0, byte_offset)
|
||||
else
|
||||
@offsets.insert(index, byte_offset)
|
||||
offset = @offsets[index - 1]
|
||||
@cache[offset] + @counter.count(offset, byte_offset - offset)
|
||||
end
|
||||
end
|
||||
end
|
||||
|
||||
# Specialized version of Prism::Source for source code that includes ASCII
|
||||
# characters only. This class is used to apply performance optimizations that
|
||||
# cannot be applied to sources that include multibyte characters.
|
||||
@ -178,6 +254,13 @@ module Prism
|
||||
byte_offset
|
||||
end
|
||||
|
||||
# Returns a cache that is the identity function in order to maintain the
|
||||
# same interface. We can do this because code units are always equivalent to
|
||||
# byte offsets for ASCII-only sources.
|
||||
def code_units_cache(encoding)
|
||||
->(byte_offset) { byte_offset }
|
||||
end
|
||||
|
||||
# Specialized version of `code_units_column` that does not depend on
|
||||
# `code_units_offset`, which is a more expensive operation. This is
|
||||
# essentially the same as `Prism::Source#column`.
|
||||
@ -287,6 +370,12 @@ module Prism
|
||||
source.code_units_offset(start_offset, encoding)
|
||||
end
|
||||
|
||||
# The start offset from the start of the file in code units using the given
|
||||
# cache to fetch or calculate the value.
|
||||
def cached_start_code_units_offset(cache)
|
||||
cache[start_offset]
|
||||
end
|
||||
|
||||
# The byte offset from the beginning of the source where this location ends.
|
||||
def end_offset
|
||||
start_offset + length
|
||||
@ -303,6 +392,12 @@ module Prism
|
||||
source.code_units_offset(end_offset, encoding)
|
||||
end
|
||||
|
||||
# The end offset from the start of the file in code units using the given
|
||||
# cache to fetch or calculate the value.
|
||||
def cached_end_code_units_offset(cache)
|
||||
cache[end_offset]
|
||||
end
|
||||
|
||||
# The line number where this location starts.
|
||||
def start_line
|
||||
source.line(start_offset)
|
||||
@ -337,6 +432,12 @@ module Prism
|
||||
source.code_units_column(start_offset, encoding)
|
||||
end
|
||||
|
||||
# The start column in code units using the given cache to fetch or calculate
|
||||
# the value.
|
||||
def cached_start_code_units_column(cache)
|
||||
cache[start_offset] - cache[source.line_start(start_offset)]
|
||||
end
|
||||
|
||||
# The column number in bytes where this location ends from the start of the
|
||||
# line.
|
||||
def end_column
|
||||
@ -355,6 +456,12 @@ module Prism
|
||||
source.code_units_column(end_offset, encoding)
|
||||
end
|
||||
|
||||
# The end column in code units using the given cache to fetch or calculate
|
||||
# the value.
|
||||
def cached_end_code_units_column(cache)
|
||||
cache[end_offset] - cache[source.line_start(end_offset)]
|
||||
end
|
||||
|
||||
# Implement the hash pattern matching interface for Location.
|
||||
def deconstruct_keys(keys)
|
||||
{ start_offset: start_offset, end_offset: end_offset }
|
||||
@ -604,6 +711,11 @@ module Prism
|
||||
def failure?
|
||||
!success?
|
||||
end
|
||||
|
||||
# Create a code units cache for the given encoding.
|
||||
def code_units_cache(encoding)
|
||||
source.code_units_cache(encoding)
|
||||
end
|
||||
end
|
||||
|
||||
# This is a result specific to the `parse` and `parse_file` methods.
|
||||
|
@ -140,6 +140,52 @@ module Prism
|
||||
assert_equal 7, location.end_code_units_column(Encoding::UTF_32LE)
|
||||
end
|
||||
|
||||
def test_cached_code_units
|
||||
result = Prism.parse("😀 + 😀\n😍 ||= 😍")
|
||||
|
||||
utf8_cache = result.code_units_cache(Encoding::UTF_8)
|
||||
utf16_cache = result.code_units_cache(Encoding::UTF_16LE)
|
||||
utf32_cache = result.code_units_cache(Encoding::UTF_32LE)
|
||||
|
||||
# first 😀
|
||||
location = result.value.statements.body.first.receiver.location
|
||||
|
||||
assert_equal 0, location.cached_start_code_units_offset(utf8_cache)
|
||||
assert_equal 0, location.cached_start_code_units_offset(utf16_cache)
|
||||
assert_equal 0, location.cached_start_code_units_offset(utf32_cache)
|
||||
|
||||
assert_equal 1, location.cached_end_code_units_offset(utf8_cache)
|
||||
assert_equal 2, location.cached_end_code_units_offset(utf16_cache)
|
||||
assert_equal 1, location.cached_end_code_units_offset(utf32_cache)
|
||||
|
||||
assert_equal 0, location.cached_start_code_units_column(utf8_cache)
|
||||
assert_equal 0, location.cached_start_code_units_column(utf16_cache)
|
||||
assert_equal 0, location.cached_start_code_units_column(utf32_cache)
|
||||
|
||||
assert_equal 1, location.cached_end_code_units_column(utf8_cache)
|
||||
assert_equal 2, location.cached_end_code_units_column(utf16_cache)
|
||||
assert_equal 1, location.cached_end_code_units_column(utf32_cache)
|
||||
|
||||
# second 😀
|
||||
location = result.value.statements.body.first.arguments.arguments.first.location
|
||||
|
||||
assert_equal 4, location.cached_start_code_units_offset(utf8_cache)
|
||||
assert_equal 5, location.cached_start_code_units_offset(utf16_cache)
|
||||
assert_equal 4, location.cached_start_code_units_offset(utf32_cache)
|
||||
|
||||
assert_equal 5, location.cached_end_code_units_offset(utf8_cache)
|
||||
assert_equal 7, location.cached_end_code_units_offset(utf16_cache)
|
||||
assert_equal 5, location.cached_end_code_units_offset(utf32_cache)
|
||||
|
||||
assert_equal 4, location.cached_start_code_units_column(utf8_cache)
|
||||
assert_equal 5, location.cached_start_code_units_column(utf16_cache)
|
||||
assert_equal 4, location.cached_start_code_units_column(utf32_cache)
|
||||
|
||||
assert_equal 5, location.cached_end_code_units_column(utf8_cache)
|
||||
assert_equal 7, location.cached_end_code_units_column(utf16_cache)
|
||||
assert_equal 5, location.cached_end_code_units_column(utf32_cache)
|
||||
end
|
||||
|
||||
def test_code_units_binary_valid_utf8
|
||||
program = Prism.parse(<<~RUBY).value
|
||||
# -*- encoding: binary -*-
|
||||
|
Loading…
x
Reference in New Issue
Block a user