[ruby/prism] Add character APIs for locations

(https://github.com/ruby/prism/pull/1809) https://github.com/ruby/prism/commit/d493ccd093
2023-11-20 11:07:02 -05:00 · 2023-11-20 11:07:02 -05:00 · f2ed7eaba0
commit f2ed7eaba0
parent adee7dab3e
4 changed files with 83 additions and 23 deletions
--- a/lib/prism/ffi.rb
+++ b/lib/prism/ffi.rb
@ -230,7 +230,7 @@ module Prism
        loader = Serialize::Loader.new(source, buffer.read)

        loader.load_header
-        loader.load_force_encoding
+        loader.load_encoding
        loader.load_start_line
        loader.load_comments
      end
--- a/lib/prism/parse_result.rb
+++ b/lib/prism/parse_result.rb
@ -25,40 +25,50 @@ module Prism

    # Perform a byteslice on the source code using the given byte offset and
    # byte length.
-    def slice(offset, length)
-      source.byteslice(offset, length)
+    def slice(byte_offset, length)
+      source.byteslice(byte_offset, length)
    end

    # Binary search through the offsets to find the line number for the given
    # byte offset.
-    def line(value)
-      start_line + find_line(value)
+    def line(byte_offset)
+      start_line + find_line(byte_offset)
    end

    # Return the byte offset of the start of the line corresponding to the given
    # byte offset.
-    def line_offset(value)
-      offsets[find_line(value)]
+    def line_start(byte_offset)
+      offsets[find_line(byte_offset)]
    end

    # Return the column number for the given byte offset.
-    def column(value)
-      value - offsets[find_line(value)]
+    def column(byte_offset)
+      byte_offset - line_start(byte_offset)
+    end
+
+    # Return the character offset for the given byte offset.
+    def character_offset(byte_offset)
+      source.byteslice(0, byte_offset).length
+    end
+
+    # Return the column number in characters for the given byte offset.
+    def character_column(byte_offset)
+      character_offset(byte_offset) - character_offset(line_start(byte_offset))
    end

    private

    # Binary search through the offsets to find the line number for the given
    # byte offset.
-    def find_line(value)
+    def find_line(byte_offset)
      left = 0
      right = offsets.length - 1

      while left <= right
        mid = left + (right - left) / 2
-        return mid if offsets[mid] == value
+        return mid if offsets[mid] == byte_offset

-        if offsets[mid] < value
+        if offsets[mid] < byte_offset
          left = mid + 1
        else
          right = mid - 1
@ -121,11 +131,23 @@ module Prism
      source.slice(start_offset, length)
    end

+    # The character offset from the beginning of the source where this location
+    # starts.
+    def start_character_offset
+      source.character_offset(start_offset)
+    end
+
    # The byte offset from the beginning of the source where this location ends.
    def end_offset
      start_offset + length
    end

+    # The character offset from the beginning of the source where this location
+    # ends.
+    def end_character_offset
+      source.character_offset(end_offset)
+    end
+
    # The line number where this location starts.
    def start_line
      source.line(start_offset)
@ -133,7 +155,7 @@ module Prism

    # The content of the line where this location starts before this location.
    def start_line_slice
-      offset = source.line_offset(start_offset)
+      offset = source.line_start(start_offset)
      source.slice(offset, start_offset - offset)
    end

@ -148,12 +170,24 @@ module Prism
      source.column(start_offset)
    end

+    # The column number in characters where this location starts from the start
+    # of the line.
+    def start_character_column
+      source.character_column(start_offset)
+    end
+
    # The column number in bytes where this location ends from the start of the
    # line.
    def end_column
      source.column(end_offset)
    end

+    # The column number in characters where this location ends from the start of
+    # the line.
+    def end_character_column
+      source.character_column(end_offset)
+    end
+
    # Implement the hash pattern matching interface for Location.
    def deconstruct_keys(keys)
      { start_offset: start_offset, end_offset: end_offset }
--- a/prism/templates/lib/prism/serialize.rb.erb
+++ b/prism/templates/lib/prism/serialize.rb.erb
@ -73,12 +73,9 @@ module Prism
      end

      def load_encoding
-        Encoding.find(io.read(load_varint))
-      end
-
-      def load_force_encoding
-        @encoding = load_encoding
+        @encoding = Encoding.find(io.read(load_varint))
        @input = input.force_encoding(@encoding).freeze
+        @encoding
      end

      def load_start_line
@ -121,10 +118,7 @@ module Prism
        encoding = load_encoding
        load_start_line
        comments, magic_comments, errors, warnings = load_metadata
-
-        if encoding != @encoding
-          tokens.each { |token,| token.value.force_encoding(encoding) }
-        end
+        tokens.each { |token,| token.value.force_encoding(encoding) }

        raise "Expected to consume all bytes while deserializing" unless @io.eof?
        Prism::ParseResult.new(tokens, comments, magic_comments, errors, warnings, @source)
@ -132,7 +126,7 @@ module Prism

      def load_nodes
        load_header
-        load_force_encoding
+        load_encoding
        load_start_line

        comments, magic_comments, errors, warnings = load_metadata
--- a/test/prism/ruby_api_test.rb
+++ b/test/prism/ruby_api_test.rb
@ -71,6 +71,38 @@ module Prism
      end
    end

+    def test_location_character_offsets
+      program = Prism.parse("😀 + 😀\n😍 ||= 😍").value
+
+      # first 😀
+      location = program.statements.body.first.receiver.location
+      assert_equal 0, location.start_character_offset
+      assert_equal 1, location.end_character_offset
+      assert_equal 0, location.start_character_column
+      assert_equal 1, location.end_character_column
+
+      # second 😀
+      location = program.statements.body.first.arguments.arguments.first.location
+      assert_equal 4, location.start_character_offset
+      assert_equal 5, location.end_character_offset
+      assert_equal 4, location.start_character_column
+      assert_equal 5, location.end_character_column
+
+      # first 😍
+      location = program.statements.body.last.name_loc
+      assert_equal 6, location.start_character_offset
+      assert_equal 7, location.end_character_offset
+      assert_equal 0, location.start_character_column
+      assert_equal 1, location.end_character_column
+
+      # second 😍
+      location = program.statements.body.last.value.location
+      assert_equal 12, location.start_character_offset
+      assert_equal 13, location.end_character_offset
+      assert_equal 6, location.start_character_column
+      assert_equal 7, location.end_character_column
+    end
+
    private

    def parse_expression(source)