[ruby/prism] Better comment token handling for the parser translator

There appear to be a bunch of rules that change behaviour for
inline comments, multiple consecutive comments, etc.

This seems to line up with reality pretty closely: token differences for RuboCop tests go from 1129 to 619, which seems pretty impressive.

https://github.com/ruby/prism/commit/2e1b92670c
This commit is contained in:
Earlopain 2025-01-11 18:48:48 +01:00 committed by Kevin Newton
parent 117d6e145a
commit 70022224b2
2 changed files with 146 additions and 53 deletions

View File

@ -199,10 +199,18 @@ module Prism
# The following token types are listed as those classified as `tLPAREN`. # The following token types are listed as those classified as `tLPAREN`.
LPAREN_CONVERSION_TOKEN_TYPES = [ LPAREN_CONVERSION_TOKEN_TYPES = [
:kBREAK, :kCASE, :tDIVIDE, :kFOR, :kIF, :kNEXT, :kRETURN, :kUNTIL, :kWHILE, :tAMPER, :tANDOP, :tBANG, :tCOMMA, :tDOT2, :tDOT3, :kBREAK, :kCASE, :tDIVIDE, :kFOR, :kIF, :kNEXT, :kRETURN, :kUNTIL, :kWHILE, :tAMPER, :tANDOP, :tBANG, :tCOMMA, :tDOT2, :tDOT3,
:tEQL, :tLPAREN, :tLPAREN2, :tLSHFT, :tNL, :tOP_ASGN, :tOROP, :tPIPE, :tSEMI, :tSTRING_DBEG, :tUMINUS, :tUPLUS :tEQL, :tLPAREN, :tLPAREN2, :tLPAREN_ARG, :tLSHFT, :tNL, :tOP_ASGN, :tOROP, :tPIPE, :tSEMI, :tSTRING_DBEG, :tUMINUS, :tUPLUS
] ]
private_constant :TYPES, :EXPR_BEG, :EXPR_LABEL, :LAMBDA_TOKEN_TYPES, :LPAREN_CONVERSION_TOKEN_TYPES # Types of tokens that are allowed to continue a method call with comments in-between.
# For these, the parser gem doesn't emit a newline token after the last comment.
COMMENT_CONTINUATION_TYPES = [:COMMENT, :AMPERSAND_DOT, :DOT]
private_constant :COMMENT_CONTINUATION_TYPES
# Heredocs are complex and require us to keep track of a bit of info to refer to later
HeredocData = Struct.new(:identifier, :common_whitespace, keyword_init: true)
private_constant :TYPES, :EXPR_BEG, :EXPR_LABEL, :LAMBDA_TOKEN_TYPES, :LPAREN_CONVERSION_TOKEN_TYPES, :HeredocData
# The Parser::Source::Buffer that the tokens were lexed from. # The Parser::Source::Buffer that the tokens were lexed from.
attr_reader :source_buffer attr_reader :source_buffer
@ -232,7 +240,13 @@ module Prism
index = 0 index = 0
length = lexed.length length = lexed.length
heredoc_identifier_stack = [] heredoc_stack = []
quote_stack = []
# The parser gem emits the newline tokens for comments out of order. This saves
# that token location to emit at a later time to properly line everything up.
# https://github.com/whitequark/parser/issues/1025
comment_newline_location = nil
while index < length while index < length
token, state = lexed[index] token, state = lexed[index]
@ -241,7 +255,7 @@ module Prism
type = TYPES.fetch(token.type) type = TYPES.fetch(token.type)
value = token.value value = token.value
location = Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[token.location.end_offset]) location = range(token.location.start_offset, token.location.end_offset)
case type case type
when :kDO when :kDO
@ -257,23 +271,46 @@ module Prism
value = unescape_string(value, "?") value = unescape_string(value, "?")
when :tCOMMENT when :tCOMMENT
if token.type == :EMBDOC_BEGIN if token.type == :EMBDOC_BEGIN
start_index = index
while !((next_token = lexed[index][0]) && next_token.type == :EMBDOC_END) && (index < length - 1) while !((next_token = lexed[index][0]) && next_token.type == :EMBDOC_END) && (index < length - 1)
value += next_token.value value += next_token.value
index += 1 index += 1
end end
if start_index != index
value += next_token.value value += next_token.value
location = Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[lexed[index][0].location.end_offset]) location = range(token.location.start_offset, lexed[index][0].location.end_offset)
index += 1 index += 1
end
else else
value.chomp! value.chomp!
location = Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[token.location.end_offset - 1]) location = range(token.location.start_offset, token.location.end_offset - 1)
prev_token = lexed[index - 2][0]
next_token = lexed[index][0]
is_inline_comment = prev_token.location.start_line == token.location.start_line
if is_inline_comment && !COMMENT_CONTINUATION_TYPES.include?(next_token&.type)
tokens << [:tCOMMENT, [value, location]]
nl_location = range(token.location.end_offset - 1, token.location.end_offset)
tokens << [:tNL, [nil, nl_location]]
next
elsif is_inline_comment && next_token&.type == :COMMENT
comment_newline_location = range(token.location.end_offset - 1, token.location.end_offset)
elsif comment_newline_location && !COMMENT_CONTINUATION_TYPES.include?(next_token&.type)
tokens << [:tCOMMENT, [value, location]]
tokens << [:tNL, [nil, comment_newline_location]]
comment_newline_location = nil
next
end
end end
when :tNL when :tNL
next_token = next_token = lexed[index][0]
# Newlines after comments are emitted out of order.
if next_token&.type == :COMMENT
comment_newline_location = location
next
end
value = nil value = nil
when :tFLOAT when :tFLOAT
value = parse_float(value) value = parse_float(value)
@ -281,8 +318,8 @@ module Prism
value = parse_complex(value) value = parse_complex(value)
when :tINTEGER when :tINTEGER
if value.start_with?("+") if value.start_with?("+")
tokens << [:tUNARY_NUM, ["+", Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[token.location.start_offset + 1])]] tokens << [:tUNARY_NUM, ["+", range(token.location.start_offset, token.location.start_offset + 1)]]
location = Range.new(source_buffer, offset_cache[token.location.start_offset + 1], offset_cache[token.location.end_offset]) location = range(token.location.start_offset + 1, token.location.end_offset)
end end
value = parse_integer(value) value = parse_integer(value)
@ -303,47 +340,80 @@ module Prism
when :tSPACE when :tSPACE
value = nil value = nil
when :tSTRING_BEG when :tSTRING_BEG
if token.type == :HEREDOC_START next_token = lexed[index][0]
heredoc_identifier_stack.push(value.match(/<<[-~]?["'`]?(?<heredoc_identifier>.*?)["'`]?\z/)[:heredoc_identifier]) next_next_token = lexed[index + 1][0]
end basic_quotes = ["\"", "'"].include?(value)
if ["\"", "'"].include?(value) && (next_token = lexed[index][0]) && next_token.type == :STRING_END
if basic_quotes && next_token&.type == :STRING_END
next_location = token.location.join(next_token.location) next_location = token.location.join(next_token.location)
type = :tSTRING type = :tSTRING
value = "" value = ""
location = Range.new(source_buffer, offset_cache[next_location.start_offset], offset_cache[next_location.end_offset]) location = range(next_location.start_offset, next_location.end_offset)
index += 1 index += 1
elsif ["\"", "'"].include?(value) && (next_token = lexed[index][0]) && next_token.type == :STRING_CONTENT && next_token.value.lines.count <= 1 && (next_next_token = lexed[index + 1][0]) && next_next_token.type == :STRING_END elsif value.start_with?("'", '"', "%")
if next_token&.type == :STRING_CONTENT && next_token.value.lines.count <= 1 && next_next_token&.type == :STRING_END
# the parser gem doesn't simplify strings when its value ends in a newline
if !(string_value = next_token.value).end_with?("\n") && basic_quotes
next_location = token.location.join(next_next_token.location) next_location = token.location.join(next_next_token.location)
value = unescape_string(string_value, value)
type = :tSTRING type = :tSTRING
value = next_token.value.gsub("\\\\", "\\") location = range(next_location.start_offset, next_location.end_offset)
location = Range.new(source_buffer, offset_cache[next_location.start_offset], offset_cache[next_location.end_offset])
index += 2 index += 2
elsif value.start_with?("<<") tokens << [type, [value, location]]
next
end
end
quote_stack.push(value)
elsif token.type == :HEREDOC_START
quote = value[2] == "-" || value[2] == "~" ? value[3] : value[2] quote = value[2] == "-" || value[2] == "~" ? value[3] : value[2]
heredoc_type = value[2] == "-" || value[2] == "~" ? value[2] : ""
heredoc = HeredocData.new(
identifier: value.match(/<<[-~]?["'`]?(?<heredoc_identifier>.*?)["'`]?\z/)[:heredoc_identifier],
common_whitespace: 0,
)
if quote == "`" if quote == "`"
type = :tXSTRING_BEG type = :tXSTRING_BEG
value = "<<`"
else
value = "<<#{quote == "'" || quote == "\"" ? quote : "\""}"
end end
# The parser gem trims whitespace from squiggly heredocs. We must record
# the most common whitespace to later remove.
if heredoc_type == "~" || heredoc_type == "`"
heredoc.common_whitespace = calculate_heredoc_whitespace(index)
end
if quote == "'" || quote == '"' || quote == "`"
value = "<<#{quote}"
else
value = '<<"'
end
heredoc_stack.push(heredoc)
quote_stack.push(value)
end end
when :tSTRING_CONTENT when :tSTRING_CONTENT
unless (lines = token.value.lines).one? if (lines = token.value.lines).one?
start_offset = offset_cache[token.location.start_offset] # Heredoc interpolation can have multiple STRING_CONTENT nodes on the same line.
lines.map do |line| is_first_token_on_line = lexed[index - 1] && token.location.start_line != lexed[index - 2][0].location&.start_line
newline = line.end_with?("\r\n") ? "\r\n" : "\n" # The parser gem only removes indentation when the heredoc is not nested
chomped_line = line.chomp not_nested = heredoc_stack.size == 1
<<<<<<< HEAD if is_first_token_on_line && not_nested && (current_heredoc = heredoc_stack.last).common_whitespace > 0
if match = chomped_line.match(/(?<backslashes>\\+)\z/) value = trim_heredoc_whitespace(value, current_heredoc)
adjustment = match[:backslashes].size / 2
adjusted_line = chomped_line.delete_suffix("\\" * adjustment)
if match[:backslashes].size.odd?
adjusted_line.delete_suffix!("\\")
adjustment += 2
else
adjusted_line << newline
end end
=======
value = unescape_string(value, quote_stack.last)
else
# When the parser gem encounters a line continuation inside of a multiline string,
# it emits a single string node. The backslash (and remaining newline) is removed.
current_line = +""
adjustment = 0
start_offset = offset_cache[token.location.start_offset]
emit = false
lines.each.with_index do |line, index|
chomped_line = line.chomp
backslash_count = chomped_line[/\\{1,}\z/]&.length || 0 backslash_count = chomped_line[/\\{1,}\z/]&.length || 0
is_interpolation = interpolation?(quote_stack.last) is_interpolation = interpolation?(quote_stack.last)
is_percent_array = percent_array?(quote_stack.last) is_percent_array = percent_array?(quote_stack.last)
@ -360,15 +430,18 @@ module Prism
end end
# If the string ends with a line continuation emit the remainder # If the string ends with a line continuation emit the remainder
emit = index == lines.count - 1 emit = index == lines.count - 1
>>>>>>> b6554ad64e (Fix parser translator tokens for backslashes in single-quoted strings and word arrays)
else else
adjusted_line = line current_line << line
adjustment = 0 emit = true
end end
end_offset = start_offset + adjusted_line.bytesize + adjustment if emit
tokens << [:tSTRING_CONTENT, [adjusted_line, Range.new(source_buffer, offset_cache[start_offset], offset_cache[end_offset])]] end_offset = start_offset + current_line.bytesize + adjustment
tokens << [:tSTRING_CONTENT, [unescape_string(current_line, quote_stack.last), range(start_offset, end_offset)]]
start_offset = end_offset start_offset = end_offset
current_line = +""
adjustment = 0
end
end end
next next
end end
@ -377,20 +450,24 @@ module Prism
when :tSTRING_END when :tSTRING_END
if token.type == :HEREDOC_END && value.end_with?("\n") if token.type == :HEREDOC_END && value.end_with?("\n")
newline_length = value.end_with?("\r\n") ? 2 : 1 newline_length = value.end_with?("\r\n") ? 2 : 1
value = heredoc_identifier_stack.pop value = heredoc_stack.pop.identifier
location = Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[token.location.end_offset - newline_length]) location = range(token.location.start_offset, token.location.end_offset - newline_length)
elsif token.type == :REGEXP_END elsif token.type == :REGEXP_END
value = value[0] value = value[0]
location = Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[token.location.start_offset + 1]) location = range(token.location.start_offset, token.location.start_offset + 1)
end end
quote_stack.pop
when :tSYMBEG when :tSYMBEG
if (next_token = lexed[index][0]) && next_token.type != :STRING_CONTENT && next_token.type != :EMBEXPR_BEGIN && next_token.type != :EMBVAR && next_token.type != :STRING_END if (next_token = lexed[index][0]) && next_token.type != :STRING_CONTENT && next_token.type != :EMBEXPR_BEGIN && next_token.type != :EMBVAR && next_token.type != :STRING_END
next_location = token.location.join(next_token.location) next_location = token.location.join(next_token.location)
type = :tSYMBOL type = :tSYMBOL
value = next_token.value value = next_token.value
value = { "~@" => "~", "!@" => "!" }.fetch(value, value) value = { "~@" => "~", "!@" => "!" }.fetch(value, value)
location = Range.new(source_buffer, offset_cache[next_location.start_offset], offset_cache[next_location.end_offset]) location = range(next_location.start_offset, next_location.end_offset)
index += 1 index += 1
else
quote_stack.push(value)
end end
when :tFID when :tFID
if !tokens.empty? && tokens.dig(-1, 0) == :kDEF if !tokens.empty? && tokens.dig(-1, 0) == :kDEF
@ -400,12 +477,21 @@ module Prism
if (next_token = lexed[index][0]) && next_token.type != :STRING_CONTENT && next_token.type != :STRING_END if (next_token = lexed[index][0]) && next_token.type != :STRING_CONTENT && next_token.type != :STRING_END
type = :tBACK_REF2 type = :tBACK_REF2
end end
quote_stack.push(value)
when :tSYMBOLS_BEG, :tQSYMBOLS_BEG, :tWORDS_BEG, :tQWORDS_BEG
if (next_token = lexed[index][0]) && next_token.type == :WORDS_SEP
index += 1
end
quote_stack.push(value)
when :tREGEXP_BEG
quote_stack.push(value)
end end
tokens << [type, [value, location]] tokens << [type, [value, location]]
if token.type == :REGEXP_END if token.type == :REGEXP_END
tokens << [:tREGEXP_OPT, [token.value[1..], Range.new(source_buffer, offset_cache[token.location.start_offset + 1], offset_cache[token.location.end_offset])]] tokens << [:tREGEXP_OPT, [token.value[1..], range(token.location.start_offset + 1, token.location.end_offset)]]
end end
end end
@ -414,6 +500,11 @@ module Prism
private private
# Creates a new parser range, taking prisms byte offsets into account
def range(start_offset, end_offset)
Range.new(source_buffer, offset_cache[start_offset], offset_cache[end_offset])
end
# Parse an integer from the string representation. # Parse an integer from the string representation.
def parse_integer(value) def parse_integer(value)
Integer(value) Integer(value)

View File

@ -2,6 +2,8 @@
(..2) (..2)
foo ((1..1))
1...2 1...2
foo[...2] foo[...2]