From 70022224b2627694cbbb30e819a6b1554e9c0a5b Mon Sep 17 00:00:00 2001
From: Earlopain <14981592+Earlopain@users.noreply.github.com>
Date: Sat, 11 Jan 2025 18:48:48 +0100
Subject: [PATCH] [ruby/prism] Better comment token handling for the parser
 translator

There appear to be a bunch of rules, changing behaviour for
inline comments, multiple comments after another, etc.

This seems to line up with reality pretty closely, token differences for RuboCop tests go from 1129 to 619 which seems pretty impressive

https://github.com/ruby/prism/commit/2e1b92670c
---
 lib/prism/translation/parser/lexer.rb | 197 +++++++++++++++++++-------
 test/prism/fixtures/ranges.txt        |   2 +
 2 files changed, 146 insertions(+), 53 deletions(-)

diff --git a/lib/prism/translation/parser/lexer.rb b/lib/prism/translation/parser/lexer.rb
index 270f0cf56d..49fdd2aea8 100644
--- a/lib/prism/translation/parser/lexer.rb
+++ b/lib/prism/translation/parser/lexer.rb
@@ -199,10 +199,18 @@ module Prism
         # The following token types are listed as those classified as `tLPAREN`.
         LPAREN_CONVERSION_TOKEN_TYPES = [
           :kBREAK, :kCASE, :tDIVIDE, :kFOR, :kIF, :kNEXT, :kRETURN, :kUNTIL, :kWHILE, :tAMPER, :tANDOP, :tBANG, :tCOMMA, :tDOT2, :tDOT3,
-          :tEQL, :tLPAREN, :tLPAREN2, :tLSHFT, :tNL, :tOP_ASGN, :tOROP, :tPIPE, :tSEMI, :tSTRING_DBEG, :tUMINUS, :tUPLUS
+          :tEQL, :tLPAREN, :tLPAREN2, :tLPAREN_ARG, :tLSHFT, :tNL, :tOP_ASGN, :tOROP, :tPIPE, :tSEMI, :tSTRING_DBEG, :tUMINUS, :tUPLUS
         ]
 
-        private_constant :TYPES, :EXPR_BEG, :EXPR_LABEL, :LAMBDA_TOKEN_TYPES, :LPAREN_CONVERSION_TOKEN_TYPES
+        # Types of tokens that are allowed to continue a method call with comments in-between.
+        # For these, the parser gem doesn't emit a newline token after the last comment.
+        COMMENT_CONTINUATION_TYPES = [:COMMENT, :AMPERSAND_DOT, :DOT]
+        private_constant :COMMENT_CONTINUATION_TYPES
+
+        # Heredocs are complex and require us to keep track of a bit of info to refer to later
+        HeredocData = Struct.new(:identifier, :common_whitespace, keyword_init: true)
+
+        private_constant :TYPES, :EXPR_BEG, :EXPR_LABEL, :LAMBDA_TOKEN_TYPES, :LPAREN_CONVERSION_TOKEN_TYPES, :HeredocData
 
         # The Parser::Source::Buffer that the tokens were lexed from.
         attr_reader :source_buffer
@@ -232,7 +240,13 @@ module Prism
           index = 0
           length = lexed.length
 
-          heredoc_identifier_stack = []
+          heredoc_stack = []
+          quote_stack = []
+
+          # The parser gem emits the newline tokens for comments out of order. This saves
+          # that token location to emit at a later time to properly line everything up.
+          # https://github.com/whitequark/parser/issues/1025
+          comment_newline_location = nil
 
           while index < length
             token, state = lexed[index]
@@ -241,7 +255,7 @@ module Prism
 
             type = TYPES.fetch(token.type)
             value = token.value
-            location = Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[token.location.end_offset])
+            location = range(token.location.start_offset, token.location.end_offset)
 
             case type
             when :kDO
@@ -257,23 +271,46 @@ module Prism
               value = unescape_string(value, "?")
             when :tCOMMENT
               if token.type == :EMBDOC_BEGIN
-                start_index = index
 
                 while !((next_token = lexed[index][0]) && next_token.type == :EMBDOC_END) && (index < length - 1)
                   value += next_token.value
                   index += 1
                 end
 
-                if start_index != index
-                  value += next_token.value
-                  location = Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[lexed[index][0].location.end_offset])
-                  index += 1
-                end
+                value += next_token.value
+                location = range(token.location.start_offset, lexed[index][0].location.end_offset)
+                index += 1
               else
                 value.chomp!
-                location = Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[token.location.end_offset - 1])
+                location = range(token.location.start_offset, token.location.end_offset - 1)
+
+                prev_token = lexed[index - 2][0]
+                next_token = lexed[index][0]
+
+                is_inline_comment = prev_token.location.start_line == token.location.start_line
+                if is_inline_comment && !COMMENT_CONTINUATION_TYPES.include?(next_token&.type)
+                  tokens << [:tCOMMENT, [value, location]]
+
+                  nl_location = range(token.location.end_offset - 1, token.location.end_offset)
+                  tokens << [:tNL, [nil, nl_location]]
+                  next
+                elsif is_inline_comment && next_token&.type == :COMMENT
+                  comment_newline_location = range(token.location.end_offset - 1, token.location.end_offset)
+                elsif comment_newline_location && !COMMENT_CONTINUATION_TYPES.include?(next_token&.type)
+                  tokens << [:tCOMMENT, [value, location]]
+                  tokens << [:tNL, [nil, comment_newline_location]]
+                  comment_newline_location = nil
+                  next
+                end
               end
             when :tNL
+              next_token = next_token = lexed[index][0]
+              # Newlines after comments are emitted out of order.
+              if next_token&.type == :COMMENT
+                comment_newline_location = location
+                next
+              end
+
               value = nil
             when :tFLOAT
               value = parse_float(value)
@@ -281,8 +318,8 @@ module Prism
               value = parse_complex(value)
             when :tINTEGER
               if value.start_with?("+")
-                tokens << [:tUNARY_NUM, ["+", Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[token.location.start_offset + 1])]]
-                location = Range.new(source_buffer, offset_cache[token.location.start_offset + 1], offset_cache[token.location.end_offset])
+                tokens << [:tUNARY_NUM, ["+", range(token.location.start_offset, token.location.start_offset + 1)]]
+                location = range(token.location.start_offset + 1, token.location.end_offset)
               end
 
               value = parse_integer(value)
@@ -303,47 +340,80 @@ module Prism
             when :tSPACE
               value = nil
             when :tSTRING_BEG
-              if token.type == :HEREDOC_START
-                heredoc_identifier_stack.push(value.match(/<<[-~]?["'`]?(?<heredoc_identifier>.*?)["'`]?\z/)[:heredoc_identifier])
-              end
-              if ["\"", "'"].include?(value) && (next_token = lexed[index][0]) && next_token.type == :STRING_END
+              next_token = lexed[index][0]
+              next_next_token = lexed[index + 1][0]
+              basic_quotes = ["\"", "'"].include?(value)
+
+              if basic_quotes && next_token&.type == :STRING_END
                 next_location = token.location.join(next_token.location)
                 type = :tSTRING
                 value = ""
-                location = Range.new(source_buffer, offset_cache[next_location.start_offset], offset_cache[next_location.end_offset])
+                location = range(next_location.start_offset, next_location.end_offset)
                 index += 1
-              elsif ["\"", "'"].include?(value) && (next_token = lexed[index][0]) && next_token.type == :STRING_CONTENT && next_token.value.lines.count <= 1 && (next_next_token = lexed[index + 1][0]) && next_next_token.type == :STRING_END
-                next_location = token.location.join(next_next_token.location)
-                type = :tSTRING
-                value = next_token.value.gsub("\\\\", "\\")
-                location = Range.new(source_buffer, offset_cache[next_location.start_offset], offset_cache[next_location.end_offset])
-                index += 2
-              elsif value.start_with?("<<")
+              elsif value.start_with?("'", '"', "%")
+                if next_token&.type == :STRING_CONTENT && next_token.value.lines.count <= 1 && next_next_token&.type == :STRING_END
+                  # the parser gem doesn't simplify strings when its value ends in a newline
+                  if !(string_value = next_token.value).end_with?("\n") && basic_quotes
+                    next_location = token.location.join(next_next_token.location)
+                    value = unescape_string(string_value, value)
+                    type = :tSTRING
+                    location = range(next_location.start_offset, next_location.end_offset)
+                    index += 2
+                    tokens << [type, [value, location]]
+
+                    next
+                  end
+                end
+
+                quote_stack.push(value)
+              elsif token.type == :HEREDOC_START
                 quote = value[2] == "-" || value[2] == "~" ? value[3] : value[2]
+                heredoc_type = value[2] == "-" || value[2] == "~" ? value[2] : ""
+                heredoc = HeredocData.new(
+                  identifier: value.match(/<<[-~]?["'`]?(?<heredoc_identifier>.*?)["'`]?\z/)[:heredoc_identifier],
+                  common_whitespace: 0,
+                )
+
                 if quote == "`"
                   type = :tXSTRING_BEG
-                  value = "<<`"
-                else
-                  value = "<<#{quote == "'" || quote == "\"" ? quote : "\""}"
                 end
+
+                # The parser gem trims whitespace from squiggly heredocs. We must record
+                # the most common whitespace to later remove.
+                if heredoc_type == "~" || heredoc_type == "`"
+                  heredoc.common_whitespace = calculate_heredoc_whitespace(index)
+                end
+
+                if quote == "'" || quote == '"' || quote == "`"
+                  value = "<<#{quote}"
+                else
+                  value = '<<"'
+                end
+
+                heredoc_stack.push(heredoc)
+                quote_stack.push(value)
               end
             when :tSTRING_CONTENT
-              unless (lines = token.value.lines).one?
+              if (lines = token.value.lines).one?
+                # Heredoc interpolation can have multiple STRING_CONTENT nodes on the same line.
+                is_first_token_on_line = lexed[index - 1] && token.location.start_line != lexed[index - 2][0].location&.start_line
+                # The parser gem only removes indentation when the heredoc is not nested
+                not_nested = heredoc_stack.size == 1
+                if is_first_token_on_line && not_nested && (current_heredoc = heredoc_stack.last).common_whitespace > 0
+                  value = trim_heredoc_whitespace(value, current_heredoc)
+                end
+
+                value = unescape_string(value, quote_stack.last)
+              else
+                # When the parser gem encounters a line continuation inside of a multiline string,
+                # it emits a single string node. The backslash (and remaining newline) is removed.
+                current_line = +""
+                adjustment = 0
                 start_offset = offset_cache[token.location.start_offset]
-                lines.map do |line|
-                  newline = line.end_with?("\r\n") ? "\r\n" : "\n"
+                emit = false
+
+                lines.each.with_index do |line, index|
                   chomped_line = line.chomp
-<<<<<<< HEAD
-                  if match = chomped_line.match(/(?<backslashes>\\+)\z/)
-                    adjustment = match[:backslashes].size / 2
-                    adjusted_line = chomped_line.delete_suffix("\\" * adjustment)
-                    if match[:backslashes].size.odd?
-                      adjusted_line.delete_suffix!("\\")
-                      adjustment += 2
-                    else
-                      adjusted_line << newline
-                    end
-=======
                   backslash_count = chomped_line[/\\{1,}\z/]&.length || 0
                   is_interpolation = interpolation?(quote_stack.last)
                   is_percent_array = percent_array?(quote_stack.last)
@@ -360,15 +430,18 @@ module Prism
                     end
                     # If the string ends with a line continuation emit the remainder
                     emit = index == lines.count - 1
->>>>>>> b6554ad64e (Fix parser translator tokens for backslashes in single-quoted strings and word arrays)
                   else
-                    adjusted_line = line
-                    adjustment = 0
+                    current_line << line
+                    emit = true
                   end
 
-                  end_offset = start_offset + adjusted_line.bytesize + adjustment
-                  tokens << [:tSTRING_CONTENT, [adjusted_line, Range.new(source_buffer, offset_cache[start_offset], offset_cache[end_offset])]]
-                  start_offset = end_offset
+                  if emit
+                    end_offset = start_offset + current_line.bytesize + adjustment
+                    tokens << [:tSTRING_CONTENT, [unescape_string(current_line, quote_stack.last), range(start_offset, end_offset)]]
+                    start_offset = end_offset
+                    current_line = +""
+                    adjustment = 0
+                  end
                 end
                 next
               end
@@ -377,20 +450,24 @@ module Prism
             when :tSTRING_END
               if token.type == :HEREDOC_END && value.end_with?("\n")
                 newline_length = value.end_with?("\r\n") ? 2 : 1
-                value = heredoc_identifier_stack.pop
-                location = Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[token.location.end_offset - newline_length])
+                value = heredoc_stack.pop.identifier
+                location = range(token.location.start_offset, token.location.end_offset - newline_length)
               elsif token.type == :REGEXP_END
                 value = value[0]
-                location = Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[token.location.start_offset + 1])
+                location = range(token.location.start_offset, token.location.start_offset + 1)
               end
+
+              quote_stack.pop
             when :tSYMBEG
               if (next_token = lexed[index][0]) && next_token.type != :STRING_CONTENT && next_token.type != :EMBEXPR_BEGIN && next_token.type != :EMBVAR && next_token.type != :STRING_END
                 next_location = token.location.join(next_token.location)
                 type = :tSYMBOL
                 value = next_token.value
                 value = { "~@" => "~", "!@" => "!" }.fetch(value, value)
-                location = Range.new(source_buffer, offset_cache[next_location.start_offset], offset_cache[next_location.end_offset])
+                location = range(next_location.start_offset, next_location.end_offset)
                 index += 1
+              else
+                quote_stack.push(value)
               end
             when :tFID
               if !tokens.empty? && tokens.dig(-1, 0) == :kDEF
@@ -400,12 +477,21 @@ module Prism
               if (next_token = lexed[index][0]) && next_token.type != :STRING_CONTENT && next_token.type != :STRING_END
                 type = :tBACK_REF2
               end
+              quote_stack.push(value)
+            when :tSYMBOLS_BEG, :tQSYMBOLS_BEG, :tWORDS_BEG, :tQWORDS_BEG
+              if (next_token = lexed[index][0]) && next_token.type == :WORDS_SEP
+                index += 1
+              end
+
+              quote_stack.push(value)
+            when :tREGEXP_BEG
+              quote_stack.push(value)
             end
 
             tokens << [type, [value, location]]
 
             if token.type == :REGEXP_END
-              tokens << [:tREGEXP_OPT, [token.value[1..], Range.new(source_buffer, offset_cache[token.location.start_offset + 1], offset_cache[token.location.end_offset])]]
+              tokens << [:tREGEXP_OPT, [token.value[1..], range(token.location.start_offset + 1, token.location.end_offset)]]
             end
           end
 
@@ -414,6 +500,11 @@ module Prism
 
         private
 
+        # Creates a new parser range, taking prisms byte offsets into account
+        def range(start_offset, end_offset)
+          Range.new(source_buffer, offset_cache[start_offset], offset_cache[end_offset])
+        end
+
         # Parse an integer from the string representation.
         def parse_integer(value)
           Integer(value)
diff --git a/test/prism/fixtures/ranges.txt b/test/prism/fixtures/ranges.txt
index e2e4136ae9..87eac6d241 100644
--- a/test/prism/fixtures/ranges.txt
+++ b/test/prism/fixtures/ranges.txt
@@ -2,6 +2,8 @@
 
 (..2)
 
+foo ((1..1))
+
 1...2
 
 foo[...2]