[ruby/prism] Better comment token handling for the parser translator

There appear to be a bunch of rules, changing behaviour for
inline comments, multiple comments after another, etc.

This seems to line up with reality pretty closely, token differences for RuboCop tests go from 1129 to 619 which seems pretty impressive

https://github.com/ruby/prism/commit/2e1b92670c
This commit is contained in:
Earlopain 2025-01-11 18:48:48 +01:00 committed by Kevin Newton
parent 117d6e145a
commit 70022224b2
2 changed files with 146 additions and 53 deletions

View File

@ -199,10 +199,18 @@ module Prism
# The following token types are listed as those classified as `tLPAREN`.
LPAREN_CONVERSION_TOKEN_TYPES = [
:kBREAK, :kCASE, :tDIVIDE, :kFOR, :kIF, :kNEXT, :kRETURN, :kUNTIL, :kWHILE, :tAMPER, :tANDOP, :tBANG, :tCOMMA, :tDOT2, :tDOT3,
:tEQL, :tLPAREN, :tLPAREN2, :tLSHFT, :tNL, :tOP_ASGN, :tOROP, :tPIPE, :tSEMI, :tSTRING_DBEG, :tUMINUS, :tUPLUS
:tEQL, :tLPAREN, :tLPAREN2, :tLPAREN_ARG, :tLSHFT, :tNL, :tOP_ASGN, :tOROP, :tPIPE, :tSEMI, :tSTRING_DBEG, :tUMINUS, :tUPLUS
]
private_constant :TYPES, :EXPR_BEG, :EXPR_LABEL, :LAMBDA_TOKEN_TYPES, :LPAREN_CONVERSION_TOKEN_TYPES
# Types of tokens that are allowed to continue a method call with comments in-between.
# For these, the parser gem doesn't emit a newline token after the last comment.
COMMENT_CONTINUATION_TYPES = [:COMMENT, :AMPERSAND_DOT, :DOT]
private_constant :COMMENT_CONTINUATION_TYPES
# Heredocs are complex and require us to keep track of a bit of info to refer to later
HeredocData = Struct.new(:identifier, :common_whitespace, keyword_init: true)
private_constant :TYPES, :EXPR_BEG, :EXPR_LABEL, :LAMBDA_TOKEN_TYPES, :LPAREN_CONVERSION_TOKEN_TYPES, :HeredocData
# The Parser::Source::Buffer that the tokens were lexed from.
attr_reader :source_buffer
@ -232,7 +240,13 @@ module Prism
index = 0
length = lexed.length
heredoc_identifier_stack = []
heredoc_stack = []
quote_stack = []
# The parser gem emits the newline tokens for comments out of order. This saves
# that token location to emit at a later time to properly line everything up.
# https://github.com/whitequark/parser/issues/1025
comment_newline_location = nil
while index < length
token, state = lexed[index]
@ -241,7 +255,7 @@ module Prism
type = TYPES.fetch(token.type)
value = token.value
location = Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[token.location.end_offset])
location = range(token.location.start_offset, token.location.end_offset)
case type
when :kDO
@ -257,23 +271,46 @@ module Prism
value = unescape_string(value, "?")
when :tCOMMENT
if token.type == :EMBDOC_BEGIN
start_index = index
while !((next_token = lexed[index][0]) && next_token.type == :EMBDOC_END) && (index < length - 1)
value += next_token.value
index += 1
end
if start_index != index
value += next_token.value
location = Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[lexed[index][0].location.end_offset])
index += 1
end
value += next_token.value
location = range(token.location.start_offset, lexed[index][0].location.end_offset)
index += 1
else
value.chomp!
location = Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[token.location.end_offset - 1])
location = range(token.location.start_offset, token.location.end_offset - 1)
prev_token = lexed[index - 2][0]
next_token = lexed[index][0]
is_inline_comment = prev_token.location.start_line == token.location.start_line
if is_inline_comment && !COMMENT_CONTINUATION_TYPES.include?(next_token&.type)
tokens << [:tCOMMENT, [value, location]]
nl_location = range(token.location.end_offset - 1, token.location.end_offset)
tokens << [:tNL, [nil, nl_location]]
next
elsif is_inline_comment && next_token&.type == :COMMENT
comment_newline_location = range(token.location.end_offset - 1, token.location.end_offset)
elsif comment_newline_location && !COMMENT_CONTINUATION_TYPES.include?(next_token&.type)
tokens << [:tCOMMENT, [value, location]]
tokens << [:tNL, [nil, comment_newline_location]]
comment_newline_location = nil
next
end
end
when :tNL
next_token = next_token = lexed[index][0]
# Newlines after comments are emitted out of order.
if next_token&.type == :COMMENT
comment_newline_location = location
next
end
value = nil
when :tFLOAT
value = parse_float(value)
@ -281,8 +318,8 @@ module Prism
value = parse_complex(value)
when :tINTEGER
if value.start_with?("+")
tokens << [:tUNARY_NUM, ["+", Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[token.location.start_offset + 1])]]
location = Range.new(source_buffer, offset_cache[token.location.start_offset + 1], offset_cache[token.location.end_offset])
tokens << [:tUNARY_NUM, ["+", range(token.location.start_offset, token.location.start_offset + 1)]]
location = range(token.location.start_offset + 1, token.location.end_offset)
end
value = parse_integer(value)
@ -303,47 +340,80 @@ module Prism
when :tSPACE
value = nil
when :tSTRING_BEG
if token.type == :HEREDOC_START
heredoc_identifier_stack.push(value.match(/<<[-~]?["'`]?(?<heredoc_identifier>.*?)["'`]?\z/)[:heredoc_identifier])
end
if ["\"", "'"].include?(value) && (next_token = lexed[index][0]) && next_token.type == :STRING_END
next_token = lexed[index][0]
next_next_token = lexed[index + 1][0]
basic_quotes = ["\"", "'"].include?(value)
if basic_quotes && next_token&.type == :STRING_END
next_location = token.location.join(next_token.location)
type = :tSTRING
value = ""
location = Range.new(source_buffer, offset_cache[next_location.start_offset], offset_cache[next_location.end_offset])
location = range(next_location.start_offset, next_location.end_offset)
index += 1
elsif ["\"", "'"].include?(value) && (next_token = lexed[index][0]) && next_token.type == :STRING_CONTENT && next_token.value.lines.count <= 1 && (next_next_token = lexed[index + 1][0]) && next_next_token.type == :STRING_END
next_location = token.location.join(next_next_token.location)
type = :tSTRING
value = next_token.value.gsub("\\\\", "\\")
location = Range.new(source_buffer, offset_cache[next_location.start_offset], offset_cache[next_location.end_offset])
index += 2
elsif value.start_with?("<<")
elsif value.start_with?("'", '"', "%")
if next_token&.type == :STRING_CONTENT && next_token.value.lines.count <= 1 && next_next_token&.type == :STRING_END
# the parser gem doesn't simplify strings when its value ends in a newline
if !(string_value = next_token.value).end_with?("\n") && basic_quotes
next_location = token.location.join(next_next_token.location)
value = unescape_string(string_value, value)
type = :tSTRING
location = range(next_location.start_offset, next_location.end_offset)
index += 2
tokens << [type, [value, location]]
next
end
end
quote_stack.push(value)
elsif token.type == :HEREDOC_START
quote = value[2] == "-" || value[2] == "~" ? value[3] : value[2]
heredoc_type = value[2] == "-" || value[2] == "~" ? value[2] : ""
heredoc = HeredocData.new(
identifier: value.match(/<<[-~]?["'`]?(?<heredoc_identifier>.*?)["'`]?\z/)[:heredoc_identifier],
common_whitespace: 0,
)
if quote == "`"
type = :tXSTRING_BEG
value = "<<`"
else
value = "<<#{quote == "'" || quote == "\"" ? quote : "\""}"
end
# The parser gem trims whitespace from squiggly heredocs. We must record
# the most common whitespace to later remove.
if heredoc_type == "~" || heredoc_type == "`"
heredoc.common_whitespace = calculate_heredoc_whitespace(index)
end
if quote == "'" || quote == '"' || quote == "`"
value = "<<#{quote}"
else
value = '<<"'
end
heredoc_stack.push(heredoc)
quote_stack.push(value)
end
when :tSTRING_CONTENT
unless (lines = token.value.lines).one?
if (lines = token.value.lines).one?
# Heredoc interpolation can have multiple STRING_CONTENT nodes on the same line.
is_first_token_on_line = lexed[index - 1] && token.location.start_line != lexed[index - 2][0].location&.start_line
# The parser gem only removes indentation when the heredoc is not nested
not_nested = heredoc_stack.size == 1
if is_first_token_on_line && not_nested && (current_heredoc = heredoc_stack.last).common_whitespace > 0
value = trim_heredoc_whitespace(value, current_heredoc)
end
value = unescape_string(value, quote_stack.last)
else
# When the parser gem encounters a line continuation inside of a multiline string,
# it emits a single string node. The backslash (and remaining newline) is removed.
current_line = +""
adjustment = 0
start_offset = offset_cache[token.location.start_offset]
lines.map do |line|
newline = line.end_with?("\r\n") ? "\r\n" : "\n"
emit = false
lines.each.with_index do |line, index|
chomped_line = line.chomp
<<<<<<< HEAD
if match = chomped_line.match(/(?<backslashes>\\+)\z/)
adjustment = match[:backslashes].size / 2
adjusted_line = chomped_line.delete_suffix("\\" * adjustment)
if match[:backslashes].size.odd?
adjusted_line.delete_suffix!("\\")
adjustment += 2
else
adjusted_line << newline
end
=======
backslash_count = chomped_line[/\\{1,}\z/]&.length || 0
is_interpolation = interpolation?(quote_stack.last)
is_percent_array = percent_array?(quote_stack.last)
@ -360,15 +430,18 @@ module Prism
end
# If the string ends with a line continuation emit the remainder
emit = index == lines.count - 1
>>>>>>> b6554ad64e (Fix parser translator tokens for backslashes in single-quoted strings and word arrays)
else
adjusted_line = line
adjustment = 0
current_line << line
emit = true
end
end_offset = start_offset + adjusted_line.bytesize + adjustment
tokens << [:tSTRING_CONTENT, [adjusted_line, Range.new(source_buffer, offset_cache[start_offset], offset_cache[end_offset])]]
start_offset = end_offset
if emit
end_offset = start_offset + current_line.bytesize + adjustment
tokens << [:tSTRING_CONTENT, [unescape_string(current_line, quote_stack.last), range(start_offset, end_offset)]]
start_offset = end_offset
current_line = +""
adjustment = 0
end
end
next
end
@ -377,20 +450,24 @@ module Prism
when :tSTRING_END
if token.type == :HEREDOC_END && value.end_with?("\n")
newline_length = value.end_with?("\r\n") ? 2 : 1
value = heredoc_identifier_stack.pop
location = Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[token.location.end_offset - newline_length])
value = heredoc_stack.pop.identifier
location = range(token.location.start_offset, token.location.end_offset - newline_length)
elsif token.type == :REGEXP_END
value = value[0]
location = Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[token.location.start_offset + 1])
location = range(token.location.start_offset, token.location.start_offset + 1)
end
quote_stack.pop
when :tSYMBEG
if (next_token = lexed[index][0]) && next_token.type != :STRING_CONTENT && next_token.type != :EMBEXPR_BEGIN && next_token.type != :EMBVAR && next_token.type != :STRING_END
next_location = token.location.join(next_token.location)
type = :tSYMBOL
value = next_token.value
value = { "~@" => "~", "!@" => "!" }.fetch(value, value)
location = Range.new(source_buffer, offset_cache[next_location.start_offset], offset_cache[next_location.end_offset])
location = range(next_location.start_offset, next_location.end_offset)
index += 1
else
quote_stack.push(value)
end
when :tFID
if !tokens.empty? && tokens.dig(-1, 0) == :kDEF
@ -400,12 +477,21 @@ module Prism
if (next_token = lexed[index][0]) && next_token.type != :STRING_CONTENT && next_token.type != :STRING_END
type = :tBACK_REF2
end
quote_stack.push(value)
when :tSYMBOLS_BEG, :tQSYMBOLS_BEG, :tWORDS_BEG, :tQWORDS_BEG
if (next_token = lexed[index][0]) && next_token.type == :WORDS_SEP
index += 1
end
quote_stack.push(value)
when :tREGEXP_BEG
quote_stack.push(value)
end
tokens << [type, [value, location]]
if token.type == :REGEXP_END
tokens << [:tREGEXP_OPT, [token.value[1..], Range.new(source_buffer, offset_cache[token.location.start_offset + 1], offset_cache[token.location.end_offset])]]
tokens << [:tREGEXP_OPT, [token.value[1..], range(token.location.start_offset + 1, token.location.end_offset)]]
end
end
@ -414,6 +500,11 @@ module Prism
private
# Creates a new parser range, taking prisms byte offsets into account
def range(start_offset, end_offset)
Range.new(source_buffer, offset_cache[start_offset], offset_cache[end_offset])
end
# Parse an integer from the string representation.
def parse_integer(value)
Integer(value)

View File

@ -2,6 +2,8 @@
(..2)
foo ((1..1))
1...2
foo[...2]