[ruby/prism] Better comment token handling for the parser translator

There appear to be a bunch of rules that change behaviour for
inline comments, multiple consecutive comments, etc.

This seems to line up with reality pretty closely: token differences for RuboCop tests go from 1129 to 619, which seems pretty impressive.

https://github.com/ruby/prism/commit/2e1b92670c
This commit is contained in:
Earlopain 2025-01-11 18:48:48 +01:00 committed by Kevin Newton
parent 117d6e145a
commit 70022224b2
2 changed files with 146 additions and 53 deletions

View File

@ -199,10 +199,18 @@ module Prism
# The following token types are listed as those classified as `tLPAREN`. # The following token types are listed as those classified as `tLPAREN`.
LPAREN_CONVERSION_TOKEN_TYPES = [ LPAREN_CONVERSION_TOKEN_TYPES = [
:kBREAK, :kCASE, :tDIVIDE, :kFOR, :kIF, :kNEXT, :kRETURN, :kUNTIL, :kWHILE, :tAMPER, :tANDOP, :tBANG, :tCOMMA, :tDOT2, :tDOT3, :kBREAK, :kCASE, :tDIVIDE, :kFOR, :kIF, :kNEXT, :kRETURN, :kUNTIL, :kWHILE, :tAMPER, :tANDOP, :tBANG, :tCOMMA, :tDOT2, :tDOT3,
:tEQL, :tLPAREN, :tLPAREN2, :tLSHFT, :tNL, :tOP_ASGN, :tOROP, :tPIPE, :tSEMI, :tSTRING_DBEG, :tUMINUS, :tUPLUS :tEQL, :tLPAREN, :tLPAREN2, :tLPAREN_ARG, :tLSHFT, :tNL, :tOP_ASGN, :tOROP, :tPIPE, :tSEMI, :tSTRING_DBEG, :tUMINUS, :tUPLUS
] ]
private_constant :TYPES, :EXPR_BEG, :EXPR_LABEL, :LAMBDA_TOKEN_TYPES, :LPAREN_CONVERSION_TOKEN_TYPES # Types of tokens that are allowed to continue a method call with comments in-between.
# For these, the parser gem doesn't emit a newline token after the last comment.
COMMENT_CONTINUATION_TYPES = [:COMMENT, :AMPERSAND_DOT, :DOT]
private_constant :COMMENT_CONTINUATION_TYPES
# Heredocs are complex and require us to keep track of a bit of info to refer to later
HeredocData = Struct.new(:identifier, :common_whitespace, keyword_init: true)
private_constant :TYPES, :EXPR_BEG, :EXPR_LABEL, :LAMBDA_TOKEN_TYPES, :LPAREN_CONVERSION_TOKEN_TYPES, :HeredocData
# The Parser::Source::Buffer that the tokens were lexed from. # The Parser::Source::Buffer that the tokens were lexed from.
attr_reader :source_buffer attr_reader :source_buffer
@ -232,7 +240,13 @@ module Prism
index = 0 index = 0
length = lexed.length length = lexed.length
heredoc_identifier_stack = [] heredoc_stack = []
quote_stack = []
# The parser gem emits the newline tokens for comments out of order. This saves
# that token location to emit at a later time to properly line everything up.
# https://github.com/whitequark/parser/issues/1025
comment_newline_location = nil
while index < length while index < length
token, state = lexed[index] token, state = lexed[index]
@ -241,7 +255,7 @@ module Prism
type = TYPES.fetch(token.type) type = TYPES.fetch(token.type)
value = token.value value = token.value
location = Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[token.location.end_offset]) location = range(token.location.start_offset, token.location.end_offset)
case type case type
when :kDO when :kDO
@ -257,23 +271,46 @@ module Prism
value = unescape_string(value, "?") value = unescape_string(value, "?")
when :tCOMMENT when :tCOMMENT
if token.type == :EMBDOC_BEGIN if token.type == :EMBDOC_BEGIN
start_index = index
while !((next_token = lexed[index][0]) && next_token.type == :EMBDOC_END) && (index < length - 1) while !((next_token = lexed[index][0]) && next_token.type == :EMBDOC_END) && (index < length - 1)
value += next_token.value value += next_token.value
index += 1 index += 1
end end
if start_index != index
value += next_token.value value += next_token.value
location = Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[lexed[index][0].location.end_offset]) location = range(token.location.start_offset, lexed[index][0].location.end_offset)
index += 1 index += 1
end
else else
value.chomp! value.chomp!
location = Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[token.location.end_offset - 1]) location = range(token.location.start_offset, token.location.end_offset - 1)
prev_token = lexed[index - 2][0]
next_token = lexed[index][0]
is_inline_comment = prev_token.location.start_line == token.location.start_line
if is_inline_comment && !COMMENT_CONTINUATION_TYPES.include?(next_token&.type)
tokens << [:tCOMMENT, [value, location]]
nl_location = range(token.location.end_offset - 1, token.location.end_offset)
tokens << [:tNL, [nil, nl_location]]
next
elsif is_inline_comment && next_token&.type == :COMMENT
comment_newline_location = range(token.location.end_offset - 1, token.location.end_offset)
elsif comment_newline_location && !COMMENT_CONTINUATION_TYPES.include?(next_token&.type)
tokens << [:tCOMMENT, [value, location]]
tokens << [:tNL, [nil, comment_newline_location]]
comment_newline_location = nil
next
end
end end
when :tNL when :tNL
next_token = next_token = lexed[index][0]
# Newlines after comments are emitted out of order.
if next_token&.type == :COMMENT
comment_newline_location = location
next
end
value = nil value = nil
when :tFLOAT when :tFLOAT
value = parse_float(value) value = parse_float(value)
@ -281,8 +318,8 @@ module Prism
value = parse_complex(value) value = parse_complex(value)
when :tINTEGER when :tINTEGER
if value.start_with?("+") if value.start_with?("+")
tokens << [:tUNARY_NUM, ["+", Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[token.location.start_offset + 1])]] tokens << [:tUNARY_NUM, ["+", range(token.location.start_offset, token.location.start_offset + 1)]]
location = Range.new(source_buffer, offset_cache[token.location.start_offset + 1], offset_cache[token.location.end_offset]) location = range(token.location.start_offset + 1, token.location.end_offset)
end end
value = parse_integer(value) value = parse_integer(value)
@ -303,47 +340,80 @@ module Prism
when :tSPACE when :tSPACE
value = nil value = nil
when :tSTRING_BEG when :tSTRING_BEG
if token.type == :HEREDOC_START next_token = lexed[index][0]
heredoc_identifier_stack.push(value.match(/<<[-~]?["'`]?(?<heredoc_identifier>.*?)["'`]?\z/)[:heredoc_identifier]) next_next_token = lexed[index + 1][0]
end basic_quotes = ["\"", "'"].include?(value)
if ["\"", "'"].include?(value) && (next_token = lexed[index][0]) && next_token.type == :STRING_END
if basic_quotes && next_token&.type == :STRING_END
next_location = token.location.join(next_token.location) next_location = token.location.join(next_token.location)
type = :tSTRING type = :tSTRING
value = "" value = ""
location = Range.new(source_buffer, offset_cache[next_location.start_offset], offset_cache[next_location.end_offset]) location = range(next_location.start_offset, next_location.end_offset)
index += 1 index += 1
elsif ["\"", "'"].include?(value) && (next_token = lexed[index][0]) && next_token.type == :STRING_CONTENT && next_token.value.lines.count <= 1 && (next_next_token = lexed[index + 1][0]) && next_next_token.type == :STRING_END elsif value.start_with?("'", '"', "%")
if next_token&.type == :STRING_CONTENT && next_token.value.lines.count <= 1 && next_next_token&.type == :STRING_END
# the parser gem doesn't simplify strings when its value ends in a newline
if !(string_value = next_token.value).end_with?("\n") && basic_quotes
next_location = token.location.join(next_next_token.location) next_location = token.location.join(next_next_token.location)
value = unescape_string(string_value, value)
type = :tSTRING type = :tSTRING
value = next_token.value.gsub("\\\\", "\\") location = range(next_location.start_offset, next_location.end_offset)
location = Range.new(source_buffer, offset_cache[next_location.start_offset], offset_cache[next_location.end_offset])
index += 2 index += 2
elsif value.start_with?("<<") tokens << [type, [value, location]]
next
end
end
quote_stack.push(value)
elsif token.type == :HEREDOC_START
quote = value[2] == "-" || value[2] == "~" ? value[3] : value[2] quote = value[2] == "-" || value[2] == "~" ? value[3] : value[2]
heredoc_type = value[2] == "-" || value[2] == "~" ? value[2] : ""
heredoc = HeredocData.new(
identifier: value.match(/<<[-~]?["'`]?(?<heredoc_identifier>.*?)["'`]?\z/)[:heredoc_identifier],
common_whitespace: 0,
)
if quote == "`" if quote == "`"
type = :tXSTRING_BEG type = :tXSTRING_BEG
value = "<<`"
else
value = "<<#{quote == "'" || quote == "\"" ? quote : "\""}"
end end
# The parser gem trims whitespace from squiggly heredocs. We must record
# the most common whitespace to later remove.
if heredoc_type == "~" || heredoc_type == "`"
heredoc.common_whitespace = calculate_heredoc_whitespace(index)
end
if quote == "'" || quote == '"' || quote == "`"
value = "<<#{quote}"
else
value = '<<"'
end
heredoc_stack.push(heredoc)
quote_stack.push(value)
end end
when :tSTRING_CONTENT when :tSTRING_CONTENT
unless (lines = token.value.lines).one? if (lines = token.value.lines).one?
start_offset = offset_cache[token.location.start_offset] # Heredoc interpolation can have multiple STRING_CONTENT nodes on the same line.
lines.map do |line| is_first_token_on_line = lexed[index - 1] && token.location.start_line != lexed[index - 2][0].location&.start_line
newline = line.end_with?("\r\n") ? "\r\n" : "\n" # The parser gem only removes indentation when the heredoc is not nested
chomped_line = line.chomp not_nested = heredoc_stack.size == 1
<<<<<<< HEAD if is_first_token_on_line && not_nested && (current_heredoc = heredoc_stack.last).common_whitespace > 0
if match = chomped_line.match(/(?<backslashes>\\+)\z/) value = trim_heredoc_whitespace(value, current_heredoc)
adjustment = match[:backslashes].size / 2
adjusted_line = chomped_line.delete_suffix("\\" * adjustment)
if match[:backslashes].size.odd?
adjusted_line.delete_suffix!("\\")
adjustment += 2
else
adjusted_line << newline
end end
=======
value = unescape_string(value, quote_stack.last)
else
# When the parser gem encounters a line continuation inside of a multiline string,
# it emits a single string node. The backslash (and remaining newline) is removed.
current_line = +""
adjustment = 0
start_offset = offset_cache[token.location.start_offset]
emit = false
lines.each.with_index do |line, index|
chomped_line = line.chomp
backslash_count = chomped_line[/\\{1,}\z/]&.length || 0 backslash_count = chomped_line[/\\{1,}\z/]&.length || 0
is_interpolation = interpolation?(quote_stack.last) is_interpolation = interpolation?(quote_stack.last)
is_percent_array = percent_array?(quote_stack.last) is_percent_array = percent_array?(quote_stack.last)
@ -360,15 +430,18 @@ module Prism
end end
# If the string ends with a line continuation emit the remainder # If the string ends with a line continuation emit the remainder
emit = index == lines.count - 1 emit = index == lines.count - 1
>>>>>>> b6554ad64e (Fix parser translator tokens for backslashes in single-quoted strings and word arrays)
else else
adjusted_line = line current_line << line
adjustment = 0 emit = true
end end
end_offset = start_offset + adjusted_line.bytesize + adjustment if emit
tokens << [:tSTRING_CONTENT, [adjusted_line, Range.new(source_buffer, offset_cache[start_offset], offset_cache[end_offset])]] end_offset = start_offset + current_line.bytesize + adjustment
tokens << [:tSTRING_CONTENT, [unescape_string(current_line, quote_stack.last), range(start_offset, end_offset)]]
start_offset = end_offset start_offset = end_offset
current_line = +""
adjustment = 0
end
end end
next next
end end
@ -377,20 +450,24 @@ module Prism
when :tSTRING_END when :tSTRING_END
if token.type == :HEREDOC_END && value.end_with?("\n") if token.type == :HEREDOC_END && value.end_with?("\n")
newline_length = value.end_with?("\r\n") ? 2 : 1 newline_length = value.end_with?("\r\n") ? 2 : 1
value = heredoc_identifier_stack.pop value = heredoc_stack.pop.identifier
location = Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[token.location.end_offset - newline_length]) location = range(token.location.start_offset, token.location.end_offset - newline_length)
elsif token.type == :REGEXP_END elsif token.type == :REGEXP_END
value = value[0] value = value[0]
location = Range.new(source_buffer, offset_cache[token.location.start_offset], offset_cache[token.location.start_offset + 1]) location = range(token.location.start_offset, token.location.start_offset + 1)
end end
quote_stack.pop
when :tSYMBEG when :tSYMBEG
if (next_token = lexed[index][0]) && next_token.type != :STRING_CONTENT && next_token.type != :EMBEXPR_BEGIN && next_token.type != :EMBVAR && next_token.type != :STRING_END if (next_token = lexed[index][0]) && next_token.type != :STRING_CONTENT && next_token.type != :EMBEXPR_BEGIN && next_token.type != :EMBVAR && next_token.type != :STRING_END
next_location = token.location.join(next_token.location) next_location = token.location.join(next_token.location)
type = :tSYMBOL type = :tSYMBOL
value = next_token.value value = next_token.value
value = { "~@" => "~", "!@" => "!" }.fetch(value, value) value = { "~@" => "~", "!@" => "!" }.fetch(value, value)
location = Range.new(source_buffer, offset_cache[next_location.start_offset], offset_cache[next_location.end_offset]) location = range(next_location.start_offset, next_location.end_offset)
index += 1 index += 1
else
quote_stack.push(value)
end end
when :tFID when :tFID
if !tokens.empty? && tokens.dig(-1, 0) == :kDEF if !tokens.empty? && tokens.dig(-1, 0) == :kDEF
@ -400,12 +477,21 @@ module Prism
if (next_token = lexed[index][0]) && next_token.type != :STRING_CONTENT && next_token.type != :STRING_END if (next_token = lexed[index][0]) && next_token.type != :STRING_CONTENT && next_token.type != :STRING_END
type = :tBACK_REF2 type = :tBACK_REF2
end end
quote_stack.push(value)
when :tSYMBOLS_BEG, :tQSYMBOLS_BEG, :tWORDS_BEG, :tQWORDS_BEG
if (next_token = lexed[index][0]) && next_token.type == :WORDS_SEP
index += 1
end
quote_stack.push(value)
when :tREGEXP_BEG
quote_stack.push(value)
end end
tokens << [type, [value, location]] tokens << [type, [value, location]]
if token.type == :REGEXP_END if token.type == :REGEXP_END
tokens << [:tREGEXP_OPT, [token.value[1..], Range.new(source_buffer, offset_cache[token.location.start_offset + 1], offset_cache[token.location.end_offset])]] tokens << [:tREGEXP_OPT, [token.value[1..], range(token.location.start_offset + 1, token.location.end_offset)]]
end end
end end
@ -414,6 +500,11 @@ module Prism
private private
# Creates a new parser range, taking prisms byte offsets into account
def range(start_offset, end_offset)
Range.new(source_buffer, offset_cache[start_offset], offset_cache[end_offset])
end
# Parse an integer from the string representation. # Parse an integer from the string representation.
def parse_integer(value) def parse_integer(value)
Integer(value) Integer(value)

View File

@ -2,6 +2,8 @@
(..2) (..2)
foo ((1..1))
1...2 1...2
foo[...2] foo[...2]