diff --git a/lib/prism/prism.gemspec b/lib/prism/prism.gemspec index 5d8112b5ba..902a1f235f 100644 --- a/lib/prism/prism.gemspec +++ b/lib/prism/prism.gemspec @@ -93,6 +93,7 @@ Gem::Specification.new do |spec| "lib/prism/translation/parser/lexer.rb", "lib/prism/translation/parser/rubocop.rb", "lib/prism/translation/ripper.rb", + "lib/prism/translation/ripper/ripper_compiler.rb", "lib/prism/translation/ruby_parser.rb", "lib/prism/visitor.rb", "src/diagnostic.c", diff --git a/lib/prism/translation/ripper/ripper_compiler.rb b/lib/prism/translation/ripper/ripper_compiler.rb index 304e99c54c..6690717a28 100644 --- a/lib/prism/translation/ripper/ripper_compiler.rb +++ b/lib/prism/translation/ripper/ripper_compiler.rb @@ -20,6 +20,37 @@ module Prism @result = nil @lineno = nil @column = nil + + @offset_cache = build_offset_cache(source) + @void_stmt_val = on_stmts_add(on_stmts_new, on_void_stmt) + end + + # Excerpt the chunk of the source between two byte offsets (inclusive). + def source_range(start_c, end_c) + @source[@offset_cache[start_c]..@offset_cache[end_c]] + end + + # Prism deals with offsets in bytes, while Ripper deals with + # offsets in characters. We need to handle this conversion in order to + # excerpt the source string by character (see source_range). + # + # If the bytesize of the source is the same as the length, then we can + # just use the offset directly (an identity lambda, indexed with [] like the array). Otherwise, we build an array where the + # index is the byte offset and the value is the character offset. + def build_offset_cache(source) + if source.bytesize == source.length + -> (offset) { offset } + else + offset_cache = [] + offset = 0 + + source.each_char do |char| + char.bytesize.times { offset_cache << offset } + offset += 1 + end + + offset_cache << offset + end end ############################################################################ @@ -108,9 +139,29 @@ module Prism def visit_block_node(node) params_val = node.parameters.nil? ? nil : visit(node.parameters) - body_val = node.body.nil? ? 
on_stmts_add(on_stmts_new, on_void_stmt) : visit(node.body) + # If the body is empty, we use a void statement. If there is + # a semicolon after the opening delimiter, we put a void + # statement at the front of the body (by overwriting v[1]), unless the body is also empty. So we should never + # get a double void statement. NOTE(review): v[1] is presumably [:stmts_new] only for a single-statement body - confirm multi-statement bodies. - on_brace_block(params_val, body_val) + body_val = if node.body.nil? + @void_stmt_val + elsif node_has_semicolon?(node) + v = visit(node.body) + raise(NotImplementedError, "Unexpected statement structure #{v.inspect}") if v[0] != :stmts_add + v[1] = @void_stmt_val + v + else + visit(node.body) + end + + if node.opening == "{" + on_brace_block(params_val, body_val) + elsif node.opening == "do" + on_do_block(params_val, on_bodystmt(body_val, nil, nil, nil)) + else + raise NotImplementedError, "Unexpected Block opening character!" + end end # Visit a BlockParametersNode. @@ -218,7 +269,7 @@ module Prism def visit_parentheses_node(node) body = if node.body.nil? - on_stmts_add(on_stmts_new, on_void_stmt) + @void_stmt_val else visit(node.body) end @@ -228,16 +279,80 @@ module Prism end # Visit a BeginNode node. - # This is not at all bulletproof against different structures of begin/rescue/else/ensure/end. def visit_begin_node(node) - rescue_val = node.rescue_clause ? on_rescue(nil, nil, visit(node.rescue_clause), nil) : nil - ensure_val = node.ensure_clause ? on_ensure(visit(node.ensure_clause.statements)) : nil - on_begin(on_bodystmt(visit(node.statements), rescue_val, nil, ensure_val)) + rescue_val = node.rescue_clause ? visit(node.rescue_clause) : nil + ensure_val = node.ensure_clause ? visit(node.ensure_clause) : nil + + if node.statements + stmts_val = visit(node.statements) + if node_has_semicolon?(node) + # If there's a semicolon, we need to replace [:stmts_new] with + # [:stmts_add, [:stmts_new], [:void_stmt]]. NOTE(review): assumes stmts_val[1] is [:stmts_new] - confirm for multi-statement bodies. 
+ stmts_val[1] = @void_stmt_val + end + else + stmts_val = @void_stmt_val + end + + on_begin(on_bodystmt(stmts_val, rescue_val, nil, ensure_val)) + end + + # Visit an EnsureNode node. + def visit_ensure_node(node) + if node.statements + # If there are any statements, we need to see if there's a semicolon + # between the ensure keyword and the start of the first statement. + + stmts_val = visit(node.statements) + if node_has_semicolon?(node) + # If there's a semicolon, we need to replace [:stmts_new] with + # [:stmts_add, [:stmts_new], [:void_stmt]]. NOTE(review): assumes stmts_val[1] is [:stmts_new] - confirm for multi-statement bodies. + stmts_val[1] = @void_stmt_val + end + else + stmts_val = @void_stmt_val + end + on_ensure(stmts_val) end # Visit a RescueNode node. def visit_rescue_node(node) - visit(node.statements) + consequent_val = nil + if node.consequent + consequent_val = visit(node.consequent) + end + + if node.statements + stmts_val = visit(node.statements) + else + stmts_val = @void_stmt_val + end + + if node.reference + raise NotImplementedError unless node.reference.is_a?(LocalVariableTargetNode) + bounds(node.reference.location) + ref_val = on_var_field(on_ident(node.reference.name.to_s)) + else + ref_val = nil + end + + # No exception class(es) listed on this rescue clause + if !node.exceptions || node.exceptions.empty? + return on_rescue(nil, ref_val, stmts_val, consequent_val) + end + + exc_vals = node.exceptions.map { |exc| visit(exc) } + + if node.exceptions.length == 1 + return on_rescue(exc_vals, ref_val, stmts_val, consequent_val) + end + + inner_vals = exc_vals[0..-2].inject(on_args_new) do |output, exc_val| + on_args_add(output, exc_val) + end + exc_vals = on_mrhs_add(on_mrhs_new_from_args(inner_vals), exc_vals[-1]) + + on_rescue(exc_vals, ref_val, stmts_val, consequent_val) end # Visit a ProgramNode node. @@ -284,6 +399,20 @@ module Prism on_string_literal(visit_enumerated_node(node)) end + # Visit a ConstantReadNode node. + def visit_constant_read_node(node) + bounds(node.location) + on_var_ref(on_const(node.name.to_s)) + end + + # Visit a ConstantWriteNode node. 
+ def visit_constant_write_node(node) + bounds(node.location) + const_val = on_var_field(on_const(node.name.to_s)) + + on_assign(const_val, visit(node.value)) + end + # Visit an EmbeddedStatementsNode node. def visit_embedded_statements_node(node) visit(node.statements) @@ -558,6 +687,43 @@ module Prism on_binary(left_val, node.operator.to_sym, right_val) end + # Some nodes, such as `begin`, `ensure` and blocks (`do` or `{`), may have a semicolon + # after the keyword or opening delimiter and before the first statement. This affects + # Ripper's return values. + def node_has_semicolon?(node) + first_field, second_field = case node + when BeginNode + [:begin_keyword_loc, :statements] + when EnsureNode + [:ensure_keyword_loc, :statements] + when BlockNode + [:opening_loc, :body] + else + raise NotImplementedError + end + first_offs, second_offs = delimiter_offsets_for(node, first_field, second_field) + + # We need to know if there's a semicolon after the keyword, but before + # the start of the first statement of the node's body. + range_has_string?(first_offs, second_offs, ";") + end + + # For a given node, grab the offsets for the end of the first field + # and the offset just before the first statement of the second field. + def delimiter_offsets_for(node, first, second) + first_field = node.send(first) + first_end_loc = first_field.start_offset + first_field.length + second_begin_loc = node.send(second).body[0].location.start_offset - 1 + [first_end_loc, second_begin_loc] + end + + # Check whether the source code contains the given substring between the + # specified byte offsets (inclusive). + def range_has_string?(first, last, token) + sr = source_range(first, last) + sr.include?(token) + end + # This method is responsible for updating lineno and column information # to reflect the current node. 
# diff --git a/test/prism/ripper_test.rb b/test/prism/ripper_test.rb index 8a9af18a13..9ff052ff04 100644 --- a/test/prism/ripper_test.rb +++ b/test/prism/ripper_test.rb @@ -104,6 +104,15 @@ module Prism assert_equivalent("foo(bar 1)") assert_equivalent("foo bar 1") assert_equivalent("foo(bar 1) { 7 }") + assert_equivalent("foo(bar 1) {; 7 }") + assert_equivalent("foo(bar 1) {;}") + + assert_equivalent("foo do\n bar\nend") + assert_equivalent("foo do\nend") + assert_equivalent("foo do; end") + assert_equivalent("foo do bar; end") + assert_equivalent("foo do bar end") + assert_equivalent("foo do; bar; end") end def test_method_calls_on_immediate_values @@ -137,8 +146,45 @@ module Prism assert_equivalent("[1ri, -1ri, +1ri, 1.5ri, -1.5ri, +1.5ri]") end + def test_begin_end + # Empty begin + assert_equivalent("begin; end") + assert_equivalent("begin end") + assert_equivalent("begin; rescue; end") + + assert_equivalent("begin:s.l end") + end + def test_begin_rescue + # Rescue with exception(s) + assert_equivalent("begin a; rescue Exception => ex; c; end") + assert_equivalent("begin a; rescue RuntimeError => ex; c; rescue Exception => ex; d; end") + assert_equivalent("begin a; rescue RuntimeError => ex; c; rescue Exception => ex; end") + assert_equivalent("begin a; rescue RuntimeError,FakeError,Exception => ex; c; end") + assert_equivalent("begin a; rescue RuntimeError,FakeError,Exception; c; end") + + # Empty rescue + assert_equivalent("begin a; rescue; ensure b; end") + assert_equivalent("begin a; rescue; end") + + assert_equivalent("begin; a; ensure; b; end") + end + + def test_begin_ensure + # Empty ensure + assert_equivalent("begin a; rescue; c; ensure; end") + assert_equivalent("begin a; ensure; end") + assert_equivalent("begin; ensure; end") + + # Ripper treats statements differently, depending on whether there's + # a semicolon after the keyword. 
assert_equivalent("begin a; rescue; c; ensure b; end") + assert_equivalent("begin a; rescue c; ensure b; end") + assert_equivalent("begin a; rescue; c; ensure; b; end") + + # Need to make sure we're handling multibyte characters correctly for source offsets + assert_equivalent("begin 🗻; rescue; c; ensure;🗻🗻🗻🗻🗻; end") + assert_equivalent("begin 🗻; rescue; c; ensure 🗻🗻🗻🗻🗻; end") end def test_break @@ -147,6 +193,12 @@ module Prism assert_equivalent("foo { break [1, 2, 3] }") end + def test_constants + assert_equivalent("Foo") + assert_equivalent("Foo + F🗻") + assert_equivalent("Foo = 'soda'") + end + def test_op_assign assert_equivalent("a += b") assert_equivalent("a -= b")