[ruby/yarp] Introduce parse_lex instead of asking for a block
https://github.com/ruby/yarp/commit/7e70339fe1
This commit is contained in:
parent
76512d78fc
commit
9b8602dd90
@ -70,7 +70,8 @@ module YARP
|
||||
"yarp.h",
|
||||
"yp_version",
|
||||
"yp_parse_serialize",
|
||||
"yp_lex_serialize"
|
||||
"yp_lex_serialize",
|
||||
"yp_parse_lex_serialize"
|
||||
)
|
||||
|
||||
load_exported_functions_from(
|
||||
@ -225,4 +226,29 @@ module YARP
|
||||
parse(string.read, filepath)
|
||||
end
|
||||
end
|
||||
|
||||
# Mirror the YARP.parse_lex API by using the serialization API.
|
||||
def self.parse_lex(code, filepath = nil)
|
||||
LibRubyParser::YPBuffer.with do |buffer|
|
||||
metadata = [filepath.bytesize, filepath.b, 0].pack("LA*L") if filepath
|
||||
LibRubyParser.yp_parse_lex_serialize(code, code.bytesize, buffer.pointer, metadata)
|
||||
|
||||
source = Source.new(code)
|
||||
loader = Serialize::Loader.new(source, buffer.read)
|
||||
|
||||
tokens = loader.load_tokens
|
||||
node, comments, errors, warnings = loader.load_nodes
|
||||
|
||||
tokens.each { |token,| token.value.force_encoding(loader.encoding) }
|
||||
|
||||
ParseResult.new([node, tokens], comments, errors, warnings, source)
|
||||
end
|
||||
end
|
||||
|
||||
# Mirror the YARP.parse_lex_file API by using the serialization API.
|
||||
def self.parse_lex_file(filepath)
|
||||
LibRubyParser::YPString.with(filepath) do |string|
|
||||
parse_lex(string.read, filepath)
|
||||
end
|
||||
end
|
||||
end
|
||||
|
@ -90,4 +90,9 @@ class EncodingTest < Test::Unit::TestCase
|
||||
assert_equal Encoding.find("utf-8"), actual
|
||||
end
|
||||
end
|
||||
|
||||
def test_first_lexed_token
|
||||
encoding = YARP.lex("# encoding: ascii-8bit").value[0][0].value.encoding
|
||||
assert_equal Encoding.find("ascii-8bit"), encoding
|
||||
end
|
||||
end
|
||||
|
@ -30,6 +30,20 @@ class ParseTest < Test::Unit::TestCase
|
||||
assert_equal filepath, find_source_file_node(result.value).filepath
|
||||
end
|
||||
|
||||
def test_parse_lex
|
||||
node, tokens = YARP.parse_lex("def foo; end").value
|
||||
|
||||
assert_kind_of YARP::ProgramNode, node
|
||||
assert_equal 5, tokens.length
|
||||
end
|
||||
|
||||
def test_parse_lex_file
|
||||
node, tokens = YARP.parse_lex_file(__FILE__).value
|
||||
|
||||
assert_kind_of YARP::ProgramNode, node
|
||||
refute_empty tokens
|
||||
end
|
||||
|
||||
# To accurately compare against Ripper, we need to make sure that we're
|
||||
# running on Ruby 3.2+.
|
||||
check_ripper = RUBY_VERSION >= "3.2.0"
|
||||
|
143
yarp/extension.c
143
yarp/extension.c
@ -198,66 +198,67 @@ typedef struct {
|
||||
VALUE source;
|
||||
VALUE tokens;
|
||||
rb_encoding *encoding;
|
||||
} lex_data_t;
|
||||
} parse_lex_data_t;
|
||||
|
||||
// This is passed as a callback to the parser. It gets called every time a new
|
||||
// token is found. Once found, we initialize a new instance of Token and push it
|
||||
// onto the tokens array.
|
||||
static void
|
||||
lex_token(void *data, yp_parser_t *parser, yp_token_t *token) {
|
||||
lex_data_t *lex_data = (lex_data_t *) parser->lex_callback->data;
|
||||
parse_lex_token(void *data, yp_parser_t *parser, yp_token_t *token) {
|
||||
parse_lex_data_t *parse_lex_data = (parse_lex_data_t *) parser->lex_callback->data;
|
||||
|
||||
VALUE yields = rb_ary_new_capa(2);
|
||||
rb_ary_push(yields, yp_token_new(parser, token, lex_data->encoding, lex_data->source));
|
||||
rb_ary_push(yields, yp_token_new(parser, token, parse_lex_data->encoding, parse_lex_data->source));
|
||||
rb_ary_push(yields, INT2FIX(parser->lex_state));
|
||||
|
||||
rb_ary_push(lex_data->tokens, yields);
|
||||
rb_ary_push(parse_lex_data->tokens, yields);
|
||||
}
|
||||
|
||||
// This is called whenever the encoding changes based on the magic comment at
|
||||
// the top of the file. We use it to update the encoding that we are using to
|
||||
// create tokens.
|
||||
static void
|
||||
lex_encoding_changed_callback(yp_parser_t *parser) {
|
||||
lex_data_t *lex_data = (lex_data_t *) parser->lex_callback->data;
|
||||
lex_data->encoding = rb_enc_find(parser->encoding.name);
|
||||
parse_lex_encoding_changed_callback(yp_parser_t *parser) {
|
||||
parse_lex_data_t *parse_lex_data = (parse_lex_data_t *) parser->lex_callback->data;
|
||||
parse_lex_data->encoding = rb_enc_find(parser->encoding.name);
|
||||
|
||||
// Since we got a new encoding, we need to go back and change the encoding
|
||||
// of the tokens that we've already lexed. This should be a tiny amount
|
||||
// since encoding magic comments need to be the first or second line of the
|
||||
// Since the encoding changed, we need to go back and change the encoding of
|
||||
// the tokens that were already lexed. This is only going to end up being
|
||||
// one or two tokens, since the encoding can only change at the top of the
|
||||
// file.
|
||||
VALUE tokens = lex_data->tokens;
|
||||
VALUE tokens = parse_lex_data->tokens;
|
||||
for (long index = 0; index < RARRAY_LEN(tokens); index++) {
|
||||
VALUE yields = rb_ary_entry(tokens, index);
|
||||
VALUE token = rb_ary_entry(yields, 0);
|
||||
|
||||
VALUE value = rb_ivar_get(token, rb_intern("@value"));
|
||||
rb_enc_associate(value, lex_data->encoding);
|
||||
rb_enc_associate(value, parse_lex_data->encoding);
|
||||
ENC_CODERANGE_CLEAR(value);
|
||||
}
|
||||
}
|
||||
|
||||
// Return an array of tokens corresponding to the given source.
|
||||
// Parse the given input and return a ParseResult containing just the tokens or
|
||||
// the nodes and tokens.
|
||||
static VALUE
|
||||
lex_input(yp_string_t *input, const char *filepath) {
|
||||
parse_lex_input(yp_string_t *input, const char *filepath, bool return_nodes) {
|
||||
yp_parser_t parser;
|
||||
yp_parser_init(&parser, yp_string_source(input), yp_string_length(input), filepath);
|
||||
yp_parser_register_encoding_changed_callback(&parser, lex_encoding_changed_callback);
|
||||
yp_parser_register_encoding_changed_callback(&parser, parse_lex_encoding_changed_callback);
|
||||
|
||||
VALUE offsets = rb_ary_new();
|
||||
VALUE source_argv[] = { rb_str_new(yp_string_source(input), yp_string_length(input)), offsets };
|
||||
VALUE source = rb_class_new_instance(2, source_argv, rb_cYARPSource);
|
||||
|
||||
lex_data_t lex_data = {
|
||||
parse_lex_data_t parse_lex_data = {
|
||||
.source = source,
|
||||
.tokens = rb_ary_new(),
|
||||
.encoding = rb_utf8_encoding()
|
||||
};
|
||||
|
||||
lex_data_t *data = &lex_data;
|
||||
parse_lex_data_t *data = &parse_lex_data;
|
||||
yp_lex_callback_t lex_callback = (yp_lex_callback_t) {
|
||||
.data = (void *) data,
|
||||
.callback = lex_token,
|
||||
.callback = parse_lex_token,
|
||||
};
|
||||
|
||||
parser.lex_callback = &lex_callback;
|
||||
@ -270,20 +271,26 @@ lex_input(yp_string_t *input, const char *filepath) {
|
||||
rb_ary_push(offsets, INT2FIX(parser.newline_list.offsets[index]));
|
||||
}
|
||||
|
||||
VALUE value;
|
||||
if (return_nodes) {
|
||||
value = rb_ary_new_capa(2);
|
||||
rb_ary_push(value, yp_ast_new(&parser, node, parse_lex_data.encoding));
|
||||
rb_ary_push(value, parse_lex_data.tokens);
|
||||
} else {
|
||||
value = parse_lex_data.tokens;
|
||||
}
|
||||
|
||||
VALUE result_argv[] = {
|
||||
lex_data.tokens,
|
||||
value,
|
||||
parser_comments(&parser, source),
|
||||
parser_errors(&parser, lex_data.encoding, source),
|
||||
parser_warnings(&parser, lex_data.encoding, source),
|
||||
parser_errors(&parser, parse_lex_data.encoding, source),
|
||||
parser_warnings(&parser, parse_lex_data.encoding, source),
|
||||
source
|
||||
};
|
||||
|
||||
VALUE result = rb_class_new_instance(5, result_argv, rb_cYARPParseResult);
|
||||
|
||||
yp_node_destroy(&parser, node);
|
||||
yp_parser_free(&parser);
|
||||
|
||||
return result;
|
||||
return rb_class_new_instance(5, result_argv, rb_cYARPParseResult);
|
||||
}
|
||||
|
||||
// Return an array of tokens corresponding to the given string.
|
||||
@ -295,7 +302,8 @@ lex(int argc, VALUE *argv, VALUE self) {
|
||||
|
||||
yp_string_t input;
|
||||
input_load_string(&input, string);
|
||||
return lex_input(&input, check_string(filepath));
|
||||
|
||||
return parse_lex_input(&input, check_string(filepath), false);
|
||||
}
|
||||
|
||||
// Return an array of tokens corresponding to the given file.
|
||||
@ -306,7 +314,7 @@ lex_file(VALUE self, VALUE filepath) {
|
||||
const char *checked = check_string(filepath);
|
||||
if (!yp_string_mapped_init(&input, checked)) return Qnil;
|
||||
|
||||
VALUE value = lex_input(&input, checked);
|
||||
VALUE value = parse_lex_input(&input, checked, false);
|
||||
yp_string_free(&input);
|
||||
|
||||
return value;
|
||||
@ -316,65 +324,16 @@ lex_file(VALUE self, VALUE filepath) {
|
||||
/* Parsing Ruby code */
|
||||
/******************************************************************************/
|
||||
|
||||
// This is passed as a callback to the parser. It gets called every time a new
|
||||
// token is found from within a call to parse that accepted a block.
|
||||
static void
|
||||
parse_token(void *data, yp_parser_t *parser, yp_token_t *token) {
|
||||
lex_data_t *lex_data = (lex_data_t *) parser->lex_callback->data;
|
||||
rb_yield_values(2, yp_token_new(parser, token, lex_data->encoding, lex_data->source), INT2FIX(parser->lex_state));
|
||||
}
|
||||
|
||||
// Parse the given input and return a ParseResult instance.
|
||||
static VALUE
|
||||
parse_input(yp_string_t *input, const char *filepath) {
|
||||
yp_parser_t parser;
|
||||
yp_parser_init(&parser, yp_string_source(input), yp_string_length(input), filepath);
|
||||
|
||||
VALUE offsets;
|
||||
VALUE source;
|
||||
|
||||
// If a block was given to the parse method, then we're going to register a
|
||||
// lex callback that will yield the tokens to the block. This means you can
|
||||
// get the lexer and the parser output in one method call instead of having
|
||||
// to parse twice.
|
||||
if (rb_block_given_p()) {
|
||||
offsets = rb_ary_new();
|
||||
|
||||
VALUE source_argv[] = { rb_str_new(yp_string_source(input), yp_string_length(input)), offsets };
|
||||
source = rb_class_new_instance(2, source_argv, rb_cYARPSource);
|
||||
|
||||
lex_data_t lex_data = {
|
||||
.source = source,
|
||||
.tokens = Qnil,
|
||||
.encoding = rb_utf8_encoding()
|
||||
};
|
||||
|
||||
lex_data_t *data = &lex_data;
|
||||
yp_lex_callback_t lex_callback = (yp_lex_callback_t) {
|
||||
.data = (void *) data,
|
||||
.callback = parse_token,
|
||||
};
|
||||
|
||||
parser.lex_callback = &lex_callback;
|
||||
yp_parser_register_encoding_changed_callback(&parser, lex_encoding_changed_callback);
|
||||
}
|
||||
|
||||
yp_node_t *node = yp_parse(&parser);
|
||||
rb_encoding *encoding = rb_enc_find(parser.encoding.name);
|
||||
|
||||
if (rb_block_given_p()) {
|
||||
// Here we need to update the source range to have the correct newline
|
||||
// offsets. We do it here because we've already created the object and
|
||||
// given it over to all of the tokens.
|
||||
for (size_t index = 0; index < parser.newline_list.size; index++) {
|
||||
rb_ary_push(offsets, INT2FIX(parser.newline_list.offsets[index]));
|
||||
}
|
||||
} else {
|
||||
// Since a block was not given, we can just create the source now the
|
||||
// regular way.
|
||||
source = yp_source_new(&parser);
|
||||
}
|
||||
|
||||
VALUE source = yp_source_new(&parser);
|
||||
VALUE result_argv[] = {
|
||||
yp_ast_new(&parser, node, encoding),
|
||||
parser_comments(&parser, source),
|
||||
@ -431,6 +390,32 @@ parse_file(VALUE self, VALUE filepath) {
|
||||
return value;
|
||||
}
|
||||
|
||||
// Parse the given string and return a ParseResult instance.
|
||||
static VALUE
|
||||
parse_lex(int argc, VALUE *argv, VALUE self) {
|
||||
VALUE string;
|
||||
VALUE filepath;
|
||||
rb_scan_args(argc, argv, "11", &string, &filepath);
|
||||
|
||||
yp_string_t input;
|
||||
input_load_string(&input, string);
|
||||
return parse_lex_input(&input, check_string(filepath), true);
|
||||
}
|
||||
|
||||
// Parse and lex the given file and return a ParseResult instance.
|
||||
static VALUE
|
||||
parse_lex_file(VALUE self, VALUE filepath) {
|
||||
yp_string_t input;
|
||||
|
||||
const char *checked = check_string(filepath);
|
||||
if (!yp_string_mapped_init(&input, checked)) return Qnil;
|
||||
|
||||
VALUE value = parse_lex_input(&input, checked, true);
|
||||
yp_string_free(&input);
|
||||
|
||||
return value;
|
||||
}
|
||||
|
||||
/******************************************************************************/
|
||||
/* Utility functions exposed to make testing easier */
|
||||
/******************************************************************************/
|
||||
@ -590,6 +575,8 @@ Init_yarp(void) {
|
||||
rb_define_singleton_method(rb_cYARP, "lex_file", lex_file, 1);
|
||||
rb_define_singleton_method(rb_cYARP, "parse", parse, -1);
|
||||
rb_define_singleton_method(rb_cYARP, "parse_file", parse_file, 1);
|
||||
rb_define_singleton_method(rb_cYARP, "parse_lex", parse_lex, -1);
|
||||
rb_define_singleton_method(rb_cYARP, "parse_lex_file", parse_lex_file, 1);
|
||||
|
||||
// Next, the functions that will be called by the parser to perform various
|
||||
// internal tasks. We expose these to make them easier to test.
|
||||
|
@ -14,11 +14,11 @@ end
|
||||
module YARP
|
||||
module Serialize
|
||||
def self.load(input, serialized)
|
||||
Loader.new(Source.new(input), serialized).load
|
||||
Loader.new(Source.new(input), serialized).load_result
|
||||
end
|
||||
|
||||
def self.load_tokens(source, serialized)
|
||||
Loader.new(source, serialized).load_tokens
|
||||
Loader.new(source, serialized).load_tokens_result
|
||||
end
|
||||
|
||||
class Loader
|
||||
@ -39,6 +39,17 @@ module YARP
|
||||
@source = source
|
||||
end
|
||||
|
||||
def load_encoding
|
||||
Encoding.find(io.read(load_varint))
|
||||
end
|
||||
|
||||
def load_metadata
|
||||
comments = load_varint.times.map { Comment.new(Comment::TYPES.fetch(load_varint), load_location) }
|
||||
errors = load_varint.times.map { ParseError.new(load_embedded_string, load_location) }
|
||||
warnings = load_varint.times.map { ParseWarning.new(load_embedded_string, load_location) }
|
||||
[comments, errors, warnings]
|
||||
end
|
||||
|
||||
def load_tokens
|
||||
tokens = []
|
||||
while type = TOKEN_TYPES.fetch(load_varint)
|
||||
@ -49,34 +60,42 @@ module YARP
|
||||
tokens << [YARP::Token.new(type, location.slice, location), lex_state]
|
||||
end
|
||||
|
||||
comments = load_varint.times.map { Comment.new(Comment::TYPES.fetch(load_varint), load_location) }
|
||||
errors = load_varint.times.map { ParseError.new(load_embedded_string, load_location) }
|
||||
warnings = load_varint.times.map { ParseWarning.new(load_embedded_string, load_location) }
|
||||
tokens
|
||||
end
|
||||
|
||||
def load_tokens_result
|
||||
tokens = load_tokens
|
||||
encoding = load_encoding
|
||||
comments, errors, warnings = load_metadata
|
||||
|
||||
if encoding != @encoding
|
||||
tokens.each { |token,| token.value.force_encoding(encoding) }
|
||||
end
|
||||
|
||||
raise "Expected to consume all bytes while deserializing" unless @io.eof?
|
||||
|
||||
YARP::ParseResult.new(tokens, comments, errors, warnings, @source)
|
||||
end
|
||||
|
||||
def load
|
||||
def load_nodes
|
||||
raise "Invalid serialization" if io.read(4) != "YARP"
|
||||
if io.read(3).unpack("C3") != [<%= YARP_VERSION_MAJOR %>, <%= YARP_VERSION_MINOR %>, <%= YARP_VERSION_PATCH %>]
|
||||
raise "Invalid serialization version"
|
||||
end
|
||||
|
||||
@encoding = Encoding.find(io.read(load_varint))
|
||||
@encoding = load_encoding
|
||||
@input = input.force_encoding(@encoding).freeze
|
||||
|
||||
comments = load_varint.times.map { Comment.new(Comment::TYPES.fetch(io.getbyte), load_location) }
|
||||
errors = load_varint.times.map { ParseError.new(load_embedded_string, load_location) }
|
||||
warnings = load_varint.times.map { ParseWarning.new(load_embedded_string, load_location) }
|
||||
comments, errors, warnings = load_metadata
|
||||
|
||||
@constant_pool_offset = io.read(4).unpack1("L")
|
||||
@constant_pool = Array.new(load_varint, nil)
|
||||
|
||||
ast = load_node
|
||||
[load_node, comments, errors, warnings]
|
||||
end
|
||||
|
||||
YARP::ParseResult.new(ast, comments, errors, warnings, @source)
|
||||
def load_result
|
||||
node, comments, errors, warnings = load_nodes
|
||||
YARP::ParseResult.new(node, comments, errors, warnings, @source)
|
||||
end
|
||||
|
||||
private
|
||||
|
@ -170,14 +170,17 @@ yp_serialize_diagnostic_list(yp_parser_t *parser, yp_list_t *list, yp_buffer_t *
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
yp_serialize_encoding(yp_encoding_t *encoding, yp_buffer_t *buffer) {
|
||||
size_t encoding_length = strlen(encoding->name);
|
||||
yp_buffer_append_u32(buffer, yp_sizet_to_u32(encoding_length));
|
||||
yp_buffer_append_str(buffer, encoding->name, encoding_length);
|
||||
}
|
||||
|
||||
#line <%= __LINE__ + 1 %> "<%= File.basename(__FILE__) %>"
|
||||
void
|
||||
yp_serialize_content(yp_parser_t *parser, yp_node_t *node, yp_buffer_t *buffer) {
|
||||
// First, serialize the encoding of the parser.
|
||||
size_t encoding_length = strlen(parser->encoding.name);
|
||||
yp_buffer_append_u32(buffer, yp_sizet_to_u32(encoding_length));
|
||||
yp_buffer_append_str(buffer, parser->encoding.name, encoding_length);
|
||||
|
||||
yp_serialize_encoding(&parser->encoding, buffer);
|
||||
yp_serialize_comment_list(parser, &parser->comment_list, buffer);
|
||||
yp_serialize_diagnostic_list(parser, &parser->error_list, buffer);
|
||||
yp_serialize_diagnostic_list(parser, &parser->warning_list, buffer);
|
||||
@ -246,6 +249,7 @@ yp_lex_serialize(const char *source, size_t size, const char *filepath, yp_buffe
|
||||
// Append 0 to mark end of tokens
|
||||
yp_buffer_append_u8(buffer, 0);
|
||||
|
||||
yp_serialize_encoding(&parser.encoding, buffer);
|
||||
yp_serialize_comment_list(&parser, &parser.comment_list, buffer);
|
||||
yp_serialize_diagnostic_list(&parser, &parser.error_list, buffer);
|
||||
yp_serialize_diagnostic_list(&parser, &parser.warning_list, buffer);
|
||||
@ -253,3 +257,26 @@ yp_lex_serialize(const char *source, size_t size, const char *filepath, yp_buffe
|
||||
yp_node_destroy(&parser, node);
|
||||
yp_parser_free(&parser);
|
||||
}
|
||||
|
||||
// Parse and serialize both the AST and the tokens represented by the given
|
||||
// source to the given buffer.
|
||||
YP_EXPORTED_FUNCTION void
|
||||
yp_parse_lex_serialize(const char *source, size_t size, yp_buffer_t *buffer, const char *metadata) {
|
||||
yp_parser_t parser;
|
||||
yp_parser_init(&parser, source, size, NULL);
|
||||
if (metadata) yp_parser_metadata(&parser, metadata);
|
||||
|
||||
yp_lex_callback_t lex_callback = (yp_lex_callback_t) {
|
||||
.data = (void *) buffer,
|
||||
.callback = serialize_token,
|
||||
};
|
||||
|
||||
parser.lex_callback = &lex_callback;
|
||||
yp_node_t *node = yp_parse(&parser);
|
||||
|
||||
yp_buffer_append_u8(buffer, 0);
|
||||
yp_serialize(&parser, node, buffer);
|
||||
|
||||
yp_node_destroy(&parser, node);
|
||||
yp_parser_free(&parser);
|
||||
}
|
||||
|
@ -13482,7 +13482,7 @@ yp_metadata_read_u32(const char *ptr) {
|
||||
// ]*
|
||||
// ]
|
||||
// ```
|
||||
static void
|
||||
void
|
||||
yp_parser_metadata(yp_parser_t *parser, const char *metadata) {
|
||||
uint32_t filepath_size = yp_metadata_read_u32(metadata);
|
||||
metadata += 4;
|
||||
|
@ -31,6 +31,8 @@ void yp_serialize_content(yp_parser_t *parser, yp_node_t *node, yp_buffer_t *buf
|
||||
|
||||
void yp_print_node(yp_parser_t *parser, yp_node_t *node);
|
||||
|
||||
void yp_parser_metadata(yp_parser_t *parser, const char *metadata);
|
||||
|
||||
// Generate a scope node from the given node.
|
||||
void yp_scope_node_init(yp_node_t *node, yp_scope_node_t *dest);
|
||||
|
||||
@ -69,6 +71,10 @@ YP_EXPORTED_FUNCTION void yp_parse_serialize(const char *source, size_t size, yp
|
||||
// Lex the given source and serialize to the given buffer.
|
||||
YP_EXPORTED_FUNCTION void yp_lex_serialize(const char *source, size_t size, const char *filepath, yp_buffer_t *buffer);
|
||||
|
||||
// Parse and serialize both the AST and the tokens represented by the given
|
||||
// source to the given buffer.
|
||||
YP_EXPORTED_FUNCTION void yp_parse_lex_serialize(const char *source, size_t size, yp_buffer_t *buffer, const char *metadata);
|
||||
|
||||
// Returns a string representation of the given token type.
|
||||
YP_EXPORTED_FUNCTION const char * yp_token_type_to_str(yp_token_type_t token_type);
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user