[ruby/yarp] Fix lex compat with BOM
* BOM should not impact looking for the encoding string
* We should re-encode tokens when the encoding changes
* BOM should change the column of comments only

https://github.com/ruby/yarp/commit/119fc2d7b2
This commit is contained in:
parent
90048241ca
commit
0e3dc5a056
@ -574,19 +574,41 @@ module YARP
|
|||||||
result = YARP.lex(source, @filepath)
|
result = YARP.lex(source, @filepath)
|
||||||
result_value = result.value
|
result_value = result.value
|
||||||
previous_state = nil
|
previous_state = nil
|
||||||
|
bom = source.byteslice(0..2) == "\xEF\xBB\xBF"
|
||||||
# If there's a UTF-8 byte-order mark as the start of the file, then ripper
|
|
||||||
# sets every token's on the first line back by 6 bytes. It also keeps the
|
|
||||||
# byte order mark in the first token's value. This is weird, and I don't
|
|
||||||
# want to mirror that in our parser. So instead, we'll match up the values
|
|
||||||
# here, and then match up the locations as we process the tokens.
|
|
||||||
bom = source.bytes[0..2] == [0xEF, 0xBB, 0xBF]
|
|
||||||
result_value[0][0].value.prepend("\xEF\xBB\xBF") if bom
|
|
||||||
|
|
||||||
result_value.each_with_index do |(token, lex_state), index|
|
result_value.each_with_index do |(token, lex_state), index|
|
||||||
lineno = token.location.start_line
|
lineno = token.location.start_line
|
||||||
column = token.location.start_column
|
column = token.location.start_column
|
||||||
column -= index == 0 ? 6 : 3 if bom && lineno == 1
|
|
||||||
|
# If there's a UTF-8 byte-order mark as the start of the file, then for
|
||||||
|
# certain tokens ripper sets the first token back by 3 bytes. It also
|
||||||
|
# keeps the byte order mark in the first token's value. This is weird,
|
||||||
|
# and I don't want to mirror that in our parser. So instead, we'll match
|
||||||
|
# up the columns and values here.
|
||||||
|
if bom && lineno == 1
|
||||||
|
column -= 3
|
||||||
|
|
||||||
|
if index == 0 && column == 0
|
||||||
|
flushed =
|
||||||
|
case token.type
|
||||||
|
when :BACK_REFERENCE, :INSTANCE_VARIABLE, :CLASS_VARIABLE,
|
||||||
|
:GLOBAL_VARIABLE, :NUMBERED_REFERENCE, :PERCENT_LOWER_I,
|
||||||
|
:PERCENT_LOWER_X, :PERCENT_LOWER_W, :PERCENT_UPPER_I,
|
||||||
|
:PERCENT_UPPER_W, :STRING_BEGIN
|
||||||
|
true
|
||||||
|
when :REGEXP_BEGIN, :SYMBOL_BEGIN
|
||||||
|
token.value.start_with?("%")
|
||||||
|
else
|
||||||
|
false
|
||||||
|
end
|
||||||
|
|
||||||
|
unless flushed
|
||||||
|
column -= 3
|
||||||
|
value = token.value
|
||||||
|
value.prepend(String.new("\xEF\xBB\xBF", encoding: value.encoding))
|
||||||
|
end
|
||||||
|
end
|
||||||
|
end
|
||||||
|
|
||||||
event = RIPPER.fetch(token.type)
|
event = RIPPER.fetch(token.type)
|
||||||
value = token.value
|
value = token.value
|
||||||
@ -668,6 +690,11 @@ module YARP
|
|||||||
end_offset = token.location.start_offset
|
end_offset = token.location.start_offset
|
||||||
|
|
||||||
if previous_token.type == :COMMENT && start_offset < end_offset
|
if previous_token.type == :COMMENT && start_offset < end_offset
|
||||||
|
if bom
|
||||||
|
start_offset += 3
|
||||||
|
end_offset += 3
|
||||||
|
end
|
||||||
|
|
||||||
tokens << Token.new([[lineno, 0], :on_nl, source.byteslice(start_offset...end_offset), lex_state])
|
tokens << Token.new([[lineno, 0], :on_nl, source.byteslice(start_offset...end_offset), lex_state])
|
||||||
end
|
end
|
||||||
|
|
||||||
|
57
test/bom_test.rb
Normal file
57
test/bom_test.rb
Normal file
@ -0,0 +1,57 @@
|
|||||||
|
# frozen_string_literal: true
|
||||||
|
|
||||||
|
# Don't bother checking this on these engines, this is such a specific Ripper
|
||||||
|
# test.
|
||||||
|
return if RUBY_ENGINE == "jruby" || RUBY_ENGINE == "truffleruby"
|
||||||
|
|
||||||
|
require "yarp_test_helper"
|
||||||
|
|
||||||
|
class BOMTest < Test::Unit::TestCase
|
||||||
|
def test_ident
|
||||||
|
assert_bom("foo")
|
||||||
|
end
|
||||||
|
|
||||||
|
def test_back_reference
|
||||||
|
assert_bom("$+")
|
||||||
|
end
|
||||||
|
|
||||||
|
def test_instance_variable
|
||||||
|
assert_bom("@foo")
|
||||||
|
end
|
||||||
|
|
||||||
|
def test_class_variable
|
||||||
|
assert_bom("@@foo")
|
||||||
|
end
|
||||||
|
|
||||||
|
def test_global_variable
|
||||||
|
assert_bom("$foo")
|
||||||
|
end
|
||||||
|
|
||||||
|
def test_numbered_reference
|
||||||
|
assert_bom("$1")
|
||||||
|
end
|
||||||
|
|
||||||
|
def test_percents
|
||||||
|
assert_bom("%i[]")
|
||||||
|
assert_bom("%r[]")
|
||||||
|
assert_bom("%s[]")
|
||||||
|
assert_bom("%q{}")
|
||||||
|
assert_bom("%w[]")
|
||||||
|
assert_bom("%x[]")
|
||||||
|
assert_bom("%I[]")
|
||||||
|
assert_bom("%W[]")
|
||||||
|
assert_bom("%Q{}")
|
||||||
|
end
|
||||||
|
|
||||||
|
def test_string
|
||||||
|
assert_bom("\"\"")
|
||||||
|
assert_bom("''")
|
||||||
|
end
|
||||||
|
|
||||||
|
private
|
||||||
|
|
||||||
|
def assert_bom(source)
|
||||||
|
bommed = "\xEF\xBB\xBF#{source}"
|
||||||
|
assert_equal YARP.lex_ripper(bommed), YARP.lex_compat(bommed).value
|
||||||
|
end
|
||||||
|
end
|
@ -221,6 +221,20 @@ static void
|
|||||||
lex_encoding_changed_callback(yp_parser_t *parser) {
|
lex_encoding_changed_callback(yp_parser_t *parser) {
|
||||||
lex_data_t *lex_data = (lex_data_t *) parser->lex_callback->data;
|
lex_data_t *lex_data = (lex_data_t *) parser->lex_callback->data;
|
||||||
lex_data->encoding = rb_enc_find(parser->encoding.name);
|
lex_data->encoding = rb_enc_find(parser->encoding.name);
|
||||||
|
|
||||||
|
// Since we got a new encoding, we need to go back and change the encoding
|
||||||
|
// of the tokens that we've already lexed. This should be a tiny amount
|
||||||
|
// since encoding magic comments need to be the first or second line of the
|
||||||
|
// file.
|
||||||
|
VALUE tokens = lex_data->tokens;
|
||||||
|
for (long index = 0; index < RARRAY_LEN(tokens); index++) {
|
||||||
|
VALUE yields = rb_ary_entry(tokens, index);
|
||||||
|
VALUE token = rb_ary_entry(yields, 0);
|
||||||
|
|
||||||
|
VALUE value = rb_ivar_get(token, rb_intern("@value"));
|
||||||
|
rb_enc_associate(value, lex_data->encoding);
|
||||||
|
ENC_CODERANGE_CLEAR(value);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Return an array of tokens corresponding to the given source.
|
// Return an array of tokens corresponding to the given source.
|
||||||
|
15
yarp/yarp.c
15
yarp/yarp.c
@ -12876,6 +12876,8 @@ yp_parser_metadata(yp_parser_t *parser, const char *metadata) {
|
|||||||
// Initialize a parser with the given start and end pointers.
|
// Initialize a parser with the given start and end pointers.
|
||||||
YP_EXPORTED_FUNCTION void
|
YP_EXPORTED_FUNCTION void
|
||||||
yp_parser_init(yp_parser_t *parser, const char *source, size_t size, const char *filepath) {
|
yp_parser_init(yp_parser_t *parser, const char *source, size_t size, const char *filepath) {
|
||||||
|
assert(source != NULL);
|
||||||
|
|
||||||
// Set filepath to the file that was passed
|
// Set filepath to the file that was passed
|
||||||
if (!filepath) filepath = "";
|
if (!filepath) filepath = "";
|
||||||
yp_string_t filepath_string;
|
yp_string_t filepath_string;
|
||||||
@ -12944,14 +12946,15 @@ yp_parser_init(yp_parser_t *parser, const char *source, size_t size, const char
|
|||||||
size_t newline_size = size / 22;
|
size_t newline_size = size / 22;
|
||||||
yp_newline_list_init(&parser->newline_list, source, newline_size < 4 ? 4 : newline_size);
|
yp_newline_list_init(&parser->newline_list, source, newline_size < 4 ? 4 : newline_size);
|
||||||
|
|
||||||
assert(source != NULL);
|
// Skip past the UTF-8 BOM if it exists.
|
||||||
if (size >= 3 && (unsigned char) source[0] == 0xef && (unsigned char) source[1] == 0xbb && (unsigned char) source[2] == 0xbf) {
|
if (size >= 3 && (unsigned char) source[0] == 0xef && (unsigned char) source[1] == 0xbb && (unsigned char) source[2] == 0xbf) {
|
||||||
// If the first three bytes of the source are the UTF-8 BOM, then we'll skip
|
|
||||||
// over them.
|
|
||||||
parser->current.end += 3;
|
parser->current.end += 3;
|
||||||
} else if (size >= 2 && source[0] == '#' && source[1] == '!') {
|
parser->encoding_comment_start += 3;
|
||||||
// If the first two bytes of the source are a shebang, then we'll indicate
|
}
|
||||||
// that the encoding comment is at the end of the shebang.
|
|
||||||
|
// If the first two bytes of the source are a shebang, then we'll indicate
|
||||||
|
// that the encoding comment is at the end of the shebang.
|
||||||
|
if (peek(parser) == '#' && peek_offset(parser, 1) == '!') {
|
||||||
const char *encoding_comment_start = next_newline(source, (ptrdiff_t) size);
|
const char *encoding_comment_start = next_newline(source, (ptrdiff_t) size);
|
||||||
if (encoding_comment_start) {
|
if (encoding_comment_start) {
|
||||||
parser->encoding_comment_start = encoding_comment_start + 1;
|
parser->encoding_comment_start = encoding_comment_start + 1;
|
||||||
|
Loading…
x
Reference in New Issue
Block a user