diff --git a/lib/yarp/lex_compat.rb b/lib/yarp/lex_compat.rb
index 8362b9063a..a9867737c2 100644
--- a/lib/yarp/lex_compat.rb
+++ b/lib/yarp/lex_compat.rb
@@ -574,19 +574,41 @@ module YARP
       result = YARP.lex(source, @filepath)
       result_value = result.value
       previous_state = nil
-
-      # If there's a UTF-8 byte-order mark as the start of the file, then ripper
-      # sets every token's on the first line back by 6 bytes. It also keeps the
-      # byte order mark in the first token's value. This is weird, and I don't
-      # want to mirror that in our parser. So instead, we'll match up the values
-      # here, and then match up the locations as we process the tokens.
-      bom = source.bytes[0..2] == [0xEF, 0xBB, 0xBF]
-      result_value[0][0].value.prepend("\xEF\xBB\xBF") if bom
+      bom = source.byteslice(0..2) == "\xEF\xBB\xBF"

       result_value.each_with_index do |(token, lex_state), index|
         lineno = token.location.start_line
         column = token.location.start_column
-        column -= index == 0 ? 6 : 3 if bom && lineno == 1
+
+        # If there's a UTF-8 byte-order mark as the start of the file, then for
+        # certain tokens ripper sets the first token back by 3 bytes. It also
+        # keeps the byte order mark in the first token's value. This is weird,
+        # and I don't want to mirror that in our parser. So instead, we'll match
+        # up the columns and values here.
+        if bom && lineno == 1
+          column -= 3
+
+          if index == 0 && column == 0
+            flushed =
+              case token.type
+              when :BACK_REFERENCE, :INSTANCE_VARIABLE, :CLASS_VARIABLE,
+                   :GLOBAL_VARIABLE, :NUMBERED_REFERENCE, :PERCENT_LOWER_I,
+                   :PERCENT_LOWER_X, :PERCENT_LOWER_W, :PERCENT_UPPER_I,
+                   :PERCENT_UPPER_W, :STRING_BEGIN
+                true
+              when :REGEXP_BEGIN, :SYMBOL_BEGIN
+                token.value.start_with?("%")
+              else
+                false
+              end
+
+            unless flushed
+              column -= 3
+              value = token.value
+              value.prepend(String.new("\xEF\xBB\xBF", encoding: value.encoding))
+            end
+          end
+        end

         event = RIPPER.fetch(token.type)
         value = token.value
@@ -668,6 +690,11 @@ module YARP
           end

           end_offset = token.location.start_offset
           if previous_token.type == :COMMENT && start_offset < end_offset
+            if bom
+              start_offset += 3
+              end_offset += 3
+            end
+
             tokens << Token.new([[lineno, 0], :on_nl, source.byteslice(start_offset...end_offset), lex_state])
           end
diff --git a/test/bom_test.rb b/test/bom_test.rb
new file mode 100644
index 0000000000..7dc7eabe92
--- /dev/null
+++ b/test/bom_test.rb
@@ -0,0 +1,57 @@
+# frozen_string_literal: true
+
+# Don't bother checking this on these engines, this is such a specific Ripper
+# test.
+return if RUBY_ENGINE == "jruby" || RUBY_ENGINE == "truffleruby"
+
+require "yarp_test_helper"
+
+class BOMTest < Test::Unit::TestCase
+  def test_ident
+    assert_bom("foo")
+  end
+
+  def test_back_reference
+    assert_bom("$+")
+  end
+
+  def test_instance_variable
+    assert_bom("@foo")
+  end
+
+  def test_class_variable
+    assert_bom("@@foo")
+  end
+
+  def test_global_variable
+    assert_bom("$foo")
+  end
+
+  def test_numbered_reference
+    assert_bom("$1")
+  end
+
+  def test_percents
+    assert_bom("%i[]")
+    assert_bom("%r[]")
+    assert_bom("%s[]")
+    assert_bom("%q{}")
+    assert_bom("%w[]")
+    assert_bom("%x[]")
+    assert_bom("%I[]")
+    assert_bom("%W[]")
+    assert_bom("%Q{}")
+  end
+
+  def test_string
+    assert_bom("\"\"")
+    assert_bom("''")
+  end
+
+  private
+
+  def assert_bom(source)
+    bommed = "\xEF\xBB\xBF#{source}"
+    assert_equal YARP.lex_ripper(bommed), YARP.lex_compat(bommed).value
+  end
+end
diff --git a/yarp/extension.c b/yarp/extension.c
index 455cdcadcc..8aef456c00 100644
--- a/yarp/extension.c
+++ b/yarp/extension.c
@@ -221,6 +221,20 @@ static void
 lex_encoding_changed_callback(yp_parser_t *parser) {
     lex_data_t *lex_data = (lex_data_t *) parser->lex_callback->data;
     lex_data->encoding = rb_enc_find(parser->encoding.name);
+
+    // Since we got a new encoding, we need to go back and change the encoding
+    // of the tokens that we've already lexed. This should be a tiny amount
+    // since encoding magic comments need to be the first or second line of the
+    // file.
+    VALUE tokens = lex_data->tokens;
+    for (long index = 0; index < RARRAY_LEN(tokens); index++) {
+        VALUE yields = rb_ary_entry(tokens, index);
+        VALUE token = rb_ary_entry(yields, 0);
+
+        VALUE value = rb_ivar_get(token, rb_intern("@value"));
+        rb_enc_associate(value, lex_data->encoding);
+        ENC_CODERANGE_CLEAR(value);
+    }
 }

 // Return an array of tokens corresponding to the given source.
diff --git a/yarp/yarp.c b/yarp/yarp.c
index 2a5a923c76..4de3338dc1 100644
--- a/yarp/yarp.c
+++ b/yarp/yarp.c
@@ -12876,6 +12876,8 @@ yp_parser_metadata(yp_parser_t *parser, const char *metadata) {
 // Initialize a parser with the given start and end pointers.
 YP_EXPORTED_FUNCTION void
 yp_parser_init(yp_parser_t *parser, const char *source, size_t size, const char *filepath) {
+    assert(source != NULL);
+
     // Set filepath to the file that was passed
     if (!filepath) filepath = "";
     yp_string_t filepath_string;
@@ -12944,14 +12946,15 @@ yp_parser_init(yp_parser_t *parser, const char *source, size_t size, const char
     size_t newline_size = size / 22;
     yp_newline_list_init(&parser->newline_list, source, newline_size < 4 ? 4 : newline_size);

-    assert(source != NULL);
+    // Skip past the UTF-8 BOM if it exists.
     if (size >= 3 && (unsigned char) source[0] == 0xef && (unsigned char) source[1] == 0xbb && (unsigned char) source[2] == 0xbf) {
-        // If the first three bytes of the source are the UTF-8 BOM, then we'll skip
-        // over them.
         parser->current.end += 3;
-    } else if (size >= 2 && source[0] == '#' && source[1] == '!') {
-        // If the first two bytes of the source are a shebang, then we'll indicate
-        // that the encoding comment is at the end of the shebang.
+        parser->encoding_comment_start += 3;
+    }
+
+    // If the first two bytes of the source are a shebang, then we'll indicate
+    // that the encoding comment is at the end of the shebang.
+    if (peek(parser) == '#' && peek_offset(parser, 1) == '!') {
        const char *encoding_comment_start = next_newline(source, (ptrdiff_t) size);
        if (encoding_comment_start) {
            parser->encoding_comment_start = encoding_comment_start + 1;
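
For reviewers who want to see the Ripper behavior the lex_compat.rb change compensates for, here is a minimal standalone sketch. It is not part of the patch and assumes CRuby, where ripper ships in the standard library:

    require "ripper"

    source = "foo"
    bommed = "\xEF\xBB\xBF#{source}"

    # Ripper.lex returns elements of roughly the form
    # [[lineno, column], event, value, ...]. With a leading UTF-8 byte order
    # mark, the columns on line 1 and the first token's value can differ from
    # the un-bommed source, which is the discrepancy the lex_compat.rb change
    # above adjusts for.
    pp Ripper.lex(source).first
    pp Ripper.lex(bommed).first

The new test file performs the same kind of comparison through YARP.lex_ripper and YARP.lex_compat for each token type affected by the BOM handling.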