[ruby/prism] Additionally handle encoding comments in vim mode

https://github.com/ruby/prism/commit/bf9bdb9d82
This commit is contained in:
Kevin Newton 2023-10-13 13:34:56 -04:00 committed by Jemma Issroff
parent 5523a23469
commit 9f16f07cf1
3 changed files with 105 additions and 17 deletions

View File

@ -5221,7 +5221,7 @@ next_newline(const uint8_t *cursor, ptrdiff_t length) {
// Here we're going to check if this is a "magic" comment, and perform whatever // Here we're going to check if this is a "magic" comment, and perform whatever
// actions are necessary for it here. // actions are necessary for it here.
static void static void
parser_lex_magic_comment_encoding(pm_parser_t *parser, const uint8_t *start, const uint8_t *end) { parser_lex_magic_comment_encoding_value(pm_parser_t *parser, const uint8_t *start, const uint8_t *end) {
size_t width = (size_t) (end - start); size_t width = (size_t) (end - start);
// First, we're going to call out to a user-defined callback if one was // First, we're going to call out to a user-defined callback if one was
@ -5301,10 +5301,58 @@ parser_lex_magic_comment_encoding(pm_parser_t *parser, const uint8_t *start, con
pm_parser_err(parser, start, end, PM_ERR_INVALID_ENCODING_MAGIC_COMMENT); pm_parser_err(parser, start, end, PM_ERR_INVALID_ENCODING_MAGIC_COMMENT);
} }
// Look for a specific pattern of "coding" and potentially set the encoding on
// the parser.
static void
parser_lex_magic_comment_encoding(pm_parser_t *parser) {
const uint8_t *cursor = parser->current.start + 1;
const uint8_t *end = parser->current.end;
bool separator = false;
while (true) {
if (end - cursor <= 6) return;
switch (cursor[6]) {
case 'C': case 'c': cursor += 6; continue;
case 'O': case 'o': cursor += 5; continue;
case 'D': case 'd': cursor += 4; continue;
case 'I': case 'i': cursor += 3; continue;
case 'N': case 'n': cursor += 2; continue;
case 'G': case 'g': cursor += 1; continue;
case '=': case ':':
separator = true;
cursor += 6;
break;
default:
cursor += 6;
if (pm_char_is_whitespace(*cursor)) break;
continue;
}
if (pm_strncasecmp(cursor - 6, (const uint8_t *) "coding", 6) == 0) break;
separator = false;
}
while (true) {
do {
if (++cursor >= end) return;
} while (pm_char_is_whitespace(*cursor));
if (separator) break;
if (*cursor != '=' && *cursor != ':') return;
separator = true;
cursor++;
}
const uint8_t *value_start = cursor;
while ((*cursor == '-' || *cursor == '_' || parser->encoding.alnum_char(cursor, 1)) && ++cursor < end);
parser_lex_magic_comment_encoding_value(parser, value_start, cursor);
}
// Check if this is a magic comment that includes the frozen_string_literal // Check if this is a magic comment that includes the frozen_string_literal
// pragma. If it does, set that field on the parser. // pragma. If it does, set that field on the parser.
static void static void
parser_lex_magic_comment_frozen_string_literal(pm_parser_t *parser, const uint8_t *start, const uint8_t *end) { parser_lex_magic_comment_frozen_string_literal_value(pm_parser_t *parser, const uint8_t *start, const uint8_t *end) {
if (start + 4 <= end && pm_strncasecmp(start, (const uint8_t *) "true", 4) == 0) { if (start + 4 <= end && pm_strncasecmp(start, (const uint8_t *) "true", 4) == 0) {
parser->frozen_string_literal = true; parser->frozen_string_literal = true;
} }
@ -5335,10 +5383,13 @@ parser_lex_magic_comment_emacs_marker(pm_parser_t *parser, const uint8_t *cursor
// //
// %r"([^\\s\'\":;]+)\\s*:\\s*(\"(?:\\\\.|[^\"])*\"|[^\"\\s;]+)[\\s;]*" // %r"([^\\s\'\":;]+)\\s*:\\s*(\"(?:\\\\.|[^\"])*\"|[^\"\\s;]+)[\\s;]*"
// //
static inline void // It returns true if it consumes the entire comment. Otherwise it returns
// false.
static inline bool
parser_lex_magic_comment(pm_parser_t *parser, bool semantic_token_seen) { parser_lex_magic_comment(pm_parser_t *parser, bool semantic_token_seen) {
const uint8_t *start = parser->current.start + 1; const uint8_t *start = parser->current.start + 1;
const uint8_t *end = parser->current.end; const uint8_t *end = parser->current.end;
if (end - start <= 7) return false;
const uint8_t *cursor; const uint8_t *cursor;
bool indicator = false; bool indicator = false;
@ -5352,7 +5403,7 @@ parser_lex_magic_comment(pm_parser_t *parser, bool semantic_token_seen) {
} else { } else {
// If we have a start marker but not an end marker, then we cannot // If we have a start marker but not an end marker, then we cannot
// have a magic comment. // have a magic comment.
return; return false;
} }
} }
@ -5365,17 +5416,17 @@ parser_lex_magic_comment(pm_parser_t *parser, bool semantic_token_seen) {
const uint8_t *key_end = cursor; const uint8_t *key_end = cursor;
while (cursor < end && pm_char_is_whitespace(*cursor)) cursor++; while (cursor < end && pm_char_is_whitespace(*cursor)) cursor++;
if (cursor == end) return; if (cursor == end) break;
if (*cursor == ':') { if (*cursor == ':') {
cursor++; cursor++;
} else { } else {
if (!indicator) return; if (!indicator) return false;
continue; continue;
} }
while (cursor < end && pm_char_is_whitespace(*cursor)) cursor++; while (cursor < end && pm_char_is_whitespace(*cursor)) cursor++;
if (cursor == end) return; if (cursor == end) break;
const uint8_t *value_start; const uint8_t *value_start;
const uint8_t *value_end; const uint8_t *value_end;
@ -5396,7 +5447,7 @@ parser_lex_magic_comment(pm_parser_t *parser, bool semantic_token_seen) {
while (cursor < end && (*cursor == ';' || pm_char_is_whitespace(*cursor))) cursor++; while (cursor < end && (*cursor == ';' || pm_char_is_whitespace(*cursor))) cursor++;
} else { } else {
while (cursor < end && pm_char_is_whitespace(*cursor)) cursor++; while (cursor < end && pm_char_is_whitespace(*cursor)) cursor++;
if (cursor != end) return; if (cursor != end) return false;
} }
// Here, we need to do some processing on the key to swap out dashes for // Here, we need to do some processing on the key to swap out dashes for
@ -5409,7 +5460,7 @@ parser_lex_magic_comment(pm_parser_t *parser, bool semantic_token_seen) {
} else { } else {
size_t width = (size_t) (key_end - key_start); size_t width = (size_t) (key_end - key_start);
uint8_t *buffer = malloc(width); uint8_t *buffer = malloc(width);
if (buffer == NULL) return; if (buffer == NULL) break;
memcpy(buffer, key_start, width); memcpy(buffer, key_start, width);
buffer[dash - key_start] = '_'; buffer[dash - key_start] = '_';
@ -5423,25 +5474,25 @@ parser_lex_magic_comment(pm_parser_t *parser, bool semantic_token_seen) {
// Finally, we can start checking the key against the list of known // Finally, we can start checking the key against the list of known
// magic comment keys, and potentially change state based on that. // magic comment keys, and potentially change state based on that.
const char *key_source = (const char *) pm_string_source(&key); const uint8_t *key_source = pm_string_source(&key);
const size_t key_length = pm_string_length(&key); const size_t key_length = pm_string_length(&key);
// We only want to attempt to compare against encoding comments if it's // We only want to attempt to compare against encoding comments if it's
// the first line in the file (or the second in the case of a shebang). // the first line in the file (or the second in the case of a shebang).
if (parser->current.start == parser->encoding_comment_start) { if (parser->current.start == parser->encoding_comment_start) {
if ( if (
(key_length == 8 && strncasecmp(key_source, "encoding", 8) == 0) || (key_length == 8 && pm_strncasecmp(key_source, (const uint8_t *) "encoding", 8) == 0) ||
(key_length == 6 && strncasecmp(key_source, "coding", 6) == 0) (key_length == 6 && pm_strncasecmp(key_source, (const uint8_t *) "coding", 6) == 0)
) { ) {
parser_lex_magic_comment_encoding(parser, value_start, value_end); parser_lex_magic_comment_encoding_value(parser, value_start, value_end);
} }
} }
// We only want to handle frozen string literal comments if it's before // We only want to handle frozen string literal comments if it's before
// any semantic tokens have been seen. // any semantic tokens have been seen.
if (!semantic_token_seen) { if (!semantic_token_seen) {
if (key_length == 21 && strncasecmp(key_source, "frozen_string_literal", 21) == 0) { if (key_length == 21 && pm_strncasecmp(key_source, (const uint8_t *) "frozen_string_literal", 21) == 0) {
parser_lex_magic_comment_frozen_string_literal(parser, value_start, value_end); parser_lex_magic_comment_frozen_string_literal_value(parser, value_start, value_end);
} }
} }
@ -5459,6 +5510,8 @@ parser_lex_magic_comment(pm_parser_t *parser, bool semantic_token_seen) {
pm_list_append(&parser->magic_comment_list, (pm_list_node_t *) magic_comment); pm_list_append(&parser->magic_comment_list, (pm_list_node_t *) magic_comment);
} }
} }
return true;
} }
/******************************************************************************/ /******************************************************************************/
@ -7061,7 +7114,15 @@ parser_lex(pm_parser_t *parser) {
// Here, parse the comment to see if it's a magic comment // Here, parse the comment to see if it's a magic comment
// and potentially change state on the parser. // and potentially change state on the parser.
parser_lex_magic_comment(parser, semantic_token_seen); if (!parser_lex_magic_comment(parser, semantic_token_seen) && (parser->current.start == parser->encoding_comment_start)) {
ptrdiff_t length = parser->current.end - parser->current.start;
// If we didn't find a magic comment within the first
// pass and we're at the start of the file, then we need
// to do another pass to potentially find other patterns
// for encoding comments.
if (length >= 10) parser_lex_magic_comment_encoding(parser);
}
lexed_comment = true; lexed_comment = true;
} }

View File

@ -16,7 +16,8 @@ module Prism
"# -*- CoDiNg: ascii -*-", "# -*- CoDiNg: ascii -*-",
"# -*- \s\t\v encoding \s\t\v : \s\t\v ascii \s\t\v -*-", "# -*- \s\t\v encoding \s\t\v : \s\t\v ascii \s\t\v -*-",
"# -*- foo: bar; encoding: ascii -*-", "# -*- foo: bar; encoding: ascii -*-",
"# coding \t \r \v : \t \v \r ascii-8bit\n" "# coding \t \r \v : \t \v \r ascii-8bit\n",
"# vim: filetype=ruby, fileencoding=big5, tabsize=3, shiftwidth=3\n"
] ]
examples.each do |example| examples.each do |example|

View File

@ -4,6 +4,21 @@ require_relative "test_helper"
module Prism module Prism
class ParseTest < TestCase class ParseTest < TestCase
# A subclass of Ripper that extracts out magic comments.
class MagicCommentRipper < Ripper
attr_reader :magic_comments
def initialize(*)
super
@magic_comments = []
end
def on_magic_comment(key, value)
@magic_comments << [key, value]
super
end
end
# When we pretty-print the trees to compare against the snapshots, we want to # When we pretty-print the trees to compare against the snapshots, we want to
# be certain that we print with the same external encoding. This is because # be certain that we print with the same external encoding. This is because
# methods like Symbol#inspect take into account external encoding and it could # methods like Symbol#inspect take into account external encoding and it could
@ -159,6 +174,17 @@ module Prism
rescue SyntaxError rescue SyntaxError
raise ArgumentError, "Test file has invalid syntax #{filepath}" raise ArgumentError, "Test file has invalid syntax #{filepath}"
end end
# Next, check that we get the correct number of magic comments when
# lexing with ripper.
expected = MagicCommentRipper.new(source).tap(&:parse).magic_comments
actual = result.magic_comments
assert_equal expected.length, actual.length
expected.zip(actual).each do |(expected_key, expected_value), magic_comment|
assert_equal expected_key, magic_comment.key
assert_equal expected_value, magic_comment.value
end
end end
end end
end end