[ruby/prism] Handle escapes in named capture names
https://github.com/ruby/prism/commit/b4b7a69ce7
This commit is contained in:
parent
36b6625ba9
commit
c4534c9fe8
161
prism/prism.c
161
prism/prism.c
@ -9551,21 +9551,7 @@ escape_write_unicode(pm_parser_t *parser, pm_buffer_t *buffer, const uint8_t fla
|
|||||||
parser->explicit_encoding = PM_ENCODING_UTF_8_ENTRY;
|
parser->explicit_encoding = PM_ENCODING_UTF_8_ENTRY;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (value <= 0x7F) { // 0xxxxxxx
|
if (!pm_buffer_append_unicode_codepoint(buffer, value)) {
|
||||||
pm_buffer_append_byte(buffer, (uint8_t) value);
|
|
||||||
} else if (value <= 0x7FF) { // 110xxxxx 10xxxxxx
|
|
||||||
pm_buffer_append_byte(buffer, (uint8_t) (0xC0 | (value >> 6)));
|
|
||||||
pm_buffer_append_byte(buffer, (uint8_t) (0x80 | (value & 0x3F)));
|
|
||||||
} else if (value <= 0xFFFF) { // 1110xxxx 10xxxxxx 10xxxxxx
|
|
||||||
pm_buffer_append_byte(buffer, (uint8_t) (0xE0 | (value >> 12)));
|
|
||||||
pm_buffer_append_byte(buffer, (uint8_t) (0x80 | ((value >> 6) & 0x3F)));
|
|
||||||
pm_buffer_append_byte(buffer, (uint8_t) (0x80 | (value & 0x3F)));
|
|
||||||
} else if (value <= 0x10FFFF) { // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
|
|
||||||
pm_buffer_append_byte(buffer, (uint8_t) (0xF0 | (value >> 18)));
|
|
||||||
pm_buffer_append_byte(buffer, (uint8_t) (0x80 | ((value >> 12) & 0x3F)));
|
|
||||||
pm_buffer_append_byte(buffer, (uint8_t) (0x80 | ((value >> 6) & 0x3F)));
|
|
||||||
pm_buffer_append_byte(buffer, (uint8_t) (0x80 | (value & 0x3F)));
|
|
||||||
} else {
|
|
||||||
pm_parser_err(parser, start, end, PM_ERR_ESCAPE_INVALID_UNICODE);
|
pm_parser_err(parser, start, end, PM_ERR_ESCAPE_INVALID_UNICODE);
|
||||||
pm_buffer_append_byte(buffer, 0xEF);
|
pm_buffer_append_byte(buffer, 0xEF);
|
||||||
pm_buffer_append_byte(buffer, 0xBF);
|
pm_buffer_append_byte(buffer, 0xBF);
|
||||||
@ -20873,6 +20859,123 @@ typedef struct {
|
|||||||
bool shared;
|
bool shared;
|
||||||
} parse_regular_expression_named_capture_data_t;
|
} parse_regular_expression_named_capture_data_t;
|
||||||
|
|
||||||
|
static inline const uint8_t *
|
||||||
|
pm_named_capture_escape_hex(pm_buffer_t *unescaped, const uint8_t *cursor, const uint8_t *end) {
|
||||||
|
cursor++;
|
||||||
|
|
||||||
|
if (cursor < end && pm_char_is_hexadecimal_digit(*cursor)) {
|
||||||
|
uint8_t value = escape_hexadecimal_digit(*cursor);
|
||||||
|
cursor++;
|
||||||
|
|
||||||
|
if (cursor < end && pm_char_is_hexadecimal_digit(*cursor)) {
|
||||||
|
value = (uint8_t) ((value << 4) | escape_hexadecimal_digit(*cursor));
|
||||||
|
cursor++;
|
||||||
|
}
|
||||||
|
|
||||||
|
pm_buffer_append_byte(unescaped, value);
|
||||||
|
} else {
|
||||||
|
pm_buffer_append_string(unescaped, "\\x", 2);
|
||||||
|
}
|
||||||
|
|
||||||
|
return cursor;
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline const uint8_t *
|
||||||
|
pm_named_capture_escape_octal(pm_buffer_t *unescaped, const uint8_t *cursor, const uint8_t *end) {
|
||||||
|
uint8_t value = (uint8_t) (*cursor - '0');
|
||||||
|
cursor++;
|
||||||
|
|
||||||
|
if (cursor < end && pm_char_is_octal_digit(*cursor)) {
|
||||||
|
value = ((uint8_t) (value << 3)) | ((uint8_t) (*cursor - '0'));
|
||||||
|
cursor++;
|
||||||
|
|
||||||
|
if (cursor < end && pm_char_is_octal_digit(*cursor)) {
|
||||||
|
value = ((uint8_t) (value << 3)) | ((uint8_t) (*cursor - '0'));
|
||||||
|
cursor++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pm_buffer_append_byte(unescaped, value);
|
||||||
|
return cursor;
|
||||||
|
}
|
||||||
|
|
||||||
|
static inline const uint8_t *
|
||||||
|
pm_named_capture_escape_unicode(pm_parser_t *parser, pm_buffer_t *unescaped, const uint8_t *cursor, const uint8_t *end) {
|
||||||
|
const uint8_t *start = cursor - 1;
|
||||||
|
cursor++;
|
||||||
|
|
||||||
|
if (cursor >= end) {
|
||||||
|
pm_buffer_append_string(unescaped, "\\u", 2);
|
||||||
|
return cursor;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (*cursor != '{') {
|
||||||
|
size_t length = pm_strspn_hexadecimal_digit(cursor, MIN(end - cursor, 4));
|
||||||
|
uint32_t value = escape_unicode(parser, cursor, length);
|
||||||
|
|
||||||
|
if (!pm_buffer_append_unicode_codepoint(unescaped, value)) {
|
||||||
|
pm_buffer_append_string(unescaped, (const char *) start, (size_t) ((cursor + length) - start));
|
||||||
|
}
|
||||||
|
|
||||||
|
return cursor + length;
|
||||||
|
}
|
||||||
|
|
||||||
|
cursor++;
|
||||||
|
for (;;) {
|
||||||
|
while (cursor < end && *cursor == ' ') cursor++;
|
||||||
|
|
||||||
|
if (cursor >= end) break;
|
||||||
|
if (*cursor == '}') {
|
||||||
|
cursor++;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t length = pm_strspn_hexadecimal_digit(cursor, end - cursor);
|
||||||
|
uint32_t value = escape_unicode(parser, cursor, length);
|
||||||
|
|
||||||
|
(void) pm_buffer_append_unicode_codepoint(unescaped, value);
|
||||||
|
cursor += length;
|
||||||
|
}
|
||||||
|
|
||||||
|
return cursor;
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
pm_named_capture_escape(pm_parser_t *parser, pm_buffer_t *unescaped, const uint8_t *source, const size_t length, const uint8_t *cursor) {
|
||||||
|
const uint8_t *end = source + length;
|
||||||
|
pm_buffer_append_string(unescaped, (const char *) source, (size_t) (cursor - source));
|
||||||
|
|
||||||
|
for (;;) {
|
||||||
|
if (++cursor >= end) {
|
||||||
|
pm_buffer_append_byte(unescaped, '\\');
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
switch (*cursor) {
|
||||||
|
case 'x':
|
||||||
|
cursor = pm_named_capture_escape_hex(unescaped, cursor, end);
|
||||||
|
break;
|
||||||
|
case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7':
|
||||||
|
cursor = pm_named_capture_escape_octal(unescaped, cursor, end);
|
||||||
|
break;
|
||||||
|
case 'u':
|
||||||
|
cursor = pm_named_capture_escape_unicode(parser, unescaped, cursor, end);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
pm_buffer_append_byte(unescaped, '\\');
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
const uint8_t *next_cursor = pm_memchr(cursor, '\\', (size_t) (end - cursor), parser->encoding_changed, parser->encoding);
|
||||||
|
if (next_cursor == NULL) break;
|
||||||
|
|
||||||
|
pm_buffer_append_string(unescaped, (const char *) cursor, (size_t) (next_cursor - cursor));
|
||||||
|
cursor = next_cursor;
|
||||||
|
}
|
||||||
|
|
||||||
|
pm_buffer_append_string(unescaped, (const char *) cursor, (size_t) (end - cursor));
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This callback is called when the regular expression parser encounters a named
|
* This callback is called when the regular expression parser encounters a named
|
||||||
* capture group.
|
* capture group.
|
||||||
@ -20887,13 +20990,32 @@ parse_regular_expression_named_capture(const pm_string_t *capture, void *data) {
|
|||||||
|
|
||||||
const uint8_t *source = pm_string_source(capture);
|
const uint8_t *source = pm_string_source(capture);
|
||||||
size_t length = pm_string_length(capture);
|
size_t length = pm_string_length(capture);
|
||||||
|
pm_buffer_t unescaped = { 0 };
|
||||||
|
|
||||||
|
// First, we need to handle escapes within the name of the capture group.
|
||||||
|
// This is because regular expressions have three different representations
|
||||||
|
// in prism. The first is the plain source code. The second is the
|
||||||
|
// representation that will be sent to the regular expression engine, which
|
||||||
|
// is the value of the "unescaped" field. This is poorly named, because it
|
||||||
|
// actually still contains escapes, just a subset of them that the regular
|
||||||
|
// expression engine knows how to handle. The third representation is fully
|
||||||
|
// unescaped, which is what we need.
|
||||||
|
const uint8_t *cursor = pm_memchr(source, '\\', length, parser->encoding_changed, parser->encoding);
|
||||||
|
if (PRISM_UNLIKELY(cursor != NULL)) {
|
||||||
|
pm_named_capture_escape(parser, &unescaped, source, length, cursor);
|
||||||
|
source = (const uint8_t *) pm_buffer_value(&unescaped);
|
||||||
|
length = pm_buffer_length(&unescaped);
|
||||||
|
}
|
||||||
|
|
||||||
pm_location_t location;
|
pm_location_t location;
|
||||||
pm_constant_id_t name;
|
pm_constant_id_t name;
|
||||||
|
|
||||||
// If the name of the capture group isn't a valid identifier, we do
|
// If the name of the capture group isn't a valid identifier, we do
|
||||||
// not add it to the local table.
|
// not add it to the local table.
|
||||||
if (!pm_slice_is_valid_local(parser, source, source + length)) return;
|
if (!pm_slice_is_valid_local(parser, source, source + length)) {
|
||||||
|
pm_buffer_free(&unescaped);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
if (callback_data->shared) {
|
if (callback_data->shared) {
|
||||||
// If the unescaped string is a slice of the source, then we can
|
// If the unescaped string is a slice of the source, then we can
|
||||||
@ -20921,7 +21043,10 @@ parse_regular_expression_named_capture(const pm_string_t *capture, void *data) {
|
|||||||
if ((depth = pm_parser_local_depth_constant_id(parser, name)) == -1) {
|
if ((depth = pm_parser_local_depth_constant_id(parser, name)) == -1) {
|
||||||
// If the local is not already a local but it is a keyword, then we
|
// If the local is not already a local but it is a keyword, then we
|
||||||
// do not want to add a capture for this.
|
// do not want to add a capture for this.
|
||||||
if (pm_local_is_keyword((const char *) source, length)) return;
|
if (pm_local_is_keyword((const char *) source, length)) {
|
||||||
|
pm_buffer_free(&unescaped);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
// If the identifier is not already a local, then we will add it to
|
// If the identifier is not already a local, then we will add it to
|
||||||
// the local table.
|
// the local table.
|
||||||
@ -20939,6 +21064,8 @@ parse_regular_expression_named_capture(const pm_string_t *capture, void *data) {
|
|||||||
pm_node_t *target = (pm_node_t *) pm_local_variable_target_node_create(parser, &location, name, depth == -1 ? 0 : (uint32_t) depth);
|
pm_node_t *target = (pm_node_t *) pm_local_variable_target_node_create(parser, &location, name, depth == -1 ? 0 : (uint32_t) depth);
|
||||||
pm_node_list_append(&callback_data->match->targets, target);
|
pm_node_list_append(&callback_data->match->targets, target);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pm_buffer_free(&unescaped);
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -172,6 +172,46 @@ pm_buffer_append_double(pm_buffer_t *buffer, double value) {
|
|||||||
pm_buffer_append(buffer, source, sizeof(double));
|
pm_buffer_append(buffer, source, sizeof(double));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Append a unicode codepoint to the buffer.
|
||||||
|
*/
|
||||||
|
bool
|
||||||
|
pm_buffer_append_unicode_codepoint(pm_buffer_t *buffer, uint32_t value) {
|
||||||
|
if (value <= 0x7F) {
|
||||||
|
pm_buffer_append_byte(buffer, (uint8_t) value); // 0xxxxxxx
|
||||||
|
return true;
|
||||||
|
} else if (value <= 0x7FF) {
|
||||||
|
uint8_t bytes[] = {
|
||||||
|
(uint8_t) (0xC0 | ((value >> 6) & 0x3F)), // 110xxxxx
|
||||||
|
(uint8_t) (0x80 | (value & 0x3F)) // 10xxxxxx
|
||||||
|
};
|
||||||
|
|
||||||
|
pm_buffer_append_bytes(buffer, bytes, 2);
|
||||||
|
return true;
|
||||||
|
} else if (value <= 0xFFFF) {
|
||||||
|
uint8_t bytes[] = {
|
||||||
|
(uint8_t) (0xE0 | ((value >> 12) & 0x3F)), // 1110xxxx
|
||||||
|
(uint8_t) (0x80 | ((value >> 6) & 0x3F)), // 10xxxxxx
|
||||||
|
(uint8_t) (0x80 | (value & 0x3F)) // 10xxxxxx
|
||||||
|
};
|
||||||
|
|
||||||
|
pm_buffer_append_bytes(buffer, bytes, 3);
|
||||||
|
return true;
|
||||||
|
} else if (value <= 0x10FFFF) {
|
||||||
|
uint8_t bytes[] = {
|
||||||
|
(uint8_t) (0xF0 | ((value >> 18) & 0x3F)), // 11110xxx
|
||||||
|
(uint8_t) (0x80 | ((value >> 12) & 0x3F)), // 10xxxxxx
|
||||||
|
(uint8_t) (0x80 | ((value >> 6) & 0x3F)), // 10xxxxxx
|
||||||
|
(uint8_t) (0x80 | (value & 0x3F)) // 10xxxxxx
|
||||||
|
};
|
||||||
|
|
||||||
|
pm_buffer_append_bytes(buffer, bytes, 4);
|
||||||
|
return true;
|
||||||
|
} else {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Append a slice of source code to the buffer.
|
* Append a slice of source code to the buffer.
|
||||||
*/
|
*/
|
||||||
|
@ -137,6 +137,16 @@ void pm_buffer_append_varsint(pm_buffer_t *buffer, int32_t value);
|
|||||||
*/
|
*/
|
||||||
void pm_buffer_append_double(pm_buffer_t *buffer, double value);
|
void pm_buffer_append_double(pm_buffer_t *buffer, double value);
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Append a unicode codepoint to the buffer.
|
||||||
|
*
|
||||||
|
* @param buffer The buffer to append to.
|
||||||
|
* @param value The character to append.
|
||||||
|
* @returns True if the codepoint was valid and appended successfully, false
|
||||||
|
* otherwise.
|
||||||
|
*/
|
||||||
|
bool pm_buffer_append_unicode_codepoint(pm_buffer_t *buffer, uint32_t value);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* The different types of escaping that can be performed by the buffer when
|
* The different types of escaping that can be performed by the buffer when
|
||||||
* appending a slice of Ruby source code.
|
* appending a slice of Ruby source code.
|
||||||
|
29
test/prism/result/named_capture_test.rb
Normal file
29
test/prism/result/named_capture_test.rb
Normal file
@ -0,0 +1,29 @@
|
|||||||
|
# frozen_string_literal: true
|
||||||
|
|
||||||
|
require_relative "../test_helper"
|
||||||
|
|
||||||
|
module Prism
  # Checks that escape sequences written inside the name of a regular
  # expression named capture group are decoded when the name is turned into a
  # local variable target.
  class NamedCaptureTest < TestCase
    # Four \x escapes spell out the UTF-8 bytes of a single character.
    def test_hex_escapes
      assert_equal :😀, parse_name("\\xf0\\x9f\\x98\\x80")
    end

    # \u followed directly by four hexadecimal digits.
    def test_unicode_escape
      assert_equal :し, parse_name("\\u3057")
    end

    # \u{...} with the codepoint wrapped in braces.
    def test_unicode_escapes_braces
      assert_equal :😀, parse_name("\\u{1f600}")
    end

    # Same bytes as the hex test, but the final byte is written in octal.
    def test_octal_escapes
      assert_equal :😀, parse_name("\\xf0\\x9f\\x98\\200")
    end

    private

    # Parses a match-write expression whose only named capture group uses
    # +content+ as its name and returns the name of the first target.
    def parse_name(content)
      Prism.parse_statement("/(?<#{content}>)/ =~ ''").targets.first.name
    end
  end
end
|
Loading…
x
Reference in New Issue
Block a user