[ruby/prism] Reject invalid capture groups (keywords)

https://github.com/ruby/prism/commit/bb78d83e88
This commit is contained in:
Kevin Newton 2024-03-28 15:16:26 -04:00 committed by git
parent bb3cbdfe2f
commit 8780059c38
3 changed files with 237 additions and 64 deletions

View File

@ -1184,6 +1184,77 @@ token_is_setter_name(pm_token_t *token) {
);
}
/**
 * Returns true if the given local variable is a keyword.
 *
 * The name is given as `length` bytes starting at `source`; it does not need
 * to be NUL-terminated. The result is true only when those bytes are an exact,
 * full-length match for one of the keywords in the table below.
 */
static bool
pm_local_is_keyword(const char *source, size_t length) {
    // Each entry pairs a keyword with its byte length (computed at compile
    // time, excluding the terminating NUL) so that candidates of a different
    // length are rejected without ever touching the source bytes.
    static const struct {
        const char *name;
        size_t length;
    } keywords[] = {
#define KEYWORD(name) { name, sizeof(name) - 1 }
        KEYWORD("do"), KEYWORD("if"), KEYWORD("in"), KEYWORD("or"),
        KEYWORD("and"), KEYWORD("def"), KEYWORD("end"), KEYWORD("for"),
        KEYWORD("nil"), KEYWORD("not"),
        KEYWORD("case"), KEYWORD("else"), KEYWORD("next"), KEYWORD("redo"),
        KEYWORD("self"), KEYWORD("then"), KEYWORD("true"), KEYWORD("when"),
        KEYWORD("alias"), KEYWORD("begin"), KEYWORD("break"), KEYWORD("class"),
        KEYWORD("elsif"), KEYWORD("false"), KEYWORD("retry"), KEYWORD("super"),
        KEYWORD("undef"), KEYWORD("until"), KEYWORD("while"), KEYWORD("yield"),
        KEYWORD("ensure"), KEYWORD("module"), KEYWORD("rescue"),
        KEYWORD("return"), KEYWORD("unless"),
        KEYWORD("__LINE__"), KEYWORD("__FILE__"),
        KEYWORD("__ENCODING__")
#undef KEYWORD
    };

    const size_t keywords_count = sizeof(keywords) / sizeof(keywords[0]);

    for (size_t index = 0; index < keywords_count; index++) {
        // Compare lengths first: memcmp is only reached when the candidate
        // has exactly as many bytes as the keyword it is checked against.
        if (keywords[index].length != length) continue;
        if (memcmp(source, keywords[index].name, length) == 0) return true;
    }

    return false;
}
/******************************************************************************/
/* Node flag handling functions */
/******************************************************************************/
@ -10576,19 +10647,19 @@ parser_lex(pm_parser_t *parser) {
pm_token_type_t type = lex_identifier(parser, previous_command_start);
// If we've hit a __END__ and it was at the start of the line or the
// start of the file and it is followed by either a \n or a \r\n, then
// this is the last token of the file.
// If we've hit a __END__ and it was at the start of the
// line or the start of the file and it is followed by
// either a \n or a \r\n, then this is the last token of the
// file.
if (
((parser->current.end - parser->current.start) == 7) &&
current_token_starts_line(parser) &&
(memcmp(parser->current.start, "__END__", 7) == 0) &&
(parser->current.end == parser->end || match_eol(parser))
)
{
// Since we know we're about to add an __END__ comment, we know we
// need to add all of the newlines to get the correct column
// information for it.
) {
// Since we know we're about to add an __END__ comment,
// we know we need to add all of the newlines to get the
// correct column information for it.
const uint8_t *cursor = parser->current.end;
while ((cursor = next_newline(cursor, parser->end - cursor)) != NULL) {
pm_newline_list_append(&parser->newline_list, cursor++);
@ -18006,22 +18077,39 @@ parse_call_operator_write(pm_parser_t *parser, pm_call_node_t *call_node, const
}
}
/**
* Returns true if the name of the capture group is a valid local variable that
* can be written to.
*/
static bool
name_is_identifier(pm_parser_t *parser, const uint8_t *source, size_t length) {
parse_regular_expression_named_capture(pm_parser_t *parser, const uint8_t *source, size_t length) {
if (length == 0) {
return false;
}
// First ensure that it starts with a valid identifier starting character.
size_t width = char_is_identifier_start(parser, source);
if (!width) {
return false;
}
uint8_t *cursor = ((uint8_t *)source) + width;
// Next, ensure that it's not an uppercase character.
if (parser->encoding_changed) {
if (parser->encoding->isupper_char(source, (ptrdiff_t) length)) return false;
} else {
if (pm_encoding_utf_8_isupper_char(source, (ptrdiff_t) length)) return false;
}
// Next, iterate through all of the bytes of the string to ensure that they
// are all valid identifier characters.
const uint8_t *cursor = source + width;
while (cursor < source + length && (width = char_is_identifier(parser, cursor))) {
cursor += width;
}
// Finally, validate that the identifier is not a keyword.
if (pm_local_is_keyword((const char *) source, length)) return false;
return cursor == source + length;
}
@ -18051,7 +18139,7 @@ parse_regular_expression_named_captures(pm_parser_t *parser, const pm_string_t *
// If the name of the capture group isn't a valid identifier, we do
// not add it to the local table.
if (!name_is_identifier(parser, source, length)) continue;
if (!parse_regular_expression_named_capture(parser, source, length)) continue;
if (content->type == PM_STRING_SHARED) {
// If the unescaped string is a slice of the source, then we can

View File

@ -38,3 +38,7 @@ b>)/ =~ ""; ab
a = 1
tap { /(?<a>)/ =~ to_s }
/(?<foo>)/ =~ ""
/(?<Foo>)/ =~ ""
/(?<nil>)/ =~ ""

View File

@ -1,8 +1,8 @@
@ ProgramNode (location: (1,0)-(40,24))
@ ProgramNode (location: (1,0)-(44,16))
├── locals: [:foo, :ab, :abc, :a]
└── statements:
@ StatementsNode (location: (1,0)-(40,24))
└── body: (length: 21)
@ StatementsNode (location: (1,0)-(44,16))
└── body: (length: 24)
├── @ CallNode (location: (1,0)-(1,9))
│ ├── flags: ignore_visibility
│ ├── receiver: ∅
@ -316,56 +316,137 @@
│ │ ├── flags: decimal
│ │ └── value: 1
│ └── operator_loc: (39,2)-(39,3) = "="
└── @ CallNode (location: (40,0)-(40,24))
├── flags: ignore_visibility
├── receiver: ∅
├── @ CallNode (location: (40,0)-(40,24))
│ ├── flags: ignore_visibility
│ ├── receiver: ∅
│ ├── call_operator_loc: ∅
│ ├── name: :tap
│ ├── message_loc: (40,0)-(40,3) = "tap"
│ ├── opening_loc: ∅
│ ├── arguments: ∅
│ ├── closing_loc: ∅
│ └── block:
│ @ BlockNode (location: (40,4)-(40,24))
│ ├── locals: []
│ ├── parameters: ∅
│ ├── body:
│ │ @ StatementsNode (location: (40,6)-(40,22))
│ │ └── body: (length: 1)
│ │ └── @ MatchWriteNode (location: (40,6)-(40,22))
│ │ ├── call:
│ │ │ @ CallNode (location: (40,6)-(40,22))
│ │ │ ├── flags: ∅
│ │ │ ├── receiver:
│ │ │ │ @ RegularExpressionNode (location: (40,6)-(40,14))
│ │ │ │ ├── flags: forced_us_ascii_encoding
│ │ │ │ ├── opening_loc: (40,6)-(40,7) = "/"
│ │ │ │ ├── content_loc: (40,7)-(40,13) = "(?<a>)"
│ │ │ │ ├── closing_loc: (40,13)-(40,14) = "/"
│ │ │ │ └── unescaped: "(?<a>)"
│ │ │ ├── call_operator_loc: ∅
│ │ │ ├── name: :=~
│ │ │ ├── message_loc: (40,15)-(40,17) = "=~"
│ │ │ ├── opening_loc: ∅
│ │ │ ├── arguments:
│ │ │ │ @ ArgumentsNode (location: (40,18)-(40,22))
│ │ │ │ ├── flags: ∅
│ │ │ │ └── arguments: (length: 1)
│ │ │ │ └── @ CallNode (location: (40,18)-(40,22))
│ │ │ │ ├── flags: variable_call, ignore_visibility
│ │ │ │ ├── receiver: ∅
│ │ │ │ ├── call_operator_loc: ∅
│ │ │ │ ├── name: :to_s
│ │ │ │ ├── message_loc: (40,18)-(40,22) = "to_s"
│ │ │ │ ├── opening_loc: ∅
│ │ │ │ ├── arguments: ∅
│ │ │ │ ├── closing_loc: ∅
│ │ │ │ └── block: ∅
│ │ │ ├── closing_loc: ∅
│ │ │ └── block: ∅
│ │ └── targets: (length: 1)
│ │ └── @ LocalVariableTargetNode (location: (40,10)-(40,11))
│ │ ├── name: :a
│ │ └── depth: 1
│ ├── opening_loc: (40,4)-(40,5) = "{"
│ └── closing_loc: (40,23)-(40,24) = "}"
├── @ MatchWriteNode (location: (42,0)-(42,16))
│ ├── call:
│ │ @ CallNode (location: (42,0)-(42,16))
│ │ ├── flags: ∅
│ │ ├── receiver:
│ │ │ @ RegularExpressionNode (location: (42,0)-(42,10))
│ │ │ ├── flags: forced_us_ascii_encoding
│ │ │ ├── opening_loc: (42,0)-(42,1) = "/"
│ │ │ ├── content_loc: (42,1)-(42,9) = "(?<foo>)"
│ │ │ ├── closing_loc: (42,9)-(42,10) = "/"
│ │ │ └── unescaped: "(?<foo>)"
│ │ ├── call_operator_loc: ∅
│ │ ├── name: :=~
│ │ ├── message_loc: (42,11)-(42,13) = "=~"
│ │ ├── opening_loc: ∅
│ │ ├── arguments:
│ │ │ @ ArgumentsNode (location: (42,14)-(42,16))
│ │ │ ├── flags: ∅
│ │ │ └── arguments: (length: 1)
│ │ │ └── @ StringNode (location: (42,14)-(42,16))
│ │ │ ├── flags: ∅
│ │ │ ├── opening_loc: (42,14)-(42,15) = "\""
│ │ │ ├── content_loc: (42,15)-(42,15) = ""
│ │ │ ├── closing_loc: (42,15)-(42,16) = "\""
│ │ │ └── unescaped: ""
│ │ ├── closing_loc: ∅
│ │ └── block: ∅
│ └── targets: (length: 1)
│ └── @ LocalVariableTargetNode (location: (42,4)-(42,7))
│ ├── name: :foo
│ └── depth: 0
├── @ CallNode (location: (43,0)-(43,16))
│ ├── flags: ∅
│ ├── receiver:
│ │ @ RegularExpressionNode (location: (43,0)-(43,10))
│ │ ├── flags: forced_us_ascii_encoding
│ │ ├── opening_loc: (43,0)-(43,1) = "/"
│ │ ├── content_loc: (43,1)-(43,9) = "(?<Foo>)"
│ │ ├── closing_loc: (43,9)-(43,10) = "/"
│ │ └── unescaped: "(?<Foo>)"
│ ├── call_operator_loc: ∅
│ ├── name: :=~
│ ├── message_loc: (43,11)-(43,13) = "=~"
│ ├── opening_loc: ∅
│ ├── arguments:
│ │ @ ArgumentsNode (location: (43,14)-(43,16))
│ │ ├── flags: ∅
│ │ └── arguments: (length: 1)
│ │ └── @ StringNode (location: (43,14)-(43,16))
│ │ ├── flags: ∅
│ │ ├── opening_loc: (43,14)-(43,15) = "\""
│ │ ├── content_loc: (43,15)-(43,15) = ""
│ │ ├── closing_loc: (43,15)-(43,16) = "\""
│ │ └── unescaped: ""
│ ├── closing_loc: ∅
│ └── block: ∅
└── @ CallNode (location: (44,0)-(44,16))
├── flags: ∅
├── receiver:
│ @ RegularExpressionNode (location: (44,0)-(44,10))
│ ├── flags: forced_us_ascii_encoding
│ ├── opening_loc: (44,0)-(44,1) = "/"
│ ├── content_loc: (44,1)-(44,9) = "(?<nil>)"
│ ├── closing_loc: (44,9)-(44,10) = "/"
│ └── unescaped: "(?<nil>)"
├── call_operator_loc: ∅
├── name: :tap
├── message_loc: (40,0)-(40,3) = "tap"
├── name: :=~
├── message_loc: (44,11)-(44,13) = "=~"
├── opening_loc: ∅
├── arguments: ∅
├── arguments:
│ @ ArgumentsNode (location: (44,14)-(44,16))
│ ├── flags: ∅
│ └── arguments: (length: 1)
│ └── @ StringNode (location: (44,14)-(44,16))
│ ├── flags: ∅
│ ├── opening_loc: (44,14)-(44,15) = "\""
│ ├── content_loc: (44,15)-(44,15) = ""
│ ├── closing_loc: (44,15)-(44,16) = "\""
│ └── unescaped: ""
├── closing_loc: ∅
└── block:
@ BlockNode (location: (40,4)-(40,24))
├── locals: []
├── parameters: ∅
├── body:
│ @ StatementsNode (location: (40,6)-(40,22))
│ └── body: (length: 1)
│ └── @ MatchWriteNode (location: (40,6)-(40,22))
│ ├── call:
│ │ @ CallNode (location: (40,6)-(40,22))
│ │ ├── flags: ∅
│ │ ├── receiver:
│ │ │ @ RegularExpressionNode (location: (40,6)-(40,14))
│ │ │ ├── flags: forced_us_ascii_encoding
│ │ │ ├── opening_loc: (40,6)-(40,7) = "/"
│ │ │ ├── content_loc: (40,7)-(40,13) = "(?<a>)"
│ │ │ ├── closing_loc: (40,13)-(40,14) = "/"
│ │ │ └── unescaped: "(?<a>)"
│ │ ├── call_operator_loc: ∅
│ │ ├── name: :=~
│ │ ├── message_loc: (40,15)-(40,17) = "=~"
│ │ ├── opening_loc: ∅
│ │ ├── arguments:
│ │ │ @ ArgumentsNode (location: (40,18)-(40,22))
│ │ │ ├── flags: ∅
│ │ │ └── arguments: (length: 1)
│ │ │ └── @ CallNode (location: (40,18)-(40,22))
│ │ │ ├── flags: variable_call, ignore_visibility
│ │ │ ├── receiver: ∅
│ │ │ ├── call_operator_loc: ∅
│ │ │ ├── name: :to_s
│ │ │ ├── message_loc: (40,18)-(40,22) = "to_s"
│ │ │ ├── opening_loc: ∅
│ │ │ ├── arguments: ∅
│ │ │ ├── closing_loc: ∅
│ │ │ └── block: ∅
│ │ ├── closing_loc: ∅
│ │ └── block: ∅
│ └── targets: (length: 1)
│ └── @ LocalVariableTargetNode (location: (40,10)-(40,11))
│ ├── name: :a
│ └── depth: 1
├── opening_loc: (40,4)-(40,5) = "{"
└── closing_loc: (40,23)-(40,24) = "}"
└── block: ∅