diff --git a/parse.y b/parse.y index d312797b98..437f4d2845 100644 --- a/parse.y +++ b/parse.y @@ -1512,6 +1512,7 @@ YYLTYPE *rb_parser_set_location(struct parser_params *p, YYLTYPE *yylloc); void ruby_show_error_line(struct parser_params *p, VALUE errbuf, const YYLTYPE *yylloc, int lineno, rb_parser_string_t *str); RUBY_SYMBOL_EXPORT_END +static void flush_string_content(struct parser_params *p, rb_encoding *enc, size_t back); static void error_duplicate_pattern_variable(struct parser_params *p, ID id, const YYLTYPE *loc); static void error_duplicate_pattern_key(struct parser_params *p, ID id, const YYLTYPE *loc); static VALUE formal_argument_error(struct parser_params*, ID); @@ -7885,6 +7886,7 @@ tok_hex(struct parser_params *p, size_t *numlen) c = (int)ruby_scan_hex(p->lex.pcur, 2, numlen); if (!*numlen) { + flush_string_content(p, p->enc, rb_strlen_lit("\\x")); yyerror0("invalid hex escape"); dispatch_scan_event(p, tSTRING_CONTENT); return 0; @@ -7927,27 +7929,33 @@ escaped_control_code(int c) static int tokadd_codepoint(struct parser_params *p, rb_encoding **encp, - int regexp_literal, int wide) + int regexp_literal, const char *begin) { + const int wide = !begin; size_t numlen; int codepoint = (int)ruby_scan_hex(p->lex.pcur, wide ? p->lex.pend - p->lex.pcur : 4, &numlen); + p->lex.pcur += numlen; if (p->lex.strterm == NULL || strterm_is_heredoc(p->lex.strterm) || (p->lex.strterm->u.literal.func != str_regexp)) { + if (!begin) begin = p->lex.pcur; if (wide ? (numlen == 0 || numlen > 6) : (numlen < 4)) { - literal_flush(p, p->lex.pcur); + flush_string_content(p, rb_utf8_encoding(), p->lex.pcur - begin); yyerror0("invalid Unicode escape"); + dispatch_scan_event(p, tSTRING_CONTENT); return wide && numlen > 0; } if (codepoint > 0x10ffff) { - literal_flush(p, p->lex.pcur); + flush_string_content(p, rb_utf8_encoding(), p->lex.pcur - begin); yyerror0("invalid Unicode codepoint (too large)"); + dispatch_scan_event(p, tSTRING_CONTENT); return wide; } if ((codepoint & 0xfffff800) == 0xd800) { - literal_flush(p, p->lex.pcur); + flush_string_content(p, rb_utf8_encoding(), p->lex.pcur - begin); yyerror0("invalid Unicode codepoint"); + dispatch_scan_event(p, tSTRING_CONTENT); return wide; } } @@ -8035,7 +8043,7 @@ tokadd_utf8(struct parser_params *p, rb_encoding **encp, if (second == multiple_codepoints) second = p->lex.pcur; if (regexp_literal) tokadd(p, last); - if (!tokadd_codepoint(p, encp, regexp_literal, TRUE)) { + if (!tokadd_codepoint(p, encp, regexp_literal, NULL)) { break; } while (ISSPACE(c = peekc(p))) { @@ -8048,8 +8056,9 @@ tokadd_utf8(struct parser_params *p, rb_encoding **encp, if (c != close_brace) { unterminated: - token_flush(p); + flush_string_content(p, rb_utf8_encoding(), 0); yyerror0("unterminated Unicode escape"); + dispatch_scan_event(p, tSTRING_CONTENT); return; } if (second && second != multiple_codepoints) { @@ -8067,7 +8076,7 @@ tokadd_utf8(struct parser_params *p, rb_encoding **encp, } } else { /* handle \uxxxx form */ - if (!tokadd_codepoint(p, encp, regexp_literal, FALSE)) { + if (!tokadd_codepoint(p, encp, regexp_literal, p->lex.pcur - rb_strlen_lit("\\u"))) { token_flush(p); return; } @@ -8078,7 +8087,7 @@ tokadd_utf8(struct parser_params *p, rb_encoding **encp, #define ESCAPE_META 2 static int -read_escape(struct parser_params *p, int flags) +read_escape(struct parser_params *p, int flags, const char *begin) { int c; size_t numlen; @@ -8137,7 +8146,7 @@ read_escape(struct parser_params *p, int flags) nextc(p); goto eof; } - return read_escape(p, flags|ESCAPE_META) | 0x80; + return read_escape(p, flags|ESCAPE_META, begin) | 0x80; } else if (c == -1) goto eof; else if (!ISASCII(c)) { @@ -8170,7 +8179,7 @@ read_escape(struct parser_params *p, int flags) nextc(p); goto eof; } - c = read_escape(p, flags|ESCAPE_CONTROL); + c = read_escape(p, flags|ESCAPE_CONTROL, begin); } else if (c == '?') return 0177; @@ -8205,6 +8214,7 @@ read_escape(struct parser_params *p, int flags) eof: case -1: + flush_string_content(p, p->enc, p->lex.pcur - begin); yyerror0("Invalid escape character syntax"); dispatch_scan_event(p, tSTRING_CONTENT); return '\0'; @@ -8226,6 +8236,7 @@ tokadd_escape(struct parser_params *p) { int c; size_t numlen; + const char *begin = p->lex.pcur; switch (c = nextc(p)) { case '\n': @@ -8251,6 +8262,7 @@ tokadd_escape(struct parser_params *p) eof: case -1: + flush_string_content(p, p->enc, p->lex.pcur - begin); yyerror0("Invalid escape character syntax"); token_flush(p); return -1; @@ -8521,7 +8533,7 @@ tokadd_string(struct parser_params *p, case 'C': case 'M': { pushback(p, c); - c = read_escape(p, 0); + c = read_escape(p, 0, p->lex.pcur - 1); char *t = tokspace(p, rb_strlen_lit("\\x00")); *t++ = '\\'; @@ -8547,7 +8559,7 @@ tokadd_string(struct parser_params *p, else if (func & STR_FUNC_EXPAND) { pushback(p, c); if (func & STR_FUNC_ESCAPE) tokadd(p, '\\'); - c = read_escape(p, 0); + c = read_escape(p, 0, p->lex.pcur - 1); } else if ((func & STR_FUNC_QWORDS) && ISSPACE(c)) { /* ignore backslashed spaces in %w */ @@ -8597,8 +8609,9 @@ tokadd_string(struct parser_params *p, #define NEW_STRTERM(func, term, paren) new_strterm(p, func, term, paren) static void -flush_string_content(struct parser_params *p, rb_encoding *enc) +flush_string_content(struct parser_params *p, rb_encoding *enc, size_t back) { + p->lex.pcur -= back; if (has_delayed_token(p)) { ptrdiff_t len = p->lex.pcur - p->lex.ptok; if (len > 0) { @@ -8610,6 +8623,7 @@ flush_string_content(struct parser_params *p, rb_encoding *enc) p->lex.ptok = p->lex.pcur; } dispatch_scan_event(p, tSTRING_CONTENT); + p->lex.pcur += back; } /* this can be shared with ripper, since it's independent from struct @@ -8777,7 +8791,7 @@ parse_string(struct parser_params *p, rb_strterm_literal_t *quote) tokfix(p); lit = STR_NEW3(tok(p), toklen(p), enc, func); set_yylval_str(lit); - flush_string_content(p, enc); + flush_string_content(p, enc, 0); return tSTRING_CONTENT; } @@ -9246,7 +9260,7 @@ here_document(struct parser_params *p, rb_strterm_heredoc_t *here) #ifndef RIPPER if (bol) nd_set_fl_newline(yylval.node); #endif - flush_string_content(p, enc); + flush_string_content(p, enc, 0); return tSTRING_CONTENT; } tokadd(p, nextc(p)); @@ -10064,7 +10078,7 @@ parse_qmark(struct parser_params *p, int space_seen) if (tokadd_mbchar(p, c) == -1) return 0; } else { - c = read_escape(p, 0); + c = read_escape(p, 0, p->lex.pcur - rb_strlen_lit("?\\")); tokadd(p, c); } } diff --git a/test/ripper/test_lexer.rb b/test/ripper/test_lexer.rb index 64b4336375..e460a7fd13 100644 --- a/test/ripper/test_lexer.rb +++ b/test/ripper/test_lexer.rb @@ -390,6 +390,115 @@ world" assert_lexer(expected, code) end + def test_invalid_escape_string + code = "\"hello\\x world" + expected = [ + [[1, 0], :on_tstring_beg, "\"", state(:EXPR_BEG)], + [[1, 1], :on_tstring_content, "hello", state(:EXPR_BEG)], + [[1, 5], :on_tstring_content, "\\x", state(:EXPR_BEG)], + [[1, 7], :on_tstring_content, " world", state(:EXPR_BEG)], + ] + + code = "\"\nhello\\x world" + expected = [ + [[1, 0], :on_tstring_beg, "\"", state(:EXPR_BEG)], + [[1, 1], :on_tstring_content, "\n" "hello", state(:EXPR_BEG)], + [[2, 5], :on_tstring_content, "\\x", state(:EXPR_BEG)], + [[2, 7], :on_tstring_content, " world", state(:EXPR_BEG)], + ] + assert_lexer(expected, code) + + code = "\"\n\\Cxx\"" + expected = [ + [[1, 0], :on_tstring_beg, "\"", state(:EXPR_BEG)], + [[1, 1], :on_tstring_content, "\n", state(:EXPR_BEG)], + [[2, 0], :on_tstring_content, "\\Cx", state(:EXPR_BEG)], + [[2, 3], :on_tstring_content, "x", state(:EXPR_BEG)], + [[2, 4], :on_tstring_end, "\"", state(:EXPR_END)], + ] + assert_lexer(expected, code) + + code = "\"\n\\Mxx\"" + expected = [ + [[1, 0], :on_tstring_beg, "\"", state(:EXPR_BEG)], + [[1, 1], :on_tstring_content, "\n", state(:EXPR_BEG)], + [[2, 0], :on_tstring_content, "\\Mx", state(:EXPR_BEG)], + [[2, 3], :on_tstring_content, "x", state(:EXPR_BEG)], + [[2, 4], :on_tstring_end, "\"", state(:EXPR_END)], + ] + assert_lexer(expected, code) + + code = "\"\n\\c\\cx\"" + expected = [ + [[1, 0], :on_tstring_beg, "\"", state(:EXPR_BEG)], + [[1, 1], :on_tstring_content, "\n", state(:EXPR_BEG)], + [[2, 0], :on_tstring_content, "\\c\\c", state(:EXPR_BEG)], + [[2, 4], :on_tstring_content, "x", state(:EXPR_BEG)], + [[2, 5], :on_tstring_end, "\"", state(:EXPR_END)], + ] + assert_lexer(expected, code) + + code = "\"\n\\ux\"" + expected = [ + [[1, 0], :on_tstring_beg, "\"", state(:EXPR_BEG)], + [[1, 1], :on_tstring_content, "\n", state(:EXPR_BEG)], + [[2, 0], :on_tstring_content, "\\u", state(:EXPR_BEG)], + [[2, 2], :on_tstring_content, "x", state(:EXPR_BEG)], + [[2, 3], :on_tstring_end, "\"", state(:EXPR_END)], + ] + assert_lexer(expected, code) + + code = "\"\n\\xx\"" + expected = [ + [[1, 0], :on_tstring_beg, "\"", state(:EXPR_BEG)], + [[1, 1], :on_tstring_content, "\n", state(:EXPR_BEG)], + [[2, 0], :on_tstring_content, "\\x", state(:EXPR_BEG)], + [[2, 2], :on_tstring_content, "x", state(:EXPR_BEG)], + [[2, 3], :on_tstring_end, "\"", state(:EXPR_END)], + ] + assert_lexer(expected, code) + + code = "<