diff --git a/prism/prism.c b/prism/prism.c index d9c75c31be..152e42a574 100644 --- a/prism/prism.c +++ b/prism/prism.c @@ -19173,6 +19173,14 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power, b pm_token_t opening = not_provided(parser); pm_token_t closing = not_provided(parser); pm_node_t *part = (pm_node_t *) pm_string_node_create_unescaped(parser, &opening, &parser->previous, &closing, &unescaped); + + if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY) { + // This is extremely strange, but the first string part of a + // regular expression will always be tagged as binary if we + // are in a US-ASCII file, no matter its contents. + pm_node_flag_set(part, PM_STRING_FLAGS_FORCED_BINARY_ENCODING); + } + pm_interpolated_regular_expression_node_append(interpolated, part); } else { // If the first part of the body of the regular expression is not a diff --git a/prism_compile.c b/prism_compile.c index f05f92f4ee..137ddc1e72 100644 --- a/prism_compile.c +++ b/prism_compile.c @@ -363,18 +363,34 @@ parse_regexp_error(rb_iseq_t *iseq, int32_t line_number, const char *fmt, ...) } static VALUE -parse_regexp_string_part(rb_iseq_t *iseq, const pm_scope_node_t *scope_node, const pm_node_t *node, const pm_string_t *unescaped, rb_encoding *regexp_encoding) +parse_regexp_string_part(rb_iseq_t *iseq, const pm_scope_node_t *scope_node, const pm_node_t *node, const pm_string_t *unescaped, rb_encoding *implicit_regexp_encoding, rb_encoding *explicit_regexp_encoding) { // If we were passed an explicit regexp encoding, then we need to double // check that it's okay here for this fragment of the string. - VALUE string = rb_enc_str_new((const char *) pm_string_source(unescaped), pm_string_length(unescaped), regexp_encoding); + rb_encoding *encoding; + + if (explicit_regexp_encoding != NULL) { + encoding = explicit_regexp_encoding; + } + else if (node->flags & PM_STRING_FLAGS_FORCED_BINARY_ENCODING) { + encoding = rb_ascii8bit_encoding(); + } + else if (node->flags & PM_STRING_FLAGS_FORCED_UTF8_ENCODING) { + encoding = rb_utf8_encoding(); + } + else { + encoding = implicit_regexp_encoding; + } + + VALUE string = rb_enc_str_new((const char *) pm_string_source(unescaped), pm_string_length(unescaped), encoding); VALUE error = rb_reg_check_preprocess(string); + if (error != Qnil) parse_regexp_error(iseq, pm_node_line_number(scope_node->parser, node), "%" PRIsVALUE, rb_obj_as_string(error)); return string; } static VALUE -pm_static_literal_concat(rb_iseq_t *iseq, const pm_node_list_t *nodes, const pm_scope_node_t *scope_node, rb_encoding *regexp_encoding, bool top) +pm_static_literal_concat(rb_iseq_t *iseq, const pm_node_list_t *nodes, const pm_scope_node_t *scope_node, rb_encoding *implicit_regexp_encoding, rb_encoding *explicit_regexp_encoding, bool top) { VALUE current = Qnil; @@ -384,9 +400,9 @@ pm_static_literal_concat(rb_iseq_t *iseq, const pm_node_list_t *nodes, const pm_ switch (PM_NODE_TYPE(part)) { case PM_STRING_NODE: - if (regexp_encoding != NULL) { + if (implicit_regexp_encoding != NULL) { if (top) { - string = parse_regexp_string_part(iseq, scope_node, part, &((const pm_string_node_t *) part)->unescaped, regexp_encoding); + string = parse_regexp_string_part(iseq, scope_node, part, &((const pm_string_node_t *) part)->unescaped, implicit_regexp_encoding, explicit_regexp_encoding); } else { string = parse_string_encoded(part, &((const pm_string_node_t *) part)->unescaped, scope_node->encoding); @@ -399,11 +415,11 @@ pm_static_literal_concat(rb_iseq_t *iseq, const pm_node_list_t *nodes, const pm_ } break; case PM_INTERPOLATED_STRING_NODE: - string = pm_static_literal_concat(iseq, &((const pm_interpolated_string_node_t *) part)->parts, scope_node, regexp_encoding, false); + string = pm_static_literal_concat(iseq, &((const pm_interpolated_string_node_t *) part)->parts, scope_node, implicit_regexp_encoding, explicit_regexp_encoding, false); break; case PM_EMBEDDED_STATEMENTS_NODE: { const pm_embedded_statements_node_t *cast = (const pm_embedded_statements_node_t *) part; - string = pm_static_literal_concat(iseq, &cast->statements->body, scope_node, regexp_encoding, false); + string = pm_static_literal_concat(iseq, &cast->statements->body, scope_node, implicit_regexp_encoding, explicit_regexp_encoding, false); break; } default: @@ -499,7 +515,7 @@ parse_regexp_encoding(const pm_scope_node_t *scope_node, const pm_node_t *node) return rb_enc_get_from_index(ENCINDEX_Windows_31J); } else { - return scope_node->encoding; + return NULL; } } @@ -527,6 +543,8 @@ static inline VALUE parse_regexp_literal(rb_iseq_t *iseq, const pm_scope_node_t *scope_node, const pm_node_t *node, const pm_string_t *unescaped) { rb_encoding *regexp_encoding = parse_regexp_encoding(scope_node, node); + if (regexp_encoding == NULL) regexp_encoding = scope_node->encoding; + VALUE string = rb_enc_str_new((const char *) pm_string_source(unescaped), pm_string_length(unescaped), regexp_encoding); return parse_regexp(iseq, scope_node, node, string); } @@ -534,15 +552,17 @@ parse_regexp_literal(rb_iseq_t *iseq, const pm_scope_node_t *scope_node, const p static inline VALUE parse_regexp_concat(rb_iseq_t *iseq, const pm_scope_node_t *scope_node, const pm_node_t *node, const pm_node_list_t *parts) { - rb_encoding *regexp_encoding = parse_regexp_encoding(scope_node, node); - VALUE string = pm_static_literal_concat(iseq, parts, scope_node, regexp_encoding, false); + rb_encoding *explicit_regexp_encoding = parse_regexp_encoding(scope_node, node); + rb_encoding *implicit_regexp_encoding = explicit_regexp_encoding != NULL ? explicit_regexp_encoding : scope_node->encoding; + + VALUE string = pm_static_literal_concat(iseq, parts, scope_node, implicit_regexp_encoding, explicit_regexp_encoding, false); return parse_regexp(iseq, scope_node, node, string); } static void pm_compile_node(rb_iseq_t *iseq, const pm_node_t *node, LINK_ANCHOR *const ret, bool popped, pm_scope_node_t *scope_node); static int -pm_interpolated_node_compile(rb_iseq_t *iseq, const pm_node_list_t *parts, const pm_line_column_t *node_location, LINK_ANCHOR *const ret, bool popped, pm_scope_node_t *scope_node, rb_encoding *regexp_encoding) +pm_interpolated_node_compile(rb_iseq_t *iseq, const pm_node_list_t *parts, const pm_line_column_t *node_location, LINK_ANCHOR *const ret, bool popped, pm_scope_node_t *scope_node, rb_encoding *implicit_regexp_encoding, rb_encoding *explicit_regexp_encoding) { int stack_size = 0; size_t parts_size = parts->size; @@ -558,11 +578,11 @@ pm_interpolated_node_compile(rb_iseq_t *iseq, const pm_node_list_t *parts, const const pm_string_node_t *string_node = (const pm_string_node_t *) part; VALUE string_value; - if (regexp_encoding == NULL) { + if (implicit_regexp_encoding == NULL) { string_value = parse_string_encoded(part, &string_node->unescaped, scope_node->encoding); } else { - string_value = parse_regexp_string_part(iseq, scope_node, (const pm_node_t *) string_node, &string_node->unescaped, regexp_encoding); + string_value = parse_regexp_string_part(iseq, scope_node, (const pm_node_t *) string_node, &string_node->unescaped, implicit_regexp_encoding, explicit_regexp_encoding); } if (RTEST(current_string)) { @@ -584,11 +604,11 @@ pm_interpolated_node_compile(rb_iseq_t *iseq, const pm_node_list_t *parts, const const pm_string_node_t *string_node = (const pm_string_node_t *) ((const pm_embedded_statements_node_t *) part)->statements->body.nodes[0]; VALUE string_value; - if (regexp_encoding == NULL) { + if (implicit_regexp_encoding == NULL) { string_value = parse_string_encoded(part, &string_node->unescaped, scope_node->encoding); } else { - string_value = parse_regexp_string_part(iseq, scope_node, (const pm_node_t *) string_node, &string_node->unescaped, regexp_encoding); + string_value = parse_regexp_string_part(iseq, scope_node, (const pm_node_t *) string_node, &string_node->unescaped, implicit_regexp_encoding, explicit_regexp_encoding); } if (RTEST(current_string)) { @@ -600,7 +620,24 @@ pm_interpolated_node_compile(rb_iseq_t *iseq, const pm_node_list_t *parts, const } else { if (!RTEST(current_string)) { - current_string = rb_enc_str_new(NULL, 0, regexp_encoding != NULL ? regexp_encoding : scope_node->encoding); + rb_encoding *encoding; + + if (implicit_regexp_encoding != NULL) { + if (explicit_regexp_encoding != NULL) { + encoding = explicit_regexp_encoding; + } + else if (scope_node->parser->encoding == PM_ENCODING_US_ASCII_ENTRY) { + encoding = rb_ascii8bit_encoding(); + } + else { + encoding = implicit_regexp_encoding; + } + } + else { + encoding = scope_node->encoding; + } + + current_string = rb_enc_str_new(NULL, 0, encoding); } PUSH_INSN1(ret, *node_location, putobject, rb_fstring(current_string)); @@ -639,9 +676,10 @@ pm_interpolated_node_compile(rb_iseq_t *iseq, const pm_node_list_t *parts, const static void pm_compile_regexp_dynamic(rb_iseq_t *iseq, const pm_node_t *node, const pm_node_list_t *parts, const pm_line_column_t *node_location, LINK_ANCHOR *const ret, bool popped, pm_scope_node_t *scope_node) { - rb_encoding *regexp_encoding = parse_regexp_encoding(scope_node, node); - int length = pm_interpolated_node_compile(iseq, parts, node_location, ret, popped, scope_node, regexp_encoding); + rb_encoding *explicit_regexp_encoding = parse_regexp_encoding(scope_node, node); + rb_encoding *implicit_regexp_encoding = explicit_regexp_encoding != NULL ? explicit_regexp_encoding : scope_node->encoding; + int length = pm_interpolated_node_compile(iseq, parts, node_location, ret, popped, scope_node, implicit_regexp_encoding, explicit_regexp_encoding); PUSH_INSN2(ret, *node_location, toregexp, INT2FIX(parse_regexp_flags(node) & 0xFF), INT2FIX(length)); } @@ -738,13 +776,13 @@ pm_static_literal_value(rb_iseq_t *iseq, const pm_node_t *node, const pm_scope_n return parse_regexp_concat(iseq, scope_node, (const pm_node_t *) cast, &cast->parts); } case PM_INTERPOLATED_STRING_NODE: { - VALUE string = pm_static_literal_concat(iseq, &((const pm_interpolated_string_node_t *) node)->parts, scope_node, NULL, false); + VALUE string = pm_static_literal_concat(iseq, &((const pm_interpolated_string_node_t *) node)->parts, scope_node, NULL, NULL, false); int line_number = pm_node_line_number(scope_node->parser, node); return pm_static_literal_string(iseq, string, line_number); } case PM_INTERPOLATED_SYMBOL_NODE: { const pm_interpolated_symbol_node_t *cast = (const pm_interpolated_symbol_node_t *) node; - VALUE string = pm_static_literal_concat(iseq, &cast->parts, scope_node, NULL, true); + VALUE string = pm_static_literal_concat(iseq, &cast->parts, scope_node, NULL, NULL, true); return ID2SYM(rb_intern_str(string)); } @@ -6524,7 +6562,7 @@ pm_compile_node(rb_iseq_t *iseq, const pm_node_t *node, LINK_ANCHOR *const ret, } else { const pm_interpolated_string_node_t *cast = (const pm_interpolated_string_node_t *) node; - int length = pm_interpolated_node_compile(iseq, &cast->parts, &location, ret, popped, scope_node, NULL); + int length = pm_interpolated_node_compile(iseq, &cast->parts, &location, ret, popped, scope_node, NULL, NULL); if (length > 1) PUSH_INSN1(ret, location, concatstrings, INT2FIX(length)); if (popped) PUSH_INSN(ret, location, pop); } @@ -6543,7 +6581,7 @@ pm_compile_node(rb_iseq_t *iseq, const pm_node_t *node, LINK_ANCHOR *const ret, } } else { - int length = pm_interpolated_node_compile(iseq, &cast->parts, &location, ret, popped, scope_node, NULL); + int length = pm_interpolated_node_compile(iseq, &cast->parts, &location, ret, popped, scope_node, NULL, NULL); if (length > 1) { PUSH_INSN1(ret, location, concatstrings, INT2FIX(length)); } @@ -6565,7 +6603,7 @@ pm_compile_node(rb_iseq_t *iseq, const pm_node_t *node, LINK_ANCHOR *const ret, PUSH_INSN(ret, location, putself); - int length = pm_interpolated_node_compile(iseq, &cast->parts, &location, ret, false, scope_node, NULL); + int length = pm_interpolated_node_compile(iseq, &cast->parts, &location, ret, false, scope_node, NULL, NULL); if (length > 1) PUSH_INSN1(ret, location, concatstrings, INT2FIX(length)); PUSH_SEND_WITH_FLAG(ret, location, idBackquote, INT2NUM(1), INT2FIX(VM_CALL_FCALL | VM_CALL_ARGS_SIMPLE)); diff --git a/test/.excludes-prism/TestM17N.rb b/test/.excludes-prism/TestM17N.rb index 25c4394634..e3c2d43b74 100644 --- a/test/.excludes-prism/TestM17N.rb +++ b/test/.excludes-prism/TestM17N.rb @@ -1,3 +1,2 @@ -exclude(:test_regexp_ascii, "https://github.com/ruby/prism/issues/2664") exclude(:test_regexp_usascii, "unknown") exclude(:test_string_mixed_unicode, "unknown")