From 27e13fbc58b02bc027de9c51c8963f57276ac7c1 Mon Sep 17 00:00:00 2001
From: Kevin Menard
Date: Mon, 10 Jun 2024 17:36:52 -0400
Subject: [PATCH] Add a fast path implementation for appending single byte values to binary strings.

The fast path also accepts US-ASCII receivers, because rb_str_concat
selects it from the encoding index computed for the append rather than
from the receiver's current encoding. Such a receiver is promoted to
ASCII-8BIT when the appended byte is non-ASCII.

Co-authored-by: Aaron Patterson
---
 string.c | 76 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 75 insertions(+), 1 deletion(-)

diff --git a/string.c b/string.c
index 59ddb6e741..49357987d0 100644
--- a/string.c
+++ b/string.c
@@ -3346,6 +3346,76 @@ rb_str_cat_cstr(VALUE str, const char *ptr)
     return rb_str_buf_cat(str, ptr, strlen(ptr));
 }
 
+/*
+ * Append a single byte to `str`, writing straight into the existing buffer when it has spare capacity and
+ * falling back to str_buf_cat() otherwise.
+ *
+ * `str` must be encoded as ASCII-8BIT or US-ASCII. rb_str_concat() dispatches here on the index returned by
+ * rb_ascii8bit_appendable_encoding_index() rather than on the receiver's current encoding, so a US-ASCII receiver
+ * can arrive with a non-ASCII byte (NOTE(review): confirm against that helper's definition); such a string is
+ * promoted to ASCII-8BIT here.
+ *
+ * Raises ArgumentError when the string cannot grow by one more byte.
+ */
+static void
+rb_str_buf_cat_byte(VALUE str, unsigned char byte)
+{
+    const int encidx = RB_ENCODING_GET_INLINED(str);
+    RUBY_ASSERT(encidx == ENCINDEX_ASCII_8BIT || encidx == ENCINDEX_US_ASCII);
+
+    // We can't write directly to shared strings without impacting others, so we must make the string independent.
+    if (UNLIKELY(!str_independent(str))) {
+        str_make_independent(str);
+    }
+
+    long string_length = -1;
+    const int null_terminator_length = 1;
+    char *sptr;
+    RSTRING_GETMEM(str, sptr, string_length);
+
+    // Ensure the resulting string wouldn't be too long.
+    if (UNLIKELY(string_length > LONG_MAX - 1)) {
+        rb_raise(rb_eArgError, "string sizes too big");
+    }
+
+    long string_capacity = str_capacity(str, null_terminator_length);
+
+    // Get the code range before any modifications since those might clear the code range.
+    int cr = ENC_CODERANGE(str);
+
+    // Check if the string has spare string_capacity to write the new byte.
+    if (LIKELY(string_capacity >= string_length + 1)) {
+        // In fast path we can write the new byte and note the string's new length.
+        sptr[string_length] = byte;
+        STR_SET_LEN(str, string_length + 1);
+        TERM_FILL(sptr + string_length + 1, null_terminator_length);
+    }
+    else {
+        // If there's not enough string_capacity, make a call into the general string concatenation function.
+        char buf[1] = {byte};
+        str_buf_cat(str, buf, 1);
+    }
+
+    // A US-ASCII string must not keep its encoding once it holds a non-ASCII byte, so promote it to ASCII-8BIT.
+    // This is done before the code range is set below so that the value written there is the final one.
+    if (UNLIKELY(encidx == ENCINDEX_US_ASCII && !ISASCII(byte))) {
+        rb_enc_associate_index(str, ENCINDEX_ASCII_8BIT);
+    }
+
+    // If the code range is already known, we can derive the resulting code range cheaply by looking at the byte we
+    // just appended. If the code range is unknown, but the string was empty, then we can also derive the code range
+    // by looking at the byte we just appended. Otherwise, we'd have to scan the bytes to determine the code range so
+    // we leave it as unknown. It cannot be broken for binary strings so we don't need to handle that option.
+    if (cr == ENC_CODERANGE_7BIT || string_length == 0) {
+        if (ISASCII(byte)) {
+            ENC_CODERANGE_SET(str, ENC_CODERANGE_7BIT);
+        }
+        else {
+            ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
+        }
+    }
+}
+
 RUBY_ALIAS_FUNCTION(rb_str_buf_cat(VALUE str, const char *ptr, long len), rb_str_cat, (str, ptr, len))
 RUBY_ALIAS_FUNCTION(rb_str_buf_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
 RUBY_ALIAS_FUNCTION(rb_str_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
@@ -3634,7 +3704,11 @@ rb_str_concat(VALUE str1, VALUE str2)
     }
 
     encidx = rb_ascii8bit_appendable_encoding_index(enc, code);
-    if (encidx >= 0) {
+
+    if (encidx == ENCINDEX_ASCII_8BIT) {
+        rb_str_buf_cat_byte(str1, (unsigned char)code);
+    }
+    else if (encidx >= 0) {
         char buf[1];
         buf[0] = (char)code;
         rb_str_cat(str1, buf, 1);