From 27e13fbc58b02bc027de9c51c8963f57276ac7c1 Mon Sep 17 00:00:00 2001
From: Kevin Menard <kevin@nirvdrum.com>
Date: Mon, 10 Jun 2024 17:36:52 -0400
Subject: [PATCH] Add a fast path implementation for appending single byte
 values to binary strings.

Co-authored-by: Aaron Patterson <tenderlove@ruby-lang.org>
---
 string.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 57 insertions(+), 1 deletion(-)

diff --git a/string.c b/string.c
index 59ddb6e741..49357987d0 100644
--- a/string.c
+++ b/string.c
@@ -3346,6 +3346,58 @@ rb_str_cat_cstr(VALUE str, const char *ptr)
     return rb_str_buf_cat(str, ptr, strlen(ptr));
 }
 
+// Appends a single byte to `str`, which must be ASCII-8BIT encoded (asserted below). Raises ArgumentError if the
+// string is already LONG_MAX bytes long. The cached code range is updated whenever that is possible without a scan.
+static void
+rb_str_buf_cat_byte(VALUE str, unsigned char byte)
+{
+    RUBY_ASSERT(RB_ENCODING_GET_INLINED(str) == ENCINDEX_ASCII_8BIT);
+
+    // We can't write directly to shared strings without impacting others, so we must make the string independent.
+    if (UNLIKELY(!str_independent(str))) {
+        str_make_independent(str);
+    }
+
+    long string_length = -1;
+    const int null_terminator_length = 1;
+    char *sptr;
+    RSTRING_GETMEM(str, sptr, string_length);
+
+    // Ensure the resulting string wouldn't be too long.
+    if (UNLIKELY(string_length > LONG_MAX - 1)) {
+        rb_raise(rb_eArgError, "string sizes too big");
+    }
+
+    long string_capacity = str_capacity(str, null_terminator_length);
+
+    // Get the code range before any modifications since those might clear the code range.
+    int cr = ENC_CODERANGE(str);
+
+    // Check if the string has spare string_capacity to write the new byte.
+    if (LIKELY(string_capacity >= string_length + 1)) {
+        // In fast path we can write the new byte and note the string's new length.
+        sptr[string_length] = (char)byte;
+        STR_SET_LEN(str, string_length + 1);
+        TERM_FILL(sptr + string_length + 1, null_terminator_length);
+    }
+    else {
+        // If there's not enough string_capacity, make a call into the general string concatenation function.
+        char buf[1] = {(char)byte};
+        str_buf_cat(str, buf, 1);
+    }
+
+    // A 7-bit or empty string takes its code range from the appended byte alone. A string that was already valid
+    // (it holds a non-ASCII byte) stays valid whatever is appended, so restore that in case the append cleared it.
+    // An unknown code range on a non-empty string would need a scan, so it stays unknown. Binary strings are never
+    // broken, so that case needs no handling.
+    if (cr == ENC_CODERANGE_7BIT || string_length == 0) {
+        ENC_CODERANGE_SET(str, ISASCII(byte) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
+    }
+    else if (cr == ENC_CODERANGE_VALID) {
+        ENC_CODERANGE_SET(str, ENC_CODERANGE_VALID);
+    }
+}
+
 RUBY_ALIAS_FUNCTION(rb_str_buf_cat(VALUE str, const char *ptr, long len), rb_str_cat, (str, ptr, len))
 RUBY_ALIAS_FUNCTION(rb_str_buf_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
 RUBY_ALIAS_FUNCTION(rb_str_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
@@ -3634,7 +3686,11 @@ rb_str_concat(VALUE str1, VALUE str2)
     }
 
     encidx = rb_ascii8bit_appendable_encoding_index(enc, code);
-    if (encidx >= 0) {
+
+    if (encidx == ENCINDEX_ASCII_8BIT) {
+        rb_str_buf_cat_byte(str1, (unsigned char)code);
+    }
+    else if (encidx >= 0) {
         char buf[1];
         buf[0] = (char)code;
         rb_str_cat(str1, buf, 1);