[ruby/json] Refactor convert_UTF8_to_JSON to split searching and escaping code
The goal is to be able to dispatch to more optimized search implementations without having to duplicate the escaping code.

Somehow, this is a few % faster already:

```
== Encoding activitypub.json (52595 bytes)
ruby 3.4.1 (2024-12-25 revision https://github.com/ruby/json/commit/48d4efcb85) +YJIT +PRISM [arm64-darwin23]
Warming up --------------------------------------
               after     2.257k i/100ms
Calculating -------------------------------------
               after     22.930k (± 1.3%) i/s   (43.61 μs/i) -    115.107k in   5.020814s

Comparison:
              before:    21604.0 i/s
               after:    22930.1 i/s - 1.06x faster

== Encoding citm_catalog.json (500298 bytes)
ruby 3.4.1 (2024-12-25 revision https://github.com/ruby/json/commit/48d4efcb85) +YJIT +PRISM [arm64-darwin23]
Warming up --------------------------------------
               after   137.000 i/100ms
Calculating -------------------------------------
               after      1.397k (± 1.1%) i/s  (715.57 μs/i) -      6.987k in   5.000408s

Comparison:
              before:     1344.4 i/s
               after:     1397.5 i/s - 1.04x faster

== Encoding twitter.json (466906 bytes)
ruby 3.4.1 (2024-12-25 revision https://github.com/ruby/json/commit/48d4efcb85) +YJIT +PRISM [arm64-darwin23]
Warming up --------------------------------------
               after   249.000 i/100ms
Calculating -------------------------------------
               after      2.464k (± 1.8%) i/s  (405.81 μs/i) -     12.450k in   5.054131s

Comparison:
              before:     2326.5 i/s
               after:     2464.2 i/s - 1.06x faster
```

https://github.com/ruby/json/commit/8fb5ae807f
This commit is contained in:
parent
581d85058c
commit
98e1c2845a
@ -101,6 +101,7 @@ static void raise_generator_error(VALUE invalid_object, const char *fmt, ...)
|
|||||||
// 0 - single byte char that don't need to be escaped.
|
// 0 - single byte char that don't need to be escaped.
|
||||||
// (x | 8) - char that needs to be escaped.
|
// (x | 8) - char that needs to be escaped.
|
||||||
static const unsigned char CHAR_LENGTH_MASK = 7;
|
static const unsigned char CHAR_LENGTH_MASK = 7;
|
||||||
|
static const unsigned char ESCAPE_MASK = 8;
|
||||||
|
|
||||||
static const unsigned char escape_table[256] = {
|
static const unsigned char escape_table[256] = {
|
||||||
// ASCII Control Characters
|
// ASCII Control Characters
|
||||||
@ -165,6 +166,84 @@ static const unsigned char script_safe_escape_table[256] = {
|
|||||||
4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 9, 9,
|
4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 9, 9,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
typedef struct _search_state {
|
||||||
|
const char *ptr;
|
||||||
|
const char *end;
|
||||||
|
const char *cursor;
|
||||||
|
FBuffer *buffer;
|
||||||
|
} search_state;
|
||||||
|
|
||||||
|
/*
 * Append the pending input bytes [cursor, ptr) to the output buffer
 * unchanged, then move cursor up to ptr so they are not emitted twice.
 */
static inline void search_flush(search_state *search)
{
    const char *pending = search->cursor;

    fbuffer_append(search->buffer, pending, search->ptr - pending);
    search->cursor = search->ptr;
}
|
||||||
|
|
||||||
|
/*
 * Scan forward from search->ptr for the next character that has to be
 * escaped according to escape_table.
 *
 * Table entries are interpreted as follows:
 *   0                    - single byte that needs no escaping; skip it.
 *   n without ESCAPE_MASK - skip n bytes.
 *   n with ESCAPE_MASK    - candidate for escaping.
 *
 * Returns the length of the character to escape (the table entry masked
 * with CHAR_LENGTH_MASK) with search->ptr left pointing at it, or 0 once
 * the end of the input is reached. In both cases the bytes scanned so far
 * are flushed to the output buffer before returning.
 */
static inline unsigned char search_escape(search_state *search, const unsigned char escape_table[256])
{
    while (search->ptr < search->end) {
        const unsigned char *bytes = (const unsigned char *)search->ptr;
        const unsigned char entry = escape_table[bytes[0]];

        // Common case: a plain single byte.
        if (!RB_UNLIKELY(entry)) {
            search->ptr++;
            continue;
        }

        // Character that is passed through as-is: skip `entry` bytes.
        if (!(entry & ESCAPE_MASK)) {
            search->ptr += entry;
            continue;
        }

        // Entry 11 marks a three byte sequence that is only escaped when
        // its second byte is 0x80 and its third byte is 0xA8 or 0xA9
        // (emitted as \u2028 / \u2029 by the escaping code). Anything else
        // is skipped like an ordinary three byte character.
        if (RB_UNLIKELY(entry == 11)) {
            if (bytes[1] != 0x80 || (bytes[2] >> 1) != 0x54) {
                search->ptr += 3;
                continue;
            }
        }

        search_flush(search);
        return entry & CHAR_LENGTH_MASK;
    }

    // End of input: emit whatever is still pending.
    search_flush(search);
    return 0;
}
|
||||||
|
|
||||||
|
/*
 * Write the JSON escape for the character at search->ptr to the output
 * buffer, then advance both ptr and cursor past it.
 *
 * ch_len is the byte length of that character:
 *   1 - a single byte: one of the two-character escapes (\" \\ \/ \b \f
 *       \n \r \t) when one exists, otherwise \u00XX with lowercase hex.
 *   3 - a three byte sequence emitted as \u2029 when the low bit of its
 *       third byte is set and as \u2028 otherwise.
 * Any other ch_len emits nothing and only skips ch_len bytes.
 */
static inline void fast_escape_UTF8_char(search_state *search, unsigned char ch_len) {
    FBuffer *out = search->buffer;
    const unsigned char byte = (unsigned char)*search->ptr;

    if (ch_len == 1) {
        // Second character of the two-character escape; stays 0 when the
        // byte has no short form.
        char short_form[2] = { '\\', 0 };

        switch (byte) {
            case '"':  short_form[1] = '"';  break;
            case '\\': short_form[1] = '\\'; break;
            case '/':  short_form[1] = '/';  break;
            case '\b': short_form[1] = 'b';  break;
            case '\f': short_form[1] = 'f';  break;
            case '\n': short_form[1] = 'n';  break;
            case '\r': short_form[1] = 'r';  break;
            case '\t': short_form[1] = 't';  break;
            default: break;
        }

        if (short_form[1]) {
            fbuffer_append(out, short_form, 2);
        } else {
            static const char hex_digits[] = "0123456789abcdef";
            char unicode_form[6] = { '\\', 'u', '0', '0', 0, 0 };

            unicode_form[4] = hex_digits[(byte >> 4) & 0xf];
            unicode_form[5] = hex_digits[byte & 0xf];
            fbuffer_append(out, unicode_form, 6);
        }
    } else if (ch_len == 3) {
        // The low bit of the third byte selects between the two escapes.
        fbuffer_append(out, (search->ptr[2] & 1) ? "\\u2029" : "\\u2028", 6);
    }

    // The escaped character is consumed: nothing before the new ptr is
    // pending any more.
    search->cursor = (search->ptr += ch_len);
}
|
||||||
|
|
||||||
/* Converts in_string to a JSON string (without the wrapping '"'
|
/* Converts in_string to a JSON string (without the wrapping '"'
|
||||||
* characters) in FBuffer out_buffer.
|
* characters) in FBuffer out_buffer.
|
||||||
*
|
*
|
||||||
@ -181,182 +260,114 @@ static const unsigned char script_safe_escape_table[256] = {
|
|||||||
* Everything else (should be UTF-8) is just passed through and
|
* Everything else (should be UTF-8) is just passed through and
|
||||||
* appended to the result.
|
* appended to the result.
|
||||||
*/
|
*/
|
||||||
static inline void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const unsigned char escape_table[256])
|
static inline void convert_UTF8_to_JSON(search_state *search, const unsigned char escape_table[256])
|
||||||
{
|
{
|
||||||
const char *hexdig = "0123456789abcdef";
|
unsigned char ch_len;
|
||||||
char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' };
|
while ((ch_len = search_escape(search, escape_table))) {
|
||||||
|
fast_escape_UTF8_char(search, ch_len);
|
||||||
const char *ptr = RSTRING_PTR(str);
|
|
||||||
unsigned long len = RSTRING_LEN(str);
|
|
||||||
|
|
||||||
unsigned long beg = 0, pos = 0;
|
|
||||||
|
|
||||||
#define FLUSH_POS(bytes) if (pos > beg) { fbuffer_append(out_buffer, &ptr[beg], pos - beg); } pos += bytes; beg = pos;
|
|
||||||
|
|
||||||
while (pos < len) {
|
|
||||||
unsigned char ch = ptr[pos];
|
|
||||||
unsigned char ch_len = escape_table[ch];
|
|
||||||
/* JSON encoding */
|
|
||||||
|
|
||||||
if (RB_UNLIKELY(ch_len)) {
|
|
||||||
switch (ch_len) {
|
|
||||||
case 9: {
|
|
||||||
FLUSH_POS(1);
|
|
||||||
switch (ch) {
|
|
||||||
case '"': fbuffer_append(out_buffer, "\\\"", 2); break;
|
|
||||||
case '\\': fbuffer_append(out_buffer, "\\\\", 2); break;
|
|
||||||
case '/': fbuffer_append(out_buffer, "\\/", 2); break;
|
|
||||||
case '\b': fbuffer_append(out_buffer, "\\b", 2); break;
|
|
||||||
case '\f': fbuffer_append(out_buffer, "\\f", 2); break;
|
|
||||||
case '\n': fbuffer_append(out_buffer, "\\n", 2); break;
|
|
||||||
case '\r': fbuffer_append(out_buffer, "\\r", 2); break;
|
|
||||||
case '\t': fbuffer_append(out_buffer, "\\t", 2); break;
|
|
||||||
default: {
|
|
||||||
scratch[2] = '0';
|
|
||||||
scratch[3] = '0';
|
|
||||||
scratch[4] = hexdig[(ch >> 4) & 0xf];
|
|
||||||
scratch[5] = hexdig[ch & 0xf];
|
|
||||||
fbuffer_append(out_buffer, scratch, 6);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
case 11: {
|
|
||||||
unsigned char b2 = ptr[pos + 1];
|
|
||||||
if (RB_UNLIKELY(b2 == 0x80)) {
|
|
||||||
unsigned char b3 = ptr[pos + 2];
|
|
||||||
if (b3 == 0xA8) {
|
|
||||||
FLUSH_POS(3);
|
|
||||||
fbuffer_append(out_buffer, "\\u2028", 6);
|
|
||||||
break;
|
|
||||||
} else if (b3 == 0xA9) {
|
|
||||||
FLUSH_POS(3);
|
|
||||||
fbuffer_append(out_buffer, "\\u2029", 6);
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
ch_len = 3;
|
|
||||||
// fallthrough
|
|
||||||
}
|
|
||||||
default:
|
|
||||||
pos += ch_len;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
pos++;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
#undef FLUSH_POS
|
|
||||||
|
|
||||||
if (beg < len) {
|
|
||||||
fbuffer_append(out_buffer, &ptr[beg], len - beg);
|
|
||||||
}
|
|
||||||
|
|
||||||
RB_GC_GUARD(str);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
static void convert_UTF8_to_ASCII_only_JSON(FBuffer *out_buffer, VALUE str, const unsigned char escape_table[256])
|
static inline unsigned char search_ascii_only_escape(search_state *search, const unsigned char escape_table[256])
|
||||||
{
|
{
|
||||||
const char *hexdig = "0123456789abcdef";
|
while (search->ptr < search->end) {
|
||||||
char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' };
|
unsigned char ch = (unsigned char)*search->ptr;
|
||||||
|
|
||||||
const char *ptr = RSTRING_PTR(str);
|
|
||||||
unsigned long len = RSTRING_LEN(str);
|
|
||||||
|
|
||||||
unsigned long beg = 0, pos = 0;
|
|
||||||
|
|
||||||
#define FLUSH_POS(bytes) if (pos > beg) { fbuffer_append(out_buffer, &ptr[beg], pos - beg); } pos += bytes; beg = pos;
|
|
||||||
|
|
||||||
while (pos < len) {
|
|
||||||
unsigned char ch = ptr[pos];
|
|
||||||
unsigned char ch_len = escape_table[ch];
|
unsigned char ch_len = escape_table[ch];
|
||||||
|
|
||||||
if (RB_UNLIKELY(ch_len)) {
|
if (RB_UNLIKELY(ch_len)) {
|
||||||
switch (ch_len) {
|
search_flush(search);
|
||||||
case 9: {
|
return ch_len & CHAR_LENGTH_MASK;
|
||||||
FLUSH_POS(1);
|
} else {
|
||||||
switch (ch) {
|
search->ptr++;
|
||||||
case '"': fbuffer_append(out_buffer, "\\\"", 2); break;
|
}
|
||||||
case '\\': fbuffer_append(out_buffer, "\\\\", 2); break;
|
}
|
||||||
case '/': fbuffer_append(out_buffer, "\\/", 2); break;
|
search_flush(search);
|
||||||
case '\b': fbuffer_append(out_buffer, "\\b", 2); break;
|
return 0;
|
||||||
case '\f': fbuffer_append(out_buffer, "\\f", 2); break;
|
}
|
||||||
case '\n': fbuffer_append(out_buffer, "\\n", 2); break;
|
|
||||||
case '\r': fbuffer_append(out_buffer, "\\r", 2); break;
|
static inline void full_escape_UTF8_char(search_state *search, unsigned char ch_len) {
|
||||||
case '\t': fbuffer_append(out_buffer, "\\t", 2); break;
|
const unsigned char ch = (unsigned char)*search->ptr;
|
||||||
default: {
|
switch (ch_len) {
|
||||||
scratch[2] = '0';
|
case 1: {
|
||||||
scratch[3] = '0';
|
switch (ch) {
|
||||||
scratch[4] = hexdig[(ch >> 4) & 0xf];
|
case '"': fbuffer_append(search->buffer, "\\\"", 2); break;
|
||||||
scratch[5] = hexdig[ch & 0xf];
|
case '\\': fbuffer_append(search->buffer, "\\\\", 2); break;
|
||||||
fbuffer_append(out_buffer, scratch, 6);
|
case '/': fbuffer_append(search->buffer, "\\/", 2); break;
|
||||||
break;
|
case '\b': fbuffer_append(search->buffer, "\\b", 2); break;
|
||||||
}
|
case '\f': fbuffer_append(search->buffer, "\\f", 2); break;
|
||||||
}
|
case '\n': fbuffer_append(search->buffer, "\\n", 2); break;
|
||||||
break;
|
case '\r': fbuffer_append(search->buffer, "\\r", 2); break;
|
||||||
}
|
case '\t': fbuffer_append(search->buffer, "\\t", 2); break;
|
||||||
default: {
|
default: {
|
||||||
uint32_t wchar = 0;
|
const char *hexdig = "0123456789abcdef";
|
||||||
ch_len = ch_len & CHAR_LENGTH_MASK;
|
char scratch[6] = { '\\', 'u', '0', '0', 0, 0 };
|
||||||
|
scratch[4] = hexdig[(ch >> 4) & 0xf];
|
||||||
switch(ch_len) {
|
scratch[5] = hexdig[ch & 0xf];
|
||||||
case 2:
|
fbuffer_append(search->buffer, scratch, 6);
|
||||||
wchar = ptr[pos] & 0x1F;
|
|
||||||
break;
|
|
||||||
case 3:
|
|
||||||
wchar = ptr[pos] & 0x0F;
|
|
||||||
break;
|
|
||||||
case 4:
|
|
||||||
wchar = ptr[pos] & 0x07;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (short i = 1; i < ch_len; i++) {
|
|
||||||
wchar = (wchar << 6) | (ptr[pos+i] & 0x3F);
|
|
||||||
}
|
|
||||||
|
|
||||||
FLUSH_POS(ch_len);
|
|
||||||
|
|
||||||
if (wchar <= 0xFFFF) {
|
|
||||||
scratch[2] = hexdig[wchar >> 12];
|
|
||||||
scratch[3] = hexdig[(wchar >> 8) & 0xf];
|
|
||||||
scratch[4] = hexdig[(wchar >> 4) & 0xf];
|
|
||||||
scratch[5] = hexdig[wchar & 0xf];
|
|
||||||
fbuffer_append(out_buffer, scratch, 6);
|
|
||||||
} else {
|
|
||||||
uint16_t hi, lo;
|
|
||||||
wchar -= 0x10000;
|
|
||||||
hi = 0xD800 + (uint16_t)(wchar >> 10);
|
|
||||||
lo = 0xDC00 + (uint16_t)(wchar & 0x3FF);
|
|
||||||
|
|
||||||
scratch[2] = hexdig[hi >> 12];
|
|
||||||
scratch[3] = hexdig[(hi >> 8) & 0xf];
|
|
||||||
scratch[4] = hexdig[(hi >> 4) & 0xf];
|
|
||||||
scratch[5] = hexdig[hi & 0xf];
|
|
||||||
|
|
||||||
scratch[8] = hexdig[lo >> 12];
|
|
||||||
scratch[9] = hexdig[(lo >> 8) & 0xf];
|
|
||||||
scratch[10] = hexdig[(lo >> 4) & 0xf];
|
|
||||||
scratch[11] = hexdig[lo & 0xf];
|
|
||||||
|
|
||||||
fbuffer_append(out_buffer, scratch, 12);
|
|
||||||
}
|
|
||||||
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
break;
|
||||||
pos++;
|
}
|
||||||
|
default: {
|
||||||
|
const char *hexdig = "0123456789abcdef";
|
||||||
|
char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' };
|
||||||
|
|
||||||
|
uint32_t wchar = 0;
|
||||||
|
|
||||||
|
switch(ch_len) {
|
||||||
|
case 2:
|
||||||
|
wchar = ch & 0x1F;
|
||||||
|
break;
|
||||||
|
case 3:
|
||||||
|
wchar = ch & 0x0F;
|
||||||
|
break;
|
||||||
|
case 4:
|
||||||
|
wchar = ch & 0x07;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (short i = 1; i < ch_len; i++) {
|
||||||
|
wchar = (wchar << 6) | (search->ptr[i] & 0x3F);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (wchar <= 0xFFFF) {
|
||||||
|
scratch[2] = hexdig[wchar >> 12];
|
||||||
|
scratch[3] = hexdig[(wchar >> 8) & 0xf];
|
||||||
|
scratch[4] = hexdig[(wchar >> 4) & 0xf];
|
||||||
|
scratch[5] = hexdig[wchar & 0xf];
|
||||||
|
fbuffer_append(search->buffer, scratch, 6);
|
||||||
|
} else {
|
||||||
|
uint16_t hi, lo;
|
||||||
|
wchar -= 0x10000;
|
||||||
|
hi = 0xD800 + (uint16_t)(wchar >> 10);
|
||||||
|
lo = 0xDC00 + (uint16_t)(wchar & 0x3FF);
|
||||||
|
|
||||||
|
scratch[2] = hexdig[hi >> 12];
|
||||||
|
scratch[3] = hexdig[(hi >> 8) & 0xf];
|
||||||
|
scratch[4] = hexdig[(hi >> 4) & 0xf];
|
||||||
|
scratch[5] = hexdig[hi & 0xf];
|
||||||
|
|
||||||
|
scratch[8] = hexdig[lo >> 12];
|
||||||
|
scratch[9] = hexdig[(lo >> 8) & 0xf];
|
||||||
|
scratch[10] = hexdig[(lo >> 4) & 0xf];
|
||||||
|
scratch[11] = hexdig[lo & 0xf];
|
||||||
|
|
||||||
|
fbuffer_append(search->buffer, scratch, 12);
|
||||||
|
}
|
||||||
|
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#undef FLUSH_POS
|
search->cursor = (search->ptr += ch_len);
|
||||||
|
}
|
||||||
|
|
||||||
if (beg < len) {
|
static void convert_UTF8_to_ASCII_only_JSON(search_state *search, const unsigned char escape_table[256])
|
||||||
fbuffer_append(out_buffer, &ptr[beg], len - beg);
|
{
|
||||||
|
unsigned char ch_len;
|
||||||
|
while ((ch_len = search_ascii_only_escape(search, escape_table))) {
|
||||||
|
full_escape_UTF8_char(search, ch_len);
|
||||||
}
|
}
|
||||||
|
|
||||||
RB_GC_GUARD(str);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
@ -911,13 +922,20 @@ static void generate_json_string(FBuffer *buffer, struct generate_json_data *dat
|
|||||||
|
|
||||||
fbuffer_append_char(buffer, '"');
|
fbuffer_append_char(buffer, '"');
|
||||||
|
|
||||||
|
long len;
|
||||||
|
search_state search;
|
||||||
|
search.buffer = buffer;
|
||||||
|
RSTRING_GETMEM(obj, search.ptr, len);
|
||||||
|
search.cursor = search.ptr;
|
||||||
|
search.end = search.ptr + len;
|
||||||
|
|
||||||
switch(rb_enc_str_coderange(obj)) {
|
switch(rb_enc_str_coderange(obj)) {
|
||||||
case ENC_CODERANGE_7BIT:
|
case ENC_CODERANGE_7BIT:
|
||||||
case ENC_CODERANGE_VALID:
|
case ENC_CODERANGE_VALID:
|
||||||
if (RB_UNLIKELY(state->ascii_only)) {
|
if (RB_UNLIKELY(state->ascii_only)) {
|
||||||
convert_UTF8_to_ASCII_only_JSON(buffer, obj, state->script_safe ? script_safe_escape_table : ascii_only_escape_table);
|
convert_UTF8_to_ASCII_only_JSON(&search, state->script_safe ? script_safe_escape_table : ascii_only_escape_table);
|
||||||
} else {
|
} else {
|
||||||
convert_UTF8_to_JSON(buffer, obj, state->script_safe ? script_safe_escape_table : escape_table);
|
convert_UTF8_to_JSON(&search, state->script_safe ? script_safe_escape_table : escape_table);
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
|
@ -665,6 +665,12 @@ class JSONGeneratorTest < Test::Unit::TestCase
|
|||||||
assert_equal("\"5\u{b0}\"", "5\xb0".dup.force_encoding(Encoding::ISO_8859_1).to_json)
|
assert_equal("\"5\u{b0}\"", "5\xb0".dup.force_encoding(Encoding::ISO_8859_1).to_json)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
# Multibyte UTF-8 characters must be passed through unescaped, while a
# double quote and control characters next to them are still escaped.
def test_utf8_multibyte
  # Multibyte character inside an array element.
  nested = ["foßbar"]
  assert_equal('["foßbar"]', JSON.generate(nested))

  # Mix of two, three and single byte characters in a bare string.
  mixed = "n€ßt€ð2"
  assert_equal('"n€ßt€ð2"', JSON.generate(mixed))

  # A quote followed by the lowest and highest control characters.
  controls = "\"\u0000\u001f"
  assert_equal('"\"\u0000\u001f"', JSON.generate(controls))
end
|
||||||
|
|
||||||
def test_fragment
|
def test_fragment
|
||||||
fragment = JSON::Fragment.new(" 42")
|
fragment = JSON::Fragment.new(" 42")
|
||||||
assert_equal '{"number": 42}', JSON.generate({ number: fragment })
|
assert_equal '{"number": 42}', JSON.generate({ number: fragment })
|
||||||
|
Loading…
x
Reference in New Issue
Block a user