[ruby/json] json_string_unescape: assume the string doesn't need unescaping

If that assumption holds true, then we don't need to copy the
string into a buffer to unescape it. For small strings it just saves
copying, but for large ones it also saves a malloc/free combo.

Before:

```
== Parsing twitter.json (567916 bytes)
ruby 3.3.4 (2024-07-09 revision https://github.com/ruby/json/commit/be1089c8ec) +YJIT [arm64-darwin23]
Warming up --------------------------------------
                json    52.000 i/100ms
                  oj    61.000 i/100ms
           oj strict    70.000 i/100ms
          Oj::Parser    71.000 i/100ms
           rapidjson    55.000 i/100ms
Calculating -------------------------------------
                json    510.111 (± 2.9%) i/s    (1.96 ms/i) -      2.548k in   5.000029s
                  oj    610.232 (± 3.1%) i/s    (1.64 ms/i) -      3.050k in   5.003725s
           oj strict    713.231 (± 3.2%) i/s    (1.40 ms/i) -      3.570k in   5.010902s
          Oj::Parser    762.598 (± 3.0%) i/s    (1.31 ms/i) -      3.834k in   5.033130s
           rapidjson    553.029 (± 7.4%) i/s    (1.81 ms/i) -      2.750k in   5.022630s

Comparison:
                json:      510.1 i/s
          Oj::Parser:      762.6 i/s - 1.49x  faster
           oj strict:      713.2 i/s - 1.40x  faster
                  oj:      610.2 i/s - 1.20x  faster
           rapidjson:      553.0 i/s - same-ish: difference falls within error

== Parsing citm_catalog.json (1727030 bytes)
ruby 3.3.4 (2024-07-09 revision https://github.com/ruby/json/commit/be1089c8ec) +YJIT [arm64-darwin23]
Warming up --------------------------------------
                json    28.000 i/100ms
                  oj    33.000 i/100ms
           oj strict    37.000 i/100ms
          Oj::Parser    43.000 i/100ms
           rapidjson    38.000 i/100ms
Calculating -------------------------------------
                json    303.853 (± 3.6%) i/s    (3.29 ms/i) -      1.540k in   5.076079s
                  oj    348.009 (± 2.0%) i/s    (2.87 ms/i) -      1.749k in   5.027738s
           oj strict    396.679 (± 3.3%) i/s    (2.52 ms/i) -      1.998k in   5.042271s
          Oj::Parser    406.699 (± 2.2%) i/s    (2.46 ms/i) -      2.064k in   5.077587s
           rapidjson    393.463 (± 3.3%) i/s    (2.54 ms/i) -      1.976k in   5.028501s

Comparison:
                json:      303.9 i/s
          Oj::Parser:      406.7 i/s - 1.34x  faster
           oj strict:      396.7 i/s - 1.31x  faster
           rapidjson:      393.5 i/s - 1.29x  faster
                  oj:      348.0 i/s - 1.15x  faster
```

After:

```
== Parsing twitter.json (567916 bytes)
ruby 3.3.4 (2024-07-09 revision https://github.com/ruby/json/commit/be1089c8ec) +YJIT [arm64-darwin23]
Warming up --------------------------------------
                json    56.000 i/100ms
                  oj    62.000 i/100ms
           oj strict    72.000 i/100ms
          Oj::Parser    77.000 i/100ms
           rapidjson    55.000 i/100ms
Calculating -------------------------------------
                json    568.025 (± 2.1%) i/s    (1.76 ms/i) -      2.856k in   5.030272s
                  oj    630.936 (± 1.4%) i/s    (1.58 ms/i) -      3.162k in   5.012630s
           oj strict    705.784 (±11.2%) i/s    (1.42 ms/i) -      3.456k in   5.006706s
          Oj::Parser    783.989 (± 1.7%) i/s    (1.28 ms/i) -      3.927k in   5.010343s
           rapidjson    557.630 (± 2.0%) i/s    (1.79 ms/i) -      2.805k in   5.032388s

Comparison:
                json:      568.0 i/s
          Oj::Parser:      784.0 i/s - 1.38x  faster
           oj strict:      705.8 i/s - 1.24x  faster
                  oj:      630.9 i/s - 1.11x  faster
           rapidjson:      557.6 i/s - same-ish: difference falls within error

== Parsing citm_catalog.json (1727030 bytes)
ruby 3.3.4 (2024-07-09 revision https://github.com/ruby/json/commit/be1089c8ec) +YJIT [arm64-darwin23]
Warming up --------------------------------------
                json    29.000 i/100ms
                  oj    33.000 i/100ms
           oj strict    38.000 i/100ms
          Oj::Parser    43.000 i/100ms
           rapidjson    37.000 i/100ms
Calculating -------------------------------------
                json    319.271 (± 3.1%) i/s    (3.13 ms/i) -      1.595k in   5.001128s
                  oj    347.946 (± 1.7%) i/s    (2.87 ms/i) -      1.749k in   5.028395s
           oj strict    396.914 (± 3.0%) i/s    (2.52 ms/i) -      2.014k in   5.079645s
          Oj::Parser    409.311 (± 2.7%) i/s    (2.44 ms/i) -      2.064k in   5.046626s
           rapidjson    394.752 (± 1.5%) i/s    (2.53 ms/i) -      1.998k in   5.062776s

Comparison:
                json:      319.3 i/s
          Oj::Parser:      409.3 i/s - 1.28x  faster
           oj strict:      396.9 i/s - 1.24x  faster
           rapidjson:      394.8 i/s - 1.24x  faster
                  oj:      347.9 i/s - 1.09x  faster
```

https://github.com/ruby/json/commit/7e0f66546a
This commit is contained in:
Jean Boussier 2024-10-31 14:47:17 +01:00 committed by Hiroshi SHIBATA
parent 081689b9e2
commit 165cc6cf40
3 changed files with 57 additions and 30 deletions

View File

@ -1450,7 +1450,7 @@ case 16:
}
}
static inline VALUE build_string(const char *buffer, const char *bufferStart, bool intern, bool symbolize)
static inline VALUE build_string(const char *start, const char *end, bool intern, bool symbolize)
{
if (symbolize) {
intern = true;
@ -1458,12 +1458,12 @@ static inline VALUE build_string(const char *buffer, const char *bufferStart, bo
VALUE result;
# ifdef HAVE_RB_ENC_INTERNED_STR
if (intern) {
result = rb_enc_interned_str(bufferStart, (long)(buffer - bufferStart), rb_utf8_encoding());
result = rb_enc_interned_str(start, (long)(end - start), rb_utf8_encoding());
} else {
result = rb_utf8_str_new(bufferStart, (long)(buffer - bufferStart));
result = rb_utf8_str_new(start, (long)(end - start));
}
# else
result = rb_utf8_str_new(bufferStart, (long)(buffer - bufferStart));
result = rb_utf8_str_new(start, (long)(end - start));
if (intern) {
# if STR_UMINUS_DEDUPE_FROZEN
// Starting from MRI 3.0 it is preferable to freeze the string
@ -1488,7 +1488,7 @@ static inline VALUE build_string(const char *buffer, const char *bufferStart, bo
}
static const size_t MAX_STACK_BUFFER_SIZE = 128;
static VALUE json_string_unescape(char *string, char *stringEnd, int intern, int symbolize)
static VALUE json_string_unescape(char *string, char *stringEnd, bool intern, bool symbolize)
{
VALUE result = Qnil;
size_t bufferSize = stringEnd - string;
@ -1496,6 +1496,11 @@ static VALUE json_string_unescape(char *string, char *stringEnd, int intern, int
int unescape_len;
char buf[4];
pe = memchr(p, '\\', bufferSize);
if (RB_LIKELY(pe == NULL)) {
return build_string(string, stringEnd, intern, symbolize);
}
if (bufferSize > MAX_STACK_BUFFER_SIZE) {
# ifdef HAVE_RB_ENC_INTERNED_STR
bufferStart = buffer = ALLOC_N(char, bufferSize ? bufferSize : 1);
@ -1598,7 +1603,7 @@ static VALUE json_string_unescape(char *string, char *stringEnd, int intern, int
buffer += pe - p;
}
result = build_string(buffer, bufferStart, intern, symbolize);
result = build_string(bufferStart, buffer, intern, symbolize);
if (bufferSize > MAX_STACK_BUFFER_SIZE) {
ruby_xfree(bufferStart);
@ -1608,7 +1613,7 @@ static VALUE json_string_unescape(char *string, char *stringEnd, int intern, int
}
#line 1612 "parser.c"
#line 1617 "parser.c"
enum {JSON_string_start = 1};
enum {JSON_string_first_final = 8};
enum {JSON_string_error = 0};
@ -1616,7 +1621,7 @@ enum {JSON_string_error = 0};
enum {JSON_string_en_main = 1};
#line 640 "parser.rl"
#line 645 "parser.rl"
static int
@ -1637,15 +1642,15 @@ static char *JSON_parse_string(JSON_Parser *json, char *p, char *pe, VALUE *resu
VALUE match_string;
#line 1641 "parser.c"
#line 1646 "parser.c"
{
cs = JSON_string_start;
}
#line 660 "parser.rl"
#line 665 "parser.rl"
json->memo = p;
#line 1649 "parser.c"
#line 1654 "parser.c"
{
if ( p == pe )
goto _test_eof;
@ -1670,7 +1675,7 @@ case 2:
goto st0;
goto st2;
tr2:
#line 627 "parser.rl"
#line 632 "parser.rl"
{
*result = json_string_unescape(json->memo + 1, p, json->parsing_name || json-> freeze, json->parsing_name && json->symbolize_names);
if (NIL_P(*result)) {
@ -1680,14 +1685,14 @@ tr2:
{p = (( p + 1))-1;}
}
}
#line 637 "parser.rl"
#line 642 "parser.rl"
{ p--; {p++; cs = 8; goto _out;} }
goto st8;
st8:
if ( ++p == pe )
goto _test_eof8;
case 8:
#line 1691 "parser.c"
#line 1696 "parser.c"
goto st0;
st3:
if ( ++p == pe )
@ -1763,7 +1768,7 @@ case 7:
_out: {}
}
#line 662 "parser.rl"
#line 667 "parser.rl"
if (json->create_additions && RTEST(match_string = json->match_string)) {
VALUE klass;
@ -1960,7 +1965,7 @@ static VALUE cParser_initialize(int argc, VALUE *argv, VALUE self)
}
#line 1964 "parser.c"
#line 1969 "parser.c"
enum {JSON_start = 1};
enum {JSON_first_final = 10};
enum {JSON_error = 0};
@ -1968,7 +1973,7 @@ enum {JSON_error = 0};
enum {JSON_en_main = 1};
#line 872 "parser.rl"
#line 877 "parser.rl"
/*
@ -1986,16 +1991,16 @@ static VALUE cParser_parse(VALUE self)
GET_PARSER;
#line 1990 "parser.c"
#line 1995 "parser.c"
{
cs = JSON_start;
}
#line 889 "parser.rl"
#line 894 "parser.rl"
p = json->source;
pe = p + json->len;
#line 1999 "parser.c"
#line 2004 "parser.c"
{
if ( p == pe )
goto _test_eof;
@ -2029,7 +2034,7 @@ st0:
cs = 0;
goto _out;
tr2:
#line 864 "parser.rl"
#line 869 "parser.rl"
{
char *np = JSON_parse_value(json, p, pe, &result, 0);
if (np == NULL) { p--; {p++; cs = 10; goto _out;} } else {p = (( np))-1;}
@ -2039,7 +2044,7 @@ st10:
if ( ++p == pe )
goto _test_eof10;
case 10:
#line 2043 "parser.c"
#line 2048 "parser.c"
switch( (*p) ) {
case 13: goto st10;
case 32: goto st10;
@ -2128,7 +2133,7 @@ case 9:
_out: {}
}
#line 892 "parser.rl"
#line 897 "parser.rl"
if (cs >= JSON_first_final && p == pe) {
return result;

View File

@ -3,6 +3,23 @@
#include "ruby.h"
/* This is the fallback definition from Ruby 3.4 */
#ifndef RBIMPL_STDBOOL_H
#if defined(__cplusplus)
# if defined(HAVE_STDBOOL_H) && (__cplusplus >= 201103L)
# include <cstdbool>
# endif
#elif defined(HAVE_STDBOOL_H)
# include <stdbool.h>
#elif !defined(HAVE__BOOL)
typedef unsigned char _Bool;
# define bool _Bool
# define true ((_Bool)+1)
# define false ((_Bool)+0)
# define __bool_true_false_are_defined
#endif
#endif
#ifndef MAYBE_UNUSED
# define MAYBE_UNUSED(x) x
#endif
@ -46,7 +63,7 @@ static char *JSON_parse_value(JSON_Parser *json, char *p, char *pe, VALUE *resul
static char *JSON_parse_integer(JSON_Parser *json, char *p, char *pe, VALUE *result);
static char *JSON_parse_float(JSON_Parser *json, char *p, char *pe, VALUE *result);
static char *JSON_parse_array(JSON_Parser *json, char *p, char *pe, VALUE *result, int current_nesting);
static VALUE json_string_unescape(char *string, char *stringEnd, int intern, int symbolize);
static VALUE json_string_unescape(char *string, char *stringEnd, bool intern, bool symbolize);
static char *JSON_parse_string(JSON_Parser *json, char *p, char *pe, VALUE *result);
static VALUE convert_encoding(VALUE source);
static VALUE cParser_initialize(int argc, VALUE *argv, VALUE self);

View File

@ -461,7 +461,7 @@ static char *JSON_parse_array(JSON_Parser *json, char *p, char *pe, VALUE *resul
}
}
static inline VALUE build_string(const char *buffer, const char *bufferStart, bool intern, bool symbolize)
static inline VALUE build_string(const char *start, const char *end, bool intern, bool symbolize)
{
if (symbolize) {
intern = true;
@ -469,12 +469,12 @@ static inline VALUE build_string(const char *buffer, const char *bufferStart, bo
VALUE result;
# ifdef HAVE_RB_ENC_INTERNED_STR
if (intern) {
result = rb_enc_interned_str(bufferStart, (long)(buffer - bufferStart), rb_utf8_encoding());
result = rb_enc_interned_str(start, (long)(end - start), rb_utf8_encoding());
} else {
result = rb_utf8_str_new(bufferStart, (long)(buffer - bufferStart));
result = rb_utf8_str_new(start, (long)(end - start));
}
# else
result = rb_utf8_str_new(bufferStart, (long)(buffer - bufferStart));
result = rb_utf8_str_new(start, (long)(end - start));
if (intern) {
# if STR_UMINUS_DEDUPE_FROZEN
// Starting from MRI 3.0 it is preferable to freeze the string
@ -499,7 +499,7 @@ static inline VALUE build_string(const char *buffer, const char *bufferStart, bo
}
static const size_t MAX_STACK_BUFFER_SIZE = 128;
static VALUE json_string_unescape(char *string, char *stringEnd, int intern, int symbolize)
static VALUE json_string_unescape(char *string, char *stringEnd, bool intern, bool symbolize)
{
VALUE result = Qnil;
size_t bufferSize = stringEnd - string;
@ -507,6 +507,11 @@ static VALUE json_string_unescape(char *string, char *stringEnd, int intern, int
int unescape_len;
char buf[4];
pe = memchr(p, '\\', bufferSize);
if (RB_LIKELY(pe == NULL)) {
return build_string(string, stringEnd, intern, symbolize);
}
if (bufferSize > MAX_STACK_BUFFER_SIZE) {
# ifdef HAVE_RB_ENC_INTERNED_STR
bufferStart = buffer = ALLOC_N(char, bufferSize ? bufferSize : 1);
@ -609,7 +614,7 @@ static VALUE json_string_unescape(char *string, char *stringEnd, int intern, int
buffer += pe - p;
}
result = build_string(buffer, bufferStart, intern, symbolize);
result = build_string(bufferStart, buffer, intern, symbolize);
if (bufferSize > MAX_STACK_BUFFER_SIZE) {
ruby_xfree(bufferStart);