[ruby/json] Adjust to the CVTUTF code being gone

I, Luke T. Shumaker, am the sole author of the added code.

I did not reference CVTUTF when writing it.  I did reference the
Unicode standard (15.0.0), the Wikipedia article on UTF-8, and the
Wikipedia article on UTF-16.  When I saw some tests fail, I did
reference the old deleted code (but a JSON-specific part, inherently
not as based on CVTUTF) to determine that script_safe should also
escape U+2028 and U+2029.

I targeted simplicity and clarity when writing the code--it can likely
be optimized.  In my mind, the obvious next optimization is to have it
combine contiguous non-escaped characters into just one call to
fbuffer_append(), instead of calling fbuffer_append() for each
character.

Regarding the use of the "modern" types `uint32_t`, `uint16_t`, and
`bool`:
 - ruby.h is guaranteed to give us uint32_t and uint16_t.
 - Since Ruby 3.0.0, ruby.h is guaranteed to give us bool... but we
   support down to Ruby 2.3.  But, ruby.h is guaranteed to give us
   HAVE_STDBOOL_H for the C99 stdbool.h; so use that to include
   stdbool.h if we can, and if not then fall back to a copy of the
   same bool definition that Ruby 3.0.5 uses with C89.

https://github.com/ruby/json/commit/c96351f874
This commit is contained in:
Luke T. Shumaker 2024-02-22 20:51:28 -07:00 committed by Hiroshi SHIBATA
parent 6e47968929
commit 74d459fd52
5 changed files with 239 additions and 144 deletions

View File

@ -18,50 +18,119 @@ static ID i_to_s, i_to_json, i_new, i_indent, i_space, i_space_before,
i_aref, i_send, i_respond_to_p, i_match, i_keys, i_depth, i_aref, i_send, i_respond_to_p, i_match, i_keys, i_depth,
i_buffer_initial_length, i_dup, i_script_safe, i_escape_slash, i_strict; i_buffer_initial_length, i_dup, i_script_safe, i_escape_slash, i_strict;
/* Escapes the UTF16 character and stores the result in the buffer buf. */ /* Converts in_string to a JSON string (without the wrapping '"'
static void unicode_escape(char *buf, UTF16 character) * characters) in FBuffer out_buffer.
*
* Characters are JSON-escaped according to:
*
* - Always: ASCII control characters (0x00-0x1F), dquote, and
* backslash.
*
* - If out_ascii_only: non-ASCII characters (>0x7F)
*
* - If out_script_safe: forwardslash, line separator (U+2028), and
* paragraph separator (U+2029)
*
* Everything else (should be UTF-8) is just passed through and
* appended to the result.
*/
static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE in_string, bool out_ascii_only, bool out_script_safe)
{ {
const char *digits = "0123456789abcdef"; const char *hexdig = "0123456789abcdef";
char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' };
buf[2] = digits[character >> 12]; const char *in_utf8_str = RSTRING_PTR(in_string);
buf[3] = digits[(character >> 8) & 0xf]; unsigned long in_utf8_len = RSTRING_LEN(in_string);
buf[4] = digits[(character >> 4) & 0xf]; bool in_is_ascii_only = rb_enc_str_asciionly_p(in_string);
buf[5] = digits[character & 0xf];
}
/* Escapes the UTF16 character and stores the result in the buffer buf, then unsigned long pos;
* the buffer buf is appended to the FBuffer buffer. */
static void unicode_escape_to_buffer(FBuffer *buffer, char buf[6], UTF16
character)
{
unicode_escape(buf, character);
fbuffer_append(buffer, buf, 6);
}
/* Converts string to a JSON string in FBuffer buffer, where all but the ASCII for (pos = 0; pos < in_utf8_len;) {
* and control characters are JSON escaped. */ uint32_t ch;
static void convert_UTF8_to_JSON_ASCII(FBuffer *buffer, VALUE string, char script_safe) unsigned long ch_len;
{ bool should_escape;
const UTF8 *source = (UTF8 *) RSTRING_PTR(string);
const UTF8 *sourceEnd = source + RSTRING_LEN(string);
char buf[6] = { '\\', 'u' };
RB_GC_GUARD(string); /* UTF-8 decoding */
} if (in_is_ascii_only) {
ch = in_utf8_str[pos];
ch_len = 1;
} else {
short i;
if ((in_utf8_str[pos] & 0x80) == 0x00) { ch_len = 1; ch = in_utf8_str[pos]; } /* leading 1 bit is 0b0 */
else if ((in_utf8_str[pos] & 0xE0) == 0xC0) { ch_len = 2; ch = in_utf8_str[pos] & 0x1F; } /* leading 3 bits are 0b110 */
else if ((in_utf8_str[pos] & 0xF0) == 0xE0) { ch_len = 3; ch = in_utf8_str[pos] & 0x0F; } /* leading 4 bits are 0b1110 */
else if ((in_utf8_str[pos] & 0xF8) == 0xF0) { ch_len = 4; ch = in_utf8_str[pos] & 0x07; } /* leading 5 bits are 0b11110 */
else
rb_raise(rb_path2class("JSON::GeneratorError"),
"source sequence is illegal/malformed utf-8");
if ((pos+ch_len) > in_utf8_len)
rb_raise(rb_path2class("JSON::GeneratorError"),
"partial character in source, but hit end");
for (i = 1; i < ch_len; i++) {
if ((in_utf8_str[pos+i] & 0xC0) != 0x80) /* leading 2 bits should be 0b10 */
rb_raise(rb_path2class("JSON::GeneratorError"),
"source sequence is illegal/malformed utf-8");
ch = (ch<<6) | (in_utf8_str[pos+i] & 0x3F);
}
if (ch > 0x10FFFF)
rb_raise(rb_path2class("JSON::GeneratorError"),
"source sequence is illegal/malformed utf-8");
}
/* Converts string to a JSON string in FBuffer buffer, where only the /* JSON policy */
* characters required by the JSON standard are JSON escaped. The remaining should_escape =
* characters (should be UTF8) are just passed through and appended to the (ch < 0x20) ||
* result. */ (ch == '"') ||
static void convert_UTF8_to_JSON(FBuffer *buffer, VALUE string, char script_safe) (ch == '\\') ||
{ (out_ascii_only && (ch > 0x7F)) ||
const char *ptr = RSTRING_PTR(string), *p; (out_script_safe && (ch == '/')) ||
unsigned long len = RSTRING_LEN(string), start = 0, end = 0; (out_script_safe && (ch == 0x2028)) ||
const char *escape = NULL; (out_script_safe && (ch == 0x2029));
int escape_len;
unsigned char c; /* JSON encoding */
char buf[6] = { '\\', 'u' }; if (should_escape) {
int ascii_only = rb_enc_str_asciionly_p(string); switch (ch) {
case '"': fbuffer_append(out_buffer, "\\\"", 2); break;
case '\\': fbuffer_append(out_buffer, "\\\\", 2); break;
case '/': fbuffer_append(out_buffer, "\\/", 2); break;
case '\b': fbuffer_append(out_buffer, "\\b", 2); break;
case '\f': fbuffer_append(out_buffer, "\\f", 2); break;
case '\n': fbuffer_append(out_buffer, "\\n", 2); break;
case '\r': fbuffer_append(out_buffer, "\\r", 2); break;
case '\t': fbuffer_append(out_buffer, "\\t", 2); break;
default:
if (ch <= 0xFFFF) {
scratch[2] = hexdig[ch >> 12];
scratch[3] = hexdig[(ch >> 8) & 0xf];
scratch[4] = hexdig[(ch >> 4) & 0xf];
scratch[5] = hexdig[ch & 0xf];
fbuffer_append(out_buffer, scratch, 6);
} else {
uint16_t hi, lo;
ch -= 0x10000;
hi = 0xD800 + (uint16_t)(ch >> 10);
lo = 0xDC00 + (uint16_t)(ch & 0x3FF);
scratch[2] = hexdig[hi >> 12];
scratch[3] = hexdig[(hi >> 8) & 0xf];
scratch[4] = hexdig[(hi >> 4) & 0xf];
scratch[5] = hexdig[hi & 0xf];
scratch[8] = hexdig[lo >> 12];
scratch[9] = hexdig[(lo >> 8) & 0xf];
scratch[10] = hexdig[(lo >> 4) & 0xf];
scratch[11] = hexdig[lo & 0xf];
fbuffer_append(out_buffer, scratch, 12);
}
}
} else {
fbuffer_append(out_buffer, &in_utf8_str[pos], ch_len);
}
pos += ch_len;
}
RB_GC_GUARD(in_string);
} }
static char *fstrndup(const char *ptr, unsigned long len) { static char *fstrndup(const char *ptr, unsigned long len) {
@ -698,12 +767,7 @@ static void generate_json_string(FBuffer *buffer, VALUE Vstate, JSON_Generator_S
if (!enc_utf8_compatible_p(rb_enc_get(obj))) { if (!enc_utf8_compatible_p(rb_enc_get(obj))) {
obj = rb_str_export_to_enc(obj, rb_utf8_encoding()); obj = rb_str_export_to_enc(obj, rb_utf8_encoding());
} }
convert_UTF8_to_JSON(buffer, obj, state->ascii_only, state->script_safe);
if (state->ascii_only) {
convert_UTF8_to_JSON_ASCII(buffer, obj, state->script_safe);
} else {
convert_UTF8_to_JSON(buffer, obj, state->script_safe);
}
fbuffer_append_char(buffer, '"'); fbuffer_append_char(buffer, '"');
} }

View File

@ -6,6 +6,14 @@
#include "ruby.h" #include "ruby.h"
/* Make `bool`, `true`, and `false` available to the generator.
 *
 * Use the system <stdbool.h> when HAVE_STDBOOL_H is defined, or when the
 * compiler reports C99 or later: there <stdbool.h> is a standard header and
 * `_Bool` is a keyword, so it must not be typedef'ed by hand.
 *
 * Otherwise fall back to a hand-written definition.  The typedef needs its
 * terminating semicolon, and `true`/`false` are defined alongside `bool`
 * (unless something already provides them) so the three names can always
 * be used together.
 */
#if defined(HAVE_STDBOOL_H) || (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L)
#include <stdbool.h>
#else
/* This is the fallback definition from Ruby 3.0.5. */
typedef unsigned char _Bool;
#define bool _Bool
#ifndef true
#define true ((_Bool)+1)
#endif
#ifndef false
#define false ((_Bool)+0)
#endif
#endif
#ifdef HAVE_RUBY_RE_H #ifdef HAVE_RUBY_RE_H
#include "ruby/re.h" #include "ruby/re.h"
#else #else
@ -22,10 +30,7 @@
#define option_given_p(opts, key) RTEST(rb_funcall(opts, i_key_p, 1, key)) #define option_given_p(opts, key) RTEST(rb_funcall(opts, i_key_p, 1, key))
static void unicode_escape(char *buf, UTF16 character); static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE in_string, bool out_ascii_only, bool out_script_safe);
static void unicode_escape_to_buffer(FBuffer *buffer, char buf[6], UTF16 character);
static void convert_UTF8_to_JSON_ASCII(FBuffer *buffer, VALUE string, char script_safe);
static void convert_UTF8_to_JSON(FBuffer *buffer, VALUE string, char script_safe);
static char *fstrndup(const char *ptr, unsigned long len); static char *fstrndup(const char *ptr, unsigned long len);
/* ruby api and some helpers */ /* ruby api and some helpers */

View File

@ -22,26 +22,28 @@ static const signed char digit_values[256] = {
-1, -1, -1, -1, -1, -1, -1 -1, -1, -1, -1, -1, -1, -1
}; };
static UTF32 unescape_unicode(const unsigned char *p) static uint32_t unescape_unicode(const unsigned char *p)
{ {
const uint32_t replacement_char = 0xFFFD;
signed char b; signed char b;
UTF32 result = 0; uint32_t result = 0;
b = digit_values[p[0]]; b = digit_values[p[0]];
if (b < 0) return UNI_REPLACEMENT_CHAR; if (b < 0) return replacement_char;
result = (result << 4) | (unsigned char)b; result = (result << 4) | (unsigned char)b;
b = digit_values[p[1]]; b = digit_values[p[1]];
if (b < 0) return UNI_REPLACEMENT_CHAR; if (b < 0) return replacement_char;
result = (result << 4) | (unsigned char)b; result = (result << 4) | (unsigned char)b;
b = digit_values[p[2]]; b = digit_values[p[2]];
if (b < 0) return UNI_REPLACEMENT_CHAR; if (b < 0) return replacement_char;
result = (result << 4) | (unsigned char)b; result = (result << 4) | (unsigned char)b;
b = digit_values[p[3]]; b = digit_values[p[3]];
if (b < 0) return UNI_REPLACEMENT_CHAR; if (b < 0) return replacement_char;
result = (result << 4) | (unsigned char)b; result = (result << 4) | (unsigned char)b;
return result; return result;
} }
static int convert_UTF32_to_UTF8(char *buf, UTF32 ch) static int convert_UTF32_to_UTF8(char *buf, uint32_t ch)
{ {
int len = 1; int len = 1;
if (ch <= 0x7F) { if (ch <= 0x7F) {
@ -77,11 +79,11 @@ static ID i_json_creatable_p, i_json_create, i_create_id, i_create_additions,
i_leftshift, i_new, i_try_convert, i_freeze, i_uminus; i_leftshift, i_new, i_try_convert, i_freeze, i_uminus;
#line 125 "parser.rl" #line 105 "parser.rl"
#line 107 "parser.c" #line 87 "parser.c"
enum {JSON_object_start = 1}; enum {JSON_object_start = 1};
enum {JSON_object_first_final = 27}; enum {JSON_object_first_final = 27};
enum {JSON_object_error = 0}; enum {JSON_object_error = 0};
@ -89,7 +91,7 @@ enum {JSON_object_error = 0};
enum {JSON_object_en_main = 1}; enum {JSON_object_en_main = 1};
#line 167 "parser.rl" #line 147 "parser.rl"
static char *JSON_parse_object(JSON_Parser *json, char *p, char *pe, VALUE *result, int current_nesting) static char *JSON_parse_object(JSON_Parser *json, char *p, char *pe, VALUE *result, int current_nesting)
@ -105,14 +107,14 @@ static char *JSON_parse_object(JSON_Parser *json, char *p, char *pe, VALUE *resu
*result = NIL_P(object_class) ? rb_hash_new() : rb_class_new_instance(0, 0, object_class); *result = NIL_P(object_class) ? rb_hash_new() : rb_class_new_instance(0, 0, object_class);
#line 131 "parser.c" #line 111 "parser.c"
{ {
cs = JSON_object_start; cs = JSON_object_start;
} }
#line 182 "parser.rl" #line 162 "parser.rl"
#line 138 "parser.c" #line 118 "parser.c"
{ {
if ( p == pe ) if ( p == pe )
goto _test_eof; goto _test_eof;
@ -140,7 +142,7 @@ case 2:
goto st2; goto st2;
goto st0; goto st0;
tr2: tr2:
#line 149 "parser.rl" #line 129 "parser.rl"
{ {
char *np; char *np;
json->parsing_name = 1; json->parsing_name = 1;
@ -153,7 +155,7 @@ st3:
if ( ++p == pe ) if ( ++p == pe )
goto _test_eof3; goto _test_eof3;
case 3: case 3:
#line 179 "parser.c" #line 159 "parser.c"
switch( (*p) ) { switch( (*p) ) {
case 13: goto st3; case 13: goto st3;
case 32: goto st3; case 32: goto st3;
@ -220,7 +222,7 @@ case 8:
goto st8; goto st8;
goto st0; goto st0;
tr11: tr11:
#line 133 "parser.rl" #line 113 "parser.rl"
{ {
VALUE v = Qnil; VALUE v = Qnil;
char *np = JSON_parse_value(json, p, pe, &v, current_nesting); char *np = JSON_parse_value(json, p, pe, &v, current_nesting);
@ -241,7 +243,7 @@ st9:
if ( ++p == pe ) if ( ++p == pe )
goto _test_eof9; goto _test_eof9;
case 9: case 9:
#line 267 "parser.c" #line 247 "parser.c"
switch( (*p) ) { switch( (*p) ) {
case 13: goto st9; case 13: goto st9;
case 32: goto st9; case 32: goto st9;
@ -330,14 +332,14 @@ case 18:
goto st9; goto st9;
goto st18; goto st18;
tr4: tr4:
#line 157 "parser.rl" #line 137 "parser.rl"
{ p--; {p++; cs = 27; goto _out;} } { p--; {p++; cs = 27; goto _out;} }
goto st27; goto st27;
st27: st27:
if ( ++p == pe ) if ( ++p == pe )
goto _test_eof27; goto _test_eof27;
case 27: case 27:
#line 363 "parser.c" #line 343 "parser.c"
goto st0; goto st0;
st19: st19:
if ( ++p == pe ) if ( ++p == pe )
@ -435,7 +437,7 @@ case 26:
_out: {} _out: {}
} }
#line 183 "parser.rl" #line 163 "parser.rl"
if (cs >= JSON_object_first_final) { if (cs >= JSON_object_first_final) {
if (json->create_additions) { if (json->create_additions) {
@ -460,7 +462,7 @@ case 26:
#line 486 "parser.c" #line 466 "parser.c"
enum {JSON_value_start = 1}; enum {JSON_value_start = 1};
enum {JSON_value_first_final = 29}; enum {JSON_value_first_final = 29};
enum {JSON_value_error = 0}; enum {JSON_value_error = 0};
@ -468,7 +470,7 @@ enum {JSON_value_error = 0};
enum {JSON_value_en_main = 1}; enum {JSON_value_en_main = 1};
#line 283 "parser.rl" #line 263 "parser.rl"
static char *JSON_parse_value(JSON_Parser *json, char *p, char *pe, VALUE *result, int current_nesting) static char *JSON_parse_value(JSON_Parser *json, char *p, char *pe, VALUE *result, int current_nesting)
@ -476,14 +478,14 @@ static char *JSON_parse_value(JSON_Parser *json, char *p, char *pe, VALUE *resul
int cs = EVIL; int cs = EVIL;
#line 502 "parser.c" #line 482 "parser.c"
{ {
cs = JSON_value_start; cs = JSON_value_start;
} }
#line 290 "parser.rl" #line 270 "parser.rl"
#line 509 "parser.c" #line 489 "parser.c"
{ {
if ( p == pe ) if ( p == pe )
goto _test_eof; goto _test_eof;
@ -517,14 +519,14 @@ st0:
cs = 0; cs = 0;
goto _out; goto _out;
tr2: tr2:
#line 235 "parser.rl" #line 215 "parser.rl"
{ {
char *np = JSON_parse_string(json, p, pe, result); char *np = JSON_parse_string(json, p, pe, result);
if (np == NULL) { p--; {p++; cs = 29; goto _out;} } else {p = (( np))-1;} if (np == NULL) { p--; {p++; cs = 29; goto _out;} } else {p = (( np))-1;}
} }
goto st29; goto st29;
tr3: tr3:
#line 240 "parser.rl" #line 220 "parser.rl"
{ {
char *np; char *np;
if(pe > p + 8 && !strncmp(MinusInfinity, p, 9)) { if(pe > p + 8 && !strncmp(MinusInfinity, p, 9)) {
@ -544,7 +546,7 @@ tr3:
} }
goto st29; goto st29;
tr7: tr7:
#line 258 "parser.rl" #line 238 "parser.rl"
{ {
char *np; char *np;
np = JSON_parse_array(json, p, pe, result, current_nesting + 1); np = JSON_parse_array(json, p, pe, result, current_nesting + 1);
@ -552,7 +554,7 @@ tr7:
} }
goto st29; goto st29;
tr11: tr11:
#line 264 "parser.rl" #line 244 "parser.rl"
{ {
char *np; char *np;
np = JSON_parse_object(json, p, pe, result, current_nesting + 1); np = JSON_parse_object(json, p, pe, result, current_nesting + 1);
@ -560,7 +562,7 @@ tr11:
} }
goto st29; goto st29;
tr25: tr25:
#line 228 "parser.rl" #line 208 "parser.rl"
{ {
if (json->allow_nan) { if (json->allow_nan) {
*result = CInfinity; *result = CInfinity;
@ -570,7 +572,7 @@ tr25:
} }
goto st29; goto st29;
tr27: tr27:
#line 221 "parser.rl" #line 201 "parser.rl"
{ {
if (json->allow_nan) { if (json->allow_nan) {
*result = CNaN; *result = CNaN;
@ -580,19 +582,19 @@ tr27:
} }
goto st29; goto st29;
tr31: tr31:
#line 215 "parser.rl" #line 195 "parser.rl"
{ {
*result = Qfalse; *result = Qfalse;
} }
goto st29; goto st29;
tr34: tr34:
#line 212 "parser.rl" #line 192 "parser.rl"
{ {
*result = Qnil; *result = Qnil;
} }
goto st29; goto st29;
tr37: tr37:
#line 218 "parser.rl" #line 198 "parser.rl"
{ {
*result = Qtrue; *result = Qtrue;
} }
@ -601,9 +603,9 @@ st29:
if ( ++p == pe ) if ( ++p == pe )
goto _test_eof29; goto _test_eof29;
case 29: case 29:
#line 270 "parser.rl" #line 250 "parser.rl"
{ p--; {p++; cs = 29; goto _out;} } { p--; {p++; cs = 29; goto _out;} }
#line 629 "parser.c" #line 609 "parser.c"
switch( (*p) ) { switch( (*p) ) {
case 13: goto st29; case 13: goto st29;
case 32: goto st29; case 32: goto st29;
@ -844,7 +846,7 @@ case 28:
_out: {} _out: {}
} }
#line 291 "parser.rl" #line 271 "parser.rl"
if (json->freeze) { if (json->freeze) {
OBJ_FREEZE(*result); OBJ_FREEZE(*result);
@ -858,7 +860,7 @@ case 28:
} }
#line 884 "parser.c" #line 864 "parser.c"
enum {JSON_integer_start = 1}; enum {JSON_integer_start = 1};
enum {JSON_integer_first_final = 3}; enum {JSON_integer_first_final = 3};
enum {JSON_integer_error = 0}; enum {JSON_integer_error = 0};
@ -866,7 +868,7 @@ enum {JSON_integer_error = 0};
enum {JSON_integer_en_main = 1}; enum {JSON_integer_en_main = 1};
#line 311 "parser.rl" #line 291 "parser.rl"
static char *JSON_parse_integer(JSON_Parser *json, char *p, char *pe, VALUE *result) static char *JSON_parse_integer(JSON_Parser *json, char *p, char *pe, VALUE *result)
@ -874,15 +876,15 @@ static char *JSON_parse_integer(JSON_Parser *json, char *p, char *pe, VALUE *res
int cs = EVIL; int cs = EVIL;
#line 900 "parser.c" #line 880 "parser.c"
{ {
cs = JSON_integer_start; cs = JSON_integer_start;
} }
#line 318 "parser.rl" #line 298 "parser.rl"
json->memo = p; json->memo = p;
#line 908 "parser.c" #line 888 "parser.c"
{ {
if ( p == pe ) if ( p == pe )
goto _test_eof; goto _test_eof;
@ -916,14 +918,14 @@ case 3:
goto st0; goto st0;
goto tr4; goto tr4;
tr4: tr4:
#line 308 "parser.rl" #line 288 "parser.rl"
{ p--; {p++; cs = 4; goto _out;} } { p--; {p++; cs = 4; goto _out;} }
goto st4; goto st4;
st4: st4:
if ( ++p == pe ) if ( ++p == pe )
goto _test_eof4; goto _test_eof4;
case 4: case 4:
#line 949 "parser.c" #line 929 "parser.c"
goto st0; goto st0;
st5: st5:
if ( ++p == pe ) if ( ++p == pe )
@ -942,7 +944,7 @@ case 5:
_out: {} _out: {}
} }
#line 320 "parser.rl" #line 300 "parser.rl"
if (cs >= JSON_integer_first_final) { if (cs >= JSON_integer_first_final) {
long len = p - json->memo; long len = p - json->memo;
@ -957,7 +959,7 @@ case 5:
} }
#line 983 "parser.c" #line 963 "parser.c"
enum {JSON_float_start = 1}; enum {JSON_float_start = 1};
enum {JSON_float_first_final = 8}; enum {JSON_float_first_final = 8};
enum {JSON_float_error = 0}; enum {JSON_float_error = 0};
@ -965,7 +967,7 @@ enum {JSON_float_error = 0};
enum {JSON_float_en_main = 1}; enum {JSON_float_en_main = 1};
#line 345 "parser.rl" #line 325 "parser.rl"
static char *JSON_parse_float(JSON_Parser *json, char *p, char *pe, VALUE *result) static char *JSON_parse_float(JSON_Parser *json, char *p, char *pe, VALUE *result)
@ -973,15 +975,15 @@ static char *JSON_parse_float(JSON_Parser *json, char *p, char *pe, VALUE *resul
int cs = EVIL; int cs = EVIL;
#line 999 "parser.c" #line 979 "parser.c"
{ {
cs = JSON_float_start; cs = JSON_float_start;
} }
#line 352 "parser.rl" #line 332 "parser.rl"
json->memo = p; json->memo = p;
#line 1007 "parser.c" #line 987 "parser.c"
{ {
if ( p == pe ) if ( p == pe )
goto _test_eof; goto _test_eof;
@ -1039,14 +1041,14 @@ case 8:
goto st0; goto st0;
goto tr9; goto tr9;
tr9: tr9:
#line 339 "parser.rl" #line 319 "parser.rl"
{ p--; {p++; cs = 9; goto _out;} } { p--; {p++; cs = 9; goto _out;} }
goto st9; goto st9;
st9: st9:
if ( ++p == pe ) if ( ++p == pe )
goto _test_eof9; goto _test_eof9;
case 9: case 9:
#line 1072 "parser.c" #line 1052 "parser.c"
goto st0; goto st0;
st5: st5:
if ( ++p == pe ) if ( ++p == pe )
@ -1107,7 +1109,7 @@ case 7:
_out: {} _out: {}
} }
#line 354 "parser.rl" #line 334 "parser.rl"
if (cs >= JSON_float_first_final) { if (cs >= JSON_float_first_final) {
VALUE mod = Qnil; VALUE mod = Qnil;
@ -1158,7 +1160,7 @@ case 7:
#line 1184 "parser.c" #line 1164 "parser.c"
enum {JSON_array_start = 1}; enum {JSON_array_start = 1};
enum {JSON_array_first_final = 17}; enum {JSON_array_first_final = 17};
enum {JSON_array_error = 0}; enum {JSON_array_error = 0};
@ -1166,7 +1168,7 @@ enum {JSON_array_error = 0};
enum {JSON_array_en_main = 1}; enum {JSON_array_en_main = 1};
#line 432 "parser.rl" #line 412 "parser.rl"
static char *JSON_parse_array(JSON_Parser *json, char *p, char *pe, VALUE *result, int current_nesting) static char *JSON_parse_array(JSON_Parser *json, char *p, char *pe, VALUE *result, int current_nesting)
@ -1180,14 +1182,14 @@ static char *JSON_parse_array(JSON_Parser *json, char *p, char *pe, VALUE *resul
*result = NIL_P(array_class) ? rb_ary_new() : rb_class_new_instance(0, 0, array_class); *result = NIL_P(array_class) ? rb_ary_new() : rb_class_new_instance(0, 0, array_class);
#line 1206 "parser.c" #line 1186 "parser.c"
{ {
cs = JSON_array_start; cs = JSON_array_start;
} }
#line 445 "parser.rl" #line 425 "parser.rl"
#line 1213 "parser.c" #line 1193 "parser.c"
{ {
if ( p == pe ) if ( p == pe )
goto _test_eof; goto _test_eof;
@ -1226,7 +1228,7 @@ case 2:
goto st2; goto st2;
goto st0; goto st0;
tr2: tr2:
#line 409 "parser.rl" #line 389 "parser.rl"
{ {
VALUE v = Qnil; VALUE v = Qnil;
char *np = JSON_parse_value(json, p, pe, &v, current_nesting); char *np = JSON_parse_value(json, p, pe, &v, current_nesting);
@ -1246,7 +1248,7 @@ st3:
if ( ++p == pe ) if ( ++p == pe )
goto _test_eof3; goto _test_eof3;
case 3: case 3:
#line 1272 "parser.c" #line 1252 "parser.c"
switch( (*p) ) { switch( (*p) ) {
case 13: goto st3; case 13: goto st3;
case 32: goto st3; case 32: goto st3;
@ -1346,14 +1348,14 @@ case 12:
goto st3; goto st3;
goto st12; goto st12;
tr4: tr4:
#line 424 "parser.rl" #line 404 "parser.rl"
{ p--; {p++; cs = 17; goto _out;} } { p--; {p++; cs = 17; goto _out;} }
goto st17; goto st17;
st17: st17:
if ( ++p == pe ) if ( ++p == pe )
goto _test_eof17; goto _test_eof17;
case 17: case 17:
#line 1379 "parser.c" #line 1359 "parser.c"
goto st0; goto st0;
st13: st13:
if ( ++p == pe ) if ( ++p == pe )
@ -1409,7 +1411,7 @@ case 16:
_out: {} _out: {}
} }
#line 446 "parser.rl" #line 426 "parser.rl"
if(cs >= JSON_array_first_final) { if(cs >= JSON_array_first_final) {
return p + 1; return p + 1;
@ -1482,9 +1484,19 @@ static VALUE json_string_unescape(char *string, char *stringEnd, int intern, int
"incomplete unicode character escape sequence at '%s'", p "incomplete unicode character escape sequence at '%s'", p
); );
} else { } else {
UTF32 ch = unescape_unicode((unsigned char *) ++pe); uint32_t ch = unescape_unicode((unsigned char *) ++pe);
pe += 3; pe += 3;
if (UNI_SUR_HIGH_START == (ch & 0xFC00)) { /* To handle values above U+FFFF, we take a sequence of
* \uXXXX escapes in the U+D800..U+DBFF then
* U+DC00..U+DFFF ranges, take the low 10 bits from each
* to make a 20-bit number, then add 0x10000 to get the
* final codepoint.
*
* See Unicode 15: §3.8 "Surrogates", §5.3 "Handling
* Surrogate Pairs in UTF-16", and §23.6 "Surrogates
* Area".
*/
if ((ch & 0xFC00) == 0xD800) {
pe++; pe++;
if (pe > stringEnd - 6) { if (pe > stringEnd - 6) {
if (bufferSize > MAX_STACK_BUFFER_SIZE) { if (bufferSize > MAX_STACK_BUFFER_SIZE) {
@ -1496,7 +1508,7 @@ static VALUE json_string_unescape(char *string, char *stringEnd, int intern, int
); );
} }
if (pe[0] == '\\' && pe[1] == 'u') { if (pe[0] == '\\' && pe[1] == 'u') {
UTF32 sur = unescape_unicode((unsigned char *) pe + 2); uint32_t sur = unescape_unicode((unsigned char *) pe + 2);
ch = (((ch & 0x3F) << 10) | ((((ch >> 6) & 0xF) + 1) << 16) ch = (((ch & 0x3F) << 10) | ((((ch >> 6) & 0xF) + 1) << 16)
| (sur & 0x3FF)); | (sur & 0x3FF));
pe += 5; pe += 5;
@ -1566,7 +1578,7 @@ static VALUE json_string_unescape(char *string, char *stringEnd, int intern, int
} }
#line 1592 "parser.c" #line 1582 "parser.c"
enum {JSON_string_start = 1}; enum {JSON_string_start = 1};
enum {JSON_string_first_final = 8}; enum {JSON_string_first_final = 8};
enum {JSON_string_error = 0}; enum {JSON_string_error = 0};
@ -1574,7 +1586,7 @@ enum {JSON_string_error = 0};
enum {JSON_string_en_main = 1}; enum {JSON_string_en_main = 1};
#line 620 "parser.rl" #line 610 "parser.rl"
static int static int
@ -1595,15 +1607,15 @@ static char *JSON_parse_string(JSON_Parser *json, char *p, char *pe, VALUE *resu
VALUE match_string; VALUE match_string;
#line 1621 "parser.c" #line 1611 "parser.c"
{ {
cs = JSON_string_start; cs = JSON_string_start;
} }
#line 640 "parser.rl" #line 630 "parser.rl"
json->memo = p; json->memo = p;
#line 1629 "parser.c" #line 1619 "parser.c"
{ {
if ( p == pe ) if ( p == pe )
goto _test_eof; goto _test_eof;
@ -1628,7 +1640,7 @@ case 2:
goto st0; goto st0;
goto st2; goto st2;
tr2: tr2:
#line 607 "parser.rl" #line 597 "parser.rl"
{ {
*result = json_string_unescape(json->memo + 1, p, json->parsing_name || json-> freeze, json->parsing_name && json->symbolize_names); *result = json_string_unescape(json->memo + 1, p, json->parsing_name || json-> freeze, json->parsing_name && json->symbolize_names);
if (NIL_P(*result)) { if (NIL_P(*result)) {
@ -1638,14 +1650,14 @@ tr2:
{p = (( p + 1))-1;} {p = (( p + 1))-1;}
} }
} }
#line 617 "parser.rl" #line 607 "parser.rl"
{ p--; {p++; cs = 8; goto _out;} } { p--; {p++; cs = 8; goto _out;} }
goto st8; goto st8;
st8: st8:
if ( ++p == pe ) if ( ++p == pe )
goto _test_eof8; goto _test_eof8;
case 8: case 8:
#line 1671 "parser.c" #line 1661 "parser.c"
goto st0; goto st0;
st3: st3:
if ( ++p == pe ) if ( ++p == pe )
@ -1721,7 +1733,7 @@ case 7:
_out: {} _out: {}
} }
#line 642 "parser.rl" #line 632 "parser.rl"
if (json->create_additions && RTEST(match_string = json->match_string)) { if (json->create_additions && RTEST(match_string = json->match_string)) {
VALUE klass; VALUE klass;
@ -1755,6 +1767,7 @@ case 7:
static VALUE convert_encoding(VALUE source) static VALUE convert_encoding(VALUE source)
{ {
#ifdef HAVE_RUBY_ENCODING_H
rb_encoding *enc = rb_enc_get(source); rb_encoding *enc = rb_enc_get(source);
if (enc == rb_ascii8bit_encoding()) { if (enc == rb_ascii8bit_encoding()) {
if (OBJ_FROZEN(source)) { if (OBJ_FROZEN(source)) {
@ -1764,7 +1777,8 @@ static VALUE convert_encoding(VALUE source)
} else { } else {
source = rb_str_conv_enc(source, rb_enc_get(source), rb_utf8_encoding()); source = rb_str_conv_enc(source, rb_enc_get(source), rb_utf8_encoding());
} }
return source; #endif
return source;
} }
/* /*
@ -1892,7 +1906,7 @@ static VALUE cParser_initialize(int argc, VALUE *argv, VALUE self)
} }
#line 1920 "parser.c" #line 1910 "parser.c"
enum {JSON_start = 1}; enum {JSON_start = 1};
enum {JSON_first_final = 10}; enum {JSON_first_final = 10};
enum {JSON_error = 0}; enum {JSON_error = 0};
@ -1900,7 +1914,7 @@ enum {JSON_error = 0};
enum {JSON_en_main = 1}; enum {JSON_en_main = 1};
#line 828 "parser.rl" #line 818 "parser.rl"
/* /*
@ -1918,16 +1932,16 @@ static VALUE cParser_parse(VALUE self)
GET_PARSER; GET_PARSER;
#line 1946 "parser.c" #line 1936 "parser.c"
{ {
cs = JSON_start; cs = JSON_start;
} }
#line 845 "parser.rl" #line 835 "parser.rl"
p = json->source; p = json->source;
pe = p + json->len; pe = p + json->len;
#line 1955 "parser.c" #line 1945 "parser.c"
{ {
if ( p == pe ) if ( p == pe )
goto _test_eof; goto _test_eof;
@ -1961,7 +1975,7 @@ st0:
cs = 0; cs = 0;
goto _out; goto _out;
tr2: tr2:
#line 820 "parser.rl" #line 810 "parser.rl"
{ {
char *np = JSON_parse_value(json, p, pe, &result, 0); char *np = JSON_parse_value(json, p, pe, &result, 0);
if (np == NULL) { p--; {p++; cs = 10; goto _out;} } else {p = (( np))-1;} if (np == NULL) { p--; {p++; cs = 10; goto _out;} } else {p = (( np))-1;}
@ -1971,7 +1985,7 @@ st10:
if ( ++p == pe ) if ( ++p == pe )
goto _test_eof10; goto _test_eof10;
case 10: case 10:
#line 1999 "parser.c" #line 1989 "parser.c"
switch( (*p) ) { switch( (*p) ) {
case 13: goto st10; case 13: goto st10;
case 32: goto st10; case 32: goto st10;
@ -2060,7 +2074,7 @@ case 9:
_out: {} _out: {}
} }
#line 848 "parser.rl" #line 838 "parser.rl"
if (cs >= JSON_first_final && p == pe) { if (cs >= JSON_first_final && p == pe) {
return result; return result;

View File

@ -48,8 +48,8 @@ typedef struct JSON_ParserStruct {
#define MinusInfinity "-Infinity" #define MinusInfinity "-Infinity"
#define EVIL 0x666 #define EVIL 0x666
static UTF32 unescape_unicode(const unsigned char *p); static uint32_t unescape_unicode(const unsigned char *p);
static int convert_UTF32_to_UTF8(char *buf, UTF32 ch); static int convert_UTF32_to_UTF8(char *buf, uint32_t ch);
static char *JSON_parse_object(JSON_Parser *json, char *p, char *pe, VALUE *result, int current_nesting); static char *JSON_parse_object(JSON_Parser *json, char *p, char *pe, VALUE *result, int current_nesting);
static char *JSON_parse_value(JSON_Parser *json, char *p, char *pe, VALUE *result, int current_nesting); static char *JSON_parse_value(JSON_Parser *json, char *p, char *pe, VALUE *result, int current_nesting);
static char *JSON_parse_integer(JSON_Parser *json, char *p, char *pe, VALUE *result); static char *JSON_parse_integer(JSON_Parser *json, char *p, char *pe, VALUE *result);

View File

@ -20,26 +20,28 @@ static const signed char digit_values[256] = {
-1, -1, -1, -1, -1, -1, -1 -1, -1, -1, -1, -1, -1, -1
}; };
static UTF32 unescape_unicode(const unsigned char *p) static uint32_t unescape_unicode(const unsigned char *p)
{ {
const uint32_t replacement_char = 0xFFFD;
signed char b; signed char b;
UTF32 result = 0; uint32_t result = 0;
b = digit_values[p[0]]; b = digit_values[p[0]];
if (b < 0) return UNI_REPLACEMENT_CHAR; if (b < 0) return replacement_char;
result = (result << 4) | (unsigned char)b; result = (result << 4) | (unsigned char)b;
b = digit_values[p[1]]; b = digit_values[p[1]];
if (b < 0) return UNI_REPLACEMENT_CHAR; if (b < 0) return replacement_char;
result = (result << 4) | (unsigned char)b; result = (result << 4) | (unsigned char)b;
b = digit_values[p[2]]; b = digit_values[p[2]];
if (b < 0) return UNI_REPLACEMENT_CHAR; if (b < 0) return replacement_char;
result = (result << 4) | (unsigned char)b; result = (result << 4) | (unsigned char)b;
b = digit_values[p[3]]; b = digit_values[p[3]];
if (b < 0) return UNI_REPLACEMENT_CHAR; if (b < 0) return replacement_char;
result = (result << 4) | (unsigned char)b; result = (result << 4) | (unsigned char)b;
return result; return result;
} }
static int convert_UTF32_to_UTF8(char *buf, UTF32 ch) static int convert_UTF32_to_UTF8(char *buf, uint32_t ch)
{ {
int len = 1; int len = 1;
if (ch <= 0x7F) { if (ch <= 0x7F) {
@ -493,9 +495,19 @@ static VALUE json_string_unescape(char *string, char *stringEnd, int intern, int
"incomplete unicode character escape sequence at '%s'", p "incomplete unicode character escape sequence at '%s'", p
); );
} else { } else {
UTF32 ch = unescape_unicode((unsigned char *) ++pe); uint32_t ch = unescape_unicode((unsigned char *) ++pe);
pe += 3; pe += 3;
if (UNI_SUR_HIGH_START == (ch & 0xFC00)) { /* To handle values above U+FFFF, we take a sequence of
* \uXXXX escapes in the U+D800..U+DBFF then
* U+DC00..U+DFFF ranges, take the low 10 bits from each
* to make a 20-bit number, then add 0x10000 to get the
* final codepoint.
*
* See Unicode 15: §3.8 "Surrogates", §5.3 "Handling
* Surrogate Pairs in UTF-16", and §23.6 "Surrogates
* Area".
*/
if ((ch & 0xFC00) == 0xD800) {
pe++; pe++;
if (pe > stringEnd - 6) { if (pe > stringEnd - 6) {
if (bufferSize > MAX_STACK_BUFFER_SIZE) { if (bufferSize > MAX_STACK_BUFFER_SIZE) {
@ -507,7 +519,7 @@ static VALUE json_string_unescape(char *string, char *stringEnd, int intern, int
); );
} }
if (pe[0] == '\\' && pe[1] == 'u') { if (pe[0] == '\\' && pe[1] == 'u') {
UTF32 sur = unescape_unicode((unsigned char *) pe + 2); uint32_t sur = unescape_unicode((unsigned char *) pe + 2);
ch = (((ch & 0x3F) << 10) | ((((ch >> 6) & 0xF) + 1) << 16) ch = (((ch & 0x3F) << 10) | ((((ch >> 6) & 0xF) + 1) << 16)
| (sur & 0x3FF)); | (sur & 0x3FF));
pe += 5; pe += 5;