[ruby/json] Refactor further to expose the simpler escape search possible

https://github.com/ruby/json/commit/e03515ac8b
This commit is contained in:
Jean Boussier 2025-01-31 20:09:12 +01:00 committed by Hiroshi SHIBATA
parent 98e1c2845a
commit 98c56de823

View File

@ -103,70 +103,6 @@ static void raise_generator_error(VALUE invalid_object, const char *fmt, ...)
// Layout of one escape-table entry:
//   - low three bits (CHAR_LENGTH_MASK): byte length of the character,
//     extracted with `ch_len & CHAR_LENGTH_MASK` by the search functions;
//   - bit 3 (ESCAPE_MASK): set when the search must stop at this byte and
//     report it to the caller for escaping.
// So 9 == ESCAPE_MASK | 1 and 11 == ESCAPE_MASK | 3.
static const unsigned char CHAR_LENGTH_MASK = 7;
static const unsigned char ESCAPE_MASK = 8;
// Lookup table indexed by byte value for the default (neither ascii_only nor
// script_safe) mode. A value of 9 (ESCAPE_MASK | 1) marks a single byte that
// must be escaped; 0 marks a byte that needs no escaping.
// Only the first 128 entries are spelled out: the initializer is shorter than
// the array, so entries for bytes 0x80-0xFF are implicitly zero.
static const unsigned char escape_table[256] = {
// ASCII Control Characters
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
// ASCII Characters
0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // '"'
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, // '\\'
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
// Lookup table indexed by byte value for the ascii_only mode.
// ASCII half: 9 (ESCAPE_MASK | 1) marks a single byte that must be escaped,
// 0 a byte that needs no escaping.
// Upper half (0x80-0xFF): the entry is the byte length associated with that
// byte (1 for continuation bytes, 2-6 for lead bytes); these values do not
// have ESCAPE_MASK set. 0xFE and 0xFF carry 9, i.e. ESCAPE_MASK with length 1.
static const unsigned char ascii_only_escape_table[256] = {
// ASCII Control Characters
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
// ASCII Characters
0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // '"'
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, // '\\'
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
// Continuation byte
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
// First byte of a 2-byte code point
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
// First byte of a 3-byte code point
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
// First byte of a 4+ byte code point (0xF0-0xF7: 4, 0xF8-0xFB: 5, 0xFC-0xFD: 6, 0xFE-0xFF: 9)
4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 9, 9,
};
// Lookup table indexed by byte value for the script_safe mode.
// Compared with the ASCII-only layout, two entries differ:
//   - '/' (0x2F) is 9, so forward slashes are reported for escaping;
//   - 0xE2 is 11 (ESCAPE_MASK | 3): a 3-byte lead that *may* start U+2028 or
//     U+2029. The search code special-cases `ch_len == 11` and inspects the
//     two following bytes before deciding whether to escape.
// Other entries for 0x80-0xFF hold the byte length associated with the byte.
static const unsigned char script_safe_escape_table[256] = {
// ASCII Control Characters
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
// ASCII Characters
0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, // '"' and '/'
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, // '\\'
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
// Continuation byte
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
// First byte of a 2-byte code point
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
// First byte of a 3-byte code point
3, 3,11, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xE2 is the start of \u2028 and \u2029
// First byte of a 4+ byte code point (0xF0-0xF7: 4, 0xF8-0xFB: 5, 0xFC-0xFD: 6, 0xFE-0xFF: 9)
4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 9, 9,
};
typedef struct _search_state {
const char *ptr;
const char *end;
@ -180,26 +116,25 @@ static inline void search_flush(search_state *search)
search->cursor = search->ptr;
}
static inline unsigned char search_escape(search_state *search, const unsigned char escape_table[256])
// Lookup table indexed by byte value for the basic escape search.
// A non-zero entry (9) marks a byte that must be escaped: the ASCII control
// characters 0x00-0x1F, '"' and '\\'. search_escape_basic() indexes this
// table with the current byte and tests the entry for non-zero.
// Only the first 128 entries are spelled out: the initializer is shorter than
// the array, so entries for bytes 0x80-0xFF are implicitly zero.
static const unsigned char escape_table_basic[256] = {
// ASCII Control Characters
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
// ASCII Characters
0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // '"'
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, // '\\'
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
};
static inline unsigned char search_escape_basic(search_state *search)
{
while (search->ptr < search->end) {
unsigned char ch = (unsigned char)*search->ptr;
unsigned char ch_len = escape_table[ch];
if (RB_UNLIKELY(ch_len)) {
if (ch_len & ESCAPE_MASK) {
if (RB_UNLIKELY(ch_len == 11)) {
const unsigned char *uptr = (const unsigned char *)search->ptr;
if (!(uptr[1] == 0x80 && (uptr[2] >> 1) == 0x54)) {
search->ptr += 3;
continue;
}
}
if (RB_UNLIKELY(escape_table_basic[(const unsigned char)*search->ptr])) {
search_flush(search);
return ch_len & CHAR_LENGTH_MASK;
} else {
search->ptr += ch_len;
}
return 1;
} else {
search->ptr++;
}
@ -208,7 +143,54 @@ static inline unsigned char search_escape(search_state *search, const unsigned c
return 0;
}
static inline void fast_escape_UTF8_char(search_state *search, unsigned char ch_len) {
/* Write the JSON escape for the single byte at search->ptr into
 * search->buffer, then advance search->ptr by one and move search->cursor
 * to the same position.
 *
 * Bytes with a dedicated two-character escape ('"', '\\', '/', '\b', '\f',
 * '\n', '\r', '\t') use it; any other byte is written as a six-character
 * \u00XX sequence with lowercase hex digits.
 */
static inline void escape_UTF8_char_basic(search_state *search) {
    const unsigned char byte = (unsigned char)*search->ptr;
    /* Pre-filled for the \u00XX form; the short forms only use out[0..1]. */
    char out[6] = { '\\', 'u', '0', '0', 0, 0 };
    int out_len = 2;

    switch (byte) {
        case '"':  out[1] = '"';  break;
        case '\\': out[1] = '\\'; break;
        case '/':  out[1] = '/';  break;
        case '\b': out[1] = 'b';  break;
        case '\f': out[1] = 'f';  break;
        case '\n': out[1] = 'n';  break;
        case '\r': out[1] = 'r';  break;
        case '\t': out[1] = 't';  break;
        default: {
            const char *digits = "0123456789abcdef";
            out[4] = digits[(byte >> 4) & 0xf];
            out[5] = digits[byte & 0xf];
            out_len = 6;
            break;
        }
    }

    fbuffer_append(search->buffer, out, out_len);

    search->ptr++;
    search->cursor = search->ptr;
}
/* Basic UTF-8 to JSON string conversion (the wrapping '"' characters are
 * not produced here).
 *
 * Drives the scan over `search`: search_escape_basic() is called until it
 * returns 0, and after every non-zero result escape_UTF8_char_basic()
 * writes the escape for the byte at search->ptr.
 *
 * Characters escaped by this variant are the ones flagged in
 * escape_table_basic: ASCII control characters (0x00-0x1F), dquote, and
 * backslash. Everything else (should be UTF-8) is just passed through.
 *
 * This function takes no escape table: the ascii_only and script_safe
 * modes are dispatched to their own converters by generate_json_string().
 */
static inline void convert_UTF8_to_JSON(search_state *search)
{
    for (;;) {
        if (!search_escape_basic(search)) {
            break;
        }
        escape_UTF8_char_basic(search);
    }
}
static inline void escape_UTF8_char(search_state *search, unsigned char ch_len) {
const unsigned char ch = (unsigned char)*search->ptr;
switch (ch_len) {
case 1: {
@ -244,30 +226,92 @@ static inline void fast_escape_UTF8_char(search_state *search, unsigned char ch_
search->cursor = (search->ptr += ch_len);
}
/* Converts in_string to a JSON string (without the wrapping '"'
* characters) in FBuffer out_buffer.
*
* Characters are JSON-escaped according to:
*
* - Always: ASCII control characters (0x00-0x1F), dquote, and
* backslash.
*
* - If out_ascii_only: non-ASCII characters (>0x7F)
*
* - If script_safe: forwardslash (/), line separator (U+2028), and
* paragraph separator (U+2029)
*
* Everything else (should be UTF-8) is just passed through and
* appended to the result.
*/
static inline void convert_UTF8_to_JSON(search_state *search, const unsigned char escape_table[256])
// Lookup table indexed by byte value, used by search_script_safe_escape().
// Entry layout: low three bits are a byte length, bit 3 (ESCAPE_MASK) means
// "stop here and escape".
//   - 9  (ESCAPE_MASK | 1): single byte to escape: control characters, '"',
//     '/', '\\', and the bytes 0xFE / 0xFF.
//   - 11 (ESCAPE_MASK | 3): only 0xE2, a 3-byte lead that may start U+2028 or
//     U+2029; the search inspects the next two bytes before escaping.
//   - 1-6 without ESCAPE_MASK: number of bytes the search skips in one step.
//   - 0: plain ASCII byte, skipped one at a time.
static const unsigned char script_safe_escape_table[256] = {
// ASCII Control Characters
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
// ASCII Characters
0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, // '"' and '/'
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, // '\\'
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
// Continuation byte
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
// First byte of a 2-byte code point
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
// First byte of a 3-byte code point
3, 3,11, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // 0xE2 is the start of \u2028 and \u2029
// First byte of a 4+ byte code point (0xF0-0xF7: 4, 0xF8-0xFB: 5, 0xFC-0xFD: 6, 0xFE-0xFF: 9)
4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 9, 9,
};
/* Advance search->ptr to the next character that must be escaped in
 * script_safe mode, using script_safe_escape_table.
 *
 * Returns the byte length of that character (the table entry masked with
 * CHAR_LENGTH_MASK) after calling search_flush(), with search->ptr left
 * pointing at the character. Returns 0, also after search_flush(), once
 * search->ptr has reached search->end.
 *
 * The 0xE2 lead byte (table entry 11) is only reported when it is followed
 * by 0x80 and then 0xA8 or 0xA9, i.e. U+2028 / U+2029; otherwise the three
 * bytes are skipped. The look-ahead is not bounds-checked against
 * search->end; the visible caller only reaches this function on the
 * ENC_CODERANGE_VALID path.
 */
static inline unsigned char search_script_safe_escape(search_state *search)
{
    while (search->ptr < search->end) {
        const unsigned char *bytes = (const unsigned char *)search->ptr;
        const unsigned char entry = script_safe_escape_table[bytes[0]];

        if (RB_UNLIKELY(entry)) {
            if (!(entry & ESCAPE_MASK)) {
                /* Multi-byte sequence that never needs escaping: skip it whole. */
                search->ptr += entry;
                continue;
            }

            if (RB_UNLIKELY(entry == 11)) {
                /* 0xA8 >> 1 == 0xA9 >> 1 == 0x54, so one test covers both separators. */
                const int is_separator = bytes[1] == 0x80 && (bytes[2] >> 1) == 0x54;
                if (!is_separator) {
                    search->ptr += 3;
                    continue;
                }
            }

            search_flush(search);
            return entry & CHAR_LENGTH_MASK;
        }

        search->ptr++;
    }

    search_flush(search);
    return 0;
}
static void convert_UTF8_to_script_safe_JSON(search_state *search)
{
unsigned char ch_len;
while ((ch_len = search_escape(search, escape_table))) {
fast_escape_UTF8_char(search, ch_len);
while ((ch_len = search_script_safe_escape(search))) {
escape_UTF8_char(search, ch_len);
}
}
// Lookup table indexed by byte value for the ascii_only mode.
// Entry layout: low three bits are a byte length, bit 3 (ESCAPE_MASK) is the
// escape flag.
//   - 9 (ESCAPE_MASK | 1): control characters, '"', '\\', and 0xFE / 0xFF.
//   - 1-6 without ESCAPE_MASK: byte length associated with a byte >= 0x80
//     (1 for continuation bytes, 2-6 for lead bytes).
//   - 0: plain ASCII byte that needs no escaping.
static const unsigned char ascii_only_escape_table[256] = {
// ASCII Control Characters
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
// ASCII Characters
0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // '"'
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9, 0, 0, 0, // '\\'
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
// Continuation byte
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
// First byte of a 2-byte code point
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
// First byte of a 3-byte code point
3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
// First byte of a 4+ byte code point (0xF0-0xF7: 4, 0xF8-0xFB: 5, 0xFC-0xFD: 6, 0xFE-0xFF: 9)
4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 9, 9,
};
static inline unsigned char search_ascii_only_escape(search_state *search, const unsigned char escape_table[256])
{
while (search->ptr < search->end) {
@ -934,8 +978,10 @@ static void generate_json_string(FBuffer *buffer, struct generate_json_data *dat
case ENC_CODERANGE_VALID:
if (RB_UNLIKELY(state->ascii_only)) {
convert_UTF8_to_ASCII_only_JSON(&search, state->script_safe ? script_safe_escape_table : ascii_only_escape_table);
} else if (RB_UNLIKELY(state->script_safe)) {
convert_UTF8_to_script_safe_JSON(&search);
} else {
convert_UTF8_to_JSON(&search, state->script_safe ? script_safe_escape_table : escape_table);
convert_UTF8_to_JSON(&search);
}
break;
default: