[ruby/json] parser.c: refactor raise_parse_error to have document start

https://github.com/ruby/json/commit/832b5b1a4c
This commit is contained in:
Jean Boussier 2025-05-12 13:06:05 +02:00 committed by Hiroshi SHIBATA
parent a93d9fdcd2
commit 8f008598c3
Notes: git 2025-05-13 05:12:37 +00:00

View File

@ -337,73 +337,6 @@ static size_t strnlen(const char *s, size_t maxlen)
}
#endif
#define PARSE_ERROR_FRAGMENT_LEN 32

/*
 * Raise JSON::ParserError through rb_enc_raise() using the UTF-8 encoding.
 *
 * format: message template; `start` (or a trimmed copy of it) is passed as
 *         its single argument.
 * start:  NUL-terminated text to quote in the message, may be NULL.
 *
 * When at least PARSE_ERROR_FRAGMENT_LEN bytes are available, the first
 * PARSE_ERROR_FRAGMENT_LEN bytes are copied to a local buffer and trimmed
 * backwards so the quoted fragment does not end inside a multibyte UTF-8
 * sequence. Shorter text is passed through untouched.
 */
#ifdef RBIMPL_ATTR_NORETURN
RBIMPL_ATTR_NORETURN()
#endif
static void raise_parse_error(const char *format, const char *start)
{
    unsigned char buffer[PARSE_ERROR_FRAGMENT_LEN + 1];

    size_t len = start ? strnlen(start, PARSE_ERROR_FRAGMENT_LEN) : 0;
    const char *ptr = start;

    if (len == PARSE_ERROR_FRAGMENT_LEN) {
        MEMCPY(buffer, start, char, PARSE_ERROR_FRAGMENT_LEN);

        // The `len > 0` guards matter for invalid UTF-8: a fragment made only
        // of continuation bytes would otherwise walk `len` down to 0 and then
        // read buffer[len - 1] out of bounds.
        while (len > 0 && buffer[len - 1] >= 0x80 && buffer[len - 1] < 0xC0) { // Is continuation byte
            len--;
        }

        if (len > 0 && buffer[len - 1] >= 0xC0) { // multibyte character start
            len--;
        }

        buffer[len] = '\0';
        ptr = (const char *)buffer;
    }

    rb_enc_raise(enc_utf8, rb_path2class("JSON::ParserError"), format, ptr);
}
/* unicode */
/*
 * Maps a byte to the value of the hexadecimal digit it represents
 * ('0'-'9', 'A'-'F', 'a'-'f'), or -1 when the byte is not a hex digit.
 * Laid out 16 entries per row, so the row comment is the byte's high nibble.
 */
static const signed char digit_values[256] = {
    /* 0x00 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    /* 0x10 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    /* 0x20 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    /* 0x30 */  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, -1, -1, -1, -1, -1, -1,
    /* 0x40 */ -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    /* 0x50 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    /* 0x60 */ -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    /* 0x70 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    /* 0x80 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    /* 0x90 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    /* 0xA0 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    /* 0xB0 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    /* 0xC0 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    /* 0xD0 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    /* 0xE0 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    /* 0xF0 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
};
/*
 * Decode the four hexadecimal digits at p[0..3] into a single value, most
 * significant digit first.
 *
 * Raises a parse error as soon as one of the four bytes is not a hex digit;
 * the reported fragment starts two bytes before `p`.
 */
static uint32_t unescape_unicode(const unsigned char *p)
{
    uint32_t code_unit = 0;

    for (int i = 0; i < 4; i++) {
        const signed char nibble = digit_values[p[i]];
        if (nibble < 0) {
            raise_parse_error("incomplete unicode character escape sequence at '%s'", (char *)p - 2);
        }
        code_unit = (code_unit << 4) | (unsigned char)nibble;
    }

    return code_unit;
}
static int convert_UTF32_to_UTF8(char *buf, uint32_t ch)
{
int len = 1;
@ -444,6 +377,7 @@ typedef struct JSON_ParserStruct {
typedef struct JSON_ParserStateStruct {
VALUE stack_handle;
const char *start;
const char *cursor;
const char *end;
rvalue_stack *stack;
@ -452,6 +386,83 @@ typedef struct JSON_ParserStateStruct {
int current_nesting;
} JSON_ParserState;
#define PARSE_ERROR_FRAGMENT_LEN 32

/*
 * Raise JSON::ParserError through rb_enc_raise() using the UTF-8 encoding.
 *
 * format: message template; the text at state->cursor (or a trimmed copy of
 *         it) is passed as its single argument.
 * state:  parser state; only state->cursor is read here, and it may be NULL.
 *
 * When at least PARSE_ERROR_FRAGMENT_LEN bytes are available at the cursor,
 * the first PARSE_ERROR_FRAGMENT_LEN bytes are copied to a local buffer and
 * trimmed backwards so the quoted fragment does not end inside a multibyte
 * UTF-8 sequence. Shorter text is passed through untouched.
 */
#ifdef RBIMPL_ATTR_NORETURN
RBIMPL_ATTR_NORETURN()
#endif
static void raise_parse_error(const char *format, JSON_ParserState *state)
{
    unsigned char buffer[PARSE_ERROR_FRAGMENT_LEN + 1];

    const char *ptr = state->cursor;
    size_t len = ptr ? strnlen(ptr, PARSE_ERROR_FRAGMENT_LEN) : 0;

    if (len == PARSE_ERROR_FRAGMENT_LEN) {
        MEMCPY(buffer, ptr, char, PARSE_ERROR_FRAGMENT_LEN);

        // The `len > 0` guards matter for invalid UTF-8: a fragment made only
        // of continuation bytes would otherwise walk `len` down to 0 and then
        // read buffer[len - 1] out of bounds.
        while (len > 0 && buffer[len - 1] >= 0x80 && buffer[len - 1] < 0xC0) { // Is continuation byte
            len--;
        }

        if (len > 0 && buffer[len - 1] >= 0xC0) { // multibyte character start
            len--;
        }

        buffer[len] = '\0';
        ptr = (const char *)buffer;
    }

    rb_enc_raise(enc_utf8, rb_path2class("JSON::ParserError"), format, ptr);
}
/*
 * Raise a parse error whose quoted fragment starts at `at` instead of the
 * current cursor position.
 *
 * Note: this overwrites state->cursor with `at` before delegating to
 * raise_parse_error(), which reads the fragment from state->cursor.
 */
#ifdef RBIMPL_ATTR_NORETURN
RBIMPL_ATTR_NORETURN()
#endif
static void raise_parse_error_at(const char *format, JSON_ParserState *state, const char *at)
{
state->cursor = at;
raise_parse_error(format, state);
}
/* unicode */
/*
 * Maps a byte to the value of the hexadecimal digit it represents
 * ('0'-'9', 'A'-'F', 'a'-'f'), or -1 when the byte is not a hex digit.
 * Laid out 16 entries per row, so the row comment is the byte's high nibble.
 */
static const signed char digit_values[256] = {
    /* 0x00 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    /* 0x10 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    /* 0x20 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    /* 0x30 */  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, -1, -1, -1, -1, -1, -1,
    /* 0x40 */ -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    /* 0x50 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    /* 0x60 */ -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    /* 0x70 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    /* 0x80 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    /* 0x90 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    /* 0xA0 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    /* 0xB0 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    /* 0xC0 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    /* 0xD0 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    /* 0xE0 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
    /* 0xF0 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
};
/*
 * Decode the four hexadecimal digits at p[0..3] into a single value, most
 * significant digit first.
 *
 * Raises a parse error (via raise_parse_error_at) as soon as one of the four
 * bytes is not a hex digit; the reported position is two bytes before `p`.
 */
static uint32_t unescape_unicode(JSON_ParserState *state, const unsigned char *p)
{
    uint32_t code_unit = 0;

    for (int i = 0; i < 4; i++) {
        const signed char nibble = digit_values[p[i]];
        if (nibble < 0) {
            raise_parse_error_at("incomplete unicode character escape sequence at '%s'", state, (char *)p - 2);
        }
        code_unit = (code_unit << 4) | (unsigned char)nibble;
    }

    return code_unit;
}
/*
 * Declares a local `JSON_ParserConfig *config` and fills it from `self`
 * using TypedData_Get_Struct with JSON_ParserConfig_type. Expands to a
 * declaration plus a statement, so it needs a `self` in scope and must be
 * used where a declaration is allowed.
 */
#define GET_PARSER_CONFIG \
JSON_ParserConfig *config; \
TypedData_Get_Struct(self, JSON_ParserConfig, &JSON_ParserConfig_type, config)
@ -485,8 +496,7 @@ json_eat_comments(JSON_ParserState *state)
while (true) {
state->cursor = memchr(state->cursor, '*', state->end - state->cursor);
if (!state->cursor) {
state->cursor = state->end;
raise_parse_error("unexpected end of input, expected closing '*/'", state->cursor);
raise_parse_error_at("unexpected end of input, expected closing '*/'", state, state->end);
} else {
state->cursor++;
if (state->cursor < state->end && *state->cursor == '/') {
@ -498,11 +508,11 @@ json_eat_comments(JSON_ParserState *state)
break;
}
default:
raise_parse_error("unexpected token at '%s'", state->cursor);
raise_parse_error("unexpected token at '%s'", state);
break;
}
} else {
raise_parse_error("unexpected token at '%s'", state->cursor);
raise_parse_error("unexpected token at '%s'", state);
}
}
@ -621,9 +631,9 @@ static VALUE json_string_unescape(JSON_ParserState *state, const char *string, c
break;
case 'u':
if (pe > stringEnd - 5) {
raise_parse_error("incomplete unicode character escape sequence at '%s'", p);
raise_parse_error_at("incomplete unicode character escape sequence at '%s'", state, p);
} else {
uint32_t ch = unescape_unicode((unsigned char *) ++pe);
uint32_t ch = unescape_unicode(state, (unsigned char *) ++pe);
pe += 3;
/* To handle values above U+FFFF, we take a sequence of
* \uXXXX escapes in the U+D800..U+DBFF then
@ -638,10 +648,10 @@ static VALUE json_string_unescape(JSON_ParserState *state, const char *string, c
if ((ch & 0xFC00) == 0xD800) {
pe++;
if (pe > stringEnd - 6) {
raise_parse_error("incomplete surrogate pair at '%s'", p);
raise_parse_error_at("incomplete surrogate pair at '%s'", state, p);
}
if (pe[0] == '\\' && pe[1] == 'u') {
uint32_t sur = unescape_unicode((unsigned char *) pe + 2);
uint32_t sur = unescape_unicode(state, (unsigned char *) pe + 2);
ch = (((ch & 0x3F) << 10) | ((((ch >> 6) & 0xF) + 1) << 16)
| (sur & 0x3FF));
pe += 5;
@ -829,12 +839,12 @@ static inline VALUE json_parse_string(JSON_ParserState *state, JSON_ParserConfig
state->cursor++;
escaped = true;
if ((unsigned char)*state->cursor < 0x20) {
raise_parse_error("invalid ASCII control character in string: %s", state->cursor);
raise_parse_error("invalid ASCII control character in string: %s", state);
}
break;
}
default:
raise_parse_error("invalid ASCII control character in string: %s", state->cursor);
raise_parse_error("invalid ASCII control character in string: %s", state);
break;
}
}
@ -842,7 +852,7 @@ static inline VALUE json_parse_string(JSON_ParserState *state, JSON_ParserConfig
state->cursor++;
}
raise_parse_error("unexpected end of input, expected closing \"", state->cursor);
raise_parse_error("unexpected end of input, expected closing \"", state);
return Qfalse;
}
@ -850,7 +860,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
{
json_eat_whitespace(state);
if (state->cursor >= state->end) {
raise_parse_error("unexpected end of input", state->cursor);
raise_parse_error("unexpected end of input", state);
}
switch (*state->cursor) {
@ -860,7 +870,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
return json_push_value(state, config, Qnil);
}
raise_parse_error("unexpected token at '%s'", state->cursor);
raise_parse_error("unexpected token at '%s'", state);
break;
case 't':
if ((state->end - state->cursor >= 4) && (memcmp(state->cursor, "true", 4) == 0)) {
@ -868,7 +878,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
return json_push_value(state, config, Qtrue);
}
raise_parse_error("unexpected token at '%s'", state->cursor);
raise_parse_error("unexpected token at '%s'", state);
break;
case 'f':
// Note: memcmp with a small power of two compile to an integer comparison
@ -877,7 +887,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
return json_push_value(state, config, Qfalse);
}
raise_parse_error("unexpected token at '%s'", state->cursor);
raise_parse_error("unexpected token at '%s'", state);
break;
case 'N':
// Note: memcmp with a small power of two compile to an integer comparison
@ -886,7 +896,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
return json_push_value(state, config, CNaN);
}
raise_parse_error("unexpected token at '%s'", state->cursor);
raise_parse_error("unexpected token at '%s'", state);
break;
case 'I':
if (config->allow_nan && (state->end - state->cursor >= 8) && (memcmp(state->cursor, "Infinity", 8) == 0)) {
@ -894,7 +904,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
return json_push_value(state, config, CInfinity);
}
raise_parse_error("unexpected token at '%s'", state->cursor);
raise_parse_error("unexpected token at '%s'", state);
break;
case '-':
// Note: memcmp with a small power of two compile to an integer comparison
@ -903,7 +913,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
state->cursor += 9;
return json_push_value(state, config, CMinusInfinity);
} else {
raise_parse_error("unexpected token at '%s'", state->cursor);
raise_parse_error("unexpected token at '%s'", state);
}
}
// Fallthrough
@ -921,11 +931,11 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
long integer_length = state->cursor - start;
if (RB_UNLIKELY(start[0] == '0' && integer_length > 1)) {
raise_parse_error("invalid number: %s", start);
raise_parse_error_at("invalid number: %s", state, start);
} else if (RB_UNLIKELY(integer_length > 2 && start[0] == '-' && start[1] == '0')) {
raise_parse_error("invalid number: %s", start);
raise_parse_error_at("invalid number: %s", state, start);
} else if (RB_UNLIKELY(integer_length == 1 && start[0] == '-')) {
raise_parse_error("invalid number: %s", start);
raise_parse_error_at("invalid number: %s", state, start);
}
if ((state->cursor < state->end) && (*state->cursor == '.')) {
@ -933,7 +943,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
state->cursor++;
if (state->cursor == state->end || *state->cursor < '0' || *state->cursor > '9') {
raise_parse_error("invalid number: %s", state->cursor);
raise_parse_error("invalid number: %s", state);
}
while ((state->cursor < state->end) && (*state->cursor >= '0') && (*state->cursor <= '9')) {
@ -949,7 +959,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
}
if (state->cursor == state->end || *state->cursor < '0' || *state->cursor > '9') {
raise_parse_error("invalid number: %s", state->cursor);
raise_parse_error("invalid number: %s", state);
}
while ((state->cursor < state->end) && (*state->cursor >= '0') && (*state->cursor <= '9')) {
@ -1009,7 +1019,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
}
}
raise_parse_error("expected ',' or ']' after array value", state->cursor);
raise_parse_error("expected ',' or ']' after array value", state);
}
break;
}
@ -1028,13 +1038,13 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
}
if (*state->cursor != '"') {
raise_parse_error("expected object key, got '%s", state->cursor);
raise_parse_error("expected object key, got '%s", state);
}
json_parse_string(state, config, true);
json_eat_whitespace(state);
if ((state->cursor >= state->end) || (*state->cursor != ':')) {
raise_parse_error("expected ':' after object key", state->cursor);
raise_parse_error("expected ':' after object key", state);
}
state->cursor++;
@ -1063,13 +1073,13 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
}
if (*state->cursor != '"') {
raise_parse_error("expected object key, got: '%s'", state->cursor);
raise_parse_error("expected object key, got: '%s'", state);
}
json_parse_string(state, config, true);
json_eat_whitespace(state);
if ((state->cursor >= state->end) || (*state->cursor != ':')) {
raise_parse_error("expected ':' after object key, got: '%s", state->cursor);
raise_parse_error("expected ':' after object key, got: '%s", state);
}
state->cursor++;
@ -1079,24 +1089,24 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
}
}
raise_parse_error("expected ',' or '}' after object value, got: '%s'", state->cursor);
raise_parse_error("expected ',' or '}' after object value, got: '%s'", state);
}
break;
}
default:
raise_parse_error("unexpected character: '%s'", state->cursor);
raise_parse_error("unexpected character: '%s'", state);
break;
}
raise_parse_error("unreacheable: '%s'", state->cursor);
raise_parse_error("unreacheable: '%s'", state);
}
static void json_ensure_eof(JSON_ParserState *state)
{
json_eat_whitespace(state);
if (state->cursor != state->end) {
raise_parse_error("unexpected token at end of stream '%s'", state->cursor);
raise_parse_error("unexpected token at end of stream '%s'", state);
}
}
@ -1232,9 +1242,14 @@ static VALUE cParser_parse(JSON_ParserConfig *config, VALUE Vsource)
.capa = RVALUE_STACK_INITIAL_CAPA,
};
long len;
const char *start;
RSTRING_GETMEM(Vsource, start, len);
JSON_ParserState _state = {
.cursor = RSTRING_PTR(Vsource),
.end = RSTRING_END(Vsource),
.start = start,
.cursor = start,
.end = start + len,
.stack = &stack,
};
JSON_ParserState *state = &_state;