[ruby/json] Further improve parsing errors

Report EOF instead of an empty fragment when the error occurs at the end of the input.

Also stop fragment extraction at the first whitespace character.

https://github.com/ruby/json/commit/cc1daba860
This commit is contained in:
Jean Boussier 2025-05-12 15:41:12 +02:00 committed by Hiroshi SHIBATA
parent 8cc1aa82f1
commit cd7495a1d0
Notes: git 2025-05-13 05:12:36 +00:00
3 changed files with 66 additions and 39 deletions

View File

@ -393,7 +393,7 @@ RBIMPL_ATTR_NORETURN()
#endif #endif
static void raise_parse_error(const char *format, JSON_ParserState *state) static void raise_parse_error(const char *format, JSON_ParserState *state)
{ {
unsigned char buffer[PARSE_ERROR_FRAGMENT_LEN + 1]; unsigned char buffer[PARSE_ERROR_FRAGMENT_LEN + 3];
const char *cursor = state->cursor; const char *cursor = state->cursor;
long column = 0; long column = 0;
@ -412,23 +412,35 @@ static void raise_parse_error(const char *format, JSON_ParserState *state)
} }
} }
const char *ptr = state->cursor; const char *ptr = "EOF";
size_t len = ptr ? strnlen(ptr, PARSE_ERROR_FRAGMENT_LEN) : 0; if (state->cursor && state->cursor < state->end) {
ptr = state->cursor;
size_t len = 0;
while (len < PARSE_ERROR_FRAGMENT_LEN) {
char ch = ptr[len];
if (!ch || ch == '\n' || ch == ' ' || ch == '\t' || ch == '\r') {
break;
}
len++;
}
if (len == PARSE_ERROR_FRAGMENT_LEN) { if (len) {
MEMCPY(buffer, ptr, char, PARSE_ERROR_FRAGMENT_LEN); buffer[0] = '\'';
MEMCPY(buffer + 1, ptr, char, len);
while (buffer[len - 1] >= 0x80 && buffer[len - 1] < 0xC0) { // Is continuation byte while (buffer[len] >= 0x80 && buffer[len] < 0xC0) { // Is continuation byte
len--; len--;
} }
if (buffer[len - 1] >= 0xC0) { // multibyte character start if (buffer[len] >= 0xC0) { // multibyte character start
len--; len--;
} }
buffer[len] = '\0'; buffer[len + 1] = '\'';
buffer[len + 2] = '\0';
ptr = (const char *)buffer; ptr = (const char *)buffer;
} }
}
VALUE msg = rb_sprintf(format, ptr); VALUE msg = rb_sprintf(format, ptr);
VALUE message = rb_enc_sprintf(enc_utf8, "%s at line %ld column %ld", RSTRING_PTR(msg), line, column); VALUE message = rb_enc_sprintf(enc_utf8, "%s at line %ld column %ld", RSTRING_PTR(msg), line, column);
@ -473,16 +485,16 @@ static uint32_t unescape_unicode(JSON_ParserState *state, const unsigned char *p
signed char b; signed char b;
uint32_t result = 0; uint32_t result = 0;
b = digit_values[p[0]]; b = digit_values[p[0]];
if (b < 0) raise_parse_error_at("incomplete unicode character escape sequence at '%s'", state, (char *)p - 2); if (b < 0) raise_parse_error_at("incomplete unicode character escape sequence at %s", state, (char *)p - 2);
result = (result << 4) | (unsigned char)b; result = (result << 4) | (unsigned char)b;
b = digit_values[p[1]]; b = digit_values[p[1]];
if (b < 0) raise_parse_error_at("incomplete unicode character escape sequence at '%s'", state, (char *)p - 2); if (b < 0) raise_parse_error_at("incomplete unicode character escape sequence at %s", state, (char *)p - 2);
result = (result << 4) | (unsigned char)b; result = (result << 4) | (unsigned char)b;
b = digit_values[p[2]]; b = digit_values[p[2]];
if (b < 0) raise_parse_error_at("incomplete unicode character escape sequence at '%s'", state, (char *)p - 2); if (b < 0) raise_parse_error_at("incomplete unicode character escape sequence at %s", state, (char *)p - 2);
result = (result << 4) | (unsigned char)b; result = (result << 4) | (unsigned char)b;
b = digit_values[p[3]]; b = digit_values[p[3]];
if (b < 0) raise_parse_error_at("incomplete unicode character escape sequence at '%s'", state, (char *)p - 2); if (b < 0) raise_parse_error_at("incomplete unicode character escape sequence at %s", state, (char *)p - 2);
result = (result << 4) | (unsigned char)b; result = (result << 4) | (unsigned char)b;
return result; return result;
} }
@ -532,11 +544,11 @@ json_eat_comments(JSON_ParserState *state)
break; break;
} }
default: default:
raise_parse_error("unexpected token '%s'", state); raise_parse_error("unexpected token %s", state);
break; break;
} }
} else { } else {
raise_parse_error("unexpected token '%s'", state); raise_parse_error("unexpected token %s", state);
} }
} }
@ -655,7 +667,7 @@ static VALUE json_string_unescape(JSON_ParserState *state, const char *string, c
break; break;
case 'u': case 'u':
if (pe > stringEnd - 5) { if (pe > stringEnd - 5) {
raise_parse_error_at("incomplete unicode character escape sequence at '%s'", state, p); raise_parse_error_at("incomplete unicode character escape sequence at %s", state, p);
} else { } else {
uint32_t ch = unescape_unicode(state, (unsigned char *) ++pe); uint32_t ch = unescape_unicode(state, (unsigned char *) ++pe);
pe += 3; pe += 3;
@ -672,7 +684,7 @@ static VALUE json_string_unescape(JSON_ParserState *state, const char *string, c
if ((ch & 0xFC00) == 0xD800) { if ((ch & 0xFC00) == 0xD800) {
pe++; pe++;
if (pe > stringEnd - 6) { if (pe > stringEnd - 6) {
raise_parse_error_at("incomplete surrogate pair at '%s'", state, p); raise_parse_error_at("incomplete surrogate pair at %s", state, p);
} }
if (pe[0] == '\\' && pe[1] == 'u') { if (pe[0] == '\\' && pe[1] == 'u') {
uint32_t sur = unescape_unicode(state, (unsigned char *) pe + 2); uint32_t sur = unescape_unicode(state, (unsigned char *) pe + 2);
@ -894,7 +906,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
return json_push_value(state, config, Qnil); return json_push_value(state, config, Qnil);
} }
raise_parse_error("unexpected token '%s'", state); raise_parse_error("unexpected token %s", state);
break; break;
case 't': case 't':
if ((state->end - state->cursor >= 4) && (memcmp(state->cursor, "true", 4) == 0)) { if ((state->end - state->cursor >= 4) && (memcmp(state->cursor, "true", 4) == 0)) {
@ -902,7 +914,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
return json_push_value(state, config, Qtrue); return json_push_value(state, config, Qtrue);
} }
raise_parse_error("unexpected token '%s'", state); raise_parse_error("unexpected token %s", state);
break; break;
case 'f': case 'f':
// Note: memcmp with a small power of two compile to an integer comparison // Note: memcmp with a small power of two compile to an integer comparison
@ -911,7 +923,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
return json_push_value(state, config, Qfalse); return json_push_value(state, config, Qfalse);
} }
raise_parse_error("unexpected token '%s'", state); raise_parse_error("unexpected token %s", state);
break; break;
case 'N': case 'N':
// Note: memcmp with a small power of two compile to an integer comparison // Note: memcmp with a small power of two compile to an integer comparison
@ -920,7 +932,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
return json_push_value(state, config, CNaN); return json_push_value(state, config, CNaN);
} }
raise_parse_error("unexpected token '%s'", state); raise_parse_error("unexpected token %s", state);
break; break;
case 'I': case 'I':
if (config->allow_nan && (state->end - state->cursor >= 8) && (memcmp(state->cursor, "Infinity", 8) == 0)) { if (config->allow_nan && (state->end - state->cursor >= 8) && (memcmp(state->cursor, "Infinity", 8) == 0)) {
@ -928,7 +940,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
return json_push_value(state, config, CInfinity); return json_push_value(state, config, CInfinity);
} }
raise_parse_error("unexpected token '%s'", state); raise_parse_error("unexpected token %s", state);
break; break;
case '-': case '-':
// Note: memcmp with a small power of two compile to an integer comparison // Note: memcmp with a small power of two compile to an integer comparison
@ -937,7 +949,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
state->cursor += 9; state->cursor += 9;
return json_push_value(state, config, CMinusInfinity); return json_push_value(state, config, CMinusInfinity);
} else { } else {
raise_parse_error("unexpected token '%s'", state); raise_parse_error("unexpected token %s", state);
} }
} }
// Fallthrough // Fallthrough
@ -1062,7 +1074,7 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
} }
if (*state->cursor != '"') { if (*state->cursor != '"') {
raise_parse_error("expected object key, got '%s'", state); raise_parse_error("expected object key, got %s", state);
} }
json_parse_string(state, config, true); json_parse_string(state, config, true);
@ -1097,13 +1109,13 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
} }
if (*state->cursor != '"') { if (*state->cursor != '"') {
raise_parse_error("expected object key, got: '%s'", state); raise_parse_error("expected object key, got: %s", state);
} }
json_parse_string(state, config, true); json_parse_string(state, config, true);
json_eat_whitespace(state); json_eat_whitespace(state);
if ((state->cursor >= state->end) || (*state->cursor != ':')) { if ((state->cursor >= state->end) || (*state->cursor != ':')) {
raise_parse_error("expected ':' after object key, got: '%s'", state); raise_parse_error("expected ':' after object key, got: %s", state);
} }
state->cursor++; state->cursor++;
@ -1113,24 +1125,24 @@ static VALUE json_parse_any(JSON_ParserState *state, JSON_ParserConfig *config)
} }
} }
raise_parse_error("expected ',' or '}' after object value, got: '%s'", state); raise_parse_error("expected ',' or '}' after object value, got: %s", state);
} }
break; break;
} }
default: default:
raise_parse_error("unexpected character: '%s'", state); raise_parse_error("unexpected character: %s", state);
break; break;
} }
raise_parse_error("unreacheable: '%s'", state); raise_parse_error("unreacheable: %s", state);
} }
static void json_ensure_eof(JSON_ParserState *state) static void json_ensure_eof(JSON_ParserState *state)
{ {
json_eat_whitespace(state); json_eat_whitespace(state);
if (state->cursor != state->end) { if (state->cursor != state->end) {
raise_parse_error("unexpected token at end of stream '%s'", state); raise_parse_error("unexpected token at end of stream %s", state);
} }
} }

View File

@ -14,20 +14,35 @@ class JSONExtParserTest < Test::Unit::TestCase
end end
def test_error_messages def test_error_messages
ex = assert_raise(ParserError) { parse('Infinity') } ex = assert_raise(ParserError) { parse('Infinity something') }
unless RUBY_PLATFORM =~ /java/ unless RUBY_PLATFORM =~ /java/
assert_equal "unexpected token 'Infinity' at line 1 column 1", ex.message assert_equal "unexpected token 'Infinity' at line 1 column 1", ex.message
end end
ex = assert_raise(ParserError) { parse('-Infinity') } ex = assert_raise(ParserError) { parse('foo bar') }
unless RUBY_PLATFORM =~ /java/
assert_equal "unexpected token 'foo' at line 1 column 1", ex.message
end
ex = assert_raise(ParserError) { parse('-Infinity something') }
unless RUBY_PLATFORM =~ /java/ unless RUBY_PLATFORM =~ /java/
assert_equal "unexpected token '-Infinity' at line 1 column 1", ex.message assert_equal "unexpected token '-Infinity' at line 1 column 1", ex.message
end end
ex = assert_raise(ParserError) { parse('NaN') } ex = assert_raise(ParserError) { parse('NaN something') }
unless RUBY_PLATFORM =~ /java/ unless RUBY_PLATFORM =~ /java/
assert_equal "unexpected token 'NaN' at line 1 column 1", ex.message assert_equal "unexpected token 'NaN' at line 1 column 1", ex.message
end end
ex = assert_raise(ParserError) { parse('   ') }
unless RUBY_PLATFORM =~ /java/
assert_equal "unexpected end of input at line 1 column 4", ex.message
end
ex = assert_raise(ParserError) { parse('{   ') }
unless RUBY_PLATFORM =~ /java/
assert_equal "expected object key, got EOF at line 1 column 5", ex.message
end
end end
if GC.respond_to?(:stress=) if GC.respond_to?(:stress=)

View File

@ -646,7 +646,7 @@ class JSONParserTest < Test::Unit::TestCase
JSON.parse('{"input":{"firstName":"Bob","lastName":"Mob","email":"bob@example.com"}') JSON.parse('{"input":{"firstName":"Bob","lastName":"Mob","email":"bob@example.com"}')
end end
if RUBY_ENGINE == "ruby" if RUBY_ENGINE == "ruby"
assert_equal %(expected ',' or '}' after object value, got: '' at line 1 column 72), error.message assert_equal %(expected ',' or '}' after object value, got: EOF at line 1 column 72), error.message
end end
end end