From 0d62037fc0626855c36359e4a8a02936b592f9d9 Mon Sep 17 00:00:00 2001 From: Jean Boussier Date: Tue, 25 Feb 2025 17:12:26 +0100 Subject: [PATCH] [ruby/json] Ensure parser error snippets are valid UTF-8 Fix: https://github.com/ruby/json/issues/755 Error messages now include a snippet of the document that doesn't parse to help locate the issue; however, the way it was done wasn't UTF-8 aware, and it could result in exception messages with truncated characters. It would be nice to go a bit further and actually support codepoints, but it's a lot of complexity to do it in C. Perhaps we can revisit this if we move that logic to Ruby, given it's not a performance-sensitive codepath. https://github.com/ruby/json/commit/e144793b72 --- ext/json/parser/parser.c | 15 ++++++++++++--- test/json/json_parser_test.rb | 16 ++++++++++++++++ 2 files changed, 28 insertions(+), 3 deletions(-) diff --git a/ext/json/parser/parser.c b/ext/json/parser/parser.c index c21a5fda5f..776eb916f0 100644 --- a/ext/json/parser/parser.c +++ b/ext/json/parser/parser.c @@ -454,15 +454,24 @@ RBIMPL_ATTR_NORETURN() #endif static void raise_parse_error(const char *format, const char *start) { - char buffer[PARSE_ERROR_FRAGMENT_LEN + 1]; + unsigned char buffer[PARSE_ERROR_FRAGMENT_LEN + 1]; size_t len = start ? 
strnlen(start, PARSE_ERROR_FRAGMENT_LEN) : 0; const char *ptr = start; if (len == PARSE_ERROR_FRAGMENT_LEN) { MEMCPY(buffer, start, char, PARSE_ERROR_FRAGMENT_LEN); - buffer[PARSE_ERROR_FRAGMENT_LEN] = '\0'; - ptr = buffer; + + while (buffer[len - 1] >= 0x80 && buffer[len - 1] < 0xC0) { // Is continuation byte + len--; + } + + if (buffer[len - 1] >= 0xC0) { // multibyte character start + len--; + } + + buffer[len] = '\0'; + ptr = (const char *)buffer; } rb_enc_raise(enc_utf8, rb_path2class("JSON::ParserError"), format, ptr); diff --git a/test/json/json_parser_test.rb b/test/json/json_parser_test.rb index d1f084bb63..ae0f285d32 100644 --- a/test/json/json_parser_test.rb +++ b/test/json/json_parser_test.rb @@ -645,6 +645,22 @@ class JSONParserTest < Test::Unit::TestCase end end + def test_parse_error_snippet + omit "C ext only test" unless RUBY_ENGINE == "ruby" + + error = assert_raise(JSON::ParserError) { JSON.parse("あああああああああああああああああああああああ") } + assert_equal "unexpected character: 'ああああああああああ'", error.message + + error = assert_raise(JSON::ParserError) { JSON.parse("aあああああああああああああああああああああああ") } + assert_equal "unexpected character: 'aああああああああああ'", error.message + + error = assert_raise(JSON::ParserError) { JSON.parse("abあああああああああああああああああああああああ") } + assert_equal "unexpected character: 'abあああああああああ'", error.message + + error = assert_raise(JSON::ParserError) { JSON.parse("abcあああああああああああああああああああああああ") } + assert_equal "unexpected character: 'abcあああああああああ'", error.message + end + def test_parse_leading_slash # ref: https://github.com/ruby/ruby/pull/12598 assert_raise(JSON::ParserError) do