string_decoder: fix number of replacement chars

Fixes: https://github.com/nodejs/node/issues/22626 PR-URL: https://github.com/nodejs/node/pull/22709 Reviewed-By: Ruben Bridgewater <ruben@bridgewater.de> Reviewed-By: James M Snell <jasnell@gmail.com> Reviewed-By: Rich Trott <rtrott@gmail.com> Reviewed-By: Matteo Collina <matteo.collina@gmail.com> Reviewed-By: Colin Ihrig <cjihrig@gmail.com>
2018-09-05 14:24:45 +02:00 · 2018-09-05 14:24:45 +02:00 · 06f6ac179c
commit 06f6ac179c
parent ab6ddc0634
2 changed files with 18 additions and 6 deletions
--- a/src/string_decoder.cc
+++ b/src/string_decoder.cc
@ -71,16 +71,17 @@ MaybeLocal<String> StringDecoder::DecodeData(Isolate* isolate,
               kIncompleteCharactersEnd);
      if (Encoding() == UTF8) {
        // For UTF-8, we need special treatment to align with the V8 decoder:
-        // If an incomplete character is found at a chunk boundary, we turn
-        // that character into a single invalid one.
+        // If an incomplete character is found at a chunk boundary, we use
+        // its remainder and pass it to V8 as-is.
        for (size_t i = 0; i < nread && i < MissingBytes(); ++i) {
          if ((data[i] & 0xC0) != 0x80) {
            // This byte is not a continuation byte even though it should have
-            // been one.
-            // Act as if there was a 1-byte incomplete character, which does
-            // not make sense but works here because we know it's invalid.
+            // been one. We stop decoding of the incomplete character at this
+            // point (but still use the rest of the incomplete bytes from this
+            // chunk) and assume that the new, unexpected byte starts a new one.
            state_[kMissingBytes] = 0;
-            state_[kBufferedBytes] = 1;
+            memcpy(IncompleteCharacterBuffer() + BufferedBytes(), data, i);
+            state_[kBufferedBytes] += i;
            data += i;
            nread -= i;
            break;
--- a/test/parallel/test-string-decoder.js
+++ b/test/parallel/test-string-decoder.js
@ -162,6 +162,17 @@ assert.strictEqual(decoder.write(Buffer.alloc(20)), '\0'.repeat(10));
 assert.strictEqual(decoder.write(Buffer.alloc(48)), '\0'.repeat(24));
 assert.strictEqual(decoder.end(), '');

+// Regression tests for https://github.com/nodejs/node/issues/22626
+// (not enough replacement chars when having seen more than one byte of an
+// incomplete multibyte character).
+decoder = new StringDecoder('utf8');
+assert.strictEqual(decoder.write(Buffer.from('f69b', 'hex')), '');
+assert.strictEqual(decoder.write(Buffer.from('d1', 'hex')), '\ufffd\ufffd');
+assert.strictEqual(decoder.end(), '\ufffd');
+assert.strictEqual(decoder.write(Buffer.from('f4', 'hex')), '');
+assert.strictEqual(decoder.write(Buffer.from('bde5', 'hex')), '\ufffd\ufffd');
+assert.strictEqual(decoder.end(), '\ufffd');
+
 common.expectsError(
  () => new StringDecoder(1),
  {