diff --git a/lib/string_decoder.js b/lib/string_decoder.js index cb9fbe409fa..672ba185cc2 100644 --- a/lib/string_decoder.js +++ b/lib/string_decoder.js @@ -81,7 +81,7 @@ StringDecoder.prototype.fillLast = function(buf) { }; // Checks the type of a UTF-8 byte, whether it's ASCII, a leading byte, or a -// continuation byte. +// continuation byte. If an invalid byte is detected, -2 is returned. function utf8CheckByte(byte) { if (byte <= 0x7F) return 0; @@ -91,7 +91,7 @@ function utf8CheckByte(byte) { return 3; else if (byte >> 3 === 0x1E) return 4; - return -1; + return (byte >> 6 === 0x02 ? -1 : -2); } // Checks at most 3 bytes at the end of a Buffer in order to detect an @@ -107,7 +107,7 @@ function utf8CheckIncomplete(self, buf, i) { self.lastNeed = nb - 1; return nb; } - if (--j < i) + if (--j < i || nb === -2) return 0; nb = utf8CheckByte(buf[j]); if (nb >= 0) { @@ -115,7 +115,7 @@ function utf8CheckIncomplete(self, buf, i) { self.lastNeed = nb - 2; return nb; } - if (--j < i) + if (--j < i || nb === -2) return 0; nb = utf8CheckByte(buf[j]); if (nb >= 0) { @@ -133,7 +133,7 @@ function utf8CheckIncomplete(self, buf, i) { // Validates as many continuation bytes for a multi-byte UTF-8 character as // needed or are available. If we see a non-continuation byte where we expect // one, we "replace" the validated continuation bytes we've seen so far with -// UTF-8 replacement characters ('\ufffd'), to match v8's UTF-8 decoding +// a single UTF-8 replacement character ('\ufffd'), to match v8's UTF-8 decoding // behavior. The continuation byte check is included three times in the case // where all of the continuation bytes for a character exist in the same buffer. // It is also done this way as a slight performance increase instead of using a @@ -141,17 +141,17 @@ function utf8CheckIncomplete(self, buf, i) { function utf8CheckExtraBytes(self, buf, p) { if ((buf[0] & 0xC0) !== 0x80) { self.lastNeed = 0; - return '\ufffd'.repeat(p); + return '\ufffd'; } if (self.lastNeed > 1 && buf.length > 1) { if ((buf[1] & 0xC0) !== 0x80) { self.lastNeed = 1; - return '\ufffd'.repeat(p + 1); + return '\ufffd'; } if (self.lastNeed > 2 && buf.length > 2) { if ((buf[2] & 0xC0) !== 0x80) { self.lastNeed = 2; - return '\ufffd'.repeat(p + 2); + return '\ufffd'; } } } @@ -184,12 +184,12 @@ function utf8Text(buf, i) { return buf.toString('utf8', i, end); } -// For UTF-8, a replacement character for each buffered byte of a (partial) -// character needs to be added to the output. +// For UTF-8, a replacement character is added when ending on a partial +// character. function utf8End(buf) { const r = (buf && buf.length ? this.write(buf) : ''); if (this.lastNeed) - return r + '\ufffd'.repeat(this.lastTotal - this.lastNeed); + return r + '\ufffd'; return r; } diff --git a/test/parallel/test-string-decoder-end.js b/test/parallel/test-string-decoder-end.js index 6e9eea3193e..921a7a138ad 100644 --- a/test/parallel/test-string-decoder-end.js +++ b/test/parallel/test-string-decoder-end.js @@ -18,8 +18,6 @@ for (let i = 1; i <= 16; i++) { encodings.forEach(testEncoding); -console.log('ok'); - function testEncoding(encoding) { bufs.forEach((buf) => { testBuf(encoding, buf); @@ -27,8 +25,6 @@ function testEncoding(encoding) { } function testBuf(encoding, buf) { - console.error('# %s', encoding, buf); - // write one byte at a time. let s = new SD(encoding); let res1 = ''; @@ -46,9 +42,6 @@ function testBuf(encoding, buf) { // .toString() on the buffer const res3 = buf.toString(encoding); - console.log('expect=%j', res3); - console.log('res1=%j', res1); - console.log('res2=%j', res2); assert.strictEqual(res1, res3, 'one byte at a time should match toString'); assert.strictEqual(res2, res3, 'all bytes at once should match toString'); } diff --git a/test/parallel/test-string-decoder.js b/test/parallel/test-string-decoder.js index a8fdb999a09..4d22d670f63 100644 --- a/test/parallel/test-string-decoder.js +++ b/test/parallel/test-string-decoder.js @@ -8,8 +8,6 @@ const StringDecoder = require('string_decoder').StringDecoder; let decoder = new StringDecoder(); assert.strictEqual(decoder.encoding, 'utf8'); -process.stdout.write('scanning '); - // UTF-8 test('utf-8', Buffer.from('$', 'utf-8'), '$'); test('utf-8', Buffer.from('¢', 'utf-8'), '¢'); @@ -42,15 +40,15 @@ test('utf-8', Buffer.from('C9B5A941', 'hex'), '\u0275\ufffdA'); test('utf-8', Buffer.from('E2', 'hex'), '\ufffd'); test('utf-8', Buffer.from('E241', 'hex'), '\ufffdA'); test('utf-8', Buffer.from('CCCCB8', 'hex'), '\ufffd\u0338'); -test('utf-8', Buffer.from('F0B841', 'hex'), '\ufffd\ufffdA'); +test('utf-8', Buffer.from('F0B841', 'hex'), '\ufffdA'); test('utf-8', Buffer.from('F1CCB8', 'hex'), '\ufffd\u0338'); test('utf-8', Buffer.from('F0FB00', 'hex'), '\ufffd\ufffd\0'); test('utf-8', Buffer.from('CCE2B8B8', 'hex'), '\ufffd\u2e38'); -test('utf-8', Buffer.from('E2B8CCB8', 'hex'), '\ufffd\ufffd\u0338'); +test('utf-8', Buffer.from('E2B8CCB8', 'hex'), '\ufffd\u0338'); test('utf-8', Buffer.from('E2FBCC01', 'hex'), '\ufffd\ufffd\ufffd\u0001'); -test('utf-8', Buffer.from('EDA0B5EDB08D', 'hex'), // CESU-8 of U+1D40D - '\ufffd\ufffd\ufffd\ufffd\ufffd\ufffd'); test('utf-8', Buffer.from('CCB8CDB9', 'hex'), '\u0338\u0379'); +// CESU-8 of U+1D40D +test('utf-8', Buffer.from('EDA0B5EDB08D', 'hex'), '\ufffd\ufffd'); // UCS-2 test('ucs2', Buffer.from('ababc', 'ucs2'), 'ababc'); @@ -58,8 +56,6 @@ test('ucs2', Buffer.from('ababc', 'ucs2'), 'ababc'); // UTF-16LE test('utf16le', Buffer.from('3DD84DDC', 'hex'), '\ud83d\udc4d'); // thumbs up -console.log(' crayon!'); - // Additional UTF-8 tests decoder = new StringDecoder('utf8'); assert.strictEqual(decoder.write(Buffer.from('E1', 'hex')), ''); @@ -67,7 +63,7 @@ assert.strictEqual(decoder.end(), '\ufffd'); decoder = new StringDecoder('utf8'); assert.strictEqual(decoder.write(Buffer.from('E18B', 'hex')), ''); -assert.strictEqual(decoder.end(), '\ufffd\ufffd'); +assert.strictEqual(decoder.end(), '\ufffd'); decoder = new StringDecoder('utf8'); assert.strictEqual(decoder.write(Buffer.from('\ufffd')), '\ufffd'); @@ -131,7 +127,6 @@ function test(encoding, input, expected, singleSequence) { output += decoder.write(input.slice(write[0], write[1])); }); output += decoder.end(); - process.stdout.write('.'); if (output !== expected) { const message = 'Expected "' + unicodeEscape(expected) + '", ' +