string_decoder: fix bad utf8 character handling
This commit fixes an issue when extra utf8 continuation bytes appear at the end of a chunk of data, causing miscalculations to be made when checking how many bytes are needed to decode a complete character. Fixes: https://github.com/nodejs/node/issues/7308 PR-URL: https://github.com/nodejs/node/pull/7310 Reviewed-By: Colin Ihrig <cjihrig@gmail.com> Reviewed-By: James M Snell <jasnell@gmail.com> Reviewed-By: Fedor Indutny <fedor.indutny@gmail.com>
This commit is contained in:
parent
5267f29b34
commit
79ef3b6b5d
@ -25,8 +25,10 @@ function StringDecoder(encoding) {
|
|||||||
case 'utf16le':
|
case 'utf16le':
|
||||||
this.text = utf16Text;
|
this.text = utf16Text;
|
||||||
this.end = utf16End;
|
this.end = utf16End;
|
||||||
// fall through
|
nb = 4;
|
||||||
|
break;
|
||||||
case 'utf8':
|
case 'utf8':
|
||||||
|
this.fillLast = utf8FillLast;
|
||||||
nb = 4;
|
nb = 4;
|
||||||
break;
|
break;
|
||||||
case 'base64':
|
case 'base64':
|
||||||
@ -68,7 +70,7 @@ StringDecoder.prototype.end = utf8End;
|
|||||||
// Returns only complete characters in a Buffer
|
// Returns only complete characters in a Buffer
|
||||||
StringDecoder.prototype.text = utf8Text;
|
StringDecoder.prototype.text = utf8Text;
|
||||||
|
|
||||||
// Attempts to complete a partial character using bytes from a Buffer
|
// Attempts to complete a partial non-UTF-8 character using bytes from a Buffer
|
||||||
StringDecoder.prototype.fillLast = function(buf) {
|
StringDecoder.prototype.fillLast = function(buf) {
|
||||||
if (this.lastNeed <= buf.length) {
|
if (this.lastNeed <= buf.length) {
|
||||||
buf.copy(this.lastChar, this.lastTotal - this.lastNeed, 0, this.lastNeed);
|
buf.copy(this.lastChar, this.lastTotal - this.lastNeed, 0, this.lastNeed);
|
||||||
@ -92,38 +94,83 @@ function utf8CheckByte(byte) {
|
|||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Checks at most the last 3 bytes of a Buffer for an incomplete UTF-8
|
// Checks at most 3 bytes at the end of a Buffer in order to detect an
|
||||||
// character, returning the total number of bytes needed to complete the partial
|
// incomplete multi-byte UTF-8 character. The total number of bytes (2, 3, or 4)
|
||||||
// character (if applicable).
|
// needed to complete the UTF-8 character (if applicable) are returned.
|
||||||
function utf8CheckIncomplete(self, buf, i) {
|
function utf8CheckIncomplete(self, buf, i) {
|
||||||
var j = buf.length - 1;
|
var j = buf.length - 1;
|
||||||
if (j < i)
|
if (j < i)
|
||||||
return 0;
|
return 0;
|
||||||
var nb = utf8CheckByte(buf[j--]);
|
var nb = utf8CheckByte(buf[j]);
|
||||||
if (nb >= 0) {
|
if (nb >= 0) {
|
||||||
if (nb > 0)
|
if (nb > 0)
|
||||||
self.lastNeed = nb + 1 - (buf.length - j);
|
self.lastNeed = nb - 1;
|
||||||
return nb;
|
return nb;
|
||||||
}
|
}
|
||||||
if (j < i)
|
if (--j < i)
|
||||||
return 0;
|
return 0;
|
||||||
nb = utf8CheckByte(buf[j--]);
|
nb = utf8CheckByte(buf[j]);
|
||||||
if (nb >= 0) {
|
if (nb >= 0) {
|
||||||
if (nb > 0)
|
if (nb > 0)
|
||||||
self.lastNeed = nb + 1 - (buf.length - j);
|
self.lastNeed = nb - 2;
|
||||||
return nb;
|
return nb;
|
||||||
}
|
}
|
||||||
if (j < i)
|
if (--j < i)
|
||||||
return 0;
|
return 0;
|
||||||
nb = utf8CheckByte(buf[j--]);
|
nb = utf8CheckByte(buf[j]);
|
||||||
if (nb >= 0) {
|
if (nb >= 0) {
|
||||||
if (nb > 0)
|
if (nb > 0) {
|
||||||
self.lastNeed = nb + 1 - (buf.length - j);
|
if (nb === 2)
|
||||||
|
nb = 0;
|
||||||
|
else
|
||||||
|
self.lastNeed = nb - 3;
|
||||||
|
}
|
||||||
return nb;
|
return nb;
|
||||||
}
|
}
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Validates as many continuation bytes for a multi-byte UTF-8 character as
|
||||||
|
// needed or are available. If we see a non-continuation byte where we expect
|
||||||
|
// one, we "replace" the validated continuation bytes we've seen so far with
|
||||||
|
// UTF-8 replacement characters ('\ufffd'), to match v8's UTF-8 decoding
|
||||||
|
// behavior. The continuation byte check is included three times in the case
|
||||||
|
// where all of the continuation bytes for a character exist in the same buffer.
|
||||||
|
// It is also done this way as a slight performance increase instead of using a
|
||||||
|
// loop.
|
||||||
|
function utf8CheckExtraBytes(self, buf, p) {
|
||||||
|
if ((buf[0] & 0xC0) !== 0x80) {
|
||||||
|
self.lastNeed = 0;
|
||||||
|
return '\ufffd'.repeat(p);
|
||||||
|
}
|
||||||
|
if (self.lastNeed > 1 && buf.length > 1) {
|
||||||
|
if ((buf[1] & 0xC0) !== 0x80) {
|
||||||
|
self.lastNeed = 1;
|
||||||
|
return '\ufffd'.repeat(p + 1);
|
||||||
|
}
|
||||||
|
if (self.lastNeed > 2 && buf.length > 2) {
|
||||||
|
if ((buf[2] & 0xC0) !== 0x80) {
|
||||||
|
self.lastNeed = 2;
|
||||||
|
return '\ufffd'.repeat(p + 2);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Attempts to complete a multi-byte UTF-8 character using bytes from a Buffer.
|
||||||
|
function utf8FillLast(buf) {
|
||||||
|
const p = this.lastTotal - this.lastNeed;
|
||||||
|
var r = utf8CheckExtraBytes(this, buf, p);
|
||||||
|
if (r !== undefined)
|
||||||
|
return r;
|
||||||
|
if (this.lastNeed <= buf.length) {
|
||||||
|
buf.copy(this.lastChar, p, 0, this.lastNeed);
|
||||||
|
return this.lastChar.toString(this.encoding, 0, this.lastTotal);
|
||||||
|
}
|
||||||
|
buf.copy(this.lastChar, p, 0, buf.length);
|
||||||
|
this.lastNeed -= buf.length;
|
||||||
|
}
|
||||||
|
|
||||||
// Returns all complete UTF-8 characters in a Buffer. If the Buffer ended on a
|
// Returns all complete UTF-8 characters in a Buffer. If the Buffer ended on a
|
||||||
// partial character, the character's bytes are buffered until the required
|
// partial character, the character's bytes are buffered until the required
|
||||||
// number of bytes are available.
|
// number of bytes are available.
|
||||||
|
@ -55,7 +55,7 @@ assert.strictEqual(decoder.write(Buffer.from('\ufffd\ufffd\ufffd')),
|
|||||||
assert.strictEqual(decoder.end(), '');
|
assert.strictEqual(decoder.end(), '');
|
||||||
|
|
||||||
decoder = new StringDecoder('utf8');
|
decoder = new StringDecoder('utf8');
|
||||||
assert.strictEqual(decoder.write(Buffer.from('efbfbde2', 'hex')), '\ufffd');
|
assert.strictEqual(decoder.write(Buffer.from('EFBFBDE2', 'hex')), '\ufffd');
|
||||||
assert.strictEqual(decoder.end(), '\ufffd');
|
assert.strictEqual(decoder.end(), '\ufffd');
|
||||||
|
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user