string_decoder: optimize write()
By limiting property gets and sets to the places where they are absolutely necessary, we achieve better performance, especially with small utf8 inputs and with base64 inputs of any size.

PR-URL: https://github.com/iojs/io.js/pull/1209
Reviewed-By: Rod Vagg <rod@vagg.org>
Reviewed-By: Nicu Micleușanu <micnic90@gmail.com>
Reviewed-By: Chris Dickinson <christopher.s.dickinson@gmail.com>
This commit is contained in:
parent
3d46fefe0c
commit
8a945814dd
60
benchmark/misc/string-decoder.js
Normal file
60
benchmark/misc/string-decoder.js
Normal file
@ -0,0 +1,60 @@
|
||||
var common = require('../common.js');
|
||||
var StringDecoder = require('string_decoder').StringDecoder;
|
||||
|
||||
// Benchmark matrix handed to the harness together with `main`:
//   encoding - decoder scenario; the 'base64-*' variants feed base64 text
//              derived from the matching alphabet to a 'base64' decoder
//   inlen    - number of alphabet characters used to build the input
//   chunk    - chunk length used when splitting the input into Buffers
//   n        - how many times the full chunk list is written per run
var options = {
  encoding: ['ascii', 'utf8', 'base64-utf8', 'base64-ascii'],
  inlen: [32, 128, 1024],
  chunk: [16, 64, 256, 1024],
  n: [25e4]
};
var bench = common.createBenchmark(main, options);

// Source alphabets that `main` cycles through to build the input text:
// one containing non-ASCII characters, one that is plain ASCII.
var UTF_ALPHA = 'Blåbærsyltetøy';
var ASC_ALPHA = 'Blueberry jam';
|
||||
|
||||
/**
 * Benchmark body: prepares the input chunks for one configuration and times
 * repeated `StringDecoder#write()` calls over them.
 *
 * @param {Object} conf - Benchmark configuration. Reads `encoding`
 *   ('ascii', 'utf8', 'base64-ascii' or 'base64-utf8'), `inlen`, `chunk`
 *   and `n`; the three numeric fields are coerced to integers with `| 0`.
 * @throws {Error} 'Bad encoding' when `conf.encoding` is not one of the four
 *   supported values.
 */
function main(conf) {
  var encoding = conf.encoding;
  var inLen = conf.inlen | 0;
  var chunkLen = conf.chunk | 0;
  var n = conf.n | 0;

  // Pick the source alphabet and note whether the decoder under test is the
  // base64 one.
  var isBase64 = false;
  var alpha;
  switch (encoding) {
    case 'base64-ascii':
      isBase64 = true;
      // fall through
    case 'ascii':
      alpha = ASC_ALPHA;
      break;
    case 'base64-utf8':
      isBase64 = true;
      // fall through
    case 'utf8':
      alpha = UTF_ALPHA;
      break;
    default:
      throw new Error('Bad encoding');
  }

  var sd = new StringDecoder(isBase64 ? 'base64' : encoding);

  var chunks = [];
  var text = '';
  var pos;

  if (isBase64) {
    // Build the whole input first, base64-encode it, then cut the encoded
    // text into pieces of at most chunkLen characters.
    for (pos = 0; pos < inLen; ++pos)
      text += alpha[pos % alpha.length];

    var encoded = new Buffer(text, 'utf8').toString('base64');
    for (var off = 0; off < encoded.length; off += chunkLen)
      chunks.push(new Buffer(encoded.substring(off, off + chunkLen), 'utf8'));
  } else {
    // Flush a chunk every chunkLen characters while cycling through the
    // alphabet; whatever is left over becomes the final chunk.
    for (pos = 0; pos < inLen; ++pos) {
      if (pos !== 0 && (pos % chunkLen) === 0) {
        chunks.push(new Buffer(text, encoding));
        text = '';
      }
      text += alpha[pos % alpha.length];
    }
    if (text.length !== 0)
      chunks.push(new Buffer(text, encoding));
  }

  var nChunks = chunks.length;

  // Timed section: only the write() calls are measured.
  bench.start();
  for (var round = 0; round < n; ++round) {
    for (var k = 0; k < nChunks; ++k)
      sd.write(chunks[k]);
  }
  bench.end(n);
}
|
@ -1,7 +1,9 @@
|
||||
'use strict';
|
||||
|
||||
const isEncoding = Buffer.isEncoding;
|
||||
|
||||
/**
 * Validates an encoding name.
 *
 * Falsy values (undefined, null, '') are accepted without a check. Any other
 * value must be recognised by `isEncoding`, the module-level alias of
 * `Buffer.isEncoding`.
 *
 * @param {string} [encoding] - Encoding name to validate.
 * @throws {Error} 'Unknown encoding: <name>' when a truthy encoding is not
 *   recognised.
 */
function assertEncoding(encoding) {
  if (encoding && !isEncoding(encoding)) {
    throw new Error('Unknown encoding: ' + encoding);
  }
}
|
||||
@ -59,65 +61,83 @@ const StringDecoder = exports.StringDecoder = function(encoding) {
|
||||
// replacement character. See https://codereview.chromium.org/121173009/ .
|
||||
StringDecoder.prototype.write = function(buffer) {
|
||||
var charStr = '';
|
||||
var buflen = buffer.length;
|
||||
var charBuffer = this.charBuffer;
|
||||
var charLength = this.charLength;
|
||||
var charReceived = this.charReceived;
|
||||
var surrogateSize = this.surrogateSize;
|
||||
var encoding = this.encoding;
|
||||
// if our last write ended with an incomplete multibyte character
|
||||
while (this.charLength) {
|
||||
while (charLength) {
|
||||
// determine how many remaining bytes this buffer has to offer for this char
|
||||
var available = (buffer.length >= this.charLength - this.charReceived) ?
|
||||
this.charLength - this.charReceived :
|
||||
buffer.length;
|
||||
var diff = charLength - charReceived;
|
||||
var available = (buflen >= diff) ? diff : buflen;
|
||||
|
||||
// add the new bytes to the char buffer
|
||||
buffer.copy(this.charBuffer, this.charReceived, 0, available);
|
||||
this.charReceived += available;
|
||||
buffer.copy(charBuffer, charReceived, 0, available);
|
||||
charReceived += available;
|
||||
|
||||
if (this.charReceived < this.charLength) {
|
||||
if (charReceived < charLength) {
|
||||
// still not enough chars in this buffer? wait for more ...
|
||||
|
||||
this.charLength = charLength;
|
||||
this.charReceived = charReceived;
|
||||
|
||||
return '';
|
||||
}
|
||||
|
||||
// remove bytes belonging to the current character from the buffer
|
||||
buffer = buffer.slice(available, buffer.length);
|
||||
buffer = buffer.slice(available, buflen);
|
||||
buflen = buffer.length;
|
||||
|
||||
// get the character that was split
|
||||
charStr = this.charBuffer.slice(0, this.charLength).toString(this.encoding);
|
||||
charStr = charBuffer.toString(encoding, 0, charLength);
|
||||
|
||||
// CESU-8: lead surrogate (D800-DBFF) is also the incomplete character
|
||||
var charCode = charStr.charCodeAt(charStr.length - 1);
|
||||
if (charCode >= 0xD800 && charCode <= 0xDBFF) {
|
||||
this.charLength += this.surrogateSize;
|
||||
charLength += surrogateSize;
|
||||
charStr = '';
|
||||
continue;
|
||||
}
|
||||
this.charReceived = this.charLength = 0;
|
||||
charReceived = charLength = 0;
|
||||
|
||||
// if there are no more bytes in this buffer, just emit our char
|
||||
if (buffer.length === 0) {
|
||||
if (buflen === 0) {
|
||||
this.charLength = charLength;
|
||||
this.charReceived = charReceived;
|
||||
|
||||
return charStr;
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
// determine and set charLength / charReceived
|
||||
this.detectIncompleteChar(buffer);
|
||||
if (this.detectIncompleteChar(buffer))
|
||||
charLength = this.charLength;
|
||||
charReceived = this.charReceived;
|
||||
|
||||
var end = buffer.length;
|
||||
if (this.charLength) {
|
||||
var end = buflen;
|
||||
if (charLength) {
|
||||
// buffer the incomplete character bytes we got
|
||||
buffer.copy(this.charBuffer, 0, buffer.length - this.charReceived, end);
|
||||
end -= this.charReceived;
|
||||
buffer.copy(charBuffer, 0, buflen - charReceived, end);
|
||||
end -= charReceived;
|
||||
}
|
||||
|
||||
charStr += buffer.toString(this.encoding, 0, end);
|
||||
this.charLength = charLength;
|
||||
charStr += buffer.toString(encoding, 0, end);
|
||||
|
||||
var end = charStr.length - 1;
|
||||
var charCode = charStr.charCodeAt(end);
|
||||
// CESU-8: lead surrogate (D800-DBFF) is also the incomplete character
|
||||
if (charCode >= 0xD800 && charCode <= 0xDBFF) {
|
||||
var size = this.surrogateSize;
|
||||
this.charLength += size;
|
||||
this.charReceived += size;
|
||||
this.charBuffer.copy(this.charBuffer, size, 0, size);
|
||||
buffer.copy(this.charBuffer, 0, 0, size);
|
||||
charLength += surrogateSize;
|
||||
charReceived += surrogateSize;
|
||||
charBuffer.copy(charBuffer, surrogateSize, 0, surrogateSize);
|
||||
buffer.copy(charBuffer, 0, 0, surrogateSize);
|
||||
|
||||
this.charLength = charLength;
|
||||
this.charReceived = charReceived;
|
||||
|
||||
return charStr.substring(0, end);
|
||||
}
|
||||
|
||||
@ -130,35 +150,43 @@ StringDecoder.prototype.write = function(buffer) {
|
||||
// length that character, and sets this.charReceived to the number of bytes
|
||||
// that are available for this character.
|
||||
/**
 * Inspects the last (up to three) bytes of `buffer` for a UTF-8 lead byte
 * whose multi-byte sequence cannot be complete within the buffer.
 *
 * Side effects:
 *  - `this.charLength` is set to 2, 3 or 4 when such a lead byte is found;
 *    it is left untouched otherwise.
 *  - `this.charReceived` is always set: to the number of trailing bytes that
 *    belong to the incomplete character, or to 0 when none was found.
 *
 * @param {Buffer} buffer - Bytes to inspect.
 * @returns {boolean} true when `this.charLength` was updated by this call.
 */
StringDecoder.prototype.detectIncompleteChar = function(buffer) {
  var buflen = buffer.length;
  // determine how many bytes we have to check at the end of this buffer
  var i = (buflen >= 3) ? 3 : buflen;
  var newlen = false;

  // Figure out if one of the last i bytes of our buffer announces an
  // incomplete char.
  for (; i > 0; i--) {
    var c = buffer[buflen - i];

    // See http://en.wikipedia.org/wiki/UTF-8#Description

    // 110XXXXX
    if (i === 1 && c >> 5 === 0x06) {
      this.charLength = 2;
      newlen = true;
      break;
    }

    // 1110XXXX
    if (i <= 2 && c >> 4 === 0x0E) {
      this.charLength = 3;
      newlen = true;
      break;
    }

    // 11110XXX
    if (i <= 3 && c >> 3 === 0x1E) {
      this.charLength = 4;
      newlen = true;
      break;
    }
  }

  // When the loop ran to completion i is 0, i.e. nothing is pending.
  this.charReceived = i;

  return newlen;
};
|
||||
|
||||
StringDecoder.prototype.end = function(buffer) {
|
||||
@ -166,11 +194,12 @@ StringDecoder.prototype.end = function(buffer) {
|
||||
if (buffer && buffer.length)
|
||||
res = this.write(buffer);
|
||||
|
||||
if (this.charReceived) {
|
||||
var cr = this.charReceived;
|
||||
var charReceived = this.charReceived;
|
||||
if (charReceived) {
|
||||
var cr = charReceived;
|
||||
var buf = this.charBuffer;
|
||||
var enc = this.encoding;
|
||||
res += buf.slice(0, cr).toString(enc);
|
||||
res += buf.toString(enc, 0, cr);
|
||||
}
|
||||
|
||||
return res;
|
||||
@ -181,11 +210,13 @@ function passThroughWrite(buffer) {
|
||||
}
|
||||
|
||||
/**
 * Incomplete-character detector for 2-byte units; invoked with the decoder
 * as `this`.
 *
 * Sets `this.charReceived` to `buffer.length % 2` and `this.charLength` to 2
 * when a byte is left over, or to 0 otherwise.
 *
 * @param {Buffer} buffer - Bytes to inspect; only `length` is read.
 * @returns {boolean} Always true, since `this.charLength` is written on
 *   every call.
 */
function utf16DetectIncompleteChar(buffer) {
  var charReceived = this.charReceived = buffer.length % 2;
  this.charLength = charReceived ? 2 : 0;
  return true;
}
|
||||
|
||||
/**
 * Incomplete-group detector for 3-byte groups; invoked with the decoder as
 * `this`.
 *
 * Sets `this.charReceived` to `buffer.length % 3` and `this.charLength` to 3
 * when bytes are left over, or to 0 otherwise.
 *
 * @param {Buffer} buffer - Bytes to inspect; only `length` is read.
 * @returns {boolean} Always true, since `this.charLength` is written on
 *   every call.
 */
function base64DetectIncompleteChar(buffer) {
  var charReceived = this.charReceived = buffer.length % 3;
  this.charLength = charReceived ? 3 : 0;
  return true;
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user