string_decoder: reimplement in C++
Implement the string decoder in C++. The perks are a decent speed boost (for decoding, whereas creation shows some performance degradation), the ability to more easily add native decoding support to C++ streams, and (arguably) more readable variable names. PR-URL: https://github.com/nodejs/node/pull/18537 Reviewed-By: James M Snell <jasnell@gmail.com> Reviewed-By: Ben Noordhuis <info@bnoordhuis.nl>
This commit is contained in:
parent
de848ac1e0
commit
180af17b52
@ -22,10 +22,23 @@
|
|||||||
'use strict';
|
'use strict';
|
||||||
|
|
||||||
const { Buffer } = require('buffer');
|
const { Buffer } = require('buffer');
|
||||||
|
const {
|
||||||
|
kIncompleteCharactersStart,
|
||||||
|
kIncompleteCharactersEnd,
|
||||||
|
kMissingBytes,
|
||||||
|
kBufferedBytes,
|
||||||
|
kEncodingField,
|
||||||
|
kSize,
|
||||||
|
decode,
|
||||||
|
flush,
|
||||||
|
encodings
|
||||||
|
} = internalBinding('string_decoder');
|
||||||
const internalUtil = require('internal/util');
|
const internalUtil = require('internal/util');
|
||||||
const errors = require('internal/errors');
|
const errors = require('internal/errors');
|
||||||
const isEncoding = Buffer[internalUtil.kIsEncodingSymbol];
|
const isEncoding = Buffer[internalUtil.kIsEncodingSymbol];
|
||||||
|
|
||||||
|
const kNativeDecoder = Symbol('kNativeDecoder');
|
||||||
|
|
||||||
// Do not cache `Buffer.isEncoding` when checking encoding names as some
|
// Do not cache `Buffer.isEncoding` when checking encoding names as some
|
||||||
// modules monkey-patch it to support additional encodings
|
// modules monkey-patch it to support additional encodings
|
||||||
function normalizeEncoding(enc) {
|
function normalizeEncoding(enc) {
|
||||||
@ -36,258 +49,54 @@ function normalizeEncoding(enc) {
|
|||||||
return nenc || enc;
|
return nenc || enc;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Reverse lookup table: encoding name -> numeric id. The id of a name is its
// index in the `encodings` array exported by the `string_decoder` binding.
const encodingsMap = {};
for (let id = 0; id < encodings.length; ++id) {
  const name = encodings[id];
  encodingsMap[name] = id;
}
|
||||||
|
|
||||||
// StringDecoder provides an interface for efficiently splitting a series of
|
// StringDecoder provides an interface for efficiently splitting a series of
|
||||||
// buffers into a series of JS strings without breaking apart multi-byte
|
// buffers into a series of JS strings without breaking apart multi-byte
|
||||||
// characters.
|
// characters.
|
||||||
exports.StringDecoder = StringDecoder;
|
// StringDecoder keeps all of its decoding state in a small Buffer
// (`this[kNativeDecoder]`) and delegates the actual work to the `decode()` and
// `flush()` functions of the `string_decoder` binding.
class StringDecoder {
  /**
   * @param {string} [encoding] Encoding name; it is passed through
   *   `normalizeEncoding()` before being stored on `this.encoding`.
   */
  constructor(encoding) {
    this.encoding = normalizeEncoding(encoding);
    this[kNativeDecoder] = Buffer.alloc(kSize);
    this[kNativeDecoder][kEncodingField] = encodingsMap[this.encoding];
  }

  /**
   * Decodes a chunk of data.
   * @param {string|ArrayBufferView} buf Strings are returned unchanged;
   *   views are handed to the native decoder.
   * @returns {string} The decoded text.
   * @throws {TypeError} ERR_INVALID_ARG_TYPE if `buf` is neither a string nor
   *   an ArrayBufferView.
   */
  write(buf) {
    if (typeof buf === 'string')
      return buf;
    if (!ArrayBuffer.isView(buf))
      throw new errors.TypeError('ERR_INVALID_ARG_TYPE', 'buf',
                                 ['Buffer', 'Uint8Array', 'ArrayBufferView']);
    return decode(this[kNativeDecoder], buf);
  }

  /**
   * Optionally decodes a final chunk, then flushes whatever bytes are still
   * buffered for an incomplete character.
   * @param {string|ArrayBufferView} [buf] Final chunk, if any.
   * @returns {string} The remaining decoded text.
   */
  end(buf) {
    let ret = '';
    if (buf !== undefined)
      ret = this.write(buf);
    if (this[kNativeDecoder][kBufferedBytes] > 0)
      ret += flush(this[kNativeDecoder]);
    return ret;
  }

  /* Everything below this line is undocumented legacy stuff. */

  // Discards any buffered partial character, then decodes `buf` starting at
  // `offset`.
  text(buf, offset) {
    this[kNativeDecoder][kMissingBytes] = 0;
    this[kNativeDecoder][kBufferedBytes] = 0;
    return this.write(buf.slice(offset));
  }

  // Number of bytes still needed to complete the buffered character.
  // `lastTotal` depends on this accessor; without it `this.lastNeed` is
  // undefined and `lastTotal` evaluates to NaN.
  get lastNeed() {
    return this[kNativeDecoder][kMissingBytes];
  }

  // Buffered bytes plus missing bytes of the current incomplete character.
  get lastTotal() {
    return this[kNativeDecoder][kBufferedBytes] + this.lastNeed;
  }

  // View onto the bytes of the native state that hold the incomplete
  // character.
  get lastChar() {
    return this[kNativeDecoder].subarray(kIncompleteCharactersStart,
                                         kIncompleteCharactersEnd);
  }
}
|
||||||
|
|
||||||
// Attempts to complete a multi-byte UTF-8 character using bytes from a Buffer.
|
exports.StringDecoder = StringDecoder;
|
||||||
function utf8FillLast(buf) {
|
|
||||||
const p = this.lastTotal - this.lastNeed;
|
|
||||||
var r = utf8CheckExtraBytes(this, buf, p);
|
|
||||||
if (r !== undefined)
|
|
||||||
return r;
|
|
||||||
if (this.lastNeed <= buf.length) {
|
|
||||||
buf.copy(this.lastChar, p, 0, this.lastNeed);
|
|
||||||
return this.lastChar.toString(this.encoding, 0, this.lastTotal);
|
|
||||||
}
|
|
||||||
buf.copy(this.lastChar, p, 0, buf.length);
|
|
||||||
this.lastNeed -= buf.length;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Returns all complete UTF-8 characters in a Buffer. If the Buffer ended on a
|
|
||||||
// partial character, the character's bytes are buffered until the required
|
|
||||||
// number of bytes are available.
|
|
||||||
function utf8Text(buf, i) {
|
|
||||||
const total = utf8CheckIncomplete(this, buf, i);
|
|
||||||
if (!this.lastNeed)
|
|
||||||
return buf.toString('utf8', i);
|
|
||||||
this.lastTotal = total;
|
|
||||||
const end = buf.length - (total - this.lastNeed);
|
|
||||||
buf.copy(this.lastChar, 0, end);
|
|
||||||
return buf.toString('utf8', i, end);
|
|
||||||
}
|
|
||||||
|
|
||||||
// For UTF-8, a replacement character is added when ending on a partial
|
|
||||||
// character.
|
|
||||||
function utf8End(buf) {
|
|
||||||
const r = (buf && buf.length ? this.write(buf) : '');
|
|
||||||
if (this.lastNeed) {
|
|
||||||
this.lastNeed = 0;
|
|
||||||
this.lastTotal = 0;
|
|
||||||
return r + '\ufffd';
|
|
||||||
}
|
|
||||||
return r;
|
|
||||||
}
|
|
||||||
|
|
||||||
// UTF-16LE typically needs two bytes per character, but even if we have an even
|
|
||||||
// number of bytes available, we need to check if we end on a leading/high
|
|
||||||
// surrogate. In that case, we need to wait for the next two bytes in order to
|
|
||||||
// decode the last character properly.
|
|
||||||
function utf16Text(buf, i) {
|
|
||||||
if ((buf.length - i) % 2 === 0) {
|
|
||||||
const r = buf.toString('utf16le', i);
|
|
||||||
if (r) {
|
|
||||||
const c = r.charCodeAt(r.length - 1);
|
|
||||||
if (c >= 0xD800 && c <= 0xDBFF) {
|
|
||||||
this.lastNeed = 2;
|
|
||||||
this.lastTotal = 4;
|
|
||||||
this.lastChar[0] = buf[buf.length - 2];
|
|
||||||
this.lastChar[1] = buf[buf.length - 1];
|
|
||||||
return r.slice(0, -1);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return r;
|
|
||||||
}
|
|
||||||
this.lastNeed = 1;
|
|
||||||
this.lastTotal = 2;
|
|
||||||
this.lastChar[0] = buf[buf.length - 1];
|
|
||||||
return buf.toString('utf16le', i, buf.length - 1);
|
|
||||||
}
|
|
||||||
|
|
||||||
// For UTF-16LE we do not explicitly append special replacement characters if we
|
|
||||||
// end on a partial character, we simply let v8 handle that.
|
|
||||||
function utf16End(buf) {
|
|
||||||
const r = (buf && buf.length ? this.write(buf) : '');
|
|
||||||
if (this.lastNeed) {
|
|
||||||
const end = this.lastTotal - this.lastNeed;
|
|
||||||
this.lastNeed = 0;
|
|
||||||
this.lastTotal = 0;
|
|
||||||
return r + this.lastChar.toString('utf16le', 0, end);
|
|
||||||
}
|
|
||||||
return r;
|
|
||||||
}
|
|
||||||
|
|
||||||
function base64Text(buf, i) {
|
|
||||||
const n = (buf.length - i) % 3;
|
|
||||||
if (n === 0)
|
|
||||||
return buf.toString('base64', i);
|
|
||||||
this.lastNeed = 3 - n;
|
|
||||||
this.lastTotal = 3;
|
|
||||||
if (n === 1) {
|
|
||||||
this.lastChar[0] = buf[buf.length - 1];
|
|
||||||
} else {
|
|
||||||
this.lastChar[0] = buf[buf.length - 2];
|
|
||||||
this.lastChar[1] = buf[buf.length - 1];
|
|
||||||
}
|
|
||||||
return buf.toString('base64', i, buf.length - n);
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
function base64End(buf) {
|
|
||||||
const r = (buf && buf.length ? this.write(buf) : '');
|
|
||||||
if (this.lastNeed) {
|
|
||||||
const end = 3 - this.lastNeed;
|
|
||||||
this.lastNeed = 0;
|
|
||||||
this.lastTotal = 0;
|
|
||||||
return r + this.lastChar.toString('base64', 0, end);
|
|
||||||
}
|
|
||||||
return r;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Pass bytes on through for single-byte encodings (e.g. ascii, latin1, hex)
|
|
||||||
function simpleWrite(buf) {
|
|
||||||
return buf.toString(this.encoding);
|
|
||||||
}
|
|
||||||
|
|
||||||
function simpleEnd(buf) {
|
|
||||||
return (buf && buf.length ? this.write(buf) : '');
|
|
||||||
}
|
|
||||||
|
4
node.gyp
4
node.gyp
@ -326,6 +326,7 @@
|
|||||||
'src/signal_wrap.cc',
|
'src/signal_wrap.cc',
|
||||||
'src/spawn_sync.cc',
|
'src/spawn_sync.cc',
|
||||||
'src/string_bytes.cc',
|
'src/string_bytes.cc',
|
||||||
|
'src/string_decoder.cc',
|
||||||
'src/string_search.cc',
|
'src/string_search.cc',
|
||||||
'src/stream_base.cc',
|
'src/stream_base.cc',
|
||||||
'src/stream_wrap.cc',
|
'src/stream_wrap.cc',
|
||||||
@ -379,6 +380,8 @@
|
|||||||
'src/req_wrap.h',
|
'src/req_wrap.h',
|
||||||
'src/req_wrap-inl.h',
|
'src/req_wrap-inl.h',
|
||||||
'src/string_bytes.h',
|
'src/string_bytes.h',
|
||||||
|
'src/string_decoder.h',
|
||||||
|
'src/string_decoder-inl.h',
|
||||||
'src/stream_base.h',
|
'src/stream_base.h',
|
||||||
'src/stream_base-inl.h',
|
'src/stream_base-inl.h',
|
||||||
'src/stream_wrap.h',
|
'src/stream_wrap.h',
|
||||||
@ -989,6 +992,7 @@
|
|||||||
'<(obj_path)<(obj_separator)node_url.<(obj_suffix)',
|
'<(obj_path)<(obj_separator)node_url.<(obj_suffix)',
|
||||||
'<(obj_path)<(obj_separator)util.<(obj_suffix)',
|
'<(obj_path)<(obj_separator)util.<(obj_suffix)',
|
||||||
'<(obj_path)<(obj_separator)string_bytes.<(obj_suffix)',
|
'<(obj_path)<(obj_separator)string_bytes.<(obj_suffix)',
|
||||||
|
'<(obj_path)<(obj_separator)string_decoder.<(obj_suffix)',
|
||||||
'<(obj_path)<(obj_separator)string_search.<(obj_suffix)',
|
'<(obj_path)<(obj_separator)string_search.<(obj_suffix)',
|
||||||
'<(obj_path)<(obj_separator)stream_base.<(obj_suffix)',
|
'<(obj_path)<(obj_separator)stream_base.<(obj_suffix)',
|
||||||
'<(obj_path)<(obj_separator)node_constants.<(obj_suffix)',
|
'<(obj_path)<(obj_separator)node_constants.<(obj_suffix)',
|
||||||
|
@ -120,6 +120,7 @@ struct sockaddr;
|
|||||||
V(signal_wrap) \
|
V(signal_wrap) \
|
||||||
V(spawn_sync) \
|
V(spawn_sync) \
|
||||||
V(stream_wrap) \
|
V(stream_wrap) \
|
||||||
|
V(string_decoder) \
|
||||||
V(tcp_wrap) \
|
V(tcp_wrap) \
|
||||||
V(timer_wrap) \
|
V(timer_wrap) \
|
||||||
V(trace_events) \
|
V(trace_events) \
|
||||||
|
38
src/string_decoder-inl.h
Normal file
38
src/string_decoder-inl.h
Normal file
@ -0,0 +1,38 @@
|
|||||||
|
#ifndef SRC_STRING_DECODER_INL_H_
|
||||||
|
#define SRC_STRING_DECODER_INL_H_
|
||||||
|
|
||||||
|
#if defined(NODE_WANT_INTERNALS) && NODE_WANT_INTERNALS
|
||||||
|
|
||||||
|
#include "string_decoder.h"
|
||||||
|
#include "util.h"
|
||||||
|
|
||||||
|
namespace node {
|
||||||
|
|
||||||
|
void StringDecoder::SetEncoding(enum encoding encoding) {
|
||||||
|
state_[kBufferedBytes] = 0;
|
||||||
|
state_[kMissingBytes] = 0;
|
||||||
|
state_[kEncodingField] = encoding;
|
||||||
|
}
|
||||||
|
|
||||||
|
enum encoding StringDecoder::Encoding() const {
|
||||||
|
return static_cast<enum encoding>(state_[kEncodingField]);
|
||||||
|
}
|
||||||
|
|
||||||
|
unsigned StringDecoder::BufferedBytes() const {
|
||||||
|
return state_[kBufferedBytes];
|
||||||
|
}
|
||||||
|
|
||||||
|
unsigned StringDecoder::MissingBytes() const {
|
||||||
|
return state_[kMissingBytes];
|
||||||
|
}
|
||||||
|
|
||||||
|
char* StringDecoder::IncompleteCharacterBuffer() {
|
||||||
|
return reinterpret_cast<char*>(state_ + kIncompleteCharactersStart);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
} // namespace node
|
||||||
|
|
||||||
|
#endif // defined(NODE_WANT_INTERNALS) && NODE_WANT_INTERNALS
|
||||||
|
|
||||||
|
#endif // SRC_STRING_DECODER_INL_H_
|
334
src/string_decoder.cc
Normal file
334
src/string_decoder.cc
Normal file
@ -0,0 +1,334 @@
|
|||||||
|
#include "string_decoder-inl.h"
|
||||||
|
#include "string_bytes.h"
|
||||||
|
#include "node_internals.h"
|
||||||
|
#include "node_buffer.h"
|
||||||
|
|
||||||
|
using v8::Array;
|
||||||
|
using v8::Context;
|
||||||
|
using v8::FunctionCallbackInfo;
|
||||||
|
using v8::Integer;
|
||||||
|
using v8::Isolate;
|
||||||
|
using v8::Local;
|
||||||
|
using v8::MaybeLocal;
|
||||||
|
using v8::Object;
|
||||||
|
using v8::String;
|
||||||
|
using v8::Value;
|
||||||
|
|
||||||
|
namespace node {
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
|
||||||
|
// Creates a JS string from `length` bytes starting at `data`, interpreted
// according to `encoding`.
//
// Returns an empty MaybeLocal on failure. For non-UTF-8 encodings the error
// reported by StringBytes::Encode() is thrown on the isolate first.
MaybeLocal<String> MakeString(Isolate* isolate,
                              const char* data,
                              size_t length,
                              enum encoding encoding) {
  Local<Value> error;
  MaybeLocal<Value> ret;
  if (encoding == UTF8) {
    return String::NewFromUtf8(
        isolate,
        data,
        v8::NewStringType::kNormal,
        length);
  } else if (encoding == UCS2) {
#ifdef DEBUG
    CHECK_EQ(length % 2, 0);
#endif
    const size_t code_units = length / 2;
    if (reinterpret_cast<uintptr_t>(data) % sizeof(uint16_t) == 0) {
      ret = StringBytes::Encode(
          isolate,
          reinterpret_cast<const uint16_t*>(data),
          code_units,
          &error);
    } else {
      // `data` may sit at an odd address, e.g. when the decoder has consumed
      // an odd number of bytes from the chunk to finish a buffered character,
      // or when the input view starts at an odd offset. Reading it through a
      // uint16_t pointer would be an unaligned access, so copy it into
      // suitably aligned storage first.
      // NOTE(review): relies on StringBytes::Encode() copying its input, as
      // the existing call sites already assume for caller-owned memory.
      MaybeStackBuffer<uint16_t> aligned;
      aligned.AllocateSufficientStorage(code_units);
      memcpy(aligned.out(), data, code_units * sizeof(uint16_t));
      ret = StringBytes::Encode(
          isolate,
          aligned.out(),
          code_units,
          &error);
    }
  } else {
    ret = StringBytes::Encode(
        isolate,
        data,
        length,
        encoding,
        &error);
  }

  if (ret.IsEmpty()) {
    CHECK(!error.IsEmpty());
    isolate->ThrowException(error);
  }

#ifdef DEBUG
  CHECK(ret.IsEmpty() || ret.ToLocalChecked()->IsString());
#endif
  return ret.FromMaybe(Local<Value>()).As<String>();
}
|
||||||
|
|
||||||
|
} // anonymous namespace
|
||||||
|
|
||||||
|
|
||||||
|
// Decodes the bytes at `data` into a string, carrying incomplete multi-byte
// characters over to the next call via the state array.
//
// `*nread_ptr` holds the number of input bytes on entry. It is increased by
// the size of a previously buffered character that this call completed, and
// decreased by the number of trailing bytes that were held back because they
// belong to a character that is not complete yet.
MaybeLocal<String> StringDecoder::DecodeData(Isolate* isolate,
                                             const char* data,
                                             size_t* nread_ptr) {
  const enum encoding enc = Encoding();
  size_t remaining = *nread_ptr;

  // Single-byte encodings never split a character across chunks.
  if (enc != UTF8 && enc != UCS2 && enc != BASE64) {
    CHECK(enc == ASCII || enc == HEX || enc == LATIN1);
    return MakeString(isolate, data, remaining, enc);
  }

  Local<String> prepend;
  Local<String> body;

  // First, try to finish a character left over from the previous chunk: copy
  // the new bytes into the incomplete character buffer and, once complete,
  // turn it into a small string that is prepended to the main body.
  if (MissingBytes() > 0) {
    // There are never more bytes missing than the pre-calculated maximum.
    CHECK_LE(MissingBytes() + BufferedBytes(),
             kIncompleteCharactersEnd);
    if (enc == UTF8) {
      // For UTF-8, we need special treatment to align with the V8 decoder:
      // an incomplete character found at a chunk boundary is turned into a
      // single invalid one.
      for (size_t i = 0; i < remaining && i < MissingBytes(); ++i) {
        if ((data[i] & 0xC0) == 0x80)
          continue;
        // This byte should have been a continuation byte but is not.
        // Act as if there was a 1-byte incomplete character, which does
        // not make sense but works here because we know it's invalid.
        state_[kMissingBytes] = 0;
        state_[kBufferedBytes] = 1;
        data += i;
        remaining -= i;
        break;
      }
    }

    const size_t found_bytes =
        std::min(remaining, static_cast<size_t>(MissingBytes()));
    memcpy(IncompleteCharacterBuffer() + BufferedBytes(),
           data,
           found_bytes);
    // Advance the input and update the bookkeeping accordingly.
    data += found_bytes;
    remaining -= found_bytes;
    state_[kMissingBytes] -= found_bytes;
    state_[kBufferedBytes] += found_bytes;

    if (LIKELY(MissingBytes() == 0)) {
      // Nothing is missing anymore: build the string to prepend.
      if (!MakeString(isolate,
                      IncompleteCharacterBuffer(),
                      BufferedBytes(),
                      enc).ToLocal(&prepend)) {
        return MaybeLocal<String>();
      }

      *nread_ptr += BufferedBytes();
      // No more buffered bytes.
      state_[kBufferedBytes] = 0;
    }
  }

  // Finishing the previous character may already have consumed all of the
  // data received in this chunk.
  if (UNLIKELY(remaining == 0)) {
    if (prepend.IsEmpty())
      return String::Empty(isolate);
    return prepend;
  }

#ifdef DEBUG
  // If there is data left, there is no character left to finish at this
  // point.
  CHECK_EQ(MissingBytes(), 0);
  CHECK_EQ(BufferedBytes(), 0);
#endif

  // See whether the chunk ends in a character that has to be cut off and
  // finished when the next chunk arrives.
  if (enc == UTF8 && (data[remaining - 1] & 0x80) != 0) {
    // The chunk ends on a non-ASCII UTF-8 byte, so walk backwards to find
    // where the character that byte belongs to begins.
    for (size_t i = remaining - 1; ; --i) {
#ifdef DEBUG
      CHECK_LT(i, remaining);
#endif
      state_[kBufferedBytes]++;

      const bool is_continuation = (data[i] & 0xC0) == 0x80;
      if (is_continuation) {
        // This byte does not start a character (a "trailing" byte).
        if (state_[kBufferedBytes] < 4 && i != 0)
          continue;
        // We either have more than 4 trailing bytes (which means the
        // current character would not be inside the range for valid
        // Unicode, and in particular cannot be represented through
        // JavaScript's UTF-16-based approach to strings), or the current
        // buffer does not contain the start of a UTF-8 character at all.
        // Either way, this is invalid UTF-8 and we can just let the
        // engine's decoder handle it.
        state_[kBufferedBytes] = 0;
        break;
      }

      // Found the first byte of a UTF-8 character. Its upper bits tell how
      // long the character *should* be.
      uint8_t expected_length;
      if ((data[i] & 0xE0) == 0xC0) {
        expected_length = 2;
      } else if ((data[i] & 0xF0) == 0xE0) {
        expected_length = 3;
      } else if ((data[i] & 0xF8) == 0xF0) {
        expected_length = 4;
      } else {
        // This lead byte would indicate a character outside of the
        // representable range.
        state_[kBufferedBytes] = 0;
        break;
      }
      state_[kMissingBytes] = expected_length;

      if (BufferedBytes() >= MissingBytes()) {
        // Received as many bytes as the lead byte indicates, or more. With
        // exactly as many, the data is valid and nothing needs to be sliced
        // off; with more, this is invalid UTF-8 anyway.
        state_[kMissingBytes] = 0;
        state_[kBufferedBytes] = 0;
      } else {
        state_[kMissingBytes] -= state_[kBufferedBytes];
      }
      break;
    }
  } else if (enc == UCS2) {
    if ((remaining % 2) == 1) {
      // We got half a code unit, and need the second byte of it.
      state_[kBufferedBytes] = 1;
      state_[kMissingBytes] = 1;
    } else if ((data[remaining - 1] & 0xFC) == 0xD8) {
      // Half a split UTF-16 character.
      state_[kBufferedBytes] = 2;
      state_[kMissingBytes] = 2;
    }
  } else if (enc == BASE64) {
    state_[kBufferedBytes] = remaining % 3;
    if (state_[kBufferedBytes] > 0)
      state_[kMissingBytes] = 3 - BufferedBytes();
  }

  if (BufferedBytes() > 0) {
    // Move the held-back bytes from the end of the input into the
    // incomplete character buffer.
    remaining -= BufferedBytes();
    *nread_ptr -= BufferedBytes();
    memcpy(IncompleteCharacterBuffer(), data + remaining, BufferedBytes());
  }

  if (remaining > 0) {
    if (!MakeString(isolate, data, remaining, enc).ToLocal(&body))
      return MaybeLocal<String>();
  } else {
    body = String::Empty(isolate);
  }

  if (prepend.IsEmpty())
    return body;
  return String::Concat(prepend, body);
}
|
||||||
|
|
||||||
|
// Converts whatever is left in the incomplete character buffer into a string
// and resets the buffered/missing byte counters.
MaybeLocal<String> StringDecoder::FlushData(Isolate* isolate) {
  const enum encoding enc = Encoding();

  // Single-byte encodings never buffer anything.
  if (enc == ASCII || enc == HEX || enc == LATIN1) {
    CHECK_EQ(MissingBytes(), 0);
    CHECK_EQ(BufferedBytes(), 0);
  }

  if (enc == UCS2 && BufferedBytes() % 2 == 1) {
    // Ignore a single trailing byte, like the JS decoder does.
    state_[kMissingBytes]--;
    state_[kBufferedBytes]--;
  }

  const unsigned pending = BufferedBytes();
  if (pending == 0)
    return String::Empty(isolate);

  MaybeLocal<String> flushed =
      MakeString(isolate, IncompleteCharacterBuffer(), pending, enc);

  state_[kMissingBytes] = 0;
  state_[kBufferedBytes] = 0;

  return flushed;
}
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
|
||||||
|
void DecodeData(const FunctionCallbackInfo<Value>& args) {
|
||||||
|
StringDecoder* decoder =
|
||||||
|
reinterpret_cast<StringDecoder*>(Buffer::Data(args[0]));
|
||||||
|
CHECK_NE(decoder, nullptr);
|
||||||
|
size_t nread = Buffer::Length(args[1]);
|
||||||
|
MaybeLocal<String> ret =
|
||||||
|
decoder->DecodeData(args.GetIsolate(), Buffer::Data(args[1]), &nread);
|
||||||
|
if (!ret.IsEmpty())
|
||||||
|
args.GetReturnValue().Set(ret.ToLocalChecked());
|
||||||
|
}
|
||||||
|
|
||||||
|
void FlushData(const FunctionCallbackInfo<Value>& args) {
|
||||||
|
StringDecoder* decoder =
|
||||||
|
reinterpret_cast<StringDecoder*>(Buffer::Data(args[0]));
|
||||||
|
CHECK_NE(decoder, nullptr);
|
||||||
|
MaybeLocal<String> ret = decoder->FlushData(args.GetIsolate());
|
||||||
|
if (!ret.IsEmpty())
|
||||||
|
args.GetReturnValue().Set(ret.ToLocalChecked());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Populates the `string_decoder` binding object with:
//  - the field indices of the StringDecoder state array,
//  - `encodings`, an array mapping each encoding's enum value to its JS name,
//  - `kSize`, the size in bytes of a StringDecoder instance,
//  - the `decode` and `flush` functions.
void InitializeStringDecoder(Local<Object> target,
                             Local<Value> unused,
                             Local<Context> context) {
  Environment* env = Environment::GetCurrent(context);
  Isolate* isolate = env->isolate();

#define SET_DECODER_CONSTANT(name)                                            \
  target->Set(context,                                                        \
              FIXED_ONE_BYTE_STRING(isolate, #name),                          \
              Integer::New(isolate, StringDecoder::name)).FromJust()

  SET_DECODER_CONSTANT(kIncompleteCharactersStart);
  SET_DECODER_CONSTANT(kIncompleteCharactersEnd);
  SET_DECODER_CONSTANT(kMissingBytes);
  SET_DECODER_CONSTANT(kBufferedBytes);
  SET_DECODER_CONSTANT(kEncodingField);
  SET_DECODER_CONSTANT(kNumFields);

  // The helper macro is only meaningful inside this function; do not let it
  // leak into the rest of the translation unit.
#undef SET_DECODER_CONSTANT

  // Each name is stored at the index equal to its enum value, so JS can map a
  // name back to the numeric id expected in the state array.
  Local<Array> encodings = Array::New(isolate);
#define ADD_TO_ENCODINGS_ARRAY(cname, jsname)                                 \
  encodings->Set(context,                                                     \
                 static_cast<int32_t>(cname),                                 \
                 FIXED_ONE_BYTE_STRING(isolate, jsname)).FromJust()
  ADD_TO_ENCODINGS_ARRAY(ASCII, "ascii");
  ADD_TO_ENCODINGS_ARRAY(UTF8, "utf8");
  ADD_TO_ENCODINGS_ARRAY(BASE64, "base64");
  ADD_TO_ENCODINGS_ARRAY(UCS2, "utf16le");
  ADD_TO_ENCODINGS_ARRAY(HEX, "hex");
  ADD_TO_ENCODINGS_ARRAY(BUFFER, "buffer");
  ADD_TO_ENCODINGS_ARRAY(LATIN1, "latin1");

#undef ADD_TO_ENCODINGS_ARRAY

  target->Set(context,
              FIXED_ONE_BYTE_STRING(isolate, "encodings"),
              encodings).FromJust();

  target->Set(context,
              FIXED_ONE_BYTE_STRING(isolate, "kSize"),
              Integer::New(isolate, sizeof(StringDecoder))).FromJust();

  env->SetMethod(target, "decode", DecodeData);
  env->SetMethod(target, "flush", FlushData);
}
|
||||||
|
|
||||||
|
} // anonymous namespace
|
||||||
|
|
||||||
|
} // namespace node
|
||||||
|
|
||||||
|
NODE_MODULE_CONTEXT_AWARE_INTERNAL(string_decoder,
|
||||||
|
node::InitializeStringDecoder)
|
50
src/string_decoder.h
Normal file
50
src/string_decoder.h
Normal file
@ -0,0 +1,50 @@
|
|||||||
|
#ifndef SRC_STRING_DECODER_H_
|
||||||
|
#define SRC_STRING_DECODER_H_
|
||||||
|
|
||||||
|
#if defined(NODE_WANT_INTERNALS) && NODE_WANT_INTERNALS
|
||||||
|
|
||||||
|
#include "node.h"
|
||||||
|
|
||||||
|
namespace node {
|
||||||
|
|
||||||
|
class StringDecoder {
|
||||||
|
public:
|
||||||
|
StringDecoder() { state_[kEncodingField] = BUFFER; }
|
||||||
|
inline void SetEncoding(enum encoding encoding);
|
||||||
|
inline enum encoding Encoding() const;
|
||||||
|
|
||||||
|
inline char* IncompleteCharacterBuffer();
|
||||||
|
inline unsigned MissingBytes() const;
|
||||||
|
inline unsigned BufferedBytes() const;
|
||||||
|
|
||||||
|
// Decode a string from the specified encoding.
|
||||||
|
// The value pointed to by `nread` will be modified to reflect that
|
||||||
|
// less data may have been read because it ended on an incomplete character
|
||||||
|
// and more data may have been read because a previously incomplete character
|
||||||
|
// was finished.
|
||||||
|
v8::MaybeLocal<v8::String> DecodeData(v8::Isolate* isolate,
|
||||||
|
const char* data,
|
||||||
|
size_t* nread);
|
||||||
|
// Flush an incomplete character. For character encodings like UTF8 this
|
||||||
|
// means printing replacement characters, buf for e.g. Base64 the returned
|
||||||
|
// string contains more data.
|
||||||
|
v8::MaybeLocal<v8::String> FlushData(v8::Isolate* isolate);
|
||||||
|
|
||||||
|
enum Fields {
|
||||||
|
kIncompleteCharactersStart = 0,
|
||||||
|
kIncompleteCharactersEnd = 4,
|
||||||
|
kMissingBytes = 4,
|
||||||
|
kBufferedBytes = 5,
|
||||||
|
kEncodingField = 6,
|
||||||
|
kNumFields = 7
|
||||||
|
};
|
||||||
|
|
||||||
|
private:
|
||||||
|
uint8_t state_[kNumFields] = {};
|
||||||
|
};
|
||||||
|
|
||||||
|
} // namespace node
|
||||||
|
|
||||||
|
#endif // defined(NODE_WANT_INTERNALS) && NODE_WANT_INTERNALS
|
||||||
|
|
||||||
|
#endif // SRC_STRING_DECODER_H_
|
@ -128,6 +128,10 @@ assert.strictEqual(decoder.write(Buffer.from('3DD8', 'hex')), '');
|
|||||||
assert.strictEqual(decoder.write(Buffer.from('4D', 'hex')), '');
|
assert.strictEqual(decoder.write(Buffer.from('4D', 'hex')), '');
|
||||||
assert.strictEqual(decoder.end(), '\ud83d');
|
assert.strictEqual(decoder.end(), '\ud83d');
|
||||||
|
|
||||||
|
decoder = new StringDecoder('utf16le');
|
||||||
|
assert.strictEqual(decoder.write(Buffer.from('3DD84D', 'hex')), '\ud83d');
|
||||||
|
assert.strictEqual(decoder.end(), '');
|
||||||
|
|
||||||
common.expectsError(
|
common.expectsError(
|
||||||
() => new StringDecoder(1),
|
() => new StringDecoder(1),
|
||||||
{
|
{
|
||||||
|
Loading…
x
Reference in New Issue
Block a user