string_decoder: reimplement in C++
Implement string decoder in C++. The perks are a decent speed boost (for decoding, whereas creation shows some performance degradation), that this can now be used more easily to add native decoding support to C++ streams, and (arguably) more readable variable names. PR-URL: https://github.com/nodejs/node/pull/18537 Reviewed-By: James M Snell <jasnell@gmail.com> Reviewed-By: Ben Noordhuis <info@bnoordhuis.nl>
This commit is contained in:
parent
de848ac1e0
commit
180af17b52
@ -22,10 +22,23 @@
|
||||
'use strict';
|
||||
|
||||
const { Buffer } = require('buffer');
|
||||
const {
|
||||
kIncompleteCharactersStart,
|
||||
kIncompleteCharactersEnd,
|
||||
kMissingBytes,
|
||||
kBufferedBytes,
|
||||
kEncodingField,
|
||||
kSize,
|
||||
decode,
|
||||
flush,
|
||||
encodings
|
||||
} = internalBinding('string_decoder');
|
||||
const internalUtil = require('internal/util');
|
||||
const errors = require('internal/errors');
|
||||
const isEncoding = Buffer[internalUtil.kIsEncodingSymbol];
|
||||
|
||||
const kNativeDecoder = Symbol('kNativeDecoder');
|
||||
|
||||
// Do not cache `Buffer.isEncoding` when checking encoding names as some
|
||||
// modules monkey-patch it to support additional encodings
|
||||
function normalizeEncoding(enc) {
|
||||
@ -36,258 +49,54 @@ function normalizeEncoding(enc) {
|
||||
return nenc || enc;
|
||||
}
|
||||
|
||||
const encodingsMap = {};
|
||||
for (var i = 0; i < encodings.length; ++i)
|
||||
encodingsMap[encodings[i]] = i;
|
||||
|
||||
// StringDecoder provides an interface for efficiently splitting a series of
|
||||
// buffers into a series of JS strings without breaking apart multi-byte
|
||||
// characters.
|
||||
class StringDecoder {
  /**
   * Creates a decoder for the given encoding.
   *
   * The decoder state (incomplete character bytes, missing/buffered byte
   * counters and the encoding id) is kept in a `kSize`-byte Buffer that the
   * native `decode`/`flush` functions read and update in place.
   *
   * @param {string} [encoding] Encoding name; passed through
   *   `normalizeEncoding()` before use.
   */
  constructor(encoding) {
    this.encoding = normalizeEncoding(encoding);
    this[kNativeDecoder] = Buffer.alloc(kSize);
    // Store the numeric id the native side associates with this encoding.
    this[kNativeDecoder][kEncodingField] = encodingsMap[this.encoding];
  }

  /**
   * Decodes `buf` and returns the resulting string. Bytes belonging to an
   * incomplete trailing character are kept in the native state until a later
   * `write()` or `end()` call.
   *
   * @param {string|ArrayBufferView} buf Data to decode. Strings are returned
   *   unchanged.
   * @returns {string}
   * @throws {TypeError} `ERR_INVALID_ARG_TYPE` if `buf` is neither a string
   *   nor an ArrayBufferView.
   */
  write(buf) {
    if (typeof buf === 'string')
      return buf;
    if (!ArrayBuffer.isView(buf))
      throw new errors.TypeError('ERR_INVALID_ARG_TYPE', 'buf',
                                 ['Buffer', 'Uint8Array', 'ArrayBufferView']);
    return decode(this[kNativeDecoder], buf);
  }

  /**
   * Optionally decodes a final chunk, then flushes whatever bytes are still
   * buffered in the native state.
   *
   * @param {string|ArrayBufferView} [buf] Final chunk of data.
   * @returns {string}
   */
  end(buf) {
    let ret = '';
    if (buf !== undefined)
      ret = this.write(buf);
    // Only call into the native flush when there is something left over.
    if (this[kNativeDecoder][kBufferedBytes] > 0)
      ret += flush(this[kNativeDecoder]);
    return ret;
  }

  /* Everything below this line is undocumented legacy stuff. */

  /**
   * Discards any pending incomplete character and decodes `buf` starting at
   * `offset`.
   *
   * @param {Buffer} buf
   * @param {number} offset
   * @returns {string}
   */
  text(buf, offset) {
    this[kNativeDecoder][kMissingBytes] = 0;
    this[kNativeDecoder][kBufferedBytes] = 0;
    return this.write(buf.slice(offset));
  }

  /**
   * Number of bytes still needed to complete the currently buffered
   * character, as tracked by the native decoder state.
   *
   * Without this accessor `this.lastNeed` is `undefined`, which makes
   * `lastTotal` evaluate to NaN.
   */
  get lastNeed() {
    return this[kNativeDecoder][kMissingBytes];
  }

  /**
   * Total size in bytes of the character currently being assembled
   * (bytes already buffered plus bytes still missing).
   */
  get lastTotal() {
    return this[kNativeDecoder][kBufferedBytes] + this.lastNeed;
  }

  /**
   * View onto the incomplete-character bytes inside the native state buffer.
   * This is a `subarray`, so it shares memory with the decoder state.
   */
  get lastChar() {
    return this[kNativeDecoder].subarray(kIncompleteCharactersStart,
                                         kIncompleteCharactersEnd);
  }
}
|
||||
|
||||
exports.StringDecoder = StringDecoder;
|
||||
function StringDecoder(encoding) {
|
||||
this.encoding = normalizeEncoding(encoding);
|
||||
var nb;
|
||||
switch (this.encoding) {
|
||||
case 'utf16le':
|
||||
this.text = utf16Text;
|
||||
this.end = utf16End;
|
||||
nb = 4;
|
||||
break;
|
||||
case 'utf8':
|
||||
this.fillLast = utf8FillLast;
|
||||
nb = 4;
|
||||
break;
|
||||
case 'base64':
|
||||
this.text = base64Text;
|
||||
this.end = base64End;
|
||||
nb = 3;
|
||||
break;
|
||||
default:
|
||||
this.write = simpleWrite;
|
||||
this.end = simpleEnd;
|
||||
return;
|
||||
}
|
||||
this.lastNeed = 0;
|
||||
this.lastTotal = 0;
|
||||
this.lastChar = Buffer.allocUnsafe(nb);
|
||||
}
|
||||
|
||||
StringDecoder.prototype.write = function(buf) {
|
||||
if (buf.length === 0)
|
||||
return '';
|
||||
var r;
|
||||
var i;
|
||||
if (this.lastNeed) {
|
||||
r = this.fillLast(buf);
|
||||
if (r === undefined)
|
||||
return '';
|
||||
i = this.lastNeed;
|
||||
this.lastNeed = 0;
|
||||
} else {
|
||||
i = 0;
|
||||
}
|
||||
if (i < buf.length)
|
||||
return (r ? r + this.text(buf, i) : this.text(buf, i));
|
||||
return r || '';
|
||||
};
|
||||
|
||||
StringDecoder.prototype.end = utf8End;
|
||||
|
||||
// Returns only complete characters in a Buffer
|
||||
StringDecoder.prototype.text = utf8Text;
|
||||
|
||||
// Attempts to complete a partial non-UTF-8 character using bytes from a Buffer
|
||||
StringDecoder.prototype.fillLast = function(buf) {
|
||||
if (this.lastNeed <= buf.length) {
|
||||
buf.copy(this.lastChar, this.lastTotal - this.lastNeed, 0, this.lastNeed);
|
||||
return this.lastChar.toString(this.encoding, 0, this.lastTotal);
|
||||
}
|
||||
buf.copy(this.lastChar, this.lastTotal - this.lastNeed, 0, buf.length);
|
||||
this.lastNeed -= buf.length;
|
||||
};
|
||||
|
||||
// Checks the type of a UTF-8 byte, whether it's ASCII, a leading byte, or a
|
||||
// continuation byte. If an invalid byte is detected, -2 is returned.
|
||||
function utf8CheckByte(byte) {
|
||||
if (byte <= 0x7F)
|
||||
return 0;
|
||||
else if (byte >> 5 === 0x06)
|
||||
return 2;
|
||||
else if (byte >> 4 === 0x0E)
|
||||
return 3;
|
||||
else if (byte >> 3 === 0x1E)
|
||||
return 4;
|
||||
return (byte >> 6 === 0x02 ? -1 : -2);
|
||||
}
|
||||
|
||||
// Checks at most 3 bytes at the end of a Buffer in order to detect an
|
||||
// incomplete multi-byte UTF-8 character. The total number of bytes (2, 3, or 4)
|
||||
// needed to complete the UTF-8 character (if applicable) are returned.
|
||||
function utf8CheckIncomplete(self, buf, i) {
|
||||
var j = buf.length - 1;
|
||||
if (j < i)
|
||||
return 0;
|
||||
var nb = utf8CheckByte(buf[j]);
|
||||
if (nb >= 0) {
|
||||
if (nb > 0)
|
||||
self.lastNeed = nb - 1;
|
||||
return nb;
|
||||
}
|
||||
if (--j < i || nb === -2)
|
||||
return 0;
|
||||
nb = utf8CheckByte(buf[j]);
|
||||
if (nb >= 0) {
|
||||
if (nb > 0)
|
||||
self.lastNeed = nb - 2;
|
||||
return nb;
|
||||
}
|
||||
if (--j < i || nb === -2)
|
||||
return 0;
|
||||
nb = utf8CheckByte(buf[j]);
|
||||
if (nb >= 0) {
|
||||
if (nb > 0) {
|
||||
if (nb === 2)
|
||||
nb = 0;
|
||||
else
|
||||
self.lastNeed = nb - 3;
|
||||
}
|
||||
return nb;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Validates as many continuation bytes for a multi-byte UTF-8 character as
|
||||
// needed or are available. If we see a non-continuation byte where we expect
|
||||
// one, we "replace" the validated continuation bytes we've seen so far with
|
||||
// a single UTF-8 replacement character ('\ufffd'), to match v8's UTF-8 decoding
|
||||
// behavior. The continuation byte check is included three times in the case
|
||||
// where all of the continuation bytes for a character exist in the same buffer.
|
||||
// It is also done this way as a slight performance increase instead of using a
|
||||
// loop.
|
||||
function utf8CheckExtraBytes(self, buf, p) {
|
||||
if ((buf[0] & 0xC0) !== 0x80) {
|
||||
self.lastNeed = 0;
|
||||
return '\ufffd';
|
||||
}
|
||||
if (self.lastNeed > 1 && buf.length > 1) {
|
||||
if ((buf[1] & 0xC0) !== 0x80) {
|
||||
self.lastNeed = 1;
|
||||
return '\ufffd';
|
||||
}
|
||||
if (self.lastNeed > 2 && buf.length > 2) {
|
||||
if ((buf[2] & 0xC0) !== 0x80) {
|
||||
self.lastNeed = 2;
|
||||
return '\ufffd';
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Attempts to complete a multi-byte UTF-8 character using bytes from a Buffer.
|
||||
function utf8FillLast(buf) {
|
||||
const p = this.lastTotal - this.lastNeed;
|
||||
var r = utf8CheckExtraBytes(this, buf, p);
|
||||
if (r !== undefined)
|
||||
return r;
|
||||
if (this.lastNeed <= buf.length) {
|
||||
buf.copy(this.lastChar, p, 0, this.lastNeed);
|
||||
return this.lastChar.toString(this.encoding, 0, this.lastTotal);
|
||||
}
|
||||
buf.copy(this.lastChar, p, 0, buf.length);
|
||||
this.lastNeed -= buf.length;
|
||||
}
|
||||
|
||||
// Returns all complete UTF-8 characters in a Buffer. If the Buffer ended on a
|
||||
// partial character, the character's bytes are buffered until the required
|
||||
// number of bytes are available.
|
||||
function utf8Text(buf, i) {
|
||||
const total = utf8CheckIncomplete(this, buf, i);
|
||||
if (!this.lastNeed)
|
||||
return buf.toString('utf8', i);
|
||||
this.lastTotal = total;
|
||||
const end = buf.length - (total - this.lastNeed);
|
||||
buf.copy(this.lastChar, 0, end);
|
||||
return buf.toString('utf8', i, end);
|
||||
}
|
||||
|
||||
// For UTF-8, a replacement character is added when ending on a partial
|
||||
// character.
|
||||
function utf8End(buf) {
|
||||
const r = (buf && buf.length ? this.write(buf) : '');
|
||||
if (this.lastNeed) {
|
||||
this.lastNeed = 0;
|
||||
this.lastTotal = 0;
|
||||
return r + '\ufffd';
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
// UTF-16LE typically needs two bytes per character, but even if we have an even
|
||||
// number of bytes available, we need to check if we end on a leading/high
|
||||
// surrogate. In that case, we need to wait for the next two bytes in order to
|
||||
// decode the last character properly.
|
||||
function utf16Text(buf, i) {
|
||||
if ((buf.length - i) % 2 === 0) {
|
||||
const r = buf.toString('utf16le', i);
|
||||
if (r) {
|
||||
const c = r.charCodeAt(r.length - 1);
|
||||
if (c >= 0xD800 && c <= 0xDBFF) {
|
||||
this.lastNeed = 2;
|
||||
this.lastTotal = 4;
|
||||
this.lastChar[0] = buf[buf.length - 2];
|
||||
this.lastChar[1] = buf[buf.length - 1];
|
||||
return r.slice(0, -1);
|
||||
}
|
||||
}
|
||||
return r;
|
||||
}
|
||||
this.lastNeed = 1;
|
||||
this.lastTotal = 2;
|
||||
this.lastChar[0] = buf[buf.length - 1];
|
||||
return buf.toString('utf16le', i, buf.length - 1);
|
||||
}
|
||||
|
||||
// For UTF-16LE we do not explicitly append special replacement characters if we
|
||||
// end on a partial character, we simply let v8 handle that.
|
||||
function utf16End(buf) {
|
||||
const r = (buf && buf.length ? this.write(buf) : '');
|
||||
if (this.lastNeed) {
|
||||
const end = this.lastTotal - this.lastNeed;
|
||||
this.lastNeed = 0;
|
||||
this.lastTotal = 0;
|
||||
return r + this.lastChar.toString('utf16le', 0, end);
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
function base64Text(buf, i) {
|
||||
const n = (buf.length - i) % 3;
|
||||
if (n === 0)
|
||||
return buf.toString('base64', i);
|
||||
this.lastNeed = 3 - n;
|
||||
this.lastTotal = 3;
|
||||
if (n === 1) {
|
||||
this.lastChar[0] = buf[buf.length - 1];
|
||||
} else {
|
||||
this.lastChar[0] = buf[buf.length - 2];
|
||||
this.lastChar[1] = buf[buf.length - 1];
|
||||
}
|
||||
return buf.toString('base64', i, buf.length - n);
|
||||
}
|
||||
|
||||
|
||||
function base64End(buf) {
|
||||
const r = (buf && buf.length ? this.write(buf) : '');
|
||||
if (this.lastNeed) {
|
||||
const end = 3 - this.lastNeed;
|
||||
this.lastNeed = 0;
|
||||
this.lastTotal = 0;
|
||||
return r + this.lastChar.toString('base64', 0, end);
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
// Pass bytes on through for single-byte encodings (e.g. ascii, latin1, hex)
|
||||
function simpleWrite(buf) {
|
||||
return buf.toString(this.encoding);
|
||||
}
|
||||
|
||||
function simpleEnd(buf) {
|
||||
return (buf && buf.length ? this.write(buf) : '');
|
||||
}
|
||||
|
4
node.gyp
4
node.gyp
@ -326,6 +326,7 @@
|
||||
'src/signal_wrap.cc',
|
||||
'src/spawn_sync.cc',
|
||||
'src/string_bytes.cc',
|
||||
'src/string_decoder.cc',
|
||||
'src/string_search.cc',
|
||||
'src/stream_base.cc',
|
||||
'src/stream_wrap.cc',
|
||||
@ -379,6 +380,8 @@
|
||||
'src/req_wrap.h',
|
||||
'src/req_wrap-inl.h',
|
||||
'src/string_bytes.h',
|
||||
'src/string_decoder.h',
|
||||
'src/string_decoder-inl.h',
|
||||
'src/stream_base.h',
|
||||
'src/stream_base-inl.h',
|
||||
'src/stream_wrap.h',
|
||||
@ -989,6 +992,7 @@
|
||||
'<(obj_path)<(obj_separator)node_url.<(obj_suffix)',
|
||||
'<(obj_path)<(obj_separator)util.<(obj_suffix)',
|
||||
'<(obj_path)<(obj_separator)string_bytes.<(obj_suffix)',
|
||||
'<(obj_path)<(obj_separator)string_decoder.<(obj_suffix)',
|
||||
'<(obj_path)<(obj_separator)string_search.<(obj_suffix)',
|
||||
'<(obj_path)<(obj_separator)stream_base.<(obj_suffix)',
|
||||
'<(obj_path)<(obj_separator)node_constants.<(obj_suffix)',
|
||||
|
@ -120,6 +120,7 @@ struct sockaddr;
|
||||
V(signal_wrap) \
|
||||
V(spawn_sync) \
|
||||
V(stream_wrap) \
|
||||
V(string_decoder) \
|
||||
V(tcp_wrap) \
|
||||
V(timer_wrap) \
|
||||
V(trace_events) \
|
||||
|
38
src/string_decoder-inl.h
Normal file
38
src/string_decoder-inl.h
Normal file
@ -0,0 +1,38 @@
|
||||
#ifndef SRC_STRING_DECODER_INL_H_
|
||||
#define SRC_STRING_DECODER_INL_H_
|
||||
|
||||
#if defined(NODE_WANT_INTERNALS) && NODE_WANT_INTERNALS
|
||||
|
||||
#include "string_decoder.h"
|
||||
#include "util.h"
|
||||
|
||||
namespace node {
|
||||
|
||||
void StringDecoder::SetEncoding(enum encoding encoding) {
|
||||
state_[kBufferedBytes] = 0;
|
||||
state_[kMissingBytes] = 0;
|
||||
state_[kEncodingField] = encoding;
|
||||
}
|
||||
|
||||
enum encoding StringDecoder::Encoding() const {
|
||||
return static_cast<enum encoding>(state_[kEncodingField]);
|
||||
}
|
||||
|
||||
unsigned StringDecoder::BufferedBytes() const {
|
||||
return state_[kBufferedBytes];
|
||||
}
|
||||
|
||||
unsigned StringDecoder::MissingBytes() const {
|
||||
return state_[kMissingBytes];
|
||||
}
|
||||
|
||||
char* StringDecoder::IncompleteCharacterBuffer() {
|
||||
return reinterpret_cast<char*>(state_ + kIncompleteCharactersStart);
|
||||
}
|
||||
|
||||
|
||||
} // namespace node
|
||||
|
||||
#endif // defined(NODE_WANT_INTERNALS) && NODE_WANT_INTERNALS
|
||||
|
||||
#endif // SRC_STRING_DECODER_INL_H_
|
334
src/string_decoder.cc
Normal file
334
src/string_decoder.cc
Normal file
@ -0,0 +1,334 @@
|
||||
#include "string_decoder-inl.h"
|
||||
#include "string_bytes.h"
|
||||
#include "node_internals.h"
|
||||
#include "node_buffer.h"
|
||||
|
||||
using v8::Array;
|
||||
using v8::Context;
|
||||
using v8::FunctionCallbackInfo;
|
||||
using v8::Integer;
|
||||
using v8::Isolate;
|
||||
using v8::Local;
|
||||
using v8::MaybeLocal;
|
||||
using v8::Object;
|
||||
using v8::String;
|
||||
using v8::Value;
|
||||
|
||||
namespace node {
|
||||
|
||||
namespace {
|
||||
|
||||
MaybeLocal<String> MakeString(Isolate* isolate,
|
||||
const char* data,
|
||||
size_t length,
|
||||
enum encoding encoding) {
|
||||
Local<Value> error;
|
||||
MaybeLocal<Value> ret;
|
||||
if (encoding == UTF8) {
|
||||
return String::NewFromUtf8(
|
||||
isolate,
|
||||
data,
|
||||
v8::NewStringType::kNormal,
|
||||
length);
|
||||
} else if (encoding == UCS2) {
|
||||
#ifdef DEBUG
|
||||
CHECK_EQ(reinterpret_cast<uintptr_t>(data) % 2, 0);
|
||||
CHECK_EQ(length % 2, 0);
|
||||
#endif
|
||||
ret = StringBytes::Encode(
|
||||
isolate,
|
||||
reinterpret_cast<const uint16_t*>(data),
|
||||
length / 2,
|
||||
&error);
|
||||
} else {
|
||||
ret = StringBytes::Encode(
|
||||
isolate,
|
||||
data,
|
||||
length,
|
||||
encoding,
|
||||
&error);
|
||||
}
|
||||
|
||||
if (ret.IsEmpty()) {
|
||||
CHECK(!error.IsEmpty());
|
||||
isolate->ThrowException(error);
|
||||
}
|
||||
|
||||
#ifdef DEBUG
|
||||
CHECK(ret.IsEmpty() || ret.ToLocalChecked()->IsString());
|
||||
#endif
|
||||
return ret.FromMaybe(Local<Value>()).As<String>();
|
||||
}
|
||||
|
||||
} // anonymous namespace
|
||||
|
||||
|
||||
MaybeLocal<String> StringDecoder::DecodeData(Isolate* isolate,
|
||||
const char* data,
|
||||
size_t* nread_ptr) {
|
||||
Local<String> prepend, body;
|
||||
|
||||
size_t nread = *nread_ptr;
|
||||
|
||||
if (Encoding() == UTF8 || Encoding() == UCS2 || Encoding() == BASE64) {
|
||||
// See if we want bytes to finish a character from the previous
|
||||
// chunk; if so, copy the new bytes to the missing bytes buffer
|
||||
// and create a small string from it that is to be prepended to the
|
||||
// main body.
|
||||
if (MissingBytes() > 0) {
|
||||
// There are never more bytes missing than the pre-calculated maximum.
|
||||
CHECK_LE(MissingBytes() + BufferedBytes(),
|
||||
kIncompleteCharactersEnd);
|
||||
if (Encoding() == UTF8) {
|
||||
// For UTF-8, we need special treatment to align with the V8 decoder:
|
||||
// If an incomplete character is found at a chunk boundary, we turn
|
||||
// that character into a single invalid one.
|
||||
for (size_t i = 0; i < nread && i < MissingBytes(); ++i) {
|
||||
if ((data[i] & 0xC0) != 0x80) {
|
||||
// This byte is not a continuation byte even though it should have
|
||||
// been one.
|
||||
// Act as if there was a 1-byte incomplete character, which does
|
||||
// not make sense but works here because we know it's invalid.
|
||||
state_[kMissingBytes] = 0;
|
||||
state_[kBufferedBytes] = 1;
|
||||
data += i;
|
||||
nread -= i;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
size_t found_bytes =
|
||||
std::min(nread, static_cast<size_t>(MissingBytes()));
|
||||
memcpy(IncompleteCharacterBuffer() + BufferedBytes(),
|
||||
data,
|
||||
found_bytes);
|
||||
// Adjust the two buffers.
|
||||
data += found_bytes;
|
||||
nread -= found_bytes;
|
||||
|
||||
state_[kMissingBytes] -= found_bytes;
|
||||
state_[kBufferedBytes] += found_bytes;
|
||||
|
||||
if (LIKELY(MissingBytes() == 0)) {
|
||||
// If no more bytes are missing, create a small string that we
|
||||
// will later prepend.
|
||||
if (!MakeString(isolate,
|
||||
IncompleteCharacterBuffer(),
|
||||
BufferedBytes(),
|
||||
Encoding()).ToLocal(&prepend)) {
|
||||
return MaybeLocal<String>();
|
||||
}
|
||||
|
||||
*nread_ptr += BufferedBytes();
|
||||
// No more buffered bytes.
|
||||
state_[kBufferedBytes] = 0;
|
||||
}
|
||||
}
|
||||
|
||||
// It could be that trying to finish the previous chunk already
|
||||
// consumed all data that we received in this chunk.
|
||||
if (UNLIKELY(nread == 0)) {
|
||||
body = !prepend.IsEmpty() ? prepend : String::Empty(isolate);
|
||||
prepend = Local<String>();
|
||||
} else {
|
||||
#ifdef DEBUG
|
||||
// If not, that means there is no character left to finish at this point.
|
||||
CHECK_EQ(MissingBytes(), 0);
|
||||
CHECK_EQ(BufferedBytes(), 0);
|
||||
#endif
|
||||
|
||||
// See whether there is a character that we may have to cut off and
|
||||
// finish when receiving the next chunk.
|
||||
if (Encoding() == UTF8 && data[nread - 1] & 0x80) {
|
||||
// This is UTF-8 encoded data and we ended on a non-ASCII UTF-8 byte.
|
||||
// This means we'll need to figure out where the character to which
|
||||
// the byte belongs begins.
|
||||
for (size_t i = nread - 1; ; --i) {
|
||||
#ifdef DEBUG
|
||||
CHECK_LT(i, nread);
|
||||
#endif
|
||||
state_[kBufferedBytes]++;
|
||||
if ((data[i] & 0xC0) == 0x80) {
|
||||
// This byte does not start a character (a "trailing" byte).
|
||||
if (state_[kBufferedBytes] >= 4 || i == 0) {
|
||||
// We either have 4 or more trailing bytes (which means
|
||||
// the current character would not be inside the range for
|
||||
// valid Unicode, and in particular cannot be represented
|
||||
// through JavaScript's UTF-16-based approach to strings), or the
|
||||
// current buffer does not contain the start of a UTF-8 character
|
||||
// at all. Either way, this is invalid UTF8 and we can just
|
||||
// let the engine's decoder handle it.
|
||||
state_[kBufferedBytes] = 0;
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
// Found the first byte of a UTF-8 character. By looking at the
|
||||
// upper bits we can tell how long the character *should* be.
|
||||
if ((data[i] & 0xE0) == 0xC0) {
|
||||
state_[kMissingBytes] = 2;
|
||||
} else if ((data[i] & 0xF0) == 0xE0) {
|
||||
state_[kMissingBytes] = 3;
|
||||
} else if ((data[i] & 0xF8) == 0xF0) {
|
||||
state_[kMissingBytes] = 4;
|
||||
} else {
|
||||
// This lead byte would indicate a character outside of the
|
||||
// representable range.
|
||||
state_[kBufferedBytes] = 0;
|
||||
break;
|
||||
}
|
||||
|
||||
if (BufferedBytes() >= MissingBytes()) {
|
||||
// Received more or exactly as many trailing bytes than the lead
|
||||
// character would indicate. In the "==" case, we have valid
|
||||
// data and don't need to slice anything off;
|
||||
// in the ">" case, this is invalid UTF-8 anyway.
|
||||
state_[kMissingBytes] = 0;
|
||||
state_[kBufferedBytes] = 0;
|
||||
}
|
||||
|
||||
state_[kMissingBytes] -= state_[kBufferedBytes];
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else if (Encoding() == UCS2) {
|
||||
if ((nread % 2) == 1) {
|
||||
// We got half a codepoint, and need the second byte of it.
|
||||
state_[kBufferedBytes] = 1;
|
||||
state_[kMissingBytes] = 1;
|
||||
} else if ((data[nread - 1] & 0xFC) == 0xD8) {
|
||||
// Half a split UTF-16 character.
|
||||
state_[kBufferedBytes] = 2;
|
||||
state_[kMissingBytes] = 2;
|
||||
}
|
||||
} else if (Encoding() == BASE64) {
|
||||
state_[kBufferedBytes] = nread % 3;
|
||||
if (state_[kBufferedBytes] > 0)
|
||||
state_[kMissingBytes] = 3 - BufferedBytes();
|
||||
}
|
||||
|
||||
if (BufferedBytes() > 0) {
|
||||
// Copy the requested number of buffered bytes from the end of the
|
||||
// input into the incomplete character buffer.
|
||||
nread -= BufferedBytes();
|
||||
*nread_ptr -= BufferedBytes();
|
||||
memcpy(IncompleteCharacterBuffer(), data + nread, BufferedBytes());
|
||||
}
|
||||
|
||||
if (nread > 0) {
|
||||
if (!MakeString(isolate, data, nread, Encoding()).ToLocal(&body))
|
||||
return MaybeLocal<String>();
|
||||
} else {
|
||||
body = String::Empty(isolate);
|
||||
}
|
||||
}
|
||||
|
||||
if (prepend.IsEmpty()) {
|
||||
return body;
|
||||
} else {
|
||||
return String::Concat(prepend, body);
|
||||
}
|
||||
} else {
|
||||
CHECK(Encoding() == ASCII || Encoding() == HEX || Encoding() == LATIN1);
|
||||
return MakeString(isolate, data, nread, Encoding());
|
||||
}
|
||||
}
|
||||
|
||||
MaybeLocal<String> StringDecoder::FlushData(Isolate* isolate) {
|
||||
if (Encoding() == ASCII || Encoding() == HEX || Encoding() == LATIN1) {
|
||||
CHECK_EQ(MissingBytes(), 0);
|
||||
CHECK_EQ(BufferedBytes(), 0);
|
||||
}
|
||||
|
||||
if (Encoding() == UCS2 && BufferedBytes() % 2 == 1) {
|
||||
// Ignore a single trailing byte, like the JS decoder does.
|
||||
state_[kMissingBytes]--;
|
||||
state_[kBufferedBytes]--;
|
||||
}
|
||||
|
||||
if (BufferedBytes() == 0)
|
||||
return String::Empty(isolate);
|
||||
|
||||
MaybeLocal<String> ret =
|
||||
MakeString(isolate,
|
||||
IncompleteCharacterBuffer(),
|
||||
BufferedBytes(),
|
||||
Encoding());
|
||||
|
||||
state_[kMissingBytes] = 0;
|
||||
state_[kBufferedBytes] = 0;
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
namespace {
|
||||
|
||||
void DecodeData(const FunctionCallbackInfo<Value>& args) {
|
||||
StringDecoder* decoder =
|
||||
reinterpret_cast<StringDecoder*>(Buffer::Data(args[0]));
|
||||
CHECK_NE(decoder, nullptr);
|
||||
size_t nread = Buffer::Length(args[1]);
|
||||
MaybeLocal<String> ret =
|
||||
decoder->DecodeData(args.GetIsolate(), Buffer::Data(args[1]), &nread);
|
||||
if (!ret.IsEmpty())
|
||||
args.GetReturnValue().Set(ret.ToLocalChecked());
|
||||
}
|
||||
|
||||
void FlushData(const FunctionCallbackInfo<Value>& args) {
|
||||
StringDecoder* decoder =
|
||||
reinterpret_cast<StringDecoder*>(Buffer::Data(args[0]));
|
||||
CHECK_NE(decoder, nullptr);
|
||||
MaybeLocal<String> ret = decoder->FlushData(args.GetIsolate());
|
||||
if (!ret.IsEmpty())
|
||||
args.GetReturnValue().Set(ret.ToLocalChecked());
|
||||
}
|
||||
|
||||
void InitializeStringDecoder(Local<Object> target,
|
||||
Local<Value> unused,
|
||||
Local<Context> context) {
|
||||
Environment* env = Environment::GetCurrent(context);
|
||||
Isolate* isolate = env->isolate();
|
||||
|
||||
#define SET_DECODER_CONSTANT(name) \
|
||||
target->Set(context, \
|
||||
FIXED_ONE_BYTE_STRING(isolate, #name), \
|
||||
Integer::New(isolate, StringDecoder::name)).FromJust()
|
||||
|
||||
SET_DECODER_CONSTANT(kIncompleteCharactersStart);
|
||||
SET_DECODER_CONSTANT(kIncompleteCharactersEnd);
|
||||
SET_DECODER_CONSTANT(kMissingBytes);
|
||||
SET_DECODER_CONSTANT(kBufferedBytes);
|
||||
SET_DECODER_CONSTANT(kEncodingField);
|
||||
SET_DECODER_CONSTANT(kNumFields);
|
||||
|
||||
Local<Array> encodings = Array::New(isolate);
|
||||
#define ADD_TO_ENCODINGS_ARRAY(cname, jsname) \
|
||||
encodings->Set(context, \
|
||||
static_cast<int32_t>(cname), \
|
||||
FIXED_ONE_BYTE_STRING(isolate, jsname)).FromJust()
|
||||
ADD_TO_ENCODINGS_ARRAY(ASCII, "ascii");
|
||||
ADD_TO_ENCODINGS_ARRAY(UTF8, "utf8");
|
||||
ADD_TO_ENCODINGS_ARRAY(BASE64, "base64");
|
||||
ADD_TO_ENCODINGS_ARRAY(UCS2, "utf16le");
|
||||
ADD_TO_ENCODINGS_ARRAY(HEX, "hex");
|
||||
ADD_TO_ENCODINGS_ARRAY(BUFFER, "buffer");
|
||||
ADD_TO_ENCODINGS_ARRAY(LATIN1, "latin1");
|
||||
|
||||
target->Set(context,
|
||||
FIXED_ONE_BYTE_STRING(isolate, "encodings"),
|
||||
encodings).FromJust();
|
||||
|
||||
target->Set(context,
|
||||
FIXED_ONE_BYTE_STRING(isolate, "kSize"),
|
||||
Integer::New(isolate, sizeof(StringDecoder))).FromJust();
|
||||
|
||||
env->SetMethod(target, "decode", DecodeData);
|
||||
env->SetMethod(target, "flush", FlushData);
|
||||
}
|
||||
|
||||
} // anonymous namespace
|
||||
|
||||
} // namespace node
|
||||
|
||||
NODE_MODULE_CONTEXT_AWARE_INTERNAL(string_decoder,
|
||||
node::InitializeStringDecoder)
|
50
src/string_decoder.h
Normal file
50
src/string_decoder.h
Normal file
@ -0,0 +1,50 @@
|
||||
#ifndef SRC_STRING_DECODER_H_
|
||||
#define SRC_STRING_DECODER_H_
|
||||
|
||||
#if defined(NODE_WANT_INTERNALS) && NODE_WANT_INTERNALS
|
||||
|
||||
#include "node.h"
|
||||
|
||||
namespace node {
|
||||
|
||||
class StringDecoder {
|
||||
public:
|
||||
StringDecoder() { state_[kEncodingField] = BUFFER; }
|
||||
inline void SetEncoding(enum encoding encoding);
|
||||
inline enum encoding Encoding() const;
|
||||
|
||||
inline char* IncompleteCharacterBuffer();
|
||||
inline unsigned MissingBytes() const;
|
||||
inline unsigned BufferedBytes() const;
|
||||
|
||||
// Decode a string from the specified encoding.
|
||||
// The value pointed to by `nread` will be modified to reflect that
|
||||
// less data may have been read because it ended on an incomplete character
|
||||
// and more data may have been read because a previously incomplete character
|
||||
// was finished.
|
||||
v8::MaybeLocal<v8::String> DecodeData(v8::Isolate* isolate,
|
||||
const char* data,
|
||||
size_t* nread);
|
||||
// Flush an incomplete character. For character encodings like UTF8 this
|
||||
// means printing replacement characters, but for e.g. Base64 the returned
|
||||
// string contains more data.
|
||||
v8::MaybeLocal<v8::String> FlushData(v8::Isolate* isolate);
|
||||
|
||||
enum Fields {
|
||||
kIncompleteCharactersStart = 0,
|
||||
kIncompleteCharactersEnd = 4,
|
||||
kMissingBytes = 4,
|
||||
kBufferedBytes = 5,
|
||||
kEncodingField = 6,
|
||||
kNumFields = 7
|
||||
};
|
||||
|
||||
private:
|
||||
uint8_t state_[kNumFields] = {};
|
||||
};
|
||||
|
||||
} // namespace node
|
||||
|
||||
#endif // defined(NODE_WANT_INTERNALS) && NODE_WANT_INTERNALS
|
||||
|
||||
#endif // SRC_STRING_DECODER_H_
|
@ -128,6 +128,10 @@ assert.strictEqual(decoder.write(Buffer.from('3DD8', 'hex')), '');
|
||||
assert.strictEqual(decoder.write(Buffer.from('4D', 'hex')), '');
|
||||
assert.strictEqual(decoder.end(), '\ud83d');
|
||||
|
||||
decoder = new StringDecoder('utf16le');
|
||||
assert.strictEqual(decoder.write(Buffer.from('3DD84D', 'hex')), '\ud83d');
|
||||
assert.strictEqual(decoder.end(), '');
|
||||
|
||||
common.expectsError(
|
||||
() => new StringDecoder(1),
|
||||
{
|
||||
|
Loading…
x
Reference in New Issue
Block a user