QCborStreamReader: move the UTF-8 decoding into readStringChunk

This allows us to decode long UTF-8 strings in chunks, instead of
allocating a big block of the size of the UTF-8 source and then another
for the full UTF-16 content.

Task-number: QTBUG-88253
Change-Id: I7b9b97ae9b32412abdc6fffd16452a47b1036ef3
Reviewed-by: Allan Sandfeld Jensen <allan.jensen@qt.io>
(cherry picked from commit bab2cd1125a21885bea97079219031ab45861826)
Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
This commit is contained in:
Thiago Macieira 2020-11-06 23:48:49 -08:00
parent 8675f81cd8
commit d7a0f8356a

View File

@ -48,6 +48,7 @@
#include <qiodevice.h>
#include <qdebug.h>
#include <qstack.h>
#include <qvarlengtharray.h>
QT_BEGIN_NAMESPACE
@ -671,18 +672,23 @@ public:
union {
char *ptr;
QByteArray *array;
QString *string;
};
enum { ByteArray = -1 };
enum { ByteArray = -1, String = -3 };
qsizetype maxlen_or_type;
ReadStringChunk(char *ptr, qsizetype maxlen) : ptr(ptr), maxlen_or_type(maxlen) {}
ReadStringChunk(QByteArray *array) : array(array), maxlen_or_type(ByteArray) {}
ReadStringChunk(QString *str) : string(str), maxlen_or_type(String) {}
bool isString() const { return maxlen_or_type == String; }
bool isByteArray() const { return maxlen_or_type == ByteArray; }
bool isPlainPointer() const { return maxlen_or_type >= 0; }
};
static QCborStreamReader::StringResultCode appendStringChunk(QCborStreamReader &reader, QByteArray *data);
QCborStreamReader::StringResult<qsizetype> readStringChunk(ReadStringChunk params);
qsizetype readStringChunk_byte(ReadStringChunk params, qsizetype len);
qsizetype readStringChunk_unicode(ReadStringChunk params, qsizetype utf8len);
bool ensureStringIteration();
};
@ -1354,29 +1360,17 @@ bool QCborStreamReader::leaveContainer()
*/
QCborStreamReader::StringResult<QString> QCborStreamReader::_readString_helper()
{
auto r = _readByteArray_helper();
QCborStreamReader::StringResult<QString> result;
auto r = d->readStringChunk(&result.data);
result.status = r.status;
if (r.status == Ok) {
// See QUtf8::convertToUnicode() a detailed explanation of why this
// conversion uses the same number of words or less.
CborError err = CborNoError;
if (r.data.size() > MaxStringSize) {
err = CborErrorDataTooLarge;
} else {
QStringConverter::State cs(QStringConverter::Flag::Stateless);
result.data = QUtf8::convertToUnicode(r.data, &cs);
if (cs.invalidChars != 0 || cs.remainingChars != 0)
err = CborErrorInvalidUtf8TextString;
}
if (err) {
d->handleError(err);
result.data.clear();
result.status = Error;
}
if (r.status == Error) {
result.data.clear();
} else {
Q_ASSERT(r.data == result.data.length());
if (r.status == EndOfString && lastError() == QCborError::NoError)
preparse();
}
return result;
}
@ -1547,12 +1541,41 @@ QCborStreamReaderPrivate::readStringChunk(ReadStringChunk params)
return result;
}
// Read the chunk into the user's buffer.
qint64 actuallyRead;
qptrdiff offset = qptrdiff(content);
bufferStart += offset;
if (device) {
// This first skip can't fail because we've already read this many bytes.
device->skip(bufferStart);
}
if (params.isString()) {
// readString()
result.data = readStringChunk_unicode(params, qsizetype(len));
} else {
// readByteArray() or readStringChunk()
result.data = readStringChunk_byte(params, qsizetype(len));
}
if (result.data < 0)
return result; // error
if (device)
updateBufferAfterString(0, len);
else
bufferStart += len;
preread();
result.status = QCborStreamReader::Ok;
return result;
}
inline qsizetype
QCborStreamReaderPrivate::readStringChunk_byte(ReadStringChunk params, qsizetype len)
{
qint64 actuallyRead;
qsizetype toRead = qsizetype(len);
qsizetype left = 0; // bytes from the chunk not copied to the user buffer, to discard
char *ptr;
char *ptr = nullptr;
if (params.isPlainPointer()) {
left = toRead - params.maxlen_or_type;
@ -1567,7 +1590,7 @@ QCborStreamReaderPrivate::readStringChunk(ReadStringChunk params)
auto newSize = oldSize;
if (add_overflow<decltype(newSize)>(oldSize, toRead, &newSize)) {
handleError(CborErrorDataTooLarge);
return result;
return -1;
}
try {
params.array->resize(newSize);
@ -1576,15 +1599,13 @@ QCborStreamReaderPrivate::readStringChunk(ReadStringChunk params)
// compatibility with Qt 5; in Qt 6, we could consider everything
// to be OOM.
handleError(newSize > MaxByteArraySize ? CborErrorDataTooLarge: CborErrorOutOfMemory);
return result;
return -1;
}
ptr = const_cast<char *>(params.array->constData()) + oldSize;
}
if (device) {
// This first skip can't fail because we've already read this many bytes.
device->skip(bufferStart + qptrdiff(content));
actuallyRead = device->read(ptr, toRead);
if (actuallyRead != toRead) {
@ -1597,20 +1618,71 @@ QCborStreamReaderPrivate::readStringChunk(ReadStringChunk params)
if (actuallyRead < 0) {
handleError(CborErrorIO);
return result;
return -1;
}
updateBufferAfterString(offset, len);
} else {
actuallyRead = toRead;
memcpy(ptr, buffer.constData() + bufferStart + offset, toRead);
bufferStart += QByteArray::size_type(offset + len);
memcpy(ptr, buffer.constData() + bufferStart, toRead);
}
preread();
result.data = actuallyRead;
result.status = QCborStreamReader::Ok;
return result;
return actuallyRead;
}
inline qsizetype
QCborStreamReaderPrivate::readStringChunk_unicode(ReadStringChunk params, qsizetype utf8len)
{
// See QUtf8::convertToUnicode() a detailed explanation of why this
// conversion uses the same number of words or less.
QChar *begin = nullptr;
if (params.isString()) {
try {
params.string->resize(utf8len);
} catch (const std::bad_alloc &) {
if (utf8len > MaxStringSize)
handleError(CborErrorDataTooLarge);
else
handleError(CborErrorOutOfMemory);
return -1;
}
begin = const_cast<QChar *>(params.string->constData());
}
QChar *ptr = begin;
QStringConverter::State cs(QStringConverter::Flag::Stateless);
if (device == nullptr) {
// Easy case: we can decode straight from the buffer we already have
ptr = QUtf8::convertToUnicode(ptr, { buffer.constData() + bufferStart, utf8len }, &cs);
} else {
// read in chunks, to avoid creating large, intermediate buffers
constexpr qsizetype StringChunkSize = 16384;
qsizetype chunkSize = qMin(StringChunkSize, utf8len);
QVarLengthArray<char> chunk(chunkSize);
cs = { QStringConverter::Flag::ConvertInitialBom };
while (utf8len > 0 && cs.invalidChars == 0) {
qsizetype toRead = qMin(chunkSize, utf8len);
qint64 actuallyRead = device->read(chunk.data(), toRead);
if (actuallyRead == toRead)
ptr = QUtf8::convertToUnicode(ptr, { chunk.data(), toRead }, &cs);
if (actuallyRead != toRead) {
handleError(CborErrorIO);
return -1;
}
utf8len -= toRead;
}
}
if (cs.invalidChars != 0 || cs.remainingChars != 0) {
handleError(CborErrorInvalidUtf8TextString);
return -1;
}
qsizetype size = ptr - begin;
if (params.isString())
params.string->truncate(size);
return size;
}
QT_END_NAMESPACE