QCborStreamReader: move the UTF-8 decoding into readStringChunk

This allows us to decode long UTF-8 strings in chunks, instead of allocating a big block of the size of the UTF-8 source and then another for the full UTF-16 content. Task-number: QTBUG-88253 Change-Id: I7b9b97ae9b32412abdc6fffd16452a47b1036ef3 Reviewed-by: Allan Sandfeld Jensen <allan.jensen@qt.io> (cherry picked from commit bab2cd1125a21885bea97079219031ab45861826) Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
2020-11-06 23:48:49 -08:00 · 2020-11-06 23:48:49 -08:00 · d7a0f8356a
commit d7a0f8356a
parent 8675f81cd8
1 changed files with 109 additions and 37 deletions
--- a/src/corelib/serialization/qcborstreamreader.cpp
+++ b/src/corelib/serialization/qcborstreamreader.cpp
@ -48,6 +48,7 @@
 #include <qiodevice.h>
 #include <qdebug.h>
 #include <qstack.h>
+#include <qvarlengtharray.h>

 QT_BEGIN_NAMESPACE

@ -671,18 +672,23 @@ public:
        union {
            char *ptr;
            QByteArray *array;
+            QString *string;
        };
-        enum { ByteArray = -1 };
+        enum { ByteArray = -1, String = -3 };
        qsizetype maxlen_or_type;

        ReadStringChunk(char *ptr, qsizetype maxlen) : ptr(ptr), maxlen_or_type(maxlen) {}
        ReadStringChunk(QByteArray *array) : array(array), maxlen_or_type(ByteArray) {}
+        ReadStringChunk(QString *str) : string(str), maxlen_or_type(String) {}
+        bool isString() const { return maxlen_or_type == String; }
        bool isByteArray() const { return maxlen_or_type == ByteArray; }
        bool isPlainPointer() const { return maxlen_or_type >= 0; }
    };

    static QCborStreamReader::StringResultCode appendStringChunk(QCborStreamReader &reader, QByteArray *data);
    QCborStreamReader::StringResult<qsizetype> readStringChunk(ReadStringChunk params);
+    qsizetype readStringChunk_byte(ReadStringChunk params, qsizetype len);
+    qsizetype readStringChunk_unicode(ReadStringChunk params, qsizetype utf8len);
    bool ensureStringIteration();
 };

@ -1354,29 +1360,17 @@ bool QCborStreamReader::leaveContainer()
 */
 QCborStreamReader::StringResult<QString> QCborStreamReader::_readString_helper()
 {
-    auto r = _readByteArray_helper();
    QCborStreamReader::StringResult<QString> result;
+    auto r = d->readStringChunk(&result.data);
    result.status = r.status;
-
-    if (r.status == Ok) {
-        // See QUtf8::convertToUnicode() a detailed explanation of why this
-        // conversion uses the same number of words or less.
-        CborError err = CborNoError;
-        if (r.data.size() > MaxStringSize) {
-            err = CborErrorDataTooLarge;
-        } else {
-            QStringConverter::State cs(QStringConverter::Flag::Stateless);
-            result.data = QUtf8::convertToUnicode(r.data, &cs);
-            if (cs.invalidChars != 0 || cs.remainingChars != 0)
-                err = CborErrorInvalidUtf8TextString;
-        }
-
-        if (err) {
-            d->handleError(err);
-            result.data.clear();
-            result.status = Error;
-        }
+    if (r.status == Error) {
+        result.data.clear();
+    } else {
+        Q_ASSERT(r.data == result.data.length());
+        if (r.status == EndOfString && lastError() == QCborError::NoError)
+            preparse();
    }
+
    return result;
 }

@ -1547,12 +1541,41 @@ QCborStreamReaderPrivate::readStringChunk(ReadStringChunk params)
        return result;
    }

-    // Read the chunk into the user's buffer.
-    qint64 actuallyRead;
    qptrdiff offset = qptrdiff(content);
+    bufferStart += offset;
+    if (device) {
+        // This first skip can't fail because we've already read this many bytes.
+        device->skip(bufferStart);
+    }
+
+    if (params.isString()) {
+        // readString()
+        result.data = readStringChunk_unicode(params, qsizetype(len));
+    } else {
+        // readByteArray() or readStringChunk()
+        result.data = readStringChunk_byte(params, qsizetype(len));
+    }
+
+    if (result.data < 0)
+        return result;      // error
+
+    if (device)
+        updateBufferAfterString(0, len);
+    else
+        bufferStart += len;
+
+    preread();
+    result.status = QCborStreamReader::Ok;
+    return result;
+}
+
+inline qsizetype
+QCborStreamReaderPrivate::readStringChunk_byte(ReadStringChunk params, qsizetype len)
+{
+    qint64 actuallyRead;
    qsizetype toRead = qsizetype(len);
    qsizetype left = 0;     // bytes from the chunk not copied to the user buffer, to discard
-    char *ptr;
+    char *ptr = nullptr;

    if (params.isPlainPointer()) {
        left = toRead - params.maxlen_or_type;
@ -1567,7 +1590,7 @@ QCborStreamReaderPrivate::readStringChunk(ReadStringChunk params)
        auto newSize = oldSize;
        if (add_overflow<decltype(newSize)>(oldSize, toRead, &newSize)) {
            handleError(CborErrorDataTooLarge);
-            return result;
+            return -1;
        }
        try {
            params.array->resize(newSize);
@ -1576,15 +1599,13 @@ QCborStreamReaderPrivate::readStringChunk(ReadStringChunk params)
            // compatibility with Qt 5; in Qt 6, we could consider everything
            // to be OOM.
            handleError(newSize > MaxByteArraySize ? CborErrorDataTooLarge: CborErrorOutOfMemory);
-            return result;
+            return -1;
        }

        ptr = const_cast<char *>(params.array->constData()) + oldSize;
    }

    if (device) {
-        // This first skip can't fail because we've already read this many bytes.
-        device->skip(bufferStart + qptrdiff(content));
        actuallyRead = device->read(ptr, toRead);

        if (actuallyRead != toRead)  {
@ -1597,20 +1618,71 @@ QCborStreamReaderPrivate::readStringChunk(ReadStringChunk params)

        if (actuallyRead < 0) {
            handleError(CborErrorIO);
-            return result;
+            return -1;
        }
-
-        updateBufferAfterString(offset, len);
    } else {
        actuallyRead = toRead;
-        memcpy(ptr, buffer.constData() + bufferStart + offset, toRead);
-        bufferStart += QByteArray::size_type(offset + len);
+        memcpy(ptr, buffer.constData() + bufferStart, toRead);
    }

-    preread();
-    result.data = actuallyRead;
-    result.status = QCborStreamReader::Ok;
-    return result;
+    return actuallyRead;
+}
+
+inline qsizetype
+QCborStreamReaderPrivate::readStringChunk_unicode(ReadStringChunk params, qsizetype utf8len)
+{
+    // See QUtf8::convertToUnicode() a detailed explanation of why this
+    // conversion uses the same number of words or less.
+    QChar *begin = nullptr;
+    if (params.isString()) {
+        try {
+            params.string->resize(utf8len);
+        } catch (const std::bad_alloc &) {
+            if (utf8len > MaxStringSize)
+                handleError(CborErrorDataTooLarge);
+            else
+                handleError(CborErrorOutOfMemory);
+            return -1;
+        }
+
+        begin = const_cast<QChar *>(params.string->constData());
+    }
+
+    QChar *ptr = begin;
+    QStringConverter::State cs(QStringConverter::Flag::Stateless);
+    if (device == nullptr) {
+        // Easy case: we can decode straight from the buffer we already have
+        ptr = QUtf8::convertToUnicode(ptr, { buffer.constData() + bufferStart, utf8len }, &cs);
+    } else {
+        // read in chunks, to avoid creating large, intermediate buffers
+        constexpr qsizetype StringChunkSize = 16384;
+        qsizetype chunkSize = qMin(StringChunkSize, utf8len);
+        QVarLengthArray<char> chunk(chunkSize);
+
+        cs = { QStringConverter::Flag::ConvertInitialBom };
+        while (utf8len > 0 && cs.invalidChars == 0) {
+            qsizetype toRead = qMin(chunkSize, utf8len);
+            qint64 actuallyRead = device->read(chunk.data(), toRead);
+            if (actuallyRead == toRead)
+                ptr = QUtf8::convertToUnicode(ptr, { chunk.data(), toRead }, &cs);
+
+            if (actuallyRead != toRead) {
+                handleError(CborErrorIO);
+                return -1;
+            }
+            utf8len -= toRead;
+        }
+    }
+
+    if (cs.invalidChars != 0 || cs.remainingChars != 0) {
+        handleError(CborErrorInvalidUtf8TextString);
+        return -1;
+    }
+
+    qsizetype size = ptr - begin;
+    if (params.isString())
+        params.string->truncate(size);
+    return size;
 }

 QT_END_NAMESPACE