QCborStreamReader: add UTF-8 reading API

Change-Id: I8bd6bb457b9c42218247fffd1797605d1687b0dc
Reviewed-by: Edward Welbourne <edward.welbourne@qt.io>
This commit is contained in:
Thiago Macieira 2023-11-13 19:42:17 -08:00
parent 8af346c1f6
commit 1d9137e13f
3 changed files with 157 additions and 5 deletions

View File

@ -618,13 +618,14 @@ public:
QByteArray *array;
QString *string;
};
enum { ByteArray = -1, String = -3 };
enum Type { ByteArray = -1, String = -3, Utf8String = -5 };
qsizetype maxlen_or_type;
ReadStringChunk(char *ptr, qsizetype maxlen) : ptr(ptr), maxlen_or_type(maxlen) {}
ReadStringChunk(QByteArray *array) : array(array), maxlen_or_type(ByteArray) {}
ReadStringChunk(QByteArray *array, Type type = ByteArray) : array(array), maxlen_or_type(type) {}
ReadStringChunk(QString *str) : string(str), maxlen_or_type(String) {}
bool isString() const { return maxlen_or_type == String; }
bool isUtf8String() const { return maxlen_or_type == Utf8String; }
bool isByteArray() const { return maxlen_or_type == ByteArray; }
bool isPlainPointer() const { return maxlen_or_type >= 0; }
};
@ -634,6 +635,7 @@ public:
QCborStreamReader::StringResult<qsizetype> readStringChunk(ReadStringChunk params);
qsizetype readStringChunk_byte(ReadStringChunk params, qsizetype len);
qsizetype readStringChunk_unicode(ReadStringChunk params, qsizetype utf8len);
qsizetype readStringChunk_utf8(ReadStringChunk params, qsizetype utf8len);
bool ensureStringIteration();
};
@ -1291,7 +1293,7 @@ bool QCborStreamReader::leaveContainer()
Decodes one string chunk from the CBOR string and returns it. This function
is used for both regular and chunked string contents, so the caller must
always loop around calling this function, even if isLengthKnown() has
always loop around calling this function, even if isLengthKnown()
is true. The typical use of this function is as follows:
\snippet code/src_corelib_serialization_qcborstream.cpp 27
@ -1322,12 +1324,46 @@ QCborStreamReader::StringResult<QString> QCborStreamReader::_readString_helper()
return result;
}
/*!
\fn QCborStreamReader::StringResult<QByteArray> QCborStreamReader::readUtf8String()
\since 6.7
Decodes one string chunk from the CBOR string and returns it. This function
is used for both regular and chunked string contents, so the caller must
always loop around calling this function, even if isLengthKnown() is true.
The typical use of this function is as for readString() in the following:
\snippet code/src_corelib_serialization_qcborstream.cpp 27
The toUtf8String() function implements the above loop and some extra checks.
\include qcborstreamreader.cpp string-no-type-conversions
\sa toString(), readByteArray(), isString(), readStringChunk()
*/
QCborStreamReader::StringResult<QByteArray> QCborStreamReader::_readUtf8String_helper()
{
using P = QCborStreamReaderPrivate::ReadStringChunk;
QCborStreamReader::StringResult<QByteArray> result;
auto r = d->readStringChunk(P{ &result.data, P::Utf8String });
result.status = r.status;
if (r.status == Error) {
result.data.clear();
} else {
Q_ASSERT(r.data == result.data.size());
if (r.status == EndOfString && lastError() == QCborError::NoError)
preparse();
}
return result;
}
/*!
\fn QCborStreamReader::StringResult<QByteArray> QCborStreamReader::readByteArray()
Decodes one byte array chunk from the CBOR string and returns it. This
function is used for both regular and chunked contents, so the caller must
always loop around calling this function, even if isLengthKnown() has
always loop around calling this function, even if isLengthKnown()
is true. The typical use of this function is as follows:
\snippet code/src_corelib_serialization_qcborstream.cpp 28
@ -1449,6 +1485,48 @@ bool QCborStreamReader::_toString_helper(QString &dst)
return ok;
}
/*!
\fn QCborStreamReader::toUtf8String()
\since 6.7
Decodes the current text string and returns it. If the string is chunked,
this function will iterate over all chunks and concatenate them. If an
error happens, this function returns a default-constructed QString(), but
that may not be distinguishable from certain empty text strings. Instead,
check lastError() to determine if an error has happened.
\include qcborstreamreader.cpp string-no-type-conversions
\include qcborstreamreader.cpp note-not-restartable
\sa readString(), readStringChunk(), isString(), toByteArray()
*/
/*!
\fn QCborStreamReader::toUtf8String(QByteArray &dst)
\overload
\since 6.7
Decodes the current text string and appends to \a dst. If the string is
chunked, this function will iterate over all chunks and concatenate them.
If an error happens during decoding, other chunks that could be decoded
successfully may have been written to \a dst nonetheless. Returns \c true
if the decoding happened without errors, \c false otherwise.
\include qcborstreamreader.cpp string-no-type-conversions
\include qcborstreamreader.cpp note-not-restartable
\sa readString(), readStringChunk(), isString(), toByteArray()
*/
bool QCborStreamReader::_toUtf8String_helper(QByteArray &dst)
{
using P = QCborStreamReaderPrivate::ReadStringChunk;
bool ok = d->readFullString({ &dst, P::Utf8String });
if (ok)
preparse();
return ok;
}
/*!
\fn QCborStreamReader::toByteArray()
\since 6.7
@ -1611,6 +1689,8 @@ QCborStreamReaderPrivate::readStringChunk(ReadStringChunk params)
if (params.isString()) {
// readString()
result.data = readStringChunk_unicode(params, qsizetype(len));
} else if (params.isUtf8String()) {
result.data = readStringChunk_utf8(params, qsizetype(len));
} else {
// readByteArray() or readStringChunk()
result.data = readStringChunk_byte(params, qsizetype(len));
@ -1657,7 +1737,7 @@ QCborStreamReaderPrivate::readStringChunk_byte(ReadStringChunk params, qsizetype
else
toRead = params.maxlen_or_type; // buffer smaller than string
ptr = params.ptr;
} else if (params.isByteArray()) {
} else if (!params.isString()) {
// See note above on having ensured there is enough incoming data.
auto oldSize = params.array->size();
auto newSize = oldSize;
@ -1758,6 +1838,23 @@ QCborStreamReaderPrivate::readStringChunk_unicode(ReadStringChunk params, qsizet
return size - currentSize; // how many bytes we added
}
inline qsizetype
QCborStreamReaderPrivate::readStringChunk_utf8(ReadStringChunk params, qsizetype utf8len)
{
qsizetype result = readStringChunk_byte(params, utf8len);
if (result < 0)
return result;
// validate the UTF-8 content we've just read
QByteArrayView chunk = *params.array;
chunk = chunk.last(result);
if (QtPrivate::isValidUtf8(chunk))
return result;
handleError(CborErrorInvalidUtf8TextString);
return -1;
}
QT_END_NAMESPACE
#include "moc_qcborstreamreader.cpp"

View File

@ -121,8 +121,10 @@ public:
bool leaveContainer();
bool toString(QString &dst) { Q_ASSERT(isString()); return _toString_helper(dst); }
bool toUtf8String(QByteArray &dst) { Q_ASSERT(isString()); return _toUtf8String_helper(dst); }
bool toByteArray(QByteArray &dst) { Q_ASSERT(isByteArray()); return _toByteArray_helper(dst); }
StringResult<QString> readString() { Q_ASSERT(isString()); return _readString_helper(); }
StringResult<QByteArray> readUtf8String() { Q_ASSERT(isString()); return _readUtf8String_helper(); }
StringResult<QByteArray> readByteArray(){ Q_ASSERT(isByteArray()); return _readByteArray_helper(); }
qsizetype currentStringChunkSize() const{ Q_ASSERT(isString() || isByteArray()); return _currentStringChunkSize(); }
StringResult<qsizetype> readStringChunk(char *ptr, qsizetype maxlen);
@ -151,6 +153,13 @@ public:
dst.clear();
return dst;
}
QByteArray toUtf8String()
{
QByteArray dst;
if (!toUtf8String(dst))
dst.clear();
return dst;
}
QByteArray toByteArray()
{
QByteArray dst;
@ -163,9 +172,11 @@ private:
void preparse();
bool _enterContainer_helper();
StringResult<QString> _readString_helper();
StringResult<QByteArray> _readUtf8String_helper();
StringResult<QByteArray> _readByteArray_helper();
qsizetype _currentStringChunkSize() const;
bool _toString_helper(QString &);
bool _toUtf8String_helper(QByteArray &);
bool _toByteArray_helper(QByteArray &);
template <typename FP> FP _toFloatingPoint() const noexcept

View File

@ -707,6 +707,27 @@ void tst_QCborStreamReader::strings()
QVERIFY(reader.toString(str));
QCOMPARE(str, prefix + QString::fromUtf8(fullString));
}
// Re-do again using the UTF-8 interface.
reader.reset();
QVERIFY(reader.isString() || reader.isByteArray());
if (reader.isString()) {
QByteArray prefix("some prefix");
QByteArray utf8 = prefix;
QVERIFY(reader.toUtf8String(utf8));
QCOMPARE(utf8, prefix + fullString);
reader.reset();
fullString = prefix;
forever {
auto r = reader.readUtf8String();
QCOMPARE_NE(r.status, QCborStreamReader::Error);
fullString += r.data;
if (r.status == QCborStreamReader::EndOfString)
break;
}
QCOMPARE(fullString, utf8);
}
}
void tst_QCborStreamReader::tags_data()
@ -949,6 +970,29 @@ void tst_QCborStreamReader::validation()
else
QVERIFY(reader.toByteArray().isNull());
}
reader.reset();
// and the UTF-8 API
if (reader.isString()) {
QByteArray prefix = "some prefix";
QByteArray ba = prefix;
QVERIFY(!reader.toUtf8String(ba));
QVERIFY(ba.startsWith(prefix)); // but may have decoded some
QCOMPARE(reader.lastError(), error);
reader.reset();
QVERIFY(reader.toUtf8String().isNull());
reader.reset();
auto r = reader.readUtf8String();
for ( ; r.status == QCborStreamReader::Ok; r = reader.readUtf8String()) {
// while the data is valid...
QVERIFY(!r.data.isNull());
}
QCOMPARE_NE(r.status, QCborStreamReader::EndOfString);
QCOMPARE(reader.lastError(), error);
}
}
void tst_QCborStreamReader::hugeDeviceValidation_data()