QStringConverter: Introduce finalize()

When doing a streaming conversion, it is not enough to check whether
there are errors. Once all input has been consumed, one must also check
whether there has been any data that was consumed, but not converted
yet. Provide finalize() to do the check, set an error if there was
incomplete data, and to optionally write replacement characters for it
to an output buffer.

[ChangeLog][QtCore][QStringDecoder] Added finalize(), a function to
force the converter to consider the sequence of inputs as complete,
flushing potential partial character sequences.

[ChangeLog][QtCore][QStringEncoder] Added finalize(), a function to
force the converter to consider the sequence of inputs as complete,
flushing potential partial character sequences or restoring state for
stateful text encodings.

Change-Id: I5fe26ae8e5d1477a86b365cc49c430b057876893
Reviewed-by: Edward Welbourne <edward.welbourne@qt.io>
This commit is contained in:
Fabian Kosmale 2022-12-07 13:12:45 +01:00 committed by Mårten Nordheim
parent 479fb822bf
commit f1c0bd2e06
5 changed files with 377 additions and 4 deletions

View File

@ -19,20 +19,28 @@ QByteArray encodedString = fromUtf16(string);
auto toUtf16 = QStringDecoder(QStringDecoder::Utf8); auto toUtf16 = QStringDecoder(QStringDecoder::Utf8);
QString string; QString string;
while (new_data_available()) { while (new_data_available() && !toUtf16.hasError()) {
QByteArray chunk = get_new_data(); QByteArray chunk = get_new_data();
string += toUtf16(chunk); string += toUtf16(chunk);
} }
auto result = toUtf16.finalize();
if (result.error != QStringDecoder::FinalizeResult::NoError) {
// Handle error
}
//! [2] //! [2]
//! [3] //! [3]
auto fromUtf16 = QStringEncoder(QStringEncoder::Utf8); auto fromUtf16 = QStringEncoder(QStringEncoder::Utf8);
QByteArray encoded; QByteArray encoded;
while (new_data_available()) { while (new_data_available() && !fromUtf16.hasError()) {
QString chunk = get_new_data(); QString chunk = get_new_data();
encoded += fromUtf16(chunk); encoded += fromUtf16(chunk);
} }
auto result = fromUtf16.finalize();
if (result.error != QStringEncoder::FinalizeResult::NoError) {
// Handle error
}
//! [3] //! [3]
{ {

View File

@ -39,9 +39,8 @@
#include <QtCore/private/wcharhelpers_win_p.h> #include <QtCore/private/wcharhelpers_win_p.h>
#include <QtCore/q20iterator.h> #include <QtCore/q20iterator.h>
#include <QtCore/q26numeric.h>
#endif // !QT_BOOTSTRAPPED #endif // !QT_BOOTSTRAPPED
#endif #endif // Q_OS_WIN
#include <array> #include <array>
#if __has_include(<bit>) && __cplusplus > 201703L #if __has_include(<bit>) && __cplusplus > 201703L
@ -49,6 +48,9 @@
#endif #endif
#include <string> #include <string>
#include <QtCore/q20utility.h> #include <QtCore/q20utility.h>
#ifndef QT_BOOTSTRAPPED
#include <QtCore/q26numeric.h>
#endif // !QT_BOOTSTRAPPED
QT_BEGIN_NAMESPACE QT_BEGIN_NAMESPACE
@ -2517,6 +2519,27 @@ std::optional<QStringConverter::Encoding> QStringConverter::encodingForName(QAny
} }
#ifndef QT_BOOTSTRAPPED #ifndef QT_BOOTSTRAPPED
namespace QtPrivate {
/*
    Returns how many input units the converter behind \a state has consumed
    but not yet turned into output. The result is never negative.

    For ICU-backed converters (Flag::UsesIcu set and a converter stored in
    state->d[0]) the count is queried from ICU for both directions:
    ucnv_fromUCountPending() reports UTF-16 code units held back while
    encoding, ucnv_toUCountPending() reports bytes held back while decoding.
    The State does not record which direction it is used for, so the larger
    of the two is returned; presumably only one of them is ever non-zero for
    a given converter.

    For every other converter the count is state->remainingChars, saturated
    to int.

    Note: Check isValid() on the QStringConverter before calling this with its
    state!
*/
static int partiallyParsedDataCount(QStringConverter::State *state)
{
#if QT_CONFIG(icu)
    if (state->flags & QStringConverter::Flag::UsesIcu) {
        UConverter *converter = static_cast<UConverter *>(state->d[0]);
        if (!converter)
            return 0;
        // Use a fresh error code per query so that a failure of the first
        // call cannot influence the second one.
        UErrorCode fromErr = U_ZERO_ERROR;
        const int pendingEncoderInput = ucnv_fromUCountPending(converter, &fromErr);
        UErrorCode toErr = U_ZERO_ERROR;
        const int pendingDecoderInput = ucnv_toUCountPending(converter, &toErr);
        // If there is an error, the reported count is -1, so clamping to
        // zero makes an additional check of the error codes unnecessary.
        return std::max(std::max(pendingEncoderInput, pendingDecoderInput), 0);
    }
#endif
    return q26::saturate_cast<int>(state->remainingChars);
}
} // namespace QtPrivate
/*! /*!
Returns the encoding for the content of \a data if it can be determined. Returns the encoding for the content of \a data if it can be determined.
\a expectedFirstCharacter can be passed as an additional hint to help determine \a expectedFirstCharacter can be passed as an additional hint to help determine
@ -2684,6 +2707,205 @@ QStringList QStringConverter::availableCodecs()
return result; return result;
} }
/*!
\class QStringConverter::FinalizeResultBase
\internal
*/
/*!
\class QStringConverter::FinalizeResultChar
\inmodule QtCore
\since 6.11
\reentrant
\brief Holds the result of calling finalize() on QStringDecoder or
QStringEncoder.
This class is used to relay the result of the finalize() call or the reason
why the call did not succeed.
*/
/*!
\enum QStringConverter::FinalizeResultBase::Error
\value NoError No error.
\value InvalidCharacters The converter successfully finalized, but encountered
invalid characters either during finalization or some time earlier.
\value NotEnoughSpace finalize() did \e{not} succeed, you must grow the
buffer and call finalize() again.
*/
/*!
\variable QStringConverter::FinalizeResultChar::error
Relays errors discovered during finalization.
*/
/*!
\variable QStringConverter::FinalizeResultChar::next
Points to the character position \e{following} the last-written character.
*/
/*!
\variable QStringConverter::FinalizeResultChar::invalidChars
The number of invalid characters that were previously counted in the state
as well as any that were encountered during the call to finalize().
*/
/*!
\typedef QStringDecoder::FinalizeResult
This is an alias for QStringConverter::FinalizeResultChar<char16_t>.
*/
/*!
\typedef QStringDecoder::FinalizeResultQChar
This is an alias for QStringConverter::FinalizeResultChar<QChar>.
*/
/*!
\fn QStringDecoder::FinalizeResultQChar QStringDecoder::finalize(QChar *out, qsizetype maxlen)
\fn QStringDecoder::FinalizeResult QStringDecoder::finalize(char16_t *out, qsizetype maxlen)
\fn QStringDecoder::FinalizeResult QStringDecoder::finalize()
Signals to the decoder that no further data will arrive.
May also provide data from residual content that was pending decoding.
When there is no residual data to account for, the return's \c error
field will be set to \l {QStringConverter::FinalizeResultBase::Error::}
{NoError}.
If \a out is supplied and non-null, it must have space in which up to
\a maxlen characters may be written. Up to this many characters of
residual output are written to this space, with the end indicated by
the return-value's \c next field. Typically this residual data shall
consist of one replacement character per remaining unconverted input
character.
If all residual content has been delivered via \a out, if \a out is
\nullptr, or if there is no residual data, the decoder is reset on
return from finalize(). Otherwise, the remaining data can be retrieved
or discarded by a further call to finalize().
\since 6.11
\sa hasError(), appendToBuffer()
*/
auto QStringDecoder::finalize(char16_t *out, qsizetype maxlen) -> FinalizeResult
{
    using Error = FinalizeResult::Error;

    // Input that was consumed but never completed a character; only a valid
    // decoder can have any.
    const int pending = isValid() ? QtPrivate::partiallyParsedDataCount(&state) : 0;

    // Everything still pending counts as invalid on top of what the state
    // has already recorded.
    const qint16 invalidChars = q26::saturate_cast<qint16>(state.invalidChars + pending);

    const bool mustWrite = pending != 0 && out;
    if (mustWrite) {
        // Refuse without touching the state, so the caller can retry with a
        // larger buffer.
        if (maxlen < pending)
            return { {}, out, invalidChars, Error::NotEnoughSpace };

        // One filler character per pending input unit.
        const char16_t filler = (state.flags & QStringConverter::Flag::ConvertInvalidToNull)
                ? QChar::Null
                : QChar::ReplacementCharacter;
        out = std::fill_n(out, pending, filler);
    }

    resetState();
    const Error outcome = invalidChars ? Error::InvalidCharacters : Error::NoError;
    return { {}, out, invalidChars, outcome };
}
/*!
\typedef QStringEncoder::FinalizeResult
This is an alias for QStringConverter::FinalizeResultChar<char>.
*/
/*!
\fn QStringEncoder::FinalizeResult QStringEncoder::finalize(char *out, qsizetype maxlen)
\fn QStringEncoder::FinalizeResult QStringEncoder::finalize()
Signals to the encoder that no further data will arrive.
May also provide data from residual content that was pending encoding.
When there is no residual data to account for, the return's \c error
field will be set to \l {QStringConverter::FinalizeResultBase::Error::}
{NoError}.
If \a out is supplied and non-null, it must have space in which up to
\a maxlen characters may be written. Up to this many characters of
residual output are written to this space, with the end indicated by
the return-value's \c next field. Typically this residual data shall
consist of one replacement character per remaining unconverted input
character. When using a stateful encoding, such as ISO-2022-JP, this may
also write bytes to restore, or end, the current state in the character
stream.
If all residual content has been delivered via \a out, if \a out is
\nullptr, or if there is no residual data, the encoder is reset on
return from finalize(). Otherwise, the remaining data can be retrieved
or discarded by a further call to finalize().
\since 6.11
\sa hasError(), appendToBuffer()
*/
// Ends a streaming encode: accounts for input that was consumed but not yet
// encoded, optionally writes output for it to out (capacity maxlen), and
// resets the state unless the buffer turned out to be too small.
// Returns { next output position, invalid-character count, error }.
auto QStringEncoder::finalize(char *out, qsizetype maxlen) -> QStringEncoder::FinalizeResult
{
// Number of input units still pending in the state; stays 0 for an invalid
// encoder.
qsizetype count = 0;
if (isValid())
count = QtPrivate::partiallyParsedDataCount(&state);
// For ICU we may be using a stateful codec that needs to restore or finalize
// some state, otherwise we have nothing to do with count == 0
using Error = FinalizeResult::Error;
const bool usesIcu = !!(state.flags & QStringConverter::Flag::UsesIcu) && !!state.d[0];
// Pending input is reported as invalid in addition to what the state has
// already counted; saturated to fit the qint16 result field.
const qint16 invalidChars = q26::saturate_cast<qint16>(state.invalidChars + count);
// Nothing to write: invalid encoder, no pending data on a non-ICU codec, or
// no output buffer. Reset and report based on the invalid-character count.
if (!isValid() || (!count && !usesIcu) || !out) {
resetState();
return { {}, out, invalidChars, invalidChars ? Error::InvalidCharacters : Error::NoError };
}
// The never-taken first branch keeps the else-if chain below well-formed
// regardless of whether the ICU branch is compiled in.
if ((false)) {
#if defined(QT_USE_ICU_CODECS)
} else if (usesIcu) {
Q_ASSERT(out);
auto *icu_conv = static_cast<UConverter *>(state.d[0]);
Q_ASSERT(icu_conv); // bool usesIcu checks that the pointer is non-null
UErrorCode err = U_ZERO_ERROR;
UBool flush = true;
// If the QStringConverter was moved, the state that we used as a context is stale now.
UConverterFromUCallback action;
const void *context;
ucnv_getFromUCallBack(icu_conv, &action, &context);
if (context != &state)
ucnv_setFromUCallBack(icu_conv, action, &state, nullptr, nullptr, &err);
// Convert an empty input range with flush set, so that ICU writes out
// whatever it still has to emit.
const UChar *dummyInput = u"";
const char *outEnd = out + maxlen;
ucnv_fromUnicode(icu_conv, &out, outEnd, &dummyInput, dummyInput, nullptr, flush, &err);
// Out of space: return without resetting so that the caller can call
// finalize() again. out may already have been advanced by ICU.
if (err == U_BUFFER_OVERFLOW_ERROR)
return { {}, out, invalidChars, Error::NotEnoughSpace };
resetState();
#endif
} else if (!(state.flags & QStringConverter::Flag::ConvertInvalidToNull)) {
/*
We don't really know (in general) what the replacement character
looks like in the target encoding. So we just encode 0xfffd, which
is the Unicode replacement character.
Use 4 as a best-guess for the upper-bound of how many characters
would potentially be produced by the leftover UTF-16 characters in
the state
*/
constexpr QChar replacementCharacter = QChar::ReplacementCharacter;
constexpr char16_t repl = replacementCharacter.unicode();
constexpr std::array<char16_t, 4> replacement{ repl, repl, repl, repl };
// At most replacement.size() replacement characters are encoded, even if
// more input units are pending.
const qsizetype charactersToEncode = std::min(count, qsizetype(replacement.size()));
if (maxlen < requiredSpace(charactersToEncode))
return { {}, out, invalidChars, Error::NotEnoughSpace };
// we don't want the incomplete data in the internal buffer; we're
// flushing the buffer after all
resetState();
out = appendToBuffer(out, QStringView(replacement.data(), charactersToEncode));
} else /* outputting Null characters for each remaining unconverted input character */ {
// One '\0' byte is written per pending input unit.
if (maxlen < count)
return { {}, out, invalidChars, Error::NotEnoughSpace };
out = std::fill_n(out, count, '\0');
resetState();
}
return { {}, out, invalidChars, invalidChars ? Error::InvalidCharacters : Error::NoError };
}
/*! /*!
Tries to determine the encoding of the HTML in \a data by looking at leading byte Tries to determine the encoding of the HTML in \a data by looking at leading byte
order marks or a charset specifier in the HTML meta tag and returns a QStringDecoder order marks or a charset specifier in the HTML meta tag and returns a QStringDecoder

View File

@ -63,6 +63,13 @@ public:
} }
return iface->fromUtf16(out, in, &state); return iface->fromUtf16(out, in, &state);
} }
// Result type of finalize(); its next pointer refers to char output.
using FinalizeResult = FinalizeResultChar<char>;
// Out-of-line overload: takes an output buffer out and its capacity maxlen.
Q_REQUIRED_RESULT
Q_CORE_EXPORT FinalizeResult finalize(char *out, qsizetype maxlen);
// Convenience overload without an output buffer: forwards a null pointer and
// a capacity of 0 to the overload above.
Q_REQUIRED_RESULT
FinalizeResult finalize() { return finalize(nullptr, 0); }
private: private:
QByteArray encodeAsByteArray(QStringView in) QByteArray encodeAsByteArray(QStringView in)
{ {
@ -128,6 +135,22 @@ public:
char16_t *appendToBuffer(char16_t *out, QByteArrayView ba) char16_t *appendToBuffer(char16_t *out, QByteArrayView ba)
{ return reinterpret_cast<char16_t *>(appendToBuffer(reinterpret_cast<QChar *>(out), ba)); } { return reinterpret_cast<char16_t *>(appendToBuffer(reinterpret_cast<QChar *>(out), ba)); }
// Result types of finalize(): the same fields, with the next pointer typed
// as char16_t * or QChar * respectively.
using FinalizeResult = FinalizeResultChar<char16_t>;
using FinalizeResultQChar = FinalizeResultChar<QChar>;
// QChar flavour: forwards to the char16_t overload and converts the returned
// next pointer back to QChar *. Marked Q_REQUIRED_RESULT like the other
// overloads, since the result is the only place errors are reported.
Q_REQUIRED_RESULT
FinalizeResultQChar finalize(QChar *out, qsizetype maxlen)
{
    auto r = finalize(reinterpret_cast<char16_t *>(out), maxlen);
    return { {}, reinterpret_cast<QChar *>(r.next), r.invalidChars, r.error };
}
// Out-of-line overload: takes an output buffer out and its capacity maxlen.
Q_REQUIRED_RESULT
Q_CORE_EXPORT FinalizeResult finalize(char16_t *out, qsizetype maxlen);
// Convenience overload without an output buffer. The cast selects the
// char16_t overload, as a bare nullptr would be ambiguous between the
// QChar * and char16_t * overloads.
Q_REQUIRED_RESULT
FinalizeResult finalize()
{
    return finalize(static_cast<char16_t *>(nullptr), 0);
}
Q_CORE_EXPORT static QStringDecoder decoderForHtml(QByteArrayView data); Q_CORE_EXPORT static QStringDecoder decoderForHtml(QByteArrayView data);
private: private:

View File

@ -169,6 +169,25 @@ public:
Q_CORE_EXPORT static QStringList availableCodecs(); Q_CORE_EXPORT static QStringList availableCodecs();
// Non-template base that carries the error enumeration, so that the
// enumerators are the same type for every FinalizeResultChar<Char>.
struct FinalizeResultBase
{
// Outcome of a finalize() call.
enum Error : quint8 {
NoError, // No error.
InvalidCharacters, // Finalized, but invalid characters were seen then or earlier.
NotEnoughSpace, // Did not finalize; grow the buffer and call finalize() again.
};
};
// Result of finalize(), parameterized on the character type of the output
// buffer. Callers in this file brace-initialize it as
// { {}, next, invalidChars, error }, so the member order matters.
template <typename Char>
struct FinalizeResultChar : FinalizeResultBase
{
using Error = FinalizeResultBase::Error;
// Position following the last-written character.
Char *next;
// Invalid characters counted in the state plus any found by finalize().
qint16 invalidChars;
// Error discovered during finalization.
Error error;
};
protected: protected:
const Interface *iface; const Interface *iface;
State state; State state;

View File

@ -196,6 +196,9 @@ private slots:
void availableCodesAreAvailable(); void availableCodesAreAvailable();
void finalize();
void finalizeStateful();
#ifdef Q_OS_WIN #ifdef Q_OS_WIN
// On all other systems local 8-bit encoding is UTF-8 // On all other systems local 8-bit encoding is UTF-8
void fromLocal8Bit_data(); void fromLocal8Bit_data();
@ -2491,6 +2494,104 @@ void tst_QStringConverter::availableCodesAreAvailable()
QVERIFY(QStringEncoder(codecName.toLatin1()).isValid()); QVERIFY(QStringEncoder(codecName.toLatin1()).isValid());
} }
void tst_QStringConverter::finalize()
{
// encoder
{
auto fromUtf16 = QStringEncoder(QStringEncoder::Utf8);
QString incompleteInput(QChar(0xd800));
QByteArray buffer("cdcdcdcd");
fromUtf16.appendToBuffer(buffer.data(), incompleteInput);
QVERIFY(!fromUtf16.hasError());
QCOMPARE(buffer, "cdcdcdcd");
QStringEncoder::FinalizeResult r = fromUtf16.finalize(buffer.data(), buffer.size());
QCOMPARE_GT(r.next, buffer.constData());
QCOMPARE(r.error, QStringEncoder::FinalizeResult::Error::InvalidCharacters);
QCOMPARE_GT(r.invalidChars, 0);
QVERIFY(!fromUtf16.hasError());
QVERIFY(buffer.startsWith(QString(QChar(QChar::ReplacementCharacter)).toUtf8()));
// Try calling finalize again, no new bytes should be output
std::array<char, 3> extraBytes;
r = fromUtf16.finalize(extraBytes.data(), extraBytes.size());
// Ugly-cast to void to circumvent smart testlib
QCOMPARE((void *)r.next, (void *)extraBytes.data());
QCOMPARE(r.invalidChars, 0);
QCOMPARE(r.error, QStringEncoder::FinalizeResult::Error::NoError);
}
// decoder
{
auto toUtf16 = QStringDecoder(QStringConverter::Utf8);
QByteArray incompleteInput("\xf0", 1);
QString buffer = u"cdcdcdcd"_s;
toUtf16.appendToBuffer(buffer.data(), incompleteInput);
QVERIFY(!toUtf16.hasError());
QCOMPARE(buffer, u"cdcdcdcd"_s);
auto result = toUtf16.finalize(buffer.data(), buffer.size());
QCOMPARE_GT(result.next, buffer.constData());
QCOMPARE(result.error, QStringDecoder::FinalizeResult::Error::InvalidCharacters);
QVERIFY(buffer.startsWith(QChar(QChar::ReplacementCharacter)));
// Try calling finalize again, no new bytes should be output
std::array<QChar, 3> extraBytes;
result = toUtf16.finalize(extraBytes.data(), extraBytes.size());
// Ugly-cast to void to circumvent smart testlib
QCOMPARE((void *)result.next, (void *)extraBytes.data());
}
}
// Checks finalize() on a stateful encoding (ISO-2022-JP): finalize() has to
// append the bytes that end the current mode, has to report NotEnoughSpace
// when the buffer is too small, and must not crash when called without an
// output buffer. Skipped when neither ICU configuration is enabled.
void tst_QStringConverter::finalizeStateful()
{
#if !QT_CONFIG(icu) && !QT_CONFIG(winsdkicu)
// Technically there is _access_ to stateful encoding on Windows, but only
// through the System encoder.
QSKIP("ICU is not enabled in this build => stateful encoding is not tested.");
#else
{
// Test that calling finalize() restores ASCII mode in this stateful encoding:
// The first 17 bytes are what appendToBuffer() is expected to produce for
// the input; the trailing 1b 28 42 has to come from finalize().
static const char expected[] = {
0x1b, 0x24, 0x42, 0x25, 0x26, 0x25, 0x23, 0x25, 0x2d, 0x25, 0x5a, 0x25,
0x47, 0x25, 0x23, 0x25, 0x22, 0x1b, 0x28, 0x42
};
QString input = u"ウィキペディア"_s; // "Wikipedia"
// Exactly as large as the expected output, including the final sequence.
QByteArray buffer(20, '\0');
auto stateful = QStringEncoder("ISO-2022-JP");
if (!stateful.isValid())
QSKIP("ICU without support for ISO-2022-JP, cannot continue test.");
char *out = stateful.appendToBuffer(buffer.data(), input);
QCOMPARE(std::distance(buffer.data(), out), 17);
// First without enough space. We assume ICU may or may not output the
// start of the 1b 28 42 sequence, so we handle either.
char * const end = buffer.end();
QStringEncoder::FinalizeResult result = stateful.finalize(out, 1);
QCOMPARE(result.error, QStringEncoder::FinalizeResult::Error::NotEnoughSpace);
// Then with enough space
// Continue from wherever the failed attempt stopped writing.
result = stateful.finalize(result.next, std::distance(result.next, end));
QCOMPARE((void *)result.next, (void *)buffer.constEnd());
QCOMPARE(buffer.toHex(' '), QByteArrayView(expected).toByteArray().toHex(' '));
QCOMPARE(result.invalidChars, 0);
// Try calling finalize again, no new bytes should be output
std::array<char, 3> extraBytes;
result = stateful.finalize(extraBytes.data(), extraBytes.size());
QCOMPARE((void *)result.next, (void *)extraBytes.data());
QCOMPARE(result.error, QStringEncoder::FinalizeResult::Error::NoError);
QCOMPARE(result.invalidChars, 0);
}
{
// Repeat, but calling finalize() without an output
QString input = u"ウィキペディア"_s; // "Wikipedia"
QByteArray buffer(20, '\0');
auto stateful = QStringEncoder("ISO-2022-JP");
// The previous block already skipped the test if the codec is missing.
QVERIFY(stateful.isValid());
char *out = stateful.appendToBuffer(buffer.data(), input);
QCOMPARE(std::distance(buffer.data(), out), 17);
// This passes some pointers to ICU, we just shouldn't crash
QStringEncoder::FinalizeResult r = stateful.finalize();
QCOMPARE(r.error, QStringEncoder::FinalizeResult::Error::NoError);
QCOMPARE(r.invalidChars, 0);
QCOMPARE(r.next, nullptr);
}
#endif
}
class LoadAndConvert: public QRunnable class LoadAndConvert: public QRunnable
{ {
public: public: