diff --git a/src/corelib/doc/snippets/code/src_corelib_text_qstringconverter.cpp b/src/corelib/doc/snippets/code/src_corelib_text_qstringconverter.cpp index b5bfc9cd55f..4e44c364f37 100644 --- a/src/corelib/doc/snippets/code/src_corelib_text_qstringconverter.cpp +++ b/src/corelib/doc/snippets/code/src_corelib_text_qstringconverter.cpp @@ -19,20 +19,28 @@ QByteArray encodedString = fromUtf16(string); auto toUtf16 = QStringDecoder(QStringDecoder::Utf8); QString string; -while (new_data_available()) { +while (new_data_available() && !toUtf16.hasError()) { QByteArray chunk = get_new_data(); string += toUtf16(chunk); } +auto result = toUtf16.finalize(); +if (result.error != QStringDecoder::FinalizeResult::NoError) { + // Handle error +} //! [2] //! [3] auto fromUtf16 = QStringEncoder(QStringEncoder::Utf8); QByteArray encoded; -while (new_data_available()) { +while (new_data_available() && !fromUtf16.hasError()) { QString chunk = get_new_data(); encoded += fromUtf16(chunk); } +auto result = fromUtf16.finalize(); +if (result.error != QStringEncoder::FinalizeResult::NoError) { + // Handle error +} //! [3] { diff --git a/src/corelib/text/qstringconverter.cpp b/src/corelib/text/qstringconverter.cpp index b17432c067a..7cedc753d81 100644 --- a/src/corelib/text/qstringconverter.cpp +++ b/src/corelib/text/qstringconverter.cpp @@ -39,9 +39,8 @@ #include #include -#include #endif // !QT_BOOTSTRAPPED -#endif +#endif // Q_OS_WIN #include #if __has_include() && __cplusplus > 201703L @@ -49,6 +48,9 @@ #endif #include #include +#ifndef QT_BOOTSTRAPPED +#include +#endif // !QT_BOOTSTRAPPED QT_BEGIN_NAMESPACE @@ -2517,6 +2519,27 @@ std::optional QStringConverter::encodingForName(QAny } #ifndef QT_BOOTSTRAPPED +namespace QtPrivate { +// Note: Check isValid() on the QStringConverter before calling this with its +// state! 
+static int partiallyParsedDataCount(QStringConverter::State *state) +{ +#if QT_CONFIG(icu) + if (state->flags & QStringConverter::Flag::UsesIcu) { + UConverter *converter = static_cast(state->d[0]); + if (!converter) + return 0; + UErrorCode err = U_ZERO_ERROR; + auto leftOver = ucnv_fromUCountPending(converter, &err); + // If there is an error, leftOver is -1, so no need for an additional + // check. NOTE(review): ucnv_fromUCountPending() counts pending from-Unicode (encoder) input; QStringDecoder::finalize() also calls this helper - confirm whether ucnv_toUCountPending() is needed for decoders. + return std::max(leftOver, 0); + } +#endif + return q26::saturate_cast(state->remainingChars); +} +} // namespace QtPrivate + /*! Returns the encoding for the content of \a data if it can be determined. \a expectedFirstCharacter can be passed as an additional hint to help determine @@ -2684,6 +2707,205 @@ QStringList QStringConverter::availableCodecs() return result; } +/*! + \class QStringConverter::FinalizeResultBase + \internal +*/ +/*! + \class QStringConverter::FinalizeResultChar + \inmodule QtCore + \since 6.11 + \reentrant + \brief Holds the result of calling finalize() on QStringDecoder or + QStringEncoder. + + This class is used to relay the result of the finalize() call or the reason + why the call did not succeed. +*/ +/*! + \enum QStringConverter::FinalizeResultBase::Error + \value NoError No error. + \value InvalidCharacters The converter successfully finalized, but encountered + invalid characters either during finalization or some time earlier. + \value NotEnoughSpace finalize() did \e{not} succeed; you must grow the + buffer and call finalize() again. +*/ + +/*! + \variable QStringConverter::FinalizeResultChar::error + Relays errors discovered during finalization. +*/ +/*! + \variable QStringConverter::FinalizeResultChar::next + Points to the character position \e{following} the last-written character. +*/ +/*! + \variable QStringConverter::FinalizeResultChar::invalidChars + The number of invalid characters that were previously counted in the state + as well as any that were encountered during the call to finalize(). +*/ + +/*! 
+ \typedef QStringDecoder::FinalizeResult + + This is an alias for QStringConverter::FinalizeResultChar. +*/ + +/*! + \typedef QStringDecoder::FinalizeResultQChar + + This is an alias for QStringConverter::FinalizeResultChar. +*/ + +/*! + \fn QStringDecoder::FinalizeResultQChar QStringDecoder::finalize(QChar *out, qsizetype maxlen) + \fn QStringDecoder::FinalizeResult QStringDecoder::finalize(char16_t *out, qsizetype maxlen) + \fn QStringDecoder::FinalizeResult QStringDecoder::finalize() + + Signals to the decoder that no further data will arrive. + + May also provide data from residual content that was pending decoding. + When there is no residual data to account for and no invalid characters + were counted earlier, the return's \c error field will be set to + \l {QStringConverter::FinalizeResultBase::Error} {NoError}. + + If \a out is supplied and non-null, it must have space in which up to + \a maxlen characters may be written. Up to this many characters of + residual output are written to this space, with the end indicated by + the return-value's \c next field. Typically this residual data shall + consist of one replacement character per remaining unconverted input + character. + + If all residual content has been delivered via \a out, if \a out is + \nullptr, or if there is no residual data, the decoder is reset on + return from finalize(). Otherwise, the remaining data can be retrieved + or discarded by a further call to finalize(). + + \since 6.11 + \sa hasError(), appendToBuffer() + */ +auto QStringDecoder::finalize(char16_t *out, qsizetype maxlen) -> FinalizeResult +{ + int count = 0; + if (isValid()) + count = QtPrivate::partiallyParsedDataCount(&state); + using Error = FinalizeResult::Error; + const qint16 invalidChars = q26::saturate_cast(state.invalidChars + count); + if (count == 0 || !out) { + resetState(); + return { {}, out, invalidChars, invalidChars ? 
Error::InvalidCharacters : Error::NoError }; + } + if (maxlen < count) + return { {}, out, invalidChars, Error::NotEnoughSpace }; + + const char16_t replacement = (state.flags & QStringConverter::Flag::ConvertInvalidToNull) + ? QChar::Null + : QChar::ReplacementCharacter; + out = std::fill_n(out, count, replacement); + resetState(); + return { {}, out, invalidChars, invalidChars ? Error::InvalidCharacters : Error::NoError }; +} + +/*! + \typedef QStringEncoder::FinalizeResult + + This is an alias for QStringConverter::FinalizeResultChar. +*/ + +/*! + \fn QStringEncoder::FinalizeResult QStringEncoder::finalize(char *out, qsizetype maxlen) + \fn QStringEncoder::FinalizeResult QStringEncoder::finalize() + + Signals to the encoder that no further data will arrive. + + May also provide data from residual content that was pending encoding. + When there is no residual data to account for and no invalid characters + were counted earlier, the return's \c error field will be set to + \l {QStringConverter::FinalizeResultBase::Error} {NoError}. + + If \a out is supplied and non-null, it must have space in which up to + \a maxlen characters may be written. Up to this many characters of + residual output are written to this space, with the end indicated by + the return-value's \c next field. Typically this residual data shall + consist of one replacement character per remaining unconverted input + character. When using a stateful encoding, such as ISO-2022-JP, this may + also write bytes to restore, or end, the current state in the character + stream. + + If all residual content has been delivered via \a out, if \a out is + \nullptr, or if there is no residual data, the encoder is reset on + return from finalize(). Otherwise, the remaining data can be retrieved + or discarded by a further call to finalize(). 
+ + \since 6.11 + \sa hasError(), appendToBuffer() + */ +auto QStringEncoder::finalize(char *out, qsizetype maxlen) -> QStringEncoder::FinalizeResult +{ + qsizetype count = 0; + if (isValid()) + count = QtPrivate::partiallyParsedDataCount(&state); + // For ICU we may be using a stateful codec that needs to restore or finalize + // some state, otherwise we have nothing to do with count == 0 + using Error = FinalizeResult::Error; + const bool usesIcu = !!(state.flags & QStringConverter::Flag::UsesIcu) && !!state.d[0]; + const qint16 invalidChars = q26::saturate_cast(state.invalidChars + count); + if (!isValid() || (!count && !usesIcu) || !out) { + resetState(); + return { {}, out, invalidChars, invalidChars ? Error::InvalidCharacters : Error::NoError }; + } + + if ((false)) { +#if defined(QT_USE_ICU_CODECS) + } else if (usesIcu) { + Q_ASSERT(out); + auto *icu_conv = static_cast(state.d[0]); + Q_ASSERT(icu_conv); // bool usesIcu checks that the pointer is non-null + UErrorCode err = U_ZERO_ERROR; + + UBool flush = true; + + // If the QStringConverter was moved, the state that we used as a context is stale now. + UConverterFromUCallback action; + const void *context; + ucnv_getFromUCallBack(icu_conv, &action, &context); + if (context != &state) + ucnv_setFromUCallBack(icu_conv, action, &state, nullptr, nullptr, &err); + const UChar *dummyInput = u""; + const char *outEnd = out + maxlen; + ucnv_fromUnicode(icu_conv, &out, outEnd, &dummyInput, dummyInput, nullptr, flush, &err); + if (err == U_BUFFER_OVERFLOW_ERROR) + return { {}, out, invalidChars, Error::NotEnoughSpace }; + resetState(); +#endif + } else if (!(state.flags & QStringConverter::Flag::ConvertInvalidToNull)) { + /* + We don't really know (in general) what the replacement character + looks like in the target encoding. So we just encode 0xfffd, which + is the Unicode replacement character. 
+ Use 4 as a best-guess for the upper-bound of how many characters + would potentially be produced by the leftover UTF-16 characters in + the state + */ + constexpr QChar replacementCharacter = QChar::ReplacementCharacter; + constexpr char16_t repl = replacementCharacter.unicode(); + constexpr std::array replacement{ repl, repl, repl, repl }; + const qsizetype charactersToEncode = std::min(count, qsizetype(replacement.size())); + if (maxlen < requiredSpace(charactersToEncode)) + return { {}, out, invalidChars, Error::NotEnoughSpace }; + // we don't want the incomplete data in the internal buffer; we're + // flushing the buffer after all + resetState(); + out = appendToBuffer(out, QStringView(replacement.data(), charactersToEncode)); + } else /* outputting Null characters for each remaining unconverted input character */ { + if (maxlen < count) + return { {}, out, invalidChars, Error::NotEnoughSpace }; + out = std::fill_n(out, count, '\0'); + resetState(); + } + return { {}, out, invalidChars, invalidChars ? Error::InvalidCharacters : Error::NoError }; +} + /*! 
Tries to determine the encoding of the HTML in \a data by looking at leading byte order marks or a charset specifier in the HTML meta tag and returns a QStringDecoder diff --git a/src/corelib/text/qstringconverter.h b/src/corelib/text/qstringconverter.h index 64ccfb1baa7..2ae7805764d 100644 --- a/src/corelib/text/qstringconverter.h +++ b/src/corelib/text/qstringconverter.h @@ -63,6 +63,13 @@ public: } return iface->fromUtf16(out, in, &state); } + + using FinalizeResult = FinalizeResultChar; + Q_REQUIRED_RESULT + Q_CORE_EXPORT FinalizeResult finalize(char *out, qsizetype maxlen); + Q_REQUIRED_RESULT + FinalizeResult finalize() { return finalize(nullptr, 0); } + private: QByteArray encodeAsByteArray(QStringView in) { @@ -128,6 +135,22 @@ public: char16_t *appendToBuffer(char16_t *out, QByteArrayView ba) { return reinterpret_cast(appendToBuffer(reinterpret_cast(out), ba)); } + + using FinalizeResult = FinalizeResultChar; + using FinalizeResultQChar = FinalizeResultChar; + FinalizeResultQChar finalize(QChar *out, qsizetype maxlen) + { + auto r = finalize(reinterpret_cast(out), maxlen); + return { {}, reinterpret_cast(r.next), r.invalidChars, r.error }; + } + Q_REQUIRED_RESULT + Q_CORE_EXPORT FinalizeResult finalize(char16_t *out, qsizetype maxlen); + Q_REQUIRED_RESULT + FinalizeResult finalize() + { + return finalize(static_cast(nullptr), 0); + } + Q_CORE_EXPORT static QStringDecoder decoderForHtml(QByteArrayView data); private: diff --git a/src/corelib/text/qstringconverter_base.h b/src/corelib/text/qstringconverter_base.h index 26ea9dfe0d7..32a866d07eb 100644 --- a/src/corelib/text/qstringconverter_base.h +++ b/src/corelib/text/qstringconverter_base.h @@ -169,6 +169,25 @@ public: Q_CORE_EXPORT static QStringList availableCodecs(); + + struct FinalizeResultBase + { + enum Error : quint8 { + NoError, + InvalidCharacters, + NotEnoughSpace, + }; + }; + template + struct FinalizeResultChar : FinalizeResultBase + { + using Error = FinalizeResultBase::Error; + + Char *next; 
+ qint16 invalidChars; + Error error; + }; + protected: const Interface *iface; State state; diff --git a/tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp b/tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp index 7f646888434..f9cf48701e3 100644 --- a/tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp +++ b/tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp @@ -196,6 +196,9 @@ private slots: void availableCodesAreAvailable(); + void finalize(); + void finalizeStateful(); + #ifdef Q_OS_WIN // On all other systems local 8-bit encoding is UTF-8 void fromLocal8Bit_data(); @@ -2491,6 +2494,104 @@ void tst_QStringConverter::availableCodesAreAvailable() QVERIFY(QStringEncoder(codecName.toLatin1()).isValid()); } +void tst_QStringConverter::finalize() +{ + // encoder + { + auto fromUtf16 = QStringEncoder(QStringEncoder::Utf8); + QString incompleteInput(QChar(0xd800)); + QByteArray buffer("cdcdcdcd"); + fromUtf16.appendToBuffer(buffer.data(), incompleteInput); + QVERIFY(!fromUtf16.hasError()); + QCOMPARE(buffer, "cdcdcdcd"); + QStringEncoder::FinalizeResult r = fromUtf16.finalize(buffer.data(), buffer.size()); + QCOMPARE_GT(r.next, buffer.constData()); + QCOMPARE(r.error, QStringEncoder::FinalizeResult::Error::InvalidCharacters); + QCOMPARE_GT(r.invalidChars, 0); + QVERIFY(!fromUtf16.hasError()); + QVERIFY(buffer.startsWith(QString(QChar(QChar::ReplacementCharacter)).toUtf8())); + // Try calling finalize again, no new bytes should be output + std::array extraBytes; + r = fromUtf16.finalize(extraBytes.data(), extraBytes.size()); + // Ugly-cast to void to circumvent smart testlib + QCOMPARE((void *)r.next, (void *)extraBytes.data()); + QCOMPARE(r.invalidChars, 0); + QCOMPARE(r.error, QStringEncoder::FinalizeResult::Error::NoError); + } + // decoder + { + auto toUtf16 = QStringDecoder(QStringConverter::Utf8); + QByteArray incompleteInput("\xf0", 1); + QString buffer = u"cdcdcdcd"_s; + 
toUtf16.appendToBuffer(buffer.data(), incompleteInput); + QVERIFY(!toUtf16.hasError()); + QCOMPARE(buffer, u"cdcdcdcd"_s); + auto result = toUtf16.finalize(buffer.data(), buffer.size()); + QCOMPARE_GT(result.next, buffer.constData()); + QCOMPARE(result.error, QStringDecoder::FinalizeResult::Error::InvalidCharacters); + QVERIFY(buffer.startsWith(QChar(QChar::ReplacementCharacter))); + // Try calling finalize again, no new bytes should be output + std::array extraBytes; + result = toUtf16.finalize(extraBytes.data(), extraBytes.size()); + // Ugly-cast to void to circumvent smart testlib + QCOMPARE((void *)result.next, (void *)extraBytes.data()); + } +} + +void tst_QStringConverter::finalizeStateful() +{ +#if !QT_CONFIG(icu) && !QT_CONFIG(winsdkicu) + // Technically there is _access_ to stateful encoding on Windows, but only + // through the System encoder. + QSKIP("ICU is not enabled in this build => stateful encoding is not tested."); +#else + { + // Test that calling finalize() restores ASCII mode in this stateful encoding: + static const char expected[] = { + 0x1b, 0x24, 0x42, 0x25, 0x26, 0x25, 0x23, 0x25, 0x2d, 0x25, 0x5a, 0x25, + 0x47, 0x25, 0x23, 0x25, 0x22, 0x1b, 0x28, 0x42 + }; + QString input = u"ウィキペディア"_s; // "Wikipedia" + QByteArray buffer(20, '\0'); + auto stateful = QStringEncoder("ISO-2022-JP"); + if (!stateful.isValid()) + QSKIP("ICU without support for ISO-2022-JP, cannot continue test."); + char *out = stateful.appendToBuffer(buffer.data(), input); + QCOMPARE(std::distance(buffer.data(), out), 17); + // First without enough space. We assume ICU may or may not output the + // start of the 1b 28 42 sequence, so we handle either. 
+ char * const end = buffer.end(); + QStringEncoder::FinalizeResult result = stateful.finalize(out, 1); + QCOMPARE(result.error, QStringEncoder::FinalizeResult::Error::NotEnoughSpace); + // Then with enough space + result = stateful.finalize(result.next, std::distance(result.next, end)); + QCOMPARE((void *)result.next, (void *)buffer.constEnd()); + QCOMPARE(buffer.toHex(' '), QByteArrayView(expected).toByteArray().toHex(' ')); + QCOMPARE(result.invalidChars, 0); + // Try calling finalize again, no new bytes should be output + std::array extraBytes; + result = stateful.finalize(extraBytes.data(), extraBytes.size()); + QCOMPARE((void *)result.next, (void *)extraBytes.data()); + QCOMPARE(result.error, QStringEncoder::FinalizeResult::Error::NoError); + QCOMPARE(result.invalidChars, 0); + } + { + // Repeat, but calling finalize() without an output + QString input = u"ウィキペディア"_s; // "Wikipedia" + QByteArray buffer(20, '\0'); + auto stateful = QStringEncoder("ISO-2022-JP"); + QVERIFY(stateful.isValid()); + char *out = stateful.appendToBuffer(buffer.data(), input); + QCOMPARE(std::distance(buffer.data(), out), 17); + // This passes some pointers to ICU, we just shouldn't crash + QStringEncoder::FinalizeResult r = stateful.finalize(); + QCOMPARE(r.error, QStringEncoder::FinalizeResult::Error::NoError); + QCOMPARE(r.invalidChars, 0); + QCOMPARE(r.next, nullptr); + } +#endif +} + class LoadAndConvert: public QRunnable { public: