diff --git a/src/corelib/text/qstringconverter.cpp b/src/corelib/text/qstringconverter.cpp index 193eeb88822..7b9cd2662e4 100644 --- a/src/corelib/text/qstringconverter.cpp +++ b/src/corelib/text/qstringconverter.cpp @@ -25,6 +25,7 @@ #ifndef QT_BOOTSTRAPPED #include #include +#include #endif // !QT_BOOTSTRAPPED #endif @@ -1265,11 +1266,8 @@ QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, QStringConverter::St QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, quint32 codePage, QStringConverter::State *state) { - qsizetype length = in.size(); - - Q_ASSERT(length < INT_MAX); // ### FIXME const char *mb = in.data(); - int mblen = length; + qsizetype mblen = in.size(); if (state && state->flags & QStringConverter::Flag::Stateless) state = nullptr; @@ -1354,11 +1352,11 @@ QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, quint32 codePage, return {it, size}; }; - constexpr int MaxStep = std::numeric_limits<int>::max(); - const char *end = mb + mblen; - while (mb != end) { - const int nextIn = int(std::min(qsizetype(mblen), qsizetype(MaxStep))); - const int nextOut = int(std::min(outlen, qsizetype(MaxStep))); + // Need it in this scope, since we try to decrease our window size if we + // encounter an error + int nextIn = qt_saturate<int>(mblen); + while (mblen > 0) { + const int nextOut = qt_saturate<int>(outlen); std::tie(out, outlen) = growOut(1); // Need space for at least one character if (!out) return {}; @@ -1371,25 +1369,56 @@ QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, quint32 codePage, } else { int r = GetLastError(); if (r == ERROR_INSUFFICIENT_BUFFER) { - Q_ASSERT(QtPrivate::q_points_into_range(out, buf.data(), buf.data() + buf.size())); const int wclen = MultiByteToWideChar(codePage, 0, mb, nextIn, 0, 0); std::tie(out, outlen) = growOut(wclen); if (!out) return {}; - } else if (r == ERROR_NO_UNICODE_TRANSLATION && state - && state->remainingChars < q20::ssize(state->state_data)) { - ++state->remainingChars; - --mblen; - for 
(qsizetype i = 0; i < state->remainingChars; ++i) - state->state_data[i] = mb[mblen + i]; - if (mblen == 0) + } else if (r == ERROR_NO_UNICODE_TRANSLATION) { + // Can't decode the current window, so either store the state, + // reduce window size or output a replacement character. + + // Check if we can store all remaining characters in the state + // to be used next time we're called: + if (state && mblen <= q20::ssize(state->state_data)) { + state->remainingChars = mblen; + std::copy_n(mb, mblen, state->state_data); + mb += mblen; + mblen = 0; break; + } + + // .. if not, try to find the last valid character in the window + // and try again with a shrunken window: + if (nextIn > 1) { + // There may be some incomplete data at the end of our current + // window, so decrease the window size and try again. + // In the worst case scenario there are gigabytes of undecodable + // garbage, but what are we supposed to do about that? + const auto it = CharPrevExA(codePage, mb, mb + nextIn, 0); + if (it != mb) + nextIn = int(it - mb); + else + --nextIn; + continue; + } + + // Finally, we are forced to output a replacement character for + // the first byte in the window: + std::tie(out, outlen) = growOut(1); + if (!out) + return {}; + *out = QChar::ReplacementCharacter; + ++out; + --outlen; + ++mb; + --mblen; } else { // Fail. 
qWarning("MultiByteToWideChar: Cannot convert multibyte text"); break; } } + nextIn = qt_saturate<int>(mblen); } if (sp.isEmpty()) { @@ -1404,8 +1433,11 @@ QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, quint32 codePage, if (sp.size() && sp.back().isNull()) sp.chop(1); - if (!state && mblen > 0) // We have trailing characters that should be converted + if (!state && mblen > 0) { + // We have trailing character(s) that could not be converted, and + // nowhere to cache them sp.resize(sp.size() + mblen, QChar::ReplacementCharacter); + } return sp; } diff --git a/tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp b/tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp index 816ea55a66e..9f48d59ce57 100644 --- a/tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp +++ b/tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp @@ -201,6 +201,7 @@ private slots: void fromLocal8Bit_data(); void fromLocal8Bit(); void fromLocal8Bit_special_cases(); + void fromLocal8Bit_2GiB(); void toLocal8Bit_data(); void toLocal8Bit(); void toLocal8Bit_special_cases(); @@ -2637,6 +2638,41 @@ void tst_QStringConverter::fromLocal8Bit_special_cases() QCOMPARE(state.remainingChars, 0); } +void tst_QStringConverter::fromLocal8Bit_2GiB() +{ +#if QT_POINTER_SIZE == 4 + QSKIP("This test is only relevant for 64-bit builds"); +#else + qsizetype size = qsizetype(std::numeric_limits<int>::max()) + 3; + QByteArray input; + QT_TRY { + input.reserve(size); + } QT_CATCH (const std::bad_alloc &) { + QSKIP("Out of memory"); + } + // fill with '、' - a single octet character in Shift-JIS + input.fill('\xa4', std::numeric_limits<int>::max() - 1); + // then append 'こ' - a two octet character in Shift-JIS + // which is now straddling the 2 GiB boundary + input += "\x82\xb1"; + // then append another two '、', so that our output is also crossing the + // 2 GiB boundary + input += "\xa4\xa4"; + QCOMPARE(input.size(), input.capacity()); + constexpr uint SHIFT_JIS = 
932u; + QStringConverter::State state; + QString result; + QT_TRY { + result = QLocal8Bit::convertToUnicode_sys(input, SHIFT_JIS, &state); + } QT_CATCH (const std::bad_alloc &) { + QSKIP("Out of memory"); + } + QCOMPARE(result.size(), size - 1); // The 2-octet character is only 1 code unit in UTF-16 + QCOMPARE(result.last(4), u"、こ、、"); // Check we correctly decoded it + QCOMPARE(state.remainingChars, 0); // and there is nothing left in the state +#endif +} + void tst_QStringConverter::toLocal8Bit_data() { fromLocal8Bit_data();