From 5e0c6b5fa078cbafa2f63c2057024bb2548e78fe Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?M=C3=A5rten=20Nordheim?=
Date: Fri, 20 Oct 2023 13:48:21 +0200
Subject: [PATCH] QLocal8Bit::convertToUnicode[win]: handle more than one octet state

Both to store and to restore. Without this a 3 or more octet sequence
would cause errors or wrong output. This can be seen with GB 18030.

Pick-to: 6.5
Fixes: QTBUG-118318
Task-number: QTBUG-105105
Change-Id: Id1f7f5f2fba4633b9f888add2186f4d8d21b7293
Reviewed-by: Thiago Macieira
(cherry picked from commit 94214fe100334fd2983e76617191e4153c383a7f)
Reviewed-by: Qt Cherry-pick Bot
---
 src/corelib/text/qstringconverter.cpp | 130 ++++++++----------
 .../qstringconverter/tst_qstringconverter.cpp | 15 +-
 2 files changed, 62 insertions(+), 83 deletions(-)

diff --git a/src/corelib/text/qstringconverter.cpp b/src/corelib/text/qstringconverter.cpp
index 7072da02a4d..ea7c6c4d15e 100644
--- a/src/corelib/text/qstringconverter.cpp
+++ b/src/corelib/text/qstringconverter.cpp
@@ -23,6 +23,7 @@ #include #ifndef QT_BOOTSTRAPPED #include +#include #endif // !QT_BOOTSTRAPPED #endif @@ -1255,59 +1256,6 @@ int QLocal8Bit::checkUtf8() return GetACP() == CP_UTF8 ? 
1 : -1; } -static QString convertToUnicodeCharByChar(QByteArrayView in, quint32 codePage, - QStringConverter::State *state) -{ - qsizetype length = in.size(); - const char *chars = in.data(); - - Q_ASSERT(state); - if (state->flags & QStringConverter::Flag::Stateless) // temporary - state = nullptr; - - if (!chars || !length) - return QString(); - - qsizetype copyLocation = 0; - qsizetype extra = 2; - if (state && state->remainingChars) { - copyLocation = state->remainingChars; - extra += copyLocation; - } - qsizetype newLength = length + extra; - char *mbcs = new char[newLength]; - //ensure that we have a NULL terminated string - mbcs[newLength-1] = 0; - mbcs[newLength-2] = 0; - memcpy(&(mbcs[copyLocation]), chars, length); - if (copyLocation) { - //copy the last character from the state - mbcs[0] = (char)state->state_data[0]; - state->remainingChars = 0; - } - const char *mb = mbcs; - const char *next = 0; - QString s; - while ((next = CharNextExA(codePage, mb, 0)) != mb) { - wchar_t wc[2] ={0}; - int charlength = int(next - mb); // always just a few bytes - int len = MultiByteToWideChar(codePage, MB_ERR_INVALID_CHARS, mb, charlength, wc, 2); - if (len>0) { - s.append(QChar(wc[0])); - } else { - int r = GetLastError(); - //check if the character being dropped is the last character - if (r == ERROR_NO_UNICODE_TRANSLATION && mb == (mbcs+newLength -3) && state) { - state->remainingChars = 1; - state->state_data[0] = (char)*mb; - } - } - mb = next; - } - delete [] mbcs; - return s; -} - QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, QStringConverter::State *state) { return convertToUnicode_sys(in, CP_ACP, state); @@ -1329,28 +1277,60 @@ QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, quint32 codePage, wchar_t *out = buf.data(); qsizetype outlen = buf.size(); - int len; + int len = 0; QString sp; //convert the pending character (if available) if (state && state->remainingChars) { - char prev[3] = {0}; - prev[0] = state->state_data[0]; - 
prev[1] = mb[0]; - state->remainingChars = 0; - len = MultiByteToWideChar(codePage, 0, prev, 2, out, outlen); + // Use at most 6 characters as a guess for the longest encoded character + // in any multibyte encoding. + // Even with a total of 2 bytes of overhead that would leave around + // 2^(4 * 8) possible characters + std::array prev = {0}; + Q_ASSERT(state->remainingChars <= q20::ssize(state->state_data)); + int remainingChars = state->remainingChars; + for (int i = 0; i < remainingChars; ++i) + prev[i] = state->state_data[i]; + do { + prev[remainingChars] = *mb; + ++mb; + --mblen; + ++remainingChars; + len = MultiByteToWideChar(codePage, MB_ERR_INVALID_CHARS, prev.data(), + remainingChars, out, int(outlen)); + } while (!len && mblen && remainingChars < int(prev.size())); if (len) { - if (mblen == 1) + state->remainingChars = 0; + if (mblen == 0) return QStringView(out, len).toString(); - mb++; - mblen--; - ++out; - --outlen; + out += len; + outlen -= len; + } else if (mblen == 0 && remainingChars <= q20::ssize(state->state_data)) { + // Update the state, maybe we're lucky next time + for (int i = state->remainingChars; i < remainingChars; ++i) + state->state_data[i] = prev[i]; + state->remainingChars = remainingChars; + return QString(); + } else { + // Reset the pointer and length, since we used none of it. 
+ mb = in.data(); + mblen = in.length(); + + // We couldn't decode any of the characters in the saved state, + // so output replacement characters + for (int i = 0; i < state->remainingChars; ++i) + out[i] = QChar::ReplacementCharacter; + out += state->remainingChars; + outlen -= state->remainingChars; + state->remainingChars = 0; } } - while (!(len=MultiByteToWideChar(codePage, MB_ERR_INVALID_CHARS, - mb, mblen, out, int(outlen)))) { + Q_ASSERT(mblen > 0); + Q_ASSERT(state->remainingChars == 0); + + while (!(len = MultiByteToWideChar(codePage, MB_ERR_INVALID_CHARS, mb, mblen, out, + int(outlen)))) { int r = GetLastError(); if (r == ERROR_INSUFFICIENT_BUFFER) { Q_ASSERT(QtPrivate::q_points_into_range(out, buf.data(), buf.data() + buf.size())); @@ -1361,16 +1341,14 @@ QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, quint32 codePage, it = std::copy_n(buf.data(), offset, it); out = it; outlen = wclen; - } else if (r == ERROR_NO_UNICODE_TRANSLATION) { - //check whether, we hit an invalid character in the middle - if (state && ((mblen <= 1) || (state->remainingChars && state->state_data[0]))) - return convertToUnicodeCharByChar(in, codePage, state); - //Remove the last character and try again... - if (state) { - state->state_data[0] = mb[mblen - 1]; - state->remainingChars = 1; - } // else: We have discarded a character that we won't handle? @todo - mblen--; + } else if (r == ERROR_NO_UNICODE_TRANSLATION && state + && state->remainingChars < q20::ssize(state->state_data)) { + ++state->remainingChars; + --mblen; + for (qsizetype i = 0; i < state->remainingChars; ++i) + state->state_data[i] = mb[mblen + i]; + if (mblen == 0) + break; } else { // Fail. 
qWarning("MultiByteToWideChar: Cannot convert multibyte text"); diff --git a/tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp b/tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp index 97d0e85a2d7..a95df840372 100644 --- a/tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp +++ b/tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp @@ -2546,7 +2546,6 @@ void tst_QStringConverter::fromLocal8Bit_special_cases() QCOMPARE_GT(state.remainingChars, 0); // Then provide the remaining octet: result = QLocal8Bit::convertToUnicode_sys("\xa0", UTF8, &state); - QEXPECT_FAIL("", "We don't store enough state to handle this case", Abort); QCOMPARE(result, u"你"); QCOMPARE(state.remainingChars, 0); @@ -2555,20 +2554,22 @@ void tst_QStringConverter::fromLocal8Bit_special_cases() state.clear(); constexpr uint GB_18030 = 54936u; const char sequence[] = "\x95\x32\x90\x31"; - QByteArrayView octets = QByteArrayView(sequence); + // Repeat the sequence multiple times to test handling of exhaustion of + // internal buffer + QByteArray repeated = QByteArray(sequence).repeated(2049); + QByteArrayView octets = QByteArrayView(repeated); result = QLocal8Bit::convertToUnicode_sys(octets.first(2), GB_18030, &state); QCOMPARE(result, QString()); QVERIFY(result.isNull()); - QEXPECT_FAIL("", "We don't store enough state to handle this case.", Abort); QCOMPARE_GT(state.remainingChars, 0); // Then provide one more octet: result = QLocal8Bit::convertToUnicode_sys(octets.sliced(2, 1), GB_18030, &state); QCOMPARE(result, QString()); QVERIFY(result.isNull()); QCOMPARE_GT(state.remainingChars, 0); - // Then provide the last octet - result = QLocal8Bit::convertToUnicode_sys(octets.last(1), GB_18030, &state); - QCOMPARE(result, u"𠂇"); + // Then provide the last octet + the rest of the string + result = QLocal8Bit::convertToUnicode_sys(octets.sliced(3), GB_18030, &state); + QCOMPARE(result.first(2), u"𠂇"); QCOMPARE(state.remainingChars, 0); } @@ -2618,7 
+2619,7 @@ void tst_QStringConverter::toLocal8Bit_special_cases() // Retain compat with the behavior for toLocal8Bit: QCOMPARE(firstHalf.toLocal8Bit(), "?"); } -#endif +#endif // Q_OS_WIN struct DontCrashAtExit { ~DontCrashAtExit() {