From cfd96390955afa93efec642f4047dd15b49b3411 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A5rten=20Nordheim?= Date: Mon, 16 Oct 2023 16:37:34 +0200 Subject: [PATCH] QStringConverter[win]: expose+test control of code-page Then we can easily test how fromLocal8Bit() and toLocal8Bit() behave with different code-pages. Pick-to: 6.5 Task-number: QTBUG-118318 Task-number: QTBUG-118185 Task-number: QTBUG-105105 Change-Id: Ib1cd3bccd27d598f4c80915557e332befcd96354 Reviewed-by: Thiago Macieira (cherry picked from commit 13fbedd162d167bb3cdbf95181b0870f61cf2ce0) --- src/corelib/text/qstringconverter.cpp | 32 ++-- src/corelib/text/qstringconverter_p.h | 2 + .../qstringconverter/tst_qstringconverter.cpp | 148 ++++++++++++++++++ 3 files changed, 172 insertions(+), 10 deletions(-) diff --git a/src/corelib/text/qstringconverter.cpp b/src/corelib/text/qstringconverter.cpp index f9bcb366637..6485198e62d 100644 --- a/src/corelib/text/qstringconverter.cpp +++ b/src/corelib/text/qstringconverter.cpp @@ -1252,7 +1252,8 @@ int QLocal8Bit::checkUtf8() return GetACP() == CP_UTF8 ? 
1 : -1; } -static QString convertToUnicodeCharByChar(QByteArrayView in, QStringConverter::State *state) +static QString convertToUnicodeCharByChar(QByteArrayView in, quint32 codePage, + QStringConverter::State *state) { qsizetype length = in.size(); const char *chars = in.data(); @@ -1284,10 +1285,10 @@ static QString convertToUnicodeCharByChar(QByteArrayView in, QStringConverter::S const char *mb = mbcs; const char *next = 0; QString s; - while ((next = CharNextExA(CP_ACP, mb, 0)) != mb) { + while ((next = CharNextExA(codePage, mb, 0)) != mb) { wchar_t wc[2] ={0}; int charlength = int(next - mb); // always just a few bytes - int len = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED|MB_ERR_INVALID_CHARS, mb, charlength, wc, 2); + int len = MultiByteToWideChar(codePage, MB_PRECOMPOSED|MB_ERR_INVALID_CHARS, mb, charlength, wc, 2); if (len>0) { s.append(QChar(wc[0])); } else { @@ -1304,8 +1305,13 @@ static QString convertToUnicodeCharByChar(QByteArrayView in, QStringConverter::S return s; } - QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, QStringConverter::State *state) +{ + return convertToUnicode_sys(in, CP_ACP, state); +} + +QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, quint32 codePage, + QStringConverter::State *state) { qsizetype length = in.size(); @@ -1335,7 +1341,7 @@ QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, QStringConverter::St prev[0] = state_data; prev[1] = mb[0]; remainingChars = 0; - len = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, + len = MultiByteToWideChar(codePage, MB_PRECOMPOSED, prev, 2, wc.data(), wc.length()); if (len) { sp.append(QChar(wc[0])); @@ -1350,11 +1356,11 @@ QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, QStringConverter::St } } - while (!(len=MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED|MB_ERR_INVALID_CHARS, + while (!(len=MultiByteToWideChar(codePage, MB_PRECOMPOSED|MB_ERR_INVALID_CHARS, mb, mblen, wc.data(), wc.length()))) { int r = GetLastError(); if (r == 
ERROR_INSUFFICIENT_BUFFER) { - const int wclen = MultiByteToWideChar(CP_ACP, MB_PRECOMPOSED, + const int wclen = MultiByteToWideChar(codePage, MB_PRECOMPOSED, mb, mblen, 0, 0); wc.resize(wclen); } else if (r == ERROR_NO_UNICODE_TRANSLATION) { @@ -1363,7 +1369,7 @@ QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, QStringConverter::St mblen--; //check whether, we hit an invalid character in the middle if ((mblen <= 1) || (remainingChars && state_data)) - return convertToUnicodeCharByChar(in, state); + return convertToUnicodeCharByChar(in, codePage, state); //Remove the last character and try again... state_data = mb[mblen-1]; remainingChars = 1; @@ -1394,6 +1400,12 @@ QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, QStringConverter::St } QByteArray QLocal8Bit::convertFromUnicode_sys(QStringView in, QStringConverter::State *state) +{ + return convertFromUnicode_sys(in, CP_ACP, state); +} + +QByteArray QLocal8Bit::convertFromUnicode_sys(QStringView in, quint32 codePage, + QStringConverter::State *state) { const QChar *ch = in.data(); qsizetype uclen = in.size(); @@ -1411,12 +1423,12 @@ QByteArray QLocal8Bit::convertFromUnicode_sys(QStringView in, QStringConverter:: BOOL used_def; QByteArray mb(4096, 0); int len; - while (!(len=WideCharToMultiByte(CP_ACP, 0, (const wchar_t*)ch, uclen, + while (!(len=WideCharToMultiByte(codePage, 0, (const wchar_t*)ch, uclen, mb.data(), mb.size()-1, 0, &used_def))) { int r = GetLastError(); if (r == ERROR_INSUFFICIENT_BUFFER) { - mb.resize(1+WideCharToMultiByte(CP_ACP, 0, + mb.resize(1+WideCharToMultiByte(codePage, 0, (const wchar_t*)ch, uclen, 0, 0, 0, &used_def)); // and try again... 
diff --git a/src/corelib/text/qstringconverter_p.h b/src/corelib/text/qstringconverter_p.h index edbe1b54843..924ef2c7696 100644 --- a/src/corelib/text/qstringconverter_p.h +++ b/src/corelib/text/qstringconverter_p.h @@ -362,6 +362,7 @@ struct Q_CORE_EXPORT QLocal8Bit } return r > 0; } + static QString convertToUnicode_sys(QByteArrayView, quint32, QStringConverter::State *); static QString convertToUnicode_sys(QByteArrayView, QStringConverter::State *); static QString convertToUnicode(QByteArrayView in, QStringConverter::State *state) { @@ -369,6 +370,7 @@ struct Q_CORE_EXPORT QLocal8Bit return QUtf8::convertToUnicode(in, state); return convertToUnicode_sys(in, state); } + static QByteArray convertFromUnicode_sys(QStringView, quint32, QStringConverter::State *); static QByteArray convertFromUnicode_sys(QStringView, QStringConverter::State *); static QByteArray convertFromUnicode(QStringView in, QStringConverter::State *state) { diff --git a/tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp b/tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp index 253749ede99..67047e9f37f 100644 --- a/tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp +++ b/tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp @@ -178,6 +178,16 @@ private slots: void encodingForHtml_data(); void encodingForHtml(); + +#ifdef Q_OS_WIN + // On all other systems local 8-bit encoding is UTF-8 + void fromLocal8Bit_data(); + void fromLocal8Bit(); + void fromLocal8Bit_special_cases(); + void toLocal8Bit_data(); + void toLocal8Bit(); + void toLocal8Bit_special_cases(); +#endif }; void tst_QStringConverter::constructByName() @@ -2475,6 +2485,144 @@ void tst_QStringConverter::threadSafety() QCOMPARE(b, QString::fromLatin1("abcdefghijklmonpqrstufvxyz")); } +#ifdef Q_OS_WIN +void tst_QStringConverter::fromLocal8Bit_data() +{ + QTest::addColumn<QByteArray>("eightBit"); + QTest::addColumn<QString>("utf16"); + QTest::addColumn<quint32>("codePage"); + + constexpr uint WINDOWS_1252 = 
1252u; + QTest::newRow("windows-1252") << "Hello, world!"_ba << u"Hello, world!"_s << WINDOWS_1252; + constexpr uint SHIFT_JIS = 932u; + // Mostly two byte characters, but the comma is a single byte character (0xa4) + QTest::newRow("shiftJIS") + << "\x82\xb1\x82\xf1\x82\xc9\x82\xbf\x82\xcd\xa4\x90\xa2\x8a\x45\x81\x49"_ba + << u"こんにちは、世界!"_s << SHIFT_JIS; +} + +void tst_QStringConverter::fromLocal8Bit() +{ + QFETCH(const QByteArray, eightBit); + QFETCH(const QString, utf16); + QFETCH(const quint32, codePage); + + QStringConverter::State state; + + QString result = QLocal8Bit::convertToUnicode_sys(eightBit, codePage, &state); + QCOMPARE(result, utf16); + QCOMPARE(state.remainingChars, 0); + + result.clear(); + state.clear(); + for (char c : eightBit) + result += QLocal8Bit::convertToUnicode_sys({&c, 1}, codePage, &state); + QCOMPARE(result, utf16); + QCOMPARE(state.remainingChars, 0); +} + +void tst_QStringConverter::fromLocal8Bit_special_cases() +{ + QStringConverter::State state; + constexpr uint SHIFT_JIS = 932u; + // Decode a 2-octet character, but only provide 1 octet at first: + QString result = QLocal8Bit::convertToUnicode_sys("\x82", SHIFT_JIS, &state); + QCOMPARE(result, QString()); + QVERIFY(result.isNull()); + QCOMPARE_GT(state.remainingChars, 0); + // Then provide the second octet: + result = QLocal8Bit::convertToUnicode_sys("\xb1", SHIFT_JIS, &state); + QCOMPARE(result, u"こ"); + QCOMPARE(state.remainingChars, 0); + + // Now try a 3-octet UTF-8 sequence: + result.clear(); + state.clear(); + constexpr uint UTF8 = 65001u; + // First the first 2 octets: + result = QLocal8Bit::convertToUnicode_sys("\xe4\xbd", UTF8, &state); + QCOMPARE(result, QString()); + QVERIFY(result.isNull()); + QCOMPARE_GT(state.remainingChars, 0); + // Then provide the remaining octet: + result = QLocal8Bit::convertToUnicode_sys("\xa0", UTF8, &state); + QEXPECT_FAIL("", "We don't store enough state to handle this case", Abort); + QCOMPARE(result, u"你"); + QCOMPARE(state.remainingChars, 
0); + + // Now try a 4-octet GB 18030 sequence: + result.clear(); + state.clear(); + constexpr uint GB_18030 = 54936u; + const char sequence[] = "\x95\x32\x90\x31"; + QByteArrayView octets = QByteArrayView(sequence); + result = QLocal8Bit::convertToUnicode_sys(octets.first(2), GB_18030, &state); + QCOMPARE(result, QString()); + QVERIFY(result.isNull()); + QEXPECT_FAIL("", + "We don't store enough state to handle this case. + GB 18030 does not work with " + "the MB_PRECOMPOSED flag.", + Abort); + QCOMPARE_GT(state.remainingChars, 0); + // Then provide one more octet: + result = QLocal8Bit::convertToUnicode_sys(octets.sliced(2, 1), GB_18030, &state); + QCOMPARE(result, QString()); + QVERIFY(result.isNull()); + QCOMPARE_GT(state.remainingChars, 0); + // Then provide the last octet + result = QLocal8Bit::convertToUnicode_sys(octets.last(1), GB_18030, &state); + QCOMPARE(result, u"𠂇"); + QCOMPARE(state.remainingChars, 0); +} + +void tst_QStringConverter::toLocal8Bit_data() +{ + fromLocal8Bit_data(); +} + +void tst_QStringConverter::toLocal8Bit() +{ + QFETCH(const QByteArray, eightBit); + QFETCH(const QString, utf16); + QFETCH(const quint32, codePage); + + QStringConverter::State state; + + QByteArray result = QLocal8Bit::convertFromUnicode_sys(utf16, codePage, &state); + QCOMPARE(result, eightBit); + QCOMPARE(state.remainingChars, 0); + + result.clear(); + state.clear(); + for (QChar c : utf16) + result += QLocal8Bit::convertFromUnicode_sys(QStringView(&c, 1), codePage, &state); + QCOMPARE(result, eightBit); +} + +void tst_QStringConverter::toLocal8Bit_special_cases() +{ + QStringConverter::State state; + // Normally utf8 goes through a different code path, but we can force it here + constexpr uint UTF8 = 65001u; + // Decode a 2-code unit character, but only provide 1 code unit at first: + const char16_t a[] = u"𬽦"; + QStringView firstHalf = QStringView(a, 1); + QByteArray result = QLocal8Bit::convertFromUnicode_sys(firstHalf, UTF8, &state); + QEXPECT_FAIL("", "We don't 
currently handle missing the low surrogate", Abort); + QCOMPARE(result, QString()); + QVERIFY(result.isNull()); + QCOMPARE_GT(state.remainingChars, 0); + // Then provide the second code unit: + QStringView secondHalf = QStringView(a + 1, 1); + result = QLocal8Bit::convertFromUnicode_sys(secondHalf, UTF8, &state); + QCOMPARE(result, "\xf0\xac\xbd\xa6"_ba); + QCOMPARE(state.remainingChars, 0); + + // Retain compat with the behavior for toLocal8Bit: + QCOMPARE(firstHalf.toLocal8Bit(), "?"); +} +#endif + struct DontCrashAtExit { ~DontCrashAtExit() { QStringDecoder decoder(QStringDecoder::Utf8);