From f54213449843b86402cdb340e045eaf1f7da4533 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A5rten=20Nordheim?= Date: Wed, 18 Oct 2023 17:30:14 +0200 Subject: [PATCH] QLocal8Bit::convertFromUnicode[win]: handle trailing high surrogate The win32 API doesn't give us much choice. _Some_ code pages have support for returning some error if we pass a specific flag, but not all of them. Anyway, since the code pages might not support all that UTF-16 provides, we can't reasonably make it error out on characters that cannot be converted. So, the most reasonable thing we can handle is an unpaired high surrogate at the end of a string, assume that the rest of the string was fine, and that the low surrogate will be provided in the next call. Pick-to: 6.5 Fixes: QTBUG-118185 Task-number: QTBUG-105105 Change-Id: I1f193c9d8e04bec769d885d32440c759d9dff0c2 Reviewed-by: Thiago Macieira Reviewed-by: Edward Welbourne (cherry picked from commit d8d5922f16f1710b66caf718c302b633d2f78b0b) --- src/corelib/text/qstringconverter.cpp | 52 +++++++++++++++---- .../qstringconverter/tst_qstringconverter.cpp | 30 ++++++++--- 2 files changed, 66 insertions(+), 16 deletions(-) diff --git a/src/corelib/text/qstringconverter.cpp b/src/corelib/text/qstringconverter.cpp index 8000a699ce9..7d7dd35dce4 100644 --- a/src/corelib/text/qstringconverter.cpp +++ b/src/corelib/text/qstringconverter.cpp @@ -1387,7 +1387,6 @@ QByteArray QLocal8Bit::convertFromUnicode_sys(QStringView in, quint32 codePage, Q_ASSERT(uclen < INT_MAX); // ### FIXME Q_ASSERT(state); - Q_UNUSED(state); // ### Fixme if (state->flags & QStringConverter::Flag::Stateless) // temporary state = nullptr; @@ -1401,15 +1400,47 @@ QByteArray QLocal8Bit::convertFromUnicode_sys(QStringView in, quint32 codePage, qsizetype outlen = buf.size(); QByteArray mb; - int len; + if (state && state->remainingChars > 0) { + Q_ASSERT(state->remainingChars == 1); + // Let's try to decode the pending character + wchar_t wc[2] = { wchar_t(state->state_data[0]), ch[0] 
}; + int len = WideCharToMultiByte(codePage, 0, wc, int(std::size(wc)), out, outlen, nullptr, + nullptr); + if (!len) + return {}; // Cannot recover, and I refuse to believe it was a size limitation + out += len; + outlen -= len; + ++ch; + --uclen; + state->remainingChars = 0; + state->state_data[0] = 0; + if (uclen == 0) + return QByteArrayView(buf.data(), len).toByteArray(); + } + + if (state && QChar::isHighSurrogate(ch[uclen - 1])) { + // We can handle a missing low surrogate at the end of the string, + // so if there is one, exclude it now and store it in the state. + state->remainingChars = 1; + state->state_data[0] = ch[uclen - 1]; + --uclen; + if (uclen == 0) + return QByteArray(); + } + + Q_ASSERT(uclen > 0); + + int len = 0; while (!(len = WideCharToMultiByte(codePage, 0, ch, int(uclen), out, int(outlen), nullptr, nullptr))) { int r = GetLastError(); if (r == ERROR_INSUFFICIENT_BUFFER) { int neededLength = WideCharToMultiByte(codePage, 0, ch, int(uclen), nullptr, 0, nullptr, nullptr); - mb.resize(neededLength); - out = mb.data(); + const qsizetype currentLength = out - buf.data(); + mb.resize(currentLength + neededLength); + memcpy(mb.data(), buf.data(), currentLength * sizeof(*out)); /* carry over the bytes already converted into the start of buf */ + out = mb.data() + currentLength; outlen = neededLength; // and try again... 
} else { @@ -1423,12 +1454,13 @@ QByteArray QLocal8Bit::convertFromUnicode_sys(QStringView in, quint32 codePage, break; } } - if (!len) - return QByteArray(); - if (out == buf.data()) - mb = QByteArray(buf.data(), len); - else - mb.resize(len); + auto end = out + len; + if (QtPrivate::q_points_into_range(out, buf.data(), buf.data() + buf.size())) { + if (end != buf.data()) // else: we return null-array + mb = QByteArrayView(buf.data(), end).toByteArray(); + } else { + mb.truncate(end - mb.data()); + } return mb; } #endif diff --git a/tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp b/tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp index a95df840372..9b33b464216 100644 --- a/tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp +++ b/tests/auto/corelib/text/qstringconverter/tst_qstringconverter.cpp @@ -2499,6 +2499,10 @@ void tst_QStringConverter::fromLocal8Bit_data() QTest::newRow("shiftJIS") << "\x82\xb1\x82\xf1\x82\xc9\x82\xbf\x82\xcd\xa4\x90\xa2\x8a\x45\x81\x49"_ba << u"こんにちは、世界!"_s << SHIFT_JIS; + + constexpr uint GB_18030 = 54936u; + QTest::newRow("GB-18030") << "\xc4\xe3\xba\xc3\xca\xc0\xbd\xe7\xa3\xa1"_ba << u"你好世界!"_s + << GB_18030; } void tst_QStringConverter::fromLocal8Bit() @@ -2595,6 +2599,7 @@ void tst_QStringConverter::toLocal8Bit() for (QChar c : utf16) result += QLocal8Bit::convertFromUnicode_sys(QStringView(&c, 1), codePage, &state); QCOMPARE(result, eightBit); + QCOMPARE(state.remainingChars, 0); } void tst_QStringConverter::toLocal8Bit_special_cases() @@ -2604,20 +2609,33 @@ void tst_QStringConverter::toLocal8Bit_special_cases() constexpr uint UTF8 = 65001u; // Decode a 2-code unit character, but only provide 1 code unit at first: const char16_t a[] = u"𬽦"; - QStringView firstHalf = QStringView(a, 1); - QByteArray result = QLocal8Bit::convertFromUnicode_sys(firstHalf, UTF8, &state); - QEXPECT_FAIL("", "We don't currently handle missing the low surrogate", Abort); + QStringView codeUnits = a; + 
QByteArray result = QLocal8Bit::convertFromUnicode_sys(codeUnits.first(1), UTF8, &state); QCOMPARE(result, QString()); QVERIFY(result.isNull()); QCOMPARE_GT(state.remainingChars, 0); // Then provide the second code unit: - QStringView secondHalf = QStringView(a + 1, 1); - result = QLocal8Bit::convertFromUnicode_sys(secondHalf, UTF8, &state); + result = QLocal8Bit::convertFromUnicode_sys(codeUnits.sliced(1), UTF8, &state); QCOMPARE(result, "\xf0\xac\xbd\xa6"_ba); QCOMPARE(state.remainingChars, 0); // Retain compat with the behavior for toLocal8Bit: - QCOMPARE(firstHalf.toLocal8Bit(), "?"); + QCOMPARE(codeUnits.first(1).toLocal8Bit(), "?"); + + // Now do the same, but the second time we feed in a character, we also + // provide many more so the internal stack buffer is not large enough. + result.clear(); + state.clear(); + QString str = QStringView(a).toString().repeated(2048); + codeUnits = str; + result = QLocal8Bit::convertFromUnicode_sys(codeUnits.first(1), UTF8, &state); + QCOMPARE(result, QString()); + QVERIFY(result.isNull()); + QCOMPARE_GT(state.remainingChars, 0); + // Then we provide the rest of the string: + result = QLocal8Bit::convertFromUnicode_sys(codeUnits.sliced(1), UTF8, &state); + QCOMPARE(result.first(4), "\xf0\xac\xbd\xa6"_ba); + QCOMPARE(state.remainingChars, 0); } #endif // Q_OS_WIN