QLocal8Bit::convertFromUnicode[win]: support more than 2Gi input

This mirrors the change already made for convertToUnicode. To support more
than 2Gi of input, we need to handle the input in chunks, because the
Windows API takes the input length as an `int` parameter. Testing also
revealed some corner cases that need handling, most of which occur when
there is an incomplete surrogate pair at the end of the current input window.

The test takes between 3 (plain MinGW) and 8 (MSVC with ASAN) seconds
to run on my machine.

Pick-to: 6.6 6.5
Fixes: QTBUG-105105
Change-Id: I4fb0420b88ca41dfa8b561a35c6d96659bd81468
Reviewed-by: Edward Welbourne <edward.welbourne@qt.io>
(cherry picked from commit 496340f33ad48738d1595c5c4048e4a05819786c)
Reviewed-by: Fabian Kosmale <fabian.kosmale@qt.io>
This commit is contained in:
Mårten Nordheim 2024-02-05 15:21:15 +01:00
parent 5e882b5de9
commit b04fe2de61
2 changed files with 81 additions and 8 deletions

View File

@ -1452,7 +1452,6 @@ QByteArray QLocal8Bit::convertFromUnicode_sys(QStringView in, quint32 codePage,
const wchar_t *ch = reinterpret_cast<const wchar_t *>(in.data()); const wchar_t *ch = reinterpret_cast<const wchar_t *>(in.data());
qsizetype uclen = in.size(); qsizetype uclen = in.size();
Q_ASSERT(uclen < INT_MAX); // ### FIXME
Q_ASSERT(state); Q_ASSERT(state);
if (state->flags & QStringConverter::Flag::Stateless) // temporary if (state->flags & QStringConverter::Flag::Stateless) // temporary
state = nullptr; state = nullptr;
@ -1503,9 +1502,42 @@ QByteArray QLocal8Bit::convertFromUnicode_sys(QStringView in, quint32 codePage,
Q_ASSERT(uclen > 0); Q_ASSERT(uclen > 0);
// Return a pointer to storage where we have enough space for `size`
// bytes, together with the number of bytes available at that pointer.
//
// If the current output window (`out`, `outlen`) is already large enough it
// is returned unchanged. Otherwise `mb` is resized to hold everything written
// so far plus `size` more bytes, and the returned pointer points just past
// the already-written data inside `mb`.
//
// Returns {nullptr, 0} if the required size overflows qsizetype.
// NOTE(review): `buf`, `mb`, `out` and `outlen` are declared outside this
// view; presumably `buf` is a fixed-size stack buffer and `mb` the heap
// fallback - confirm against the enclosing function.
const auto growOut = [&](qsizetype size) -> std::tuple<char *, qsizetype> {
if (outlen >= size)
return {out, outlen};
// An empty `mb` is taken to mean that all output so far lives in `buf`.
const bool wasStackBuffer = mb.isEmpty();
const auto begin = wasStackBuffer ? buf.data() : mb.data();
// Number of bytes already written before `out`.
const qsizetype offset = qsizetype(std::distance(begin, out));
qsizetype newSize = 0;
if (Q_UNLIKELY(qAddOverflow(offset, size, &newSize))) {
// The requested total cannot be represented; report it through
// Q_CHECK_PTR and hand back a null pointer so the caller can bail out.
Q_CHECK_PTR(false);
return {nullptr, 0};
}
mb.resize(newSize);
auto it = mb.data();
if (wasStackBuffer)
// First switch to `mb`: carry over what was written to `buf` so far.
it = std::copy_n(buf.data(), offset, it);
else
// Already in `mb`: just skip past the data written so far.
it += offset;
return {it, size};
};
// Compute how many UTF-16 code units of the remaining input (`ch`, `uclen`)
// to process in the next pass: the remaining length clamped to the range of
// `int`, shortened by one when the window would otherwise end on a high
// surrogate.
const auto getNextWindowSize = [&]() {
int nextIn = qt_saturate<int>(uclen);
// The Windows API has some issues if the current window ends in the
// middle of a surrogate pair, so we avoid that:
// (A window of exactly one code unit is left alone, so the result is never
// reduced to zero here.)
if (nextIn > 1 && QChar::isHighSurrogate(ch[nextIn - 1]))
--nextIn;
return nextIn;
};
int len = 0; int len = 0;
while (uclen > 0) { while (uclen > 0) {
const int nextIn = qt_saturate<int>(uclen); const int nextIn = getNextWindowSize();
std::tie(out, outlen) = growOut(1); // We need at least one byte
if (!out)
return {};
const int nextOut = qt_saturate<int>(outlen); const int nextOut = qt_saturate<int>(outlen);
len = WideCharToMultiByte(codePage, 0, ch, nextIn, out, nextOut, nullptr, nullptr); len = WideCharToMultiByte(codePage, 0, ch, nextIn, out, nextOut, nullptr, nullptr);
if (len > 0) { if (len > 0) {
@ -1516,14 +1548,21 @@ QByteArray QLocal8Bit::convertFromUnicode_sys(QStringView in, quint32 codePage,
} else { } else {
int r = GetLastError(); int r = GetLastError();
if (r == ERROR_INSUFFICIENT_BUFFER) { if (r == ERROR_INSUFFICIENT_BUFFER) {
Q_ASSERT(mb.isEmpty());
int neededLength = WideCharToMultiByte(codePage, 0, ch, nextIn, nullptr, 0, int neededLength = WideCharToMultiByte(codePage, 0, ch, nextIn, nullptr, 0,
nullptr, nullptr); nullptr, nullptr);
const qsizetype currentLength = out - buf.data(); if (neededLength <= 0) {
mb.resize(currentLength + neededLength); // Fail. Observed with UTF8 where the input window was max int and ended in an
memcpy(mb.data(), out, currentLength * sizeof(*out)); // incomplete sequence, probably a Windows bug. We try to avoid that from
out = mb.data() + currentLength; // happening by reducing the window size in that case. But let's keep this
outlen = neededLength; // branch just in case of other bugs.
r = GetLastError();
fprintf(stderr,
"WideCharToMultiByte: Cannot convert multibyte text (error %d)\n", r);
break;
}
std::tie(out, outlen) = growOut(neededLength);
if (!out)
return {};
// and try again... // and try again...
} else { } else {
// Fail. Probably can't happen in fact (dwFlags is 0). // Fail. Probably can't happen in fact (dwFlags is 0).

View File

@ -205,6 +205,7 @@ private slots:
void toLocal8Bit_data(); void toLocal8Bit_data();
void toLocal8Bit(); void toLocal8Bit();
void toLocal8Bit_special_cases(); void toLocal8Bit_special_cases();
void toLocal8Bit_2GiB();
#endif #endif
}; };
@ -2751,6 +2752,39 @@ void tst_QStringConverter::toLocal8Bit_special_cases()
QCOMPARE(result.first(4), "\xf0\xac\xbd\xa6"_ba); QCOMPARE(result.first(4), "\xf0\xac\xbd\xa6"_ba);
QCOMPARE(state.remainingChars, 0); QCOMPARE(state.remainingChars, 0);
} }
// Checks that QLocal8Bit::convertFromUnicode_sys() handles an input longer
// than INT_MAX UTF-16 code units, with a surrogate pair placed so that its
// high surrogate is the last code unit of an INT_MAX-sized window.
// Skipped on 32-bit builds and when the required memory cannot be allocated.
void tst_QStringConverter::toLocal8Bit_2GiB()
{
#if QT_POINTER_SIZE == 4
QSKIP("This test is only relevant for 64-bit builds");
#else
// Note: despite the name this is INT_MAX (2^31 - 1), the largest length that
// fits in an `int`.
constexpr qsizetype TwoGiB = qsizetype(std::numeric_limits<int>::max());
QString input;
QT_TRY {
input.reserve(TwoGiB + 1);
} QT_CATCH (const std::bad_alloc &) {
QSKIP("Out of memory");
}
// Fill with a single code unit character
input.fill(u'.', TwoGiB - 1);
// Then append a 2 code unit character, so that the input straddles the 2 GiB
// boundary: the high surrogate lands at index TwoGiB - 1 and the low
// surrogate at index TwoGiB.
input += u"🙂";
// Total size is now TwoGiB + 1, exactly what was reserved above.
QCOMPARE(input.size(), input.capacity());
// Windows code page identifier for UTF-8.
constexpr uint UTF8 = 65001u;
QStringConverter::State state;
QByteArray result;
QT_TRY {
result = QLocal8Bit::convertFromUnicode_sys(input, UTF8, &state);
} QT_CATCH (const std::bad_alloc &) {
QSKIP("Out of memory");
}
QUtf8StringView rView = result;
// (TwoGiB - 1) one-byte characters + 4 bytes for the smiley = TwoGiB + 3.
QCOMPARE(rView.size(), TwoGiB + 3); // The 2 code unit smiley is 4 code units in UTF-8
QCOMPARE(rView.last(7), u8"...🙂"); // Check we correctly encoded it
QCOMPARE(state.remainingChars, 0); // and there is nothing left in the state
#endif
}
#endif // Q_OS_WIN #endif // Q_OS_WIN
struct DontCrashAtExit { struct DontCrashAtExit {