QLocal8Bit::convertToUnicode[win]: support more than 2Gi input

To properly support more than 2 GiB of input we have to support being asked
to resize more than once. Previously we only had to resize once, because we
went straight from our 4K stack buffer to the final-size heap buffer. But
now, since the input size can only be passed to MultiByteToWideChar as an
int, we have to loop over the input and resize the buffer as needed.

We also have to deal with trailing data at the end of our sliding window
potentially causing issues for the encoding. So we try to shrink our
window when it causes issues, or store the trailing data for the next
call.

The >2Gi test takes about 6-8 seconds on my machine.

Pick-to: 6.6 6.5
Task-number: QTBUG-105105
Change-Id: I9a44b8f379bf2c2c58183f961544ed2f4c8c7215
Reviewed-by: Edward Welbourne <edward.welbourne@qt.io>
(cherry picked from commit a7171c62569ac2005560131a17515bb6841e9b98)
Reviewed-by: Qt Cherry-pick Bot <cherrypick_bot@qt-project.org>
This commit is contained in:
Mårten Nordheim 2023-10-23 16:04:16 +02:00
parent 0c944d361a
commit 22f2fcd354
2 changed files with 86 additions and 18 deletions

View File

@ -25,6 +25,7 @@
#ifndef QT_BOOTSTRAPPED
#include <QtCore/qvarlengtharray.h>
#include <QtCore/q20iterator.h>
#include <QtCore/private/qnumeric_p.h>
#endif // !QT_BOOTSTRAPPED
#endif
@ -1265,11 +1266,8 @@ QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, QStringConverter::St
QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, quint32 codePage,
QStringConverter::State *state)
{
qsizetype length = in.size();
Q_ASSERT(length < INT_MAX); // ### FIXME
const char *mb = in.data();
int mblen = length;
qsizetype mblen = in.size();
if (state && state->flags & QStringConverter::Flag::Stateless)
state = nullptr;
@ -1354,11 +1352,11 @@ QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, quint32 codePage,
return {it, size};
};
constexpr int MaxStep = std::numeric_limits<int>::max();
const char *end = mb + mblen;
while (mb != end) {
const int nextIn = int(std::min(qsizetype(mblen), qsizetype(MaxStep)));
const int nextOut = int(std::min(outlen, qsizetype(MaxStep)));
// Need it in this scope, since we try to decrease our window size if we
// encounter an error
int nextIn = qt_saturate<int>(mblen);
while (mblen > 0) {
const int nextOut = qt_saturate<int>(outlen);
std::tie(out, outlen) = growOut(1); // Need space for at least one character
if (!out)
return {};
@ -1371,25 +1369,56 @@ QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, quint32 codePage,
} else {
int r = GetLastError();
if (r == ERROR_INSUFFICIENT_BUFFER) {
Q_ASSERT(QtPrivate::q_points_into_range(out, buf.data(), buf.data() + buf.size()));
const int wclen = MultiByteToWideChar(codePage, 0, mb, nextIn, 0, 0);
std::tie(out, outlen) = growOut(wclen);
if (!out)
return {};
} else if (r == ERROR_NO_UNICODE_TRANSLATION && state
&& state->remainingChars < q20::ssize(state->state_data)) {
++state->remainingChars;
--mblen;
for (qsizetype i = 0; i < state->remainingChars; ++i)
state->state_data[i] = mb[mblen + i];
if (mblen == 0)
} else if (r == ERROR_NO_UNICODE_TRANSLATION) {
// Can't decode the current window, so either store the state,
// reduce window size or output a replacement character.
// Check if we can store all remaining characters in the state
// to be used next time we're called:
if (state && mblen <= q20::ssize(state->state_data)) {
state->remainingChars = mblen;
std::copy_n(mb, mblen, state->state_data);
mb += mblen;
mblen = 0;
break;
}
// .. if not, try to find the last valid character in the window
// and try again with a shrunken window:
if (nextIn > 1) {
// There may be some incomplete data at the end of our current
// window, so decrease the window size and try again.
// In the worst case scenario there is gigs of undecodable
// garbage, but what are we supposed to do about that?
const auto it = CharPrevExA(codePage, mb, mb + nextIn, 0);
if (it != mb)
nextIn = int(it - mb);
else
--nextIn;
continue;
}
// Finally, we are forced to output a replacement character for
// the first byte in the window:
std::tie(out, outlen) = growOut(1);
if (!out)
return {};
*out = QChar::ReplacementCharacter;
++out;
--outlen;
++mb;
--mblen;
} else {
// Fail.
qWarning("MultiByteToWideChar: Cannot convert multibyte text");
break;
}
}
nextIn = qt_saturate<int>(mblen);
}
if (sp.isEmpty()) {
@ -1404,8 +1433,11 @@ QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, quint32 codePage,
if (sp.size() && sp.back().isNull())
sp.chop(1);
if (!state && mblen > 0) // We have trailing characters that should be converted
if (!state && mblen > 0) {
// We have trailing character(s) that could not be converted, and
// nowhere to cache them
sp.resize(sp.size() + mblen, QChar::ReplacementCharacter);
}
return sp;
}

View File

@ -201,6 +201,7 @@ private slots:
void fromLocal8Bit_data();
void fromLocal8Bit();
void fromLocal8Bit_special_cases();
void fromLocal8Bit_2GiB();
void toLocal8Bit_data();
void toLocal8Bit();
void toLocal8Bit_special_cases();
@ -2637,6 +2638,41 @@ void tst_QStringConverter::fromLocal8Bit_special_cases()
QCOMPARE(state.remainingChars, 0);
}
// Checks that QLocal8Bit::convertToUnicode_sys() copes with an input that is
// larger than INT_MAX bytes and has a two-octet character straddling the
// INT_MAX boundary. Skipped on 32-bit builds and when the (multi-gigabyte)
// buffers cannot be allocated.
void tst_QStringConverter::fromLocal8Bit_2GiB()
{
#if QT_POINTER_SIZE == 4
    QSKIP("This test is only relevant for 64-bit builds");
#else
    qsizetype size = qsizetype(std::numeric_limits<int>::max()) + 3;
    QByteArray input;
    QT_TRY {
        input.reserve(size);
    } QT_CATCH (const std::bad_alloc &) {
        QSKIP("Out of memory");
    }
    // Fill with 0xA4, which in code page 932 (Shift-JIS) is the single-octet
    // HALFWIDTH IDEOGRAPHIC COMMA, U+FF64. Note that this is *not* the
    // full-width IDEOGRAPHIC COMMA U+3001, which is a two-octet sequence.
    input.fill('\xa4', std::numeric_limits<int>::max() - 1);
    // Then append HIRAGANA LETTER KO (U+3053) - a two-octet character in
    // Shift-JIS - which is now straddling the 2 GiB boundary.
    input += "\x82\xb1";
    // Then append another two 0xA4 octets, so that our output is also
    // crossing the 2 GiB boundary.
    input += "\xa4\xa4";
    // size() is expected to equal capacity() once all the data is in place.
    QCOMPARE(input.size(), input.capacity());
    constexpr uint SHIFT_JIS = 932u;
    QStringConverter::State state;
    QString result;
    QT_TRY {
        result = QLocal8Bit::convertToUnicode_sys(input, SHIFT_JIS, &state);
    } QT_CATCH (const std::bad_alloc &) {
        QSKIP("Out of memory");
    }
    QCOMPARE(result.size(), size - 1); // The 2-octet character is only 1 code unit in UTF-16
    // Check we correctly decoded the tail. The expected code units are spelled
    // with escapes so that the comparison cannot be broken by the source file
    // being re-encoded or normalised (U+FF64 is easily confused with U+3001).
    QCOMPARE(result.last(4), u"\uFF64\u3053\uFF64\uFF64");
    QCOMPARE(state.remainingChars, 0); // and there is nothing left in the state
#endif
}
void tst_QStringConverter::toLocal8Bit_data()
{
fromLocal8Bit_data();