QLocal8Bit::convertToUnicode[win]: support more than 2Gi input
To properly support more than 2Gi input we have to support being asked to resize more than once. Previously we only had to resize once, because we went from our 4K stack buffer straight to the final-size heap buffer. But now, since our input size can only be specified as an int, we have to loop over the input and resize the buffer as needed.

We also have to deal with trailing data at the end of our sliding window potentially causing issues for the encoding. So we try to shrink our window when it causes issues, or store the trailing data for the next call.

The >2Gi test takes about 6-8 seconds on my machine.

Pick-to: 6.6 6.5
Task-number: QTBUG-105105
Change-Id: I9a44b8f379bf2c2c58183f961544ed2f4c8c7215
Reviewed-by: Edward Welbourne <edward.welbourne@qt.io>
(cherry picked from commit a7171c62569ac2005560131a17515bb6841e9b98)
Reviewed-by: Qt Cherry-pick Bot <cherrypick_bot@qt-project.org>
This commit is contained in:
parent
0c944d361a
commit
22f2fcd354
@ -25,6 +25,7 @@
|
||||
#ifndef QT_BOOTSTRAPPED
|
||||
#include <QtCore/qvarlengtharray.h>
|
||||
#include <QtCore/q20iterator.h>
|
||||
#include <QtCore/private/qnumeric_p.h>
|
||||
#endif // !QT_BOOTSTRAPPED
|
||||
#endif
|
||||
|
||||
@ -1265,11 +1266,8 @@ QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, QStringConverter::St
|
||||
QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, quint32 codePage,
|
||||
QStringConverter::State *state)
|
||||
{
|
||||
qsizetype length = in.size();
|
||||
|
||||
Q_ASSERT(length < INT_MAX); // ### FIXME
|
||||
const char *mb = in.data();
|
||||
int mblen = length;
|
||||
qsizetype mblen = in.size();
|
||||
|
||||
if (state && state->flags & QStringConverter::Flag::Stateless)
|
||||
state = nullptr;
|
||||
@ -1354,11 +1352,11 @@ QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, quint32 codePage,
|
||||
return {it, size};
|
||||
};
|
||||
|
||||
constexpr int MaxStep = std::numeric_limits<int>::max();
|
||||
const char *end = mb + mblen;
|
||||
while (mb != end) {
|
||||
const int nextIn = int(std::min(qsizetype(mblen), qsizetype(MaxStep)));
|
||||
const int nextOut = int(std::min(outlen, qsizetype(MaxStep)));
|
||||
// Need it in this scope, since we try to decrease our window size if we
|
||||
// encounter an error
|
||||
int nextIn = qt_saturate<int>(mblen);
|
||||
while (mblen > 0) {
|
||||
const int nextOut = qt_saturate<int>(outlen);
|
||||
std::tie(out, outlen) = growOut(1); // Need space for at least one character
|
||||
if (!out)
|
||||
return {};
|
||||
@ -1371,25 +1369,56 @@ QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, quint32 codePage,
|
||||
} else {
|
||||
int r = GetLastError();
|
||||
if (r == ERROR_INSUFFICIENT_BUFFER) {
|
||||
Q_ASSERT(QtPrivate::q_points_into_range(out, buf.data(), buf.data() + buf.size()));
|
||||
const int wclen = MultiByteToWideChar(codePage, 0, mb, nextIn, 0, 0);
|
||||
std::tie(out, outlen) = growOut(wclen);
|
||||
if (!out)
|
||||
return {};
|
||||
} else if (r == ERROR_NO_UNICODE_TRANSLATION && state
|
||||
&& state->remainingChars < q20::ssize(state->state_data)) {
|
||||
++state->remainingChars;
|
||||
--mblen;
|
||||
for (qsizetype i = 0; i < state->remainingChars; ++i)
|
||||
state->state_data[i] = mb[mblen + i];
|
||||
if (mblen == 0)
|
||||
} else if (r == ERROR_NO_UNICODE_TRANSLATION) {
|
||||
// Can't decode the current window, so either store the state,
|
||||
// reduce window size or output a replacement character.
|
||||
|
||||
// Check if we can store all remaining characters in the state
|
||||
// to be used next time we're called:
|
||||
if (state && mblen <= q20::ssize(state->state_data)) {
|
||||
state->remainingChars = mblen;
|
||||
std::copy_n(mb, mblen, state->state_data);
|
||||
mb += mblen;
|
||||
mblen = 0;
|
||||
break;
|
||||
}
|
||||
|
||||
// .. if not, try to find the last valid character in the window
|
||||
// and try again with a shrunken window:
|
||||
if (nextIn > 1) {
|
||||
// There may be some incomplete data at the end of our current
|
||||
// window, so decrease the window size and try again.
|
||||
// In the worst case scenario there is gigs of undecodable
|
||||
// garbage, but what are we supposed to do about that?
|
||||
const auto it = CharPrevExA(codePage, mb, mb + nextIn, 0);
|
||||
if (it != mb)
|
||||
nextIn = int(it - mb);
|
||||
else
|
||||
--nextIn;
|
||||
continue;
|
||||
}
|
||||
|
||||
// Finally, we are forced to output a replacement character for
|
||||
// the first byte in the window:
|
||||
std::tie(out, outlen) = growOut(1);
|
||||
if (!out)
|
||||
return {};
|
||||
*out = QChar::ReplacementCharacter;
|
||||
++out;
|
||||
--outlen;
|
||||
++mb;
|
||||
--mblen;
|
||||
} else {
|
||||
// Fail.
|
||||
qWarning("MultiByteToWideChar: Cannot convert multibyte text");
|
||||
break;
|
||||
}
|
||||
}
|
||||
nextIn = qt_saturate<int>(mblen);
|
||||
}
|
||||
|
||||
if (sp.isEmpty()) {
|
||||
@ -1404,8 +1433,11 @@ QString QLocal8Bit::convertToUnicode_sys(QByteArrayView in, quint32 codePage,
|
||||
if (sp.size() && sp.back().isNull())
|
||||
sp.chop(1);
|
||||
|
||||
if (!state && mblen > 0) // We have trailing characters that should be converted
|
||||
if (!state && mblen > 0) {
|
||||
// We have trailing character(s) that could not be converted, and
|
||||
// nowhere to cache them
|
||||
sp.resize(sp.size() + mblen, QChar::ReplacementCharacter);
|
||||
}
|
||||
return sp;
|
||||
}
|
||||
|
||||
|
@ -201,6 +201,7 @@ private slots:
|
||||
void fromLocal8Bit_data();
|
||||
void fromLocal8Bit();
|
||||
void fromLocal8Bit_special_cases();
|
||||
void fromLocal8Bit_2GiB();
|
||||
void toLocal8Bit_data();
|
||||
void toLocal8Bit();
|
||||
void toLocal8Bit_special_cases();
|
||||
@ -2637,6 +2638,41 @@ void tst_QStringConverter::fromLocal8Bit_special_cases()
|
||||
QCOMPARE(state.remainingChars, 0);
|
||||
}
|
||||
|
||||
// Checks that QLocal8Bit::convertToUnicode_sys() decodes an input that is
// three bytes larger than INT_MAX, where a two-octet Shift-JIS character is
// placed across the INT_MAX-byte mark of the input.
// The test is skipped on 32-bit builds and whenever building the input or
// running the conversion throws std::bad_alloc.
void tst_QStringConverter::fromLocal8Bit_2GiB()
|
||||
{
|
||||
#if QT_POINTER_SIZE == 4
|
||||
QSKIP("This test is only relevant for 64-bit builds");
|
||||
#else
|
||||
// Total input size in bytes: (INT_MAX - 1) filler octets + 2 octets for the
// two-octet character + 2 trailing octets.
qsizetype size = qsizetype(std::numeric_limits<int>::max()) + 3;
|
||||
QByteArray input;
|
||||
QT_TRY {
|
||||
input.reserve(size);
|
||||
} QT_CATCH (const std::bad_alloc &) {
|
||||
QSKIP("Out of memory");
|
||||
}
|
||||
// fill with '、' - a single octet character in Shift-JIS
|
||||
input.fill('\xa4', std::numeric_limits<int>::max() - 1);
|
||||
// then append 'こ' - a two octet character in Shift-JIS
|
||||
// which is now straddling the 2 GiB boundary
|
||||
input += "\x82\xb1";
|
||||
// then append another two '、', so that our output is also crossing the
|
||||
// 2 GiB boundary
|
||||
input += "\xa4\xa4";
|
||||
// Sanity check: the buffer is exactly full (size equals capacity).
QCOMPARE(input.size(), input.capacity());
|
||||
// Code page identifier handed to convertToUnicode_sys() as its codePage
// argument.
constexpr uint SHIFT_JIS = 932u;
|
||||
QStringConverter::State state;
|
||||
QString result;
|
||||
QT_TRY {
|
||||
result = QLocal8Bit::convertToUnicode_sys(input, SHIFT_JIS, &state);
|
||||
} QT_CATCH (const std::bad_alloc &) {
|
||||
QSKIP("Out of memory");
|
||||
}
|
||||
QCOMPARE(result.size(), size - 1); // The 2-octet character is only 1 code unit in UTF-16
|
||||
QCOMPARE(result.last(4), u"、こ、、"); // Check we correctly decoded it
|
||||
QCOMPARE(state.remainingChars, 0); // and there is nothing left in the state
|
||||
#endif
|
||||
}
|
||||
|
||||
void tst_QStringConverter::toLocal8Bit_data()
|
||||
{
|
||||
fromLocal8Bit_data();
|
||||
|
Loading…
x
Reference in New Issue
Block a user