QUtf8: merge the convert{To,From}Unicode main bodies

Instead of duplicating the loop, centralize it in a template function
that calls out to a lambda to handle the error condition and, where
needed, to store it in the converter state.

Change-Id: If2730356303f721bc3bbfffd2640666da8d65f1d
Reviewed-by: Allan Sandfeld Jensen <allan.jensen@qt.io>
This commit is contained in:
Thiago Macieira 2024-10-03 20:16:34 -07:00
parent 2553d9709f
commit 2fcb905fef
2 changed files with 73 additions and 66 deletions

View File

@ -472,13 +472,12 @@ static void simdCompareAscii(const qchar8_t *&, const qchar8_t *, const char16_t
enum { HeaderDone = 1 }; enum { HeaderDone = 1 };
QByteArray QUtf8::convertFromUnicode(QStringView in) template <typename OnErrorLambda> Q_ALWAYS_INLINE
char *QUtf8::convertFromUnicode(char *out, QStringView in, OnErrorLambda &&onError) noexcept
{ {
qsizetype len = in.size(); qsizetype len = in.size();
// create a QByteArray with the worst case scenario size uchar *dst = reinterpret_cast<uchar *>(out);
QByteArray result(len * 3, Qt::Uninitialized);
uchar *dst = reinterpret_cast<uchar *>(const_cast<char *>(result.constData()));
const char16_t *src = reinterpret_cast<const char16_t *>(in.data()); const char16_t *src = reinterpret_cast<const char16_t *>(in.data());
const char16_t *const end = src + len; const char16_t *const end = src + len;
@ -490,14 +489,27 @@ QByteArray QUtf8::convertFromUnicode(QStringView in)
do { do {
char16_t u = *src++; char16_t u = *src++;
int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(u, dst, src, end); int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(u, dst, src, end);
if (res < 0) { if (Q_UNLIKELY(res < 0))
// encoding error - append '?' onError(dst, u, res);
*dst++ = '?';
}
} while (src < nextAscii); } while (src < nextAscii);
} }
result.truncate(dst - reinterpret_cast<uchar *>(const_cast<char *>(result.constData()))); return reinterpret_cast<char *>(dst);
}
QByteArray QUtf8::convertFromUnicode(QStringView in)
{
qsizetype len = in.size();
// create a QByteArray with the worst case scenario size
QByteArray result(len * 3, Qt::Uninitialized);
char *dst = const_cast<char *>(result.constData());
dst = convertFromUnicode(dst, in, [](auto *dst, ...) {
// encoding error - append '?'
*dst++ = '?';
});
result.truncate(dst - result.constData());
return result; return result;
} }
@ -548,35 +560,22 @@ char *QUtf8::convertFromUnicode(char *out, QStringView in, QStringConverter::Sta
} }
} }
while (src != end) { out = reinterpret_cast<char *>(cursor);
const char16_t *nextAscii = end; return convertFromUnicode(out, { src, end }, [&](uchar *&cursor, char16_t uc, int res) {
if (simdEncodeAscii(cursor, nextAscii, src, end)) if (res == QUtf8BaseTraits::Error) {
break; // encoding error
++state->invalidChars;
do { cursor = appendReplacementChar(cursor);
char16_t uc = *src++; } else if (res == QUtf8BaseTraits::EndOfString) {
int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(uc, cursor, src, end); if (state->flags & QStringConverter::Flag::Stateless) {
if (Q_LIKELY(res >= 0))
continue;
if (res == QUtf8BaseTraits::Error) {
// encoding error
++state->invalidChars; ++state->invalidChars;
cursor = appendReplacementChar(cursor); cursor = appendReplacementChar(cursor);
} else if (res == QUtf8BaseTraits::EndOfString) { } else {
if (state->flags & QStringConverter::Flag::Stateless) { state->remainingChars = 1;
++state->invalidChars; state->state_data[0] = uc;
cursor = appendReplacementChar(cursor);
} else {
state->remainingChars = 1;
state->state_data[0] = uc;
}
return reinterpret_cast<char *>(cursor);
} }
} while (src < nextAscii); }
} });
return reinterpret_cast<char *>(cursor);
} }
char *QUtf8::convertFromLatin1(char *out, QLatin1StringView in) char *QUtf8::convertFromLatin1(char *out, QLatin1StringView in)
@ -635,6 +634,21 @@ QString QUtf8::convertToUnicode(QByteArrayView in)
QUtf8::convertToUnicode(QChar *, QByteArrayView) directly. QUtf8::convertToUnicode(QChar *, QByteArrayView) directly.
*/ */
char16_t *QUtf8::convertToUnicode(char16_t *dst, QByteArrayView in) noexcept char16_t *QUtf8::convertToUnicode(char16_t *dst, QByteArrayView in) noexcept
{
// check if have to skip a BOM
auto bom = QByteArrayView::fromArray(utf8bom);
if (in.size() >= bom.size() && in.first(bom.size()) == bom)
in.slice(sizeof(utf8bom));
return convertToUnicode(dst, in, [](char16_t *&dst, ...) {
// decoding error
*dst++ = QChar::ReplacementCharacter;
return true; // continue decoding
});
}
template <typename OnErrorLambda> Q_ALWAYS_INLINE char16_t *
QUtf8::convertToUnicode(char16_t *dst, QByteArrayView in, OnErrorLambda &&onError) noexcept
{ {
const uchar *const start = reinterpret_cast<const uchar *>(in.data()); const uchar *const start = reinterpret_cast<const uchar *>(in.data());
const uchar *src = start; const uchar *src = start;
@ -642,29 +656,20 @@ char16_t *QUtf8::convertToUnicode(char16_t *dst, QByteArrayView in) noexcept
// attempt to do a full decoding in SIMD // attempt to do a full decoding in SIMD
const uchar *nextAscii = end; const uchar *nextAscii = end;
if (!simdDecodeAscii(dst, nextAscii, src, end)) { while (src < end) {
// at least one non-ASCII entry nextAscii = end;
// check if we failed to decode the UTF-8 BOM; if so, skip it if (simdDecodeAscii(dst, nextAscii, src, end))
if (Q_UNLIKELY(src == start) break;
&& end - src >= 3
&& Q_UNLIKELY(src[0] == utf8bom[0] && src[1] == utf8bom[1] && src[2] == utf8bom[2])) {
src += 3;
}
while (src < end) { do {
nextAscii = end; uchar b = *src++;
if (simdDecodeAscii(dst, nextAscii, src, end)) const qsizetype res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, dst, src, end);
break; if (Q_LIKELY(res >= 0))
continue;
do { // decoding error
uchar b = *src++; if (!onError(dst, src, res))
const qsizetype res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, dst, src, end); return dst;
if (res < 0) { } while (src < nextAscii);
// decoding error
*dst++ = QChar::ReplacementCharacter;
}
} while (src < nextAscii);
}
} }
return dst; return dst;
@ -702,7 +707,6 @@ char16_t *QUtf8::convertToUnicode(char16_t *dst, QByteArrayView in, QStringConve
replacement = QChar::Null; replacement = QChar::Null;
qsizetype res; qsizetype res;
uchar ch = 0;
const uchar *src = reinterpret_cast<const uchar *>(in.data()); const uchar *src = reinterpret_cast<const uchar *>(in.data());
const uchar *end = src + len; const uchar *end = src + len;
@ -754,19 +758,16 @@ char16_t *QUtf8::convertToUnicode(char16_t *dst, QByteArrayView in, QStringConve
// main body, stateless decoding // main body, stateless decoding
res = 0; res = 0;
const uchar *nextAscii = src; dst = convertToUnicode(dst, { src, end }, [&](char16_t *&dst, const uchar *src_, int res_) {
while (res >= 0 && src < end) { res = res_;
if (src >= nextAscii && simdDecodeAscii(dst, nextAscii, src, end)) src = src_;
break;
ch = *src++;
res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(ch, dst, src, end);
if (res == QUtf8BaseTraits::Error) { if (res == QUtf8BaseTraits::Error) {
res = 0; res = 0;
++state->invalidChars; ++state->invalidChars;
*dst++ = replacement; *dst++ = replacement;
} }
} return res == 0; // continue if plain decoding error
});
if (res == QUtf8BaseTraits::EndOfString) { if (res == QUtf8BaseTraits::EndOfString) {
// unterminated UTF sequence // unterminated UTF sequence

View File

@ -334,6 +334,12 @@ struct QUtf8
Qt::CaseSensitivity cs = Qt::CaseSensitive); Qt::CaseSensitivity cs = Qt::CaseSensitive);
static int compareUtf8(QByteArrayView lhs, QByteArrayView rhs, static int compareUtf8(QByteArrayView lhs, QByteArrayView rhs,
Qt::CaseSensitivity cs = Qt::CaseSensitive) noexcept; Qt::CaseSensitivity cs = Qt::CaseSensitive) noexcept;
private:
template <typename OnErrorLambda> static char *
convertFromUnicode(char *out, QStringView in, OnErrorLambda &&onError) noexcept;
template <typename OnErrorLambda> static char16_t *
convertToUnicode(char16_t *dst, QByteArrayView in, OnErrorLambda &&onError) noexcept;
}; };
struct QUtf16 struct QUtf16