QUtf8: merge the convert{To,From}Unicode main bodies

Instead of duplicating the loop, centralize it in a template function that calls out to a lambda to handle the error condition; the lambda is what stores the error in the converter state where one exists.

Change-Id: If2730356303f721bc3bbfffd2640666da8d65f1d
Reviewed-by: Allan Sandfeld Jensen <allan.jensen@qt.io>
2024-10-03 20:16:34 -07:00 · 2024-10-03 20:16:34 -07:00 · 2fcb905fef
commit 2fcb905fef
parent 2553d9709f
2 changed files with 73 additions and 66 deletions
--- a/src/corelib/text/qstringconverter.cpp
+++ b/src/corelib/text/qstringconverter.cpp
@ -472,13 +472,12 @@ static void simdCompareAscii(const qchar8_t *&, const qchar8_t *, const char16_t
 enum { HeaderDone = 1 };
-QByteArray QUtf8::convertFromUnicode(QStringView in)
+template <typename OnErrorLambda> Q_ALWAYS_INLINE
 char *QUtf8::convertFromUnicode(char *out, QStringView in, OnErrorLambda &&onError) noexcept
 {
    qsizetype len = in.size();
-    // create a QByteArray with the worst case scenario size
+    uchar *dst = reinterpret_cast<uchar *>(out);
    QByteArray result(len * 3, Qt::Uninitialized);
    uchar *dst = reinterpret_cast<uchar *>(const_cast<char *>(result.constData()));
    const char16_t *src = reinterpret_cast<const char16_t *>(in.data());
    const char16_t *const end = src + len;
@ -490,14 +489,27 @@ QByteArray QUtf8::convertFromUnicode(QStringView in)
        do {
            char16_t u = *src++;
            int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(u, dst, src, end);
-            if (res < 0) {
+            if (Q_UNLIKELY(res < 0))
-                // encoding error - append '?'
+                onError(dst, u, res);
                *dst++ = '?';
            }
        } while (src < nextAscii);
    }
-    result.truncate(dst - reinterpret_cast<uchar *>(const_cast<char *>(result.constData())));
+    return reinterpret_cast<char *>(dst);
 }
 // Encodes the UTF-16 text in `in` as UTF-8 and returns it as a new QByteArray.
 // Each encoding error reported by the shared conversion loop is replaced with
 // a single '?' byte in the output.
 QByteArray QUtf8::convertFromUnicode(QStringView in)
 {
    qsizetype len = in.size();
    // create a QByteArray with the worst case scenario size
    QByteArray result(len * 3, Qt::Uninitialized);
    char *dst = const_cast<char *>(result.constData());
    // The write cursor has to be taken by reference: the shared loop continues
    // from its own pointer, so with a by-value parameter the '?' would be
    // overwritten by the next code unit (or cut off by truncate() below).
    dst = convertFromUnicode(dst, in, [](auto *&cursor, ...) {
        // encoding error - append '?'
        *cursor++ = '?';
    });
    // shrink from the worst-case allocation to the bytes actually written
    result.truncate(dst - result.constData());
    return result;
 }
@ -548,35 +560,22 @@ char *QUtf8::convertFromUnicode(char *out, QStringView in, QStringConverter::Sta
        }
    }
-    while (src != end) {
+    out = reinterpret_cast<char *>(cursor);
-        const char16_t *nextAscii = end;
+    return convertFromUnicode(out, { src, end }, [&](uchar *&cursor, char16_t uc, int res) {
-        if (simdEncodeAscii(cursor, nextAscii, src, end))
+        if (res == QUtf8BaseTraits::Error) {
-            break;
+            // encoding error
-
+            ++state->invalidChars;
-        do {
+            cursor = appendReplacementChar(cursor);
-            char16_t uc = *src++;
+        } else if (res == QUtf8BaseTraits::EndOfString) {
-            int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(uc, cursor, src, end);
+            if (state->flags & QStringConverter::Flag::Stateless) {
            if (Q_LIKELY(res >= 0))
                continue;
            if (res == QUtf8BaseTraits::Error) {
                // encoding error
                ++state->invalidChars;
                cursor = appendReplacementChar(cursor);
-            } else if (res == QUtf8BaseTraits::EndOfString) {
+            } else {
-                if (state->flags & QStringConverter::Flag::Stateless) {
+                state->remainingChars = 1;
-                    ++state->invalidChars;
+                state->state_data[0] = uc;
                    cursor = appendReplacementChar(cursor);
                } else {
                    state->remainingChars = 1;
                    state->state_data[0] = uc;
                }
                return reinterpret_cast<char *>(cursor);
            }
-        } while (src < nextAscii);
+        }
-    }
+    });
    return reinterpret_cast<char *>(cursor);
 }
 char *QUtf8::convertFromLatin1(char *out, QLatin1StringView in)
@ -635,6 +634,21 @@ QString QUtf8::convertToUnicode(QByteArrayView in)
    QUtf8::convertToUnicode(QChar *, QByteArrayView) directly.
 */
 char16_t *QUtf8::convertToUnicode(char16_t *dst, QByteArrayView in) noexcept
 {
    // A leading UTF-8 byte order mark is not part of the text: drop it first.
    const auto bomView = QByteArrayView::fromArray(utf8bom);
    const bool startsWithBom = in.size() >= bomView.size() && in.first(bomView.size()) == bomView;
    if (startsWithBom)
        in.slice(sizeof(utf8bom));
    // Stateless error policy: emit U+FFFD for the bad sequence and keep going.
    const auto replaceAndContinue = [](char16_t *&out, ...) {
        *out++ = QChar::ReplacementCharacter;
        return true;        // tell the shared loop to continue decoding
    };
    return convertToUnicode(dst, in, replaceAndContinue);
 }
 template <typename OnErrorLambda> Q_ALWAYS_INLINE char16_t *
 QUtf8::convertToUnicode(char16_t *dst, QByteArrayView in, OnErrorLambda &&onError) noexcept
 {
    const uchar *const start = reinterpret_cast<const uchar *>(in.data());
    const uchar *src = start;
@ -642,29 +656,20 @@ char16_t *QUtf8::convertToUnicode(char16_t *dst, QByteArrayView in) noexcept
    // attempt to do a full decoding in SIMD
    const uchar *nextAscii = end;
-    if (!simdDecodeAscii(dst, nextAscii, src, end)) {
+    while (src < end) {
-        // at least one non-ASCII entry
+        nextAscii = end;
-        // check if we failed to decode the UTF-8 BOM; if so, skip it
+        if (simdDecodeAscii(dst, nextAscii, src, end))
-        if (Q_UNLIKELY(src == start)
+            break;
                && end - src >= 3
                && Q_UNLIKELY(src[0] == utf8bom[0] && src[1] == utf8bom[1] && src[2] == utf8bom[2])) {
            src += 3;
        }
-        while (src < end) {
+        do {
-            nextAscii = end;
+            uchar b = *src++;
-            if (simdDecodeAscii(dst, nextAscii, src, end))
+            const qsizetype res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, dst, src, end);
-                break;
+            if (Q_LIKELY(res >= 0))
-
+                continue;
-            do {
+            // decoding error
-                uchar b = *src++;
+            if (!onError(dst, src, res))
-                const qsizetype res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, dst, src, end);
+                return dst;
-                if (res < 0) {
+        } while (src < nextAscii);
                    // decoding error
                    *dst++ = QChar::ReplacementCharacter;
                }
            } while (src < nextAscii);
        }
    }
    return dst;
@ -702,7 +707,6 @@ char16_t *QUtf8::convertToUnicode(char16_t *dst, QByteArrayView in, QStringConve
        replacement = QChar::Null;
    qsizetype res;
    uchar ch = 0;
    const uchar *src = reinterpret_cast<const uchar *>(in.data());
    const uchar *end = src + len;
@ -754,19 +758,16 @@ char16_t *QUtf8::convertToUnicode(char16_t *dst, QByteArrayView in, QStringConve
    // main body, stateless decoding
    res = 0;
-    const uchar *nextAscii = src;
+    dst = convertToUnicode(dst, { src, end }, [&](char16_t *&dst, const uchar *src_, int res_) {
-    while (res >= 0 && src < end) {
+        res = res_;
-        if (src >= nextAscii && simdDecodeAscii(dst, nextAscii, src, end))
+        src = src_;
            break;
        ch = *src++;
        res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(ch, dst, src, end);
        if (res == QUtf8BaseTraits::Error) {
            res = 0;
            ++state->invalidChars;
            *dst++ = replacement;
        }
-    }
+        return res == 0;    // continue if plain decoding error
    });
    if (res == QUtf8BaseTraits::EndOfString) {
        // unterminated UTF sequence
--- a/src/corelib/text/qstringconverter_p.h
+++ b/src/corelib/text/qstringconverter_p.h
@ -334,6 +334,12 @@ struct QUtf8
                           Qt::CaseSensitivity cs = Qt::CaseSensitive);
    static int compareUtf8(QByteArrayView lhs, QByteArrayView rhs,
                           Qt::CaseSensitivity cs = Qt::CaseSensitive) noexcept;
 private:
    template <typename OnErrorLambda> static char *
    convertFromUnicode(char *out, QStringView in, OnErrorLambda &&onError) noexcept;
    template <typename OnErrorLambda> static char16_t *
    convertToUnicode(char16_t *dst, QByteArrayView in, OnErrorLambda &&onError) noexcept;
 };
 struct QUtf16