QUtf8: merge the convert{To,From}Unicode main bodies

Instead of duplicating the loop in each overload, centralize it in a template function that calls out to a lambda to handle error conditions; the stateful overloads use that lambda to record the error in the converter state. Change-Id: If2730356303f721bc3bbfffd2640666da8d65f1d Reviewed-by: Allan Sandfeld Jensen <allan.jensen@qt.io>
2024-10-03 20:16:34 -07:00 · 2024-10-03 20:16:34 -07:00 · 2fcb905fef
commit 2fcb905fef
parent 2553d9709f
2 changed files with 73 additions and 66 deletions
--- a/src/corelib/text/qstringconverter.cpp
+++ b/src/corelib/text/qstringconverter.cpp
@ -472,13 +472,12 @@ static void simdCompareAscii(const qchar8_t *&, const qchar8_t *, const char16_t

 enum { HeaderDone = 1 };

-QByteArray QUtf8::convertFromUnicode(QStringView in)
+template <typename OnErrorLambda> Q_ALWAYS_INLINE
+char *QUtf8::convertFromUnicode(char *out, QStringView in, OnErrorLambda &&onError) noexcept
 {
    qsizetype len = in.size();

-    // create a QByteArray with the worst case scenario size
-    QByteArray result(len * 3, Qt::Uninitialized);
-    uchar *dst = reinterpret_cast<uchar *>(const_cast<char *>(result.constData()));
+    uchar *dst = reinterpret_cast<uchar *>(out);
    const char16_t *src = reinterpret_cast<const char16_t *>(in.data());
    const char16_t *const end = src + len;

@ -490,14 +489,27 @@ QByteArray QUtf8::convertFromUnicode(QStringView in)
        do {
            char16_t u = *src++;
            int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(u, dst, src, end);
-            if (res < 0) {
-                // encoding error - append '?'
-                *dst++ = '?';
-            }
+            if (Q_UNLIKELY(res < 0))
+                onError(dst, u, res);
        } while (src < nextAscii);
    }

-    result.truncate(dst - reinterpret_cast<uchar *>(const_cast<char *>(result.constData())));
+    return reinterpret_cast<char *>(dst);
+}
+
+QByteArray QUtf8::convertFromUnicode(QStringView in)
+{
+    qsizetype len = in.size();
+
+    // create a QByteArray with the worst case scenario size (3 bytes per UTF-16 unit)
+    QByteArray result(len * 3, Qt::Uninitialized);
+    char *dst = const_cast<char *>(result.constData());
+    dst = convertFromUnicode(dst, in, [](auto *&dst, ...) {
+        // encoding error - append '?'; dst is taken by reference so the advance reaches the caller
+        *dst++ = '?';
+    });
+
+    result.truncate(dst - result.constData());
     return result;
 }

@ -548,35 +560,22 @@ char *QUtf8::convertFromUnicode(char *out, QStringView in, QStringConverter::Sta
        }
    }

-    while (src != end) {
-        const char16_t *nextAscii = end;
-        if (simdEncodeAscii(cursor, nextAscii, src, end))
-            break;
-
-        do {
-            char16_t uc = *src++;
-            int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(uc, cursor, src, end);
-            if (Q_LIKELY(res >= 0))
-                continue;
-
-            if (res == QUtf8BaseTraits::Error) {
-                // encoding error
+    out = reinterpret_cast<char *>(cursor);
+    return convertFromUnicode(out, { src, end }, [&](uchar *&cursor, char16_t uc, int res) {
+        if (res == QUtf8BaseTraits::Error) {
+            // encoding error
+            ++state->invalidChars;
+            cursor = appendReplacementChar(cursor);
+        } else if (res == QUtf8BaseTraits::EndOfString) {
+            if (state->flags & QStringConverter::Flag::Stateless) {
                ++state->invalidChars;
                cursor = appendReplacementChar(cursor);
-            } else if (res == QUtf8BaseTraits::EndOfString) {
-                if (state->flags & QStringConverter::Flag::Stateless) {
-                    ++state->invalidChars;
-                    cursor = appendReplacementChar(cursor);
-                } else {
-                    state->remainingChars = 1;
-                    state->state_data[0] = uc;
-                }
-                return reinterpret_cast<char *>(cursor);
+            } else {
+                state->remainingChars = 1;
+                state->state_data[0] = uc;
            }
-        } while (src < nextAscii);
-    }
-
-    return reinterpret_cast<char *>(cursor);
+        }
+    });
 }

 char *QUtf8::convertFromLatin1(char *out, QLatin1StringView in)
@ -635,6 +634,21 @@ QString QUtf8::convertToUnicode(QByteArrayView in)
    QUtf8::convertToUnicode(QChar *, QByteArrayView) directly.
 */
 char16_t *QUtf8::convertToUnicode(char16_t *dst, QByteArrayView in) noexcept
+{
+    // check if we have to skip a leading UTF-8 BOM before decoding
+    auto bom = QByteArrayView::fromArray(utf8bom);
+    if (in.size() >= bom.size() && in.first(bom.size()) == bom)
+        in.slice(sizeof(utf8bom));
+
+    return convertToUnicode(dst, in, [](char16_t *&dst, ...) {
+        // decoding error: write the replacement character to the output
+        *dst++ = QChar::ReplacementCharacter;
+        return true;        // continue decoding
+    });
+}
+
+template <typename OnErrorLambda> Q_ALWAYS_INLINE char16_t *
+QUtf8::convertToUnicode(char16_t *dst, QByteArrayView in, OnErrorLambda &&onError) noexcept
 {
    const uchar *const start = reinterpret_cast<const uchar *>(in.data());
    const uchar *src = start;
@ -642,29 +656,20 @@ char16_t *QUtf8::convertToUnicode(char16_t *dst, QByteArrayView in) noexcept

    // attempt to do a full decoding in SIMD
    const uchar *nextAscii = end;
-    if (!simdDecodeAscii(dst, nextAscii, src, end)) {
-        // at least one non-ASCII entry
-        // check if we failed to decode the UTF-8 BOM; if so, skip it
-        if (Q_UNLIKELY(src == start)
-                && end - src >= 3
-                && Q_UNLIKELY(src[0] == utf8bom[0] && src[1] == utf8bom[1] && src[2] == utf8bom[2])) {
-            src += 3;
-        }
+    while (src < end) {
+        nextAscii = end;
+        if (simdDecodeAscii(dst, nextAscii, src, end))
+            break;

-        while (src < end) {
-            nextAscii = end;
-            if (simdDecodeAscii(dst, nextAscii, src, end))
-                break;
-
-            do {
-                uchar b = *src++;
-                const qsizetype res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, dst, src, end);
-                if (res < 0) {
-                    // decoding error
-                    *dst++ = QChar::ReplacementCharacter;
-                }
-            } while (src < nextAscii);
-        }
+        do {
+            uchar b = *src++;
+            const qsizetype res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, dst, src, end);
+            if (Q_LIKELY(res >= 0))
+                continue;
+            // decoding error
+            if (!onError(dst, src, res))
+                return dst;
+        } while (src < nextAscii);
    }

    return dst;
@ -702,7 +707,6 @@ char16_t *QUtf8::convertToUnicode(char16_t *dst, QByteArrayView in, QStringConve
        replacement = QChar::Null;

    qsizetype res;
-    uchar ch = 0;

    const uchar *src = reinterpret_cast<const uchar *>(in.data());
    const uchar *end = src + len;
@ -754,19 +758,16 @@ char16_t *QUtf8::convertToUnicode(char16_t *dst, QByteArrayView in, QStringConve

    // main body, stateless decoding
    res = 0;
-    const uchar *nextAscii = src;
-    while (res >= 0 && src < end) {
-        if (src >= nextAscii && simdDecodeAscii(dst, nextAscii, src, end))
-            break;
-
-        ch = *src++;
-        res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(ch, dst, src, end);
+    dst = convertToUnicode(dst, { src, end }, [&](char16_t *&dst, const uchar *src_, int res_) {
+        res = res_;
+        src = src_;
        if (res == QUtf8BaseTraits::Error) {
            res = 0;
            ++state->invalidChars;
            *dst++ = replacement;
        }
-    }
+        return res == 0;    // continue if plain decoding error
+    });

    if (res == QUtf8BaseTraits::EndOfString) {
        // unterminated UTF sequence
--- a/src/corelib/text/qstringconverter_p.h
+++ b/src/corelib/text/qstringconverter_p.h
@ -334,6 +334,12 @@ struct QUtf8
                           Qt::CaseSensitivity cs = Qt::CaseSensitive);
    static int compareUtf8(QByteArrayView lhs, QByteArrayView rhs,
                           Qt::CaseSensitivity cs = Qt::CaseSensitive) noexcept;
+
+private:
+    template <typename OnErrorLambda> static char *
+    convertFromUnicode(char *out, QStringView in, OnErrorLambda &&onError) noexcept;
+    template <typename OnErrorLambda> static char16_t *
+    convertToUnicode(char16_t *dst, QByteArrayView in, OnErrorLambda &&onError) noexcept;
 };

 struct QUtf16