Improve the code generation for the Latin1 codec

This change does not modify the actual algorithm implemented. It only updates the source code so that the compiler generates better-optimized code:

 - change only one variable per loop (the "offset" variable)
 - unroll the tail expansion of the last 15 characters

The Neon code for the toLatin1 codec most likely benefits from the unrolling of the tail too, but I can't verify that I haven't broken anything.

Change-Id: I8a92fd3c1aa700e6f8b0c8ebdb1978ade394757f
Reviewed-by: Olivier Goffart <ogoffart@woboq.com>
2014-01-16 15:25:50 -08:00 · 2014-01-16 15:25:50 -08:00 · f7308e007e
commit f7308e007e
parent ab3637dd67
1 changed file with 73 additions and 58 deletions
--- a/src/corelib/tools/qstring.cpp
+++ b/src/corelib/tools/qstring.cpp
@ -190,6 +190,16 @@ template <uint MaxCount> struct UnrollTailLoop
        return UnrollTailLoop<MaxCount - 1>::exec(count - 1, returnIfExited, loopCheck, returnIfFailed, i + 1);
    }
    template <typename Functor>
    static inline void exec(int count, Functor code)
    {
        /* equivalent to:
         *   for (int i = 0; i < count; ++i)
         *       code(i);
         */
        exec(count, 0, [=](int i) -> bool { code(i); return false; }, [](int) { return 0; });
    }
 };
 template <> template <typename RetType, typename Functor1, typename Functor2>
 inline RetType UnrollTailLoop<0>::exec(int, RetType returnIfExited, Functor1, Functor2, int)
@ -207,25 +217,29 @@ static void qt_from_latin1(ushort *dst, const char *str, size_t size)
     * The same method gives no improvement with NEON.
     */
 #if defined(__SSE2__)
-    if (size >= 16) {
+    const char *e = str + size;
-        int chunkCount = size >> 4; // divided by 16
+    qptrdiff offset = 0;
    // we're going to read str[offset..offset+15] (16 bytes)
    for ( ; str + offset + 15 < e; offset += 16) {
        const __m128i nullMask = _mm_set1_epi32(0);
-        for (int i = 0; i < chunkCount; ++i) {
+        const __m128i chunk = _mm_loadu_si128((__m128i*)(str + offset)); // load
            const __m128i chunk = _mm_loadu_si128((__m128i*)str); // load
            str += 16;
        // unpack the first 8 bytes, padding with zeros
        const __m128i firstHalf = _mm_unpacklo_epi8(chunk, nullMask);
-            _mm_storeu_si128((__m128i*)dst, firstHalf); // store
+        _mm_storeu_si128((__m128i*)(dst + offset), firstHalf); // store
            dst += 8;
        // unpack the last 8 bytes, padding with zeros
        const __m128i secondHalf = _mm_unpackhi_epi8 (chunk, nullMask);
-            _mm_storeu_si128((__m128i*)dst, secondHalf); // store
+        _mm_storeu_si128((__m128i*)(dst + offset + 8), secondHalf); // store
            dst += 8;
    }
    size = size % 16;
-    }
+    dst += offset;
    str += offset;
 #  ifdef Q_COMPILER_LAMBDA
    return UnrollTailLoop<15>::exec(size, [=](int i) { dst[i] = (uchar)str[i]; });
 #  endif
 #endif
 #if defined(__mips_dsp)
    if (size > 20)
@ -295,28 +309,30 @@ static inline __m128i mergeQuestionMarks(__m128i chunk)
 static void qt_to_latin1(uchar *dst, const ushort *src, int length)
 {
    if (length) {
 #if defined(__SSE2__)
-        if (length >= 16) {
+    uchar *e = dst + length;
-            const int chunkCount = length >> 4; // divided by 16
+    qptrdiff offset = 0;
-            for (int i = 0; i < chunkCount; ++i) {
+    // we're going to write to dst[offset..offset+15] (16 bytes)
-                __m128i chunk1 = _mm_loadu_si128((__m128i*)src); // load
+    for ( ; dst + offset + 15 < e; offset += 16) {
        __m128i chunk1 = _mm_loadu_si128((__m128i*)(src + offset)); // load
        chunk1 = mergeQuestionMarks(chunk1);
                src += 8;
-                __m128i chunk2 = _mm_loadu_si128((__m128i*)src); // load
+        __m128i chunk2 = _mm_loadu_si128((__m128i*)(src + offset + 8)); // load
        chunk2 = mergeQuestionMarks(chunk2);
                src += 8;
        // pack the two vector to 16 x 8bits elements
        const __m128i result = _mm_packus_epi16(chunk1, chunk2);
        _mm_storeu_si128((__m128i*)(dst + offset), result); // store
    }
                _mm_storeu_si128((__m128i*)dst, result); // store
                dst += 16;
            }
    length = length % 16;
-        }
+    dst += offset;
    src += offset;
 #  ifdef Q_COMPILER_LAMBDA
    return UnrollTailLoop<15>::exec(length, [=](int i) { dst[i] = (src[i]>0xff) ? '?' : (uchar) src[i]; });
 #  endif
 #elif defined(__ARM_NEON__)
    // Refer to the documentation of the SSE2 implementation
    // this use eactly the same method as for SSE except:
@ -349,7 +365,6 @@ static void qt_to_latin1(uchar *dst, const ushort *src, int length)
        ++src;
    }
 #endif
    }
 }
 // Unicode case-insensitive comparison