Clean up convert_ARGB_to_ARGB_PM_inplace_sse2

Change the function to follow the standard SIMD pattern so that it can use the ALIGNMENT_PROLOGUE_16BYTES and SIMD_EPILOGUE helpers. This should also improve performance, because the vector loop now uses aligned loads and stores instead of unaligned ones. Change-Id: I14a48b82e3f3de83bd7572aa82bed07f28ad944c Reviewed-by: Erik Verbruggen <erik.verbruggen@qt.io>
2016-11-16 16:25:11 +01:00 · 2016-11-16 16:25:11 +01:00 · 506aa694a9
commit 506aa694a9
parent 0382bb2ab8
1 changed files with 37 additions and 22 deletions
--- a/src/gui/image/qimage_sse2.cpp
+++ b/src/gui/image/qimage_sse2.cpp
@ -51,51 +51,66 @@ bool convert_ARGB_to_ARGB_PM_inplace_sse2(QImageData *data, Qt::ImageConversionF
 {
    Q_ASSERT(data->format == QImage::Format_ARGB32 || data->format == QImage::Format_RGBA8888);

-    // extra pixels on each line
-    const int spare = data->width & 3;
-    // width in pixels of the pad at the end of each line
-    const int pad = (data->bytes_per_line >> 2) - data->width;
-    const int iter = data->width >> 2;
-    int height = data->height;
+    const int width = data->width;
+    const int height = data->height;
+    const int bpl = data->bytes_per_line;

    const __m128i alphaMask = _mm_set1_epi32(0xff000000);
    const __m128i nullVector = _mm_setzero_si128();
    const __m128i half = _mm_set1_epi16(0x80);
    const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);

-    __m128i *d = reinterpret_cast<__m128i*>(data->data);
-    while (height--) {
-        const __m128i *end = d + iter;
-
-        for (; d != end; ++d) {
-            const __m128i srcVector = _mm_loadu_si128(d);
+    uchar *d = data->data;
+    for (int y = 0; y < height; ++y) {
+        int i = 0;
+        quint32 *d32 = reinterpret_cast<quint32 *>(d);
+        ALIGNMENT_PROLOGUE_16BYTES(d, i, width) {
+            const quint32 p = d32[i];
+            if (p <= 0x00ffffff)
+                d32[i] = 0;
+            else if (p < 0xff000000)
+                d32[i] = qPremultiply(p);
+        }
+        __m128i *d128 = reinterpret_cast<__m128i *>(d32 + i);
+        for (; i < (width - 3); i += 4) {
+            const __m128i srcVector = _mm_load_si128(d128);
+#ifdef __SSE4_1__
+            if (_mm_testc_si128(srcVector, alphaMask)) {
+                // opaque, data is unchanged
+            } else if (_mm_testz_si128(srcVector, alphaMask)) {
+                // fully transparent
+                _mm_store_si128(d128, nullVector);
+            } else {
+                const __m128i srcVectorAlpha = _mm_and_si128(srcVector, alphaMask);
+#else
            const __m128i srcVectorAlpha = _mm_and_si128(srcVector, alphaMask);
            if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, alphaMask)) == 0xffff) {
                // opaque, data is unchanged
            } else if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, nullVector)) == 0xffff) {
                // fully transparent
-                _mm_storeu_si128(d, nullVector);
+                _mm_store_si128(d128, nullVector);
            } else {
+#endif
                __m128i alphaChannel = _mm_srli_epi32(srcVector, 24);
                alphaChannel = _mm_or_si128(alphaChannel, _mm_slli_epi32(alphaChannel, 16));

                __m128i result;
                BYTE_MUL_SSE2(result, srcVector, alphaChannel, colorMask, half);
                result = _mm_or_si128(_mm_andnot_si128(alphaMask, result), srcVectorAlpha);
-                _mm_storeu_si128(d, result);
+                _mm_store_si128(d128, result);
            }
+            d128++;
        }

-        QRgb *p = reinterpret_cast<QRgb*>(d);
-        QRgb *pe = p+spare;
-        for (; p != pe; ++p) {
-            if (*p < 0x00ffffff)
-                *p = 0;
-            else if (*p < 0xff000000)
-                *p = qPremultiply(*p);
+        SIMD_EPILOGUE(i, width, 3) {
+            const quint32 p = d32[i];
+            if (p <= 0x00ffffff)
+                d32[i] = 0;
+            else if (p < 0xff000000)
+                d32[i] = qPremultiply(p);
        }

-        d = reinterpret_cast<__m128i*>(p+pad);
+        d += bpl;
    }

    if (data->format == QImage::Format_ARGB32)