From da30e402f38a434f856fa8670a8813c3cffe6440 Mon Sep 17 00:00:00 2001 From: Allan Sandfeld Jensen Date: Tue, 4 May 2021 13:03:50 +0200 Subject: [PATCH] Add SIMD optimizations for color-transform writes Add NEON for RGB32 and RGBA64 writeback, and SSE2 for RGBA64 writeback. Change-Id: Id9ee803267a78f5bdff5beaa719e7a59c1dbb9fb Reviewed-by: Qt CI Bot Reviewed-by: Thiago Macieira Reviewed-by: Lars Knoll --- src/gui/painting/qcolortransform.cpp | 246 ++++++++++++++---- .../painting/qcolorspace/tst_qcolorspace.cpp | 72 +++++ 2 files changed, 266 insertions(+), 52 deletions(-) diff --git a/src/gui/painting/qcolortransform.cpp b/src/gui/painting/qcolortransform.cpp index 116a6c0ec78..ff8261989e5 100644 --- a/src/gui/painting/qcolortransform.cpp +++ b/src/gui/painting/qcolortransform.cpp @@ -299,6 +299,7 @@ static void applyMatrix(QColorVector *buffer, const qsizetype len, const QColorM #endif } +#if defined(__SSE2__) || defined(__ARM_NEON__) template static constexpr inline bool isArgb(); template<> @@ -306,6 +307,16 @@ constexpr inline bool isArgb() { return true; } template<> constexpr inline bool isArgb() { return false; } +template +static inline int getAlpha(const T &p); +template<> +inline int getAlpha(const QRgb &p) +{ return qAlpha(p); } +template<> +inline int getAlpha(const QRgba64 &p) +{ return p.alpha(); } +#endif + template static void loadPremultiplied(QColorVector *buffer, const T *src, const qsizetype len, const QColorTransformPrivate *d_ptr); template @@ -466,7 +477,7 @@ static void loadPremultiplied(QColorVector *buffer, const T *src, const qsizetyp vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vAlphaMask)); // LUT - v = vcvtq_u32_f32(vmulq_n_f32(vf, 4080.f)); + v = vcvtq_u32_f32(vaddq_f32(vmulq_n_f32(vf, 4080.f), vdupq_n_f32(0.5f))); const int ridx = isARGB ? vgetq_lane_u32(v, 2) : vgetq_lane_u32(v, 0); const int gidx = vgetq_lane_u32(v, 1); const int bidx = isARGB ? vgetq_lane_u32(v, 0) : vgetq_lane_u32(v, 2); @@ -581,33 +592,205 @@ void loadUnpremultiplied(QColorVector *buffer, const QRgba64 *src, cons } #endif -static void storePremultiplied(QRgb *dst, const QRgb *src, const QColorVector *buffer, const qsizetype len, +#if defined(__SSE2__) +template +static inline void storeP(T &p, const __m128i &v); +template<> +inline void storeP(QRgb &p, const __m128i &v) +{ + p = _mm_cvtsi128_si32(_mm_packus_epi16(v, v)); +} +template<> +inline void storeP(QRgba64 &p, const __m128i &v) +{ + _mm_storel_epi64((__m128i *)&p, v); +} + +template +static void storePremultiplied(T *dst, const T *src, const QColorVector *buffer, const qsizetype len, const QColorTransformPrivate *d_ptr) { -#if defined(__SSE2__) const __m128 v4080 = _mm_set1_ps(4080.f); const __m128 iFF00 = _mm_set1_ps(1.0f / (255 * 256)); + constexpr bool isARGB = isArgb(); for (qsizetype i = 0; i < len; ++i) { - const int a = qAlpha(src[i]); + const int a = getAlpha(src[i]); __m128 vf = _mm_loadu_ps(&buffer[i].x); __m128i v = _mm_cvtps_epi32(_mm_mul_ps(vf, v4080)); - __m128 va = _mm_set1_ps(a); - va = _mm_mul_ps(va, iFF00); + __m128 va = _mm_mul_ps(_mm_set1_ps(a), iFF00); const int ridx = _mm_extract_epi16(v, 0); const int gidx = _mm_extract_epi16(v, 2); const int bidx = _mm_extract_epi16(v, 4); - v = _mm_insert_epi16(v, d_ptr->colorSpaceOut->lut[0]->m_fromLinear[ridx], 4); + v = _mm_insert_epi16(v, d_ptr->colorSpaceOut->lut[0]->m_fromLinear[ridx], isARGB ? 4 : 0); v = _mm_insert_epi16(v, d_ptr->colorSpaceOut->lut[1]->m_fromLinear[gidx], 2); - v = _mm_insert_epi16(v, d_ptr->colorSpaceOut->lut[2]->m_fromLinear[bidx], 0); + v = _mm_insert_epi16(v, d_ptr->colorSpaceOut->lut[2]->m_fromLinear[bidx], isARGB ? 0 : 4); vf = _mm_cvtepi32_ps(v); vf = _mm_mul_ps(vf, va); v = _mm_cvtps_epi32(vf); v = _mm_packs_epi32(v, v); v = _mm_insert_epi16(v, a, 3); - v = _mm_packus_epi16(v, v); - dst[i] = _mm_cvtsi128_si32(v); + storeP(dst[i], v); } +} + +template +static inline void storePU(T &p, __m128i &v, int a); +template<> +inline void storePU(QRgb &p, __m128i &v, int a) +{ + v = _mm_add_epi16(v, _mm_set1_epi16(0x80)); + v = _mm_srli_epi16(v, 8); + v = _mm_insert_epi16(v, a, 3); + p = _mm_cvtsi128_si32(_mm_packus_epi16(v, v)); +} +template<> +inline void storePU(QRgba64 &p, __m128i &v, int a) +{ + v = _mm_add_epi16(v, _mm_srli_epi16(v, 8)); + v = _mm_insert_epi16(v, a, 3); + _mm_storel_epi64((__m128i *)&p, v); +} + +template +static void storeUnpremultiplied(T *dst, const T *src, const QColorVector *buffer, const qsizetype len, + const QColorTransformPrivate *d_ptr) +{ + const __m128 v4080 = _mm_set1_ps(4080.f); + constexpr bool isARGB = isArgb(); + for (qsizetype i = 0; i < len; ++i) { + const int a = getAlpha(src[i]); + __m128 vf = _mm_loadu_ps(&buffer[i].x); + __m128i v = _mm_cvtps_epi32(_mm_mul_ps(vf, v4080)); + const int ridx = _mm_extract_epi16(v, 0); + const int gidx = _mm_extract_epi16(v, 2); + const int bidx = _mm_extract_epi16(v, 4); + v = _mm_setzero_si128(); + v = _mm_insert_epi16(v, d_ptr->colorSpaceOut->lut[0]->m_fromLinear[ridx], isARGB ? 2 : 0); + v = _mm_insert_epi16(v, d_ptr->colorSpaceOut->lut[1]->m_fromLinear[gidx], 1); + v = _mm_insert_epi16(v, d_ptr->colorSpaceOut->lut[2]->m_fromLinear[bidx], isARGB ? 0 : 2); + storePU(dst[i], v, a); + } +} + +template +static void storeOpaque(T *dst, const T *src, const QColorVector *buffer, const qsizetype len, + const QColorTransformPrivate *d_ptr) +{ + Q_UNUSED(src); + const __m128 v4080 = _mm_set1_ps(4080.f); + constexpr bool isARGB = isArgb(); + for (qsizetype i = 0; i < len; ++i) { + __m128 vf = _mm_loadu_ps(&buffer[i].x); + __m128i v = _mm_cvtps_epi32(_mm_mul_ps(vf, v4080)); + const int ridx = _mm_extract_epi16(v, 0); + const int gidx = _mm_extract_epi16(v, 2); + const int bidx = _mm_extract_epi16(v, 4); + v = _mm_setzero_si128(); + v = _mm_insert_epi16(v, d_ptr->colorSpaceOut->lut[0]->m_fromLinear[ridx], isARGB ? 2 : 0); + v = _mm_insert_epi16(v, d_ptr->colorSpaceOut->lut[1]->m_fromLinear[gidx], 1); + v = _mm_insert_epi16(v, d_ptr->colorSpaceOut->lut[2]->m_fromLinear[bidx], isARGB ? 0 : 2); + storePU(dst[i], v, isARGB ? 255 : 0xffff); + } +} +#elif defined(__ARM_NEON__) +template +static inline void storeP(T &p, const uint16x4_t &v); +template<> +inline void storeP(QRgb &p, const uint16x4_t &v) +{ + p = vget_lane_u32(vreinterpret_u32_u8(vmovn_u16(vcombine_u16(v, v))), 0); +} +template<> +inline void storeP(QRgba64 &p, const uint16x4_t &v) +{ + vst1_u16((uint16_t *)&p, v); +} + +template +static void storePremultiplied(T *dst, const T *src, const QColorVector *buffer, const qsizetype len, + const QColorTransformPrivate *d_ptr) +{ + const float iFF00 = 1.0f / (255 * 256); + constexpr bool isARGB = isArgb(); + for (qsizetype i = 0; i < len; ++i) { + const int a = getAlpha(src[i]); + float32x4_t vf = vld1q_f32(&buffer[i].x); + uint32x4_t v = vcvtq_u32_f32(vaddq_f32(vmulq_n_f32(vf, 4080.f), vdupq_n_f32(0.5f))); + const int ridx = vgetq_lane_u32(v, 0); + const int gidx = vgetq_lane_u32(v, 1); + const int bidx = vgetq_lane_u32(v, 2); + v = vsetq_lane_u32(d_ptr->colorSpaceOut->lut[0]->m_fromLinear[ridx], v, isARGB ? 2 : 0); + v = vsetq_lane_u32(d_ptr->colorSpaceOut->lut[1]->m_fromLinear[gidx], v, 1); + v = vsetq_lane_u32(d_ptr->colorSpaceOut->lut[2]->m_fromLinear[bidx], v, isARGB ? 0 : 2); + vf = vcvtq_f32_u32(v); + vf = vmulq_n_f32(vf, a * iFF00); + vf = vaddq_f32(vf, vdupq_n_f32(0.5f)); + v = vcvtq_u32_f32(vf); + uint16x4_t v16 = vmovn_u32(v); + v16 = vset_lane_u16(a, v16, 3); + storeP(dst[i], v16); + } +} + +template +static inline void storePU(T &p, uint16x4_t &v, int a); +template<> +inline void storePU(QRgb &p, uint16x4_t &v, int a) +{ + v = vadd_u16(v, vdup_n_u16(0x80)); + v = vshr_n_u16(v, 8); + v = vset_lane_u16(a, v, 3); + p = vget_lane_u32(vreinterpret_u32_u8(vmovn_u16(vcombine_u16(v, v))), 0); +} +template<> +inline void storePU(QRgba64 &p, uint16x4_t &v, int a) +{ + v = vadd_u16(v, vshr_n_u16(v, 8)); + v = vset_lane_u16(a, v, 3); + vst1_u16((uint16_t *)&p, v); +} + +template +static void storeUnpremultiplied(T *dst, const T *src, const QColorVector *buffer, const qsizetype len, + const QColorTransformPrivate *d_ptr) +{ + constexpr bool isARGB = isArgb(); + for (qsizetype i = 0; i < len; ++i) { + const int a = getAlpha(src[i]); + float32x4_t vf = vld1q_f32(&buffer[i].x); + uint16x4_t v = vmovn_u32(vcvtq_u32_f32(vaddq_f32(vmulq_n_f32(vf, 4080.f), vdupq_n_f32(0.5f)))); + const int ridx = vget_lane_u16(v, 0); + const int gidx = vget_lane_u16(v, 1); + const int bidx = vget_lane_u16(v, 2); + v = vset_lane_u16(d_ptr->colorSpaceOut->lut[0]->m_fromLinear[ridx], v, isARGB ? 2 : 0); + v = vset_lane_u16(d_ptr->colorSpaceOut->lut[1]->m_fromLinear[gidx], v, 1); + v = vset_lane_u16(d_ptr->colorSpaceOut->lut[2]->m_fromLinear[bidx], v, isARGB ? 0 : 2); + storePU(dst[i], v, a); + } +} + +template +static void storeOpaque(T *dst, const T *src, const QColorVector *buffer, const qsizetype len, + const QColorTransformPrivate *d_ptr) +{ + Q_UNUSED(src); + constexpr bool isARGB = isArgb(); + for (qsizetype i = 0; i < len; ++i) { + float32x4_t vf = vld1q_f32(&buffer[i].x); + uint16x4_t v = vmovn_u32(vcvtq_u32_f32(vaddq_f32(vmulq_n_f32(vf, 4080.f), vdupq_n_f32(0.5f)))); + const int ridx = vget_lane_u16(v, 0); + const int gidx = vget_lane_u16(v, 1); + const int bidx = vget_lane_u16(v, 2); + v = vset_lane_u16(d_ptr->colorSpaceOut->lut[0]->m_fromLinear[ridx], v, isARGB ? 2 : 0); + v = vset_lane_u16(d_ptr->colorSpaceOut->lut[1]->m_fromLinear[gidx], v, 1); + v = vset_lane_u16(d_ptr->colorSpaceOut->lut[2]->m_fromLinear[bidx], v, isARGB ? 0 : 2); + storePU(dst[i], v, isARGB ? 255 : 0xffff); + } +} #else +static void storePremultiplied(QRgb *dst, const QRgb *src, const QColorVector *buffer, const qsizetype len, + const QColorTransformPrivate *d_ptr) +{ for (qsizetype i = 0; i < len; ++i) { const int a = qAlpha(src[i]); const float fa = a / (255.0f * 256.0f); @@ -616,71 +799,29 @@ static void storePremultiplied(QRgb *dst, const QRgb *src, const QColorVector *b const float b = d_ptr->colorSpaceOut->lut[2]->m_fromLinear[int(buffer[i].z * 4080.0f + 0.5f)]; dst[i] = qRgba(r * fa + 0.5f, g * fa + 0.5f, b * fa + 0.5f, a); } -#endif } static void storeUnpremultiplied(QRgb *dst, const QRgb *src, const QColorVector *buffer, const qsizetype len, const QColorTransformPrivate *d_ptr) { -#if defined(__SSE2__) - const __m128 v4080 = _mm_set1_ps(4080.f); - for (qsizetype i = 0; i < len; ++i) { - const int a = qAlpha(src[i]); - __m128 vf = _mm_loadu_ps(&buffer[i].x); - __m128i v = _mm_cvtps_epi32(_mm_mul_ps(vf, v4080)); - const int ridx = _mm_extract_epi16(v, 0); - const int gidx = _mm_extract_epi16(v, 2); - const int bidx = _mm_extract_epi16(v, 4); - v = _mm_setzero_si128(); - v = _mm_insert_epi16(v, d_ptr->colorSpaceOut->lut[0]->m_fromLinear[ridx], 2); - v = _mm_insert_epi16(v, d_ptr->colorSpaceOut->lut[1]->m_fromLinear[gidx], 1); - v = _mm_insert_epi16(v, d_ptr->colorSpaceOut->lut[2]->m_fromLinear[bidx], 0); - v = _mm_add_epi16(v, _mm_set1_epi16(0x80)); - v = _mm_srli_epi16(v, 8); - v = _mm_insert_epi16(v, a, 3); - v = _mm_packus_epi16(v, v); - dst[i] = _mm_cvtsi128_si32(v); - } -#else for (qsizetype i = 0; i < len; ++i) { const int r = d_ptr->colorSpaceOut->lut[0]->u8FromLinearF32(buffer[i].x); const int g = d_ptr->colorSpaceOut->lut[1]->u8FromLinearF32(buffer[i].y); const int b = d_ptr->colorSpaceOut->lut[2]->u8FromLinearF32(buffer[i].z); dst[i] = (src[i] & 0xff000000) | (r << 16) | (g << 8) | (b << 0); } -#endif } static void storeOpaque(QRgb *dst, const QRgb *src, const QColorVector *buffer, const qsizetype len, const QColorTransformPrivate *d_ptr) { Q_UNUSED(src); -#if defined(__SSE2__) - const __m128 v4080 = _mm_set1_ps(4080.f); - for (qsizetype i = 0; i < len; ++i) { - __m128 vf = _mm_loadu_ps(&buffer[i].x); - __m128i v = _mm_cvtps_epi32(_mm_mul_ps(vf, v4080)); - const int ridx = _mm_extract_epi16(v, 0); - const int gidx = _mm_extract_epi16(v, 2); - const int bidx = _mm_extract_epi16(v, 4); - v = _mm_setzero_si128(); - v = _mm_insert_epi16(v, d_ptr->colorSpaceOut->lut[0]->m_fromLinear[ridx], 2); - v = _mm_insert_epi16(v, d_ptr->colorSpaceOut->lut[1]->m_fromLinear[gidx], 1); - v = _mm_insert_epi16(v, d_ptr->colorSpaceOut->lut[2]->m_fromLinear[bidx], 0); - v = _mm_add_epi16(v, _mm_set1_epi16(0x80)); - v = _mm_srli_epi16(v, 8); - v = _mm_insert_epi16(v, 255, 3); - v = _mm_packus_epi16(v, v); - dst[i] = _mm_cvtsi128_si32(v); - } -#else for (qsizetype i = 0; i < len; ++i) { const int r = d_ptr->colorSpaceOut->lut[0]->u8FromLinearF32(buffer[i].x); const int g = d_ptr->colorSpaceOut->lut[1]->u8FromLinearF32(buffer[i].y); const int b = d_ptr->colorSpaceOut->lut[2]->u8FromLinearF32(buffer[i].z); dst[i] = 0xff000000 | (r << 16) | (g << 8) | (b << 0); } -#endif } static void storePremultiplied(QRgba64 *dst, const QRgba64 *src, const QColorVector *buffer, const qsizetype len, @@ -718,6 +859,7 @@ static void storeOpaque(QRgba64 *dst, const QRgba64 *src, const QColorVector *bu dst[i] = qRgba64(r, g, b, 0xFFFF); } } +#endif static void storeGray(quint8 *dst, const QRgb *src, const QColorVector *buffer, const qsizetype len, const QColorTransformPrivate *d_ptr) diff --git a/tests/auto/gui/painting/qcolorspace/tst_qcolorspace.cpp b/tests/auto/gui/painting/qcolorspace/tst_qcolorspace.cpp index 0c8536d3851..913fe55364f 100644 --- a/tests/auto/gui/painting/qcolorspace/tst_qcolorspace.cpp +++ b/tests/auto/gui/painting/qcolorspace/tst_qcolorspace.cpp @@ -61,6 +61,8 @@ private slots: void imageConversion(); void imageConversion64_data(); void imageConversion64(); + void imageConversion64PM_data(); + void imageConversion64PM(); void imageConversionOverLargerGamut_data(); void imageConversionOverLargerGamut(); @@ -353,6 +355,76 @@ void tst_QColorSpace::imageConversion64() } } +void tst_QColorSpace::imageConversion64PM_data() +{ + imageConversion64_data(); +} + +void tst_QColorSpace::imageConversion64PM() +{ + QFETCH(QColorSpace::NamedColorSpace, fromColorSpace); + QFETCH(QColorSpace::NamedColorSpace, toColorSpace); + + QImage testImage(256, 16, QImage::Format_RGBA64_Premultiplied); + + for (int j = 0; j < 16; ++j) { + int a = j * 15; + for (int i = 0; i < 256; ++i) + testImage.setPixel(i, j, qPremultiply(qRgba(i, i, i, a))); + } + + testImage.setColorSpace(fromColorSpace); + QCOMPARE(testImage.colorSpace(), QColorSpace(fromColorSpace)); + + testImage.convertToColorSpace(toColorSpace); + QCOMPARE(testImage.colorSpace(), QColorSpace(toColorSpace)); + + int lastRed = 0; + int lastGreen = 0; + int lastBlue = 0; + for (int j = 0; j < 16; ++j) { + for (int i = 0; i < 256; ++i) { + QRgb p = testImage.pixel(i, j); + QVERIFY(qRed(p) >= lastRed); + QVERIFY(qGreen(p) >= lastGreen); + QVERIFY(qBlue(p) >= lastBlue); + QCOMPARE(qAlpha(p), j * 15); + lastRed = qRed(p); + lastGreen = qGreen(p); + lastBlue = qBlue(p); + } + QVERIFY(lastRed <= j * 15); + QVERIFY(lastGreen <= j * 15); + QVERIFY(lastBlue <= j * 15); + lastRed = 0; + lastGreen = 0; + lastBlue = 0; + } + + testImage.convertToColorSpace(fromColorSpace); + QCOMPARE(testImage.colorSpace(), QColorSpace(fromColorSpace)); + for (int j = 0; j < 16; ++j) { + for (int i = 0; i < 256; ++i) { + QRgb p = testImage.pixel(i, j); + QCOMPARE(qRed(p), qGreen(p)); + QCOMPARE(qRed(p), qBlue(p)); + QCOMPARE(qAlpha(p), j * 15); + QVERIFY((lastRed - qRed(p)) <= 0); + QVERIFY((lastGreen - qGreen(p)) <= 0); + QVERIFY((lastBlue - qBlue(p)) <= 0); + lastRed = qRed(p); + lastGreen = qGreen(p); + lastBlue = qBlue(p); + } + QVERIFY(lastRed <= j * 15); + QVERIFY(lastGreen <= j * 15); + QVERIFY(lastBlue <= j * 15); + lastRed = 0; + lastGreen = 0; + lastBlue = 0; + } +} + void tst_QColorSpace::imageConversionOverLargerGamut_data() { QTest::addColumn("fromColorSpace");