From f9185516eb04907f68f3f0d156b5681e12e2c6ef Mon Sep 17 00:00:00 2001
From: Chen Zhanwang
Date: Wed, 28 Aug 2024 09:58:25 +0800
Subject: [PATCH] Add some special LSX optimizations

During compilation, Qt will not add the -mlsx compile flag for these
files. Therefore, the compiler needs to enable LSX auto-vectorization
by default for all files so that this code can be compiled.

Change-Id: I90c5029b673f831d39591ffd96c36e7762c68fb0
Reviewed-by: Volker Hilsheimer
---
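Note: every code path added below is compiled only when the compiler predefines
__loongarch_sx, i.e. when LSX is enabled globally (for example via -mlsx, or a
-march value that implies it), since the build system does not pass -mlsx for
these files. A minimal, illustrative sketch of that guard pattern follows; the
file name, function name and scalar fallback are assumptions for demonstration
only and are not part of this change:

    // lsx_guard_example.cpp -- build with e.g.: g++ -mlsx lsx_guard_example.cpp
    #include <cstdio>
    #if defined(__loongarch_sx)
    #  include <lsxintrin.h>
    // LSX path: broadcast `value` into four 32-bit lanes, double them, read lane 0.
    static int doubled(int value)
    {
        __m128i v = __lsx_vreplgr2vr_w(value);   // {value, value, value, value}
        v = __lsx_vadd_w(v, v);                  // element-wise 32-bit add
        return __lsx_vpickve2gr_w(v, 0);         // extract lane 0 -> 2 * value
    }
    #else
    // Scalar fallback when LSX was not enabled at compile time.
    static int doubled(int value) { return value + value; }
    #endif

    int main() { std::printf("%d\n", doubled(21)); return 0; }

Without LSX enabled by default (or -mlsx on the command line) the same
translation unit silently takes the scalar branch, which is why the intrinsics
in this patch stay behind the __loongarch_sx checks.
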
 src/gui/painting/qcompositionfunctions.cpp |  68 ++++++++
 src/gui/painting/qdrawhelper.cpp           | 140 ++++++++++++++++
 src/gui/painting/qpixellayout.cpp          | 185 +++++++++++++++++++++
 src/gui/painting/qrgba64_p.h               | 119 ++++++++++++-
 4 files changed, 511 insertions(+), 1 deletion(-)

diff --git a/src/gui/painting/qcompositionfunctions.cpp b/src/gui/painting/qcompositionfunctions.cpp
index 9db5080f361..44314f90656 100644
--- a/src/gui/painting/qcompositionfunctions.cpp
+++ b/src/gui/painting/qcompositionfunctions.cpp
@@ -280,10 +280,78 @@ struct Rgba64OperationsNEON : public Rgba64OperationsBase
 };
 #endif
 
+#if defined(__loongarch_sx)
+struct Rgba64OperationsLSX : public Rgba64OperationsBase
+{
+    typedef __m128i OptimalType;
+    typedef __m128i OptimalScalar;
+    static OptimalType load(const Type *ptr)
+    {
+        return __lsx_vilvl_d(__lsx_vldi(0), __lsx_vldrepl_d(reinterpret_cast<const qint64 *>(ptr), 0));
+    }
+    static OptimalType convert(const Type &value)
+    {
+        return __lsx_vinsgr2vr_d(__lsx_vldi(0), value, 0);
+    }
+    static void store(Type *ptr, OptimalType value)
+    {
+        __lsx_vstelm_d(value, reinterpret_cast<qint64 *>(ptr), 0, 0);
+    }
+    static OptimalType add(OptimalType a, OptimalType b)
+    {
+        return __lsx_vadd_h(a, b);
+    }
+//    same as above:
+//    static OptimalScalar add(OptimalScalar a, OptimalScalar b)
+    static OptimalType plus(OptimalType a, OptimalType b)
+    {
+        return __lsx_vsadd_hu(a, b);
+    }
+    static OptimalScalar alpha(OptimalType c)
+    {
+        const __m128i shuffleMask = (__m128i)(v8i16){3, 3, 3, 3, 4, 5, 6, 7};
+        return __lsx_vshuf_h(shuffleMask, __lsx_vldi(0), c);
+    }
+    static OptimalScalar invAlpha(Scalar c)
+    {
+        return scalar(65535 - c);
+    }
+    static OptimalScalar invAlpha(OptimalType c)
+    {
+        return __lsx_vxor_v(__lsx_vreplgr2vr_h(-1), alpha(c));
+    }
+    static OptimalScalar scalar(Scalar n)
+    {
+        const __m128i shuffleMask = (__m128i)(v8i16){0, 0, 0, 0, 4, 5, 6, 7};
+        return __lsx_vshuf_h(shuffleMask, __lsx_vldi(0), __lsx_vinsgr2vr_w(__lsx_vldi(0), n, 0));
+    }
+    static OptimalType multiplyAlpha8bit(OptimalType val, uint8_t a)
+    {
+        return multiplyAlpha255(val, a);
+    }
+//    same as above:
+//    static OptimalScalar multiplyAlpha8bit(OptimalScalar a, uint8_t a)
+    static OptimalType interpolate8bit(OptimalType x, uint8_t a1, OptimalType y, uint8_t a2)
+    {
+        return interpolate255(x, a1, y, a2);
+    }
+    static OptimalType multiplyAlpha(OptimalType val, OptimalScalar a)
+    {
+        return multiplyAlpha65535(val, a);
+    }
+    static OptimalType interpolate(OptimalType x, OptimalScalar a1, OptimalType y, const OptimalScalar &a2)
+    {
+        return interpolate65535(x, a1, y, a2);
+    }
+};
+#endif
+
 #if defined(__SSE2__)
 typedef Rgba64OperationsSSE2 Rgba64Operations;
 #elif defined(__ARM_NEON__)
 typedef Rgba64OperationsNEON Rgba64Operations;
+#elif defined(__loongarch_sx)
+typedef Rgba64OperationsLSX Rgba64Operations;
 #else
 typedef Rgba64OperationsC Rgba64Operations;
 #endif
diff --git a/src/gui/painting/qdrawhelper.cpp b/src/gui/painting/qdrawhelper.cpp
index b36044396f9..9cad0b4f697 100644
--- a/src/gui/painting/qdrawhelper.cpp
+++ b/src/gui/painting/qdrawhelper.cpp
@@ -1251,6 +1251,45 @@ static inline uint interpolate_4_pixels_16(uint tl, uint tr, uint bl, uint br, u
 }
 #endif
 
+#if defined(__loongarch_sx)
+static inline void interpolate_4_pixels_16_lsx(__m128i tl, __m128i tr, __m128i bl, __m128i br,
+                                               __m128i distx, __m128i disty, uint *b)
+{
+    const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff);
+    const __m128i v_256 = __lsx_vreplgr2vr_h(256);
+    const __m128i dxdy = __lsx_vmul_h(distx, disty);
+    const __m128i distx_ = __lsx_vslli_h(distx, 4);
+    const __m128i disty_ = __lsx_vslli_h(disty, 4);
+    const __m128i idxidy = __lsx_vadd_h(dxdy, __lsx_vsub_h(v_256, __lsx_vadd_h(distx_, disty_)));
+    const __m128i dxidy = __lsx_vsub_h(distx_, dxdy);
+    const __m128i idxdy = __lsx_vsub_h(disty_, dxdy);
+
+    __m128i tlAG = __lsx_vsrli_h(tl, 8);
+    __m128i tlRB = __lsx_vand_v(tl, colorMask);
+    __m128i trAG = __lsx_vsrli_h(tr, 8);
+    __m128i trRB = __lsx_vand_v(tr, colorMask);
+    __m128i blAG = __lsx_vsrli_h(bl, 8);
+    __m128i blRB = __lsx_vand_v(bl, colorMask);
+    __m128i brAG = __lsx_vsrli_h(br, 8);
+    __m128i brRB = __lsx_vand_v(br, colorMask);
+
+    tlAG = __lsx_vmul_h(tlAG, idxidy);
+    tlRB = __lsx_vmul_h(tlRB, idxidy);
+    trAG = __lsx_vmul_h(trAG, dxidy);
+    trRB = __lsx_vmul_h(trRB, dxidy);
+    blAG = __lsx_vmul_h(blAG, idxdy);
+    blRB = __lsx_vmul_h(blRB, idxdy);
+    brAG = __lsx_vmul_h(brAG, dxdy);
+    brRB = __lsx_vmul_h(brRB, dxdy);
+
+    __m128i rAG = __lsx_vadd_h(__lsx_vadd_h(tlAG, trAG), __lsx_vadd_h(blAG, brAG));
+    __m128i rRB = __lsx_vadd_h(__lsx_vadd_h(tlRB, trRB), __lsx_vadd_h(blRB, brRB));
+    rAG = __lsx_vandn_v(colorMask, rAG);
+    rRB = __lsx_vsrli_h(rRB, 8);
+    __lsx_vst(__lsx_vor_v(rAG, rRB), b, 0);
+}
+#endif
+
 template<TextureBlendType blendType>
 void fetchTransformedBilinear_pixelBounds(int max, int l1, int l2, int &v1, int &v2);
 
@@ -1426,6 +1465,36 @@ static void QT_FASTCALL fetchTransformedBilinearARGB32PM_simple_scale_helper(uin
             rRB = vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(rRB), 8));
             vst1q_s16((int16_t*)(&intermediate.buffer_rb[f]), rRB);
         }
+#elif defined(__loongarch_sx)
+        const __m128i disty_ = __lsx_vreplgr2vr_h(disty);
+        const __m128i idisty_ = __lsx_vreplgr2vr_h(idisty);
+        const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff);
+
+        lim -= 3;
+        for (; f < lim; x += 4, f += 4) {
+            // Load 4 pixels from s1, and split the alpha-green and red-blue component
+            __m128i top = __lsx_vld((const __m128i*)((const uint *)(s1)+x), 0);
+            __m128i topAG = __lsx_vsrli_h(top, 8);
+            __m128i topRB = __lsx_vand_v(top, colorMask);
+            // Multiplies each color component by idisty
+            topAG = __lsx_vmul_h(topAG, idisty_);
+            topRB = __lsx_vmul_h(topRB, idisty_);
+
+            // Same for the s2 vector
+            __m128i bottom = __lsx_vld((const __m128i*)((const uint *)(s2)+x), 0);
+            __m128i bottomAG = __lsx_vsrli_h(bottom, 8);
+            __m128i bottomRB = __lsx_vand_v(bottom, colorMask);
+            bottomAG = __lsx_vmul_h(bottomAG, disty_);
+            bottomRB = __lsx_vmul_h(bottomRB, disty_);
+
+            // Add the values, and shift to only keep 8 significant bits per colors
+            __m128i rAG = __lsx_vadd_h(topAG, bottomAG);
+            rAG = __lsx_vsrli_h(rAG, 8);
+            __lsx_vst(rAG, (__m128i*)(&intermediate.buffer_ag[f]), 0);
+            __m128i rRB = __lsx_vadd_h(topRB, bottomRB);
+            rRB = __lsx_vsrli_h(rRB, 8);
+            __lsx_vst(rRB, (__m128i*)(&intermediate.buffer_rb[f]), 0);
+        }
 #endif
     }
     for (; f < count; f++) { // Same as above but without simd
@@ -1615,6 +1684,33 @@ static void QT_FASTCALL fetchTransformedBilinearARGB32PM_downscale_helper(uint *
         b+=4;
         v_fx = vaddq_s32(v_fx, v_fdx);
     }
+#elif defined (__loongarch_sx)
+    const __m128i shuffleMask = (__m128i)(v8i16){0, 0, 2, 2, 4, 4, 6, 6};
+    const __m128i v_disty = __lsx_vreplgr2vr_h(disty4);
+    const __m128i v_fdx = __lsx_vreplgr2vr_w(fdx*4);
+    const __m128i v_fx_r = __lsx_vreplgr2vr_w(0x8);
+    __m128i v_fx = (__m128i)(v4i32){fx, fx + fdx, fx + fdx + fdx, fx + fdx + fdx + fdx};
+
+    while (b < boundedEnd - 3) {
+        __m128i offset = __lsx_vsrli_w(v_fx, 16);
+        const int offset0 = __lsx_vpickve2gr_w(offset, 0);
+        const int offset1 = __lsx_vpickve2gr_w(offset, 1);
+        const int offset2 = __lsx_vpickve2gr_w(offset, 2);
+        const int offset3 = __lsx_vpickve2gr_w(offset, 3);
+        const __m128i tl = (__m128i)(v4u32){s1[offset0], s1[offset1], s1[offset2], s1[offset3]};
+        const __m128i tr = (__m128i)(v4u32){s1[offset0 + 1], s1[offset1 + 1], s1[offset2 + 1], s1[offset3 + 1]};
+        const __m128i bl = (__m128i)(v4u32){s2[offset0], s2[offset1], s2[offset2], s2[offset3]};
+        const __m128i br = (__m128i)(v4u32){s2[offset0 + 1], s2[offset1 + 1], s2[offset2 + 1], s2[offset3 + 1]};
+
+        __m128i v_distx = __lsx_vsrli_h(v_fx, 8);
+        v_distx = __lsx_vsrli_h(__lsx_vadd_w(v_distx, v_fx_r), 4);
+        v_distx = __lsx_vshuf_h(shuffleMask, v_distx, v_distx);
+
+        interpolate_4_pixels_16_lsx(tl, tr, bl, br, v_distx, v_disty, b);
+        b += 4;
+        v_fx = __lsx_vadd_w(v_fx, v_fdx);
+    }
+    fx = __lsx_vpickve2gr_w(v_fx, 0);
 #endif
     while (b < boundedEnd) {
         int x = (fx >> 16);
@@ -1852,6 +1948,50 @@ static void QT_FASTCALL fetchTransformedBilinearARGB32PM_fast_rotate_helper(uint
         v_fx = vaddq_s32(v_fx, v_fdx);
         v_fy = vaddq_s32(v_fy, v_fdy);
     }
+#elif defined(__loongarch_sx)
+    const __m128i v_fdx = __lsx_vreplgr2vr_w(fdx*4);
+    const __m128i v_fdy = __lsx_vreplgr2vr_w(fdy*4);
+    const __m128i v_fxy_r = __lsx_vreplgr2vr_w(0x8);
+    __m128i v_fx = (__m128i)(v4i32){fx, fx + fdx, fx + fdx + fdx, fx + fdx + fdx + fdx};
+    __m128i v_fy = (__m128i)(v4i32){fy, fy + fdy, fy + fdy + fdy, fy + fdy + fdy + fdy};
+
+    const uchar *textureData = image.imageData;
+    const qsizetype bytesPerLine = image.bytesPerLine;
+    const __m128i zero = __lsx_vldi(0);
+    const __m128i shuffleMask = (__m128i)(v8i16){0, 0, 0, 0, 4, 5, 6, 7};
+    const __m128i shuffleMask1 = (__m128i)(v8i16){0, 0, 2, 2, 4, 4, 6, 6};
+    const __m128i vbpl = __lsx_vshuf_h(shuffleMask, zero, __lsx_vinsgr2vr_w(zero, bytesPerLine/4, 0));
+
+    while (b < boundedEnd - 3) {
+        const __m128i vy = __lsx_vpickev_h(zero, __lsx_vsat_w(__lsx_vsrli_w(v_fy, 16), 15));
+        // 4x16bit * 4x16bit -> 4x32bit
+        __m128i offset = __lsx_vilvl_h(__lsx_vmuh_h(vy, vbpl), __lsx_vmul_h(vy, vbpl));
+        offset = __lsx_vadd_w(offset, __lsx_vsrli_w(v_fx, 16));
+        const int offset0 = __lsx_vpickve2gr_w(offset, 0);
+        const int offset1 = __lsx_vpickve2gr_w(offset, 1);
+        const int offset2 = __lsx_vpickve2gr_w(offset, 2);
+        const int offset3 = __lsx_vpickve2gr_w(offset, 3);
+        const uint *topData = (const uint *)(textureData);
+        const __m128i tl = (__m128i)(v4u32){topData[offset0], topData[offset1], topData[offset2], topData[offset3]};
+        const __m128i tr = (__m128i)(v4u32){topData[offset0 + 1], topData[offset1 + 1], topData[offset2 + 1], topData[offset3 + 1]};
+        const uint *bottomData = (const uint *)(textureData + bytesPerLine);
+        const __m128i bl = (__m128i)(v4u32){bottomData[offset0], bottomData[offset1], bottomData[offset2], bottomData[offset3]};
+        const __m128i br = (__m128i)(v4u32){bottomData[offset0 + 1], bottomData[offset1 + 1], bottomData[offset2 + 1], bottomData[offset3 + 1]};
+
+        __m128i v_distx = __lsx_vsrli_h(v_fx, 8);
+        __m128i v_disty = __lsx_vsrli_h(v_fy, 8);
+        v_distx = __lsx_vsrli_h(__lsx_vadd_w(v_distx, v_fxy_r), 4);
+        v_disty = __lsx_vsrli_h(__lsx_vadd_w(v_disty, v_fxy_r), 4);
+        v_distx = __lsx_vshuf_h(shuffleMask1, zero, v_distx);
+        v_disty = __lsx_vshuf_h(shuffleMask1, zero, v_disty);
+
+        interpolate_4_pixels_16_lsx(tl, tr, bl, br, v_distx, v_disty, b);
+        b += 4;
+        v_fx = __lsx_vadd_w(v_fx, v_fdx);
+        v_fy = __lsx_vadd_w(v_fy, v_fdy);
+    }
+    fx = __lsx_vpickve2gr_w(v_fx, 0);
+    fy = __lsx_vpickve2gr_w(v_fy, 0);
 #endif
     while (b < boundedEnd) {
         int x = (fx >> 16);
diff --git a/src/gui/painting/qpixellayout.cpp b/src/gui/painting/qpixellayout.cpp
index cdd18aa711f..6c21218f151 100644
--- a/src/gui/painting/qpixellayout.cpp
+++ b/src/gui/painting/qpixellayout.cpp
@@ -1141,6 +1141,108 @@ static inline void qConvertARGB32PMToRGBA64PM_neon(QRgba64 *buffer, const uint *
         *buffer++ = QRgba64::fromArgb32(s);
     }
 }
+#elif defined __loongarch_sx
+template<bool RGBA, bool maskAlpha>
+static inline void qConvertARGB32PMToRGBA64PM_lsx(QRgba64 *buffer, const uint *src, int count)
+{
+    if (count <= 0)
+        return;
+
+    const __m128i amask = __lsx_vreplgr2vr_w(0xff000000);
+    const __m128i shuffleMask = (__m128i)(v8i16){2, 1, 0, 3, 6, 5, 4, 7};
+    int i = 0;
+    for (; ((uintptr_t)buffer & 0xf) && i < count; ++i) {
+        uint s = *src++;
+        if (maskAlpha)
+            s = s | 0xff000000;
+        if (RGBA)
+            s = RGBA2ARGB(s);
+        *buffer++ = QRgba64::fromArgb32(s);
+    }
+    for (; i < count-3; i += 4) {
+        __m128i vs = __lsx_vld((const __m128i*)src, 0);
+        if (maskAlpha)
+            vs = __lsx_vor_v(vs, amask);
+        src += 4;
+        __m128i v1 = __lsx_vilvl_b(vs, vs);
+        __m128i v2 = __lsx_vilvh_b(vs, vs);
+        if (!RGBA) {
+            v1 = __lsx_vshuf_h(shuffleMask, v1, v1);
+            v2 = __lsx_vshuf_h(shuffleMask, v2, v2);
+        }
+        __lsx_vst(v1, buffer, 0);
+        buffer += 2;
+        __lsx_vst(v2, buffer, 0);
+        buffer += 2;
+    }
+
+    SIMD_EPILOGUE(i, count, 3) {
+        uint s = *src++;
+        if (maskAlpha)
+            s = s | 0xff000000;
+        if (RGBA)
+            s = RGBA2ARGB(s);
+        *buffer++ = QRgba64::fromArgb32(s);
+    }
+}
+
+template<QtPixelOrder PixelOrder>
+static inline void qConvertRGBA64PMToA2RGB30PM_lsx(uint *dest, const QRgba64 *buffer, int count)
+{
+    const __m128i gmask = __lsx_vreplgr2vr_w(0x000ffc00);
+    const __m128i cmask = __lsx_vreplgr2vr_w(0x000003ff);
+    int i = 0;
+    __m128i vr, vg, vb, va;
+    for (; i < count && uintptr_t(buffer) & 0xF; ++i) {
+        *dest++ = qConvertRgb64ToRgb30<PixelOrder>(*buffer++);
+    }
+
+    for (; i < count-15; i += 16) {
+        __m128i vOr = __lsx_vreplgr2vr_w(0);
+        __m128i vAnd = __lsx_vreplgr2vr_w(0xffffffff);
+        for (int j = 0; j < 16; j += 2) {
+            __m128i vs = __lsx_vld((const __m128i*)(buffer + j), 0);
+            vOr = __lsx_vor_v(vOr, vs);
+            vAnd = __lsx_vand_v(vAnd, vs);
+        }
+        const quint16 orAlpha = ((uint)__lsx_vpickve2gr_h(vOr, 3)) | ((uint)__lsx_vpickve2gr_h(vOr, 7));
+        const quint16 andAlpha = ((uint)__lsx_vpickve2gr_h(vAnd, 3)) & ((uint)__lsx_vpickve2gr_h(vAnd, 7));
+
+        if (andAlpha == 0xffff) {
+            for (int j = 0; j < 16; j += 2) {
+                __m128i vs = __lsx_vld((const __m128i*)buffer, 0);
+                buffer += 2;
+                vr = __lsx_vsrli_d(vs, 6);
+                vg = __lsx_vsrli_d(vs, 16 + 6 - 10);
+                vb = __lsx_vsrli_d(vs, 32 + 6);
+                vr = __lsx_vand_v(vr, cmask);
+                vg = __lsx_vand_v(vg, gmask);
+                vb = __lsx_vand_v(vb, cmask);
+                va = __lsx_vsrli_d(vs, 48 + 14);
+                if (PixelOrder == PixelOrderRGB)
+                    vr = __lsx_vslli_w(vr, 20);
+                else
+                    vb = __lsx_vslli_w(vb, 20);
+                va = __lsx_vslli_w(va, 30);
+                __m128i vd = __lsx_vor_v(__lsx_vor_v(vr, vg), __lsx_vor_v(vb, va));
+                vd = __lsx_vshuf4i_w(vd, 0b11011000);
+                __lsx_vstelm_d(vd, dest, 0, 0);
+                dest += 2;
+            }
+        } else if (orAlpha == 0) {
+            for (int j = 0; j < 16; ++j) {
+                *dest++ = 0;
+                buffer++;
+            }
+        } else {
+            for (int j = 0; j < 16; ++j)
+                *dest++ = qConvertRgb64ToRgb30<PixelOrder>(*buffer++);
+        }
+    }
+
+    SIMD_EPILOGUE(i, count, 15)
+        *dest++ = qConvertRgb64ToRgb30<PixelOrder>(*buffer++);
+}
 #endif
 
 static const QRgba64 *QT_FASTCALL convertRGB32ToRGB64(QRgba64 *buffer, const uint *src, int count,
@@ -1150,6 +1252,8 @@ static const QRgba64 *QT_FASTCALL convertRGB32ToRGB64(QRgba64 *buffer, const uin
     qConvertARGB32PMToRGBA64PM_sse2<false, true>(buffer, src, count);
 #elif defined(__ARM_NEON__)
     qConvertARGB32PMToRGBA64PM_neon<false, true>(buffer, src, count);
+#elif defined(__loongarch_sx)
+    qConvertARGB32PMToRGBA64PM_lsx<false, true>(buffer, src, count);
 #else
     for (int i = 0; i < count; ++i)
         buffer[i] = QRgba64::fromArgb32(0xff000000 | src[i]);
@@ -1184,6 +1288,8 @@ static const QRgba64 *QT_FASTCALL convertARGB32PMToRGBA64PM(QRgba64 *buffer, con
     qConvertARGB32PMToRGBA64PM_sse2<false, false>(buffer, src, count);
 #elif defined(__ARM_NEON__)
     qConvertARGB32PMToRGBA64PM_neon<false, false>(buffer, src, count);
+#elif defined(__loongarch_sx)
+    qConvertARGB32PMToRGBA64PM_lsx<false, false>(buffer, src, count);
 #else
     for (int i = 0; i < count; ++i)
         buffer[i] = QRgba64::fromArgb32(src[i]);
@@ -1238,6 +1344,8 @@ static const QRgba64 *QT_FASTCALL convertRGBA8888PMToRGBA64PM(QRgba64 *buffer, c
     qConvertARGB32PMToRGBA64PM_sse2<true, false>(buffer, src, count);
 #elif defined(__ARM_NEON__)
     qConvertARGB32PMToRGBA64PM_neon<true, false>(buffer, src, count);
+#elif defined(__loongarch_sx)
+    qConvertARGB32PMToRGBA64PM_lsx<true, false>(buffer, src, count);
 #else
     for (int i = 0; i < count; ++i)
         buffer[i] = QRgba64::fromArgb32(RGBA2ARGB(src[i]));
@@ -1348,6 +1456,48 @@ static inline void qConvertA2RGB30PMToRGBA64PM_sse2(QRgba64 *buffer, const uint
     SIMD_EPILOGUE(i, count, 3)
         *buffer++ = qConvertA2rgb30ToRgb64<PixelOrder>(*src++);
 }
+#elif defined(__loongarch_sx)
+template<QtPixelOrder PixelOrder>
+static inline void qConvertA2RGB30PMToRGBA64PM_lsx(QRgba64 *buffer, const uint *src, int count)
+{
+    if (count <= 0)
+        return;
+
+    const __m128i rmask = __lsx_vreplgr2vr_w(0x3ff00000);
+    const __m128i gmask = __lsx_vreplgr2vr_w(0x000ffc00);
+    const __m128i bmask = __lsx_vreplgr2vr_w(0x000003ff);
+    const __m128i afactor = __lsx_vreplgr2vr_h(0x5555);
+    int i = 0;
+
+    for (; ((uintptr_t)buffer & 0xf) && i < count; ++i)
+        *buffer++ = qConvertA2rgb30ToRgb64<PixelOrder>(*src++);
+
+    for (; i < count-3; i += 4) {
+        __m128i vs = __lsx_vld((const __m128i*)src, 0);
+        src += 4;
+        __m128i va = __lsx_vsrli_w(vs, 30);
+        __m128i vr = __lsx_vand_v(vs, rmask);
+        __m128i vb = __lsx_vand_v(vs, bmask);
+        __m128i vg = __lsx_vand_v(vs, gmask);
+        va = __lsx_vmul_h(va, afactor);
+        vr = __lsx_vor_v(__lsx_vsrli_w(vr, 14), __lsx_vsrli_w(vr, 24));
+        vg = __lsx_vor_v(__lsx_vsrli_w(vg, 4), __lsx_vsrli_w(vg, 14));
+        vb = __lsx_vor_v(__lsx_vslli_w(vb, 6), __lsx_vsrli_w(vb, 4));
+        __m128i vrb;
+        if (PixelOrder == PixelOrderRGB)
+            vrb = __lsx_vor_v(vr, __lsx_vbsll_v(vb, 2));
+        else
+            vrb = __lsx_vor_v(vb, __lsx_vbsll_v(vr, 2));
+        __m128i vga = __lsx_vor_v(vg, __lsx_vbsll_v(va, 2));
+        __lsx_vst(__lsx_vilvl_h(vga, vrb), buffer, 0);
+        buffer += 2;
+        __lsx_vst(__lsx_vilvh_h(vga, vrb), buffer, 0);
+        buffer += 2;
+    }
+
+    SIMD_EPILOGUE(i, count, 3)
+        *buffer++ = qConvertA2rgb30ToRgb64<PixelOrder>(*src++);
+}
 #endif
 
 template<QtPixelOrder PixelOrder>
@@ -1356,6 +1506,8 @@ static const QRgba64 *QT_FASTCALL convertA2RGB30PMToRGBA64PM(QRgba64 *buffer, co
 {
 #ifdef __SSE2__
     qConvertA2RGB30PMToRGBA64PM_sse2<PixelOrder>(buffer, src, count);
+#elif defined (__loongarch_sx)
+    qConvertA2RGB30PMToRGBA64PM_lsx<PixelOrder>(buffer, src, count);
 #else
     for (int i = 0; i < count; ++i)
         buffer[i] = qConvertA2rgb30ToRgb64<PixelOrder>(src[i]);
@@ -1466,6 +1618,37 @@ void qt_convertRGBA64ToARGB32(uint *dst, const QRgba64 *src, int count)
         _mm_storel_epi64((__m128i*)(dst), v1);
         dst += 2;
     }
+#elif defined(__loongarch_sx)
+    if (((uintptr_t)dst & 0x7) && count > 0) {
+        uint s = (*src++).toArgb32();
+        if (RGBA)
+            s = ARGB2RGBA(s);
+        *dst++ = s;
+        i++;
+    }
+    const __m128i vhalf = __lsx_vreplgr2vr_w(0x80);
+    const __m128i vzero = __lsx_vldi(0);
+    const __m128i shuffleMask = (__m128i)(v8i16){2, 1, 0, 3, 6, 5, 4, 7};
+    for (; i < count-1; i += 2) {
+        __m128i vs = __lsx_vld((const __m128i*)src, 0);
+        src += 2;
+        if (!RGBA) {
+            vs = __lsx_vshuf_h(shuffleMask, vzero, vs);
+        }
+        __m128i v1 = __lsx_vilvl_h(vzero, vs);
+        __m128i v2 = __lsx_vilvh_h(vzero, vs);
+        v1 = __lsx_vadd_w(v1, vhalf);
+        v2 = __lsx_vadd_w(v2, vhalf);
+        v1 = __lsx_vsub_w(v1, __lsx_vsrli_w(v1, 8));
+        v2 = __lsx_vsub_w(v2, __lsx_vsrli_w(v2, 8));
+        v1 = __lsx_vsrli_w(v1, 8);
+        v2 = __lsx_vsrli_w(v2, 8);
+        v1 = __lsx_vpickev_h(__lsx_vsat_w(v2, 15), __lsx_vsat_w(v1, 15));
+        v1 = __lsx_vmaxi_h(v1, 0);
+        v1 = __lsx_vpickev_b(vzero, __lsx_vsat_hu(v1, 7));
+        __lsx_vstelm_d(v1, dst, 0, 0);
+        dst += 2;
+    }
 #endif
     for (; i < count; i++) {
         uint s = (*src++).toArgb32();
@@ -1902,6 +2085,8 @@ static void QT_FASTCALL storeRGB30FromRGBA64PM(uchar *dest, const QRgba64 *src,
     uint *d = (uint*)dest + index;
 #ifdef __SSE2__
     qConvertRGBA64PMToA2RGB30PM_sse2<PixelOrder>(d, src, count);
+#elif defined (__loongarch_sx)
+    qConvertRGBA64PMToA2RGB30PM_lsx<PixelOrder>(d, src, count);
 #else
     for (int i = 0; i < count; ++i)
         d[i] = qConvertRgb64ToRgb30<PixelOrder>(src[i]);
diff --git a/src/gui/painting/qrgba64_p.h b/src/gui/painting/qrgba64_p.h
index 058d77f7e90..e4d99323480 100644
--- a/src/gui/painting/qrgba64_p.h
+++ b/src/gui/painting/qrgba64_p.h
@@ -57,6 +57,24 @@ static inline uint16x4_t multiplyAlpha65535(uint16x4_t rgba64, uint alpha65535)
     vs32 = vsraq_n_u32(vs32, vs32, 16); // vs = vs + (vs >> 16)
     return vrshrn_n_u32(vs32, 16); // vs = (vs + 0x8000) >> 16
 }
+#elif defined(__loongarch_sx)
+static inline __m128i Q_DECL_VECTORCALL multiplyAlpha65535(__m128i rgba64, __m128i va)
+{
+    __m128i vs = rgba64;
+    vs = __lsx_vilvl_h(__lsx_vmuh_hu(vs, va), __lsx_vmul_h(vs, va));
+    vs = __lsx_vadd_w(vs, __lsx_vsrli_w(vs, 16));
+    vs = __lsx_vadd_w(vs, __lsx_vreplgr2vr_w(0x8000));
+    vs = __lsx_vsrai_w(vs, 16);
+    vs = __lsx_vpickev_h(__lsx_vsat_w(vs, 15), __lsx_vsat_w(vs, 15));
+    return vs;
+}
+static inline __m128i Q_DECL_VECTORCALL multiplyAlpha65535(__m128i rgba64, uint alpha65535)
+{
+    const __m128i shuffleMask = (__m128i)(v8i16){0, 0, 0, 0, 4, 5, 6, 7};
+    const __m128i va = __lsx_vshuf_h(shuffleMask, __lsx_vldi(0),
+                                     __lsx_vinsgr2vr_w(__lsx_vldi(0), alpha65535, 0));
+    return multiplyAlpha65535(rgba64, va);
+}
 #endif
 
 static inline QRgba64 multiplyAlpha65535(QRgba64 rgba64, uint alpha65535)
@@ -73,6 +91,12 @@ static inline QRgba64 multiplyAlpha65535(QRgba64 rgba64, uint alpha65535)
     QRgba64 r;
     vst1_u64(reinterpret_cast<uint64_t *>(&r), vreinterpret_u64_u16(vr));
     return r;
+#elif defined(__loongarch_sx)
+    const __m128i v = __lsx_vldrepl_d(reinterpret_cast<const qint64 *>(&rgba64), 0);
+    const __m128i vr = multiplyAlpha65535(v, alpha65535);
+    QRgba64 r;
+    __lsx_vstelm_d(vr, reinterpret_cast<__m128i *>(&r), 0, 0);
+    return r;
 #else
     return QRgba64::fromRgba64(qt_div_65535(rgba64.red()   * alpha65535),
                                qt_div_65535(rgba64.green() * alpha65535),
@@ -81,7 +105,7 @@
 #endif
 }
 
-#if defined(__SSE2__) || defined(__ARM_NEON__)
+#if defined(__SSE2__) || defined(__ARM_NEON__) || defined(__loongarch_sx)
 template<typename T>
 static inline T Q_DECL_VECTORCALL multiplyAlpha255(T rgba64, uint alpha255)
 {
@@ -112,6 +136,14 @@ inline uint16x4_t interpolate255(uint16x4_t x, uint alpha1, uint16x4_t y, uint a
 }
 #endif
 
+#if defined __loongarch_sx
+static inline __m128i Q_DECL_VECTORCALL
+interpolate255(__m128i x, uint alpha1, __m128i y, uint alpha2)
+{
+    return __lsx_vadd_h(multiplyAlpha255(x, alpha1), multiplyAlpha255(y, alpha2));
+}
+#endif
+
 static inline QRgba64 interpolate255(QRgba64 x, uint alpha1, QRgba64 y, uint alpha2)
 {
 #if defined(__SSE2__)
@@ -128,6 +160,13 @@ static inline QRgba64 interpolate255(QRgba64 x, uint alpha1, QRgba64 y, uint alp
     QRgba64 r;
     vst1_u64(reinterpret_cast<uint64_t *>(&r), vreinterpret_u64_u16(vr));
     return r;
+#elif defined(__loongarch_sx)
+    const __m128i vx = __lsx_vldrepl_d(reinterpret_cast<const qint64 *>(&x), 0);
+    const __m128i vy = __lsx_vldrepl_d(reinterpret_cast<const qint64 *>(&y), 0);
+    const __m128i vr = interpolate255(vx, alpha1, vy, alpha2);
+    QRgba64 r;
+    __lsx_vstelm_d(vr, reinterpret_cast<__m128i *>(&r), 0, 0);
+    return r;
 #else
     return QRgba64::fromRgba64(multiplyAlpha255(x, alpha1) + multiplyAlpha255(y, alpha2));
 #endif
@@ -156,6 +195,18 @@ inline uint16x4_t interpolate65535(uint16x4_t x, uint16x4_t alpha1, uint16x4_t y
 }
 #endif
 
+#if defined __loongarch_sx
+static inline __m128i Q_DECL_VECTORCALL interpolate65535(__m128i x, uint alpha1, __m128i y, uint alpha2)
+{
+    return __lsx_vadd_h(multiplyAlpha65535(x, alpha1), multiplyAlpha65535(y, alpha2));
+}
+
+static inline __m128i Q_DECL_VECTORCALL interpolate65535(__m128i x, __m128i alpha1, __m128i y, __m128i alpha2)
+{
+    return __lsx_vadd_h(multiplyAlpha65535(x, alpha1), multiplyAlpha65535(y, alpha2));
+}
+#endif
+
 static inline QRgba64 interpolate65535(QRgba64 x, uint alpha1, QRgba64 y, uint alpha2)
 {
 #if defined(__SSE2__)
@@ -172,6 +223,13 @@ static inline QRgba64 interpolate65535(QRgba64 x, uint alpha1, QRgba64 y, uint a
     QRgba64 r;
     vst1_u64(reinterpret_cast<uint64_t *>(&r), vreinterpret_u64_u16(vr));
     return r;
+#elif defined(__loongarch_sx)
+    const __m128i vx = __lsx_vldrepl_d(reinterpret_cast<const qint64 *>(&x), 0);
+    const __m128i vy = __lsx_vldrepl_d(reinterpret_cast<const qint64 *>(&y), 0);
+    const __m128i vr = interpolate65535(vx, alpha1, vy, alpha2);
+    QRgba64 r;
+    __lsx_vstelm_d(vr, reinterpret_cast<__m128i *>(&r), 0, 0);
+    return r;
 #else
     return QRgba64::fromRgba64(multiplyAlpha65535(x, alpha1) + multiplyAlpha65535(y, alpha2));
 #endif
@@ -192,6 +250,13 @@ static inline QRgba64 addWithSaturation(QRgba64 a, QRgba64 b)
     QRgba64 r;
     vst1_u64(reinterpret_cast<uint64_t *>(&r), vreinterpret_u64_u16(vqadd_u16(va, vb)));
     return r;
+#elif defined(__loongarch_sx)
+    const __m128i va = __lsx_vldrepl_d(reinterpret_cast<const qint64 *>(&a), 0);
+    const __m128i vb = __lsx_vldrepl_d(reinterpret_cast<const qint64 *>(&b), 0);
+    const __m128i vr = __lsx_vsadd_hu(va, vb);
+    QRgba64 r;
+    __lsx_vstelm_d(vr, reinterpret_cast<__m128i *>(&r), 0, 0);
+    return r;
 #else
     return QRgba64::fromRgba64(qMin(a.red()   + b.red(), 65535),
                                qMin(a.green() + b.green(), 65535),
@@ -221,6 +286,18 @@ static inline uint toArgb32(uint16x4_t v)
     uint8x8_t v8 = vmovn_u16(vcombine_u16(v, v));
     return vget_lane_u32(vreinterpret_u32_u8(v8), 0);
 }
+#elif defined __loongarch_sx
+static inline uint Q_DECL_VECTORCALL toArgb32(__m128i v)
+{
+    v = __lsx_vilvl_h(__lsx_vldi(0), v);
+    v = __lsx_vadd_w(v, __lsx_vreplgr2vr_w(128));
+    v = __lsx_vsub_w(v, __lsx_vsrli_w(v, 8));
+    v = __lsx_vsrli_w(v, 8);
+    v = __lsx_vpickev_h(__lsx_vsat_w(v, 15), __lsx_vsat_w(v, 15));
+    __m128i tmp = __lsx_vmaxi_h(v, 0);
+    v = __lsx_vpickev_b(__lsx_vsat_hu(tmp, 7), __lsx_vsat_hu(tmp, 7));
+    return __lsx_vpickve2gr_w(v, 0);
+}
 #endif
 
 static inline uint toArgb32(QRgba64 rgba64)
@@ -238,6 +315,11 @@ static inline uint toArgb32(QRgba64 rgba64)
     v = vext_u16(v, v, 3);
 #endif
     return toArgb32(v);
+#elif defined __loongarch_sx
+    __m128i v = __lsx_vldrepl_d(reinterpret_cast<const qint64 *>(&rgba64), 0);
+    const __m128i shuffleMask = (__m128i)(v8i16){2, 1, 0, 3, 4, 5, 6, 7};
+    v = __lsx_vshuf_h(shuffleMask, __lsx_vldi(0), v);
+    return toArgb32(v);
 #else
     return rgba64.toArgb32();
 #endif
@@ -251,6 +333,9 @@ static inline uint toRgba8888(QRgba64 rgba64)
 #elif defined __ARM_NEON__
     uint16x4_t v = vreinterpret_u16_u64(vld1_u64(reinterpret_cast<const uint64_t *>(&rgba64)));
     return toArgb32(v);
+#elif defined __loongarch_sx
+    __m128i v = __lsx_vldrepl_d(reinterpret_cast<const qint64 *>(&rgba64), 0);
+    return toArgb32(v);
 #else
     return ARGB2RGBA(toArgb32(rgba64));
 #endif
@@ -289,6 +374,23 @@ static inline QRgba64 rgbBlend(QRgba64 d, QRgba64 s, uint rgbAlpha)
     vd32 = vsraq_n_u32(vd32, vd32, 16);
     vd = vrshrn_n_u32(vd32, 16);
     vst1_u64(reinterpret_cast<uint64_t *>(&blend), vreinterpret_u64_u16(vd));
+#elif defined(__loongarch_sx)
+    __m128i vd = __lsx_vldrepl_d(reinterpret_cast<const qint64 *>(&d), 0);
+    __m128i vs = __lsx_vldrepl_d(reinterpret_cast<const qint64 *>(&s), 0);
+    __m128i va = __lsx_vinsgr2vr_w(__lsx_vldi(0), rgbAlpha, 0);
+    va = __lsx_vilvl_b(va, va);
+    const __m128i shuffleMask = (__m128i)(v8i16){2, 1, 0, 3, 4, 5, 6, 7};
+    va = __lsx_vshuf_h(shuffleMask, __lsx_vldi(0), va);
+    __m128i vb = __lsx_vxor_v(__lsx_vreplgr2vr_h(-1), va);
+
+    vs = __lsx_vilvl_h(__lsx_vmuh_hu(vs, va), __lsx_vmul_h(vs, va));
+    vd = __lsx_vilvl_h(__lsx_vmuh_hu(vd, vb), __lsx_vmul_h(vd, vb));
+    vd = __lsx_vadd_w(vd, vs);
+    vd = __lsx_vadd_w(vd, __lsx_vsrli_w(vd, 16));
+    vd = __lsx_vadd_w(vd, __lsx_vreplgr2vr_w(0x8000));
+    vd = __lsx_vsrai_w(vd, 16);
+    vd = __lsx_vpickev_h(__lsx_vsat_w(vd, 15), __lsx_vsat_w(vd, 15));
+    __lsx_vstelm_d(vd, reinterpret_cast<__m128i *>(&blend), 0, 0);
 #else
     const int mr = qRed(rgbAlpha);
     const int mg = qGreen(rgbAlpha);
@@ -318,6 +420,13 @@ static inline void blend_pixel(QRgba64 &dst, QRgba64 src)
     const uint16x4_t via = veor_u16(vdup_n_u16(0xffff), vdup_lane_u16(vs, 3));
     const uint16x4_t vr = vadd_u16(vs, multiplyAlpha65535(vd, via));
     vst1_u64(reinterpret_cast<uint64_t *>(&dst), vreinterpret_u64_u16(vr));
+#elif defined(__loongarch_sx)
+    const __m128i vd = __lsx_vldrepl_d(reinterpret_cast<const qint64 *>(&dst), 0);
+    const __m128i vs = __lsx_vldrepl_d(reinterpret_cast<const qint64 *>(&src), 0);
+    const __m128i shuffleMask = (__m128i)(v8i16){3, 3, 3, 3, 4, 5, 6, 7};
+    const __m128i via = __lsx_vxor_v(__lsx_vreplgr2vr_h(-1), __lsx_vshuf_h(shuffleMask, __lsx_vldi(0), vs));
+    const __m128i vr = __lsx_vadd_h(vs, multiplyAlpha65535(vd, via));
+    __lsx_vstelm_d(vr, reinterpret_cast<__m128i *>(&dst), 0, 0);
 #else
     dst = src + multiplyAlpha65535(dst, 65535 - src.alpha());
 #endif
@@ -343,6 +452,14 @@ static inline void blend_pixel(QRgba64 &dst, QRgba64 src, const int const_alpha)
     const uint16x4_t via = veor_u16(vdup_n_u16(0xffff), vdup_lane_u16(vs, 3));
     const uint16x4_t vr = vadd_u16(vs, multiplyAlpha65535(vd, via));
     vst1_u64(reinterpret_cast<uint64_t *>(&dst), vreinterpret_u64_u16(vr));
+#elif defined(__loongarch_sx)
+    const __m128i vd = __lsx_vldrepl_d(reinterpret_cast<const qint64 *>(&dst), 0);
+    __m128i vs = __lsx_vldrepl_d(reinterpret_cast<const qint64 *>(&src), 0);
+    vs = multiplyAlpha255(vs, const_alpha);
+    const __m128i shuffleMask = (__m128i)(v8i16){3, 3, 3, 3, 4, 5, 6, 7};
+    const __m128i via = __lsx_vxor_v(__lsx_vreplgr2vr_h(-1), __lsx_vshuf_h(shuffleMask, __lsx_vldi(0), vs));
+    const __m128i vr = __lsx_vadd_h(vs, multiplyAlpha65535(vd, via));
+    __lsx_vstelm_d(vr, reinterpret_cast<__m128i *>(&dst), 0, 0);
 #else
     src = multiplyAlpha255(src, const_alpha);
     dst = src + multiplyAlpha65535(dst, 65535 - src.alpha());