Add some special LSX optimizations

During compilation, Qt does not add the -mlsx compiler flag for these
files. The compiler therefore needs to enable LSX by default for all
files (for example, via a global -mlsx) for this code to be compiled.
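For context: these code paths are guarded only by the compiler-defined __loongarch_sx macro, so without LSX enabled globally they compile to nothing. A minimal sketch of the guard pattern (the helper function below is illustrative, not part of the patch):

// GCC and Clang define __loongarch_sx when LSX code generation is enabled,
// e.g. when -mlsx (or an -march value that implies it) is in the global flags.
#if defined(__loongarch_sx)
#include <lsxintrin.h>  // LSX intrinsic types and functions (__m128i, __lsx_vadd_h, ...)
static inline __m128i addU16x8(__m128i a, __m128i b)
{
    return __lsx_vadd_h(a, b);  // eight independent 16-bit additions
}
#endif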

Change-Id: I90c5029b673f831d39591ffd96c36e7762c68fb0
Reviewed-by: Volker Hilsheimer <volker.hilsheimer@qt.io>
Author: Chen Zhanwang
Date: 2024-08-28 09:58:25 +08:00
parent dfc84993b7
commit f9185516eb
4 changed files with 511 additions and 1 deletion


@@ -280,10 +280,78 @@ struct Rgba64OperationsNEON : public Rgba64OperationsBase
};
#endif
#if defined(__loongarch_sx)
struct Rgba64OperationsLSX : public Rgba64OperationsBase
{
    typedef __m128i OptimalType;
    typedef __m128i OptimalScalar;
    static OptimalType load(const Type *ptr)
    {
        return __lsx_vilvl_d(__lsx_vldi(0), __lsx_vldrepl_d(reinterpret_cast<const __m128i *>(ptr), 0));
    }
    static OptimalType convert(const Type &value)
    {
        return __lsx_vinsgr2vr_d(__lsx_vldi(0), value, 0);
    }
    static void store(Type *ptr, OptimalType value)
    {
        __lsx_vstelm_d(value, reinterpret_cast<__m128i *>(ptr), 0, 0);
    }
    static OptimalType add(OptimalType a, OptimalType b)
    {
        return __lsx_vadd_h(a, b);
    }
    // same as above:
    // static OptimalScalar add(OptimalScalar a, OptimalScalar b)
    static OptimalType plus(OptimalType a, OptimalType b)
    {
        return __lsx_vsadd_hu(a, b);
    }
    static OptimalScalar alpha(OptimalType c)
    {
        const __m128i shuffleMask = (__m128i)(v8i16){3, 3, 3, 3, 4, 5, 6, 7};
        return __lsx_vshuf_h(shuffleMask, __lsx_vldi(0), c);
    }
    static OptimalScalar invAlpha(Scalar c)
    {
        return scalar(65535 - c);
    }
    static OptimalScalar invAlpha(OptimalType c)
    {
        return __lsx_vxor_v(__lsx_vreplgr2vr_h(-1), alpha(c));
    }
    static OptimalScalar scalar(Scalar n)
    {
        const __m128i shuffleMask = (__m128i)(v8i16){0, 0, 0, 0, 4, 5, 6, 7};
        return __lsx_vshuf_h(shuffleMask, __lsx_vldi(0), __lsx_vinsgr2vr_w(__lsx_vldi(0), n, 0));
    }
    static OptimalType multiplyAlpha8bit(OptimalType val, uint8_t a)
    {
        return multiplyAlpha255(val, a);
    }
    // same as above:
    // static OptimalScalar multiplyAlpha8bit(OptimalScalar a, uint8_t a)
    static OptimalType interpolate8bit(OptimalType x, uint8_t a1, OptimalType y, uint8_t a2)
    {
        return interpolate255(x, a1, y, a2);
    }
    static OptimalType multiplyAlpha(OptimalType val, OptimalScalar a)
    {
        return multiplyAlpha65535(val, a);
    }
    static OptimalType interpolate(OptimalType x, OptimalScalar a1, OptimalType y, const OptimalScalar &a2)
    {
        return interpolate65535(x, a1, y, a2);
    }
};
#endif
#if defined(__SSE2__)
typedef Rgba64OperationsSSE2 Rgba64Operations;
#elif defined(__ARM_NEON__)
typedef Rgba64OperationsNEON Rgba64Operations;
#elif defined(__loongarch_sx)
typedef Rgba64OperationsLSX Rgba64Operations;
#else
typedef Rgba64OperationsC Rgba64Operations;
#endif
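The alias chain above keeps the generic RGBA64 blending code ISA-agnostic. A hypothetical caller sketch (addPixelSaturating is illustrative, not part of the patch; Type comes from Rgba64OperationsBase):

// Generic code selects the widest available implementation through the
// Rgba64Operations alias and never names SSE2/NEON/LSX directly.
static inline void addPixelSaturating(Rgba64Operations::Type *dst,
                                      const Rgba64Operations::Type *src)
{
    using Ops = Rgba64Operations;
    const Ops::OptimalType a = Ops::load(dst);  // widen one pixel into a vector register
    const Ops::OptimalType b = Ops::load(src);
    Ops::store(dst, Ops::plus(a, b));           // per-channel saturating 16-bit add
}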


@@ -1251,6 +1251,45 @@ static inline uint interpolate_4_pixels_16(uint tl, uint tr, uint bl, uint br, u
}
#endif
#if defined(__loongarch_sx)
static inline void interpolate_4_pixels_16_lsx(__m128i tl, __m128i tr, __m128i bl, __m128i br,
                                               __m128i distx, __m128i disty, uint *b)
{
    const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff);
    const __m128i v_256 = __lsx_vreplgr2vr_h(256);
    const __m128i dxdy = __lsx_vmul_h(distx, disty);
    const __m128i distx_ = __lsx_vslli_h(distx, 4);
    const __m128i disty_ = __lsx_vslli_h(disty, 4);
    const __m128i idxidy = __lsx_vadd_h(dxdy, __lsx_vsub_h(v_256, __lsx_vadd_h(distx_, disty_)));
    const __m128i dxidy = __lsx_vsub_h(distx_, dxdy);
    const __m128i idxdy = __lsx_vsub_h(disty_, dxdy);

    __m128i tlAG = __lsx_vsrli_h(tl, 8);
    __m128i tlRB = __lsx_vand_v(tl, colorMask);
    __m128i trAG = __lsx_vsrli_h(tr, 8);
    __m128i trRB = __lsx_vand_v(tr, colorMask);
    __m128i blAG = __lsx_vsrli_h(bl, 8);
    __m128i blRB = __lsx_vand_v(bl, colorMask);
    __m128i brAG = __lsx_vsrli_h(br, 8);
    __m128i brRB = __lsx_vand_v(br, colorMask);

    tlAG = __lsx_vmul_h(tlAG, idxidy);
    tlRB = __lsx_vmul_h(tlRB, idxidy);
    trAG = __lsx_vmul_h(trAG, dxidy);
    trRB = __lsx_vmul_h(trRB, dxidy);
    blAG = __lsx_vmul_h(blAG, idxdy);
    blRB = __lsx_vmul_h(blRB, idxdy);
    brAG = __lsx_vmul_h(brAG, dxdy);
    brRB = __lsx_vmul_h(brRB, dxdy);

    __m128i rAG = __lsx_vadd_h(__lsx_vadd_h(tlAG, trAG), __lsx_vadd_h(blAG, brAG));
    __m128i rRB = __lsx_vadd_h(__lsx_vadd_h(tlRB, trRB), __lsx_vadd_h(blRB, brRB));
    rAG = __lsx_vandn_v(colorMask, rAG);
    rRB = __lsx_vsrli_h(rRB, 8);
    __lsx_vst(__lsx_vor_v(rAG, rRB), b, 0);
}
#endif
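For reference, the weight algebra above derives all four bilinear factors from the 4-bit fractions with a single multiply: idxidy = dx*dy + 256 - 16*dx - 16*dy equals (16-dx)*(16-dy), and similarly for the mixed terms. A scalar sketch of the same blend on one 8-bit channel (illustrative, not a Qt helper):

static inline unsigned interpolateChannel16(unsigned tl, unsigned tr, unsigned bl,
                                            unsigned br, unsigned dx, unsigned dy)
{
    // dx, dy are 4-bit fractions in [0, 16]; the four weights sum to 256
    const unsigned idx = 16 - dx, idy = 16 - dy;
    return (tl*idx*idy + tr*dx*idy + bl*idx*dy + br*dx*dy) >> 8;
}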
template<TextureBlendType blendType>
void fetchTransformedBilinear_pixelBounds(int max, int l1, int l2, int &v1, int &v2);
@@ -1426,6 +1465,36 @@ static void QT_FASTCALL fetchTransformedBilinearARGB32PM_simple_scale_helper(uin
        rRB = vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(rRB), 8));
        vst1q_s16((int16_t*)(&intermediate.buffer_rb[f]), rRB);
    }
#elif defined(__loongarch_sx)
    const __m128i disty_ = __lsx_vreplgr2vr_h(disty);
    const __m128i idisty_ = __lsx_vreplgr2vr_h(idisty);
    const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff);

    lim -= 3;
    for (; f < lim; x += 4, f += 4) {
        // Load 4 pixels from s1, and split the alpha-green and red-blue components
        __m128i top = __lsx_vld((const __m128i*)((const uint *)(s1)+x), 0);
        __m128i topAG = __lsx_vsrli_h(top, 8);
        __m128i topRB = __lsx_vand_v(top, colorMask);
        // Multiply each color component by idisty
        topAG = __lsx_vmul_h(topAG, idisty_);
        topRB = __lsx_vmul_h(topRB, idisty_);

        // Same for the s2 vector
        __m128i bottom = __lsx_vld((const __m128i*)((const uint *)(s2)+x), 0);
        __m128i bottomAG = __lsx_vsrli_h(bottom, 8);
        __m128i bottomRB = __lsx_vand_v(bottom, colorMask);
        bottomAG = __lsx_vmul_h(bottomAG, disty_);
        bottomRB = __lsx_vmul_h(bottomRB, disty_);

        // Add the values, and shift to keep only 8 significant bits per color
        __m128i rAG = __lsx_vadd_h(topAG, bottomAG);
        rAG = __lsx_vsrli_h(rAG, 8);
        __lsx_vst(rAG, (__m128i*)(&intermediate.buffer_ag[f]), 0);
        __m128i rRB = __lsx_vadd_h(topRB, bottomRB);
        rRB = __lsx_vsrli_h(rRB, 8);
        __lsx_vst(rRB, (__m128i*)(&intermediate.buffer_rb[f]), 0);
    }
#endif
    }
    for (; f < count; f++) { // Same as above but without simd
@@ -1615,6 +1684,33 @@ static void QT_FASTCALL fetchTransformedBilinearARGB32PM_downscale_helper(uint *
        b += 4;
        v_fx = vaddq_s32(v_fx, v_fdx);
    }
#elif defined (__loongarch_sx)
    const __m128i shuffleMask = (__m128i)(v8i16){0, 0, 2, 2, 4, 4, 6, 6};
    const __m128i v_disty = __lsx_vreplgr2vr_h(disty4);
    const __m128i v_fdx = __lsx_vreplgr2vr_w(fdx*4);
    const __m128i v_fx_r = __lsx_vreplgr2vr_w(0x8);
    __m128i v_fx = (__m128i)(v4i32){fx, fx + fdx, fx + fdx + fdx, fx + fdx + fdx + fdx};

    while (b < boundedEnd - 3) {
        __m128i offset = __lsx_vsrli_w(v_fx, 16);
        const int offset0 = __lsx_vpickve2gr_w(offset, 0);
        const int offset1 = __lsx_vpickve2gr_w(offset, 1);
        const int offset2 = __lsx_vpickve2gr_w(offset, 2);
        const int offset3 = __lsx_vpickve2gr_w(offset, 3);
        const __m128i tl = (__m128i)(v4u32){s1[offset0], s1[offset1], s1[offset2], s1[offset3]};
        const __m128i tr = (__m128i)(v4u32){s1[offset0 + 1], s1[offset1 + 1], s1[offset2 + 1], s1[offset3 + 1]};
        const __m128i bl = (__m128i)(v4u32){s2[offset0], s2[offset1], s2[offset2], s2[offset3]};
        const __m128i br = (__m128i)(v4u32){s2[offset0 + 1], s2[offset1 + 1], s2[offset2 + 1], s2[offset3 + 1]};
        __m128i v_distx = __lsx_vsrli_h(v_fx, 8);
        v_distx = __lsx_vsrli_h(__lsx_vadd_w(v_distx, v_fx_r), 4);
        v_distx = __lsx_vshuf_h(shuffleMask, v_distx, v_distx);
        interpolate_4_pixels_16_lsx(tl, tr, bl, br, v_distx, v_disty, b);
        b += 4;
        v_fx = __lsx_vadd_w(v_fx, v_fdx);
    }
    fx = __lsx_vpickve2gr_w(v_fx, 0);
#endif
    while (b < boundedEnd) {
        int x = (fx >> 16);
@@ -1852,6 +1948,50 @@ static void QT_FASTCALL fetchTransformedBilinearARGB32PM_fast_rotate_helper(uint
        v_fx = vaddq_s32(v_fx, v_fdx);
        v_fy = vaddq_s32(v_fy, v_fdy);
    }
#elif defined(__loongarch_sx)
    const __m128i v_fdx = __lsx_vreplgr2vr_w(fdx*4);
    const __m128i v_fdy = __lsx_vreplgr2vr_w(fdy*4);
    const __m128i v_fxy_r = __lsx_vreplgr2vr_w(0x8);
    __m128i v_fx = (__m128i)(v4i32){fx, fx + fdx, fx + fdx + fdx, fx + fdx + fdx + fdx};
    __m128i v_fy = (__m128i)(v4i32){fy, fy + fdy, fy + fdy + fdy, fy + fdy + fdy + fdy};

    const uchar *textureData = image.imageData;
    const qsizetype bytesPerLine = image.bytesPerLine;
    const __m128i zero = __lsx_vldi(0);
    const __m128i shuffleMask = (__m128i)(v8i16){0, 0, 0, 0, 4, 5, 6, 7};
    const __m128i shuffleMask1 = (__m128i)(v8i16){0, 0, 2, 2, 4, 4, 6, 6};
    const __m128i vbpl = __lsx_vshuf_h(shuffleMask, zero, __lsx_vinsgr2vr_w(zero, bytesPerLine/4, 0));

    while (b < boundedEnd - 3) {
        const __m128i vy = __lsx_vpickev_h(zero, __lsx_vsat_w(__lsx_vsrli_w(v_fy, 16), 15));
        // 4x16bit * 4x16bit -> 4x32bit
        __m128i offset = __lsx_vilvl_h(__lsx_vmuh_h(vy, vbpl), __lsx_vmul_h(vy, vbpl));
        offset = __lsx_vadd_w(offset, __lsx_vsrli_w(v_fx, 16));
        const int offset0 = __lsx_vpickve2gr_w(offset, 0);
        const int offset1 = __lsx_vpickve2gr_w(offset, 1);
        const int offset2 = __lsx_vpickve2gr_w(offset, 2);
        const int offset3 = __lsx_vpickve2gr_w(offset, 3);
        const uint *topData = (const uint *)(textureData);
        const __m128i tl = (__m128i)(v4u32){topData[offset0], topData[offset1], topData[offset2], topData[offset3]};
        const __m128i tr = (__m128i)(v4u32){topData[offset0 + 1], topData[offset1 + 1], topData[offset2 + 1], topData[offset3 + 1]};
        const uint *bottomData = (const uint *)(textureData + bytesPerLine);
        const __m128i bl = (__m128i)(v4u32){bottomData[offset0], bottomData[offset1], bottomData[offset2], bottomData[offset3]};
        const __m128i br = (__m128i)(v4u32){bottomData[offset0 + 1], bottomData[offset1 + 1], bottomData[offset2 + 1], bottomData[offset3 + 1]};
        __m128i v_distx = __lsx_vsrli_h(v_fx, 8);
        __m128i v_disty = __lsx_vsrli_h(v_fy, 8);
        v_distx = __lsx_vsrli_h(__lsx_vadd_w(v_distx, v_fxy_r), 4);
        v_disty = __lsx_vsrli_h(__lsx_vadd_w(v_disty, v_fxy_r), 4);
        v_distx = __lsx_vshuf_h(shuffleMask1, zero, v_distx);
        v_disty = __lsx_vshuf_h(shuffleMask1, zero, v_disty);
        interpolate_4_pixels_16_lsx(tl, tr, bl, br, v_distx, v_disty, b);
        b += 4;
        v_fx = __lsx_vadd_w(v_fx, v_fdx);
        v_fy = __lsx_vadd_w(v_fy, v_fdy);
    }
    fx = __lsx_vpickve2gr_w(v_fx, 0);
    fy = __lsx_vpickve2gr_w(v_fy, 0);
#endif
    while (b < boundedEnd) {
        int x = (fx >> 16);


@@ -1141,6 +1141,108 @@ static inline void qConvertARGB32PMToRGBA64PM_neon(QRgba64 *buffer, const uint *
        *buffer++ = QRgba64::fromArgb32(s);
    }
}
#elif defined __loongarch_sx
template<bool RGBA, bool maskAlpha>
static inline void qConvertARGB32PMToRGBA64PM_lsx(QRgba64 *buffer, const uint *src, int count)
{
    if (count <= 0)
        return;

    const __m128i amask = __lsx_vreplgr2vr_w(0xff000000);
    const __m128i shuffleMask = (__m128i)(v8i16){2, 1, 0, 3, 6, 5, 4, 7};
    int i = 0;
    for (; ((uintptr_t)buffer & 0xf) && i < count; ++i) {
        uint s = *src++;
        if (maskAlpha)
            s = s | 0xff000000;
        if (RGBA)
            s = RGBA2ARGB(s);
        *buffer++ = QRgba64::fromArgb32(s);
    }
    for (; i < count-3; i += 4) {
        __m128i vs = __lsx_vld((const __m128i*)src, 0);
        if (maskAlpha)
            vs = __lsx_vor_v(vs, amask);
        src += 4;
        __m128i v1 = __lsx_vilvl_b(vs, vs);
        __m128i v2 = __lsx_vilvh_b(vs, vs);
        if (!RGBA) {
            v1 = __lsx_vshuf_h(shuffleMask, v1, v1);
            v2 = __lsx_vshuf_h(shuffleMask, v2, v2);
        }
        __lsx_vst(v1, buffer, 0);
        buffer += 2;
        __lsx_vst(v2, buffer, 0);
        buffer += 2;
    }
    SIMD_EPILOGUE(i, count, 3) {
        uint s = *src++;
        if (maskAlpha)
            s = s | 0xff000000;
        if (RGBA)
            s = RGBA2ARGB(s);
        *buffer++ = QRgba64::fromArgb32(s);
    }
}
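The byte self-interleave above (__lsx_vilvl_b(vs, vs)) widens each 8-bit channel by duplicating it, which is exact 8-to-16-bit range scaling, since c | (c << 8) == c * 257 maps 0xff to 0xffff. A scalar equivalent (illustrative):

static inline unsigned short widen8To16(unsigned char c)
{
    return (unsigned short)(c * 257);  // == (c << 8) | c
}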
template<QtPixelOrder PixelOrder>
static inline void qConvertRGBA64PMToA2RGB30PM_lsx(uint *dest, const QRgba64 *buffer, int count)
{
    const __m128i gmask = __lsx_vreplgr2vr_w(0x000ffc00);
    const __m128i cmask = __lsx_vreplgr2vr_w(0x000003ff);
    int i = 0;
    __m128i vr, vg, vb, va;
    for (; i < count && uintptr_t(buffer) & 0xF; ++i) {
        *dest++ = qConvertRgb64ToRgb30<PixelOrder>(*buffer++);
    }
    for (; i < count-15; i += 16) {
        // OR and AND the alphas of 16 pixels to detect all-opaque / all-transparent runs
        __m128i vOr = __lsx_vreplgr2vr_w(0);
        __m128i vAnd = __lsx_vreplgr2vr_w(0xffffffff);
        for (int j = 0; j < 16; j += 2) {
            __m128i vs = __lsx_vld((const __m128i*)(buffer + j), 0);
            vOr = __lsx_vor_v(vOr, vs);
            vAnd = __lsx_vand_v(vAnd, vs);
        }
        const quint16 orAlpha = ((uint)__lsx_vpickve2gr_h(vOr, 3)) | ((uint)__lsx_vpickve2gr_h(vOr, 7));
        const quint16 andAlpha = ((uint)__lsx_vpickve2gr_h(vAnd, 3)) & ((uint)__lsx_vpickve2gr_h(vAnd, 7));

        if (andAlpha == 0xffff) {
            for (int j = 0; j < 16; j += 2) {
                __m128i vs = __lsx_vld((const __m128i*)buffer, 0);
                buffer += 2;
                vr = __lsx_vsrli_d(vs, 6);
                vg = __lsx_vsrli_d(vs, 16 + 6 - 10);
                vb = __lsx_vsrli_d(vs, 32 + 6);
                vr = __lsx_vand_v(vr, cmask);
                vg = __lsx_vand_v(vg, gmask);
                vb = __lsx_vand_v(vb, cmask);
                va = __lsx_vsrli_d(vs, 48 + 14);
                if (PixelOrder == PixelOrderRGB)
                    vr = __lsx_vslli_w(vr, 20);
                else
                    vb = __lsx_vslli_w(vb, 20);
                va = __lsx_vslli_w(va, 30);
                __m128i vd = __lsx_vor_v(__lsx_vor_v(vr, vg), __lsx_vor_v(vb, va));
                vd = __lsx_vshuf4i_w(vd, 0b11011000);
                __lsx_vstelm_d(vd, dest, 0, 0);
                dest += 2;
            }
        } else if (orAlpha == 0) {
            for (int j = 0; j < 16; ++j) {
                *dest++ = 0;
                buffer++;
            }
        } else {
            for (int j = 0; j < 16; ++j)
                *dest++ = qConvertRgb64ToRgb30<PixelOrder>(*buffer++);
        }
    }
    SIMD_EPILOGUE(i, count, 15)
        *dest++ = qConvertRgb64ToRgb30<PixelOrder>(*buffer++);
}
#endif
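In the all-opaque fast path above, each 16-bit channel keeps only its top 10 bits and the alpha its top 2. A scalar sketch of the packing for PixelOrderRGB (illustrative, not the qConvertRgb64ToRgb30 helper itself):

static inline unsigned packA2Rgb30(unsigned short r, unsigned short g,
                                   unsigned short b, unsigned short a)
{
    return ((unsigned)(a >> 14) << 30) | ((unsigned)(r >> 6) << 20)
         | ((unsigned)(g >> 6) << 10) |  (unsigned)(b >> 6);
}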
static const QRgba64 *QT_FASTCALL convertRGB32ToRGB64(QRgba64 *buffer, const uint *src, int count,
@@ -1150,6 +1252,8 @@ static const QRgba64 *QT_FASTCALL convertRGB32ToRGB64(QRgba64 *buffer, const uin
    qConvertARGB32PMToRGBA64PM_sse2<false, true>(buffer, src, count);
#elif defined(__ARM_NEON__)
    qConvertARGB32PMToRGBA64PM_neon<false, true>(buffer, src, count);
#elif defined(__loongarch_sx)
    qConvertARGB32PMToRGBA64PM_lsx<false, true>(buffer, src, count);
#else
    for (int i = 0; i < count; ++i)
        buffer[i] = QRgba64::fromArgb32(0xff000000 | src[i]);
@@ -1184,6 +1288,8 @@ static const QRgba64 *QT_FASTCALL convertARGB32PMToRGBA64PM(QRgba64 *buffer, con
    qConvertARGB32PMToRGBA64PM_sse2<false, false>(buffer, src, count);
#elif defined(__ARM_NEON__)
    qConvertARGB32PMToRGBA64PM_neon<false, false>(buffer, src, count);
#elif defined(__loongarch_sx)
    qConvertARGB32PMToRGBA64PM_lsx<false, false>(buffer, src, count);
#else
    for (int i = 0; i < count; ++i)
        buffer[i] = QRgba64::fromArgb32(src[i]);
@@ -1238,6 +1344,8 @@ static const QRgba64 *QT_FASTCALL convertRGBA8888PMToRGBA64PM(QRgba64 *buffer, c
    qConvertARGB32PMToRGBA64PM_sse2<true, false>(buffer, src, count);
#elif defined(__ARM_NEON__)
    qConvertARGB32PMToRGBA64PM_neon<true, false>(buffer, src, count);
#elif defined(__loongarch_sx)
    qConvertARGB32PMToRGBA64PM_lsx<true, false>(buffer, src, count);
#else
    for (int i = 0; i < count; ++i)
        buffer[i] = QRgba64::fromArgb32(RGBA2ARGB(src[i]));
@@ -1348,6 +1456,48 @@ static inline void qConvertA2RGB30PMToRGBA64PM_sse2(QRgba64 *buffer, const uint
    SIMD_EPILOGUE(i, count, 3)
        *buffer++ = qConvertA2rgb30ToRgb64<PixelOrder>(*src++);
}
#elif defined(__loongarch_sx)
template<QtPixelOrder PixelOrder>
static inline void qConvertA2RGB30PMToRGBA64PM_lsx(QRgba64 *buffer, const uint *src, int count)
{
    if (count <= 0)
        return;

    const __m128i rmask = __lsx_vreplgr2vr_w(0x3ff00000);
    const __m128i gmask = __lsx_vreplgr2vr_w(0x000ffc00);
    const __m128i bmask = __lsx_vreplgr2vr_w(0x000003ff);
    const __m128i afactor = __lsx_vreplgr2vr_h(0x5555);
    int i = 0;

    for (; ((uintptr_t)buffer & 0xf) && i < count; ++i)
        *buffer++ = qConvertA2rgb30ToRgb64<PixelOrder>(*src++);

    for (; i < count-3; i += 4) {
        __m128i vs = __lsx_vld((const __m128i*)src, 0);
        src += 4;
        __m128i va = __lsx_vsrli_w(vs, 30);
        __m128i vr = __lsx_vand_v(vs, rmask);
        __m128i vb = __lsx_vand_v(vs, bmask);
        __m128i vg = __lsx_vand_v(vs, gmask);
        va = __lsx_vmul_h(va, afactor);
        vr = __lsx_vor_v(__lsx_vsrli_w(vr, 14), __lsx_vsrli_w(vr, 24));
        vg = __lsx_vor_v(__lsx_vsrli_w(vg, 4), __lsx_vsrli_w(vg, 14));
        vb = __lsx_vor_v(__lsx_vslli_w(vb, 6), __lsx_vsrli_w(vb, 4));
        __m128i vrb;
        if (PixelOrder == PixelOrderRGB)
            vrb = __lsx_vor_v(vr, __lsx_vbsll_v(vb, 2));
        else
            vrb = __lsx_vor_v(vb, __lsx_vbsll_v(vr, 2));
        __m128i vga = __lsx_vor_v(vg, __lsx_vbsll_v(va, 2));
        __lsx_vst(__lsx_vilvl_h(vga, vrb), buffer, 0);
        buffer += 2;
        __lsx_vst(__lsx_vilvh_h(vga, vrb), buffer, 0);
        buffer += 2;
    }

    SIMD_EPILOGUE(i, count, 3)
        *buffer++ = qConvertA2rgb30ToRgb64<PixelOrder>(*src++);
}
#endif
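The shift-and-or pairs above expand 10-bit channels to 16 bits exactly by replicating the high bits into the vacated low bits, and the 2-bit alpha is widened with a multiply by 0x5555 (0b11 * 0x5555 == 0xffff). A scalar sketch (illustrative):

static inline unsigned short expand10To16(unsigned c10)  // c10 in [0, 0x3ff]
{
    return (unsigned short)((c10 << 6) | (c10 >> 4));    // 0x3ff -> 0xffff
}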
template<QtPixelOrder PixelOrder>
@@ -1356,6 +1506,8 @@ static const QRgba64 *QT_FASTCALL convertA2RGB30PMToRGBA64PM(QRgba64 *buffer, co
{
#ifdef __SSE2__
    qConvertA2RGB30PMToRGBA64PM_sse2<PixelOrder>(buffer, src, count);
#elif defined (__loongarch_sx)
    qConvertA2RGB30PMToRGBA64PM_lsx<PixelOrder>(buffer, src, count);
#else
    for (int i = 0; i < count; ++i)
        buffer[i] = qConvertA2rgb30ToRgb64<PixelOrder>(src[i]);
@@ -1466,6 +1618,37 @@ void qt_convertRGBA64ToARGB32(uint *dst, const QRgba64 *src, int count)
        _mm_storel_epi64((__m128i*)(dst), v1);
        dst += 2;
    }
#elif defined(__loongarch_sx)
    if (((uintptr_t)dst & 0x7) && count > 0) {
        uint s = (*src++).toArgb32();
        if (RGBA)
            s = ARGB2RGBA(s);
        *dst++ = s;
        i++;
    }
    const __m128i vhalf = __lsx_vreplgr2vr_w(0x80);
    const __m128i vzero = __lsx_vldi(0);
    const __m128i shuffleMask = (__m128i)(v8i16){2, 1, 0, 3, 6, 5, 4, 7};
    for (; i < count-1; i += 2) {
        __m128i vs = __lsx_vld((const __m128i*)src, 0);
        src += 2;
        if (!RGBA) {
            vs = __lsx_vshuf_h(shuffleMask, vzero, vs);
        }
        __m128i v1 = __lsx_vilvl_h(vzero, vs);
        __m128i v2 = __lsx_vilvh_h(vzero, vs);
        v1 = __lsx_vadd_w(v1, vhalf);
        v2 = __lsx_vadd_w(v2, vhalf);
        v1 = __lsx_vsub_w(v1, __lsx_vsrli_w(v1, 8));
        v2 = __lsx_vsub_w(v2, __lsx_vsrli_w(v2, 8));
        v1 = __lsx_vsrli_w(v1, 8);
        v2 = __lsx_vsrli_w(v2, 8);
        v1 = __lsx_vpickev_h(__lsx_vsat_w(v2, 15), __lsx_vsat_w(v1, 15));
        v1 = __lsx_vmaxi_h(v1, 0);
        v1 = __lsx_vpickev_b(vzero, __lsx_vsat_hu(v1, 7));
        __lsx_vstelm_d(v1, dst, 0, 0);
        dst += 2;
    }
#endif
    for (; i < count; i++) {
        uint s = (*src++).toArgb32();
@@ -1902,6 +2085,8 @@ static void QT_FASTCALL storeRGB30FromRGBA64PM(uchar *dest, const QRgba64 *src,
    uint *d = (uint*)dest + index;
#ifdef __SSE2__
    qConvertRGBA64PMToA2RGB30PM_sse2<PixelOrder>(d, src, count);
#elif defined (__loongarch_sx)
    qConvertRGBA64PMToA2RGB30PM_lsx<PixelOrder>(d, src, count);
#else
    for (int i = 0; i < count; ++i)
        d[i] = qConvertRgb64ToRgb30<PixelOrder>(src[i]);


@@ -57,6 +57,24 @@ static inline uint16x4_t multiplyAlpha65535(uint16x4_t rgba64, uint alpha65535)
    vs32 = vsraq_n_u32(vs32, vs32, 16); // vs = vs + (vs >> 16)
    return vrshrn_n_u32(vs32, 16); // vs = (vs + 0x8000) >> 16
}
#elif defined(__loongarch_sx)
static inline __m128i Q_DECL_VECTORCALL multiplyAlpha65535(__m128i rgba64, __m128i va)
{
    __m128i vs = rgba64;
    vs = __lsx_vilvl_h(__lsx_vmuh_hu(vs, va), __lsx_vmul_h(vs, va));
    vs = __lsx_vadd_w(vs, __lsx_vsrli_w(vs, 16));
    vs = __lsx_vadd_w(vs, __lsx_vreplgr2vr_w(0x8000));
    vs = __lsx_vsrai_w(vs, 16);
    vs = __lsx_vpickev_h(__lsx_vsat_w(vs, 15), __lsx_vsat_w(vs, 15));
    return vs;
}
static inline __m128i Q_DECL_VECTORCALL multiplyAlpha65535(__m128i rgba64, uint alpha65535)
{
    const __m128i shuffleMask = (__m128i)(v8i16){0, 0, 0, 0, 4, 5, 6, 7};
    const __m128i va = __lsx_vshuf_h(shuffleMask, __lsx_vldi(0),
                                     __lsx_vinsgr2vr_w(__lsx_vldi(0), alpha65535, 0));
    return multiplyAlpha65535(rgba64, va);
}
#endif
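The add-shift sequence above is the standard fixed-point substitute for a division by 65535. A scalar sketch of the same rounding (illustrative; it mirrors what qt_div_65535 computes):

static inline unsigned short mulDiv65535(unsigned c, unsigned a)  // c, a in [0, 65535]
{
    unsigned t = c * a;
    return (unsigned short)((t + (t >> 16) + 0x8000u) >> 16);  // rounded t / 65535
}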
static inline QRgba64 multiplyAlpha65535(QRgba64 rgba64, uint alpha65535)
@@ -73,6 +91,12 @@ static inline QRgba64 multiplyAlpha65535(QRgba64 rgba64, uint alpha65535)
    QRgba64 r;
    vst1_u64(reinterpret_cast<uint64_t *>(&r), vreinterpret_u64_u16(vr));
    return r;
#elif defined(__loongarch_sx)
    const __m128i v = __lsx_vldrepl_d(reinterpret_cast<const __m128i *>(&rgba64), 0);
    const __m128i vr = multiplyAlpha65535(v, alpha65535);
    QRgba64 r;
    __lsx_vstelm_d(vr, reinterpret_cast<__m128i *>(&r), 0, 0);
    return r;
#else
    return QRgba64::fromRgba64(qt_div_65535(rgba64.red() * alpha65535),
                               qt_div_65535(rgba64.green() * alpha65535),
@@ -81,7 +105,7 @@ static inline QRgba64 multiplyAlpha65535(QRgba64 rgba64, uint alpha65535)
#endif
}
-#if defined(__SSE2__) || defined(__ARM_NEON__)
+#if defined(__SSE2__) || defined(__ARM_NEON__) || defined(__loongarch_sx)
template<typename T>
static inline T Q_DECL_VECTORCALL multiplyAlpha255(T rgba64, uint alpha255)
{
@@ -112,6 +136,14 @@ inline uint16x4_t interpolate255(uint16x4_t x, uint alpha1, uint16x4_t y, uint a
}
#endif
#if defined __loongarch_sx
static inline __m128i Q_DECL_VECTORCALL
interpolate255(__m128i x, uint alpha1, __m128i y, uint alpha2)
{
    return __lsx_vadd_h(multiplyAlpha255(x, alpha1), multiplyAlpha255(y, alpha2));
}
#endif
static inline QRgba64 interpolate255(QRgba64 x, uint alpha1, QRgba64 y, uint alpha2)
{
#if defined(__SSE2__)
@@ -128,6 +160,13 @@ static inline QRgba64 interpolate255(QRgba64 x, uint alpha1, QRgba64 y, uint alp
    QRgba64 r;
    vst1_u64(reinterpret_cast<uint64_t *>(&r), vreinterpret_u64_u16(vr));
    return r;
#elif defined(__loongarch_sx)
    const __m128i vx = __lsx_vldrepl_d(reinterpret_cast<const __m128i *>(&x), 0);
    const __m128i vy = __lsx_vldrepl_d(reinterpret_cast<const __m128i *>(&y), 0);
    const __m128i vr = interpolate255(vx, alpha1, vy, alpha2);
    QRgba64 r;
    __lsx_vstelm_d(vr, reinterpret_cast<__m128i *>(&r), 0, 0);
    return r;
#else
    return QRgba64::fromRgba64(multiplyAlpha255(x, alpha1) + multiplyAlpha255(y, alpha2));
#endif
@@ -156,6 +195,18 @@ inline uint16x4_t interpolate65535(uint16x4_t x, uint16x4_t alpha1, uint16x4_t y
}
#endif
#if defined __loongarch_sx
static inline __m128i Q_DECL_VECTORCALL interpolate65535(__m128i x, uint alpha1, __m128i y, uint alpha2)
{
    return __lsx_vadd_h(multiplyAlpha65535(x, alpha1), multiplyAlpha65535(y, alpha2));
}
static inline __m128i Q_DECL_VECTORCALL interpolate65535(__m128i x, __m128i alpha1, __m128i y, __m128i alpha2)
{
    return __lsx_vadd_h(multiplyAlpha65535(x, alpha1), multiplyAlpha65535(y, alpha2));
}
#endif
static inline QRgba64 interpolate65535(QRgba64 x, uint alpha1, QRgba64 y, uint alpha2)
{
#if defined(__SSE2__)
@@ -172,6 +223,13 @@ static inline QRgba64 interpolate65535(QRgba64 x, uint alpha1, QRgba64 y, uint a
    QRgba64 r;
    vst1_u64(reinterpret_cast<uint64_t *>(&r), vreinterpret_u64_u16(vr));
    return r;
#elif defined(__loongarch_sx)
    const __m128i vx = __lsx_vldrepl_d(reinterpret_cast<const __m128i *>(&x), 0);
    const __m128i vy = __lsx_vldrepl_d(reinterpret_cast<const __m128i *>(&y), 0);
    const __m128i vr = interpolate65535(vx, alpha1, vy, alpha2);
    QRgba64 r;
    __lsx_vstelm_d(vr, reinterpret_cast<__m128i *>(&r), 0, 0);
    return r;
#else
    return QRgba64::fromRgba64(multiplyAlpha65535(x, alpha1) + multiplyAlpha65535(y, alpha2));
#endif
@@ -192,6 +250,13 @@ static inline QRgba64 addWithSaturation(QRgba64 a, QRgba64 b)
    QRgba64 r;
    vst1_u64(reinterpret_cast<uint64_t *>(&r), vreinterpret_u64_u16(vqadd_u16(va, vb)));
    return r;
#elif defined(__loongarch_sx)
    const __m128i va = __lsx_vldrepl_d(reinterpret_cast<const __m128i *>(&a), 0);
    const __m128i vb = __lsx_vldrepl_d(reinterpret_cast<const __m128i *>(&b), 0);
    const __m128i vr = __lsx_vsadd_hu(va, vb);
    QRgba64 r;
    __lsx_vstelm_d(vr, reinterpret_cast<__m128i *>(&r), 0, 0);
    return r;
#else
    return QRgba64::fromRgba64(qMin(a.red() + b.red(), 65535),
@@ -221,6 +286,18 @@ static inline uint toArgb32(uint16x4_t v)
    uint8x8_t v8 = vmovn_u16(vcombine_u16(v, v));
    return vget_lane_u32(vreinterpret_u32_u8(v8), 0);
}
#elif defined __loongarch_sx
static inline uint Q_DECL_VECTORCALL toArgb32(__m128i v)
{
    v = __lsx_vilvl_h(__lsx_vldi(0), v);
    v = __lsx_vadd_w(v, __lsx_vreplgr2vr_w(128));
    v = __lsx_vsub_w(v, __lsx_vsrli_w(v, 8));
    v = __lsx_vsrli_w(v, 8);
    v = __lsx_vpickev_h(__lsx_vsat_w(v, 15), __lsx_vsat_w(v, 15));
    __m128i tmp = __lsx_vmaxi_h(v, 0);
    v = __lsx_vpickev_b(__lsx_vsat_hu(tmp, 7), __lsx_vsat_hu(tmp, 7));
    return __lsx_vpickve2gr_w(v, 0);
}
#endif
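The add/subtract/shift sequence above narrows each 16-bit channel to 8 bits with rounding, i.e. a rounded division by 257 (the inverse of the widening by 257 used in the conversions to RGBA64). A scalar sketch (illustrative):

static inline unsigned char narrow16To8(unsigned short v)
{
    unsigned t = v + 128u;                        // bias for rounding
    return (unsigned char)((t - (t >> 8)) >> 8);  // rounded v / 257; 0xffff -> 0xff
}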
static inline uint toArgb32(QRgba64 rgba64)
@@ -238,6 +315,11 @@ static inline uint toArgb32(QRgba64 rgba64)
    v = vext_u16(v, v, 3);
#endif
    return toArgb32(v);
#elif defined __loongarch_sx
    __m128i v = __lsx_vldrepl_d(reinterpret_cast<const __m128i *>(&rgba64), 0);
    const __m128i shuffleMask = (__m128i)(v8i16){2, 1, 0, 3, 4, 5, 6, 7};
    v = __lsx_vshuf_h(shuffleMask, __lsx_vldi(0), v);
    return toArgb32(v);
#else
    return rgba64.toArgb32();
#endif
@@ -251,6 +333,9 @@ static inline uint toRgba8888(QRgba64 rgba64)
#elif defined __ARM_NEON__
    uint16x4_t v = vreinterpret_u16_u64(vld1_u64(reinterpret_cast<const uint64_t *>(&rgba64)));
    return toArgb32(v);
#elif defined __loongarch_sx
    __m128i v = __lsx_vldrepl_d(reinterpret_cast<const __m128i *>(&rgba64), 0);
    return toArgb32(v);
#else
    return ARGB2RGBA(toArgb32(rgba64));
#endif
@@ -289,6 +374,23 @@ static inline QRgba64 rgbBlend(QRgba64 d, QRgba64 s, uint rgbAlpha)
    vd32 = vsraq_n_u32(vd32, vd32, 16);
    vd = vrshrn_n_u32(vd32, 16);
    vst1_u64(reinterpret_cast<uint64_t *>(&blend), vreinterpret_u64_u16(vd));
#elif defined(__loongarch_sx)
    __m128i vd = __lsx_vldrepl_d(reinterpret_cast<const __m128i *>(&d), 0);
    __m128i vs = __lsx_vldrepl_d(reinterpret_cast<const __m128i *>(&s), 0);
    __m128i va = __lsx_vinsgr2vr_w(__lsx_vldi(0), rgbAlpha, 0);
    va = __lsx_vilvl_b(va, va);
    const __m128i shuffleMask = (__m128i)(v8i16){2, 1, 0, 3, 4, 5, 6, 7};
    va = __lsx_vshuf_h(shuffleMask, __lsx_vldi(0), va);
    __m128i vb = __lsx_vxor_v(__lsx_vreplgr2vr_h(-1), va);
    vs = __lsx_vilvl_h(__lsx_vmuh_hu(vs, va), __lsx_vmul_h(vs, va));
    vd = __lsx_vilvl_h(__lsx_vmuh_hu(vd, vb), __lsx_vmul_h(vd, vb));
    vd = __lsx_vadd_w(vd, vs);
    vd = __lsx_vadd_w(vd, __lsx_vsrli_w(vd, 16));
    vd = __lsx_vadd_w(vd, __lsx_vreplgr2vr_w(0x8000));
    vd = __lsx_vsrai_w(vd, 16);
    vd = __lsx_vpickev_h(__lsx_vsat_w(vd, 15), __lsx_vsat_w(vd, 15));
    __lsx_vstelm_d(vd, reinterpret_cast<__m128i *>(&blend), 0, 0);
#else
    const int mr = qRed(rgbAlpha);
    const int mg = qGreen(rgbAlpha);
@@ -318,6 +420,13 @@ static inline void blend_pixel(QRgba64 &dst, QRgba64 src)
    const uint16x4_t via = veor_u16(vdup_n_u16(0xffff), vdup_lane_u16(vs, 3));
    const uint16x4_t vr = vadd_u16(vs, multiplyAlpha65535(vd, via));
    vst1_u64(reinterpret_cast<uint64_t *>(&dst), vreinterpret_u64_u16(vr));
#elif defined(__loongarch_sx)
    const __m128i vd = __lsx_vldrepl_d(reinterpret_cast<const __m128i *>(&dst), 0);
    const __m128i vs = __lsx_vldrepl_d(reinterpret_cast<const __m128i *>(&src), 0);
    const __m128i shuffleMask = (__m128i)(v8i16){3, 3, 3, 3, 4, 5, 6, 7};
    const __m128i via = __lsx_vxor_v(__lsx_vreplgr2vr_h(-1), __lsx_vshuf_h(shuffleMask, __lsx_vldi(0), vs));
    const __m128i vr = __lsx_vadd_h(vs, multiplyAlpha65535(vd, via));
    __lsx_vstelm_d(vr, reinterpret_cast<__m128i *>(&dst), 0, 0);
#else
    dst = src + multiplyAlpha65535(dst, 65535 - src.alpha());
#endif
@@ -343,6 +452,14 @@ static inline void blend_pixel(QRgba64 &dst, QRgba64 src, const int const_alpha)
    const uint16x4_t via = veor_u16(vdup_n_u16(0xffff), vdup_lane_u16(vs, 3));
    const uint16x4_t vr = vadd_u16(vs, multiplyAlpha65535(vd, via));
    vst1_u64(reinterpret_cast<uint64_t *>(&dst), vreinterpret_u64_u16(vr));
#elif defined(__loongarch_sx)
    const __m128i vd = __lsx_vldrepl_d(reinterpret_cast<const __m128i *>(&dst), 0);
    __m128i vs = __lsx_vldrepl_d(reinterpret_cast<const __m128i *>(&src), 0);
    vs = multiplyAlpha255(vs, const_alpha);
    const __m128i shuffleMask = (__m128i)(v8i16){3, 3, 3, 3, 4, 5, 6, 7};
    const __m128i via = __lsx_vxor_v(__lsx_vreplgr2vr_h(-1), __lsx_vshuf_h(shuffleMask, __lsx_vldi(0), vs));
    const __m128i vr = __lsx_vadd_h(vs, multiplyAlpha65535(vd, via));
    __lsx_vstelm_d(vr, reinterpret_cast<__m128i *>(&dst), 0, 0);
#else
    src = multiplyAlpha255(src, const_alpha);
    dst = src + multiplyAlpha65535(dst, 65535 - src.alpha());