Complete drawhelper functions with LSX

List of optimized implementations using LSX:

- qt_blend_argb32_on_argb32
- qt_blend_rgb32_on_rgb32
- comp_func_SourceOver
- comp_func_Plus
- comp_func_Source
- comp_func_solid_Source
- comp_func_solid_SourceOver
- qt_memfill64
- qt_memfill32
- qt_bitmapblit32
- qt_bitmapblit16
- qt_scale_image_argb32_on_argb32
- convert_RGB888_to_RGB32
- qt_qimageScaleAARGBA_up_x_down_y
- qt_qimageScaleAARGBA_down_x_up_y
- qt_qimageScaleAARGBA_down_xy

All of the above functions have passed the tests under tests/auto/gui.

Change-Id: I7ae6169305b81bdf7fb704619453c505f8bb960f
Reviewed-by: Volker Hilsheimer <volker.hilsheimer@qt.io>
This commit is contained in:
Chen Zhanwang 2024-06-21 17:05:49 +08:00 committed by Volker Hilsheimer
parent 73ce5a940a
commit d511a68684
11 changed files with 1347 additions and 6 deletions

View File

@ -183,6 +183,8 @@ qt_internal_add_module(Gui
painting/qdrawhelper_p.h
painting/qdrawhelper_x86_p.h
painting/qdrawingprimitive_sse2_p.h
painting/qdrawhelper_loongarch64_p.h
painting/qdrawingprimitive_lsx_p.h
painting/qemulationpaintengine.cpp painting/qemulationpaintengine_p.h
painting/qfixed_p.h
painting/qgrayraster.c painting/qgrayraster_p.h
@ -655,6 +657,13 @@ qt_internal_add_simd_part(Gui SIMD neon
painting/qimagescale_neon.cpp
)
qt_internal_add_simd_part(Gui SIMD lsx
SOURCES
image/qimage_lsx.cpp
painting/qdrawhelper_lsx.cpp
painting/qimagescale_lsx.cpp
)
if(NOT ANDROID)
qt_internal_add_simd_part(Gui SIMD mips_dsp
SOURCES

View File

@ -2754,6 +2754,18 @@ static void qInitImageConversions()
}
#endif
#if defined(QT_COMPILER_SUPPORTS_LSX)
if (qCpuHasFeature(LSX)) {
extern void convert_RGB888_to_RGB32_lsx(QImageData *dest, const QImageData *src, Qt::ImageConversionFlags);
qimage_converter_map[QImage::Format_RGB888][QImage::Format_RGB32] = convert_RGB888_to_RGB32_lsx;
qimage_converter_map[QImage::Format_RGB888][QImage::Format_ARGB32] = convert_RGB888_to_RGB32_lsx;
qimage_converter_map[QImage::Format_RGB888][QImage::Format_ARGB32_Premultiplied] = convert_RGB888_to_RGB32_lsx;
qimage_converter_map[QImage::Format_BGR888][QImage::Format_RGBX8888] = convert_RGB888_to_RGB32_lsx;
qimage_converter_map[QImage::Format_BGR888][QImage::Format_RGBA8888] = convert_RGB888_to_RGB32_lsx;
qimage_converter_map[QImage::Format_BGR888][QImage::Format_RGBA8888_Premultiplied] = convert_RGB888_to_RGB32_lsx;
}
#endif
#if defined(__ARM_NEON__)
extern void convert_RGB888_to_RGB32_neon(QImageData *dest, const QImageData *src, Qt::ImageConversionFlags);
qimage_converter_map[QImage::Format_RGB888][QImage::Format_RGB32] = convert_RGB888_to_RGB32_neon;

View File

@ -0,0 +1,115 @@
// Copyright (C) 2016 The Qt Company Ltd.
// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
#include <qimage.h>
#include <private/qimage_p.h>
#include <private/qsimd_p.h>
#ifdef QT_COMPILER_SUPPORTS_LSX
QT_BEGIN_NAMESPACE
// Convert a scanline of RGB888 (src) to RGB32 (dst)
// src must be at least len * 3 bytes
// dst must be at least len * 4 bytes
// Convert a scanline of RGB888 (src) to RGB32 (dst).
// src must be at least len * 3 bytes
// dst must be at least len * 4 bytes
//
// Layout note: an RGB888 pixel is 3 bytes (R, G, B in memory order); an RGB32
// pixel is 4 bytes (B, G, R, 0xFF on little-endian). The SIMD loop below
// converts 16 pixels (48 source bytes -> 64 destination bytes) per iteration.
Q_GUI_EXPORT void QT_FASTCALL qt_convert_rgb888_to_rgb32_lsx(quint32 *dst, const uchar *src, int len)
{
    int i = 0;

    // Prologue, align dst to 16 bytes (scalar conversion for the first few pixels).
    ALIGNMENT_PROLOGUE_16BYTES(dst, i, len) {
        dst[i] = qRgb(src[0], src[1], src[2]);
        src += 3;
    }

    // vshuf.b control semantics: an index byte 0-15 selects from the second
    // vector operand, 16-31 selects from the first. Index 16 therefore picks
    // alphaMask[0] == 0xff, producing the opaque alpha byte of each pixel.

    // Mask the 4 first colors of the RGB888 vector
    const __m128i shuffleMask = (__m128i)(v16i8){2, 1, 0, 16, 5, 4, 3, 16,
                                                 8, 7, 6, 16, 11, 10, 9, 16};
    // Mask the 4 last colors of a RGB888 vector with an offset of 1 (so the last 3 bytes are RGB)
    const __m128i shuffleMaskEnd = (__m128i)(v16i8){6, 5, 4, 16, 9, 8, 7, 16,
                                                    12, 11, 10, 16, 15, 14, 13, 16};
    // Mask to have alpha = 0xff
    const __m128i alphaMask = __lsx_vreplgr2vr_b(0xff);
    // Mask to concatenate 16-byte blocks in a and b into a 32-byte temporary result, shift the result right by 12 bytes
    // (equivalent of x86 palignr by 12: takes the last 4 bytes of the first block and the first 12 of the second)
    const __m128i indexMask1 = (__m128i)(v16i8){12, 13, 14, 15, 16, 17, 18, 19,
                                                20, 21, 22, 23, 24, 25, 26, 27};
    // Mask to concatenate 16-byte blocks in a and b into a 32-byte temporary result, shift the result right by 8 bytes
    const __m128i indexMask2 = (__m128i)(v16i8){8, 9, 10, 11, 12, 13, 14, 15,
                                                16, 17, 18, 19, 20, 21, 22, 23};

    const __m128i *inVectorPtr = (const __m128i *)src;
    __m128i *dstVectorPtr = (__m128i *)(dst + i);

    for (; i < (len - 15); i += 16) { // one iteration in the loop converts 16 pixels
        /*
         RGB888 has 5 pixels per vector, + 1 byte from the next pixel. The idea here is
         to load vectors of RGB888 and use palignr to select a vector out of two vectors.

         After 3 loads of RGB888 and 3 stores of RGB32, we have 4 pixels left in the last
         vector of RGB888, we can mask it directly to get a last store of RGB32. After that,
         the first next byte is a R, and we can loop for the next 16 pixels.

         The conversion itself is done with a byte permutation (vshuf_b).
        */
        __m128i firstSrcVector = __lsx_vld(inVectorPtr, 0);
        __m128i outputVector = __lsx_vshuf_b(alphaMask, firstSrcVector, shuffleMask);
        __lsx_vst(outputVector, dstVectorPtr, 0);
        ++inVectorPtr;
        ++dstVectorPtr;

        // There are 4 unused bytes left in srcVector, we need to load the next 16 bytes
        __m128i secondSrcVector = __lsx_vld(inVectorPtr, 0);
        __m128i srcVector = __lsx_vshuf_b(secondSrcVector, firstSrcVector, indexMask1);
        outputVector = __lsx_vshuf_b(alphaMask, srcVector, shuffleMask);
        __lsx_vst(outputVector, dstVectorPtr, 0);
        ++inVectorPtr;
        ++dstVectorPtr;
        firstSrcVector = secondSrcVector;

        // We now have 8 unused bytes left in firstSrcVector
        secondSrcVector = __lsx_vld(inVectorPtr, 0);
        srcVector = __lsx_vshuf_b(secondSrcVector, firstSrcVector, indexMask2);
        outputVector = __lsx_vshuf_b(alphaMask, srcVector, shuffleMask);
        __lsx_vst(outputVector, dstVectorPtr, 0);
        ++inVectorPtr;
        ++dstVectorPtr;

        // There are now 12 unused bytes in firstSrcVector.
        // We can mask them directly, almost there.
        outputVector = __lsx_vshuf_b(alphaMask, secondSrcVector, shuffleMaskEnd);
        __lsx_vst(outputVector, dstVectorPtr, 0);
        ++dstVectorPtr;
    }

    // Epilogue: scalar conversion for the trailing (len % 16) pixels.
    src = (const uchar *)inVectorPtr;
    SIMD_EPILOGUE(i, len, 15) {
        dst[i] = qRgb(src[0], src[1], src[2]);
        src += 3;
    }
}
// Full-image RGB888/BGR888 -> 32-bit conversion: validates the format pairing,
// then runs the LSX scanline converter over every row of the image.
void convert_RGB888_to_RGB32_lsx(QImageData *dest, const QImageData *src, Qt::ImageConversionFlags)
{
    Q_ASSERT(src->format == QImage::Format_RGB888 || src->format == QImage::Format_BGR888);
    if (src->format == QImage::Format_BGR888)
        Q_ASSERT(dest->format == QImage::Format_RGBX8888 || dest->format == QImage::Format_RGBA8888 || dest->format == QImage::Format_RGBA8888_Premultiplied);
    else
        Q_ASSERT(dest->format == QImage::Format_RGB32 || dest->format == QImage::Format_ARGB32 || dest->format == QImage::Format_ARGB32_Premultiplied);
    Q_ASSERT(src->width == dest->width);
    Q_ASSERT(src->height == dest->height);

    const uchar *inLine = (const uchar *) src->data;
    uchar *outLine = (uchar *) dest->data;
    // Walk both images scanline by scanline; strides may include padding,
    // so advance by bytes_per_line rather than width * pixel size.
    for (int rows = src->height; rows > 0; --rows) {
        qt_convert_rgb888_to_rgb32_lsx(reinterpret_cast<quint32 *>(outLine), inLine, src->width);
        inLine += src->bytes_per_line;
        outLine += dest->bytes_per_line;
    }
}
QT_END_NAMESPACE
#endif // QT_COMPILER_SUPPORTS_LSX

View File

@ -560,7 +560,7 @@ inline QImage::Format qt_opaqueVersionForPainting(QImage::Format format)
inline QImage::Format qt_alphaVersionForPainting(QImage::Format format)
{
QImage::Format toFormat = qt_alphaVersion(format);
#if defined(__ARM_NEON__) || defined(__SSE2__)
#if defined(__ARM_NEON__) || defined(__SSE2__) || defined(QT_COMPILER_SUPPORTS_LSX)
// If we are switching depth anyway and we have optimized ARGB32PM routines, upgrade to that.
if (qt_depthForFormat(format) != qt_depthForFormat(toFormat) && qt_depthForFormat(toFormat) <= 32)
toFormat = QImage::Format_ARGB32_Premultiplied;

View File

@ -12,6 +12,8 @@
#include <private/qdrawhelper_p.h>
#include <private/qdrawhelper_x86_p.h>
#include <private/qdrawingprimitive_sse2_p.h>
#include <private/qdrawhelper_loongarch64_p.h>
#include <private/qdrawingprimitive_lsx_p.h>
#include <private/qdrawhelper_neon_p.h>
#if defined(QT_COMPILER_SUPPORTS_MIPS_DSP) || defined(QT_COMPILER_SUPPORTS_MIPS_DSPR2)
#include <private/qdrawhelper_mips_dsp_p.h>
@ -4971,7 +4973,7 @@ void qBlendTexture(int count, const QT_FT_Span *spans, void *userData)
case QImage::Format_RGB16:
proc = processTextureSpansRGB16[blendType];
break;
#if defined(__SSE2__) || defined(__ARM_NEON__) || (Q_PROCESSOR_WORDSIZE == 8)
#if defined(__SSE2__) || defined(__ARM_NEON__) || defined(QT_COMPILER_SUPPORTS_LSX) || (Q_PROCESSOR_WORDSIZE == 8)
case QImage::Format_ARGB32:
case QImage::Format_RGBA8888:
#endif
@ -5113,7 +5115,7 @@ void qBlendGradient(int count, const QT_FT_Span *spans, void *userData)
if (isVerticalGradient && blend_vertical_gradient_argb(count, spans, userData))
return;
return blend_src_generic(count, spans, userData);
#if defined(__SSE2__) || defined(__ARM_NEON__) || (Q_PROCESSOR_WORDSIZE == 8)
#if defined(__SSE2__) || defined(__ARM_NEON__) || defined(QT_COMPILER_SUPPORTS_LSX) || (Q_PROCESSOR_WORDSIZE == 8)
case QImage::Format_ARGB32:
case QImage::Format_RGBA8888:
#endif
@ -6368,7 +6370,7 @@ DrawHelper qDrawHelper[] =
static_assert(std::size(qDrawHelper) == QImage::NImageFormats);
#if !defined(Q_PROCESSOR_X86)
#if !defined(Q_PROCESSOR_X86) && !defined(QT_COMPILER_SUPPORTS_LSX)
void qt_memfill64(quint64 *dest, quint64 color, qsizetype count)
{
qt_memfill_template<quint64>(dest, color, count);
@ -6435,7 +6437,7 @@ void qt_memfill16(quint16 *dest, quint16 value, qsizetype count)
qt_memfill32(reinterpret_cast<quint32*>(dest), value32, count / 2);
}
#if defined(Q_PROCESSOR_X86)
#if defined(Q_PROCESSOR_X86) || defined(QT_COMPILER_SUPPORTS_LSX)
void (*qt_memfill32)(quint32 *dest, quint32 value, qsizetype count) = nullptr;
void (*qt_memfill64)(quint64 *dest, quint64 value, qsizetype count) = nullptr;
#elif !defined(__ARM_NEON__) && !defined(__MIPS_DSP__)
@ -6712,6 +6714,68 @@ static void qInitDrawhelperFunctions()
#endif // SSE2
#if defined(QT_COMPILER_SUPPORTS_LSX)
if (qCpuHasFeature(LSX)) {
qt_memfill32 = qt_memfill32_lsx;
qt_memfill64 = qt_memfill64_lsx;
qDrawHelper[QImage::Format_RGB32].bitmapBlit = qt_bitmapblit32_lsx;
qDrawHelper[QImage::Format_ARGB32].bitmapBlit = qt_bitmapblit32_lsx;
qDrawHelper[QImage::Format_ARGB32_Premultiplied].bitmapBlit = qt_bitmapblit32_lsx;
qDrawHelper[QImage::Format_RGB16].bitmapBlit = qt_bitmapblit16_lsx;
qDrawHelper[QImage::Format_RGBX8888].bitmapBlit = qt_bitmapblit8888_lsx;
qDrawHelper[QImage::Format_RGBA8888].bitmapBlit = qt_bitmapblit8888_lsx;
qDrawHelper[QImage::Format_RGBA8888_Premultiplied].bitmapBlit = qt_bitmapblit8888_lsx;
extern void qt_scale_image_argb32_on_argb32_lsx(uchar *destPixels, int dbpl,
const uchar *srcPixels, int sbpl, int srch,
const QRectF &targetRect,
const QRectF &sourceRect,
const QRect &clip,
int const_alpha);
qScaleFunctions[QImage::Format_ARGB32_Premultiplied][QImage::Format_ARGB32_Premultiplied] = qt_scale_image_argb32_on_argb32_lsx;
qScaleFunctions[QImage::Format_RGB32][QImage::Format_ARGB32_Premultiplied] = qt_scale_image_argb32_on_argb32_lsx;
qScaleFunctions[QImage::Format_RGBA8888_Premultiplied][QImage::Format_RGBA8888_Premultiplied] = qt_scale_image_argb32_on_argb32_lsx;
qScaleFunctions[QImage::Format_RGBX8888][QImage::Format_RGBA8888_Premultiplied] = qt_scale_image_argb32_on_argb32_lsx;
extern void qt_blend_rgb32_on_rgb32_lsx(uchar *destPixels, int dbpl,
const uchar *srcPixels, int sbpl,
int w, int h,
int const_alpha);
extern void qt_blend_argb32_on_argb32_lsx(uchar *destPixels, int dbpl,
const uchar *srcPixels, int sbpl,
int w, int h,
int const_alpha);
qBlendFunctions[QImage::Format_RGB32][QImage::Format_RGB32] = qt_blend_rgb32_on_rgb32_lsx;
qBlendFunctions[QImage::Format_ARGB32_Premultiplied][QImage::Format_RGB32] = qt_blend_rgb32_on_rgb32_lsx;
qBlendFunctions[QImage::Format_RGB32][QImage::Format_ARGB32_Premultiplied] = qt_blend_argb32_on_argb32_lsx;
qBlendFunctions[QImage::Format_ARGB32_Premultiplied][QImage::Format_ARGB32_Premultiplied] = qt_blend_argb32_on_argb32_lsx;
qBlendFunctions[QImage::Format_RGBX8888][QImage::Format_RGBX8888] = qt_blend_rgb32_on_rgb32_lsx;
qBlendFunctions[QImage::Format_RGBA8888_Premultiplied][QImage::Format_RGBX8888] = qt_blend_rgb32_on_rgb32_lsx;
qBlendFunctions[QImage::Format_RGBX8888][QImage::Format_RGBA8888_Premultiplied] = qt_blend_argb32_on_argb32_lsx;
qBlendFunctions[QImage::Format_RGBA8888_Premultiplied][QImage::Format_RGBA8888_Premultiplied] = qt_blend_argb32_on_argb32_lsx;
extern const uint * QT_FASTCALL qt_fetch_radial_gradient_lsx(uint *buffer, const Operator *op, const QSpanData *data,
int y, int x, int length);
qt_fetch_radial_gradient = qt_fetch_radial_gradient_lsx;
extern void QT_FASTCALL comp_func_SourceOver_lsx(uint *destPixels, const uint *srcPixels, int length, uint const_alpha);
extern void QT_FASTCALL comp_func_solid_SourceOver_lsx(uint *destPixels, int length, uint color, uint const_alpha);
extern void QT_FASTCALL comp_func_Source_lsx(uint *destPixels, const uint *srcPixels, int length, uint const_alpha);
extern void QT_FASTCALL comp_func_solid_Source_lsx(uint *destPixels, int length, uint color, uint const_alpha);
extern void QT_FASTCALL comp_func_Plus_lsx(uint *destPixels, const uint *srcPixels, int length, uint const_alpha);
qt_functionForMode_C[QPainter::CompositionMode_SourceOver] = comp_func_SourceOver_lsx;
qt_functionForModeSolid_C[QPainter::CompositionMode_SourceOver] = comp_func_solid_SourceOver_lsx;
qt_functionForMode_C[QPainter::CompositionMode_Source] = comp_func_Source_lsx;
qt_functionForModeSolid_C[QPainter::CompositionMode_Source] = comp_func_solid_Source_lsx;
qt_functionForMode_C[QPainter::CompositionMode_Plus] = comp_func_Plus_lsx;
}
#endif //QT_COMPILER_SUPPORTS_LSX
#if defined(__ARM_NEON__)
qBlendFunctions[QImage::Format_RGB32][QImage::Format_RGB32] = qt_blend_rgb32_on_rgb32_neon;
qBlendFunctions[QImage::Format_ARGB32_Premultiplied][QImage::Format_RGB32] = qt_blend_rgb32_on_rgb32_neon;

View File

@ -0,0 +1,48 @@
// Copyright (C) 2024 Loongson Technology Corporation Limited.
// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only

#ifndef QDRAWHELPER_LOONGARCH64_P_H
#define QDRAWHELPER_LOONGARCH64_P_H

//
//  W A R N I N G
//  -------------
//
// This file is not part of the Qt API.  It exists purely as an
// implementation detail.  This header file may change from version to
// version without notice, or even be removed.
//
// We mean it.
//

#include <QtGui/private/qtguiglobal_p.h>
#include <private/qdrawhelper_p.h>

QT_BEGIN_NAMESPACE

#ifdef QT_COMPILER_SUPPORTS_LSX

// Vectorized memory fills (implemented in qdrawhelper_lsx.cpp).
void qt_memfill64_lsx(quint64 *dest, quint64 value, qsizetype count);
void qt_memfill32_lsx(quint32 *dest, quint32 value, qsizetype count);

// 1-bit-mask blits of a solid color into 32-bit (ARGB32 / RGBA8888) and
// 16-bit (RGB565) raster buffers; 'stride' is the source bitmap's byte stride.
void qt_bitmapblit32_lsx(QRasterBuffer *rasterBuffer, int x, int y,
                         const QRgba64 &color,
                         const uchar *src, int width, int height, int stride);
void qt_bitmapblit8888_lsx(QRasterBuffer *rasterBuffer, int x, int y,
                           const QRgba64 &color,
                           const uchar *src, int width, int height, int stride);
void qt_bitmapblit16_lsx(QRasterBuffer *rasterBuffer, int x, int y,
                         const QRgba64 &color,
                         const uchar *src, int width, int height, int stride);

// Rectangular blends; const_alpha is 0..256 where 256 means fully opaque.
void qt_blend_argb32_on_argb32_lsx(uchar *destPixels, int dbpl,
                                   const uchar *srcPixels, int sbpl,
                                   int w, int h,
                                   int const_alpha);

void qt_blend_rgb32_on_rgb32_lsx(uchar *destPixels, int dbpl,
                                 const uchar *srcPixels, int sbpl,
                                 int w, int h,
                                 int const_alpha);

#endif // QT_COMPILER_SUPPORTS_LSX

QT_END_NAMESPACE

#endif // QDRAWHELPER_LOONGARCH64_P_H

View File

@ -0,0 +1,593 @@
// Copyright (C) 2024 Loongson Technology Corporation Limited.
// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
#include <private/qdrawhelper_loongarch64_p.h>
#ifdef QT_COMPILER_SUPPORTS_LSX
#include <private/qdrawingprimitive_lsx_p.h>
#include <private/qpaintengine_raster_p.h>
QT_BEGIN_NAMESPACE
// Source-over blend of a premultiplied ARGB32 rectangle onto a premultiplied
// ARGB32 destination. const_alpha is 0..256; 256 means fully opaque.
void qt_blend_argb32_on_argb32_lsx(uchar *destPixels, int dbpl,
                                   const uchar *srcPixels, int sbpl,
                                   int w, int h,
                                   int const_alpha)
{
    if (const_alpha == 0)
        return; // fully transparent overlay: destination is untouched

    const quint32 *srcRow = reinterpret_cast<const quint32 *>(srcPixels);
    quint32 *dstRow = reinterpret_cast<quint32 *>(destPixels);

    if (const_alpha == 256) {
        // Plain source-over, one scanline at a time.
        for (int row = 0; row < h; ++row) {
            BLEND_SOURCE_OVER_ARGB32_LSX(dstRow, srcRow, w);
            dstRow = reinterpret_cast<quint32 *>(reinterpret_cast<uchar *>(dstRow) + dbpl);
            srcRow = reinterpret_cast<const quint32 *>(reinterpret_cast<const uchar *>(srcRow) + sbpl);
        }
    } else {
        // dest = (s + d * sia) * ca + d * cia
        //      = s * ca + d * (sia * ca + cia)
        //      = s * ca + d * (1 - sa*ca)
        const int scaledAlpha = (const_alpha * 255) >> 8; // map 0..256 onto 0..255
        for (int row = 0; row < h; ++row) {
            BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_LSX(dstRow, srcRow, w, scaledAlpha);
            dstRow = reinterpret_cast<quint32 *>(reinterpret_cast<uchar *>(dstRow) + dbpl);
            srcRow = reinterpret_cast<const quint32 *>(reinterpret_cast<const uchar *>(srcRow) + sbpl);
        }
    }
}
// qblendfunctions.cpp
void qt_blend_rgb32_on_rgb32(uchar *destPixels, int dbpl,
                             const uchar *srcPixels, int sbpl,
                             int w, int h,
                             int const_alpha);

// Blend an opaque RGB32 rectangle onto an RGB32 destination with a constant
// opacity. const_alpha is 0..256; the fully-opaque case (256) has no LSX
// advantage here and is delegated to the generic scalar implementation.
void qt_blend_rgb32_on_rgb32_lsx(uchar *destPixels, int dbpl,
                                 const uchar *srcPixels, int sbpl,
                                 int w, int h,
                                 int const_alpha)
{
    const quint32 *src = (const quint32 *) srcPixels;
    quint32 *dst = (quint32 *) destPixels;
    if (const_alpha != 256) {
        if (const_alpha != 0) {
            // Constants for INTERPOLATE_PIXEL_255_LSX: rounding bias and the
            // 0x00ff00ff mask used to split each pixel into 16-bit channels.
            const __m128i half = __lsx_vreplgr2vr_h(0x80);
            const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff);

            // Map const_alpha from 0..256 to 0..255 so it pairs with its
            // 255-based complement in the interpolation below.
            const_alpha = (const_alpha * 255) >> 8;
            int one_minus_const_alpha = 255 - const_alpha;
            const __m128i constAlphaVector = __lsx_vreplgr2vr_h(const_alpha);
            const __m128i oneMinusConstAlpha =  __lsx_vreplgr2vr_h(one_minus_const_alpha);
            for (int y = 0; y < h; ++y) {
                int x = 0;

                // First, align dest to 16 bytes:
                ALIGNMENT_PROLOGUE_16BYTES(dst, x, w) {
                    dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha,
                                                   dst[x], one_minus_const_alpha);
                }

                // 4 pixels per iteration: dst = src*ca + dst*(255-ca).
                for (; x < w-3; x += 4) {
                    __m128i srcVector = __lsx_vld(&src[x], 0);
                    __m128i dstVector = __lsx_vld(&dst[x], 0);
                    INTERPOLATE_PIXEL_255_LSX(srcVector, dstVector, constAlphaVector,
                                              oneMinusConstAlpha, colorMask, half);
                    __lsx_vst(dstVector, &dst[x], 0);
                }

                // Scalar tail for the last (w % 4) pixels.
                SIMD_EPILOGUE(x, w, 3)
                    dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha,
                                                   dst[x], one_minus_const_alpha);
                dst = (quint32 *)(((uchar *) dst) + dbpl);
                src = (const quint32 *)(((const uchar *) src) + sbpl);
            }
        }
    } else {
        // Fully opaque: a straight row copy, handled by the generic version.
        qt_blend_rgb32_on_rgb32(destPixels, dbpl, srcPixels, sbpl, w, h, const_alpha);
    }
}
// Span-level Porter-Duff SourceOver for premultiplied ARGB32.
// const_alpha is 0..255 here (span composition convention), not 0..256.
void QT_FASTCALL comp_func_SourceOver_lsx(uint *destPixels, const uint *srcPixels,
                                          int length, uint const_alpha)
{
    Q_ASSERT(const_alpha < 256);

    const quint32 *in = reinterpret_cast<const quint32 *>(srcPixels);
    quint32 *out = reinterpret_cast<quint32 *>(destPixels);

    if (const_alpha != 255) {
        // Source additionally scaled by the span's constant opacity.
        BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_LSX(out, in, length, const_alpha);
    } else {
        // Plain source-over.
        BLEND_SOURCE_OVER_ARGB32_LSX(out, in, length);
    }
}
// Porter-Duff Plus (additive) composition: per-channel saturating add of
// src onto dst, optionally interpolated with dst by const_alpha (0..255).
void QT_FASTCALL comp_func_Plus_lsx(uint *dst, const uint *src, int length, uint const_alpha)
{
    int x = 0;

    if (const_alpha == 255) {
        // 1) Prologue: align destination on 16 bytes
        ALIGNMENT_PROLOGUE_16BYTES(dst, x, length)
            dst[x] = comp_func_Plus_one_pixel(dst[x], src[x]);

        // 2) composition with LSX (vsadd.bu = unsigned saturating byte add)
        for (; x < length - 3; x += 4) {
            const __m128i srcVector = __lsx_vld(&src[x], 0);
            const __m128i dstVector = __lsx_vld(&dst[x], 0);

            const __m128i result = __lsx_vsadd_bu(srcVector, dstVector);
            __lsx_vst(result, &dst[x], 0);
        }

        // 3) Epilogue: scalar tail for the last (length % 4) pixels
        SIMD_EPILOGUE(x, length, 3)
            dst[x] = comp_func_Plus_one_pixel(dst[x], src[x]);
    } else {
        // result = interpolate(saturate(src + dst), dst, const_alpha)
        const int one_minus_const_alpha = 255 - const_alpha;
        const __m128i constAlphaVector = __lsx_vreplgr2vr_h(const_alpha);
        const __m128i oneMinusConstAlpha =  __lsx_vreplgr2vr_h(one_minus_const_alpha);

        // 1) Prologue: align destination on 16 bytes
        ALIGNMENT_PROLOGUE_16BYTES(dst, x, length)
            dst[x] = comp_func_Plus_one_pixel_const_alpha(dst[x], src[x],
                                                          const_alpha,
                                                          one_minus_const_alpha);

        // Rounding bias and channel-split mask for INTERPOLATE_PIXEL_255_LSX.
        const __m128i half = __lsx_vreplgr2vr_h(0x80);
        const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff);
        // 2) composition with LSX
        for (; x < length - 3; x += 4) {
            const __m128i srcVector = __lsx_vld(&src[x], 0);
            __m128i dstVector = __lsx_vld(&dst[x], 0);

            __m128i result = __lsx_vsadd_bu(srcVector, dstVector);
            INTERPOLATE_PIXEL_255_LSX(result, dstVector, constAlphaVector,
                                      oneMinusConstAlpha, colorMask, half);
            __lsx_vst(dstVector, &dst[x], 0);
        }

        // 3) Epilogue: scalar tail
        SIMD_EPILOGUE(x, length, 3)
            dst[x] = comp_func_Plus_one_pixel_const_alpha(dst[x], src[x],
                                                          const_alpha, one_minus_const_alpha);
    }
}
// Porter-Duff Source composition: replace dst by src, or — with a constant
// opacity below 255 — blend dst = src*ca + dst*(255-ca).
void QT_FASTCALL comp_func_Source_lsx(uint *dst, const uint *src, int length, uint const_alpha)
{
    if (const_alpha == 255) {
        // Full replacement is a plain memcpy.
        ::memcpy(dst, src, length * sizeof(uint));
    } else {
        const int ialpha = 255 - const_alpha;

        int x = 0;

        // 1) prologue, align on 16 bytes
        ALIGNMENT_PROLOGUE_16BYTES(dst, x, length)
            dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], ialpha);

        // 2) interpolate pixels with LSX (4 pixels per iteration)
        const __m128i half = __lsx_vreplgr2vr_h(0x80);
        const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff);
        const __m128i constAlphaVector = __lsx_vreplgr2vr_h(const_alpha);
        const __m128i oneMinusConstAlpha =  __lsx_vreplgr2vr_h(ialpha);
        for (; x < length - 3; x += 4) {
            const __m128i srcVector = __lsx_vld(&src[x], 0);
            __m128i dstVector = __lsx_vld(&dst[x], 0);
            INTERPOLATE_PIXEL_255_LSX(srcVector, dstVector, constAlphaVector,
                                      oneMinusConstAlpha, colorMask, half);
            __lsx_vst(dstVector, &dst[x], 0);
        }

        // 3) Epilogue: scalar tail
        SIMD_EPILOGUE(x, length, 3)
            dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], ialpha);
    }
}
// Fill [dest, dest + bytecount) with the 16-byte pattern value128.
// dest must be 16-byte aligned; bytecount is expected to be a multiple of 16
// (the memfill wrappers below guarantee both).
static Q_NEVER_INLINE
void Q_DECL_VECTORCALL qt_memfillXX_aligned(void *dest, __m128i value128, quintptr bytecount)
{
    __m128i *cursor = reinterpret_cast<__m128i *>(dest);
    __m128i *end128 = reinterpret_cast<__m128i *>(static_cast<uchar *>(dest) + bytecount);

    // Main loop: four 16-byte stores per iteration.
    for (; cursor + 4 <= end128; cursor += 4) {
        __lsx_vst(value128, cursor + 0, 0);
        __lsx_vst(value128, cursor + 1, 0);
        __lsx_vst(value128, cursor + 2, 0);
        __lsx_vst(value128, cursor + 3, 0);
    }

    // Tail: up to three remaining full vectors.
    const quintptr tailBytes = bytecount % (4 * sizeof(__m128i));
    for (quintptr n = tailBytes / sizeof(__m128i); n != 0; --n)
        __lsx_vst(value128, cursor++, 0);
}
// Fill 'count' quint64 slots with 'value' using 16-byte vector stores.
void qt_memfill64_lsx(quint64 *dest, quint64 value, qsizetype count)
{
    // A quint64 pointer can only be misaligned by 8 bytes relative to a
    // 16-byte boundary; emit one scalar store to reach alignment.
    if (count && quintptr(dest) % sizeof(__m128i)) {
        *dest++ = value;
        --count;
    }

    // If an odd number of elements remains, write the last one now so the
    // vector fill covers an exact multiple of 16 bytes.
    if (count % 2) {
        dest[count - 1] = value;
        --count;
    }

    qt_memfillXX_aligned(dest, __lsx_vreplgr2vr_d(value), count * sizeof(quint64));
}
// Fill 'count' quint32 slots with 'value' using 16-byte vector stores.
void qt_memfill32_lsx(quint32 *dest, quint32 value, qsizetype count)
{
    if (count < 4) {
        // this simplifies the code below: the first switch can fall through
        // without checking the value of count
        switch (count) {
        case 3: *dest++ = value; Q_FALLTHROUGH();
        case 2: *dest++ = value; Q_FALLTHROUGH();
        case 1: *dest = value;
        }
        return;
    }

    // Head: a quint32 pointer is misaligned by 0, 4, 8 or 12 bytes. The
    // deliberate fallthrough writes exactly enough scalar elements (3, 2 or 1)
    // to bring dest to a 16-byte boundary.
    const int align = (quintptr)(dest) & 0xf;
    switch (align) {
    case 4:  *dest++ = value; --count; Q_FALLTHROUGH();
    case 8:  *dest++ = value; --count; Q_FALLTHROUGH();
    case 12: *dest++ = value; --count;
    }

    // Tail: write the last (count % 4) elements up front; the aligned vector
    // fill below only covers whole 16-byte blocks (it truncates sub-vector
    // remainders), so these scalar stores complete the range.
    const int rest = count & 0x3;
    if (rest) {
        switch (rest) {
        case 3: dest[count - 3] = value; Q_FALLTHROUGH();
        case 2: dest[count - 2] = value; Q_FALLTHROUGH();
        case 1: dest[count - 1] = value;
        }
    }

    qt_memfillXX_aligned(dest, __lsx_vreplgr2vr_w(value), count * sizeof(quint32));
}
// Solid-color Source composition: dst = color (const_alpha == 255) or
// dst = color*ca + dst*(255-ca) for a constant opacity ca in 0..255.
void QT_FASTCALL comp_func_solid_Source_lsx(uint *destPixels, int length,
                                            uint color, uint const_alpha)
{
    if (const_alpha == 255) {
        // Fully opaque: plain fill.
        qt_memfill32(destPixels, color, length);
    } else {
        const quint32 ialpha = 255 - const_alpha;
        // Pre-scale the color once; per pixel only dst needs scaling.
        color = BYTE_MUL(color, const_alpha);

        int x = 0;
        quint32 *dst = (quint32 *) destPixels;
        const __m128i colorVector = __lsx_vreplgr2vr_w(color);
        const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff);
        const __m128i half = __lsx_vreplgr2vr_h(0x80);
        const __m128i iAlphaVector = __lsx_vreplgr2vr_h(ialpha);

        // Scalar head until dst is 16-byte aligned.
        ALIGNMENT_PROLOGUE_16BYTES(dst, x, length)
            destPixels[x] = color + BYTE_MUL(destPixels[x], ialpha);

        // 4 pixels per iteration: dst = color + dst * ialpha / 255.
        for (; x < length-3; x += 4) {
            __m128i dstVector = __lsx_vld(&dst[x], 0);
            BYTE_MUL_LSX(dstVector, iAlphaVector, colorMask, half);
            dstVector = __lsx_vadd_b(colorVector, dstVector);
            __lsx_vst(dstVector, &dst[x], 0);
        }

        // Scalar tail for the last (length % 4) pixels.
        SIMD_EPILOGUE(x, length, 3)
            destPixels[x] = color + BYTE_MUL(destPixels[x], ialpha);
    }
}
// Solid-color SourceOver: blend a (premultiplied) color over every pixel,
// dst = color + dst * (255 - alpha(color)), with const_alpha in 0..255.
void QT_FASTCALL comp_func_solid_SourceOver_lsx(uint *destPixels, int length,
                                                uint color, uint const_alpha)
{
    // Fast path: an opaque color at full span opacity is a plain fill.
    if ((const_alpha & qAlpha(color)) == 255) {
        qt_memfill32(destPixels, color, length);
    } else {
        if (const_alpha != 255)
            color = BYTE_MUL(color, const_alpha);

        // qAlpha(~color) == 255 - alpha(color): the destination's weight.
        const quint32 minusAlphaOfColor = qAlpha(~color);
        int x = 0;

        quint32 *dst = (quint32 *) destPixels;
        const __m128i colorVector = __lsx_vreplgr2vr_w(color);
        const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff);
        const __m128i half = __lsx_vreplgr2vr_h(0x80);
        const __m128i minusAlphaOfColorVector = __lsx_vreplgr2vr_h(minusAlphaOfColor);

        // Scalar head until dst is 16-byte aligned.
        ALIGNMENT_PROLOGUE_16BYTES(dst, x, length)
            destPixels[x] = color + BYTE_MUL(destPixels[x], minusAlphaOfColor);

        // 4 pixels per iteration.
        for (; x < length-3; x += 4) {
            __m128i dstVector = __lsx_vld(&dst[x], 0);
            BYTE_MUL_LSX(dstVector, minusAlphaOfColorVector, colorMask, half);
            dstVector = __lsx_vadd_b(colorVector, dstVector);
            __lsx_vst(dstVector, &dst[x], 0);
        }

        // Scalar tail for the last (length % 4) pixels.
        SIMD_EPILOGUE(x, length, 3)
            destPixels[x] = color + BYTE_MUL(destPixels[x], minusAlphaOfColor);
    }
}
// Blit a solid 32-bit color into rasterBuffer wherever the 1-bit source
// bitmap has a set bit. Each source byte covers 8 pixels, MSB first.
//
// Mask technique: the source byte is broadcast to all 16 lanes; maskmaskN
// isolates one bitmap bit per destination byte, and maskaddN shifts that bit
// into the byte's sign position (e.g. bit 0x40 + 0x40 = 0x80). vslti_b(.,0)
// then turns each lane into 0xff/0x00, ready for vbitsel between dest and color.
void qt_bitmapblit32_lsx_base(QRasterBuffer *rasterBuffer, int x, int y,
                              quint32 color,
                              const uchar *src, int width, int height, int stride)
{
    quint32 *dest = reinterpret_cast<quint32*>(rasterBuffer->scanLine(y)) + x;
    const int destStride = rasterBuffer->stride<quint32>();

    const __m128i c128 = __lsx_vreplgr2vr_w(color);
    // Bits 7..4 of the source byte, one per 4-byte pixel lane.
    const __m128i maskmask1 = (__m128i)(v4u32){0x80808080, 0x40404040,
                                               0x20202020, 0x10101010};
    const __m128i maskadd1 = (__m128i)(v4i32){0x00000000, 0x40404040,
                                              0x60606060, 0x70707070};
    if (width > 4) {
        // Bits 3..0 of the source byte for the second group of 4 pixels.
        const __m128i maskmask2 = (__m128i)(v4i32){0x08080808, 0x04040404,
                                                   0x02020202, 0x01010101};
        const __m128i maskadd2 = (__m128i)(v4i32){0x78787878, 0x7c7c7c7c,
                                                  0x7e7e7e7e, 0x7f7f7f7f};
        while (height--) {
            for (int x = 0; x < width; x += 8) {
                const quint8 s = src[x >> 3];
                if (!s)
                    continue; // all 8 pixels transparent: skip the loads/stores
                __m128i mask1 = __lsx_vreplgr2vr_b(s);
                __m128i mask2 = mask1;

                // First 4 pixels (bits 7..4).
                mask1 = __lsx_vand_v(mask1, maskmask1);
                mask1 = __lsx_vadd_b(mask1, maskadd1);
                __m128i destSrc1 = __lsx_vld((char*)(dest + x), 0);
                mask1 = __lsx_vslti_b(mask1,0);
                destSrc1 = __lsx_vbitsel_v(destSrc1, c128, mask1);
                __lsx_vst(destSrc1, (char*)(dest + x), 0);

                // Next 4 pixels (bits 3..0).
                __m128i destSrc2 = __lsx_vld((char*)(dest + x + 4), 0);
                mask2 = __lsx_vand_v(mask2, maskmask2);
                mask2 = __lsx_vadd_b(mask2, maskadd2);
                mask2 = __lsx_vslti_b(mask2,0);
                destSrc2 = __lsx_vbitsel_v(destSrc2, c128, mask2);
                __lsx_vst(destSrc2, (char*)(dest + x + 4), 0);
            }
            dest += destStride;
            src += stride;
        }
    } else {
        // Narrow case (width <= 4): only the high-bit group is needed.
        while (height--) {
            const quint8 s = *src;
            if (s) {
                __m128i mask1 = __lsx_vreplgr2vr_b(s);
                __m128i destSrc1 = __lsx_vld((char*)(dest), 0);
                mask1 = __lsx_vand_v(mask1, maskmask1);
                mask1 = __lsx_vadd_b(mask1, maskadd1);
                mask1 = __lsx_vslti_b(mask1, 0);
                destSrc1 = __lsx_vbitsel_v(destSrc1, c128, mask1);
                __lsx_vst(destSrc1, (char*)(dest), 0);
            }
            dest += destStride;
            src += stride;
        }
    }
}
// ARGB32 entry point: the destination stores pixels in ARGB32 layout, so the
// color's ARGB32 representation can be blitted directly.
void qt_bitmapblit32_lsx(QRasterBuffer *rasterBuffer, int x, int y,
                         const QRgba64 &color,
                         const uchar *src, int width, int height, int stride)
{
    const quint32 argb = color.toArgb32();
    qt_bitmapblit32_lsx_base(rasterBuffer, x, y, argb, src, width, height, stride);
}
// RGBA8888 entry point: same blit, but the color is byte-swizzled from ARGB
// to RGBA ordering first so it matches the destination's pixel layout.
void qt_bitmapblit8888_lsx(QRasterBuffer *rasterBuffer, int x, int y,
                           const QRgba64 &color,
                           const uchar *src, int width, int height, int stride)
{
    const quint32 rgba = ARGB2RGBA(color.toArgb32());
    qt_bitmapblit32_lsx_base(rasterBuffer, x, y, rgba, src, width, height, stride);
}
// Blit a solid color (converted to RGB565) into an RGB16 raster buffer
// wherever the 1-bit source bitmap has a set bit; one vector covers 8 pixels.
// Same mask trick as qt_bitmapblit32_lsx_base, with 16-bit lanes.
void qt_bitmapblit16_lsx(QRasterBuffer *rasterBuffer, int x, int y,
                         const QRgba64 &color,
                         const uchar *src, int width, int height, int stride)
{
    const quint16 c = qConvertRgb32To16(color.toArgb32());
    quint16 *dest = reinterpret_cast<quint16*>(rasterBuffer->scanLine(y)) + x;
    // FIX: dest is a quint16 pointer, so the per-row stride must be measured
    // in quint16 units. The previous stride<quint32>() (bytes_per_line / 4)
    // advanced only half a scanline per row, corrupting the blit on RGB16.
    const int destStride = rasterBuffer->stride<quint16>();

    const __m128i c128 = __lsx_vreplgr2vr_h(c);
    // One bitmap bit per 16-bit lane, MSB first; the add moves the tested
    // bit into the lane byte's sign position for vslti_b.
    const __m128i maskmask = (__m128i)(v8u16){0x8080, 0x4040, 0x2020, 0x1010,
                                              0x0808, 0x0404, 0x0202, 0x0101};
    const __m128i maskadd = (__m128i)(v8i16){0x0000, 0x4040, 0x6060, 0x7070,
                                             0x7878, 0x7c7c, 0x7e7e, 0x7f7f};
    while (--height >= 0) {
        for (int x = 0; x < width; x += 8) {
            const quint8 s = src[x >> 3];
            if (!s)
                continue; // all 8 pixels transparent
            __m128i mask = __lsx_vreplgr2vr_b(s);
            __m128i destSrc = __lsx_vld((char*)(dest + x), 0);
            mask = __lsx_vand_v(mask, maskmask);
            mask = __lsx_vadd_b(mask, maskadd);
            mask = __lsx_vslti_b(mask, 0);
            destSrc = __lsx_vbitsel_v(destSrc, c128, mask);
            __lsx_vst(destSrc, (char*)(dest + x), 0);
        }
        dest += destStride;
        src += stride;
    }
}
// SIMD backend adapter used by QRadialFetchSimd: maps the generic 4-lane
// integer/float vector operations onto LSX intrinsics.
class QSimdLsx
{
public:
    typedef __m128i Int32x4;
    typedef __m128 Float32x4;

    // Scratch unions for lane-wise access to vector registers.
    union Vect_buffer_i { Int32x4 v; int i[4]; };
    union Vect_buffer_f { Float32x4 v; float f[4]; };

    static inline Float32x4 Q_DECL_VECTORCALL v_dup(float x) { return __lsx_vreplfr2vr_s(x); }
    // NOTE(review): the double overload narrows to float before broadcasting.
    static inline Float32x4 Q_DECL_VECTORCALL v_dup(double x) { return __lsx_vreplfr2vr_s(x); }
    static inline Int32x4 Q_DECL_VECTORCALL v_dup(int x) { return __lsx_vreplgr2vr_w(x); }
    static inline Int32x4 Q_DECL_VECTORCALL v_dup(uint x) { return __lsx_vreplgr2vr_w(x); }

    static inline Float32x4 Q_DECL_VECTORCALL v_add(Float32x4 a, Float32x4 b) { return __lsx_vfadd_s(a, b); }
    static inline Int32x4 Q_DECL_VECTORCALL v_add(Int32x4 a, Int32x4 b) { return __lsx_vadd_w(a, b); }

    static inline Float32x4 Q_DECL_VECTORCALL v_max(Float32x4 a, Float32x4 b) { return __lsx_vfmax_s(a, b); }
    static inline Float32x4 Q_DECL_VECTORCALL v_min(Float32x4 a, Float32x4 b) { return __lsx_vfmin_s(a, b); }
    static inline Int32x4 Q_DECL_VECTORCALL v_min_16(Int32x4 a, Int32x4 b) { return __lsx_vmin_h(a, b); }

    static inline Int32x4 Q_DECL_VECTORCALL v_and(Int32x4 a, Int32x4 b) { return __lsx_vand_v(a, b); }

    static inline Float32x4 Q_DECL_VECTORCALL v_sub(Float32x4 a, Float32x4 b) { return __lsx_vfsub_s(a, b); }
    static inline Int32x4 Q_DECL_VECTORCALL v_sub(Int32x4 a, Int32x4 b) { return __lsx_vsub_w(a, b); }

    static inline Float32x4 Q_DECL_VECTORCALL v_mul(Float32x4 a, Float32x4 b) { return __lsx_vfmul_s(a, b); }

    static inline Float32x4 Q_DECL_VECTORCALL v_sqrt(Float32x4 x) { return __lsx_vfsqrt_s(x); }

    // Truncating float -> int conversion (round toward zero).
    static inline Int32x4 Q_DECL_VECTORCALL v_toInt(Float32x4 x) { return __lsx_vftintrz_w_s(x); }

    // NOTE(review): implemented as a strict greater-than (clt_s(b, a) tests
    // b < a); presumably mirrors the SSE2 backend's cmpgt — confirm against
    // QSimdSse2::v_greaterOrEqual before relying on equality behavior.
    static inline Int32x4 Q_DECL_VECTORCALL v_greaterOrEqual(Float32x4 a, Float32x4 b) { return __lsx_vfcmp_clt_s(b, a); }
};
// Thin adapter: instantiate the generic radial-gradient fetch template with
// the LSX SIMD backend and forward all span parameters unchanged.
const uint * QT_FASTCALL qt_fetch_radial_gradient_lsx(uint *buffer, const Operator *op,
                                                      const QSpanData *data,
                                                      int y, int x, int length)
{
    using LsxFetcher = QRadialFetchSimd<QSimdLsx>;
    return qt_fetch_radial_gradient_template<LsxFetcher, uint>(buffer, op, data, y, x, length);
}
// Nearest-neighbor scale of a premultiplied ARGB32 image onto a premultiplied
// ARGB32 destination, with source-over blending, using 16.16 fixed-point
// stepping. Only the fully-opaque (const_alpha == 256) case is vectorized.
void qt_scale_image_argb32_on_argb32_lsx(uchar *destPixels, int dbpl,
                                         const uchar *srcPixels, int sbpl, int srch,
                                         const QRectF &targetRect,
                                         const QRectF &sourceRect,
                                         const QRect &clip,
                                         int const_alpha)
{
    if (const_alpha != 256) {
        // from qblendfunctions.cpp
        extern void qt_scale_image_argb32_on_argb32(uchar *destPixels, int dbpl,
                                                    const uchar *srcPixels, int sbpl, int srch,
                                                    const QRectF &targetRect,
                                                    const QRectF &sourceRect,
                                                    const QRect &clip,
                                                    int const_alpha);
        return qt_scale_image_argb32_on_argb32(destPixels, dbpl, srcPixels, sbpl, srch,
                                               targetRect, sourceRect, clip, const_alpha);
    }

    // Source step per destination pixel, in 16.16 fixed point. Negative for
    // mirrored (flipped) target rectangles.
    qreal sx = sourceRect.width() / (qreal)targetRect.width();
    qreal sy = sourceRect.height() / (qreal)targetRect.height();

    const int ix = 0x00010000 * sx;
    const int iy = 0x00010000 * sy;

    QRect tr = targetRect.normalized().toRect();
    tr = tr.intersected(clip);
    if (tr.isEmpty())
        return;
    const int tx1 = tr.left();
    const int ty1 = tr.top();
    int h = tr.height();
    int w = tr.width();

    // Starting source coordinates (16.16 fixed point), measured from the
    // appropriate edge depending on the stepping direction.
    quint32 basex;
    quint32 srcy;

    if (sx < 0) {
        int dstx = qFloor((tx1 + qreal(0.5) - targetRect.right()) * sx * 65536) + 1;
        basex = quint32(sourceRect.right() * 65536) + dstx;
    } else {
        int dstx = qCeil((tx1 + qreal(0.5) - targetRect.left()) * sx * 65536) - 1;
        basex = quint32(sourceRect.left() * 65536) + dstx;
    }
    if (sy < 0) {
        int dsty = qFloor((ty1 + qreal(0.5) - targetRect.bottom()) * sy * 65536) + 1;
        srcy = quint32(sourceRect.bottom() * 65536) + dsty;
    } else {
        int dsty = qCeil((ty1 + qreal(0.5) - targetRect.top()) * sy * 65536) - 1;
        srcy = quint32(sourceRect.top() * 65536) + dsty;
    }

    quint32 *dst = ((quint32 *) (destPixels + ty1 * dbpl)) + tx1;

    // Constants consumed by BLEND_SOURCE_OVER_ARGB32_LSX_helper.
    const __m128i nullVector = __lsx_vreplgr2vr_w(0);
    const __m128i half = __lsx_vreplgr2vr_h(0x80);
    const __m128i one = __lsx_vreplgr2vr_h(0xff);
    const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff);
    const __m128i alphaMask = __lsx_vreplgr2vr_w(0xff000000);
    const __m128i ixVector = __lsx_vreplgr2vr_w(4*ix);

    // this bounds check here is required as floating point rounding above might in some cases lead to
    // w/h values that are one pixel too large, falling outside of the valid image area.
    const int ystart = srcy >> 16;
    if (ystart >= srch && iy < 0) {
        srcy += iy;
        --h;
    }
    const int xstart = basex >> 16;
    if (xstart >=  (int)(sbpl/sizeof(quint32)) && ix < 0) {
        basex += ix;
        --w;
    }
    int yend = (srcy + iy * (h - 1)) >> 16;
    if (yend < 0 || yend >= srch)
        --h;
    int xend = (basex + ix * (w - 1)) >> 16;
    if (xend < 0 || xend >= (int)(sbpl/sizeof(quint32)))
        --w;

    while (--h >= 0) {
        const uint *src = (const quint32 *) (srcPixels + (srcy >> 16) * sbpl);
        int srcx = basex;
        int x = 0;

        // Scalar head until dst is 16-byte aligned.
        ALIGNMENT_PROLOGUE_16BYTES(dst, x, w) {
            uint s = src[srcx >> 16];
            dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s));
            srcx += ix;
        }

        // Vector lane 0 holds the x-coordinate of pixel x+3, lane 3 of pixel x
        // (matching the gather order of srcVector below).
        __m128i srcxVector = (__m128i)(v4i32){srcx + ix + ix + ix, srcx + ix + ix, srcx + ix, srcx};

        for (; x < (w - 3); x += 4) {
            // Extract the integer part (high 16 bits) of each fixed-point
            // coordinate: half-word index 2k+1 is the top half of word k.
            // NOTE(review): __lsx_vpickve2gr_h sign-extends its 16-bit result
            // (unlike x86 _mm_extract_epi16, which zero-extends) — presumably
            // fine because source x offsets stay below 32768; confirm for very
            // wide source images, else __lsx_vpickve2gr_hu would be needed.
            const int idx0 = __lsx_vpickve2gr_h(srcxVector, 1);
            const int idx1 = __lsx_vpickve2gr_h(srcxVector, 3);
            const int idx2 = __lsx_vpickve2gr_h(srcxVector, 5);
            const int idx3 = __lsx_vpickve2gr_h(srcxVector, 7);
            srcxVector = __lsx_vadd_w(srcxVector, ixVector);

            // Gather the four source pixels (idx3 is the left-most pixel).
            const __m128i srcVector = (__m128i)((v4u32){src[idx3], src[idx2], src[idx1], src[idx0]});

            BLEND_SOURCE_OVER_ARGB32_LSX_helper(dst, x, srcVector, nullVector, half, one, colorMask, alphaMask);
        }

        // Scalar tail for the last (w % 4) pixels.
        SIMD_EPILOGUE(x, w, 3) {
            uint s = src[(basex + x*ix) >> 16];
            dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s));
        }

        dst = (quint32 *)(((uchar *) dst) + dbpl);
        srcy += iy;
    }
}
QT_END_NAMESPACE
#endif // QT_COMPILER_SUPPORTS_LSX

View File

@ -142,7 +142,7 @@ struct quint24 {
void qBlendGradient(int count, const QT_FT_Span *spans, void *userData);
void qBlendTexture(int count, const QT_FT_Span *spans, void *userData);
#ifdef Q_PROCESSOR_X86
#if defined(Q_PROCESSOR_X86) || defined(QT_COMPILER_SUPPORTS_LSX)
extern void (*qt_memfill64)(quint64 *dest, quint64 value, qsizetype count);
extern void (*qt_memfill32)(quint32 *dest, quint32 value, qsizetype count);
#else

View File

@ -0,0 +1,231 @@
// Copyright (C) 2024 Loongson Technology Corporation Limited.
// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
#ifndef QDRAWINGPRIMITIVE_LSX_P_H
#define QDRAWINGPRIMITIVE_LSX_P_H
#include <QtGui/private/qtguiglobal_p.h>
#include <private/qsimd_p.h>

#include "qdrawhelper_loongarch64_p.h"
#include "qrgba64_p.h"

#include <cstring>
#ifdef __loongarch_sx
//
// W A R N I N G
// -------------
//
// This file is not part of the Qt API. It exists purely as an
// implementation detail. This header file may change from version to
// version without notice, or even be removed.
//
// We mean it.
//
QT_BEGIN_NAMESPACE
/*
 * Multiply every ARGB32 pixel in pixelVector by the per-pixel alpha values
 * held in alphaChannel.  Each 32-bit lane of alphaChannel must be formatted
 * as 0x00AA00AA, colorMask must be 0x00ff00ff per 32-bit lane, and half must
 * hold 128 (0x80) in every 16-bit lane.
 */
inline static void Q_DECL_VECTORCALL
BYTE_MUL_LSX(__m128i &pixelVector, __m128i alphaChannel, __m128i colorMask, __m128i half)
{
    /* Split each pixel into two 16-bit-per-channel vectors so the alpha
       multiply has headroom: rb lanes are 0x00RR00BB, ag lanes 0x00AA00GG. */
    __m128i rb = __lsx_vand_v(pixelVector, colorMask);
    __m128i ag = __lsx_vsrli_h(pixelVector, 8);

    /* Scale both halves by the alpha channel. */
    rb = __lsx_vmul_h(rb, alphaChannel);
    ag = __lsx_vmul_h(ag, alphaChannel);

    /* Approximate the division by 255, as in scalar BYTE_MUL():
       X/255 ~= (X + X/256 + 0x80) / 256. */
    rb = __lsx_vadd_h(rb, __lsx_vsrli_h(rb, 8));
    rb = __lsx_vadd_h(rb, half);
    rb = __lsx_vsrli_h(rb, 8);

    ag = __lsx_vadd_h(ag, __lsx_vsrli_h(ag, 8));
    ag = __lsx_vadd_h(ag, half);
    /* For AG, instead of ">> 8 then << 8" to reposition the bytes, keep the
       high byte of every 16-bit lane in one instruction: ~colorMask & ag. */
    ag = __lsx_vandn_v(colorMask, ag);

    /* Recombine the two channel pairs into the output pixels. */
    pixelVector = __lsx_vor_v(ag, rb);
}
/*
 * dstVector = (srcVector * alphaChannel + dstVector * oneMinusAlphaChannel) / 255
 * Each 32-bit lane of alphaChannel must be 0x00AA00AA and oneMinusAlphaChannel
 * must carry 255 - alpha in the same layout; colorMask must be 0x00ff00ff per
 * 32-bit lane and half must hold 128 (0x80) in every 16-bit lane.
 */
inline static void Q_DECL_VECTORCALL
INTERPOLATE_PIXEL_255_LSX(__m128i srcVector, __m128i &dstVector, __m128i alphaChannel,
                          __m128i oneMinusAlphaChannel, __m128i colorMask, __m128i half)
{
    /* red/blue pair: 0x00RR00BB lanes */
    const __m128i srcRB = __lsx_vand_v(srcVector, colorMask);
    const __m128i dstRB = __lsx_vand_v(dstVector, colorMask);
    __m128i mixedRB = __lsx_vadd_h(__lsx_vmul_h(srcRB, alphaChannel),
                                   __lsx_vmul_h(dstRB, oneMinusAlphaChannel));
    /* divide by 255: X/255 ~= (X + X/256 + 0x80) / 256 */
    mixedRB = __lsx_vadd_h(mixedRB, __lsx_vsrli_h(mixedRB, 8));
    mixedRB = __lsx_vadd_h(mixedRB, half);
    mixedRB = __lsx_vsrli_h(mixedRB, 8);

    /* alpha/green pair: 0x00AA00GG lanes */
    const __m128i srcAG = __lsx_vsrli_h(srcVector, 8);
    const __m128i dstAG = __lsx_vsrli_h(dstVector, 8);
    __m128i mixedAG = __lsx_vadd_h(__lsx_vmul_h(srcAG, alphaChannel),
                                   __lsx_vmul_h(dstAG, oneMinusAlphaChannel));
    mixedAG = __lsx_vadd_h(mixedAG, __lsx_vsrli_h(mixedAG, 8));
    mixedAG = __lsx_vadd_h(mixedAG, half);
    /* keep the high byte of each 16-bit lane instead of ">> 8 then << 8" */
    mixedAG = __lsx_vandn_v(colorMask, mixedAG);

    /* combine the two channel pairs */
    dstVector = __lsx_vor_v(mixedAG, mixedRB);
}
// Blends one vector of four source pixels over dst[x..x+3]; same algorithm
// as BLEND_SOURCE_OVER_ARGB32_LSX but for a single, already-loaded srcVector:
// plain store when all four pixels are opaque, no-op when all four are fully
// transparent, otherwise result = s + d * (1 - alpha).
inline static void Q_DECL_VECTORCALL
BLEND_SOURCE_OVER_ARGB32_LSX_helper(quint32 *dst, int x, __m128i srcVector,
                                    __m128i nullVector, __m128i half, __m128i one,
                                    __m128i colorMask, __m128i alphaMask)
{
    const __m128i srcVectorAlpha = __lsx_vand_v(srcVector, alphaMask);

    const v4i32 opaqueMask = (v4i32)__lsx_vmsknz_b(__lsx_vseq_w(srcVectorAlpha, alphaMask));
    if (opaqueMask[0] == 0x0000ffff) {
        /* all four pixels are fully opaque: plain copy */
        __lsx_vst(srcVector, &dst[x], 0);
        return;
    }

    const v4i32 transparentMask = (v4i32)__lsx_vmsknz_b(__lsx_vseq_w(srcVectorAlpha, nullVector));
    if (transparentMask[0] == 0x0000ffff)
        return; /* all four pixels are fully transparent: dst stays as-is */

    /* Mixed alphas: broadcast 255 - alpha into both 16-bit halves of every
       32-bit lane (0x00AA00AA layout) so it can scale both channel pairs. */
    __m128i oneMinusAlpha = __lsx_vsrli_w(srcVector, 24);
    oneMinusAlpha = __lsx_vor_v(oneMinusAlpha, __lsx_vslli_w(oneMinusAlpha, 16));
    oneMinusAlpha = __lsx_vsub_h(one, oneMinusAlpha);

    __m128i dstVector = __lsx_vld(&dst[x], 0);
    BYTE_MUL_LSX(dstVector, oneMinusAlpha, colorMask, half);
    /* result = s + d * (1 - alpha) */
    __lsx_vst(__lsx_vadd_b(srcVector, dstVector), &dst[x], 0);
}
// Blend `length` source pixels over dst with per-pixel alpha:
//   result = s + d * (1 - alpha)
// taking vector shortcuts for fully-opaque / fully-transparent groups of
// four.  The constant vectors required by the helper are built here once:
//   nullVector = 0, half = 0x80/lane16, one = 0xff/lane16,
//   colorMask = 0x00ff00ff/lane32, alphaMask = 0xff000000/lane32.
inline static void Q_DECL_VECTORCALL
BLEND_SOURCE_OVER_ARGB32_LSX(quint32 *dst, const quint32 *src, int length)
{
    int i = 0;

    /* scalar blending until dst is 16-byte aligned */
    ALIGNMENT_PROLOGUE_16BYTES(dst, i, length) {
        blend_pixel(dst[i], src[i]);
    }

    const __m128i nullVector = __lsx_vreplgr2vr_w(0);
    const __m128i half = __lsx_vreplgr2vr_h(0x80);
    const __m128i one = __lsx_vreplgr2vr_h(0xff);
    const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff);
    const __m128i alphaMask = __lsx_vreplgr2vr_w(0xff000000);

    /* four pixels per iteration */
    for (; i < length - 3; i += 4) {
        const __m128i srcVector = __lsx_vld((const __m128i *)&src[i], 0);
        BLEND_SOURCE_OVER_ARGB32_LSX_helper(dst, i, srcVector, nullVector, half, one,
                                            colorMask, alphaMask);
    }

    /* scalar tail (up to 3 pixels) */
    SIMD_EPILOGUE(i, length, 3) {
        blend_pixel(dst[i], src[i]);
    }
}
// Basically blend src over dst with the const alpha defined as constAlphaVector.
// The computation being done is:
// dest = (s + d * sia) * ca + d * cia
//      = s * ca + d * (sia * ca + cia)
//      = s * ca + d * (1 - sa*ca)
inline static void Q_DECL_VECTORCALL
BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_LSX(quint32 *dst, const quint32 *src, int length, uint const_alpha)
{
    int x = 0;

    // Scalar blending until dst is 16-byte aligned.
    ALIGNMENT_PROLOGUE_16BYTES(dst, x, length) {
        blend_pixel(dst[x], src[x], const_alpha);
    }

    const __m128i nullVector = __lsx_vreplgr2vr_w(0);
    const __m128i half = __lsx_vreplgr2vr_h(0x80);
    const __m128i one = __lsx_vreplgr2vr_h(0xff);
    const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff);
    const __m128i constAlphaVector = __lsx_vreplgr2vr_h(const_alpha);

    for (; x < length-3; x += 4) {
        __m128i srcVector = __lsx_vld((const __m128i *)&src[x], 0);
        // Skip the whole vector when all four source pixels are zero:
        // fully transparent premultiplied pixels leave dst unchanged.
        __m128i vseq = __lsx_vseq_w(srcVector, nullVector);
        v4i32 vseq_res = (v4i32)__lsx_vmsknz_b(vseq);
        if (vseq_res[0] != 0x0000ffff) {
            // src' = src * const_alpha
            BYTE_MUL_LSX(srcVector, constAlphaVector, colorMask, half);

            // Broadcast 255 - alpha(src') into both 16-bit halves of each
            // 32-bit lane (0x00AA00AA layout).
            __m128i alphaChannel = __lsx_vsrli_w(srcVector, 24);
            alphaChannel = __lsx_vor_v(alphaChannel, __lsx_vslli_w(alphaChannel, 16));
            alphaChannel = __lsx_vsub_h(one, alphaChannel);

            // dst = src' + dst * (255 - alpha(src'))
            __m128i dstVector = __lsx_vld((__m128i *)&dst[x], 0);
            BYTE_MUL_LSX(dstVector, alphaChannel, colorMask, half);
            const __m128i result = __lsx_vadd_b(srcVector, dstVector);
            __lsx_vst(result, &dst[x], 0);
        }
    }

    // Scalar tail (up to 3 pixels).
    SIMD_EPILOGUE(x, length, 3) {
        blend_pixel(dst[x], src[x], const_alpha);
    }
}
typedef union
{
    int i;
    float f;
} FloatInt;

/* Broadcast a float into all four 32-bit lanes of an LSX vector.
   The bit pattern is transferred with memcpy: reading a union member other
   than the one last written is undefined behaviour in ISO C++ (it is only a
   documented extension on some compilers), while memcpy of the object
   representation is well-defined and compiles to the same code. */
static __m128 __lsx_vreplfr2vr_s(float val)
{
    int ival;
    memcpy(&ival, &val, sizeof(ival));
    return (__m128)__lsx_vreplgr2vr_w(ival);
}
QT_END_NAMESPACE
#endif // __loongarch_sx
#endif // QDRAWINGPRIMITIVE_LSX_P_H

View File

@ -257,6 +257,18 @@ void qt_qimageScaleAARGBA_down_xy_sse4(QImageScaleInfo *isi, unsigned int *dest,
int dw, int dh, int dow, int sow);
#endif
#if defined(QT_COMPILER_SUPPORTS_LSX)
template<bool RGB>
void qt_qimageScaleAARGBA_up_x_down_y_lsx(QImageScaleInfo *isi, unsigned int *dest,
int dw, int dh, int dow, int sow);
template<bool RGB>
void qt_qimageScaleAARGBA_down_x_up_y_lsx(QImageScaleInfo *isi, unsigned int *dest,
int dw, int dh, int dow, int sow);
template<bool RGB>
void qt_qimageScaleAARGBA_down_xy_lsx(QImageScaleInfo *isi, unsigned int *dest,
int dw, int dh, int dow, int sow);
#endif
#if defined(__ARM_NEON__)
template<bool RGB>
void qt_qimageScaleAARGBA_up_x_down_y_neon(QImageScaleInfo *isi, unsigned int *dest,
@ -351,6 +363,10 @@ static void qt_qimageScaleAARGBA(QImageScaleInfo *isi, unsigned int *dest,
if (qCpuHasFeature(SSE4_1))
qt_qimageScaleAARGBA_up_x_down_y_sse4<false>(isi, dest, dw, dh, dow, sow);
else
#elif defined(QT_COMPILER_SUPPORTS_LSX)
if (qCpuHasFeature(LSX))
qt_qimageScaleAARGBA_up_x_down_y_lsx<false>(isi, dest, dw, dh, dow, sow);
else
#elif defined(__ARM_NEON__)
if (qCpuHasFeature(NEON))
qt_qimageScaleAARGBA_up_x_down_y_neon<false>(isi, dest, dw, dh, dow, sow);
@ -364,6 +380,10 @@ static void qt_qimageScaleAARGBA(QImageScaleInfo *isi, unsigned int *dest,
if (qCpuHasFeature(SSE4_1))
qt_qimageScaleAARGBA_down_x_up_y_sse4<false>(isi, dest, dw, dh, dow, sow);
else
#elif defined(QT_COMPILER_SUPPORTS_LSX)
if (qCpuHasFeature(LSX))
qt_qimageScaleAARGBA_down_x_up_y_lsx<false>(isi, dest, dw, dh, dow, sow);
else
#elif defined(__ARM_NEON__)
if (qCpuHasFeature(NEON))
qt_qimageScaleAARGBA_down_x_up_y_neon<false>(isi, dest, dw, dh, dow, sow);
@ -377,6 +397,10 @@ static void qt_qimageScaleAARGBA(QImageScaleInfo *isi, unsigned int *dest,
if (qCpuHasFeature(SSE4_1))
qt_qimageScaleAARGBA_down_xy_sse4<false>(isi, dest, dw, dh, dow, sow);
else
#elif defined(QT_COMPILER_SUPPORTS_LSX)
if (qCpuHasFeature(LSX))
qt_qimageScaleAARGBA_down_xy_lsx<false>(isi, dest, dw, dh, dow, sow);
else
#elif defined(__ARM_NEON__)
if (qCpuHasFeature(NEON))
qt_qimageScaleAARGBA_down_xy_neon<false>(isi, dest, dw, dh, dow, sow);
@ -995,6 +1019,10 @@ static void qt_qimageScaleAARGB(QImageScaleInfo *isi, unsigned int *dest,
if (qCpuHasFeature(SSE4_1))
qt_qimageScaleAARGBA_up_x_down_y_sse4<true>(isi, dest, dw, dh, dow, sow);
else
#elif defined QT_COMPILER_SUPPORTS_LSX
if (qCpuHasFeature(LSX))
qt_qimageScaleAARGBA_up_x_down_y_lsx<true>(isi, dest, dw, dh, dow, sow);
else
#elif defined(__ARM_NEON__)
if (qCpuHasFeature(NEON))
qt_qimageScaleAARGBA_up_x_down_y_neon<true>(isi, dest, dw, dh, dow, sow);
@ -1008,6 +1036,10 @@ static void qt_qimageScaleAARGB(QImageScaleInfo *isi, unsigned int *dest,
if (qCpuHasFeature(SSE4_1))
qt_qimageScaleAARGBA_down_x_up_y_sse4<true>(isi, dest, dw, dh, dow, sow);
else
#elif defined QT_COMPILER_SUPPORTS_LSX
if (qCpuHasFeature(LSX))
qt_qimageScaleAARGBA_down_x_up_y_lsx<true>(isi, dest, dw, dh, dow, sow);
else
#elif defined(__ARM_NEON__)
if (qCpuHasFeature(NEON))
qt_qimageScaleAARGBA_down_x_up_y_neon<true>(isi, dest, dw, dh, dow, sow);
@ -1021,6 +1053,10 @@ static void qt_qimageScaleAARGB(QImageScaleInfo *isi, unsigned int *dest,
if (qCpuHasFeature(SSE4_1))
qt_qimageScaleAARGBA_down_xy_sse4<true>(isi, dest, dw, dh, dow, sow);
else
#elif defined QT_COMPILER_SUPPORTS_LSX
if (qCpuHasFeature(LSX))
qt_qimageScaleAARGBA_down_xy_lsx<true>(isi, dest, dw, dh, dow, sow);
else
#elif defined(__ARM_NEON__)
if (qCpuHasFeature(NEON))
qt_qimageScaleAARGBA_down_xy_neon<true>(isi, dest, dw, dh, dow, sow);

View File

@ -0,0 +1,233 @@
// Copyright (C) 2024 Loongson Technology Corporation Limited.
// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
#include "qimagescale_p.h"
#include "qimage.h"
#include <private/qdrawhelper_loongarch64_p.h>
#include <private/qsimd_p.h>
#if QT_CONFIG(thread) && !defined(Q_OS_WASM)
#include <qsemaphore.h>
#include <private/qthreadpool_p.h>
#endif
#if defined(QT_COMPILER_SUPPORTS_LSX)
QT_BEGIN_NAMESPACE
using namespace QImageScale;
// Runs scaleSection(yStart, yEnd) over all destination rows [0, dh).
// Large jobs are split across the Qt Gui thread pool and joined with a
// semaphore; small jobs (or re-entrant calls from a pool thread) run inline.
template<typename T>
static inline void multithread_pixels_function(QImageScaleInfo *isi, int dh, const T &scaleSection)
{
#if QT_CONFIG(thread) && !defined(Q_OS_WASM)
    // Roughly one segment per 64k source pixels, but at most one per row.
    int segments = (qsizetype(isi->sh) * isi->sw) / (1<<16);
    segments = std::min(segments, dh);
    QThreadPool *threadPool = QThreadPoolPrivate::qtGuiInstance();
    // Never block waiting for the pool from inside a pool thread (deadlock).
    if (segments > 1 && threadPool && !threadPool->contains(QThread::currentThread())) {
        QSemaphore semaphore;
        int y = 0;
        for (int i = 0; i < segments; ++i) {
            // Spread the remaining rows evenly over the remaining segments.
            int yn = (dh - y) / (segments - i);
            threadPool->start([&, y, yn]() {
                scaleSection(y, y + yn);
                semaphore.release(1);
            });
            y += yn;
        }
        // Wait for every segment before returning (captures reference locals).
        semaphore.acquire(segments);
        return;
    }
#else
    Q_UNUSED(isi);
#endif
    scaleSection(0, dh);
}
// Accumulates one weighted run of source pixels (a column when step == sow,
// a row when step == 1) for the area-averaging scaler.  Each ARGB32 pixel is
// expanded to one 32-bit lane per channel; the first sample is weighted by
// vxyap, the middle samples by vCxy, and the last sample by whatever remains
// of the total weight (1 << 14).  Returns the per-channel sums.
inline static __m128i Q_DECL_VECTORCALL
qt_qimageScaleAARGBA_helper(const unsigned int *pix, int xyap, int Cxy,
                            int step, const __m128i vxyap, const __m128i vCxy)
{
    // Shuffle mask spreading the 4 bytes of one pixel into the low byte of
    // each 32-bit lane; index 16 selects a zero byte from the first operand.
    const __m128i shuffleMask = (__m128i)(v16i8){0, 16, 16, 16, 1, 16, 16, 16,
                                                 2, 16, 16, 16, 3, 16, 16, 16};
    __m128i vpix = __lsx_vshuf_b(__lsx_vldi(0), __lsx_vreplgr2vr_w(*pix), shuffleMask);
    __m128i vx = __lsx_vmul_w(vpix, vxyap);
    int i;
    // Consume the total weight budget of (1 << 14), Cxy at a time.
    for (i = (1 << 14) - xyap; i > Cxy; i -= Cxy) {
        pix += step;
        vpix = __lsx_vshuf_b(__lsx_vldi(0), __lsx_vreplgr2vr_w(*pix), shuffleMask);
        vx = __lsx_vadd_w(vx, __lsx_vmul_w(vpix, vCxy));
    }
    // Final sample takes the leftover weight i.
    pix += step;
    vpix = __lsx_vshuf_b(__lsx_vldi(0), __lsx_vreplgr2vr_w(*pix), shuffleMask);
    vx = __lsx_vadd_w(vx, __lsx_vmul_w(vpix, __lsx_vreplgr2vr_w(i)));
    return vx;
}
// Area-averaging scale, upscaling in x while downscaling in y.
// For every destination pixel: vertically average a weighted column of
// source pixels, then (when xap > 0) linearly interpolate between the
// averages of two adjacent columns.  With RGB == true the result is
// forced opaque (alpha byte set to 0xff).
template<bool RGB>
void qt_qimageScaleAARGBA_up_x_down_y_lsx(QImageScaleInfo *isi, unsigned int *dest,
                                          int dw, int dh, int dow, int sow)
{
    const unsigned int **ypoints = isi->ypoints;
    const int *xpoints = isi->xpoints;
    const int *xapoints = isi->xapoints;
    const int *yapoints = isi->yapoints;

    const __m128i v256 = __lsx_vreplgr2vr_w(256);

    /* go through every scanline in the output buffer */
    auto scaleSection = [&] (int yStart, int yEnd) {
        for (int y = yStart; y < yEnd; ++y) {
            // Vertical weights: Cy for middle rows, yap for the first row.
            const int Cy = yapoints[y] >> 16;
            const int yap = yapoints[y] & 0xffff;
            const __m128i vCy = __lsx_vreplgr2vr_w(Cy);
            const __m128i vyap = __lsx_vreplgr2vr_w(yap);

            unsigned int *dptr = dest + (y * dow);
            for (int x = 0; x < dw; x++) {
                const unsigned int *sptr = ypoints[y] + xpoints[x];
                __m128i vx = qt_qimageScaleAARGBA_helper(sptr, yap, Cy, sow, vyap, vCy);

                const int xap = xapoints[x];
                if (xap > 0) {
                    // Horizontal blend with the next column:
                    // vx = (vx * (256 - xap) + vr * xap) / 256
                    const __m128i vxap = __lsx_vreplgr2vr_w(xap);
                    const __m128i vinvxap = __lsx_vsub_w(v256, vxap);
                    __m128i vr = qt_qimageScaleAARGBA_helper(sptr + 1, yap, Cy, sow, vyap, vCy);
                    vx = __lsx_vmul_w(vx, vinvxap);
                    vr = __lsx_vmul_w(vr, vxap);
                    vx = __lsx_vadd_w(vx, vr);
                    vx = __lsx_vsrli_w(vx, 8);
                }
                // Drop the 14-bit weight scale, then saturate and pack the
                // four 32-bit channel sums back into one ARGB32 pixel.
                vx = __lsx_vsrli_w(vx, 14);
                vx = __lsx_vpickev_h(__lsx_vsat_wu(vx, 15), __lsx_vsat_wu(vx, 15));
                vx = __lsx_vpickev_b(__lsx_vsat_hu(vx, 7), __lsx_vsat_hu(vx, 7));
                *dptr = __lsx_vpickve2gr_w(vx, 0);
                if (RGB)
                    *dptr |= 0xff000000;
                dptr++;
            }
        }
    };
    multithread_pixels_function(isi, dh, scaleSection);
}
// Area-averaging scale, downscaling in x while upscaling in y.
// For every destination pixel: horizontally average a weighted run of
// source pixels, then (when yap > 0) linearly interpolate between the
// averages of two adjacent rows.  With RGB == true the result is forced
// opaque (alpha byte set to 0xff).
template<bool RGB>
void qt_qimageScaleAARGBA_down_x_up_y_lsx(QImageScaleInfo *isi, unsigned int *dest,
                                          int dw, int dh, int dow, int sow)
{
    // const-qualified locals, matching qt_qimageScaleAARGBA_up_x_down_y_lsx.
    const unsigned int **ypoints = isi->ypoints;
    const int *xpoints = isi->xpoints;
    const int *xapoints = isi->xapoints;
    const int *yapoints = isi->yapoints;

    const __m128i v256 = __lsx_vreplgr2vr_w(256);

    /* go through every scanline in the output buffer */
    auto scaleSection = [&] (int yStart, int yEnd) {
        for (int y = yStart; y < yEnd; ++y) {
            unsigned int *dptr = dest + (y * dow);
            for (int x = 0; x < dw; x++) {
                // Horizontal weights: Cx for middle pixels, xap for the first.
                int Cx = xapoints[x] >> 16;
                int xap = xapoints[x] & 0xffff;
                const __m128i vCx = __lsx_vreplgr2vr_w(Cx);
                const __m128i vxap = __lsx_vreplgr2vr_w(xap);

                const unsigned int *sptr = ypoints[y] + xpoints[x];
                __m128i vx = qt_qimageScaleAARGBA_helper(sptr, xap, Cx, 1, vxap, vCx);

                int yap = yapoints[y];
                if (yap > 0) {
                    // Vertical blend with the next row:
                    // vx = (vx * (256 - yap) + vr * yap) / 256
                    const __m128i vyap = __lsx_vreplgr2vr_w(yap);
                    const __m128i vinvyap = __lsx_vsub_w(v256, vyap);
                    __m128i vr = qt_qimageScaleAARGBA_helper(sptr + sow, xap, Cx, 1, vxap, vCx);
                    vx = __lsx_vmul_w(vx, vinvyap);
                    vr = __lsx_vmul_w(vr, vyap);
                    vx = __lsx_vadd_w(vx, vr);
                    vx = __lsx_vsrli_w(vx, 8);
                }
                // Drop the 14-bit weight scale, then saturate and pack the
                // four 32-bit channel sums back into one ARGB32 pixel.
                // Note: saturate words to 16 bits, then halfwords to 8 bits
                // (vsat_hu in BOTH operands of the byte pick, consistent with
                // the up_x_down_y and down_xy variants).
                vx = __lsx_vsrli_w(vx, 14);
                vx = __lsx_vpickev_h(__lsx_vsat_wu(vx, 15), __lsx_vsat_wu(vx, 15));
                vx = __lsx_vpickev_b(__lsx_vsat_hu(vx, 7), __lsx_vsat_hu(vx, 7));
                *dptr = __lsx_vpickve2gr_w(vx, 0);
                if (RGB)
                    *dptr |= 0xff000000;
                dptr++;
            }
        }
    };
    multithread_pixels_function(isi, dh, scaleSection);
}
// Area-averaging scale, downscaling in both x and y: every destination
// pixel is the weighted average of a whole rectangle of source pixels.
// With RGB == true the result is forced opaque (alpha byte set to 0xff).
template<bool RGB>
void qt_qimageScaleAARGBA_down_xy_lsx(QImageScaleInfo *isi, unsigned int *dest,
                                      int dw, int dh, int dow, int sow)
{
    const unsigned int **ypoints = isi->ypoints;
    int *xpoints = isi->xpoints;
    int *xapoints = isi->xapoints;
    int *yapoints = isi->yapoints;

    auto scaleSection = [&] (int yStart, int yEnd) {
        for (int y = yStart; y < yEnd; ++y) {
            // Vertical weights: Cy for middle rows, yap for the first row.
            int Cy = yapoints[y] >> 16;
            int yap = yapoints[y] & 0xffff;
            const __m128i vCy = __lsx_vreplgr2vr_w(Cy);
            const __m128i vyap = __lsx_vreplgr2vr_w(yap);

            unsigned int *dptr = dest + (y * dow);
            for (int x = 0; x < dw; x++) {
                // Horizontal weights: Cx for middle pixels, xap for the first.
                const int Cx = xapoints[x] >> 16;
                const int xap = xapoints[x] & 0xffff;
                const __m128i vCx = __lsx_vreplgr2vr_w(Cx);
                const __m128i vxap = __lsx_vreplgr2vr_w(xap);

                const unsigned int *sptr = ypoints[y] + xpoints[x];
                // Horizontal average of the first row; each row sum is
                // pre-shifted right by 4 so the combined horizontal and
                // vertical weight stays within 32 bits (final scale is
                // removed with the >> 24 below).
                __m128i vx = qt_qimageScaleAARGBA_helper(sptr, xap, Cx, 1, vxap, vCx);
                __m128i vr = __lsx_vmul_w(__lsx_vsrli_w(vx, 4), vyap);

                int j;
                // Accumulate middle rows, Cy weight each.
                for (j = (1 << 14) - yap; j > Cy; j -= Cy) {
                    sptr += sow;
                    vx = qt_qimageScaleAARGBA_helper(sptr, xap, Cx, 1, vxap, vCx);
                    vr = __lsx_vadd_w(vr, __lsx_vmul_w(__lsx_vsrli_w(vx, 4), vCy));
                }
                // Last row takes the remaining vertical weight j.
                sptr += sow;
                vx = qt_qimageScaleAARGBA_helper(sptr, xap, Cx, 1, vxap, vCx);
                vr = __lsx_vadd_w(vr, __lsx_vmul_w(__lsx_vsrli_w(vx, 4), __lsx_vreplgr2vr_w(j)));

                // Drop the weight scale, then saturate and pack the four
                // 32-bit channel sums back into one ARGB32 pixel.
                vr = __lsx_vsrli_w(vr, 24);
                vr = __lsx_vpickev_h(__lsx_vldi(0), __lsx_vsat_wu(vr, 15));
                vr = __lsx_vpickev_b(__lsx_vldi(0), __lsx_vsat_hu(vr, 7));
                *dptr = __lsx_vpickve2gr_w(vr, 0);
                if (RGB)
                    *dptr |= 0xff000000;
                dptr++;
            }
        }
    };
    multithread_pixels_function(isi, dh, scaleSection);
}
// Explicit instantiations for the two pixel formats dispatched from
// qimagescale.cpp: RGB = false preserves the alpha channel (ARGB),
// RGB = true forces the result opaque.
template void qt_qimageScaleAARGBA_up_x_down_y_lsx<false>(QImageScaleInfo *isi, unsigned int *dest,
                                                          int dw, int dh, int dow, int sow);

template void qt_qimageScaleAARGBA_up_x_down_y_lsx<true>(QImageScaleInfo *isi, unsigned int *dest,
                                                         int dw, int dh, int dow, int sow);

template void qt_qimageScaleAARGBA_down_x_up_y_lsx<false>(QImageScaleInfo *isi, unsigned int *dest,
                                                          int dw, int dh, int dow, int sow);

template void qt_qimageScaleAARGBA_down_x_up_y_lsx<true>(QImageScaleInfo *isi, unsigned int *dest,
                                                         int dw, int dh, int dow, int sow);

template void qt_qimageScaleAARGBA_down_xy_lsx<false>(QImageScaleInfo *isi, unsigned int *dest,
                                                      int dw, int dh, int dow, int sow);

template void qt_qimageScaleAARGBA_down_xy_lsx<true>(QImageScaleInfo *isi, unsigned int *dest,
                                                     int dw, int dh, int dow, int sow);
QT_END_NAMESPACE
#endif