Complete drawhelper functions with LSX

List of optimized implementations using LSX:

- qt_blend_argb32_on_argb32
- qt_blend_rgb32_on_rgb32
- comp_func_SourceOver
- comp_func_Plus
- comp_func_Source
- comp_func_solid_Source
- comp_func_solid_SourceOver
- qt_memfill64
- qt_memfill32
- qt_bitmapblit32
- qt_bitmapblit16
- qt_scale_image_argb32_on_argb32
- convert_RGB888_to_RGB32
- qt_qimageScaleAARGBA_up_x_down_y
- qt_qimageScaleAARGBA_down_x_up_y
- qt_qimageScaleAARGBA_down_xy

All of the above functions have passed the tests under tests/auto/gui.

Change-Id: I7ae6169305b81bdf7fb704619453c505f8bb960f
Reviewed-by: Volker Hilsheimer <volker.hilsheimer@qt.io>
This commit is contained in:
Chen Zhanwang 2024-06-21 17:05:49 +08:00 committed by Volker Hilsheimer
parent 73ce5a940a
commit d511a68684
11 changed files with 1347 additions and 6 deletions

View File

@ -183,6 +183,8 @@ qt_internal_add_module(Gui
painting/qdrawhelper_p.h
painting/qdrawhelper_x86_p.h
painting/qdrawingprimitive_sse2_p.h
painting/qdrawhelper_loongarch64_p.h
painting/qdrawingprimitive_lsx_p.h
painting/qemulationpaintengine.cpp painting/qemulationpaintengine_p.h
painting/qfixed_p.h
painting/qgrayraster.c painting/qgrayraster_p.h
@ -655,6 +657,13 @@ qt_internal_add_simd_part(Gui SIMD neon
painting/qimagescale_neon.cpp
)
qt_internal_add_simd_part(Gui SIMD lsx
SOURCES
image/qimage_lsx.cpp
painting/qdrawhelper_lsx.cpp
painting/qimagescale_lsx.cpp
)
if(NOT ANDROID)
qt_internal_add_simd_part(Gui SIMD mips_dsp
SOURCES

View File

@ -2754,6 +2754,18 @@ static void qInitImageConversions()
}
#endif
#if defined(QT_COMPILER_SUPPORTS_LSX)
if (qCpuHasFeature(LSX)) {
extern void convert_RGB888_to_RGB32_lsx(QImageData *dest, const QImageData *src, Qt::ImageConversionFlags);
qimage_converter_map[QImage::Format_RGB888][QImage::Format_RGB32] = convert_RGB888_to_RGB32_lsx;
qimage_converter_map[QImage::Format_RGB888][QImage::Format_ARGB32] = convert_RGB888_to_RGB32_lsx;
qimage_converter_map[QImage::Format_RGB888][QImage::Format_ARGB32_Premultiplied] = convert_RGB888_to_RGB32_lsx;
qimage_converter_map[QImage::Format_BGR888][QImage::Format_RGBX8888] = convert_RGB888_to_RGB32_lsx;
qimage_converter_map[QImage::Format_BGR888][QImage::Format_RGBA8888] = convert_RGB888_to_RGB32_lsx;
qimage_converter_map[QImage::Format_BGR888][QImage::Format_RGBA8888_Premultiplied] = convert_RGB888_to_RGB32_lsx;
}
#endif
#if defined(__ARM_NEON__)
extern void convert_RGB888_to_RGB32_neon(QImageData *dest, const QImageData *src, Qt::ImageConversionFlags);
qimage_converter_map[QImage::Format_RGB888][QImage::Format_RGB32] = convert_RGB888_to_RGB32_neon;

View File

@ -0,0 +1,115 @@
// Copyright (C) 2016 The Qt Company Ltd.
// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
#include <qimage.h>
#include <private/qimage_p.h>
#include <private/qsimd_p.h>
#ifdef QT_COMPILER_SUPPORTS_LSX
QT_BEGIN_NAMESPACE
// Convert a scanline of RGB888 (src) to RGB32 (dst)
// src must be at least len * 3 bytes
// dst must be at least len * 4 bytes
// Convert a scanline of RGB888 (src) to RGB32 (dst).
// src must be at least len * 3 bytes
// dst must be at least len * 4 bytes
//
// Layout note: an RGB888 pixel is 3 bytes (R, G, B in memory order); an RGB32
// pixel is 4 bytes (B, G, R, 0xFF on little-endian). The SIMD loop below
// converts 16 pixels (48 source bytes -> 64 destination bytes) per iteration.
Q_GUI_EXPORT void QT_FASTCALL qt_convert_rgb888_to_rgb32_lsx(quint32 *dst, const uchar *src, int len)
{
    int i = 0;

    // Prologue, align dst to 16 bytes (scalar conversion for the first few pixels).
    ALIGNMENT_PROLOGUE_16BYTES(dst, i, len) {
        dst[i] = qRgb(src[0], src[1], src[2]);
        src += 3;
    }

    // vshuf.b control semantics: an index byte 0-15 selects from the second
    // vector operand, 16-31 selects from the first. Index 16 therefore picks
    // alphaMask[0] == 0xff, producing the opaque alpha byte of each pixel.

    // Mask the 4 first colors of the RGB888 vector
    const __m128i shuffleMask = (__m128i)(v16i8){2, 1, 0, 16, 5, 4, 3, 16,
                                                 8, 7, 6, 16, 11, 10, 9, 16};
    // Mask the 4 last colors of a RGB888 vector with an offset of 1 (so the last 3 bytes are RGB)
    const __m128i shuffleMaskEnd = (__m128i)(v16i8){6, 5, 4, 16, 9, 8, 7, 16,
                                                    12, 11, 10, 16, 15, 14, 13, 16};
    // Mask to have alpha = 0xff
    const __m128i alphaMask = __lsx_vreplgr2vr_b(0xff);
    // Mask to concatenate 16-byte blocks in a and b into a 32-byte temporary result, shift the result right by 12 bytes
    // (equivalent of x86 palignr by 12: takes the last 4 bytes of the first block and the first 12 of the second)
    const __m128i indexMask1 = (__m128i)(v16i8){12, 13, 14, 15, 16, 17, 18, 19,
                                                20, 21, 22, 23, 24, 25, 26, 27};
    // Mask to concatenate 16-byte blocks in a and b into a 32-byte temporary result, shift the result right by 8 bytes
    const __m128i indexMask2 = (__m128i)(v16i8){8, 9, 10, 11, 12, 13, 14, 15,
                                                16, 17, 18, 19, 20, 21, 22, 23};

    const __m128i *inVectorPtr = (const __m128i *)src;
    __m128i *dstVectorPtr = (__m128i *)(dst + i);

    for (; i < (len - 15); i += 16) { // one iteration in the loop converts 16 pixels
        /*
         RGB888 has 5 pixels per vector, + 1 byte from the next pixel. The idea here is
         to load vectors of RGB888 and use palignr to select a vector out of two vectors.

         After 3 loads of RGB888 and 3 stores of RGB32, we have 4 pixels left in the last
         vector of RGB888, we can mask it directly to get a last store of RGB32. After that,
         the first next byte is a R, and we can loop for the next 16 pixels.

         The conversion itself is done with a byte permutation (vshuf_b).
        */
        __m128i firstSrcVector = __lsx_vld(inVectorPtr, 0);
        __m128i outputVector = __lsx_vshuf_b(alphaMask, firstSrcVector, shuffleMask);
        __lsx_vst(outputVector, dstVectorPtr, 0);
        ++inVectorPtr;
        ++dstVectorPtr;

        // There are 4 unused bytes left in srcVector, we need to load the next 16 bytes
        __m128i secondSrcVector = __lsx_vld(inVectorPtr, 0);
        __m128i srcVector = __lsx_vshuf_b(secondSrcVector, firstSrcVector, indexMask1);
        outputVector = __lsx_vshuf_b(alphaMask, srcVector, shuffleMask);
        __lsx_vst(outputVector, dstVectorPtr, 0);
        ++inVectorPtr;
        ++dstVectorPtr;
        firstSrcVector = secondSrcVector;

        // We now have 8 unused bytes left in firstSrcVector
        secondSrcVector = __lsx_vld(inVectorPtr, 0);
        srcVector = __lsx_vshuf_b(secondSrcVector, firstSrcVector, indexMask2);
        outputVector = __lsx_vshuf_b(alphaMask, srcVector, shuffleMask);
        __lsx_vst(outputVector, dstVectorPtr, 0);
        ++inVectorPtr;
        ++dstVectorPtr;

        // There are now 12 unused bytes in firstSrcVector.
        // We can mask them directly, almost there.
        outputVector = __lsx_vshuf_b(alphaMask, secondSrcVector, shuffleMaskEnd);
        __lsx_vst(outputVector, dstVectorPtr, 0);
        ++dstVectorPtr;
    }

    // Epilogue: scalar conversion for the trailing (len % 16) pixels.
    src = (const uchar *)inVectorPtr;
    SIMD_EPILOGUE(i, len, 15) {
        dst[i] = qRgb(src[0], src[1], src[2]);
        src += 3;
    }
}
// Full-image RGB888/BGR888 -> 32-bit conversion: validates the format pairing,
// then runs the LSX scanline converter over every row of the image.
void convert_RGB888_to_RGB32_lsx(QImageData *dest, const QImageData *src, Qt::ImageConversionFlags)
{
    Q_ASSERT(src->format == QImage::Format_RGB888 || src->format == QImage::Format_BGR888);
    if (src->format == QImage::Format_BGR888)
        Q_ASSERT(dest->format == QImage::Format_RGBX8888 || dest->format == QImage::Format_RGBA8888 || dest->format == QImage::Format_RGBA8888_Premultiplied);
    else
        Q_ASSERT(dest->format == QImage::Format_RGB32 || dest->format == QImage::Format_ARGB32 || dest->format == QImage::Format_ARGB32_Premultiplied);
    Q_ASSERT(src->width == dest->width);
    Q_ASSERT(src->height == dest->height);

    const uchar *inLine = (const uchar *) src->data;
    uchar *outLine = (uchar *) dest->data;
    // Walk both images scanline by scanline; strides may include padding,
    // so advance by bytes_per_line rather than width * pixel size.
    for (int rows = src->height; rows > 0; --rows) {
        qt_convert_rgb888_to_rgb32_lsx(reinterpret_cast<quint32 *>(outLine), inLine, src->width);
        inLine += src->bytes_per_line;
        outLine += dest->bytes_per_line;
    }
}
QT_END_NAMESPACE
#endif // QT_COMPILER_SUPPORTS_LSX

View File

@ -560,7 +560,7 @@ inline QImage::Format qt_opaqueVersionForPainting(QImage::Format format)
inline QImage::Format qt_alphaVersionForPainting(QImage::Format format)
{
QImage::Format toFormat = qt_alphaVersion(format);
#if defined(__ARM_NEON__) || defined(__SSE2__)
#if defined(__ARM_NEON__) || defined(__SSE2__) || defined(QT_COMPILER_SUPPORTS_LSX)
// If we are switching depth anyway and we have optimized ARGB32PM routines, upgrade to that.
if (qt_depthForFormat(format) != qt_depthForFormat(toFormat) && qt_depthForFormat(toFormat) <= 32)
toFormat = QImage::Format_ARGB32_Premultiplied;

View File

@ -12,6 +12,8 @@
#include <private/qdrawhelper_p.h>
#include <private/qdrawhelper_x86_p.h>
#include <private/qdrawingprimitive_sse2_p.h>
#include <private/qdrawhelper_loongarch64_p.h>
#include <private/qdrawingprimitive_lsx_p.h>
#include <private/qdrawhelper_neon_p.h>
#if defined(QT_COMPILER_SUPPORTS_MIPS_DSP) || defined(QT_COMPILER_SUPPORTS_MIPS_DSPR2)
#include <private/qdrawhelper_mips_dsp_p.h>
@ -4971,7 +4973,7 @@ void qBlendTexture(int count, const QT_FT_Span *spans, void *userData)
case QImage::Format_RGB16:
proc = processTextureSpansRGB16[blendType];
break;
#if defined(__SSE2__) || defined(__ARM_NEON__) || (Q_PROCESSOR_WORDSIZE == 8)
#if defined(__SSE2__) || defined(__ARM_NEON__) || defined(QT_COMPILER_SUPPORTS_LSX) || (Q_PROCESSOR_WORDSIZE == 8)
case QImage::Format_ARGB32:
case QImage::Format_RGBA8888:
#endif
@ -5113,7 +5115,7 @@ void qBlendGradient(int count, const QT_FT_Span *spans, void *userData)
if (isVerticalGradient && blend_vertical_gradient_argb(count, spans, userData))
return;
return blend_src_generic(count, spans, userData);
#if defined(__SSE2__) || defined(__ARM_NEON__) || (Q_PROCESSOR_WORDSIZE == 8)
#if defined(__SSE2__) || defined(__ARM_NEON__) || defined(QT_COMPILER_SUPPORTS_LSX) || (Q_PROCESSOR_WORDSIZE == 8)
case QImage::Format_ARGB32:
case QImage::Format_RGBA8888:
#endif
@ -6368,7 +6370,7 @@ DrawHelper qDrawHelper[] =
static_assert(std::size(qDrawHelper) == QImage::NImageFormats);
#if !defined(Q_PROCESSOR_X86)
#if !defined(Q_PROCESSOR_X86) && !defined(QT_COMPILER_SUPPORTS_LSX)
void qt_memfill64(quint64 *dest, quint64 color, qsizetype count)
{
qt_memfill_template<quint64>(dest, color, count);
@ -6435,7 +6437,7 @@ void qt_memfill16(quint16 *dest, quint16 value, qsizetype count)
qt_memfill32(reinterpret_cast<quint32*>(dest), value32, count / 2);
}
#if defined(Q_PROCESSOR_X86)
#if defined(Q_PROCESSOR_X86) || defined(QT_COMPILER_SUPPORTS_LSX)
void (*qt_memfill32)(quint32 *dest, quint32 value, qsizetype count) = nullptr;
void (*qt_memfill64)(quint64 *dest, quint64 value, qsizetype count) = nullptr;
#elif !defined(__ARM_NEON__) && !defined(__MIPS_DSP__)
@ -6712,6 +6714,68 @@ static void qInitDrawhelperFunctions()
#endif // SSE2
#if defined(QT_COMPILER_SUPPORTS_LSX)
if (qCpuHasFeature(LSX)) {
qt_memfill32 = qt_memfill32_lsx;
qt_memfill64 = qt_memfill64_lsx;
qDrawHelper[QImage::Format_RGB32].bitmapBlit = qt_bitmapblit32_lsx;
qDrawHelper[QImage::Format_ARGB32].bitmapBlit = qt_bitmapblit32_lsx;
qDrawHelper[QImage::Format_ARGB32_Premultiplied].bitmapBlit = qt_bitmapblit32_lsx;
qDrawHelper[QImage::Format_RGB16].bitmapBlit = qt_bitmapblit16_lsx;
qDrawHelper[QImage::Format_RGBX8888].bitmapBlit = qt_bitmapblit8888_lsx;
qDrawHelper[QImage::Format_RGBA8888].bitmapBlit = qt_bitmapblit8888_lsx;
qDrawHelper[QImage::Format_RGBA8888_Premultiplied].bitmapBlit = qt_bitmapblit8888_lsx;
extern void qt_scale_image_argb32_on_argb32_lsx(uchar *destPixels, int dbpl,
const uchar *srcPixels, int sbpl, int srch,
const QRectF &targetRect,
const QRectF &sourceRect,
const QRect &clip,
int const_alpha);
qScaleFunctions[QImage::Format_ARGB32_Premultiplied][QImage::Format_ARGB32_Premultiplied] = qt_scale_image_argb32_on_argb32_lsx;
qScaleFunctions[QImage::Format_RGB32][QImage::Format_ARGB32_Premultiplied] = qt_scale_image_argb32_on_argb32_lsx;
qScaleFunctions[QImage::Format_RGBA8888_Premultiplied][QImage::Format_RGBA8888_Premultiplied] = qt_scale_image_argb32_on_argb32_lsx;
qScaleFunctions[QImage::Format_RGBX8888][QImage::Format_RGBA8888_Premultiplied] = qt_scale_image_argb32_on_argb32_lsx;
extern void qt_blend_rgb32_on_rgb32_lsx(uchar *destPixels, int dbpl,
const uchar *srcPixels, int sbpl,
int w, int h,
int const_alpha);
extern void qt_blend_argb32_on_argb32_lsx(uchar *destPixels, int dbpl,
const uchar *srcPixels, int sbpl,
int w, int h,
int const_alpha);
qBlendFunctions[QImage::Format_RGB32][QImage::Format_RGB32] = qt_blend_rgb32_on_rgb32_lsx;
qBlendFunctions[QImage::Format_ARGB32_Premultiplied][QImage::Format_RGB32] = qt_blend_rgb32_on_rgb32_lsx;
qBlendFunctions[QImage::Format_RGB32][QImage::Format_ARGB32_Premultiplied] = qt_blend_argb32_on_argb32_lsx;
qBlendFunctions[QImage::Format_ARGB32_Premultiplied][QImage::Format_ARGB32_Premultiplied] = qt_blend_argb32_on_argb32_lsx;
qBlendFunctions[QImage::Format_RGBX8888][QImage::Format_RGBX8888] = qt_blend_rgb32_on_rgb32_lsx;
qBlendFunctions[QImage::Format_RGBA8888_Premultiplied][QImage::Format_RGBX8888] = qt_blend_rgb32_on_rgb32_lsx;
qBlendFunctions[QImage::Format_RGBX8888][QImage::Format_RGBA8888_Premultiplied] = qt_blend_argb32_on_argb32_lsx;
qBlendFunctions[QImage::Format_RGBA8888_Premultiplied][QImage::Format_RGBA8888_Premultiplied] = qt_blend_argb32_on_argb32_lsx;
extern const uint * QT_FASTCALL qt_fetch_radial_gradient_lsx(uint *buffer, const Operator *op, const QSpanData *data,
int y, int x, int length);
qt_fetch_radial_gradient = qt_fetch_radial_gradient_lsx;
extern void QT_FASTCALL comp_func_SourceOver_lsx(uint *destPixels, const uint *srcPixels, int length, uint const_alpha);
extern void QT_FASTCALL comp_func_solid_SourceOver_lsx(uint *destPixels, int length, uint color, uint const_alpha);
extern void QT_FASTCALL comp_func_Source_lsx(uint *destPixels, const uint *srcPixels, int length, uint const_alpha);
extern void QT_FASTCALL comp_func_solid_Source_lsx(uint *destPixels, int length, uint color, uint const_alpha);
extern void QT_FASTCALL comp_func_Plus_lsx(uint *destPixels, const uint *srcPixels, int length, uint const_alpha);
qt_functionForMode_C[QPainter::CompositionMode_SourceOver] = comp_func_SourceOver_lsx;
qt_functionForModeSolid_C[QPainter::CompositionMode_SourceOver] = comp_func_solid_SourceOver_lsx;
qt_functionForMode_C[QPainter::CompositionMode_Source] = comp_func_Source_lsx;
qt_functionForModeSolid_C[QPainter::CompositionMode_Source] = comp_func_solid_Source_lsx;
qt_functionForMode_C[QPainter::CompositionMode_Plus] = comp_func_Plus_lsx;
}
#endif //QT_COMPILER_SUPPORTS_LSX
#if defined(__ARM_NEON__)
qBlendFunctions[QImage::Format_RGB32][QImage::Format_RGB32] = qt_blend_rgb32_on_rgb32_neon;
qBlendFunctions[QImage::Format_ARGB32_Premultiplied][QImage::Format_RGB32] = qt_blend_rgb32_on_rgb32_neon;

View File

@ -0,0 +1,48 @@
// Copyright (C) 2024 Loongson Technology Corporation Limited.
// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only

#ifndef QDRAWHELPER_LOONGARCH64_P_H
#define QDRAWHELPER_LOONGARCH64_P_H

//
//  W A R N I N G
//  -------------
//
// This file is not part of the Qt API.  It exists purely as an
// implementation detail.  This header file may change from version to
// version without notice, or even be removed.
//
// We mean it.
//

#include <QtGui/private/qtguiglobal_p.h>
#include <private/qdrawhelper_p.h>

QT_BEGIN_NAMESPACE

#ifdef QT_COMPILER_SUPPORTS_LSX

// Vectorized memory fills (implemented in qdrawhelper_lsx.cpp).
void qt_memfill64_lsx(quint64 *dest, quint64 value, qsizetype count);
void qt_memfill32_lsx(quint32 *dest, quint32 value, qsizetype count);

// 1-bit-mask blits of a solid color into 32-bit (ARGB32 / RGBA8888) and
// 16-bit (RGB565) raster buffers; 'stride' is the source bitmap's byte stride.
void qt_bitmapblit32_lsx(QRasterBuffer *rasterBuffer, int x, int y,
                         const QRgba64 &color,
                         const uchar *src, int width, int height, int stride);
void qt_bitmapblit8888_lsx(QRasterBuffer *rasterBuffer, int x, int y,
                           const QRgba64 &color,
                           const uchar *src, int width, int height, int stride);
void qt_bitmapblit16_lsx(QRasterBuffer *rasterBuffer, int x, int y,
                         const QRgba64 &color,
                         const uchar *src, int width, int height, int stride);

// Rectangular blends; const_alpha is 0..256 where 256 means fully opaque.
void qt_blend_argb32_on_argb32_lsx(uchar *destPixels, int dbpl,
                                   const uchar *srcPixels, int sbpl,
                                   int w, int h,
                                   int const_alpha);

void qt_blend_rgb32_on_rgb32_lsx(uchar *destPixels, int dbpl,
                                 const uchar *srcPixels, int sbpl,
                                 int w, int h,
                                 int const_alpha);

#endif // QT_COMPILER_SUPPORTS_LSX

QT_END_NAMESPACE

#endif // QDRAWHELPER_LOONGARCH64_P_H

View File

@ -0,0 +1,593 @@
// Copyright (C) 2024 Loongson Technology Corporation Limited.
// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
#include <private/qdrawhelper_loongarch64_p.h>
#ifdef QT_COMPILER_SUPPORTS_LSX
#include <private/qdrawingprimitive_lsx_p.h>
#include <private/qpaintengine_raster_p.h>
QT_BEGIN_NAMESPACE
// Source-over blend of a premultiplied ARGB32 rectangle onto a premultiplied
// ARGB32 destination. const_alpha is 0..256; 256 means fully opaque.
void qt_blend_argb32_on_argb32_lsx(uchar *destPixels, int dbpl,
                                   const uchar *srcPixels, int sbpl,
                                   int w, int h,
                                   int const_alpha)
{
    if (const_alpha == 0)
        return; // fully transparent overlay: destination is untouched

    const quint32 *srcRow = reinterpret_cast<const quint32 *>(srcPixels);
    quint32 *dstRow = reinterpret_cast<quint32 *>(destPixels);

    if (const_alpha == 256) {
        // Plain source-over, one scanline at a time.
        for (int row = 0; row < h; ++row) {
            BLEND_SOURCE_OVER_ARGB32_LSX(dstRow, srcRow, w);
            dstRow = reinterpret_cast<quint32 *>(reinterpret_cast<uchar *>(dstRow) + dbpl);
            srcRow = reinterpret_cast<const quint32 *>(reinterpret_cast<const uchar *>(srcRow) + sbpl);
        }
    } else {
        // dest = (s + d * sia) * ca + d * cia
        //      = s * ca + d * (sia * ca + cia)
        //      = s * ca + d * (1 - sa*ca)
        const int scaledAlpha = (const_alpha * 255) >> 8; // map 0..256 onto 0..255
        for (int row = 0; row < h; ++row) {
            BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_LSX(dstRow, srcRow, w, scaledAlpha);
            dstRow = reinterpret_cast<quint32 *>(reinterpret_cast<uchar *>(dstRow) + dbpl);
            srcRow = reinterpret_cast<const quint32 *>(reinterpret_cast<const uchar *>(srcRow) + sbpl);
        }
    }
}
// qblendfunctions.cpp
void qt_blend_rgb32_on_rgb32(uchar *destPixels, int dbpl,
                             const uchar *srcPixels, int sbpl,
                             int w, int h,
                             int const_alpha);

// Blend an opaque RGB32 rectangle onto an RGB32 destination with a constant
// opacity. const_alpha is 0..256; the fully-opaque case (256) has no LSX
// advantage here and is delegated to the generic scalar implementation.
void qt_blend_rgb32_on_rgb32_lsx(uchar *destPixels, int dbpl,
                                 const uchar *srcPixels, int sbpl,
                                 int w, int h,
                                 int const_alpha)
{
    const quint32 *src = (const quint32 *) srcPixels;
    quint32 *dst = (quint32 *) destPixels;
    if (const_alpha != 256) {
        if (const_alpha != 0) {
            // Constants for INTERPOLATE_PIXEL_255_LSX: rounding bias and the
            // 0x00ff00ff mask used to split each pixel into 16-bit channels.
            const __m128i half = __lsx_vreplgr2vr_h(0x80);
            const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff);

            // Map const_alpha from 0..256 to 0..255 so it pairs with its
            // 255-based complement in the interpolation below.
            const_alpha = (const_alpha * 255) >> 8;
            int one_minus_const_alpha = 255 - const_alpha;
            const __m128i constAlphaVector = __lsx_vreplgr2vr_h(const_alpha);
            const __m128i oneMinusConstAlpha =  __lsx_vreplgr2vr_h(one_minus_const_alpha);
            for (int y = 0; y < h; ++y) {
                int x = 0;

                // First, align dest to 16 bytes:
                ALIGNMENT_PROLOGUE_16BYTES(dst, x, w) {
                    dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha,
                                                   dst[x], one_minus_const_alpha);
                }

                // 4 pixels per iteration: dst = src*ca + dst*(255-ca).
                for (; x < w-3; x += 4) {
                    __m128i srcVector = __lsx_vld(&src[x], 0);
                    __m128i dstVector = __lsx_vld(&dst[x], 0);
                    INTERPOLATE_PIXEL_255_LSX(srcVector, dstVector, constAlphaVector,
                                              oneMinusConstAlpha, colorMask, half);
                    __lsx_vst(dstVector, &dst[x], 0);
                }

                // Scalar tail for the last (w % 4) pixels.
                SIMD_EPILOGUE(x, w, 3)
                    dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha,
                                                   dst[x], one_minus_const_alpha);
                dst = (quint32 *)(((uchar *) dst) + dbpl);
                src = (const quint32 *)(((const uchar *) src) + sbpl);
            }
        }
    } else {
        // Fully opaque: a straight row copy, handled by the generic version.
        qt_blend_rgb32_on_rgb32(destPixels, dbpl, srcPixels, sbpl, w, h, const_alpha);
    }
}
// Span-level Porter-Duff SourceOver for premultiplied ARGB32.
// const_alpha is 0..255 here (span composition convention), not 0..256.
void QT_FASTCALL comp_func_SourceOver_lsx(uint *destPixels, const uint *srcPixels,
                                          int length, uint const_alpha)
{
    Q_ASSERT(const_alpha < 256);

    const quint32 *in = reinterpret_cast<const quint32 *>(srcPixels);
    quint32 *out = reinterpret_cast<quint32 *>(destPixels);

    if (const_alpha != 255) {
        // Source additionally scaled by the span's constant opacity.
        BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_LSX(out, in, length, const_alpha);
    } else {
        // Plain source-over.
        BLEND_SOURCE_OVER_ARGB32_LSX(out, in, length);
    }
}
// Porter-Duff Plus (additive) composition: per-channel saturating add of
// src onto dst, optionally interpolated with dst by const_alpha (0..255).
void QT_FASTCALL comp_func_Plus_lsx(uint *dst, const uint *src, int length, uint const_alpha)
{
    int x = 0;

    if (const_alpha == 255) {
        // 1) Prologue: align destination on 16 bytes
        ALIGNMENT_PROLOGUE_16BYTES(dst, x, length)
            dst[x] = comp_func_Plus_one_pixel(dst[x], src[x]);

        // 2) composition with LSX (vsadd.bu = unsigned saturating byte add)
        for (; x < length - 3; x += 4) {
            const __m128i srcVector = __lsx_vld(&src[x], 0);
            const __m128i dstVector = __lsx_vld(&dst[x], 0);

            const __m128i result = __lsx_vsadd_bu(srcVector, dstVector);
            __lsx_vst(result, &dst[x], 0);
        }

        // 3) Epilogue: scalar tail for the last (length % 4) pixels
        SIMD_EPILOGUE(x, length, 3)
            dst[x] = comp_func_Plus_one_pixel(dst[x], src[x]);
    } else {
        // result = interpolate(saturate(src + dst), dst, const_alpha)
        const int one_minus_const_alpha = 255 - const_alpha;
        const __m128i constAlphaVector = __lsx_vreplgr2vr_h(const_alpha);
        const __m128i oneMinusConstAlpha =  __lsx_vreplgr2vr_h(one_minus_const_alpha);

        // 1) Prologue: align destination on 16 bytes
        ALIGNMENT_PROLOGUE_16BYTES(dst, x, length)
            dst[x] = comp_func_Plus_one_pixel_const_alpha(dst[x], src[x],
                                                          const_alpha,
                                                          one_minus_const_alpha);

        // Rounding bias and channel-split mask for INTERPOLATE_PIXEL_255_LSX.
        const __m128i half = __lsx_vreplgr2vr_h(0x80);
        const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff);
        // 2) composition with LSX
        for (; x < length - 3; x += 4) {
            const __m128i srcVector = __lsx_vld(&src[x], 0);
            __m128i dstVector = __lsx_vld(&dst[x], 0);

            __m128i result = __lsx_vsadd_bu(srcVector, dstVector);
            INTERPOLATE_PIXEL_255_LSX(result, dstVector, constAlphaVector,
                                      oneMinusConstAlpha, colorMask, half);
            __lsx_vst(dstVector, &dst[x], 0);
        }

        // 3) Epilogue: scalar tail
        SIMD_EPILOGUE(x, length, 3)
            dst[x] = comp_func_Plus_one_pixel_const_alpha(dst[x], src[x],
                                                          const_alpha, one_minus_const_alpha);
    }
}
// Porter-Duff Source composition: replace dst by src, or — with a constant
// opacity below 255 — blend dst = src*ca + dst*(255-ca).
void QT_FASTCALL comp_func_Source_lsx(uint *dst, const uint *src, int length, uint const_alpha)
{
    if (const_alpha == 255) {
        // Full replacement is a plain memcpy.
        ::memcpy(dst, src, length * sizeof(uint));
    } else {
        const int ialpha = 255 - const_alpha;

        int x = 0;

        // 1) prologue, align on 16 bytes
        ALIGNMENT_PROLOGUE_16BYTES(dst, x, length)
            dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], ialpha);

        // 2) interpolate pixels with LSX (4 pixels per iteration)
        const __m128i half = __lsx_vreplgr2vr_h(0x80);
        const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff);
        const __m128i constAlphaVector = __lsx_vreplgr2vr_h(const_alpha);
        const __m128i oneMinusConstAlpha =  __lsx_vreplgr2vr_h(ialpha);
        for (; x < length - 3; x += 4) {
            const __m128i srcVector = __lsx_vld(&src[x], 0);
            __m128i dstVector = __lsx_vld(&dst[x], 0);
            INTERPOLATE_PIXEL_255_LSX(srcVector, dstVector, constAlphaVector,
                                      oneMinusConstAlpha, colorMask, half);
            __lsx_vst(dstVector, &dst[x], 0);
        }

        // 3) Epilogue: scalar tail
        SIMD_EPILOGUE(x, length, 3)
            dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], ialpha);
    }
}
// Fill [dest, dest + bytecount) with the 16-byte pattern value128.
// dest must be 16-byte aligned; bytecount is expected to be a multiple of 16
// (the memfill wrappers below guarantee both).
static Q_NEVER_INLINE
void Q_DECL_VECTORCALL qt_memfillXX_aligned(void *dest, __m128i value128, quintptr bytecount)
{
    __m128i *cursor = reinterpret_cast<__m128i *>(dest);
    __m128i *end128 = reinterpret_cast<__m128i *>(static_cast<uchar *>(dest) + bytecount);

    // Main loop: four 16-byte stores per iteration.
    for (; cursor + 4 <= end128; cursor += 4) {
        __lsx_vst(value128, cursor + 0, 0);
        __lsx_vst(value128, cursor + 1, 0);
        __lsx_vst(value128, cursor + 2, 0);
        __lsx_vst(value128, cursor + 3, 0);
    }

    // Tail: up to three remaining full vectors.
    const quintptr tailBytes = bytecount % (4 * sizeof(__m128i));
    for (quintptr n = tailBytes / sizeof(__m128i); n != 0; --n)
        __lsx_vst(value128, cursor++, 0);
}
// Fill 'count' quint64 slots with 'value' using 16-byte vector stores.
void qt_memfill64_lsx(quint64 *dest, quint64 value, qsizetype count)
{
    // A quint64 pointer can only be misaligned by 8 bytes relative to a
    // 16-byte boundary; emit one scalar store to reach alignment.
    if (count && quintptr(dest) % sizeof(__m128i)) {
        *dest++ = value;
        --count;
    }

    // If an odd number of elements remains, write the last one now so the
    // vector fill covers an exact multiple of 16 bytes.
    if (count % 2) {
        dest[count - 1] = value;
        --count;
    }

    qt_memfillXX_aligned(dest, __lsx_vreplgr2vr_d(value), count * sizeof(quint64));
}
// Fill 'count' quint32 slots with 'value' using 16-byte vector stores.
void qt_memfill32_lsx(quint32 *dest, quint32 value, qsizetype count)
{
    if (count < 4) {
        // this simplifies the code below: the first switch can fall through
        // without checking the value of count
        switch (count) {
        case 3: *dest++ = value; Q_FALLTHROUGH();
        case 2: *dest++ = value; Q_FALLTHROUGH();
        case 1: *dest = value;
        }
        return;
    }

    // Head: a quint32 pointer is misaligned by 0, 4, 8 or 12 bytes. The
    // deliberate fallthrough writes exactly enough scalar elements (3, 2 or 1)
    // to bring dest to a 16-byte boundary.
    const int align = (quintptr)(dest) & 0xf;
    switch (align) {
    case 4:  *dest++ = value; --count; Q_FALLTHROUGH();
    case 8:  *dest++ = value; --count; Q_FALLTHROUGH();
    case 12: *dest++ = value; --count;
    }

    // Tail: write the last (count % 4) elements up front; the aligned vector
    // fill below only covers whole 16-byte blocks (it truncates sub-vector
    // remainders), so these scalar stores complete the range.
    const int rest = count & 0x3;
    if (rest) {
        switch (rest) {
        case 3: dest[count - 3] = value; Q_FALLTHROUGH();
        case 2: dest[count - 2] = value; Q_FALLTHROUGH();
        case 1: dest[count - 1] = value;
        }
    }

    qt_memfillXX_aligned(dest, __lsx_vreplgr2vr_w(value), count * sizeof(quint32));
}
// Solid-color Source composition: dst = color (const_alpha == 255) or
// dst = color*ca + dst*(255-ca) for a constant opacity ca in 0..255.
void QT_FASTCALL comp_func_solid_Source_lsx(uint *destPixels, int length,
                                            uint color, uint const_alpha)
{
    if (const_alpha == 255) {
        // Fully opaque: plain fill.
        qt_memfill32(destPixels, color, length);
    } else {
        const quint32 ialpha = 255 - const_alpha;
        // Pre-scale the color once; per pixel only dst needs scaling.
        color = BYTE_MUL(color, const_alpha);

        int x = 0;
        quint32 *dst = (quint32 *) destPixels;
        const __m128i colorVector = __lsx_vreplgr2vr_w(color);
        const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff);
        const __m128i half = __lsx_vreplgr2vr_h(0x80);
        const __m128i iAlphaVector = __lsx_vreplgr2vr_h(ialpha);

        // Scalar head until dst is 16-byte aligned.
        ALIGNMENT_PROLOGUE_16BYTES(dst, x, length)
            destPixels[x] = color + BYTE_MUL(destPixels[x], ialpha);

        // 4 pixels per iteration: dst = color + dst * ialpha / 255.
        for (; x < length-3; x += 4) {
            __m128i dstVector = __lsx_vld(&dst[x], 0);
            BYTE_MUL_LSX(dstVector, iAlphaVector, colorMask, half);
            dstVector = __lsx_vadd_b(colorVector, dstVector);
            __lsx_vst(dstVector, &dst[x], 0);
        }

        // Scalar tail for the last (length % 4) pixels.
        SIMD_EPILOGUE(x, length, 3)
            destPixels[x] = color + BYTE_MUL(destPixels[x], ialpha);
    }
}
// Solid-color SourceOver: blend a (premultiplied) color over every pixel,
// dst = color + dst * (255 - alpha(color)), with const_alpha in 0..255.
void QT_FASTCALL comp_func_solid_SourceOver_lsx(uint *destPixels, int length,
                                                uint color, uint const_alpha)
{
    // Fast path: an opaque color at full span opacity is a plain fill.
    if ((const_alpha & qAlpha(color)) == 255) {
        qt_memfill32(destPixels, color, length);
    } else {
        if (const_alpha != 255)
            color = BYTE_MUL(color, const_alpha);

        // qAlpha(~color) == 255 - alpha(color): the destination's weight.
        const quint32 minusAlphaOfColor = qAlpha(~color);
        int x = 0;

        quint32 *dst = (quint32 *) destPixels;
        const __m128i colorVector = __lsx_vreplgr2vr_w(color);
        const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff);
        const __m128i half = __lsx_vreplgr2vr_h(0x80);
        const __m128i minusAlphaOfColorVector = __lsx_vreplgr2vr_h(minusAlphaOfColor);

        // Scalar head until dst is 16-byte aligned.
        ALIGNMENT_PROLOGUE_16BYTES(dst, x, length)
            destPixels[x] = color + BYTE_MUL(destPixels[x], minusAlphaOfColor);

        // 4 pixels per iteration.
        for (; x < length-3; x += 4) {
            __m128i dstVector = __lsx_vld(&dst[x], 0);
            BYTE_MUL_LSX(dstVector, minusAlphaOfColorVector, colorMask, half);
            dstVector = __lsx_vadd_b(colorVector, dstVector);
            __lsx_vst(dstVector, &dst[x], 0);
        }

        // Scalar tail for the last (length % 4) pixels.
        SIMD_EPILOGUE(x, length, 3)
            destPixels[x] = color + BYTE_MUL(destPixels[x], minusAlphaOfColor);
    }
}
// Blit a solid 32-bit color into rasterBuffer wherever the 1-bit source
// bitmap has a set bit. Each source byte covers 8 pixels, MSB first.
//
// Mask technique: the source byte is broadcast to all 16 lanes; maskmaskN
// isolates one bitmap bit per destination byte, and maskaddN shifts that bit
// into the byte's sign position (e.g. bit 0x40 + 0x40 = 0x80). vslti_b(.,0)
// then turns each lane into 0xff/0x00, ready for vbitsel between dest and color.
void qt_bitmapblit32_lsx_base(QRasterBuffer *rasterBuffer, int x, int y,
                              quint32 color,
                              const uchar *src, int width, int height, int stride)
{
    quint32 *dest = reinterpret_cast<quint32*>(rasterBuffer->scanLine(y)) + x;
    const int destStride = rasterBuffer->stride<quint32>();

    const __m128i c128 = __lsx_vreplgr2vr_w(color);
    // Bits 7..4 of the source byte, one per 4-byte pixel lane.
    const __m128i maskmask1 = (__m128i)(v4u32){0x80808080, 0x40404040,
                                               0x20202020, 0x10101010};
    const __m128i maskadd1 = (__m128i)(v4i32){0x00000000, 0x40404040,
                                              0x60606060, 0x70707070};
    if (width > 4) {
        // Bits 3..0 of the source byte for the second group of 4 pixels.
        const __m128i maskmask2 = (__m128i)(v4i32){0x08080808, 0x04040404,
                                                   0x02020202, 0x01010101};
        const __m128i maskadd2 = (__m128i)(v4i32){0x78787878, 0x7c7c7c7c,
                                                  0x7e7e7e7e, 0x7f7f7f7f};
        while (height--) {
            for (int x = 0; x < width; x += 8) {
                const quint8 s = src[x >> 3];
                if (!s)
                    continue; // all 8 pixels transparent: skip the loads/stores
                __m128i mask1 = __lsx_vreplgr2vr_b(s);
                __m128i mask2 = mask1;

                // First 4 pixels (bits 7..4).
                mask1 = __lsx_vand_v(mask1, maskmask1);
                mask1 = __lsx_vadd_b(mask1, maskadd1);
                __m128i destSrc1 = __lsx_vld((char*)(dest + x), 0);
                mask1 = __lsx_vslti_b(mask1,0);
                destSrc1 = __lsx_vbitsel_v(destSrc1, c128, mask1);
                __lsx_vst(destSrc1, (char*)(dest + x), 0);

                // Next 4 pixels (bits 3..0).
                __m128i destSrc2 = __lsx_vld((char*)(dest + x + 4), 0);
                mask2 = __lsx_vand_v(mask2, maskmask2);
                mask2 = __lsx_vadd_b(mask2, maskadd2);
                mask2 = __lsx_vslti_b(mask2,0);
                destSrc2 = __lsx_vbitsel_v(destSrc2, c128, mask2);
                __lsx_vst(destSrc2, (char*)(dest + x + 4), 0);
            }
            dest += destStride;
            src += stride;
        }
    } else {
        // Narrow case (width <= 4): only the high-bit group is needed.
        while (height--) {
            const quint8 s = *src;
            if (s) {
                __m128i mask1 = __lsx_vreplgr2vr_b(s);
                __m128i destSrc1 = __lsx_vld((char*)(dest), 0);
                mask1 = __lsx_vand_v(mask1, maskmask1);
                mask1 = __lsx_vadd_b(mask1, maskadd1);
                mask1 = __lsx_vslti_b(mask1, 0);
                destSrc1 = __lsx_vbitsel_v(destSrc1, c128, mask1);
                __lsx_vst(destSrc1, (char*)(dest), 0);
            }
            dest += destStride;
            src += stride;
        }
    }
}
// ARGB32 entry point: the destination stores pixels in ARGB32 layout, so the
// color's ARGB32 representation can be blitted directly.
void qt_bitmapblit32_lsx(QRasterBuffer *rasterBuffer, int x, int y,
                         const QRgba64 &color,
                         const uchar *src, int width, int height, int stride)
{
    const quint32 argb = color.toArgb32();
    qt_bitmapblit32_lsx_base(rasterBuffer, x, y, argb, src, width, height, stride);
}
// RGBA8888 entry point: same blit, but the color is byte-swizzled from ARGB
// to RGBA ordering first so it matches the destination's pixel layout.
void qt_bitmapblit8888_lsx(QRasterBuffer *rasterBuffer, int x, int y,
                           const QRgba64 &color,
                           const uchar *src, int width, int height, int stride)
{
    const quint32 rgba = ARGB2RGBA(color.toArgb32());
    qt_bitmapblit32_lsx_base(rasterBuffer, x, y, rgba, src, width, height, stride);
}
// Blit a solid color (converted to RGB565) into an RGB16 raster buffer
// wherever the 1-bit source bitmap has a set bit; one vector covers 8 pixels.
// Same mask trick as qt_bitmapblit32_lsx_base, with 16-bit lanes.
void qt_bitmapblit16_lsx(QRasterBuffer *rasterBuffer, int x, int y,
                         const QRgba64 &color,
                         const uchar *src, int width, int height, int stride)
{
    const quint16 c = qConvertRgb32To16(color.toArgb32());
    quint16 *dest = reinterpret_cast<quint16*>(rasterBuffer->scanLine(y)) + x;
    // FIX: dest is a quint16 pointer, so the per-row stride must be measured
    // in quint16 units. The previous stride<quint32>() (bytes_per_line / 4)
    // advanced only half a scanline per row, corrupting the blit on RGB16.
    const int destStride = rasterBuffer->stride<quint16>();

    const __m128i c128 = __lsx_vreplgr2vr_h(c);
    // One bitmap bit per 16-bit lane, MSB first; the add moves the tested
    // bit into the lane byte's sign position for vslti_b.
    const __m128i maskmask = (__m128i)(v8u16){0x8080, 0x4040, 0x2020, 0x1010,
                                              0x0808, 0x0404, 0x0202, 0x0101};
    const __m128i maskadd = (__m128i)(v8i16){0x0000, 0x4040, 0x6060, 0x7070,
                                             0x7878, 0x7c7c, 0x7e7e, 0x7f7f};
    while (--height >= 0) {
        for (int x = 0; x < width; x += 8) {
            const quint8 s = src[x >> 3];
            if (!s)
                continue; // all 8 pixels transparent
            __m128i mask = __lsx_vreplgr2vr_b(s);
            __m128i destSrc = __lsx_vld((char*)(dest + x), 0);
            mask = __lsx_vand_v(mask, maskmask);
            mask = __lsx_vadd_b(mask, maskadd);
            mask = __lsx_vslti_b(mask, 0);
            destSrc = __lsx_vbitsel_v(destSrc, c128, mask);
            __lsx_vst(destSrc, (char*)(dest + x), 0);
        }
        dest += destStride;
        src += stride;
    }
}
// SIMD backend adapter used by QRadialFetchSimd: maps the generic 4-lane
// integer/float vector operations onto LSX intrinsics.
class QSimdLsx
{
public:
    typedef __m128i Int32x4;
    typedef __m128 Float32x4;

    // Scratch unions for lane-wise access to vector registers.
    union Vect_buffer_i { Int32x4 v; int i[4]; };
    union Vect_buffer_f { Float32x4 v; float f[4]; };

    static inline Float32x4 Q_DECL_VECTORCALL v_dup(float x) { return __lsx_vreplfr2vr_s(x); }
    // NOTE(review): the double overload narrows to float before broadcasting.
    static inline Float32x4 Q_DECL_VECTORCALL v_dup(double x) { return __lsx_vreplfr2vr_s(x); }
    static inline Int32x4 Q_DECL_VECTORCALL v_dup(int x) { return __lsx_vreplgr2vr_w(x); }
    static inline Int32x4 Q_DECL_VECTORCALL v_dup(uint x) { return __lsx_vreplgr2vr_w(x); }

    static inline Float32x4 Q_DECL_VECTORCALL v_add(Float32x4 a, Float32x4 b) { return __lsx_vfadd_s(a, b); }
    static inline Int32x4 Q_DECL_VECTORCALL v_add(Int32x4 a, Int32x4 b) { return __lsx_vadd_w(a, b); }

    static inline Float32x4 Q_DECL_VECTORCALL v_max(Float32x4 a, Float32x4 b) { return __lsx_vfmax_s(a, b); }
    static inline Float32x4 Q_DECL_VECTORCALL v_min(Float32x4 a, Float32x4 b) { return __lsx_vfmin_s(a, b); }
    static inline Int32x4 Q_DECL_VECTORCALL v_min_16(Int32x4 a, Int32x4 b) { return __lsx_vmin_h(a, b); }

    static inline Int32x4 Q_DECL_VECTORCALL v_and(Int32x4 a, Int32x4 b) { return __lsx_vand_v(a, b); }

    static inline Float32x4 Q_DECL_VECTORCALL v_sub(Float32x4 a, Float32x4 b) { return __lsx_vfsub_s(a, b); }
    static inline Int32x4 Q_DECL_VECTORCALL v_sub(Int32x4 a, Int32x4 b) { return __lsx_vsub_w(a, b); }

    static inline Float32x4 Q_DECL_VECTORCALL v_mul(Float32x4 a, Float32x4 b) { return __lsx_vfmul_s(a, b); }

    static inline Float32x4 Q_DECL_VECTORCALL v_sqrt(Float32x4 x) { return __lsx_vfsqrt_s(x); }

    // Truncating float -> int conversion (round toward zero).
    static inline Int32x4 Q_DECL_VECTORCALL v_toInt(Float32x4 x) { return __lsx_vftintrz_w_s(x); }

    // NOTE(review): implemented as a strict greater-than (clt_s(b, a) tests
    // b < a); presumably mirrors the SSE2 backend's cmpgt — confirm against
    // QSimdSse2::v_greaterOrEqual before relying on equality behavior.
    static inline Int32x4 Q_DECL_VECTORCALL v_greaterOrEqual(Float32x4 a, Float32x4 b) { return __lsx_vfcmp_clt_s(b, a); }
};
// Thin adapter: instantiate the generic radial-gradient fetch template with
// the LSX SIMD backend and forward all span parameters unchanged.
const uint * QT_FASTCALL qt_fetch_radial_gradient_lsx(uint *buffer, const Operator *op,
                                                      const QSpanData *data,
                                                      int y, int x, int length)
{
    using LsxFetcher = QRadialFetchSimd<QSimdLsx>;
    return qt_fetch_radial_gradient_template<LsxFetcher, uint>(buffer, op, data, y, x, length);
}
// Nearest-neighbor scale of a premultiplied ARGB32 image onto a premultiplied
// ARGB32 destination, with source-over blending, using 16.16 fixed-point
// stepping. Only the fully-opaque (const_alpha == 256) case is vectorized.
void qt_scale_image_argb32_on_argb32_lsx(uchar *destPixels, int dbpl,
                                         const uchar *srcPixels, int sbpl, int srch,
                                         const QRectF &targetRect,
                                         const QRectF &sourceRect,
                                         const QRect &clip,
                                         int const_alpha)
{
    if (const_alpha != 256) {
        // from qblendfunctions.cpp
        extern void qt_scale_image_argb32_on_argb32(uchar *destPixels, int dbpl,
                                                    const uchar *srcPixels, int sbpl, int srch,
                                                    const QRectF &targetRect,
                                                    const QRectF &sourceRect,
                                                    const QRect &clip,
                                                    int const_alpha);
        return qt_scale_image_argb32_on_argb32(destPixels, dbpl, srcPixels, sbpl, srch,
                                               targetRect, sourceRect, clip, const_alpha);
    }

    // Source step per destination pixel, in 16.16 fixed point. Negative for
    // mirrored (flipped) target rectangles.
    qreal sx = sourceRect.width() / (qreal)targetRect.width();
    qreal sy = sourceRect.height() / (qreal)targetRect.height();

    const int ix = 0x00010000 * sx;
    const int iy = 0x00010000 * sy;

    QRect tr = targetRect.normalized().toRect();
    tr = tr.intersected(clip);
    if (tr.isEmpty())
        return;
    const int tx1 = tr.left();
    const int ty1 = tr.top();
    int h = tr.height();
    int w = tr.width();

    // Starting source coordinates (16.16 fixed point), measured from the
    // appropriate edge depending on the stepping direction.
    quint32 basex;
    quint32 srcy;

    if (sx < 0) {
        int dstx = qFloor((tx1 + qreal(0.5) - targetRect.right()) * sx * 65536) + 1;
        basex = quint32(sourceRect.right() * 65536) + dstx;
    } else {
        int dstx = qCeil((tx1 + qreal(0.5) - targetRect.left()) * sx * 65536) - 1;
        basex = quint32(sourceRect.left() * 65536) + dstx;
    }
    if (sy < 0) {
        int dsty = qFloor((ty1 + qreal(0.5) - targetRect.bottom()) * sy * 65536) + 1;
        srcy = quint32(sourceRect.bottom() * 65536) + dsty;
    } else {
        int dsty = qCeil((ty1 + qreal(0.5) - targetRect.top()) * sy * 65536) - 1;
        srcy = quint32(sourceRect.top() * 65536) + dsty;
    }

    quint32 *dst = ((quint32 *) (destPixels + ty1 * dbpl)) + tx1;

    // Constants consumed by BLEND_SOURCE_OVER_ARGB32_LSX_helper.
    const __m128i nullVector = __lsx_vreplgr2vr_w(0);
    const __m128i half = __lsx_vreplgr2vr_h(0x80);
    const __m128i one = __lsx_vreplgr2vr_h(0xff);
    const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff);
    const __m128i alphaMask = __lsx_vreplgr2vr_w(0xff000000);
    const __m128i ixVector = __lsx_vreplgr2vr_w(4*ix);

    // this bounds check here is required as floating point rounding above might in some cases lead to
    // w/h values that are one pixel too large, falling outside of the valid image area.
    const int ystart = srcy >> 16;
    if (ystart >= srch && iy < 0) {
        srcy += iy;
        --h;
    }
    const int xstart = basex >> 16;
    if (xstart >=  (int)(sbpl/sizeof(quint32)) && ix < 0) {
        basex += ix;
        --w;
    }
    int yend = (srcy + iy * (h - 1)) >> 16;
    if (yend < 0 || yend >= srch)
        --h;
    int xend = (basex + ix * (w - 1)) >> 16;
    if (xend < 0 || xend >= (int)(sbpl/sizeof(quint32)))
        --w;

    while (--h >= 0) {
        const uint *src = (const quint32 *) (srcPixels + (srcy >> 16) * sbpl);
        int srcx = basex;
        int x = 0;

        // Scalar head until dst is 16-byte aligned.
        ALIGNMENT_PROLOGUE_16BYTES(dst, x, w) {
            uint s = src[srcx >> 16];
            dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s));
            srcx += ix;
        }

        // Vector lane 0 holds the x-coordinate of pixel x+3, lane 3 of pixel x
        // (matching the gather order of srcVector below).
        __m128i srcxVector = (__m128i)(v4i32){srcx + ix + ix + ix, srcx + ix + ix, srcx + ix, srcx};

        for (; x < (w - 3); x += 4) {
            // Extract the integer part (high 16 bits) of each fixed-point
            // coordinate: half-word index 2k+1 is the top half of word k.
            // NOTE(review): __lsx_vpickve2gr_h sign-extends its 16-bit result
            // (unlike x86 _mm_extract_epi16, which zero-extends) — presumably
            // fine because source x offsets stay below 32768; confirm for very
            // wide source images, else __lsx_vpickve2gr_hu would be needed.
            const int idx0 = __lsx_vpickve2gr_h(srcxVector, 1);
            const int idx1 = __lsx_vpickve2gr_h(srcxVector, 3);
            const int idx2 = __lsx_vpickve2gr_h(srcxVector, 5);
            const int idx3 = __lsx_vpickve2gr_h(srcxVector, 7);
            srcxVector = __lsx_vadd_w(srcxVector, ixVector);

            // Gather the four source pixels (idx3 is the left-most pixel).
            const __m128i srcVector = (__m128i)((v4u32){src[idx3], src[idx2], src[idx1], src[idx0]});

            BLEND_SOURCE_OVER_ARGB32_LSX_helper(dst, x, srcVector, nullVector, half, one, colorMask, alphaMask);
        }

        // Scalar tail for the last (w % 4) pixels.
        SIMD_EPILOGUE(x, w, 3) {
            uint s = src[(basex + x*ix) >> 16];
            dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s));
        }

        dst = (quint32 *)(((uchar *) dst) + dbpl);
        srcy += iy;
    }
}
QT_END_NAMESPACE
#endif // QT_COMPILER_SUPPORTS_LSX

View File

@ -142,7 +142,7 @@ struct quint24 {
void qBlendGradient(int count, const QT_FT_Span *spans, void *userData);
void qBlendTexture(int count, const QT_FT_Span *spans, void *userData);
#ifdef Q_PROCESSOR_X86
#if defined(Q_PROCESSOR_X86) || defined(QT_COMPILER_SUPPORTS_LSX)
extern void (*qt_memfill64)(quint64 *dest, quint64 value, qsizetype count);
extern void (*qt_memfill32)(quint32 *dest, quint32 value, qsizetype count);
#else

View File

@ -0,0 +1,231 @@
// Copyright (C) 2024 Loongson Technology Corporation Limited.
// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
#ifndef QDRAWINGPRIMITIVE_LSX_P_H
#define QDRAWINGPRIMITIVE_LSX_P_H
#include <QtGui/private/qtguiglobal_p.h>
#include <private/qsimd_p.h>

#include "qdrawhelper_loongarch64_p.h"
#include "qrgba64_p.h"

#include <cstring>
#ifdef __loongarch_sx
//
// W A R N I N G
// -------------
//
// This file is not part of the Qt API. It exists purely as an
// implementation detail. This header file may change from version to
// version without notice, or even be removed.
//
// We mean it.
//
QT_BEGIN_NAMESPACE
/*
 * Multiply every ARGB32 pixel in pixelVector by the per-pixel alpha values
 * held in alphaChannel.  Each 32-bit lane of alphaChannel must be formatted
 * as 0x00AA00AA, colorMask must be 0x00ff00ff per 32-bit lane, and half must
 * hold 128 (0x80) in every 16-bit lane.
 */
inline static void Q_DECL_VECTORCALL
BYTE_MUL_LSX(__m128i &pixelVector, __m128i alphaChannel, __m128i colorMask, __m128i half)
{
    /* Split each pixel into two 16-bit-per-channel vectors so the alpha
       multiply has headroom: rb lanes are 0x00RR00BB, ag lanes 0x00AA00GG. */
    __m128i rb = __lsx_vand_v(pixelVector, colorMask);
    __m128i ag = __lsx_vsrli_h(pixelVector, 8);

    /* Scale both halves by the alpha channel. */
    rb = __lsx_vmul_h(rb, alphaChannel);
    ag = __lsx_vmul_h(ag, alphaChannel);

    /* Approximate the division by 255, as in scalar BYTE_MUL():
       X/255 ~= (X + X/256 + 0x80) / 256. */
    rb = __lsx_vadd_h(rb, __lsx_vsrli_h(rb, 8));
    rb = __lsx_vadd_h(rb, half);
    rb = __lsx_vsrli_h(rb, 8);

    ag = __lsx_vadd_h(ag, __lsx_vsrli_h(ag, 8));
    ag = __lsx_vadd_h(ag, half);
    /* For AG, instead of ">> 8 then << 8" to reposition the bytes, keep the
       high byte of every 16-bit lane in one instruction: ~colorMask & ag. */
    ag = __lsx_vandn_v(colorMask, ag);

    /* Recombine the two channel pairs into the output pixels. */
    pixelVector = __lsx_vor_v(ag, rb);
}
/*
 * dstVector = (srcVector * alphaChannel + dstVector * oneMinusAlphaChannel) / 255
 * Each 32-bit lane of alphaChannel must be 0x00AA00AA and oneMinusAlphaChannel
 * must carry 255 - alpha in the same layout; colorMask must be 0x00ff00ff per
 * 32-bit lane and half must hold 128 (0x80) in every 16-bit lane.
 */
inline static void Q_DECL_VECTORCALL
INTERPOLATE_PIXEL_255_LSX(__m128i srcVector, __m128i &dstVector, __m128i alphaChannel,
                          __m128i oneMinusAlphaChannel, __m128i colorMask, __m128i half)
{
    /* red/blue pair: 0x00RR00BB lanes */
    const __m128i srcRB = __lsx_vand_v(srcVector, colorMask);
    const __m128i dstRB = __lsx_vand_v(dstVector, colorMask);
    __m128i mixedRB = __lsx_vadd_h(__lsx_vmul_h(srcRB, alphaChannel),
                                   __lsx_vmul_h(dstRB, oneMinusAlphaChannel));
    /* divide by 255: X/255 ~= (X + X/256 + 0x80) / 256 */
    mixedRB = __lsx_vadd_h(mixedRB, __lsx_vsrli_h(mixedRB, 8));
    mixedRB = __lsx_vadd_h(mixedRB, half);
    mixedRB = __lsx_vsrli_h(mixedRB, 8);

    /* alpha/green pair: 0x00AA00GG lanes */
    const __m128i srcAG = __lsx_vsrli_h(srcVector, 8);
    const __m128i dstAG = __lsx_vsrli_h(dstVector, 8);
    __m128i mixedAG = __lsx_vadd_h(__lsx_vmul_h(srcAG, alphaChannel),
                                   __lsx_vmul_h(dstAG, oneMinusAlphaChannel));
    mixedAG = __lsx_vadd_h(mixedAG, __lsx_vsrli_h(mixedAG, 8));
    mixedAG = __lsx_vadd_h(mixedAG, half);
    /* keep the high byte of each 16-bit lane instead of ">> 8 then << 8" */
    mixedAG = __lsx_vandn_v(colorMask, mixedAG);

    /* combine the two channel pairs */
    dstVector = __lsx_vor_v(mixedAG, mixedRB);
}
// Blends one vector of four source pixels over dst[x..x+3]; same algorithm
// as BLEND_SOURCE_OVER_ARGB32_LSX but for a single, already-loaded srcVector:
// plain store when all four pixels are opaque, no-op when all four are fully
// transparent, otherwise result = s + d * (1 - alpha).
inline static void Q_DECL_VECTORCALL
BLEND_SOURCE_OVER_ARGB32_LSX_helper(quint32 *dst, int x, __m128i srcVector,
                                    __m128i nullVector, __m128i half, __m128i one,
                                    __m128i colorMask, __m128i alphaMask)
{
    const __m128i srcVectorAlpha = __lsx_vand_v(srcVector, alphaMask);

    const v4i32 opaqueMask = (v4i32)__lsx_vmsknz_b(__lsx_vseq_w(srcVectorAlpha, alphaMask));
    if (opaqueMask[0] == 0x0000ffff) {
        /* all four pixels are fully opaque: plain copy */
        __lsx_vst(srcVector, &dst[x], 0);
        return;
    }

    const v4i32 transparentMask = (v4i32)__lsx_vmsknz_b(__lsx_vseq_w(srcVectorAlpha, nullVector));
    if (transparentMask[0] == 0x0000ffff)
        return; /* all four pixels are fully transparent: dst stays as-is */

    /* Mixed alphas: broadcast 255 - alpha into both 16-bit halves of every
       32-bit lane (0x00AA00AA layout) so it can scale both channel pairs. */
    __m128i oneMinusAlpha = __lsx_vsrli_w(srcVector, 24);
    oneMinusAlpha = __lsx_vor_v(oneMinusAlpha, __lsx_vslli_w(oneMinusAlpha, 16));
    oneMinusAlpha = __lsx_vsub_h(one, oneMinusAlpha);

    __m128i dstVector = __lsx_vld(&dst[x], 0);
    BYTE_MUL_LSX(dstVector, oneMinusAlpha, colorMask, half);
    /* result = s + d * (1 - alpha) */
    __lsx_vst(__lsx_vadd_b(srcVector, dstVector), &dst[x], 0);
}
// Blend `length` source pixels over dst with per-pixel alpha:
//   result = s + d * (1 - alpha)
// taking vector shortcuts for fully-opaque / fully-transparent groups of
// four.  The constant vectors required by the helper are built here once:
//   nullVector = 0, half = 0x80/lane16, one = 0xff/lane16,
//   colorMask = 0x00ff00ff/lane32, alphaMask = 0xff000000/lane32.
inline static void Q_DECL_VECTORCALL
BLEND_SOURCE_OVER_ARGB32_LSX(quint32 *dst, const quint32 *src, int length)
{
    int i = 0;

    /* scalar blending until dst is 16-byte aligned */
    ALIGNMENT_PROLOGUE_16BYTES(dst, i, length) {
        blend_pixel(dst[i], src[i]);
    }

    const __m128i nullVector = __lsx_vreplgr2vr_w(0);
    const __m128i half = __lsx_vreplgr2vr_h(0x80);
    const __m128i one = __lsx_vreplgr2vr_h(0xff);
    const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff);
    const __m128i alphaMask = __lsx_vreplgr2vr_w(0xff000000);

    /* four pixels per iteration */
    for (; i < length - 3; i += 4) {
        const __m128i srcVector = __lsx_vld((const __m128i *)&src[i], 0);
        BLEND_SOURCE_OVER_ARGB32_LSX_helper(dst, i, srcVector, nullVector, half, one,
                                            colorMask, alphaMask);
    }

    /* scalar tail (up to 3 pixels) */
    SIMD_EPILOGUE(i, length, 3) {
        blend_pixel(dst[i], src[i]);
    }
}
// Basically blend src over dst with the const alpha defined as constAlphaVector.
// The computation being done is:
// dest = (s + d * sia) * ca + d * cia
//      = s * ca + d * (sia * ca + cia)
//      = s * ca + d * (1 - sa*ca)
inline static void Q_DECL_VECTORCALL
BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_LSX(quint32 *dst, const quint32 *src, int length, uint const_alpha)
{
    int x = 0;

    // Scalar blending until dst is 16-byte aligned.
    ALIGNMENT_PROLOGUE_16BYTES(dst, x, length) {
        blend_pixel(dst[x], src[x], const_alpha);
    }

    const __m128i nullVector = __lsx_vreplgr2vr_w(0);
    const __m128i half = __lsx_vreplgr2vr_h(0x80);
    const __m128i one = __lsx_vreplgr2vr_h(0xff);
    const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff);
    const __m128i constAlphaVector = __lsx_vreplgr2vr_h(const_alpha);

    for (; x < length-3; x += 4) {
        __m128i srcVector = __lsx_vld((const __m128i *)&src[x], 0);
        // Skip the whole vector when all four source pixels are zero:
        // fully transparent premultiplied pixels leave dst unchanged.
        __m128i vseq = __lsx_vseq_w(srcVector, nullVector);
        v4i32 vseq_res = (v4i32)__lsx_vmsknz_b(vseq);
        if (vseq_res[0] != 0x0000ffff) {
            // src' = src * const_alpha
            BYTE_MUL_LSX(srcVector, constAlphaVector, colorMask, half);

            // Broadcast 255 - alpha(src') into both 16-bit halves of each
            // 32-bit lane (0x00AA00AA layout).
            __m128i alphaChannel = __lsx_vsrli_w(srcVector, 24);
            alphaChannel = __lsx_vor_v(alphaChannel, __lsx_vslli_w(alphaChannel, 16));
            alphaChannel = __lsx_vsub_h(one, alphaChannel);

            // dst = src' + dst * (255 - alpha(src'))
            __m128i dstVector = __lsx_vld((__m128i *)&dst[x], 0);
            BYTE_MUL_LSX(dstVector, alphaChannel, colorMask, half);
            const __m128i result = __lsx_vadd_b(srcVector, dstVector);
            __lsx_vst(result, &dst[x], 0);
        }
    }

    // Scalar tail (up to 3 pixels).
    SIMD_EPILOGUE(x, length, 3) {
        blend_pixel(dst[x], src[x], const_alpha);
    }
}
typedef union
{
    int i;
    float f;
} FloatInt;

/* Broadcast a float into all four 32-bit lanes of an LSX vector.
   The bit pattern is transferred with memcpy: reading a union member other
   than the one last written is undefined behaviour in ISO C++ (it is only a
   documented extension on some compilers), while memcpy of the object
   representation is well-defined and compiles to the same code. */
static __m128 __lsx_vreplfr2vr_s(float val)
{
    int ival;
    memcpy(&ival, &val, sizeof(ival));
    return (__m128)__lsx_vreplgr2vr_w(ival);
}
QT_END_NAMESPACE
#endif // __loongarch_sx
#endif // QDRAWINGPRIMITIVE_LSX_P_H

View File

@ -257,6 +257,18 @@ void qt_qimageScaleAARGBA_down_xy_sse4(QImageScaleInfo *isi, unsigned int *dest,
int dw, int dh, int dow, int sow);
#endif
#if defined(QT_COMPILER_SUPPORTS_LSX)
template<bool RGB>
void qt_qimageScaleAARGBA_up_x_down_y_lsx(QImageScaleInfo *isi, unsigned int *dest,
int dw, int dh, int dow, int sow);
template<bool RGB>
void qt_qimageScaleAARGBA_down_x_up_y_lsx(QImageScaleInfo *isi, unsigned int *dest,
int dw, int dh, int dow, int sow);
template<bool RGB>
void qt_qimageScaleAARGBA_down_xy_lsx(QImageScaleInfo *isi, unsigned int *dest,
int dw, int dh, int dow, int sow);
#endif
#if defined(__ARM_NEON__)
template<bool RGB>
void qt_qimageScaleAARGBA_up_x_down_y_neon(QImageScaleInfo *isi, unsigned int *dest,
@ -351,6 +363,10 @@ static void qt_qimageScaleAARGBA(QImageScaleInfo *isi, unsigned int *dest,
if (qCpuHasFeature(SSE4_1))
qt_qimageScaleAARGBA_up_x_down_y_sse4<false>(isi, dest, dw, dh, dow, sow);
else
#elif defined(QT_COMPILER_SUPPORTS_LSX)
if (qCpuHasFeature(LSX))
qt_qimageScaleAARGBA_up_x_down_y_lsx<false>(isi, dest, dw, dh, dow, sow);
else
#elif defined(__ARM_NEON__)
if (qCpuHasFeature(NEON))
qt_qimageScaleAARGBA_up_x_down_y_neon<false>(isi, dest, dw, dh, dow, sow);
@ -364,6 +380,10 @@ static void qt_qimageScaleAARGBA(QImageScaleInfo *isi, unsigned int *dest,
if (qCpuHasFeature(SSE4_1))
qt_qimageScaleAARGBA_down_x_up_y_sse4<false>(isi, dest, dw, dh, dow, sow);
else
#elif defined(QT_COMPILER_SUPPORTS_LSX)
if (qCpuHasFeature(LSX))
qt_qimageScaleAARGBA_down_x_up_y_lsx<false>(isi, dest, dw, dh, dow, sow);
else
#elif defined(__ARM_NEON__)
if (qCpuHasFeature(NEON))
qt_qimageScaleAARGBA_down_x_up_y_neon<false>(isi, dest, dw, dh, dow, sow);
@ -377,6 +397,10 @@ static void qt_qimageScaleAARGBA(QImageScaleInfo *isi, unsigned int *dest,
if (qCpuHasFeature(SSE4_1))
qt_qimageScaleAARGBA_down_xy_sse4<false>(isi, dest, dw, dh, dow, sow);
else
#elif defined(QT_COMPILER_SUPPORTS_LSX)
if (qCpuHasFeature(LSX))
qt_qimageScaleAARGBA_down_xy_lsx<false>(isi, dest, dw, dh, dow, sow);
else
#elif defined(__ARM_NEON__)
if (qCpuHasFeature(NEON))
qt_qimageScaleAARGBA_down_xy_neon<false>(isi, dest, dw, dh, dow, sow);
@ -995,6 +1019,10 @@ static void qt_qimageScaleAARGB(QImageScaleInfo *isi, unsigned int *dest,
if (qCpuHasFeature(SSE4_1))
qt_qimageScaleAARGBA_up_x_down_y_sse4<true>(isi, dest, dw, dh, dow, sow);
else
#elif defined QT_COMPILER_SUPPORTS_LSX
if (qCpuHasFeature(LSX))
qt_qimageScaleAARGBA_up_x_down_y_lsx<true>(isi, dest, dw, dh, dow, sow);
else
#elif defined(__ARM_NEON__)
if (qCpuHasFeature(NEON))
qt_qimageScaleAARGBA_up_x_down_y_neon<true>(isi, dest, dw, dh, dow, sow);
@ -1008,6 +1036,10 @@ static void qt_qimageScaleAARGB(QImageScaleInfo *isi, unsigned int *dest,
if (qCpuHasFeature(SSE4_1))
qt_qimageScaleAARGBA_down_x_up_y_sse4<true>(isi, dest, dw, dh, dow, sow);
else
#elif defined QT_COMPILER_SUPPORTS_LSX
if (qCpuHasFeature(LSX))
qt_qimageScaleAARGBA_down_x_up_y_lsx<true>(isi, dest, dw, dh, dow, sow);
else
#elif defined(__ARM_NEON__)
if (qCpuHasFeature(NEON))
qt_qimageScaleAARGBA_down_x_up_y_neon<true>(isi, dest, dw, dh, dow, sow);
@ -1021,6 +1053,10 @@ static void qt_qimageScaleAARGB(QImageScaleInfo *isi, unsigned int *dest,
if (qCpuHasFeature(SSE4_1))
qt_qimageScaleAARGBA_down_xy_sse4<true>(isi, dest, dw, dh, dow, sow);
else
#elif defined QT_COMPILER_SUPPORTS_LSX
if (qCpuHasFeature(LSX))
qt_qimageScaleAARGBA_down_xy_lsx<true>(isi, dest, dw, dh, dow, sow);
else
#elif defined(__ARM_NEON__)
if (qCpuHasFeature(NEON))
qt_qimageScaleAARGBA_down_xy_neon<true>(isi, dest, dw, dh, dow, sow);

View File

@ -0,0 +1,233 @@
// Copyright (C) 2024 Loongson Technology Corporation Limited.
// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
#include "qimagescale_p.h"
#include "qimage.h"
#include <private/qdrawhelper_loongarch64_p.h>
#include <private/qsimd_p.h>
#if QT_CONFIG(thread) && !defined(Q_OS_WASM)
#include <qsemaphore.h>
#include <private/qthreadpool_p.h>
#endif
#if defined(QT_COMPILER_SUPPORTS_LSX)
QT_BEGIN_NAMESPACE
using namespace QImageScale;
// Runs scaleSection(yStart, yEnd) over all destination rows [0, dh).
// Large jobs are split across the Qt Gui thread pool and joined with a
// semaphore; small jobs (or re-entrant calls from a pool thread) run inline.
template<typename T>
static inline void multithread_pixels_function(QImageScaleInfo *isi, int dh, const T &scaleSection)
{
#if QT_CONFIG(thread) && !defined(Q_OS_WASM)
    // Roughly one segment per 64k source pixels, but at most one per row.
    int segments = (qsizetype(isi->sh) * isi->sw) / (1<<16);
    segments = std::min(segments, dh);
    QThreadPool *threadPool = QThreadPoolPrivate::qtGuiInstance();
    // Never block waiting for the pool from inside a pool thread (deadlock).
    if (segments > 1 && threadPool && !threadPool->contains(QThread::currentThread())) {
        QSemaphore semaphore;
        int y = 0;
        for (int i = 0; i < segments; ++i) {
            // Spread the remaining rows evenly over the remaining segments.
            int yn = (dh - y) / (segments - i);
            threadPool->start([&, y, yn]() {
                scaleSection(y, y + yn);
                semaphore.release(1);
            });
            y += yn;
        }
        // Wait for every segment before returning (captures reference locals).
        semaphore.acquire(segments);
        return;
    }
#else
    Q_UNUSED(isi);
#endif
    scaleSection(0, dh);
}
// Accumulates one weighted run of source pixels (a column when step == sow,
// a row when step == 1) for the area-averaging scaler.  Each ARGB32 pixel is
// expanded to one 32-bit lane per channel; the first sample is weighted by
// vxyap, the middle samples by vCxy, and the last sample by whatever remains
// of the total weight (1 << 14).  Returns the per-channel sums.
inline static __m128i Q_DECL_VECTORCALL
qt_qimageScaleAARGBA_helper(const unsigned int *pix, int xyap, int Cxy,
                            int step, const __m128i vxyap, const __m128i vCxy)
{
    // Shuffle mask spreading the 4 bytes of one pixel into the low byte of
    // each 32-bit lane; index 16 selects a zero byte from the first operand.
    const __m128i shuffleMask = (__m128i)(v16i8){0, 16, 16, 16, 1, 16, 16, 16,
                                                 2, 16, 16, 16, 3, 16, 16, 16};
    __m128i vpix = __lsx_vshuf_b(__lsx_vldi(0), __lsx_vreplgr2vr_w(*pix), shuffleMask);
    __m128i vx = __lsx_vmul_w(vpix, vxyap);
    int i;
    // Consume the total weight budget of (1 << 14), Cxy at a time.
    for (i = (1 << 14) - xyap; i > Cxy; i -= Cxy) {
        pix += step;
        vpix = __lsx_vshuf_b(__lsx_vldi(0), __lsx_vreplgr2vr_w(*pix), shuffleMask);
        vx = __lsx_vadd_w(vx, __lsx_vmul_w(vpix, vCxy));
    }
    // Final sample takes the leftover weight i.
    pix += step;
    vpix = __lsx_vshuf_b(__lsx_vldi(0), __lsx_vreplgr2vr_w(*pix), shuffleMask);
    vx = __lsx_vadd_w(vx, __lsx_vmul_w(vpix, __lsx_vreplgr2vr_w(i)));
    return vx;
}
// Area-averaging scale, upscaling in x while downscaling in y.
// For every destination pixel: vertically average a weighted column of
// source pixels, then (when xap > 0) linearly interpolate between the
// averages of two adjacent columns.  With RGB == true the result is
// forced opaque (alpha byte set to 0xff).
template<bool RGB>
void qt_qimageScaleAARGBA_up_x_down_y_lsx(QImageScaleInfo *isi, unsigned int *dest,
                                          int dw, int dh, int dow, int sow)
{
    const unsigned int **ypoints = isi->ypoints;
    const int *xpoints = isi->xpoints;
    const int *xapoints = isi->xapoints;
    const int *yapoints = isi->yapoints;

    const __m128i v256 = __lsx_vreplgr2vr_w(256);

    /* go through every scanline in the output buffer */
    auto scaleSection = [&] (int yStart, int yEnd) {
        for (int y = yStart; y < yEnd; ++y) {
            // Vertical weights: Cy for middle rows, yap for the first row.
            const int Cy = yapoints[y] >> 16;
            const int yap = yapoints[y] & 0xffff;
            const __m128i vCy = __lsx_vreplgr2vr_w(Cy);
            const __m128i vyap = __lsx_vreplgr2vr_w(yap);

            unsigned int *dptr = dest + (y * dow);
            for (int x = 0; x < dw; x++) {
                const unsigned int *sptr = ypoints[y] + xpoints[x];
                __m128i vx = qt_qimageScaleAARGBA_helper(sptr, yap, Cy, sow, vyap, vCy);

                const int xap = xapoints[x];
                if (xap > 0) {
                    // Horizontal blend with the next column:
                    // vx = (vx * (256 - xap) + vr * xap) / 256
                    const __m128i vxap = __lsx_vreplgr2vr_w(xap);
                    const __m128i vinvxap = __lsx_vsub_w(v256, vxap);
                    __m128i vr = qt_qimageScaleAARGBA_helper(sptr + 1, yap, Cy, sow, vyap, vCy);
                    vx = __lsx_vmul_w(vx, vinvxap);
                    vr = __lsx_vmul_w(vr, vxap);
                    vx = __lsx_vadd_w(vx, vr);
                    vx = __lsx_vsrli_w(vx, 8);
                }
                // Drop the 14-bit weight scale, then saturate and pack the
                // four 32-bit channel sums back into one ARGB32 pixel.
                vx = __lsx_vsrli_w(vx, 14);
                vx = __lsx_vpickev_h(__lsx_vsat_wu(vx, 15), __lsx_vsat_wu(vx, 15));
                vx = __lsx_vpickev_b(__lsx_vsat_hu(vx, 7), __lsx_vsat_hu(vx, 7));
                *dptr = __lsx_vpickve2gr_w(vx, 0);
                if (RGB)
                    *dptr |= 0xff000000;
                dptr++;
            }
        }
    };
    multithread_pixels_function(isi, dh, scaleSection);
}
// Area-averaging scale, downscaling in x while upscaling in y.
// For every destination pixel: horizontally average a weighted run of
// source pixels, then (when yap > 0) linearly interpolate between the
// averages of two adjacent rows.  With RGB == true the result is forced
// opaque (alpha byte set to 0xff).
template<bool RGB>
void qt_qimageScaleAARGBA_down_x_up_y_lsx(QImageScaleInfo *isi, unsigned int *dest,
                                          int dw, int dh, int dow, int sow)
{
    // const-qualified locals, matching qt_qimageScaleAARGBA_up_x_down_y_lsx.
    const unsigned int **ypoints = isi->ypoints;
    const int *xpoints = isi->xpoints;
    const int *xapoints = isi->xapoints;
    const int *yapoints = isi->yapoints;

    const __m128i v256 = __lsx_vreplgr2vr_w(256);

    /* go through every scanline in the output buffer */
    auto scaleSection = [&] (int yStart, int yEnd) {
        for (int y = yStart; y < yEnd; ++y) {
            unsigned int *dptr = dest + (y * dow);
            for (int x = 0; x < dw; x++) {
                // Horizontal weights: Cx for middle pixels, xap for the first.
                int Cx = xapoints[x] >> 16;
                int xap = xapoints[x] & 0xffff;
                const __m128i vCx = __lsx_vreplgr2vr_w(Cx);
                const __m128i vxap = __lsx_vreplgr2vr_w(xap);

                const unsigned int *sptr = ypoints[y] + xpoints[x];
                __m128i vx = qt_qimageScaleAARGBA_helper(sptr, xap, Cx, 1, vxap, vCx);

                int yap = yapoints[y];
                if (yap > 0) {
                    // Vertical blend with the next row:
                    // vx = (vx * (256 - yap) + vr * yap) / 256
                    const __m128i vyap = __lsx_vreplgr2vr_w(yap);
                    const __m128i vinvyap = __lsx_vsub_w(v256, vyap);
                    __m128i vr = qt_qimageScaleAARGBA_helper(sptr + sow, xap, Cx, 1, vxap, vCx);
                    vx = __lsx_vmul_w(vx, vinvyap);
                    vr = __lsx_vmul_w(vr, vyap);
                    vx = __lsx_vadd_w(vx, vr);
                    vx = __lsx_vsrli_w(vx, 8);
                }
                // Drop the 14-bit weight scale, then saturate and pack the
                // four 32-bit channel sums back into one ARGB32 pixel.
                // Note: saturate words to 16 bits, then halfwords to 8 bits
                // (vsat_hu in BOTH operands of the byte pick, consistent with
                // the up_x_down_y and down_xy variants).
                vx = __lsx_vsrli_w(vx, 14);
                vx = __lsx_vpickev_h(__lsx_vsat_wu(vx, 15), __lsx_vsat_wu(vx, 15));
                vx = __lsx_vpickev_b(__lsx_vsat_hu(vx, 7), __lsx_vsat_hu(vx, 7));
                *dptr = __lsx_vpickve2gr_w(vx, 0);
                if (RGB)
                    *dptr |= 0xff000000;
                dptr++;
            }
        }
    };
    multithread_pixels_function(isi, dh, scaleSection);
}
// Area-averaging scale, downscaling in both x and y: every destination
// pixel is the weighted average of a whole rectangle of source pixels.
// With RGB == true the result is forced opaque (alpha byte set to 0xff).
template<bool RGB>
void qt_qimageScaleAARGBA_down_xy_lsx(QImageScaleInfo *isi, unsigned int *dest,
                                      int dw, int dh, int dow, int sow)
{
    const unsigned int **ypoints = isi->ypoints;
    int *xpoints = isi->xpoints;
    int *xapoints = isi->xapoints;
    int *yapoints = isi->yapoints;

    auto scaleSection = [&] (int yStart, int yEnd) {
        for (int y = yStart; y < yEnd; ++y) {
            // Vertical weights: Cy for middle rows, yap for the first row.
            int Cy = yapoints[y] >> 16;
            int yap = yapoints[y] & 0xffff;
            const __m128i vCy = __lsx_vreplgr2vr_w(Cy);
            const __m128i vyap = __lsx_vreplgr2vr_w(yap);

            unsigned int *dptr = dest + (y * dow);
            for (int x = 0; x < dw; x++) {
                // Horizontal weights: Cx for middle pixels, xap for the first.
                const int Cx = xapoints[x] >> 16;
                const int xap = xapoints[x] & 0xffff;
                const __m128i vCx = __lsx_vreplgr2vr_w(Cx);
                const __m128i vxap = __lsx_vreplgr2vr_w(xap);

                const unsigned int *sptr = ypoints[y] + xpoints[x];
                // Horizontal average of the first row; each row sum is
                // pre-shifted right by 4 so the combined horizontal and
                // vertical weight stays within 32 bits (final scale is
                // removed with the >> 24 below).
                __m128i vx = qt_qimageScaleAARGBA_helper(sptr, xap, Cx, 1, vxap, vCx);
                __m128i vr = __lsx_vmul_w(__lsx_vsrli_w(vx, 4), vyap);

                int j;
                // Accumulate middle rows, Cy weight each.
                for (j = (1 << 14) - yap; j > Cy; j -= Cy) {
                    sptr += sow;
                    vx = qt_qimageScaleAARGBA_helper(sptr, xap, Cx, 1, vxap, vCx);
                    vr = __lsx_vadd_w(vr, __lsx_vmul_w(__lsx_vsrli_w(vx, 4), vCy));
                }
                // Last row takes the remaining vertical weight j.
                sptr += sow;
                vx = qt_qimageScaleAARGBA_helper(sptr, xap, Cx, 1, vxap, vCx);
                vr = __lsx_vadd_w(vr, __lsx_vmul_w(__lsx_vsrli_w(vx, 4), __lsx_vreplgr2vr_w(j)));

                // Drop the weight scale, then saturate and pack the four
                // 32-bit channel sums back into one ARGB32 pixel.
                vr = __lsx_vsrli_w(vr, 24);
                vr = __lsx_vpickev_h(__lsx_vldi(0), __lsx_vsat_wu(vr, 15));
                vr = __lsx_vpickev_b(__lsx_vldi(0), __lsx_vsat_hu(vr, 7));
                *dptr = __lsx_vpickve2gr_w(vr, 0);
                if (RGB)
                    *dptr |= 0xff000000;
                dptr++;
            }
        }
    };
    multithread_pixels_function(isi, dh, scaleSection);
}
// Explicit instantiations for the two pixel formats dispatched from
// qimagescale.cpp: RGB = false preserves the alpha channel (ARGB),
// RGB = true forces the result opaque.
template void qt_qimageScaleAARGBA_up_x_down_y_lsx<false>(QImageScaleInfo *isi, unsigned int *dest,
                                                          int dw, int dh, int dow, int sow);

template void qt_qimageScaleAARGBA_up_x_down_y_lsx<true>(QImageScaleInfo *isi, unsigned int *dest,
                                                         int dw, int dh, int dow, int sow);

template void qt_qimageScaleAARGBA_down_x_up_y_lsx<false>(QImageScaleInfo *isi, unsigned int *dest,
                                                          int dw, int dh, int dow, int sow);

template void qt_qimageScaleAARGBA_down_x_up_y_lsx<true>(QImageScaleInfo *isi, unsigned int *dest,
                                                         int dw, int dh, int dow, int sow);

template void qt_qimageScaleAARGBA_down_xy_lsx<false>(QImageScaleInfo *isi, unsigned int *dest,
                                                      int dw, int dh, int dow, int sow);

template void qt_qimageScaleAARGBA_down_xy_lsx<true>(QImageScaleInfo *isi, unsigned int *dest,
                                                     int dw, int dh, int dow, int sow);
QT_END_NAMESPACE
#endif