Complete drawhelper Func with LSX
List of optimized implementations using LSX: - qt_blend_argb32_on_argb32 - qt_blend_rgb32_on_rgb32 - comp_func_SourceOver - comp_func_Plus - comp_func_Source - comp_func_solid_Source - comp_func_solid_SourceOver - qt_memfill64 - qt_memfill32 - qt_bitmapblit32 - qt_bitmapblit16 - qt_scale_image_argb32_on_argb32 - convert_RGB888_to_RGB32 - qt_qimageScaleAARGBA_up_x_down_y - qt_qimageScaleAARGBA_down_x_up_y - qt_qimageScaleAARGBA_down_xy All of the above functions have passed the tests under tests/auto/gui. Change-Id: I7ae6169305b81bdf7fb704619453c505f8bb960f Reviewed-by: Volker Hilsheimer <volker.hilsheimer@qt.io>
This commit is contained in:
parent
73ce5a940a
commit
d511a68684
@ -183,6 +183,8 @@ qt_internal_add_module(Gui
|
||||
painting/qdrawhelper_p.h
|
||||
painting/qdrawhelper_x86_p.h
|
||||
painting/qdrawingprimitive_sse2_p.h
|
||||
painting/qdrawhelper_loongarch64_p.h
|
||||
painting/qdrawingprimitive_lsx_p.h
|
||||
painting/qemulationpaintengine.cpp painting/qemulationpaintengine_p.h
|
||||
painting/qfixed_p.h
|
||||
painting/qgrayraster.c painting/qgrayraster_p.h
|
||||
@ -655,6 +657,13 @@ qt_internal_add_simd_part(Gui SIMD neon
|
||||
painting/qimagescale_neon.cpp
|
||||
)
|
||||
|
||||
qt_internal_add_simd_part(Gui SIMD lsx
|
||||
SOURCES
|
||||
image/qimage_lsx.cpp
|
||||
painting/qdrawhelper_lsx.cpp
|
||||
painting/qimagescale_lsx.cpp
|
||||
)
|
||||
|
||||
if(NOT ANDROID)
|
||||
qt_internal_add_simd_part(Gui SIMD mips_dsp
|
||||
SOURCES
|
||||
|
@ -2754,6 +2754,18 @@ static void qInitImageConversions()
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(QT_COMPILER_SUPPORTS_LSX)
|
||||
if (qCpuHasFeature(LSX)) {
|
||||
extern void convert_RGB888_to_RGB32_lsx(QImageData *dest, const QImageData *src, Qt::ImageConversionFlags);
|
||||
qimage_converter_map[QImage::Format_RGB888][QImage::Format_RGB32] = convert_RGB888_to_RGB32_lsx;
|
||||
qimage_converter_map[QImage::Format_RGB888][QImage::Format_ARGB32] = convert_RGB888_to_RGB32_lsx;
|
||||
qimage_converter_map[QImage::Format_RGB888][QImage::Format_ARGB32_Premultiplied] = convert_RGB888_to_RGB32_lsx;
|
||||
qimage_converter_map[QImage::Format_BGR888][QImage::Format_RGBX8888] = convert_RGB888_to_RGB32_lsx;
|
||||
qimage_converter_map[QImage::Format_BGR888][QImage::Format_RGBA8888] = convert_RGB888_to_RGB32_lsx;
|
||||
qimage_converter_map[QImage::Format_BGR888][QImage::Format_RGBA8888_Premultiplied] = convert_RGB888_to_RGB32_lsx;
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(__ARM_NEON__)
|
||||
extern void convert_RGB888_to_RGB32_neon(QImageData *dest, const QImageData *src, Qt::ImageConversionFlags);
|
||||
qimage_converter_map[QImage::Format_RGB888][QImage::Format_RGB32] = convert_RGB888_to_RGB32_neon;
|
||||
|
115
src/gui/image/qimage_lsx.cpp
Normal file
115
src/gui/image/qimage_lsx.cpp
Normal file
@ -0,0 +1,115 @@
|
||||
// Copyright (C) 2016 The Qt Company Ltd.
|
||||
// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
|
||||
|
||||
#include <qimage.h>
|
||||
#include <private/qimage_p.h>
|
||||
#include <private/qsimd_p.h>
|
||||
|
||||
#ifdef QT_COMPILER_SUPPORTS_LSX
|
||||
|
||||
QT_BEGIN_NAMESPACE
|
||||
|
||||
// Convert a scanline of RGB888 (src) to RGB32 (dst)
// src must be at least len * 3 bytes
// dst must be at least len * 4 bytes
Q_GUI_EXPORT void QT_FASTCALL qt_convert_rgb888_to_rgb32_lsx(quint32 *dst, const uchar *src, int len)
{
    int i = 0;

    // Prologue, align dst to 16 bytes.
    ALIGNMENT_PROLOGUE_16BYTES(dst, i, len) {
        dst[i] = qRgb(src[0], src[1], src[2]);
        src += 3;
    }

    // vshuf_b control bytes: as used below, an index < 16 selects a byte of
    // the source-pixel operand, while index 16 selects byte 0 of the alpha
    // vector (all 0xff), so every 3-byte RGB triple expands to a 4-byte
    // pixel with alpha = 0xff.

    // Mask the 4 first colors of the RGB888 vector
    const __m128i shuffleMask = (__m128i)(v16i8){2, 1, 0, 16, 5, 4, 3, 16,
                                                 8, 7, 6, 16, 11, 10, 9, 16};
    // Mask the 4 last colors of a RGB888 vector with an offset of 1 (so the last 3 bytes are RGB)
    const __m128i shuffleMaskEnd = (__m128i)(v16i8){6, 5, 4, 16, 9, 8, 7, 16,
                                                    12, 11, 10, 16, 15, 14, 13, 16};
    // Mask to have alpha = 0xff
    const __m128i alphaMask = __lsx_vreplgr2vr_b(0xff);

    // Mask to concatenate 16-byte blocks in a and b into a 32-byte temporary result, shift the result right by 12 bytes
    const __m128i indexMask1 = (__m128i)(v16i8){12, 13, 14, 15, 16, 17, 18, 19,
                                                20, 21, 22, 23, 24, 25, 26, 27};

    // Mask to concatenate 16-byte blocks in a and b into a 32-byte temporary result, shift the result right by 8 bytes
    const __m128i indexMask2 = (__m128i)(v16i8){8, 9, 10, 11, 12, 13, 14, 15,
                                                16, 17, 18, 19, 20, 21, 22, 23};

    const __m128i *inVectorPtr = (const __m128i *)src;
    __m128i *dstVectorPtr = (__m128i *)(dst + i);

    for (; i < (len - 15); i += 16) { // one iteration in the loop converts 16 pixels
        /*
         RGB888 has 5 pixels per vector, + 1 byte from the next pixel. The idea here is
         to load vectors of RGB888 and use palignr-style shuffles to select a vector
         out of two vectors.

         After 3 loads of RGB888 and 3 stores of RGB32, we have 4 pixels left in the last
         vector of RGB888, we can mask it directly to get a last store of RGB32. After that,
         the first next byte is a R, and we can loop for the next 16 pixels.

         The conversion itself is done with a byte permutation (vshuf_b).
        */
        __m128i firstSrcVector = __lsx_vld(inVectorPtr, 0);
        __m128i outputVector = __lsx_vshuf_b(alphaMask, firstSrcVector, shuffleMask);
        __lsx_vst(outputVector, dstVectorPtr, 0);
        ++inVectorPtr;
        ++dstVectorPtr;

        // There are 4 unused bytes left in srcVector, we need to load the next 16 bytes
        __m128i secondSrcVector = __lsx_vld(inVectorPtr, 0);
        // Cross-vector shift: take the last 4 bytes of firstSrcVector and the
        // first 12 of secondSrcVector, then expand to 4 RGB32 pixels.
        __m128i srcVector = __lsx_vshuf_b(secondSrcVector, firstSrcVector, indexMask1);
        outputVector = __lsx_vshuf_b(alphaMask, srcVector, shuffleMask);
        __lsx_vst(outputVector, dstVectorPtr, 0);
        ++inVectorPtr;
        ++dstVectorPtr;
        firstSrcVector = secondSrcVector;

        // We now have 8 unused bytes left in firstSrcVector
        secondSrcVector = __lsx_vld(inVectorPtr, 0);
        srcVector = __lsx_vshuf_b(secondSrcVector, firstSrcVector, indexMask2);
        outputVector = __lsx_vshuf_b(alphaMask, srcVector, shuffleMask);
        __lsx_vst(outputVector, dstVectorPtr, 0);
        ++inVectorPtr;
        ++dstVectorPtr;

        // There are now 12 unused bytes in the last loaded vector.
        // We can mask them directly, almost there.
        outputVector = __lsx_vshuf_b(alphaMask, secondSrcVector, shuffleMaskEnd);
        __lsx_vst(outputVector, dstVectorPtr, 0);
        ++dstVectorPtr;
    }
    // Re-synchronize the scalar source pointer with the vector loop.
    src = (const uchar *)inVectorPtr;

    // Epilogue: convert the up-to-15 remaining pixels one by one.
    SIMD_EPILOGUE(i, len, 15) {
        dst[i] = qRgb(src[0], src[1], src[2]);
        src += 3;
    }
}
|
||||
|
||||
// Whole-image RGB888 -> RGB32 conversion: runs the vectorized scanline
// converter once per source row. Also used for the byte-swapped
// BGR888 -> RGBX8888 family, which is the same byte-level transform.
void convert_RGB888_to_RGB32_lsx(QImageData *dest, const QImageData *src, Qt::ImageConversionFlags)
{
    Q_ASSERT(src->format == QImage::Format_RGB888 || src->format == QImage::Format_BGR888);
    if (src->format == QImage::Format_BGR888)
        Q_ASSERT(dest->format == QImage::Format_RGBX8888 || dest->format == QImage::Format_RGBA8888 || dest->format == QImage::Format_RGBA8888_Premultiplied);
    else
        Q_ASSERT(dest->format == QImage::Format_RGB32 || dest->format == QImage::Format_ARGB32 || dest->format == QImage::Format_ARGB32_Premultiplied);
    Q_ASSERT(src->width == dest->width);
    Q_ASSERT(src->height == dest->height);

    // Walk both images scanline by scanline using byte pointers; each
    // image may have its own bytes_per_line padding.
    const uchar *inLine = (const uchar *) src->data;
    uchar *outLine = (uchar *) dest->data;

    for (int row = src->height; row > 0; --row) {
        qt_convert_rgb888_to_rgb32_lsx((quint32 *) outLine, inLine, src->width);
        inLine += src->bytes_per_line;
        outLine += dest->bytes_per_line;
    }
}
|
||||
|
||||
QT_END_NAMESPACE
|
||||
|
||||
#endif // QT_COMPILER_SUPPORTS_LSX
|
@ -560,7 +560,7 @@ inline QImage::Format qt_opaqueVersionForPainting(QImage::Format format)
|
||||
inline QImage::Format qt_alphaVersionForPainting(QImage::Format format)
|
||||
{
|
||||
QImage::Format toFormat = qt_alphaVersion(format);
|
||||
#if defined(__ARM_NEON__) || defined(__SSE2__)
|
||||
#if defined(__ARM_NEON__) || defined(__SSE2__) || defined(QT_COMPILER_SUPPORTS_LSX)
|
||||
// If we are switching depth anyway and we have optimized ARGB32PM routines, upgrade to that.
|
||||
if (qt_depthForFormat(format) != qt_depthForFormat(toFormat) && qt_depthForFormat(toFormat) <= 32)
|
||||
toFormat = QImage::Format_ARGB32_Premultiplied;
|
||||
|
@ -12,6 +12,8 @@
|
||||
#include <private/qdrawhelper_p.h>
|
||||
#include <private/qdrawhelper_x86_p.h>
|
||||
#include <private/qdrawingprimitive_sse2_p.h>
|
||||
#include <private/qdrawhelper_loongarch64_p.h>
|
||||
#include <private/qdrawingprimitive_lsx_p.h>
|
||||
#include <private/qdrawhelper_neon_p.h>
|
||||
#if defined(QT_COMPILER_SUPPORTS_MIPS_DSP) || defined(QT_COMPILER_SUPPORTS_MIPS_DSPR2)
|
||||
#include <private/qdrawhelper_mips_dsp_p.h>
|
||||
@ -4971,7 +4973,7 @@ void qBlendTexture(int count, const QT_FT_Span *spans, void *userData)
|
||||
case QImage::Format_RGB16:
|
||||
proc = processTextureSpansRGB16[blendType];
|
||||
break;
|
||||
#if defined(__SSE2__) || defined(__ARM_NEON__) || (Q_PROCESSOR_WORDSIZE == 8)
|
||||
#if defined(__SSE2__) || defined(__ARM_NEON__) || defined(QT_COMPILER_SUPPORTS_LSX) || (Q_PROCESSOR_WORDSIZE == 8)
|
||||
case QImage::Format_ARGB32:
|
||||
case QImage::Format_RGBA8888:
|
||||
#endif
|
||||
@ -5113,7 +5115,7 @@ void qBlendGradient(int count, const QT_FT_Span *spans, void *userData)
|
||||
if (isVerticalGradient && blend_vertical_gradient_argb(count, spans, userData))
|
||||
return;
|
||||
return blend_src_generic(count, spans, userData);
|
||||
#if defined(__SSE2__) || defined(__ARM_NEON__) || (Q_PROCESSOR_WORDSIZE == 8)
|
||||
#if defined(__SSE2__) || defined(__ARM_NEON__) || defined(QT_COMPILER_SUPPORTS_LSX) || (Q_PROCESSOR_WORDSIZE == 8)
|
||||
case QImage::Format_ARGB32:
|
||||
case QImage::Format_RGBA8888:
|
||||
#endif
|
||||
@ -6368,7 +6370,7 @@ DrawHelper qDrawHelper[] =
|
||||
|
||||
static_assert(std::size(qDrawHelper) == QImage::NImageFormats);
|
||||
|
||||
#if !defined(Q_PROCESSOR_X86)
|
||||
#if !defined(Q_PROCESSOR_X86) && !defined(QT_COMPILER_SUPPORTS_LSX)
|
||||
void qt_memfill64(quint64 *dest, quint64 color, qsizetype count)
|
||||
{
|
||||
qt_memfill_template<quint64>(dest, color, count);
|
||||
@ -6435,7 +6437,7 @@ void qt_memfill16(quint16 *dest, quint16 value, qsizetype count)
|
||||
qt_memfill32(reinterpret_cast<quint32*>(dest), value32, count / 2);
|
||||
}
|
||||
|
||||
#if defined(Q_PROCESSOR_X86)
|
||||
#if defined(Q_PROCESSOR_X86) || defined(QT_COMPILER_SUPPORTS_LSX)
|
||||
void (*qt_memfill32)(quint32 *dest, quint32 value, qsizetype count) = nullptr;
|
||||
void (*qt_memfill64)(quint64 *dest, quint64 value, qsizetype count) = nullptr;
|
||||
#elif !defined(__ARM_NEON__) && !defined(__MIPS_DSP__)
|
||||
@ -6712,6 +6714,68 @@ static void qInitDrawhelperFunctions()
|
||||
|
||||
#endif // SSE2
|
||||
|
||||
#if defined(QT_COMPILER_SUPPORTS_LSX)
|
||||
if (qCpuHasFeature(LSX)) {
|
||||
qt_memfill32 = qt_memfill32_lsx;
|
||||
qt_memfill64 = qt_memfill64_lsx;
|
||||
|
||||
qDrawHelper[QImage::Format_RGB32].bitmapBlit = qt_bitmapblit32_lsx;
|
||||
qDrawHelper[QImage::Format_ARGB32].bitmapBlit = qt_bitmapblit32_lsx;
|
||||
qDrawHelper[QImage::Format_ARGB32_Premultiplied].bitmapBlit = qt_bitmapblit32_lsx;
|
||||
qDrawHelper[QImage::Format_RGB16].bitmapBlit = qt_bitmapblit16_lsx;
|
||||
qDrawHelper[QImage::Format_RGBX8888].bitmapBlit = qt_bitmapblit8888_lsx;
|
||||
qDrawHelper[QImage::Format_RGBA8888].bitmapBlit = qt_bitmapblit8888_lsx;
|
||||
qDrawHelper[QImage::Format_RGBA8888_Premultiplied].bitmapBlit = qt_bitmapblit8888_lsx;
|
||||
|
||||
extern void qt_scale_image_argb32_on_argb32_lsx(uchar *destPixels, int dbpl,
|
||||
const uchar *srcPixels, int sbpl, int srch,
|
||||
const QRectF &targetRect,
|
||||
const QRectF &sourceRect,
|
||||
const QRect &clip,
|
||||
int const_alpha);
|
||||
|
||||
qScaleFunctions[QImage::Format_ARGB32_Premultiplied][QImage::Format_ARGB32_Premultiplied] = qt_scale_image_argb32_on_argb32_lsx;
|
||||
qScaleFunctions[QImage::Format_RGB32][QImage::Format_ARGB32_Premultiplied] = qt_scale_image_argb32_on_argb32_lsx;
|
||||
qScaleFunctions[QImage::Format_RGBA8888_Premultiplied][QImage::Format_RGBA8888_Premultiplied] = qt_scale_image_argb32_on_argb32_lsx;
|
||||
qScaleFunctions[QImage::Format_RGBX8888][QImage::Format_RGBA8888_Premultiplied] = qt_scale_image_argb32_on_argb32_lsx;
|
||||
|
||||
extern void qt_blend_rgb32_on_rgb32_lsx(uchar *destPixels, int dbpl,
|
||||
const uchar *srcPixels, int sbpl,
|
||||
int w, int h,
|
||||
int const_alpha);
|
||||
|
||||
extern void qt_blend_argb32_on_argb32_lsx(uchar *destPixels, int dbpl,
|
||||
const uchar *srcPixels, int sbpl,
|
||||
int w, int h,
|
||||
int const_alpha);
|
||||
|
||||
qBlendFunctions[QImage::Format_RGB32][QImage::Format_RGB32] = qt_blend_rgb32_on_rgb32_lsx;
|
||||
qBlendFunctions[QImage::Format_ARGB32_Premultiplied][QImage::Format_RGB32] = qt_blend_rgb32_on_rgb32_lsx;
|
||||
qBlendFunctions[QImage::Format_RGB32][QImage::Format_ARGB32_Premultiplied] = qt_blend_argb32_on_argb32_lsx;
|
||||
qBlendFunctions[QImage::Format_ARGB32_Premultiplied][QImage::Format_ARGB32_Premultiplied] = qt_blend_argb32_on_argb32_lsx;
|
||||
qBlendFunctions[QImage::Format_RGBX8888][QImage::Format_RGBX8888] = qt_blend_rgb32_on_rgb32_lsx;
|
||||
qBlendFunctions[QImage::Format_RGBA8888_Premultiplied][QImage::Format_RGBX8888] = qt_blend_rgb32_on_rgb32_lsx;
|
||||
qBlendFunctions[QImage::Format_RGBX8888][QImage::Format_RGBA8888_Premultiplied] = qt_blend_argb32_on_argb32_lsx;
|
||||
qBlendFunctions[QImage::Format_RGBA8888_Premultiplied][QImage::Format_RGBA8888_Premultiplied] = qt_blend_argb32_on_argb32_lsx;
|
||||
|
||||
extern const uint * QT_FASTCALL qt_fetch_radial_gradient_lsx(uint *buffer, const Operator *op, const QSpanData *data,
|
||||
int y, int x, int length);
|
||||
|
||||
qt_fetch_radial_gradient = qt_fetch_radial_gradient_lsx;
|
||||
|
||||
extern void QT_FASTCALL comp_func_SourceOver_lsx(uint *destPixels, const uint *srcPixels, int length, uint const_alpha);
|
||||
extern void QT_FASTCALL comp_func_solid_SourceOver_lsx(uint *destPixels, int length, uint color, uint const_alpha);
|
||||
extern void QT_FASTCALL comp_func_Source_lsx(uint *destPixels, const uint *srcPixels, int length, uint const_alpha);
|
||||
extern void QT_FASTCALL comp_func_solid_Source_lsx(uint *destPixels, int length, uint color, uint const_alpha);
|
||||
extern void QT_FASTCALL comp_func_Plus_lsx(uint *destPixels, const uint *srcPixels, int length, uint const_alpha);
|
||||
qt_functionForMode_C[QPainter::CompositionMode_SourceOver] = comp_func_SourceOver_lsx;
|
||||
qt_functionForModeSolid_C[QPainter::CompositionMode_SourceOver] = comp_func_solid_SourceOver_lsx;
|
||||
qt_functionForMode_C[QPainter::CompositionMode_Source] = comp_func_Source_lsx;
|
||||
qt_functionForModeSolid_C[QPainter::CompositionMode_Source] = comp_func_solid_Source_lsx;
|
||||
qt_functionForMode_C[QPainter::CompositionMode_Plus] = comp_func_Plus_lsx;
|
||||
}
|
||||
#endif //QT_COMPILER_SUPPORTS_LSX
|
||||
|
||||
#if defined(__ARM_NEON__)
|
||||
qBlendFunctions[QImage::Format_RGB32][QImage::Format_RGB32] = qt_blend_rgb32_on_rgb32_neon;
|
||||
qBlendFunctions[QImage::Format_ARGB32_Premultiplied][QImage::Format_RGB32] = qt_blend_rgb32_on_rgb32_neon;
|
||||
|
48
src/gui/painting/qdrawhelper_loongarch64_p.h
Normal file
48
src/gui/painting/qdrawhelper_loongarch64_p.h
Normal file
@ -0,0 +1,48 @@
|
||||
// Copyright (C) 2024 Loongson Technology Corporation Limited.
// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only

#ifndef QDRAWHELPER_LOONGARCH64_P_H
#define QDRAWHELPER_LOONGARCH64_P_H

//
//  W A R N I N G
//  -------------
//
// This file is not part of the Qt API. It exists purely as an
// implementation detail. This header file may change from version to
// version without notice, or even be removed.
//
// We mean it.
//

#include <QtGui/private/qtguiglobal_p.h>
#include <private/qdrawhelper_p.h>

QT_BEGIN_NAMESPACE

#ifdef QT_COMPILER_SUPPORTS_LSX
// LoongArch LSX (128-bit SIMD) drawhelper entry points, implemented in
// painting/qdrawhelper_lsx.cpp and installed at runtime by
// qInitDrawhelperFunctions() when qCpuHasFeature(LSX) holds.

// Vectorized memory fills.
void qt_memfill64_lsx(quint64 *dest, quint64 value, qsizetype count);
void qt_memfill32_lsx(quint32 *dest, quint32 value, qsizetype count);
// 1-bpp bitmap blits into 32-bit ARGB, RGBA8888 and RGB16 raster buffers.
void qt_bitmapblit32_lsx(QRasterBuffer *rasterBuffer, int x, int y,
                         const QRgba64 &color,
                         const uchar *src, int width, int height, int stride);
void qt_bitmapblit8888_lsx(QRasterBuffer *rasterBuffer, int x, int y,
                           const QRgba64 &color,
                           const uchar *src, int width, int height, int stride);
void qt_bitmapblit16_lsx(QRasterBuffer *rasterBuffer, int x, int y,
                         const QRgba64 &color,
                         const uchar *src, int width, int height, int stride);
// SourceOver rectangle blends (const_alpha is 0..256 here).
void qt_blend_argb32_on_argb32_lsx(uchar *destPixels, int dbpl,
                                   const uchar *srcPixels, int sbpl,
                                   int w, int h,
                                   int const_alpha);
void qt_blend_rgb32_on_rgb32_lsx(uchar *destPixels, int dbpl,
                                 const uchar *srcPixels, int sbpl,
                                 int w, int h,
                                 int const_alpha);

#endif // QT_COMPILER_SUPPORTS_LSX

QT_END_NAMESPACE

#endif // QDRAWHELPER_LOONGARCH64_P_H
|
593
src/gui/painting/qdrawhelper_lsx.cpp
Normal file
593
src/gui/painting/qdrawhelper_lsx.cpp
Normal file
@ -0,0 +1,593 @@
|
||||
// Copyright (C) 2024 Loongson Technology Corporation Limited.
|
||||
// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
|
||||
|
||||
#include <private/qdrawhelper_loongarch64_p.h>
|
||||
|
||||
#ifdef QT_COMPILER_SUPPORTS_LSX
|
||||
|
||||
#include <private/qdrawingprimitive_lsx_p.h>
|
||||
#include <private/qpaintengine_raster_p.h>
|
||||
|
||||
QT_BEGIN_NAMESPACE
|
||||
|
||||
// SourceOver-blend a premultiplied ARGB32 rectangle onto a premultiplied
// ARGB32 destination. const_alpha is 0..256 (256 = fully opaque modulation).
void qt_blend_argb32_on_argb32_lsx(uchar *destPixels, int dbpl,
                                   const uchar *srcPixels, int sbpl,
                                   int w, int h,
                                   int const_alpha)
{
    const quint32 *src = (const quint32 *) srcPixels;
    quint32 *dst = (quint32 *) destPixels;
    if (const_alpha == 256) {
        // No extra modulation: plain SourceOver, one scanline at a time.
        for (int y = 0; y < h; ++y) {
            BLEND_SOURCE_OVER_ARGB32_LSX(dst, src, w);
            dst = (quint32 *)(((uchar *) dst) + dbpl);
            src = (const quint32 *)(((const uchar *) src) + sbpl);
        }
    } else if (const_alpha != 0) {
        // dest = (s + d * sia) * ca + d * cia
        //      = s * ca + d * (sia * ca + cia)
        //      = s * ca + d * (1 - sa*ca)
        const_alpha = (const_alpha * 255) >> 8; // rescale 0..256 into 0..255

        for (int y = 0; y < h; ++y) {
            BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_LSX(dst, src, w, const_alpha);
            dst = (quint32 *)(((uchar *) dst) + dbpl);
            src = (const quint32 *)(((const uchar *) src) + sbpl);
        }
    }
    // const_alpha == 0: the source contributes nothing, leave dest untouched.
}
|
||||
|
||||
// Generic fallback, defined in qblendfunctions.cpp (used for the fully
// opaque case, which is a straight row copy).
void qt_blend_rgb32_on_rgb32(uchar *destPixels, int dbpl,
                             const uchar *srcPixels, int sbpl,
                             int w, int h,
                             int const_alpha);

// Blend an opaque RGB32 rectangle onto an RGB32 destination. For partial
// const_alpha each pixel is a linear interpolation between src and dst.
void qt_blend_rgb32_on_rgb32_lsx(uchar *destPixels, int dbpl,
                                 const uchar *srcPixels, int sbpl,
                                 int w, int h,
                                 int const_alpha)
{
    const quint32 *src = (const quint32 *) srcPixels;
    quint32 *dst = (quint32 *) destPixels;
    if (const_alpha != 256) {
        if (const_alpha != 0) {
            const __m128i half = __lsx_vreplgr2vr_h(0x80);
            const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff);

            const_alpha = (const_alpha * 255) >> 8; // rescale 0..256 into 0..255
            int one_minus_const_alpha = 255 - const_alpha;
            const __m128i constAlphaVector = __lsx_vreplgr2vr_h(const_alpha);
            const __m128i oneMinusConstAlpha = __lsx_vreplgr2vr_h(one_minus_const_alpha);
            for (int y = 0; y < h; ++y) {
                int x = 0;

                // First, align dest to 16 bytes:
                ALIGNMENT_PROLOGUE_16BYTES(dst, x, w) {
                    dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha,
                                                   dst[x], one_minus_const_alpha);
                }

                // Main loop: interpolate 4 pixels per iteration.
                for (; x < w-3; x += 4) {
                    __m128i srcVector = __lsx_vld(&src[x], 0);
                    __m128i dstVector = __lsx_vld(&dst[x], 0);
                    INTERPOLATE_PIXEL_255_LSX(srcVector, dstVector, constAlphaVector,
                                              oneMinusConstAlpha, colorMask, half);
                    __lsx_vst(dstVector, &dst[x], 0);
                }
                // Up to 3 trailing pixels, handled scalar.
                SIMD_EPILOGUE(x, w, 3)
                    dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha,
                                                   dst[x], one_minus_const_alpha);
                dst = (quint32 *)(((uchar *) dst) + dbpl);
                src = (const quint32 *)(((const uchar *) src) + sbpl);
            }
        }
        // const_alpha == 0: nothing to draw.
    } else {
        // Fully opaque: the blend degenerates to a copy; use the generic path.
        qt_blend_rgb32_on_rgb32(destPixels, dbpl, srcPixels, sbpl, w, h, const_alpha);
    }
}
|
||||
|
||||
// Span composition: SourceOver for premultiplied ARGB32.
// Note: span composition functions take const_alpha in 0..255, unlike the
// rectangle blend functions above (0..256).
void QT_FASTCALL comp_func_SourceOver_lsx(uint *destPixels, const uint *srcPixels,
                                          int length, uint const_alpha)
{
    Q_ASSERT(const_alpha < 256);

    const quint32 *src = (const quint32 *) srcPixels;
    quint32 *dst = (quint32 *) destPixels;

    if (const_alpha == 255) {
        BLEND_SOURCE_OVER_ARGB32_LSX(dst, src, length);
    } else {
        BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_LSX(dst, src, length, const_alpha);
    }
}
|
||||
|
||||
// Span composition: Plus — a per-channel saturating add of src onto dst,
// optionally interpolated with const_alpha (0..255).
void QT_FASTCALL comp_func_Plus_lsx(uint *dst, const uint *src, int length, uint const_alpha)
{
    int x = 0;

    if (const_alpha == 255) {
        // 1) Prologue: align destination on 16 bytes
        ALIGNMENT_PROLOGUE_16BYTES(dst, x, length)
            dst[x] = comp_func_Plus_one_pixel(dst[x], src[x]);

        // 2) composition with LSX: vsadd_bu is an unsigned saturating
        // byte-wise add, which is exactly the Plus operator on all 4 channels.
        for (; x < length - 3; x += 4) {
            const __m128i srcVector = __lsx_vld(&src[x], 0);
            const __m128i dstVector = __lsx_vld(&dst[x], 0);

            const __m128i result = __lsx_vsadd_bu(srcVector, dstVector);
            __lsx_vst(result, &dst[x], 0);
        }

        // 3) Epilogue: up to 3 trailing pixels
        SIMD_EPILOGUE(x, length, 3)
            dst[x] = comp_func_Plus_one_pixel(dst[x], src[x]);
    } else {
        const int one_minus_const_alpha = 255 - const_alpha;
        const __m128i constAlphaVector = __lsx_vreplgr2vr_h(const_alpha);
        const __m128i oneMinusConstAlpha = __lsx_vreplgr2vr_h(one_minus_const_alpha);

        // 1) Prologue: align destination on 16 bytes
        ALIGNMENT_PROLOGUE_16BYTES(dst, x, length)
            dst[x] = comp_func_Plus_one_pixel_const_alpha(dst[x], src[x],
                                                          const_alpha,
                                                          one_minus_const_alpha);

        const __m128i half = __lsx_vreplgr2vr_h(0x80);
        const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff);
        // 2) composition with LSX: saturating add, then interpolate the
        // result with the original destination by const_alpha.
        for (; x < length - 3; x += 4) {
            const __m128i srcVector = __lsx_vld(&src[x], 0);
            __m128i dstVector = __lsx_vld(&dst[x], 0);
            __m128i result = __lsx_vsadd_bu(srcVector, dstVector);
            INTERPOLATE_PIXEL_255_LSX(result, dstVector, constAlphaVector,
                                      oneMinusConstAlpha, colorMask, half);
            __lsx_vst(dstVector, &dst[x], 0);
        }

        // 3) Epilogue: up to 3 trailing pixels
        SIMD_EPILOGUE(x, length, 3)
            dst[x] = comp_func_Plus_one_pixel_const_alpha(dst[x], src[x],
                                                          const_alpha, one_minus_const_alpha);
    }
}
|
||||
|
||||
// Span composition: Source — replace dst by src, or, with partial
// const_alpha, interpolate: dst = src * ca + dst * (255 - ca).
void QT_FASTCALL comp_func_Source_lsx(uint *dst, const uint *src, int length, uint const_alpha)
{
    if (const_alpha == 255) {
        // Full replacement is a plain copy.
        ::memcpy(dst, src, length * sizeof(uint));
    } else {
        const int ialpha = 255 - const_alpha;

        int x = 0;

        // 1) prologue, align on 16 bytes
        ALIGNMENT_PROLOGUE_16BYTES(dst, x, length)
            dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], ialpha);

        // 2) interpolate pixels with LSX, 4 per iteration
        const __m128i half = __lsx_vreplgr2vr_h(0x80);
        const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff);

        const __m128i constAlphaVector = __lsx_vreplgr2vr_h(const_alpha);
        const __m128i oneMinusConstAlpha = __lsx_vreplgr2vr_h(ialpha);
        for (; x < length - 3; x += 4) {
            const __m128i srcVector = __lsx_vld(&src[x], 0);
            __m128i dstVector = __lsx_vld(&dst[x], 0);
            INTERPOLATE_PIXEL_255_LSX(srcVector, dstVector, constAlphaVector,
                                      oneMinusConstAlpha, colorMask, half);
            __lsx_vst(dstVector, &dst[x], 0);
        }

        // 3) Epilogue: up to 3 trailing pixels
        SIMD_EPILOGUE(x, length, 3)
            dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], ialpha);
    }
}
|
||||
|
||||
// Fill [dest, dest + bytecount) with value128.
// Preconditions: dest is 16-byte aligned. Only whole 16-byte vectors are
// stored; any remainder of bytecount smaller than one vector is ignored —
// the callers have already written those trailing elements scalar.
static Q_NEVER_INLINE
void Q_DECL_VECTORCALL qt_memfillXX_aligned(void *dest, __m128i value128, quintptr bytecount)
{
    __m128i *dst128 = reinterpret_cast<__m128i *>(dest);
    __m128i *end128 = reinterpret_cast<__m128i *>(static_cast<uchar *>(dest) + bytecount);

    // Main loop: four 16-byte stores (64 bytes) per iteration.
    while (dst128 + 4 <= end128) {
        __lsx_vst(value128, dst128 + 0, 0);
        __lsx_vst(value128, dst128 + 1, 0);
        __lsx_vst(value128, dst128 + 2, 0);
        __lsx_vst(value128, dst128 + 3, 0);
        dst128 += 4;
    }

    // Store the remaining 0-3 full vectors.
    bytecount %= 4 * sizeof(__m128i);
    switch (bytecount / sizeof(__m128i)) {
    case 3: __lsx_vst(value128, dst128++, 0); Q_FALLTHROUGH();
    case 2: __lsx_vst(value128, dst128++, 0); Q_FALLTHROUGH();
    case 1: __lsx_vst(value128, dst128++, 0);
    }
}
|
||||
|
||||
// Fill count quint64 elements with value using 16-byte vector stores.
void qt_memfill64_lsx(quint64 *dest, quint64 value, qsizetype count)
{
    // dest may be only 8-byte aligned; one scalar store reaches 16-byte
    // alignment, which qt_memfillXX_aligned requires.
    quintptr misaligned = quintptr(dest) % sizeof(__m128i);
    if (misaligned && count) {
        *dest++ = value;
        --count;
    }

    // Write a trailing odd element so the vector fill sees an even count
    // (i.e. a whole number of 16-byte vectors).
    if (count % 2) {
        dest[count - 1] = value;
        --count;
    }

    qt_memfillXX_aligned(dest, __lsx_vreplgr2vr_d(value), count * sizeof(quint64));
}
|
||||
|
||||
// Fill count quint32 elements with value using 16-byte vector stores.
void qt_memfill32_lsx(quint32 *dest, quint32 value, qsizetype count)
{
    if (count < 4) {
        // this simplifies the code below: the first switch can fall through
        // without checking the value of count
        switch (count) {
        case 3: *dest++ = value; Q_FALLTHROUGH();
        case 2: *dest++ = value; Q_FALLTHROUGH();
        case 1: *dest = value;
        }
        return;
    }

    // Align dest to 16 bytes: a misalignment of 4/8/12 bytes needs 3/2/1
    // leading scalar stores, which the fallthroughs provide.
    const int align = (quintptr)(dest) & 0xf;
    switch (align) {
    case 4: *dest++ = value; --count; Q_FALLTHROUGH();
    case 8: *dest++ = value; --count; Q_FALLTHROUGH();
    case 12: *dest++ = value; --count;
    }

    // Write the 0-3 trailing elements scalar now: qt_memfillXX_aligned only
    // stores whole vectors and ignores the sub-16-byte tail of count * 4.
    const int rest = count & 0x3;
    if (rest) {
        switch (rest) {
        case 3: dest[count - 3] = value; Q_FALLTHROUGH();
        case 2: dest[count - 2] = value; Q_FALLTHROUGH();
        case 1: dest[count - 1] = value;
        }
    }

    qt_memfillXX_aligned(dest, __lsx_vreplgr2vr_w(value), count * sizeof(quint32));
}
|
||||
|
||||
// Solid-color Source composition:
// dest = color * ca + dest * (255 - ca), i.e. a plain fill when ca == 255.
void QT_FASTCALL comp_func_solid_Source_lsx(uint *destPixels, int length,
                                            uint color, uint const_alpha)
{
    if (const_alpha == 255) {
        qt_memfill32(destPixels, color, length);
    } else {
        const quint32 ialpha = 255 - const_alpha;
        color = BYTE_MUL(color, const_alpha); // pre-scale the color by ca once
        int x = 0;

        quint32 *dst = (quint32 *) destPixels;
        const __m128i colorVector = __lsx_vreplgr2vr_w(color);
        const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff);
        const __m128i half = __lsx_vreplgr2vr_h(0x80);
        const __m128i iAlphaVector = __lsx_vreplgr2vr_h(ialpha);

        // Prologue: scalar until dst is 16-byte aligned.
        ALIGNMENT_PROLOGUE_16BYTES(dst, x, length)
            destPixels[x] = color + BYTE_MUL(destPixels[x], ialpha);

        // Main loop: dst = dst * ialpha + color, 4 pixels per iteration.
        for (; x < length-3; x += 4) {
            __m128i dstVector = __lsx_vld(&dst[x], 0);
            BYTE_MUL_LSX(dstVector, iAlphaVector, colorMask, half);
            dstVector = __lsx_vadd_b(colorVector, dstVector);
            __lsx_vst(dstVector, &dst[x], 0);
        }
        // Epilogue: up to 3 trailing pixels.
        SIMD_EPILOGUE(x, length, 3)
            destPixels[x] = color + BYTE_MUL(destPixels[x], ialpha);
    }
}
|
||||
|
||||
// Solid-color SourceOver composition:
// dest = color + dest * (255 - alpha(color)), where color has first been
// scaled by const_alpha when the modulation is not opaque.
void QT_FASTCALL comp_func_solid_SourceOver_lsx(uint *destPixels, int length,
                                                uint color, uint const_alpha)
{
    // Bitwise AND equals 255 only when both const_alpha and the color's
    // alpha are 255: fully opaque, so the blend is a plain fill.
    if ((const_alpha & qAlpha(color)) == 255) {
        qt_memfill32(destPixels, color, length);
    } else {
        if (const_alpha != 255)
            color = BYTE_MUL(color, const_alpha);

        const quint32 minusAlphaOfColor = qAlpha(~color); // 255 - alpha(color)
        int x = 0;

        quint32 *dst = (quint32 *) destPixels;
        const __m128i colorVector = __lsx_vreplgr2vr_w(color);
        const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff);
        const __m128i half = __lsx_vreplgr2vr_h(0x80);
        const __m128i minusAlphaOfColorVector = __lsx_vreplgr2vr_h(minusAlphaOfColor);

        // Prologue: scalar until dst is 16-byte aligned.
        ALIGNMENT_PROLOGUE_16BYTES(dst, x, length)
            destPixels[x] = color + BYTE_MUL(destPixels[x], minusAlphaOfColor);

        // Main loop: dst = dst * (255 - sa) + color, 4 pixels per iteration.
        for (; x < length-3; x += 4) {
            __m128i dstVector = __lsx_vld(&dst[x], 0);
            BYTE_MUL_LSX(dstVector, minusAlphaOfColorVector, colorMask, half);
            dstVector = __lsx_vadd_b(colorVector, dstVector);
            __lsx_vst(dstVector, &dst[x], 0);
        }
        // Epilogue: up to 3 trailing pixels.
        SIMD_EPILOGUE(x, length, 3)
            destPixels[x] = color + BYTE_MUL(destPixels[x], minusAlphaOfColor);
    }
}
|
||||
|
||||
// Common worker for the 32-bit bitmap blits: for every set bit in the
// 1-bpp bitmap `src` (MSB = leftmost pixel), write `color` into the
// corresponding destination pixel; clear bits leave dest untouched.
void qt_bitmapblit32_lsx_base(QRasterBuffer *rasterBuffer, int x, int y,
                              quint32 color,
                              const uchar *src, int width, int height, int stride)
{
    quint32 *dest = reinterpret_cast<quint32*>(rasterBuffer->scanLine(y)) + x;
    const int destStride = rasterBuffer->stride<quint32>();

    const __m128i c128 = __lsx_vreplgr2vr_w(color);
    // Bit-to-lane expansion trick: the mask byte is replicated into every
    // lane, ANDed so each lane keeps exactly one source bit, then an add
    // constant carries a set bit into the lane's sign bit; vslti_b(_, 0)
    // turns the sign bit into an all-ones byte mask for vbitsel_v.
    // maskmask1/maskadd1 cover bits 7..4 (pixels 0-3):
    const __m128i maskmask1 = (__m128i)(v4u32){0x80808080, 0x40404040,
                                               0x20202020, 0x10101010};
    const __m128i maskadd1 = (__m128i)(v4i32){0x00000000, 0x40404040,
                                              0x60606060, 0x70707070};

    if (width > 4) {
        // maskmask2/maskadd2 cover bits 3..0 (pixels 4-7).
        const __m128i maskmask2 = (__m128i)(v4i32){0x08080808, 0x04040404,
                                                   0x02020202, 0x01010101};
        const __m128i maskadd2 = (__m128i)(v4i32){0x78787878, 0x7c7c7c7c,
                                                  0x7e7e7e7e, 0x7f7f7f7f};
        while (height--) {
            // One source byte = 8 destination pixels = two 4-pixel vectors.
            for (int x = 0; x < width; x += 8) {
                const quint8 s = src[x >> 3];
                if (!s)
                    continue; // nothing set in this byte, skip both stores
                __m128i mask1 = __lsx_vreplgr2vr_b(s);
                __m128i mask2 = mask1;

                mask1 = __lsx_vand_v(mask1, maskmask1);
                mask1 = __lsx_vadd_b(mask1, maskadd1);

                __m128i destSrc1 = __lsx_vld((char*)(dest + x), 0);

                mask1 = __lsx_vslti_b(mask1,0);
                destSrc1 = __lsx_vbitsel_v(destSrc1, c128, mask1);
                __lsx_vst(destSrc1, (char*)(dest + x), 0);

                __m128i destSrc2 = __lsx_vld((char*)(dest + x + 4), 0);

                mask2 = __lsx_vand_v(mask2, maskmask2);
                mask2 = __lsx_vadd_b(mask2, maskadd2);

                mask2 = __lsx_vslti_b(mask2,0);
                destSrc2 = __lsx_vbitsel_v(destSrc2, c128, mask2);
                __lsx_vst(destSrc2, (char*)(dest + x + 4), 0);
            }
            dest += destStride;
            src += stride;
        }
    } else {
        // Narrow path: at most 4 pixels per row, a single vector suffices.
        // NOTE(review): the unconditional 16-byte load/store assumes the
        // scanline holds at least 4 accessible pixels — same as the SSE2 path.
        while (height--) {
            const quint8 s = *src;
            if (s) {
                __m128i mask1 = __lsx_vreplgr2vr_b(s);

                __m128i destSrc1 = __lsx_vld((char*)(dest), 0);
                mask1 = __lsx_vand_v(mask1, maskmask1);
                mask1 = __lsx_vadd_b(mask1, maskadd1);

                mask1 = __lsx_vslti_b(mask1, 0);
                destSrc1 = __lsx_vbitsel_v(destSrc1, c128, mask1);
                __lsx_vst(destSrc1, (char*)(dest), 0);
            }
            dest += destStride;
            src += stride;
        }
    }
}
|
||||
|
||||
// Bitmap blit into RGB32/ARGB32 buffers: convert the 64-bit color to
// ARGB32 once, then delegate to the shared 32-bit worker.
void qt_bitmapblit32_lsx(QRasterBuffer *rasterBuffer, int x, int y,
                         const QRgba64 &color,
                         const uchar *src, int width, int height, int stride)
{
    const quint32 argb = color.toArgb32();
    qt_bitmapblit32_lsx_base(rasterBuffer, x, y, argb, src, width, height, stride);
}
|
||||
|
||||
// Same as qt_bitmapblit32_lsx but for RGBA8888 buffers: the color is
// byte-swapped from ARGB to RGBA order before the shared 32bpp blit.
void qt_bitmapblit8888_lsx(QRasterBuffer *rasterBuffer, int x, int y,
                           const QRgba64 &color,
                           const uchar *src, int width, int height, int stride)
{
    qt_bitmapblit32_lsx_base(rasterBuffer, x, y, ARGB2RGBA(color.toArgb32()), src, width, height, stride);
}
|
||||
|
||||
// Blit a 1-bit-per-pixel bitmap at (x, y) onto an RGB16 (565) raster buffer,
// filling set bits with the given color. Each source byte covers 8
// destination pixels; a vector bit-select overwrites exactly the pixels whose
// mask bit is set.
void qt_bitmapblit16_lsx(QRasterBuffer *rasterBuffer, int x, int y,
                         const QRgba64 &color,
                         const uchar *src, int width, int height, int stride)
{
    const quint16 c = qConvertRgb32To16(color.toArgb32());
    quint16 *dest = reinterpret_cast<quint16*>(rasterBuffer->scanLine(y)) + x;
    // The destination pixels are 16 bits wide, so the per-scanline stride must
    // be counted in quint16 units; stride<quint32>() would advance dest by
    // only half a row per iteration (cf. qt_bitmapblit16_sse2).
    const int destStride = rasterBuffer->stride<quint16>();

    const __m128i c128 = __lsx_vreplgr2vr_h(c);
    // Per-lane selectors: lane i isolates bit (7 - i) of the source byte.
    const __m128i maskmask = (__m128i)(v8u16){0x8080, 0x4040, 0x2020, 0x1010,
                                              0x0808, 0x0404, 0x0202, 0x0101};

    // Adding these biases moves the isolated bit into each byte's sign bit,
    // so vslti_b(.., 0) below yields an all-ones byte iff the bit was set.
    const __m128i maskadd = (__m128i)(v8i16){0x0000, 0x4040, 0x6060, 0x7070,
                                             0x7878, 0x7c7c, 0x7e7e, 0x7f7f};
    while (--height >= 0) {
        for (int x = 0; x < width; x += 8) {
            const quint8 s = src[x >> 3];
            if (!s)
                continue; // no bits set in this byte: leave all 8 pixels alone
            __m128i mask = __lsx_vreplgr2vr_b(s);
            __m128i destSrc = __lsx_vld((char*)(dest + x), 0);
            mask = __lsx_vand_v(mask, maskmask);
            mask = __lsx_vadd_b(mask, maskadd);
            mask = __lsx_vslti_b(mask, 0);
            destSrc = __lsx_vbitsel_v(destSrc, c128, mask);
            __lsx_vst(destSrc, (char*)(dest + x), 0);
        }
        dest += destStride;
        src += stride;
    }
}
|
||||
|
||||
// SIMD abstraction consumed by the shared radial-gradient fetch template
// (QRadialFetchSimd): 4-lane float/int vector primitives built on the LSX
// intrinsics.
class QSimdLsx
{
public:
    typedef __m128i Int32x4;
    typedef __m128 Float32x4;

    // Scratch unions for moving individual lanes between vector and scalar code.
    union Vect_buffer_i { Int32x4 v; int i[4]; };
    union Vect_buffer_f { Float32x4 v; float f[4]; };

    static inline Float32x4 Q_DECL_VECTORCALL v_dup(float x) { return __lsx_vreplfr2vr_s(x); }
    // Intentionally narrows double -> float: the radial fetch only needs
    // single precision (matches the other SIMD backends).
    static inline Float32x4 Q_DECL_VECTORCALL v_dup(double x) { return __lsx_vreplfr2vr_s(x); }
    static inline Int32x4 Q_DECL_VECTORCALL v_dup(int x) { return __lsx_vreplgr2vr_w(x); }
    static inline Int32x4 Q_DECL_VECTORCALL v_dup(uint x) { return __lsx_vreplgr2vr_w(x); }

    static inline Float32x4 Q_DECL_VECTORCALL v_add(Float32x4 a, Float32x4 b) { return __lsx_vfadd_s(a, b); }
    static inline Int32x4 Q_DECL_VECTORCALL v_add(Int32x4 a, Int32x4 b) { return __lsx_vadd_w(a, b); }

    static inline Float32x4 Q_DECL_VECTORCALL v_max(Float32x4 a, Float32x4 b) { return __lsx_vfmax_s(a, b); }
    static inline Float32x4 Q_DECL_VECTORCALL v_min(Float32x4 a, Float32x4 b) { return __lsx_vfmin_s(a, b); }
    // Signed minimum over 16-bit lanes (used for clamping packed values).
    static inline Int32x4 Q_DECL_VECTORCALL v_min_16(Int32x4 a, Int32x4 b) { return __lsx_vmin_h(a, b); }

    static inline Int32x4 Q_DECL_VECTORCALL v_and(Int32x4 a, Int32x4 b) { return __lsx_vand_v(a, b); }

    static inline Float32x4 Q_DECL_VECTORCALL v_sub(Float32x4 a, Float32x4 b) { return __lsx_vfsub_s(a, b); }
    static inline Int32x4 Q_DECL_VECTORCALL v_sub(Int32x4 a, Int32x4 b) { return __lsx_vsub_w(a, b); }

    static inline Float32x4 Q_DECL_VECTORCALL v_mul(Float32x4 a, Float32x4 b) { return __lsx_vfmul_s(a, b); }

    static inline Float32x4 Q_DECL_VECTORCALL v_sqrt(Float32x4 x) { return __lsx_vfsqrt_s(x); }

    // Truncating float -> int conversion (round toward zero).
    static inline Int32x4 Q_DECL_VECTORCALL v_toInt(Float32x4 x) { return __lsx_vftintrz_w_s(x); }

    // NOTE(review): implemented as a strict "b < a" compare (i.e. a > b, not
    // a >= b), mirroring the SSE2 backend which also uses cmpgt here.
    static inline Int32x4 Q_DECL_VECTORCALL v_greaterOrEqual(Float32x4 a, Float32x4 b) { return __lsx_vfcmp_clt_s(b, a); }
};
|
||||
|
||||
// LSX-accelerated radial-gradient span fetch: instantiates the generic
// radial fetch template with the QSimdLsx vector backend defined above.
const uint * QT_FASTCALL qt_fetch_radial_gradient_lsx(uint *buffer, const Operator *op,
                                                      const QSpanData *data,
                                                      int y, int x, int length)
{
    return qt_fetch_radial_gradient_template<QRadialFetchSimd<QSimdLsx>,uint>(buffer, op, data, y, x, length);
}
|
||||
|
||||
// Scale sourceRect of an ARGB32PM image onto targetRect of an ARGB32PM
// destination using nearest-neighbour sampling in 16.16 fixed point, blending
// each fetched pixel source-over onto the destination. Only the full-opacity
// path is vectorized; const_alpha != 256 falls back to the generic scaler.
void qt_scale_image_argb32_on_argb32_lsx(uchar *destPixels, int dbpl,
                                         const uchar *srcPixels, int sbpl, int srch,
                                         const QRectF &targetRect,
                                         const QRectF &sourceRect,
                                         const QRect &clip,
                                         int const_alpha)
{
    if (const_alpha != 256) {
        // from qblendfunctions.cpp
        extern void qt_scale_image_argb32_on_argb32(uchar *destPixels, int dbpl,
                                                    const uchar *srcPixels, int sbpl, int srch,
                                                    const QRectF &targetRect,
                                                    const QRectF &sourceRect,
                                                    const QRect &clip,
                                                    int const_alpha);
        return qt_scale_image_argb32_on_argb32(destPixels, dbpl, srcPixels, sbpl, srch,
                                               targetRect, sourceRect, clip, const_alpha);
    }

    qreal sx = sourceRect.width() / (qreal)targetRect.width();
    qreal sy = sourceRect.height() / (qreal)targetRect.height();

    // Per-destination-pixel source increments in 16.16 fixed point
    // (negative when the scale mirrors the image).
    const int ix = 0x00010000 * sx;
    const int iy = 0x00010000 * sy;

    QRect tr = targetRect.normalized().toRect();
    tr = tr.intersected(clip);
    if (tr.isEmpty())
        return;
    const int tx1 = tr.left();
    const int ty1 = tr.top();
    int h = tr.height();
    int w = tr.width();

    // Fixed-point source coordinates of the first destination column/row.
    quint32 basex;
    quint32 srcy;

    if (sx < 0) {
        int dstx = qFloor((tx1 + qreal(0.5) - targetRect.right()) * sx * 65536) + 1;
        basex = quint32(sourceRect.right() * 65536) + dstx;
    } else {
        int dstx = qCeil((tx1 + qreal(0.5) - targetRect.left()) * sx * 65536) - 1;
        basex = quint32(sourceRect.left() * 65536) + dstx;
    }
    if (sy < 0) {
        int dsty = qFloor((ty1 + qreal(0.5) - targetRect.bottom()) * sy * 65536) + 1;
        srcy = quint32(sourceRect.bottom() * 65536) + dsty;
    } else {
        int dsty = qCeil((ty1 + qreal(0.5) - targetRect.top()) * sy * 65536) - 1;
        srcy = quint32(sourceRect.top() * 65536) + dsty;
    }

    quint32 *dst = ((quint32 *) (destPixels + ty1 * dbpl)) + tx1;

    // Constants consumed by BLEND_SOURCE_OVER_ARGB32_LSX_helper.
    const __m128i nullVector = __lsx_vreplgr2vr_w(0);
    const __m128i half = __lsx_vreplgr2vr_h(0x80);
    const __m128i one = __lsx_vreplgr2vr_h(0xff);
    const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff);
    const __m128i alphaMask = __lsx_vreplgr2vr_w(0xff000000);
    const __m128i ixVector = __lsx_vreplgr2vr_w(4*ix); // x advance for a 4-pixel step

    // this bounds check here is required as floating point rounding above might in some cases lead to
    // w/h values that are one pixel too large, falling outside of the valid image area.
    const int ystart = srcy >> 16;
    if (ystart >= srch && iy < 0) {
        srcy += iy;
        --h;
    }
    const int xstart = basex >> 16;
    if (xstart >= (int)(sbpl/sizeof(quint32)) && ix < 0) {
        basex += ix;
        --w;
    }
    int yend = (srcy + iy * (h - 1)) >> 16;
    if (yend < 0 || yend >= srch)
        --h;
    int xend = (basex + ix * (w - 1)) >> 16;
    if (xend < 0 || xend >= (int)(sbpl/sizeof(quint32)))
        --w;

    while (--h >= 0) {
        const uint *src = (const quint32 *) (srcPixels + (srcy >> 16) * sbpl);
        int srcx = basex;
        int x = 0;

        // Blend pixel-by-pixel until dst reaches 16-byte alignment.
        ALIGNMENT_PROLOGUE_16BYTES(dst, x, w) {
            uint s = src[srcx >> 16];
            dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s));
            srcx += ix;
        }

        // Four consecutive 16.16 source x coordinates, highest-x first.
        __m128i srcxVector = (__m128i)(v4i32){srcx + ix + ix + ix, srcx + ix + ix, srcx + ix, srcx};

        for (; x < (w - 3); x += 4) {
            // Odd halfword lanes hold the integer part (>> 16) of each
            // coordinate; idx3 is the lowest x (destination pixel x+0).
            const int idx0 = __lsx_vpickve2gr_h(srcxVector, 1);
            const int idx1 = __lsx_vpickve2gr_h(srcxVector, 3);
            const int idx2 = __lsx_vpickve2gr_h(srcxVector, 5);
            const int idx3 = __lsx_vpickve2gr_h(srcxVector, 7);
            srcxVector = __lsx_vadd_w(srcxVector, ixVector);

            // Gather the four source pixels in destination order.
            const __m128i srcVector = (__m128i)((v4u32){src[idx3], src[idx2], src[idx1], src[idx0]});

            BLEND_SOURCE_OVER_ARGB32_LSX_helper(dst, x, srcVector, nullVector, half, one, colorMask, alphaMask);
        }

        // Remaining 0-3 pixels of the scanline.
        SIMD_EPILOGUE(x, w, 3) {
            uint s = src[(basex + x*ix) >> 16];
            dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s));
        }
        dst = (quint32 *)(((uchar *) dst) + dbpl);
        srcy += iy;
    }
}
|
||||
|
||||
QT_END_NAMESPACE
|
||||
|
||||
#endif // QT_COMPILER_SUPPORTS_LSX
|
@ -142,7 +142,7 @@ struct quint24 {
|
||||
|
||||
void qBlendGradient(int count, const QT_FT_Span *spans, void *userData);
|
||||
void qBlendTexture(int count, const QT_FT_Span *spans, void *userData);
|
||||
#ifdef Q_PROCESSOR_X86
|
||||
#if defined(Q_PROCESSOR_X86) || defined(QT_COMPILER_SUPPORTS_LSX)
|
||||
extern void (*qt_memfill64)(quint64 *dest, quint64 value, qsizetype count);
|
||||
extern void (*qt_memfill32)(quint32 *dest, quint32 value, qsizetype count);
|
||||
#else
|
||||
|
231
src/gui/painting/qdrawingprimitive_lsx_p.h
Normal file
231
src/gui/painting/qdrawingprimitive_lsx_p.h
Normal file
@ -0,0 +1,231 @@
|
||||
// Copyright (C) 2024 Loongson Technology Corporation Limited.
|
||||
// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
|
||||
|
||||
#ifndef QDRAWINGPRIMITIVE_LSX_P_H
|
||||
#define QDRAWINGPRIMITIVE_LSX_P_H
|
||||
|
||||
#include <QtGui/private/qtguiglobal_p.h>
|
||||
#include <private/qsimd_p.h>
|
||||
#include "qdrawhelper_loongarch64_p.h"
|
||||
#include "qrgba64_p.h"
|
||||
|
||||
#ifdef __loongarch_sx
|
||||
|
||||
//
|
||||
// W A R N I N G
|
||||
// -------------
|
||||
//
|
||||
// This file is not part of the Qt API. It exists purely as an
|
||||
// implementation detail. This header file may change from version to
|
||||
// version without notice, or even be removed.
|
||||
//
|
||||
// We mean it.
|
||||
//
|
||||
|
||||
QT_BEGIN_NAMESPACE
|
||||
|
||||
/*
|
||||
* Multiply the components of pixelVector by alphaChannel
|
||||
* Each 32bits components of alphaChannel must be in the form 0x00AA00AA
|
||||
* colorMask must have 0x00ff00ff on each 32 bits component
|
||||
* half must have the value 128 (0x80) for each 32 bits component
|
||||
*/
|
||||
inline static void Q_DECL_VECTORCALL
|
||||
BYTE_MUL_LSX(__m128i &pixelVector, __m128i alphaChannel, __m128i colorMask, __m128i half)
|
||||
{
|
||||
/* 1. separate the colors in 2 vectors so each color is on 16 bits
|
||||
(in order to be multiplied by the alpha
|
||||
each 32 bit of dstVectorAG are in the form 0x00AA00GG
|
||||
each 32 bit of dstVectorRB are in the form 0x00RR00BB */
|
||||
__m128i pixelVectorAG = __lsx_vsrli_h(pixelVector, 8);
|
||||
__m128i pixelVectorRB = __lsx_vand_v(pixelVector, colorMask);
|
||||
|
||||
/* 2. multiply the vectors by the alpha channel */
|
||||
pixelVectorAG = __lsx_vmul_h(pixelVectorAG, alphaChannel);
|
||||
pixelVectorRB = __lsx_vmul_h(pixelVectorRB, alphaChannel);
|
||||
|
||||
/* 3. divide by 255, that's the tricky part.
|
||||
we do it like for BYTE_MUL(), with bit shift: X/255 ~= (X + X/256 + rounding)/256 */
|
||||
/** so first (X + X/256 + rounding) */
|
||||
pixelVectorRB = __lsx_vadd_h(pixelVectorRB, __lsx_vsrli_h(pixelVectorRB, 8));
|
||||
pixelVectorRB = __lsx_vadd_h(pixelVectorRB, half);
|
||||
pixelVectorAG = __lsx_vadd_h(pixelVectorAG, __lsx_vsrli_h(pixelVectorAG, 8));
|
||||
pixelVectorAG = __lsx_vadd_h(pixelVectorAG, half);
|
||||
|
||||
/** second divide by 256 */
|
||||
pixelVectorRB = __lsx_vsrli_h(pixelVectorRB, 8);
|
||||
/** for AG, we could >> 8 to divide followed by << 8 to put the
|
||||
bytes in the correct position. By masking instead, we execute
|
||||
only one instruction */
|
||||
pixelVectorAG = __lsx_vandn_v(colorMask, pixelVectorAG);
|
||||
|
||||
/* 4. combine the 2 pairs of colors */
|
||||
pixelVector = __lsx_vor_v(pixelVectorAG, pixelVectorRB);
|
||||
}
|
||||
|
||||
/*
|
||||
* Each 32bits components of alphaChannel must be in the form 0x00AA00AA
|
||||
* oneMinusAlphaChannel must be 255 - alpha for each 32 bits component
|
||||
* colorMask must have 0x00ff00ff on each 32 bits component
|
||||
* half must have the value 128 (0x80) for each 32 bits component
|
||||
*/
|
||||
inline static void Q_DECL_VECTORCALL
|
||||
INTERPOLATE_PIXEL_255_LSX(__m128i srcVector, __m128i &dstVector, __m128i alphaChannel,
|
||||
__m128i oneMinusAlphaChannel, __m128i colorMask, __m128i half)
|
||||
{
|
||||
/* interpolate AG */
|
||||
__m128i srcVectorAG = __lsx_vsrli_h(srcVector, 8);
|
||||
__m128i dstVectorAG = __lsx_vsrli_h(dstVector, 8);
|
||||
__m128i srcVectorAGalpha = __lsx_vmul_h(srcVectorAG, alphaChannel);
|
||||
__m128i dstVectorAGoneMinusAlphalpha = __lsx_vmul_h(dstVectorAG, oneMinusAlphaChannel);
|
||||
__m128i finalAG = __lsx_vadd_h(srcVectorAGalpha, dstVectorAGoneMinusAlphalpha);
|
||||
finalAG = __lsx_vadd_h(finalAG, __lsx_vsrli_h(finalAG, 8));
|
||||
finalAG = __lsx_vadd_h(finalAG, half);
|
||||
finalAG = __lsx_vandn_v(colorMask, finalAG);
|
||||
|
||||
/* interpolate RB */
|
||||
__m128i srcVectorRB = __lsx_vand_v(srcVector, colorMask);
|
||||
__m128i dstVectorRB = __lsx_vand_v(dstVector, colorMask);
|
||||
__m128i srcVectorRBalpha = __lsx_vmul_h(srcVectorRB, alphaChannel);
|
||||
__m128i dstVectorRBoneMinusAlphalpha = __lsx_vmul_h(dstVectorRB, oneMinusAlphaChannel);
|
||||
__m128i finalRB = __lsx_vadd_h(srcVectorRBalpha, dstVectorRBoneMinusAlphalpha);
|
||||
finalRB = __lsx_vadd_h(finalRB, __lsx_vsrli_h(finalRB, 8));
|
||||
finalRB = __lsx_vadd_h(finalRB, half);
|
||||
finalRB = __lsx_vsrli_h(finalRB, 8);
|
||||
|
||||
/* combine */
|
||||
dstVector = __lsx_vor_v(finalAG, finalRB);
|
||||
}
|
||||
|
||||
// same as BLEND_SOURCE_OVER_ARGB32_LSX, but for one vector srcVector:
// blends four premultiplied ARGB32 source pixels over dst[x..x+3] as
// result = s + d * (1 - s.alpha), with fast paths when all four pixels
// are fully opaque (plain store) or fully transparent (no-op).
inline static void Q_DECL_VECTORCALL
BLEND_SOURCE_OVER_ARGB32_LSX_helper(quint32 *dst, int x, __m128i srcVector,
                                    __m128i nullVector, __m128i half, __m128i one,
                                    __m128i colorMask, __m128i alphaMask)
{
    const __m128i srcVectorAlpha = __lsx_vand_v(srcVector, alphaMask);
    __m128i vseq = __lsx_vseq_w(srcVectorAlpha, alphaMask);
    // vmsknz_b packs one bit per byte; 0xffff means all 16 bytes of the
    // compare result are non-zero, i.e. all four alphas equal 0xff.
    v4i32 vseq_res = (v4i32)__lsx_vmsknz_b(vseq);
    if (vseq_res[0] == (0x0000ffff)) {
        /* all opaque */
        __lsx_vst(srcVector, &dst[x], 0);
    } else {
        __m128i vseq_n = __lsx_vseq_w(srcVectorAlpha, nullVector);
        v4i32 vseq_n_res = (v4i32)__lsx_vmsknz_b(vseq_n);
        if (vseq_n_res[0] != (0x0000ffff)) {
            /* not fully transparent */
            /* extract the alpha channel on 2 x 16 bits */
            /* so we have room for the multiplication */
            /* each 32 bits will be in the form 0x00AA00AA */
            /* with A being the 1 - alpha */
            __m128i alphaChannel = __lsx_vsrli_w(srcVector, 24);
            alphaChannel = __lsx_vor_v(alphaChannel, __lsx_vslli_w(alphaChannel, 16));
            alphaChannel = __lsx_vsub_h(one, alphaChannel);

            __m128i dstVector = __lsx_vld(&dst[x], 0);
            BYTE_MUL_LSX(dstVector, alphaChannel, colorMask, half);

            /* result = s + d * (1-alpha) */
            const __m128i result = __lsx_vadd_b(srcVector, dstVector);
            __lsx_vst(result, &dst[x], 0);
        }
    }
}
|
||||
|
||||
// Basically blend src over dst with the const alpha defined as constAlphaVector.
|
||||
// nullVector, half, one, colorMask are constant across the whole image/texture, and should be defined as:
|
||||
//const __m128i nullVector = __lsx_vreplgr2vr_w(0);
|
||||
//const __m128i half = __lsx_vreplgr2vr_h(0x80);
|
||||
//const __m128i one = __lsx_vreplgr2vr_h(0xff);
|
||||
//const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff);
|
||||
//const __m128i alphaMask = __lsx_vreplgr2vr_w(0xff000000);
|
||||
//
|
||||
// The computation being done is:
|
||||
// result = s + d * (1-alpha)
|
||||
// with shortcuts if fully opaque or fully transparent.
|
||||
inline static void Q_DECL_VECTORCALL
|
||||
BLEND_SOURCE_OVER_ARGB32_LSX(quint32 *dst, const quint32 *src, int length)
|
||||
{
|
||||
int x = 0;
|
||||
|
||||
/* First, get dst aligned. */
|
||||
ALIGNMENT_PROLOGUE_16BYTES(dst, x, length) {
|
||||
blend_pixel(dst[x], src[x]);
|
||||
}
|
||||
|
||||
const __m128i alphaMask = __lsx_vreplgr2vr_w(0xff000000);
|
||||
const __m128i nullVector = __lsx_vreplgr2vr_w(0);
|
||||
const __m128i half = __lsx_vreplgr2vr_h(0x80);
|
||||
const __m128i one = __lsx_vreplgr2vr_h(0xff);
|
||||
const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff);
|
||||
|
||||
for (; x < length-3; x += 4) {
|
||||
const __m128i srcVector = __lsx_vld((const __m128i *)&src[x], 0);
|
||||
BLEND_SOURCE_OVER_ARGB32_LSX_helper(dst, x, srcVector, nullVector, half, one, colorMask, alphaMask);
|
||||
}
|
||||
SIMD_EPILOGUE(x, length, 3) {
|
||||
blend_pixel(dst[x], src[x]);
|
||||
}
|
||||
}
|
||||
|
||||
// Basically blend src over dst with the const alpha defined as constAlphaVector.
// The computation being done is:
// dest = (s + d * sia) * ca + d * cia
//      = s * ca + d * (sia * ca + cia)
//      = s * ca + d * (1 - sa*ca)
inline static void Q_DECL_VECTORCALL
BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_LSX(quint32 *dst, const quint32 *src, int length, uint const_alpha)
{
    int x = 0;

    // Blend pixel-by-pixel until dst reaches 16-byte alignment.
    ALIGNMENT_PROLOGUE_16BYTES(dst, x, length) {
        blend_pixel(dst[x], src[x], const_alpha);
    }

    const __m128i nullVector = __lsx_vreplgr2vr_w(0);
    const __m128i half = __lsx_vreplgr2vr_h(0x80);
    const __m128i one = __lsx_vreplgr2vr_h(0xff);
    const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff);
    const __m128i constAlphaVector = __lsx_vreplgr2vr_h(const_alpha);

    for (; x < length-3; x += 4) {
        __m128i srcVector = __lsx_vld((const __m128i *)&src[x], 0);
        __m128i vseq = __lsx_vseq_w(srcVector, nullVector);
        v4i32 vseq_res = (v4i32)__lsx_vmsknz_b(vseq);
        // 0xffff would mean all four source pixels are zero -> nothing to do.
        if (vseq_res[0] != 0x0000ffff) {
            // src *= const_alpha
            BYTE_MUL_LSX(srcVector, constAlphaVector, colorMask, half);

            // Spread (255 - alpha) of each scaled pixel into 0x00AA00AA form
            // so it can multiply both 16-bit channel halves.
            __m128i alphaChannel = __lsx_vsrli_w(srcVector, 24);
            alphaChannel = __lsx_vor_v(alphaChannel, __lsx_vslli_w(alphaChannel, 16));
            alphaChannel = __lsx_vsub_h(one, alphaChannel);

            __m128i dstVector = __lsx_vld((__m128i *)&dst[x], 0);
            BYTE_MUL_LSX(dstVector, alphaChannel, colorMask, half);

            // result = s*ca + d*(1 - sa*ca)
            const __m128i result = __lsx_vadd_b(srcVector, dstVector);
            __lsx_vst(result, &dst[x], 0);
        }
    }
    // Remaining 0-3 pixels.
    SIMD_EPILOGUE(x, length, 3) {
        blend_pixel(dst[x], src[x], const_alpha);
    }
}
|
||||
|
||||
// Scalar float <-> int bit-reinterpretation helper for the replicate below.
typedef union
{
    int i;
    float f;
} FloatInt;

/* float type data load instructions */
// Broadcast a scalar float into all four lanes of an __m128. LSX has no
// float-replicate intrinsic, so the bits are routed through an integer
// replicate. NOTE(review): the union-based type punning relies on the
// GCC/Clang-documented extension rather than strictly conforming C++.
static __m128 __lsx_vreplfr2vr_s(float val)
{
    FloatInt fi_tmpval = {.f = val};
    return (__m128)__lsx_vreplgr2vr_w(fi_tmpval.i);
}
|
||||
|
||||
QT_END_NAMESPACE
|
||||
|
||||
#endif // __loongarch_sx
|
||||
|
||||
#endif // QDRAWINGPRIMITIVE_LSX_P_H
|
@ -257,6 +257,18 @@ void qt_qimageScaleAARGBA_down_xy_sse4(QImageScaleInfo *isi, unsigned int *dest,
|
||||
int dw, int dh, int dow, int sow);
|
||||
#endif
|
||||
|
||||
#if defined(QT_COMPILER_SUPPORTS_LSX)
|
||||
template<bool RGB>
|
||||
void qt_qimageScaleAARGBA_up_x_down_y_lsx(QImageScaleInfo *isi, unsigned int *dest,
|
||||
int dw, int dh, int dow, int sow);
|
||||
template<bool RGB>
|
||||
void qt_qimageScaleAARGBA_down_x_up_y_lsx(QImageScaleInfo *isi, unsigned int *dest,
|
||||
int dw, int dh, int dow, int sow);
|
||||
template<bool RGB>
|
||||
void qt_qimageScaleAARGBA_down_xy_lsx(QImageScaleInfo *isi, unsigned int *dest,
|
||||
int dw, int dh, int dow, int sow);
|
||||
#endif
|
||||
|
||||
#if defined(__ARM_NEON__)
|
||||
template<bool RGB>
|
||||
void qt_qimageScaleAARGBA_up_x_down_y_neon(QImageScaleInfo *isi, unsigned int *dest,
|
||||
@ -351,6 +363,10 @@ static void qt_qimageScaleAARGBA(QImageScaleInfo *isi, unsigned int *dest,
|
||||
if (qCpuHasFeature(SSE4_1))
|
||||
qt_qimageScaleAARGBA_up_x_down_y_sse4<false>(isi, dest, dw, dh, dow, sow);
|
||||
else
|
||||
#elif defined(QT_COMPILER_SUPPORTS_LSX)
|
||||
if (qCpuHasFeature(LSX))
|
||||
qt_qimageScaleAARGBA_up_x_down_y_lsx<false>(isi, dest, dw, dh, dow, sow);
|
||||
else
|
||||
#elif defined(__ARM_NEON__)
|
||||
if (qCpuHasFeature(NEON))
|
||||
qt_qimageScaleAARGBA_up_x_down_y_neon<false>(isi, dest, dw, dh, dow, sow);
|
||||
@ -364,6 +380,10 @@ static void qt_qimageScaleAARGBA(QImageScaleInfo *isi, unsigned int *dest,
|
||||
if (qCpuHasFeature(SSE4_1))
|
||||
qt_qimageScaleAARGBA_down_x_up_y_sse4<false>(isi, dest, dw, dh, dow, sow);
|
||||
else
|
||||
#elif defined(QT_COMPILER_SUPPORTS_LSX)
|
||||
if (qCpuHasFeature(LSX))
|
||||
qt_qimageScaleAARGBA_down_x_up_y_lsx<false>(isi, dest, dw, dh, dow, sow);
|
||||
else
|
||||
#elif defined(__ARM_NEON__)
|
||||
if (qCpuHasFeature(NEON))
|
||||
qt_qimageScaleAARGBA_down_x_up_y_neon<false>(isi, dest, dw, dh, dow, sow);
|
||||
@ -377,6 +397,10 @@ static void qt_qimageScaleAARGBA(QImageScaleInfo *isi, unsigned int *dest,
|
||||
if (qCpuHasFeature(SSE4_1))
|
||||
qt_qimageScaleAARGBA_down_xy_sse4<false>(isi, dest, dw, dh, dow, sow);
|
||||
else
|
||||
#elif defined(QT_COMPILER_SUPPORTS_LSX)
|
||||
if (qCpuHasFeature(LSX))
|
||||
qt_qimageScaleAARGBA_down_xy_lsx<false>(isi, dest, dw, dh, dow, sow);
|
||||
else
|
||||
#elif defined(__ARM_NEON__)
|
||||
if (qCpuHasFeature(NEON))
|
||||
qt_qimageScaleAARGBA_down_xy_neon<false>(isi, dest, dw, dh, dow, sow);
|
||||
@ -995,6 +1019,10 @@ static void qt_qimageScaleAARGB(QImageScaleInfo *isi, unsigned int *dest,
|
||||
if (qCpuHasFeature(SSE4_1))
|
||||
qt_qimageScaleAARGBA_up_x_down_y_sse4<true>(isi, dest, dw, dh, dow, sow);
|
||||
else
|
||||
#elif defined QT_COMPILER_SUPPORTS_LSX
|
||||
if (qCpuHasFeature(LSX))
|
||||
qt_qimageScaleAARGBA_up_x_down_y_lsx<true>(isi, dest, dw, dh, dow, sow);
|
||||
else
|
||||
#elif defined(__ARM_NEON__)
|
||||
if (qCpuHasFeature(NEON))
|
||||
qt_qimageScaleAARGBA_up_x_down_y_neon<true>(isi, dest, dw, dh, dow, sow);
|
||||
@ -1008,6 +1036,10 @@ static void qt_qimageScaleAARGB(QImageScaleInfo *isi, unsigned int *dest,
|
||||
if (qCpuHasFeature(SSE4_1))
|
||||
qt_qimageScaleAARGBA_down_x_up_y_sse4<true>(isi, dest, dw, dh, dow, sow);
|
||||
else
|
||||
#elif defined QT_COMPILER_SUPPORTS_LSX
|
||||
if (qCpuHasFeature(LSX))
|
||||
qt_qimageScaleAARGBA_down_x_up_y_lsx<true>(isi, dest, dw, dh, dow, sow);
|
||||
else
|
||||
#elif defined(__ARM_NEON__)
|
||||
if (qCpuHasFeature(NEON))
|
||||
qt_qimageScaleAARGBA_down_x_up_y_neon<true>(isi, dest, dw, dh, dow, sow);
|
||||
@ -1021,6 +1053,10 @@ static void qt_qimageScaleAARGB(QImageScaleInfo *isi, unsigned int *dest,
|
||||
if (qCpuHasFeature(SSE4_1))
|
||||
qt_qimageScaleAARGBA_down_xy_sse4<true>(isi, dest, dw, dh, dow, sow);
|
||||
else
|
||||
#elif defined QT_COMPILER_SUPPORTS_LSX
|
||||
if (qCpuHasFeature(LSX))
|
||||
qt_qimageScaleAARGBA_down_xy_lsx<true>(isi, dest, dw, dh, dow, sow);
|
||||
else
|
||||
#elif defined(__ARM_NEON__)
|
||||
if (qCpuHasFeature(NEON))
|
||||
qt_qimageScaleAARGBA_down_xy_neon<true>(isi, dest, dw, dh, dow, sow);
|
||||
|
233
src/gui/painting/qimagescale_lsx.cpp
Normal file
233
src/gui/painting/qimagescale_lsx.cpp
Normal file
@ -0,0 +1,233 @@
|
||||
// Copyright (C) 2024 Loongson Technology Corporation Limited.
|
||||
// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
|
||||
|
||||
#include "qimagescale_p.h"
|
||||
#include "qimage.h"
|
||||
#include <private/qdrawhelper_loongarch64_p.h>
|
||||
#include <private/qsimd_p.h>
|
||||
|
||||
#if QT_CONFIG(thread) && !defined(Q_OS_WASM)
|
||||
#include <qsemaphore.h>
|
||||
#include <private/qthreadpool_p.h>
|
||||
#endif
|
||||
|
||||
#if defined(QT_COMPILER_SUPPORTS_LSX)
|
||||
|
||||
QT_BEGIN_NAMESPACE
|
||||
|
||||
using namespace QImageScale;
|
||||
|
||||
// Run scaleSection(yStart, yEnd) over the output rows [0, dh), splitting the
// band across the gui thread pool when the amount of source data is large
// enough. Falls back to one synchronous call when threading is unavailable,
// the job is small, or we are already running on a pool thread (avoids
// starving the pool / deadlocking on nested use).
template<typename T>
static inline void multithread_pixels_function(QImageScaleInfo *isi, int dh, const T &scaleSection)
{
#if QT_CONFIG(thread) && !defined(Q_OS_WASM)
    // Roughly one segment per 64K source pixels, capped by the row count.
    int segments = (qsizetype(isi->sh) * isi->sw) / (1<<16);
    segments = std::min(segments, dh);
    QThreadPool *threadPool = QThreadPoolPrivate::qtGuiInstance();
    if (segments > 1 && threadPool && !threadPool->contains(QThread::currentThread())) {
        QSemaphore semaphore;
        int y = 0;
        for (int i = 0; i < segments; ++i) {
            int yn = (dh - y) / (segments - i); // spread rows evenly over segments
            threadPool->start([&, y, yn]() {
                scaleSection(y, y + yn);
                semaphore.release(1);
            });
            y += yn;
        }
        semaphore.acquire(segments); // wait for every segment to finish
        return;
    }
#else
    Q_UNUSED(isi);
#endif
    scaleSection(0, dh);
}
|
||||
|
||||
// Accumulate a weighted sum of pixels along one axis for area-averaging.
// pix points at the first sample, step is the distance (in pixels) to the
// next one. xyap is the 14-bit fixed-point weight of the first sample and
// Cxy the weight of each middle sample (the weights sum to 1 << 14); vxyap
// and vCxy are those weights pre-broadcast. Returns the four channels
// widened to one 32-bit lane each.
inline static __m128i Q_DECL_VECTORCALL
qt_qimageScaleAARGBA_helper(const unsigned int *pix, int xyap, int Cxy,
                            int step, const __m128i vxyap, const __m128i vCxy)
{
    // Shuffle mask that zero-extends the 4 bytes of one pixel into four
    // 32-bit lanes (index 16 selects from the zero vector operand).
    const __m128i shuffleMask = (__m128i)(v16i8){0, 16, 16, 16, 1, 16, 16, 16,
                                                 2, 16, 16, 16, 3, 16, 16, 16};
    __m128i vpix = __lsx_vshuf_b(__lsx_vldi(0), __lsx_vreplgr2vr_w(*pix), shuffleMask);
    __m128i vx = __lsx_vmul_w(vpix, vxyap);
    int i;
    // Middle samples each carry weight Cxy; i counts down the weight budget.
    for (i = (1 << 14) - xyap; i > Cxy; i -= Cxy) {
        pix += step;
        vpix = __lsx_vshuf_b(__lsx_vldi(0), __lsx_vreplgr2vr_w(*pix), shuffleMask);
        vx = __lsx_vadd_w(vx, __lsx_vmul_w(vpix, vCxy));
    }
    // The last sample receives whatever weight remains.
    pix += step;
    vpix = __lsx_vshuf_b(__lsx_vldi(0), __lsx_vreplgr2vr_w(*pix), shuffleMask);
    vx = __lsx_vadd_w(vx, __lsx_vmul_w(vpix, __lsx_vreplgr2vr_w(i)));
    return vx;
}
|
||||
|
||||
// Smooth scale: area-averaging downscale along y combined with bilinear
// upscale along x. RGB == true forces an opaque alpha in the output.
template<bool RGB>
void qt_qimageScaleAARGBA_up_x_down_y_lsx(QImageScaleInfo *isi, unsigned int *dest,
                                          int dw, int dh, int dow, int sow)
{
    const unsigned int **ypoints = isi->ypoints;
    const int *xpoints = isi->xpoints;
    const int *xapoints = isi->xapoints;
    const int *yapoints = isi->yapoints;

    const __m128i v256 = __lsx_vreplgr2vr_w(256);

    /* go through every scanline in the output buffer */
    auto scaleSection = [&] (int yStart, int yEnd) {
        for (int y = yStart; y < yEnd; ++y) {
            const int Cy = yapoints[y] >> 16;       // weight of each middle y sample
            const int yap = yapoints[y] & 0xffff;   // weight of the first y sample
            const __m128i vCy = __lsx_vreplgr2vr_w(Cy);
            const __m128i vyap = __lsx_vreplgr2vr_w(yap);

            unsigned int *dptr = dest + (y * dow);
            for (int x = 0; x < dw; x++) {
                const unsigned int *sptr = ypoints[y] + xpoints[x];
                __m128i vx = qt_qimageScaleAARGBA_helper(sptr, yap, Cy, sow, vyap, vCy);

                const int xap = xapoints[x];
                if (xap > 0) {
                    // Bilinear blend with the next column:
                    // (vx*(256-xap) + vr*xap) >> 8.
                    const __m128i vxap = __lsx_vreplgr2vr_w(xap);
                    const __m128i vinvxap = __lsx_vsub_w(v256, vxap);
                    __m128i vr = qt_qimageScaleAARGBA_helper(sptr + 1, yap, Cy, sow, vyap, vCy);

                    vx = __lsx_vmul_w(vx, vinvxap);
                    vr = __lsx_vmul_w(vr, vxap);
                    vx = __lsx_vadd_w(vx, vr);
                    vx = __lsx_vsrli_w(vx, 8);
                }
                vx = __lsx_vsrli_w(vx, 14); // drop the 14-bit fixed-point fraction
                // Saturating pack: four 32-bit channels -> one ARGB32 pixel.
                vx = __lsx_vpickev_h(__lsx_vsat_wu(vx, 15), __lsx_vsat_wu(vx, 15));
                vx = __lsx_vpickev_b(__lsx_vsat_hu(vx, 7), __lsx_vsat_hu(vx, 7));
                *dptr = __lsx_vpickve2gr_w(vx, 0);
                if (RGB)
                    *dptr |= 0xff000000; // force opaque alpha
                dptr++;
            }
        }
    };
    multithread_pixels_function(isi, dh, scaleSection);
}
|
||||
|
||||
template<bool RGB>
|
||||
void qt_qimageScaleAARGBA_down_x_up_y_lsx(QImageScaleInfo *isi, unsigned int *dest,
|
||||
int dw, int dh, int dow, int sow)
|
||||
{
|
||||
const unsigned int **ypoints = isi->ypoints;
|
||||
int *xpoints = isi->xpoints;
|
||||
int *xapoints = isi->xapoints;
|
||||
int *yapoints = isi->yapoints;
|
||||
|
||||
const __m128i v256 = __lsx_vreplgr2vr_w(256);
|
||||
|
||||
/* go through every scanline in the output buffer */
|
||||
auto scaleSection = [&] (int yStart, int yEnd) {
|
||||
for (int y = yStart; y < yEnd; ++y) {
|
||||
unsigned int *dptr = dest + (y * dow);
|
||||
for (int x = 0; x < dw; x++) {
|
||||
int Cx = xapoints[x] >> 16;
|
||||
int xap = xapoints[x] & 0xffff;
|
||||
const __m128i vCx = __lsx_vreplgr2vr_w(Cx);
|
||||
const __m128i vxap = __lsx_vreplgr2vr_w(xap);
|
||||
|
||||
const unsigned int *sptr = ypoints[y] + xpoints[x];
|
||||
__m128i vx = qt_qimageScaleAARGBA_helper(sptr, xap, Cx, 1, vxap, vCx);
|
||||
|
||||
int yap = yapoints[y];
|
||||
if (yap > 0) {
|
||||
const __m128i vyap = __lsx_vreplgr2vr_w(yap);
|
||||
const __m128i vinvyap = __lsx_vsub_w(v256, vyap);
|
||||
__m128i vr = qt_qimageScaleAARGBA_helper(sptr + sow, xap, Cx, 1, vxap, vCx);
|
||||
|
||||
vx = __lsx_vmul_w(vx, vinvyap);
|
||||
vr = __lsx_vmul_w(vr, vyap);
|
||||
vx = __lsx_vadd_w(vx, vr);
|
||||
vx = __lsx_vsrli_w(vx, 8);
|
||||
}
|
||||
vx = __lsx_vsrli_w(vx, 14);
|
||||
vx = __lsx_vpickev_h(__lsx_vsat_wu(vx, 15), __lsx_vsat_wu(vx, 15));
|
||||
vx = __lsx_vpickev_b(__lsx_vsat_wu(vx, 7), __lsx_vsat_hu(vx, 7));
|
||||
*dptr = __lsx_vpickve2gr_w(vx, 0);
|
||||
if (RGB)
|
||||
*dptr |= 0xff000000;
|
||||
dptr++;
|
||||
}
|
||||
}
|
||||
};
|
||||
multithread_pixels_function(isi, dh, scaleSection);
|
||||
}
|
||||
|
||||
// Smooth scale: area-averaging downscale along both axes. Each output pixel
// is a weighted sum over a source rectangle. RGB == true forces an opaque
// alpha in the output.
template<bool RGB>
void qt_qimageScaleAARGBA_down_xy_lsx(QImageScaleInfo *isi, unsigned int *dest,
                                      int dw, int dh, int dow, int sow)
{
    const unsigned int **ypoints = isi->ypoints;
    int *xpoints = isi->xpoints;
    int *xapoints = isi->xapoints;
    int *yapoints = isi->yapoints;

    auto scaleSection = [&] (int yStart, int yEnd) {
        for (int y = yStart; y < yEnd; ++y) {
            int Cy = yapoints[y] >> 16;       // weight of each middle y sample
            int yap = yapoints[y] & 0xffff;   // weight of the first y sample
            const __m128i vCy = __lsx_vreplgr2vr_w(Cy);
            const __m128i vyap = __lsx_vreplgr2vr_w(yap);

            unsigned int *dptr = dest + (y * dow);
            for (int x = 0; x < dw; x++) {
                const int Cx = xapoints[x] >> 16;       // weight of each middle x sample
                const int xap = xapoints[x] & 0xffff;   // weight of the first x sample
                const __m128i vCx = __lsx_vreplgr2vr_w(Cx);
                const __m128i vxap = __lsx_vreplgr2vr_w(xap);

                const unsigned int *sptr = ypoints[y] + xpoints[x];
                // Each row sum is pre-shifted right by 4 so the combined
                // x*y weights (14 + 14 bits) cannot overflow 32 bits.
                __m128i vx = qt_qimageScaleAARGBA_helper(sptr, xap, Cx, 1, vxap, vCx);
                __m128i vr = __lsx_vmul_w(__lsx_vsrli_w(vx, 4), vyap);

                int j;
                // Middle rows carry weight Cy; j counts down the y-weight budget.
                for (j = (1 << 14) - yap; j > Cy; j -= Cy) {
                    sptr += sow;
                    vx = qt_qimageScaleAARGBA_helper(sptr, xap, Cx, 1, vxap, vCx);
                    vr = __lsx_vadd_w(vr, __lsx_vmul_w(__lsx_vsrli_w(vx, 4), vCy));
                }
                // The last row receives the remaining weight j.
                sptr += sow;
                vx = qt_qimageScaleAARGBA_helper(sptr, xap, Cx, 1, vxap, vCx);
                vr = __lsx_vadd_w(vr, __lsx_vmul_w(__lsx_vsrli_w(vx, 4), __lsx_vreplgr2vr_w(j)));

                vr = __lsx_vsrli_w(vr, 24); // drop the remaining fixed-point bits
                // Saturating pack: four 32-bit channels -> one ARGB32 pixel.
                vr = __lsx_vpickev_h(__lsx_vldi(0), __lsx_vsat_wu(vr, 15));
                vr = __lsx_vpickev_b(__lsx_vldi(0), __lsx_vsat_hu(vr, 7));
                *dptr = __lsx_vpickve2gr_w(vr, 0);
                if (RGB)
                    *dptr |= 0xff000000; // force opaque alpha
                dptr++;
            }
        }
    };
    multithread_pixels_function(isi, dh, scaleSection);
}
|
||||
|
||||
// Explicit instantiations for the two pixel formats the dispatchers use:
// <false> preserves the source alpha (ARGB32), <true> forces opaque (RGB32).
template void qt_qimageScaleAARGBA_up_x_down_y_lsx<false>(QImageScaleInfo *isi, unsigned int *dest,
                                                          int dw, int dh, int dow, int sow);

template void qt_qimageScaleAARGBA_up_x_down_y_lsx<true>(QImageScaleInfo *isi, unsigned int *dest,
                                                         int dw, int dh, int dow, int sow);

template void qt_qimageScaleAARGBA_down_x_up_y_lsx<false>(QImageScaleInfo *isi, unsigned int *dest,
                                                          int dw, int dh, int dow, int sow);

template void qt_qimageScaleAARGBA_down_x_up_y_lsx<true>(QImageScaleInfo *isi, unsigned int *dest,
                                                         int dw, int dh, int dow, int sow);

template void qt_qimageScaleAARGBA_down_xy_lsx<false>(QImageScaleInfo *isi, unsigned int *dest,
                                                      int dw, int dh, int dow, int sow);

template void qt_qimageScaleAARGBA_down_xy_lsx<true>(QImageScaleInfo *isi, unsigned int *dest,
                                                     int dw, int dh, int dow, int sow);
|
||||
|
||||
QT_END_NAMESPACE
|
||||
|
||||
#endif
|
Loading…
x
Reference in New Issue
Block a user