Add some special LSX optimizations

During compilation, Qt does not add the -mlsx compiler flag for these
files. The compiler therefore needs to enable LSX by default for all
files (for example, via a global -mlsx) for this code to be compiled.
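For context: these code paths are guarded only by the compiler-defined __loongarch_sx macro, so without LSX enabled globally they compile to nothing. A minimal sketch of the guard pattern (the helper function below is illustrative, not part of the patch):

// GCC and Clang define __loongarch_sx when LSX code generation is enabled,
// e.g. when -mlsx (or an -march value that implies it) is in the global flags.
#if defined(__loongarch_sx)
#include <lsxintrin.h>  // LSX intrinsic types and functions (__m128i, __lsx_vadd_h, ...)
static inline __m128i addU16x8(__m128i a, __m128i b)
{
    return __lsx_vadd_h(a, b);  // eight independent 16-bit additions
}
#endif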

Change-Id: I90c5029b673f831d39591ffd96c36e7762c68fb0
Reviewed-by: Volker Hilsheimer <volker.hilsheimer@qt.io>
Author: Chen Zhanwang
Date: 2024-08-28 09:58:25 +08:00
parent dfc84993b7
commit f9185516eb
4 changed files with 511 additions and 1 deletion


@@ -280,10 +280,78 @@ struct Rgba64OperationsNEON : public Rgba64OperationsBase
};
#endif
#if defined(__loongarch_sx)
struct Rgba64OperationsLSX : public Rgba64OperationsBase
{
    typedef __m128i OptimalType;
    typedef __m128i OptimalScalar;
    static OptimalType load(const Type *ptr)
    {
        return __lsx_vilvl_d(__lsx_vldi(0), __lsx_vldrepl_d(reinterpret_cast<const __m128i *>(ptr), 0));
    }
    static OptimalType convert(const Type &value)
    {
        return __lsx_vinsgr2vr_d(__lsx_vldi(0), value, 0);
    }
    static void store(Type *ptr, OptimalType value)
    {
        __lsx_vstelm_d(value, reinterpret_cast<__m128i *>(ptr), 0, 0);
    }
    static OptimalType add(OptimalType a, OptimalType b)
    {
        return __lsx_vadd_h(a, b);
    }
    // same as above:
    // static OptimalScalar add(OptimalScalar a, OptimalScalar b)
    static OptimalType plus(OptimalType a, OptimalType b)
    {
        return __lsx_vsadd_hu(a, b);
    }
    static OptimalScalar alpha(OptimalType c)
    {
        const __m128i shuffleMask = (__m128i)(v8i16){3, 3, 3, 3, 4, 5, 6, 7};
        return __lsx_vshuf_h(shuffleMask, __lsx_vldi(0), c);
    }
    static OptimalScalar invAlpha(Scalar c)
    {
        return scalar(65535 - c);
    }
    static OptimalScalar invAlpha(OptimalType c)
    {
        return __lsx_vxor_v(__lsx_vreplgr2vr_h(-1), alpha(c));
    }
    static OptimalScalar scalar(Scalar n)
    {
        const __m128i shuffleMask = (__m128i)(v8i16){0, 0, 0, 0, 4, 5, 6, 7};
        return __lsx_vshuf_h(shuffleMask, __lsx_vldi(0), __lsx_vinsgr2vr_w(__lsx_vldi(0), n, 0));
    }
    static OptimalType multiplyAlpha8bit(OptimalType val, uint8_t a)
    {
        return multiplyAlpha255(val, a);
    }
    // same as above:
    // static OptimalScalar multiplyAlpha8bit(OptimalScalar a, uint8_t a)
    static OptimalType interpolate8bit(OptimalType x, uint8_t a1, OptimalType y, uint8_t a2)
    {
        return interpolate255(x, a1, y, a2);
    }
    static OptimalType multiplyAlpha(OptimalType val, OptimalScalar a)
    {
        return multiplyAlpha65535(val, a);
    }
    static OptimalType interpolate(OptimalType x, OptimalScalar a1, OptimalType y, const OptimalScalar &a2)
    {
        return interpolate65535(x, a1, y, a2);
    }
};
#endif
#if defined(__SSE2__)
typedef Rgba64OperationsSSE2 Rgba64Operations;
#elif defined(__ARM_NEON__)
typedef Rgba64OperationsNEON Rgba64Operations;
#elif defined(__loongarch_sx)
typedef Rgba64OperationsLSX Rgba64Operations;
#else
typedef Rgba64OperationsC Rgba64Operations;
#endif
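The alias chain above keeps the generic RGBA64 blending code ISA-agnostic. A hypothetical caller sketch (addPixelSaturating is illustrative, not part of the patch; Type comes from Rgba64OperationsBase):

// Generic code selects the widest available implementation through the
// Rgba64Operations alias and never names SSE2/NEON/LSX directly.
static inline void addPixelSaturating(Rgba64Operations::Type *dst,
                                      const Rgba64Operations::Type *src)
{
    using Ops = Rgba64Operations;
    const Ops::OptimalType a = Ops::load(dst);  // widen one pixel into a vector register
    const Ops::OptimalType b = Ops::load(src);
    Ops::store(dst, Ops::plus(a, b));           // per-channel saturating 16-bit add
}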


@@ -1251,6 +1251,45 @@ static inline uint interpolate_4_pixels_16(uint tl, uint tr, uint bl, uint br, u
}
#endif
#if defined(__loongarch_sx)
static inline void interpolate_4_pixels_16_lsx(__m128i tl, __m128i tr, __m128i bl, __m128i br,
                                               __m128i distx, __m128i disty, uint *b)
{
    const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff);
    const __m128i v_256 = __lsx_vreplgr2vr_h(256);
    const __m128i dxdy = __lsx_vmul_h(distx, disty);
    const __m128i distx_ = __lsx_vslli_h(distx, 4);
    const __m128i disty_ = __lsx_vslli_h(disty, 4);
    const __m128i idxidy = __lsx_vadd_h(dxdy, __lsx_vsub_h(v_256, __lsx_vadd_h(distx_, disty_)));
    const __m128i dxidy = __lsx_vsub_h(distx_, dxdy);
    const __m128i idxdy = __lsx_vsub_h(disty_, dxdy);

    __m128i tlAG = __lsx_vsrli_h(tl, 8);
    __m128i tlRB = __lsx_vand_v(tl, colorMask);
    __m128i trAG = __lsx_vsrli_h(tr, 8);
    __m128i trRB = __lsx_vand_v(tr, colorMask);
    __m128i blAG = __lsx_vsrli_h(bl, 8);
    __m128i blRB = __lsx_vand_v(bl, colorMask);
    __m128i brAG = __lsx_vsrli_h(br, 8);
    __m128i brRB = __lsx_vand_v(br, colorMask);

    tlAG = __lsx_vmul_h(tlAG, idxidy);
    tlRB = __lsx_vmul_h(tlRB, idxidy);
    trAG = __lsx_vmul_h(trAG, dxidy);
    trRB = __lsx_vmul_h(trRB, dxidy);
    blAG = __lsx_vmul_h(blAG, idxdy);
    blRB = __lsx_vmul_h(blRB, idxdy);
    brAG = __lsx_vmul_h(brAG, dxdy);
    brRB = __lsx_vmul_h(brRB, dxdy);

    __m128i rAG = __lsx_vadd_h(__lsx_vadd_h(tlAG, trAG), __lsx_vadd_h(blAG, brAG));
    __m128i rRB = __lsx_vadd_h(__lsx_vadd_h(tlRB, trRB), __lsx_vadd_h(blRB, brRB));
    rAG = __lsx_vandn_v(colorMask, rAG);
    rRB = __lsx_vsrli_h(rRB, 8);
    __lsx_vst(__lsx_vor_v(rAG, rRB), b, 0);
}
#endif
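For reference, the weight algebra above derives all four bilinear factors from the 4-bit fractions with a single multiply: idxidy = dx*dy + 256 - 16*dx - 16*dy equals (16-dx)*(16-dy), and similarly for the mixed terms. A scalar sketch of the same blend on one 8-bit channel (illustrative, not a Qt helper):

static inline unsigned interpolateChannel16(unsigned tl, unsigned tr, unsigned bl,
                                            unsigned br, unsigned dx, unsigned dy)
{
    // dx, dy are 4-bit fractions in [0, 16]; the four weights sum to 256
    const unsigned idx = 16 - dx, idy = 16 - dy;
    return (tl*idx*idy + tr*dx*idy + bl*idx*dy + br*dx*dy) >> 8;
}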
template<TextureBlendType blendType>
void fetchTransformedBilinear_pixelBounds(int max, int l1, int l2, int &v1, int &v2);
@@ -1426,6 +1465,36 @@ static void QT_FASTCALL fetchTransformedBilinearARGB32PM_simple_scale_helper(uin
        rRB = vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(rRB), 8));
        vst1q_s16((int16_t*)(&intermediate.buffer_rb[f]), rRB);
    }
#elif defined(__loongarch_sx)
    const __m128i disty_ = __lsx_vreplgr2vr_h(disty);
    const __m128i idisty_ = __lsx_vreplgr2vr_h(idisty);
    const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff);

    lim -= 3;
    for (; f < lim; x += 4, f += 4) {
        // Load 4 pixels from s1, and split the alpha-green and red-blue components
        __m128i top = __lsx_vld((const __m128i*)((const uint *)(s1)+x), 0);
        __m128i topAG = __lsx_vsrli_h(top, 8);
        __m128i topRB = __lsx_vand_v(top, colorMask);
        // Multiply each color component by idisty
        topAG = __lsx_vmul_h(topAG, idisty_);
        topRB = __lsx_vmul_h(topRB, idisty_);

        // Same for the s2 vector
        __m128i bottom = __lsx_vld((const __m128i*)((const uint *)(s2)+x), 0);
        __m128i bottomAG = __lsx_vsrli_h(bottom, 8);
        __m128i bottomRB = __lsx_vand_v(bottom, colorMask);
        bottomAG = __lsx_vmul_h(bottomAG, disty_);
        bottomRB = __lsx_vmul_h(bottomRB, disty_);

        // Add the values, and shift to keep only 8 significant bits per color
        __m128i rAG = __lsx_vadd_h(topAG, bottomAG);
        rAG = __lsx_vsrli_h(rAG, 8);
        __lsx_vst(rAG, (__m128i*)(&intermediate.buffer_ag[f]), 0);
        __m128i rRB = __lsx_vadd_h(topRB, bottomRB);
        rRB = __lsx_vsrli_h(rRB, 8);
        __lsx_vst(rRB, (__m128i*)(&intermediate.buffer_rb[f]), 0);
    }
#endif
    }
    for (; f < count; f++) { // Same as above but without simd
@@ -1615,6 +1684,33 @@ static void QT_FASTCALL fetchTransformedBilinearARGB32PM_downscale_helper(uint *
        b += 4;
        v_fx = vaddq_s32(v_fx, v_fdx);
    }
#elif defined (__loongarch_sx)
    const __m128i shuffleMask = (__m128i)(v8i16){0, 0, 2, 2, 4, 4, 6, 6};
    const __m128i v_disty = __lsx_vreplgr2vr_h(disty4);
    const __m128i v_fdx = __lsx_vreplgr2vr_w(fdx*4);
    const __m128i v_fx_r = __lsx_vreplgr2vr_w(0x8);
    __m128i v_fx = (__m128i)(v4i32){fx, fx + fdx, fx + fdx + fdx, fx + fdx + fdx + fdx};

    while (b < boundedEnd - 3) {
        __m128i offset = __lsx_vsrli_w(v_fx, 16);
        const int offset0 = __lsx_vpickve2gr_w(offset, 0);
        const int offset1 = __lsx_vpickve2gr_w(offset, 1);
        const int offset2 = __lsx_vpickve2gr_w(offset, 2);
        const int offset3 = __lsx_vpickve2gr_w(offset, 3);
        const __m128i tl = (__m128i)(v4u32){s1[offset0], s1[offset1], s1[offset2], s1[offset3]};
        const __m128i tr = (__m128i)(v4u32){s1[offset0 + 1], s1[offset1 + 1], s1[offset2 + 1], s1[offset3 + 1]};
        const __m128i bl = (__m128i)(v4u32){s2[offset0], s2[offset1], s2[offset2], s2[offset3]};
        const __m128i br = (__m128i)(v4u32){s2[offset0 + 1], s2[offset1 + 1], s2[offset2 + 1], s2[offset3 + 1]};
        __m128i v_distx = __lsx_vsrli_h(v_fx, 8);
        v_distx = __lsx_vsrli_h(__lsx_vadd_w(v_distx, v_fx_r), 4);
        v_distx = __lsx_vshuf_h(shuffleMask, v_distx, v_distx);
        interpolate_4_pixels_16_lsx(tl, tr, bl, br, v_distx, v_disty, b);
        b += 4;
        v_fx = __lsx_vadd_w(v_fx, v_fdx);
    }
    fx = __lsx_vpickve2gr_w(v_fx, 0);
#endif
    while (b < boundedEnd) {
        int x = (fx >> 16);
@@ -1852,6 +1948,50 @@ static void QT_FASTCALL fetchTransformedBilinearARGB32PM_fast_rotate_helper(uint
        v_fx = vaddq_s32(v_fx, v_fdx);
        v_fy = vaddq_s32(v_fy, v_fdy);
    }
#elif defined(__loongarch_sx)
    const __m128i v_fdx = __lsx_vreplgr2vr_w(fdx*4);
    const __m128i v_fdy = __lsx_vreplgr2vr_w(fdy*4);
    const __m128i v_fxy_r = __lsx_vreplgr2vr_w(0x8);
    __m128i v_fx = (__m128i)(v4i32){fx, fx + fdx, fx + fdx + fdx, fx + fdx + fdx + fdx};
    __m128i v_fy = (__m128i)(v4i32){fy, fy + fdy, fy + fdy + fdy, fy + fdy + fdy + fdy};

    const uchar *textureData = image.imageData;
    const qsizetype bytesPerLine = image.bytesPerLine;
    const __m128i zero = __lsx_vldi(0);
    const __m128i shuffleMask = (__m128i)(v8i16){0, 0, 0, 0, 4, 5, 6, 7};
    const __m128i shuffleMask1 = (__m128i)(v8i16){0, 0, 2, 2, 4, 4, 6, 6};
    const __m128i vbpl = __lsx_vshuf_h(shuffleMask, zero, __lsx_vinsgr2vr_w(zero, bytesPerLine/4, 0));

    while (b < boundedEnd - 3) {
        const __m128i vy = __lsx_vpickev_h(zero, __lsx_vsat_w(__lsx_vsrli_w(v_fy, 16), 15));
        // 4x16bit * 4x16bit -> 4x32bit
        __m128i offset = __lsx_vilvl_h(__lsx_vmuh_h(vy, vbpl), __lsx_vmul_h(vy, vbpl));
        offset = __lsx_vadd_w(offset, __lsx_vsrli_w(v_fx, 16));
        const int offset0 = __lsx_vpickve2gr_w(offset, 0);
        const int offset1 = __lsx_vpickve2gr_w(offset, 1);
        const int offset2 = __lsx_vpickve2gr_w(offset, 2);
        const int offset3 = __lsx_vpickve2gr_w(offset, 3);
        const uint *topData = (const uint *)(textureData);
        const __m128i tl = (__m128i)(v4u32){topData[offset0], topData[offset1], topData[offset2], topData[offset3]};
        const __m128i tr = (__m128i)(v4u32){topData[offset0 + 1], topData[offset1 + 1], topData[offset2 + 1], topData[offset3 + 1]};
        const uint *bottomData = (const uint *)(textureData + bytesPerLine);
        const __m128i bl = (__m128i)(v4u32){bottomData[offset0], bottomData[offset1], bottomData[offset2], bottomData[offset3]};
        const __m128i br = (__m128i)(v4u32){bottomData[offset0 + 1], bottomData[offset1 + 1], bottomData[offset2 + 1], bottomData[offset3 + 1]};
        __m128i v_distx = __lsx_vsrli_h(v_fx, 8);
        __m128i v_disty = __lsx_vsrli_h(v_fy, 8);
        v_distx = __lsx_vsrli_h(__lsx_vadd_w(v_distx, v_fxy_r), 4);
        v_disty = __lsx_vsrli_h(__lsx_vadd_w(v_disty, v_fxy_r), 4);
        v_distx = __lsx_vshuf_h(shuffleMask1, zero, v_distx);
        v_disty = __lsx_vshuf_h(shuffleMask1, zero, v_disty);
        interpolate_4_pixels_16_lsx(tl, tr, bl, br, v_distx, v_disty, b);
        b += 4;
        v_fx = __lsx_vadd_w(v_fx, v_fdx);
        v_fy = __lsx_vadd_w(v_fy, v_fdy);
    }
    fx = __lsx_vpickve2gr_w(v_fx, 0);
    fy = __lsx_vpickve2gr_w(v_fy, 0);
#endif
    while (b < boundedEnd) {
        int x = (fx >> 16);


@@ -1141,6 +1141,108 @@ static inline void qConvertARGB32PMToRGBA64PM_neon(QRgba64 *buffer, const uint *
        *buffer++ = QRgba64::fromArgb32(s);
    }
}
#elif defined __loongarch_sx
template<bool RGBA, bool maskAlpha>
static inline void qConvertARGB32PMToRGBA64PM_lsx(QRgba64 *buffer, const uint *src, int count)
{
    if (count <= 0)
        return;

    const __m128i amask = __lsx_vreplgr2vr_w(0xff000000);
    const __m128i shuffleMask = (__m128i)(v8i16){2, 1, 0, 3, 6, 5, 4, 7};
    int i = 0;
    for (; ((uintptr_t)buffer & 0xf) && i < count; ++i) {
        uint s = *src++;
        if (maskAlpha)
            s = s | 0xff000000;
        if (RGBA)
            s = RGBA2ARGB(s);
        *buffer++ = QRgba64::fromArgb32(s);
    }
    for (; i < count-3; i += 4) {
        __m128i vs = __lsx_vld((const __m128i*)src, 0);
        if (maskAlpha)
            vs = __lsx_vor_v(vs, amask);
        src += 4;
        __m128i v1 = __lsx_vilvl_b(vs, vs);
        __m128i v2 = __lsx_vilvh_b(vs, vs);
        if (!RGBA) {
            v1 = __lsx_vshuf_h(shuffleMask, v1, v1);
            v2 = __lsx_vshuf_h(shuffleMask, v2, v2);
        }
        __lsx_vst(v1, buffer, 0);
        buffer += 2;
        __lsx_vst(v2, buffer, 0);
        buffer += 2;
    }
    SIMD_EPILOGUE(i, count, 3) {
        uint s = *src++;
        if (maskAlpha)
            s = s | 0xff000000;
        if (RGBA)
            s = RGBA2ARGB(s);
        *buffer++ = QRgba64::fromArgb32(s);
    }
}
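The byte self-interleave above (__lsx_vilvl_b(vs, vs)) widens each 8-bit channel by duplicating it, which is exact 8-to-16-bit range scaling, since c | (c << 8) == c * 257 maps 0xff to 0xffff. A scalar equivalent (illustrative):

static inline unsigned short widen8To16(unsigned char c)
{
    return (unsigned short)(c * 257);  // == (c << 8) | c
}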
template<QtPixelOrder PixelOrder>
static inline void qConvertRGBA64PMToA2RGB30PM_lsx(uint *dest, const QRgba64 *buffer, int count)
{
    const __m128i gmask = __lsx_vreplgr2vr_w(0x000ffc00);
    const __m128i cmask = __lsx_vreplgr2vr_w(0x000003ff);
    int i = 0;
    __m128i vr, vg, vb, va;
    for (; i < count && uintptr_t(buffer) & 0xF; ++i) {
        *dest++ = qConvertRgb64ToRgb30<PixelOrder>(*buffer++);
    }
    for (; i < count-15; i += 16) {
        // OR and AND the alphas of 16 pixels to detect all-opaque / all-transparent runs
        __m128i vOr = __lsx_vreplgr2vr_w(0);
        __m128i vAnd = __lsx_vreplgr2vr_w(0xffffffff);
        for (int j = 0; j < 16; j += 2) {
            __m128i vs = __lsx_vld((const __m128i*)(buffer + j), 0);
            vOr = __lsx_vor_v(vOr, vs);
            vAnd = __lsx_vand_v(vAnd, vs);
        }
        const quint16 orAlpha = ((uint)__lsx_vpickve2gr_h(vOr, 3)) | ((uint)__lsx_vpickve2gr_h(vOr, 7));
        const quint16 andAlpha = ((uint)__lsx_vpickve2gr_h(vAnd, 3)) & ((uint)__lsx_vpickve2gr_h(vAnd, 7));

        if (andAlpha == 0xffff) {
            for (int j = 0; j < 16; j += 2) {
                __m128i vs = __lsx_vld((const __m128i*)buffer, 0);
                buffer += 2;
                vr = __lsx_vsrli_d(vs, 6);
                vg = __lsx_vsrli_d(vs, 16 + 6 - 10);
                vb = __lsx_vsrli_d(vs, 32 + 6);
                vr = __lsx_vand_v(vr, cmask);
                vg = __lsx_vand_v(vg, gmask);
                vb = __lsx_vand_v(vb, cmask);
                va = __lsx_vsrli_d(vs, 48 + 14);
                if (PixelOrder == PixelOrderRGB)
                    vr = __lsx_vslli_w(vr, 20);
                else
                    vb = __lsx_vslli_w(vb, 20);
                va = __lsx_vslli_w(va, 30);
                __m128i vd = __lsx_vor_v(__lsx_vor_v(vr, vg), __lsx_vor_v(vb, va));
                vd = __lsx_vshuf4i_w(vd, 0b11011000);
                __lsx_vstelm_d(vd, dest, 0, 0);
                dest += 2;
            }
        } else if (orAlpha == 0) {
            for (int j = 0; j < 16; ++j) {
                *dest++ = 0;
                buffer++;
            }
        } else {
            for (int j = 0; j < 16; ++j)
                *dest++ = qConvertRgb64ToRgb30<PixelOrder>(*buffer++);
        }
    }
    SIMD_EPILOGUE(i, count, 15)
        *dest++ = qConvertRgb64ToRgb30<PixelOrder>(*buffer++);
}
#endif
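In the all-opaque fast path above, each 16-bit channel keeps only its top 10 bits and the alpha its top 2. A scalar sketch of the packing for PixelOrderRGB (illustrative, not the qConvertRgb64ToRgb30 helper itself):

static inline unsigned packA2Rgb30(unsigned short r, unsigned short g,
                                   unsigned short b, unsigned short a)
{
    return ((unsigned)(a >> 14) << 30) | ((unsigned)(r >> 6) << 20)
         | ((unsigned)(g >> 6) << 10) |  (unsigned)(b >> 6);
}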
static const QRgba64 *QT_FASTCALL convertRGB32ToRGB64(QRgba64 *buffer, const uint *src, int count,
@@ -1150,6 +1252,8 @@ static const QRgba64 *QT_FASTCALL convertRGB32ToRGB64(QRgba64 *buffer, const uin
    qConvertARGB32PMToRGBA64PM_sse2<false, true>(buffer, src, count);
#elif defined(__ARM_NEON__)
    qConvertARGB32PMToRGBA64PM_neon<false, true>(buffer, src, count);
#elif defined(__loongarch_sx)
    qConvertARGB32PMToRGBA64PM_lsx<false, true>(buffer, src, count);
#else
    for (int i = 0; i < count; ++i)
        buffer[i] = QRgba64::fromArgb32(0xff000000 | src[i]);
@@ -1184,6 +1288,8 @@ static const QRgba64 *QT_FASTCALL convertARGB32PMToRGBA64PM(QRgba64 *buffer, con
    qConvertARGB32PMToRGBA64PM_sse2<false, false>(buffer, src, count);
#elif defined(__ARM_NEON__)
    qConvertARGB32PMToRGBA64PM_neon<false, false>(buffer, src, count);
#elif defined(__loongarch_sx)
    qConvertARGB32PMToRGBA64PM_lsx<false, false>(buffer, src, count);
#else
    for (int i = 0; i < count; ++i)
        buffer[i] = QRgba64::fromArgb32(src[i]);
@@ -1238,6 +1344,8 @@ static const QRgba64 *QT_FASTCALL convertRGBA8888PMToRGBA64PM(QRgba64 *buffer, c
    qConvertARGB32PMToRGBA64PM_sse2<true, false>(buffer, src, count);
#elif defined(__ARM_NEON__)
    qConvertARGB32PMToRGBA64PM_neon<true, false>(buffer, src, count);
#elif defined(__loongarch_sx)
    qConvertARGB32PMToRGBA64PM_lsx<true, false>(buffer, src, count);
#else
    for (int i = 0; i < count; ++i)
        buffer[i] = QRgba64::fromArgb32(RGBA2ARGB(src[i]));
@@ -1348,6 +1456,48 @@ static inline void qConvertA2RGB30PMToRGBA64PM_sse2(QRgba64 *buffer, const uint
    SIMD_EPILOGUE(i, count, 3)
        *buffer++ = qConvertA2rgb30ToRgb64<PixelOrder>(*src++);
}
#elif defined(__loongarch_sx)
template<QtPixelOrder PixelOrder>
static inline void qConvertA2RGB30PMToRGBA64PM_lsx(QRgba64 *buffer, const uint *src, int count)
{
    if (count <= 0)
        return;

    const __m128i rmask = __lsx_vreplgr2vr_w(0x3ff00000);
    const __m128i gmask = __lsx_vreplgr2vr_w(0x000ffc00);
    const __m128i bmask = __lsx_vreplgr2vr_w(0x000003ff);
    const __m128i afactor = __lsx_vreplgr2vr_h(0x5555);
    int i = 0;

    for (; ((uintptr_t)buffer & 0xf) && i < count; ++i)
        *buffer++ = qConvertA2rgb30ToRgb64<PixelOrder>(*src++);

    for (; i < count-3; i += 4) {
        __m128i vs = __lsx_vld((const __m128i*)src, 0);
        src += 4;
        __m128i va = __lsx_vsrli_w(vs, 30);
        __m128i vr = __lsx_vand_v(vs, rmask);
        __m128i vb = __lsx_vand_v(vs, bmask);
        __m128i vg = __lsx_vand_v(vs, gmask);
        va = __lsx_vmul_h(va, afactor);
        vr = __lsx_vor_v(__lsx_vsrli_w(vr, 14), __lsx_vsrli_w(vr, 24));
        vg = __lsx_vor_v(__lsx_vsrli_w(vg, 4), __lsx_vsrli_w(vg, 14));
        vb = __lsx_vor_v(__lsx_vslli_w(vb, 6), __lsx_vsrli_w(vb, 4));
        __m128i vrb;
        if (PixelOrder == PixelOrderRGB)
            vrb = __lsx_vor_v(vr, __lsx_vbsll_v(vb, 2));
        else
            vrb = __lsx_vor_v(vb, __lsx_vbsll_v(vr, 2));
        __m128i vga = __lsx_vor_v(vg, __lsx_vbsll_v(va, 2));
        __lsx_vst(__lsx_vilvl_h(vga, vrb), buffer, 0);
        buffer += 2;
        __lsx_vst(__lsx_vilvh_h(vga, vrb), buffer, 0);
        buffer += 2;
    }

    SIMD_EPILOGUE(i, count, 3)
        *buffer++ = qConvertA2rgb30ToRgb64<PixelOrder>(*src++);
}
#endif
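The shift-and-or pairs above expand 10-bit channels to 16 bits exactly by replicating the high bits into the vacated low bits, and the 2-bit alpha is widened with a multiply by 0x5555 (0b11 * 0x5555 == 0xffff). A scalar sketch (illustrative):

static inline unsigned short expand10To16(unsigned c10)  // c10 in [0, 0x3ff]
{
    return (unsigned short)((c10 << 6) | (c10 >> 4));    // 0x3ff -> 0xffff
}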
template<QtPixelOrder PixelOrder>
@@ -1356,6 +1506,8 @@ static const QRgba64 *QT_FASTCALL convertA2RGB30PMToRGBA64PM(QRgba64 *buffer, co
{
#ifdef __SSE2__
    qConvertA2RGB30PMToRGBA64PM_sse2<PixelOrder>(buffer, src, count);
#elif defined (__loongarch_sx)
    qConvertA2RGB30PMToRGBA64PM_lsx<PixelOrder>(buffer, src, count);
#else
    for (int i = 0; i < count; ++i)
        buffer[i] = qConvertA2rgb30ToRgb64<PixelOrder>(src[i]);
@@ -1466,6 +1618,37 @@ void qt_convertRGBA64ToARGB32(uint *dst, const QRgba64 *src, int count)
        _mm_storel_epi64((__m128i*)(dst), v1);
        dst += 2;
    }
#elif defined(__loongarch_sx)
    if (((uintptr_t)dst & 0x7) && count > 0) {
        uint s = (*src++).toArgb32();
        if (RGBA)
            s = ARGB2RGBA(s);
        *dst++ = s;
        i++;
    }
    const __m128i vhalf = __lsx_vreplgr2vr_w(0x80);
    const __m128i vzero = __lsx_vldi(0);
    const __m128i shuffleMask = (__m128i)(v8i16){2, 1, 0, 3, 6, 5, 4, 7};
    for (; i < count-1; i += 2) {
        __m128i vs = __lsx_vld((const __m128i*)src, 0);
        src += 2;
        if (!RGBA) {
            vs = __lsx_vshuf_h(shuffleMask, vzero, vs);
        }
        __m128i v1 = __lsx_vilvl_h(vzero, vs);
        __m128i v2 = __lsx_vilvh_h(vzero, vs);
        v1 = __lsx_vadd_w(v1, vhalf);
        v2 = __lsx_vadd_w(v2, vhalf);
        v1 = __lsx_vsub_w(v1, __lsx_vsrli_w(v1, 8));
        v2 = __lsx_vsub_w(v2, __lsx_vsrli_w(v2, 8));
        v1 = __lsx_vsrli_w(v1, 8);
        v2 = __lsx_vsrli_w(v2, 8);
        v1 = __lsx_vpickev_h(__lsx_vsat_w(v2, 15), __lsx_vsat_w(v1, 15));
        v1 = __lsx_vmaxi_h(v1, 0);
        v1 = __lsx_vpickev_b(vzero, __lsx_vsat_hu(v1, 7));
        __lsx_vstelm_d(v1, dst, 0, 0);
        dst += 2;
    }
#endif
    for (; i < count; i++) {
        uint s = (*src++).toArgb32();
@@ -1902,6 +2085,8 @@ static void QT_FASTCALL storeRGB30FromRGBA64PM(uchar *dest, const QRgba64 *src,
    uint *d = (uint*)dest + index;
#ifdef __SSE2__
    qConvertRGBA64PMToA2RGB30PM_sse2<PixelOrder>(d, src, count);
#elif defined (__loongarch_sx)
    qConvertRGBA64PMToA2RGB30PM_lsx<PixelOrder>(d, src, count);
#else
    for (int i = 0; i < count; ++i)
        d[i] = qConvertRgb64ToRgb30<PixelOrder>(src[i]);


@@ -57,6 +57,24 @@ static inline uint16x4_t multiplyAlpha65535(uint16x4_t rgba64, uint alpha65535)
    vs32 = vsraq_n_u32(vs32, vs32, 16); // vs = vs + (vs >> 16)
    return vrshrn_n_u32(vs32, 16); // vs = (vs + 0x8000) >> 16
}
#elif defined(__loongarch_sx)
static inline __m128i Q_DECL_VECTORCALL multiplyAlpha65535(__m128i rgba64, __m128i va)
{
    __m128i vs = rgba64;
    vs = __lsx_vilvl_h(__lsx_vmuh_hu(vs, va), __lsx_vmul_h(vs, va));
    vs = __lsx_vadd_w(vs, __lsx_vsrli_w(vs, 16));
    vs = __lsx_vadd_w(vs, __lsx_vreplgr2vr_w(0x8000));
    vs = __lsx_vsrai_w(vs, 16);
    vs = __lsx_vpickev_h(__lsx_vsat_w(vs, 15), __lsx_vsat_w(vs, 15));
    return vs;
}
static inline __m128i Q_DECL_VECTORCALL multiplyAlpha65535(__m128i rgba64, uint alpha65535)
{
    const __m128i shuffleMask = (__m128i)(v8i16){0, 0, 0, 0, 4, 5, 6, 7};
    const __m128i va = __lsx_vshuf_h(shuffleMask, __lsx_vldi(0),
                                     __lsx_vinsgr2vr_w(__lsx_vldi(0), alpha65535, 0));
    return multiplyAlpha65535(rgba64, va);
}
#endif
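The add-shift sequence above is the standard fixed-point substitute for a division by 65535. A scalar sketch of the same rounding (illustrative; it mirrors what qt_div_65535 computes):

static inline unsigned short mulDiv65535(unsigned c, unsigned a)  // c, a in [0, 65535]
{
    unsigned t = c * a;
    return (unsigned short)((t + (t >> 16) + 0x8000u) >> 16);  // rounded t / 65535
}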
static inline QRgba64 multiplyAlpha65535(QRgba64 rgba64, uint alpha65535)
@@ -73,6 +91,12 @@ static inline QRgba64 multiplyAlpha65535(QRgba64 rgba64, uint alpha65535)
    QRgba64 r;
    vst1_u64(reinterpret_cast<uint64_t *>(&r), vreinterpret_u64_u16(vr));
    return r;
#elif defined(__loongarch_sx)
    const __m128i v = __lsx_vldrepl_d(reinterpret_cast<const __m128i *>(&rgba64), 0);
    const __m128i vr = multiplyAlpha65535(v, alpha65535);
    QRgba64 r;
    __lsx_vstelm_d(vr, reinterpret_cast<__m128i *>(&r), 0, 0);
    return r;
#else
    return QRgba64::fromRgba64(qt_div_65535(rgba64.red() * alpha65535),
                               qt_div_65535(rgba64.green() * alpha65535),
@@ -81,7 +105,7 @@ static inline QRgba64 multiplyAlpha65535(QRgba64 rgba64, uint alpha65535)
#endif
}
-#if defined(__SSE2__) || defined(__ARM_NEON__)
+#if defined(__SSE2__) || defined(__ARM_NEON__) || defined(__loongarch_sx)
template<typename T>
static inline T Q_DECL_VECTORCALL multiplyAlpha255(T rgba64, uint alpha255)
{
@@ -112,6 +136,14 @@ inline uint16x4_t interpolate255(uint16x4_t x, uint alpha1, uint16x4_t y, uint a
}
#endif
#if defined __loongarch_sx
static inline __m128i Q_DECL_VECTORCALL
interpolate255(__m128i x, uint alpha1, __m128i y, uint alpha2)
{
    return __lsx_vadd_h(multiplyAlpha255(x, alpha1), multiplyAlpha255(y, alpha2));
}
#endif
static inline QRgba64 interpolate255(QRgba64 x, uint alpha1, QRgba64 y, uint alpha2)
{
#if defined(__SSE2__)
@@ -128,6 +160,13 @@ static inline QRgba64 interpolate255(QRgba64 x, uint alpha1, QRgba64 y, uint alp
    QRgba64 r;
    vst1_u64(reinterpret_cast<uint64_t *>(&r), vreinterpret_u64_u16(vr));
    return r;
#elif defined(__loongarch_sx)
    const __m128i vx = __lsx_vldrepl_d(reinterpret_cast<const __m128i *>(&x), 0);
    const __m128i vy = __lsx_vldrepl_d(reinterpret_cast<const __m128i *>(&y), 0);
    const __m128i vr = interpolate255(vx, alpha1, vy, alpha2);
    QRgba64 r;
    __lsx_vstelm_d(vr, reinterpret_cast<__m128i *>(&r), 0, 0);
    return r;
#else
    return QRgba64::fromRgba64(multiplyAlpha255(x, alpha1) + multiplyAlpha255(y, alpha2));
#endif
@@ -156,6 +195,18 @@ inline uint16x4_t interpolate65535(uint16x4_t x, uint16x4_t alpha1, uint16x4_t y
}
#endif
#if defined __loongarch_sx
static inline __m128i Q_DECL_VECTORCALL interpolate65535(__m128i x, uint alpha1, __m128i y, uint alpha2)
{
    return __lsx_vadd_h(multiplyAlpha65535(x, alpha1), multiplyAlpha65535(y, alpha2));
}
static inline __m128i Q_DECL_VECTORCALL interpolate65535(__m128i x, __m128i alpha1, __m128i y, __m128i alpha2)
{
    return __lsx_vadd_h(multiplyAlpha65535(x, alpha1), multiplyAlpha65535(y, alpha2));
}
#endif
static inline QRgba64 interpolate65535(QRgba64 x, uint alpha1, QRgba64 y, uint alpha2)
{
#if defined(__SSE2__)
@@ -172,6 +223,13 @@ static inline QRgba64 interpolate65535(QRgba64 x, uint alpha1, QRgba64 y, uint a
    QRgba64 r;
    vst1_u64(reinterpret_cast<uint64_t *>(&r), vreinterpret_u64_u16(vr));
    return r;
#elif defined(__loongarch_sx)
    const __m128i vx = __lsx_vldrepl_d(reinterpret_cast<const __m128i *>(&x), 0);
    const __m128i vy = __lsx_vldrepl_d(reinterpret_cast<const __m128i *>(&y), 0);
    const __m128i vr = interpolate65535(vx, alpha1, vy, alpha2);
    QRgba64 r;
    __lsx_vstelm_d(vr, reinterpret_cast<__m128i *>(&r), 0, 0);
    return r;
#else
    return QRgba64::fromRgba64(multiplyAlpha65535(x, alpha1) + multiplyAlpha65535(y, alpha2));
#endif
@@ -192,6 +250,13 @@ static inline QRgba64 addWithSaturation(QRgba64 a, QRgba64 b)
    QRgba64 r;
    vst1_u64(reinterpret_cast<uint64_t *>(&r), vreinterpret_u64_u16(vqadd_u16(va, vb)));
    return r;
#elif defined(__loongarch_sx)
    const __m128i va = __lsx_vldrepl_d(reinterpret_cast<const __m128i *>(&a), 0);
    const __m128i vb = __lsx_vldrepl_d(reinterpret_cast<const __m128i *>(&b), 0);
    const __m128i vr = __lsx_vsadd_hu(va, vb);
    QRgba64 r;
    __lsx_vstelm_d(vr, reinterpret_cast<__m128i *>(&r), 0, 0);
    return r;
#else
    return QRgba64::fromRgba64(qMin(a.red() + b.red(), 65535),
@@ -221,6 +286,18 @@ static inline uint toArgb32(uint16x4_t v)
    uint8x8_t v8 = vmovn_u16(vcombine_u16(v, v));
    return vget_lane_u32(vreinterpret_u32_u8(v8), 0);
}
#elif defined __loongarch_sx
static inline uint Q_DECL_VECTORCALL toArgb32(__m128i v)
{
    v = __lsx_vilvl_h(__lsx_vldi(0), v);
    v = __lsx_vadd_w(v, __lsx_vreplgr2vr_w(128));
    v = __lsx_vsub_w(v, __lsx_vsrli_w(v, 8));
    v = __lsx_vsrli_w(v, 8);
    v = __lsx_vpickev_h(__lsx_vsat_w(v, 15), __lsx_vsat_w(v, 15));
    __m128i tmp = __lsx_vmaxi_h(v, 0);
    v = __lsx_vpickev_b(__lsx_vsat_hu(tmp, 7), __lsx_vsat_hu(tmp, 7));
    return __lsx_vpickve2gr_w(v, 0);
}
#endif
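The add/subtract/shift sequence above narrows each 16-bit channel to 8 bits with rounding, i.e. a rounded division by 257 (the inverse of the widening by 257 used in the conversions to RGBA64). A scalar sketch (illustrative):

static inline unsigned char narrow16To8(unsigned short v)
{
    unsigned t = v + 128u;                        // bias for rounding
    return (unsigned char)((t - (t >> 8)) >> 8);  // rounded v / 257; 0xffff -> 0xff
}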
static inline uint toArgb32(QRgba64 rgba64)
@@ -238,6 +315,11 @@ static inline uint toArgb32(QRgba64 rgba64)
    v = vext_u16(v, v, 3);
#endif
    return toArgb32(v);
#elif defined __loongarch_sx
    __m128i v = __lsx_vldrepl_d(reinterpret_cast<const __m128i *>(&rgba64), 0);
    const __m128i shuffleMask = (__m128i)(v8i16){2, 1, 0, 3, 4, 5, 6, 7};
    v = __lsx_vshuf_h(shuffleMask, __lsx_vldi(0), v);
    return toArgb32(v);
#else
    return rgba64.toArgb32();
#endif
@@ -251,6 +333,9 @@ static inline uint toRgba8888(QRgba64 rgba64)
#elif defined __ARM_NEON__
    uint16x4_t v = vreinterpret_u16_u64(vld1_u64(reinterpret_cast<const uint64_t *>(&rgba64)));
    return toArgb32(v);
#elif defined __loongarch_sx
    __m128i v = __lsx_vldrepl_d(reinterpret_cast<const __m128i *>(&rgba64), 0);
    return toArgb32(v);
#else
    return ARGB2RGBA(toArgb32(rgba64));
#endif
@@ -289,6 +374,23 @@ static inline QRgba64 rgbBlend(QRgba64 d, QRgba64 s, uint rgbAlpha)
    vd32 = vsraq_n_u32(vd32, vd32, 16);
    vd = vrshrn_n_u32(vd32, 16);
    vst1_u64(reinterpret_cast<uint64_t *>(&blend), vreinterpret_u64_u16(vd));
#elif defined(__loongarch_sx)
    __m128i vd = __lsx_vldrepl_d(reinterpret_cast<const __m128i *>(&d), 0);
    __m128i vs = __lsx_vldrepl_d(reinterpret_cast<const __m128i *>(&s), 0);
    __m128i va = __lsx_vinsgr2vr_w(__lsx_vldi(0), rgbAlpha, 0);
    va = __lsx_vilvl_b(va, va);
    const __m128i shuffleMask = (__m128i)(v8i16){2, 1, 0, 3, 4, 5, 6, 7};
    va = __lsx_vshuf_h(shuffleMask, __lsx_vldi(0), va);
    __m128i vb = __lsx_vxor_v(__lsx_vreplgr2vr_h(-1), va);
    vs = __lsx_vilvl_h(__lsx_vmuh_hu(vs, va), __lsx_vmul_h(vs, va));
    vd = __lsx_vilvl_h(__lsx_vmuh_hu(vd, vb), __lsx_vmul_h(vd, vb));
    vd = __lsx_vadd_w(vd, vs);
    vd = __lsx_vadd_w(vd, __lsx_vsrli_w(vd, 16));
    vd = __lsx_vadd_w(vd, __lsx_vreplgr2vr_w(0x8000));
    vd = __lsx_vsrai_w(vd, 16);
    vd = __lsx_vpickev_h(__lsx_vsat_w(vd, 15), __lsx_vsat_w(vd, 15));
    __lsx_vstelm_d(vd, reinterpret_cast<__m128i *>(&blend), 0, 0);
#else
    const int mr = qRed(rgbAlpha);
    const int mg = qGreen(rgbAlpha);
@@ -318,6 +420,13 @@ static inline void blend_pixel(QRgba64 &dst, QRgba64 src)
    const uint16x4_t via = veor_u16(vdup_n_u16(0xffff), vdup_lane_u16(vs, 3));
    const uint16x4_t vr = vadd_u16(vs, multiplyAlpha65535(vd, via));
    vst1_u64(reinterpret_cast<uint64_t *>(&dst), vreinterpret_u64_u16(vr));
#elif defined(__loongarch_sx)
    const __m128i vd = __lsx_vldrepl_d(reinterpret_cast<const __m128i *>(&dst), 0);
    const __m128i vs = __lsx_vldrepl_d(reinterpret_cast<const __m128i *>(&src), 0);
    const __m128i shuffleMask = (__m128i)(v8i16){3, 3, 3, 3, 4, 5, 6, 7};
    const __m128i via = __lsx_vxor_v(__lsx_vreplgr2vr_h(-1), __lsx_vshuf_h(shuffleMask, __lsx_vldi(0), vs));
    const __m128i vr = __lsx_vadd_h(vs, multiplyAlpha65535(vd, via));
    __lsx_vstelm_d(vr, reinterpret_cast<__m128i *>(&dst), 0, 0);
#else
    dst = src + multiplyAlpha65535(dst, 65535 - src.alpha());
#endif
@@ -343,6 +452,14 @@ static inline void blend_pixel(QRgba64 &dst, QRgba64 src, const int const_alpha)
    const uint16x4_t via = veor_u16(vdup_n_u16(0xffff), vdup_lane_u16(vs, 3));
    const uint16x4_t vr = vadd_u16(vs, multiplyAlpha65535(vd, via));
    vst1_u64(reinterpret_cast<uint64_t *>(&dst), vreinterpret_u64_u16(vr));
#elif defined(__loongarch_sx)
    const __m128i vd = __lsx_vldrepl_d(reinterpret_cast<const __m128i *>(&dst), 0);
    __m128i vs = __lsx_vldrepl_d(reinterpret_cast<const __m128i *>(&src), 0);
    vs = multiplyAlpha255(vs, const_alpha);
    const __m128i shuffleMask = (__m128i)(v8i16){3, 3, 3, 3, 4, 5, 6, 7};
    const __m128i via = __lsx_vxor_v(__lsx_vreplgr2vr_h(-1), __lsx_vshuf_h(shuffleMask, __lsx_vldi(0), vs));
    const __m128i vr = __lsx_vadd_h(vs, multiplyAlpha65535(vd, via));
    __lsx_vstelm_d(vr, reinterpret_cast<__m128i *>(&dst), 0, 0);
#else
    src = multiplyAlpha255(src, const_alpha);
    dst = src + multiplyAlpha65535(dst, 65535 - src.alpha());