Add some special LSX optimizations
Qt's build system does not pass the -mlsx compiler flag for these files, so for this code to be compiled in, the compiler must have LSX auto-vectorization enabled by default for all files.

Change-Id: I90c5029b673f831d39591ffd96c36e7762c68fb0
Reviewed-by: Volker Hilsheimer <volker.hilsheimer@qt.io>
Parent: dfc84993b7
Commit: f9185516eb
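Why the flag has to be global: __loongarch_sx is predefined by GCC and Clang only when a translation unit is compiled with -mlsx, and this patch selects its fast paths purely through preprocessor checks. A minimal sketch of the selection pattern the patch extends (taken from the first hunk below; illustrative rather than the verbatim source):

    // Compile-time backend selection: if a file is built without -mlsx,
    // __loongarch_sx is undefined and the generic C path is chosen silently.
    #if defined(__SSE2__)
    typedef Rgba64OperationsSSE2 Rgba64Operations;
    #elif defined(__ARM_NEON__)
    typedef Rgba64OperationsNEON Rgba64Operations;
    #elif defined(__loongarch_sx)   // only defined under -mlsx
    typedef Rgba64OperationsLSX Rgba64Operations;
    #else
    typedef Rgba64OperationsC Rgba64Operations;
    #endif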
@@ -280,10 +280,78 @@ struct Rgba64OperationsNEON : public Rgba64OperationsBase
 };
 #endif
 
+#if defined(__loongarch_sx)
+struct Rgba64OperationsLSX : public Rgba64OperationsBase
+{
+    typedef __m128i OptimalType;
+    typedef __m128i OptimalScalar;
+    static OptimalType load(const Type *ptr)
+    {
+        return __lsx_vilvl_d(__lsx_vldi(0), __lsx_vldrepl_d(reinterpret_cast<const __m128i *>(ptr), 0));
+    }
+    static OptimalType convert(const Type &value)
+    {
+        return __lsx_vinsgr2vr_d(__lsx_vldi(0), value, 0);
+    }
+    static void store(Type *ptr, OptimalType value)
+    {
+        // note: the destination cast must be non-const for a store
+        __lsx_vstelm_d(value, reinterpret_cast<__m128i *>(ptr), 0, 0);
+    }
+    static OptimalType add(OptimalType a, OptimalType b)
+    {
+        return __lsx_vadd_h(a, b);
+    }
+    // same as above:
+    // static OptimalScalar add(OptimalScalar a, OptimalScalar b)
+    static OptimalType plus(OptimalType a, OptimalType b)
+    {
+        return __lsx_vsadd_hu(a, b);
+    }
+    static OptimalScalar alpha(OptimalType c)
+    {
+        const __m128i shuffleMask = (__m128i)(v8i16){3, 3, 3, 3, 4, 5, 6, 7};
+        return __lsx_vshuf_h(shuffleMask, __lsx_vldi(0), c);
+    }
+    static OptimalScalar invAlpha(Scalar c)
+    {
+        return scalar(65535 - c);
+    }
+    static OptimalScalar invAlpha(OptimalType c)
+    {
+        return __lsx_vxor_v(__lsx_vreplgr2vr_h(-1), alpha(c));
+    }
+    static OptimalScalar scalar(Scalar n)
+    {
+        const __m128i shuffleMask = (__m128i)(v8i16){0, 0, 0, 0, 4, 5, 6, 7};
+        return __lsx_vshuf_h(shuffleMask, __lsx_vldi(0), __lsx_vinsgr2vr_w(__lsx_vldi(0), n, 0));
+    }
+    static OptimalType multiplyAlpha8bit(OptimalType val, uint8_t a)
+    {
+        return multiplyAlpha255(val, a);
+    }
+    // same as above:
+    // static OptimalScalar multiplyAlpha8bit(OptimalScalar a, uint8_t a)
+    static OptimalType interpolate8bit(OptimalType x, uint8_t a1, OptimalType y, uint8_t a2)
+    {
+        return interpolate255(x, a1, y, a2);
+    }
+    static OptimalType multiplyAlpha(OptimalType val, OptimalScalar a)
+    {
+        return multiplyAlpha65535(val, a);
+    }
+    static OptimalType interpolate(OptimalType x, OptimalScalar a1, OptimalType y, const OptimalScalar &a2)
+    {
+        return interpolate65535(x, a1, y, a2);
+    }
+};
+#endif
+
 #if defined(__SSE2__)
 typedef Rgba64OperationsSSE2 Rgba64Operations;
 #elif defined(__ARM_NEON__)
 typedef Rgba64OperationsNEON Rgba64Operations;
+#elif defined(__loongarch_sx)
+typedef Rgba64OperationsLSX Rgba64Operations;
 #else
 typedef Rgba64OperationsC Rgba64Operations;
 #endif
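For orientation: QRgba64 packs four 16-bit channels (r, g, b, a) into one quint64, and OptimalType keeps one such pixel in the low 64 bits of an __m128i. A scalar sketch of what alpha() and invAlpha() above compute per lane (an illustration, not part of the patch; the array view of the pixel is an assumption for readability):

    // alpha(): broadcast the 16-bit alpha word (lane 3) into all four lanes.
    static inline void alphaScalar(const quint16 px[4], quint16 out[4])
    {
        for (int i = 0; i < 4; ++i)
            out[i] = px[3];                 // lane 3 holds alpha
    }
    // invAlpha(): the same value bitwise-inverted, i.e. 65535 - alpha,
    // which is what the XOR with 0xffff (vreplgr2vr_h(-1)) computes.
    static inline void invAlphaScalar(const quint16 px[4], quint16 out[4])
    {
        for (int i = 0; i < 4; ++i)
            out[i] = quint16(65535 - px[3]);
    }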
@@ -1251,6 +1251,45 @@ static inline uint interpolate_4_pixels_16(uint tl, uint tr, uint bl, uint br, u
 }
 #endif
 
+#if defined(__loongarch_sx)
+static inline void interpolate_4_pixels_16_lsx(__m128i tl, __m128i tr, __m128i bl, __m128i br,
+                                               __m128i distx, __m128i disty, uint *b)
+{
+    const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff);
+    const __m128i v_256 = __lsx_vreplgr2vr_h(256);
+    const __m128i dxdy = __lsx_vmul_h(distx, disty);
+    const __m128i distx_ = __lsx_vslli_h(distx, 4);
+    const __m128i disty_ = __lsx_vslli_h(disty, 4);
+    const __m128i idxidy = __lsx_vadd_h(dxdy, __lsx_vsub_h(v_256, __lsx_vadd_h(distx_, disty_)));
+    const __m128i dxidy = __lsx_vsub_h(distx_, dxdy);
+    const __m128i idxdy = __lsx_vsub_h(disty_, dxdy);
+
+    __m128i tlAG = __lsx_vsrli_h(tl, 8);
+    __m128i tlRB = __lsx_vand_v(tl, colorMask);
+    __m128i trAG = __lsx_vsrli_h(tr, 8);
+    __m128i trRB = __lsx_vand_v(tr, colorMask);
+    __m128i blAG = __lsx_vsrli_h(bl, 8);
+    __m128i blRB = __lsx_vand_v(bl, colorMask);
+    __m128i brAG = __lsx_vsrli_h(br, 8);
+    __m128i brRB = __lsx_vand_v(br, colorMask);
+
+    tlAG = __lsx_vmul_h(tlAG, idxidy);
+    tlRB = __lsx_vmul_h(tlRB, idxidy);
+    trAG = __lsx_vmul_h(trAG, dxidy);
+    trRB = __lsx_vmul_h(trRB, dxidy);
+    blAG = __lsx_vmul_h(blAG, idxdy);
+    blRB = __lsx_vmul_h(blRB, idxdy);
+    brAG = __lsx_vmul_h(brAG, dxdy);
+    brRB = __lsx_vmul_h(brRB, dxdy);
+
+    __m128i rAG = __lsx_vadd_h(__lsx_vadd_h(tlAG, trAG), __lsx_vadd_h(blAG, brAG));
+    __m128i rRB = __lsx_vadd_h(__lsx_vadd_h(tlRB, trRB), __lsx_vadd_h(blRB, brRB));
+    rAG = __lsx_vandn_v(colorMask, rAG);
+    rRB = __lsx_vsrli_h(rRB, 8);
+    __lsx_vst(__lsx_vor_v(rAG, rRB), b, 0);
+}
+#endif
+
 template<TextureBlendType blendType>
 void fetchTransformedBilinear_pixelBounds(int max, int l1, int l2, int &v1, int &v2);
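The weight algebra above is 4-bit fixed-point bilinear filtering: with dx and dy in 0..16, idxidy = (16-dx)(16-dy), dxidy = dx(16-dy), idxdy = (16-dx)dy and dxdy = dx*dy sum to exactly 256, so the masks and shifts at the end divide by 256. A scalar sketch of one output pixel, mirroring what the existing scalar interpolate_4_pixels_16 computes (illustrative, not part of the patch):

    static inline uint interpolate1Pixel16(uint tl, uint tr, uint bl, uint br,
                                           uint dx, uint dy) // dx, dy in 0..16
    {
        const uint idx = 16 - dx, idy = 16 - dy;
        const uint wtl = idx * idy, wtr = dx * idy;   // the four weights sum
        const uint wbl = idx * dy,  wbr = dx * dy;    // to exactly 256
        const uint ag = ((tl >> 8) & 0xff00ff) * wtl + ((tr >> 8) & 0xff00ff) * wtr
                      + ((bl >> 8) & 0xff00ff) * wbl + ((br >> 8) & 0xff00ff) * wbr;
        const uint rb = (tl & 0xff00ff) * wtl + (tr & 0xff00ff) * wtr
                      + (bl & 0xff00ff) * wbl + (br & 0xff00ff) * wbr;
        return (ag & 0xff00ff00) | ((rb >> 8) & 0x00ff00ff);
    }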
@@ -1426,6 +1465,36 @@ static void QT_FASTCALL fetchTransformedBilinearARGB32PM_simple_scale_helper(uin
             rRB = vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(rRB), 8));
             vst1q_s16((int16_t*)(&intermediate.buffer_rb[f]), rRB);
         }
+#elif defined(__loongarch_sx)
+        const __m128i disty_ = __lsx_vreplgr2vr_h(disty);
+        const __m128i idisty_ = __lsx_vreplgr2vr_h(idisty);
+        const __m128i colorMask = __lsx_vreplgr2vr_w(0x00ff00ff);
+
+        lim -= 3;
+        for (; f < lim; x += 4, f += 4) {
+            // Load 4 pixels from s1, and split the alpha-green and red-blue components
+            __m128i top = __lsx_vld((const __m128i*)((const uint *)(s1)+x), 0);
+            __m128i topAG = __lsx_vsrli_h(top, 8);
+            __m128i topRB = __lsx_vand_v(top, colorMask);
+            // Multiply each color component by idisty
+            topAG = __lsx_vmul_h(topAG, idisty_);
+            topRB = __lsx_vmul_h(topRB, idisty_);
+
+            // Same for the s2 vector
+            __m128i bottom = __lsx_vld((const __m128i*)((const uint *)(s2)+x), 0);
+            __m128i bottomAG = __lsx_vsrli_h(bottom, 8);
+            __m128i bottomRB = __lsx_vand_v(bottom, colorMask);
+            bottomAG = __lsx_vmul_h(bottomAG, disty_);
+            bottomRB = __lsx_vmul_h(bottomRB, disty_);
+
+            // Add the values, and shift to keep only 8 significant bits per color
+            __m128i rAG = __lsx_vadd_h(topAG, bottomAG);
+            rAG = __lsx_vsrli_h(rAG, 8);
+            __lsx_vst(rAG, (__m128i*)(&intermediate.buffer_ag[f]), 0);
+            __m128i rRB = __lsx_vadd_h(topRB, bottomRB);
+            rRB = __lsx_vsrli_h(rRB, 8);
+            __lsx_vst(rRB, (__m128i*)(&intermediate.buffer_rb[f]), 0);
+        }
 #endif
     }
     for (; f < count; f++) { // Same as above but without simd
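Each store above is one vertical lerp per pixel, split into alpha-green and red-blue halves so every 8-bit channel gets 16 bits of headroom. A scalar sketch of what lands in the two intermediate buffers (assuming, as in the surrounding code, disty in 0..255 and idisty = 256 - disty):

    static inline void vLerpScalar(uint top, uint bot, uint disty,
                                   uint &ag, uint &rb)
    {
        const uint idisty = 256 - disty;
        ag = (((top >> 8) & 0xff00ff) * idisty + ((bot >> 8) & 0xff00ff) * disty) >> 8;
        rb = ((top & 0xff00ff) * idisty + (bot & 0xff00ff) * disty) >> 8;
    }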
@@ -1615,6 +1684,33 @@ static void QT_FASTCALL fetchTransformedBilinearARGB32PM_downscale_helper(uint *
             b+=4;
             v_fx = vaddq_s32(v_fx, v_fdx);
         }
+#elif defined (__loongarch_sx)
+        const __m128i shuffleMask = (__m128i)(v8i16){0, 0, 2, 2, 4, 4, 6, 6};
+        const __m128i v_disty = __lsx_vreplgr2vr_h(disty4);
+        const __m128i v_fdx = __lsx_vreplgr2vr_w(fdx*4);
+        const __m128i v_fx_r = __lsx_vreplgr2vr_w(0x8);
+        __m128i v_fx = (__m128i)(v4i32){fx, fx + fdx, fx + fdx + fdx, fx + fdx + fdx + fdx};
+
+        while (b < boundedEnd - 3) {
+            __m128i offset = __lsx_vsrli_w(v_fx, 16);
+            const int offset0 = __lsx_vpickve2gr_w(offset, 0);
+            const int offset1 = __lsx_vpickve2gr_w(offset, 1);
+            const int offset2 = __lsx_vpickve2gr_w(offset, 2);
+            const int offset3 = __lsx_vpickve2gr_w(offset, 3);
+            const __m128i tl = (__m128i)(v4u32){s1[offset0], s1[offset1], s1[offset2], s1[offset3]};
+            const __m128i tr = (__m128i)(v4u32){s1[offset0 + 1], s1[offset1 + 1], s1[offset2 + 1], s1[offset3 + 1]};
+            const __m128i bl = (__m128i)(v4u32){s2[offset0], s2[offset1], s2[offset2], s2[offset3]};
+            const __m128i br = (__m128i)(v4u32){s2[offset0 + 1], s2[offset1 + 1], s2[offset2 + 1], s2[offset3 + 1]};
+
+            __m128i v_distx = __lsx_vsrli_h(v_fx, 8);
+            v_distx = __lsx_vsrli_h(__lsx_vadd_w(v_distx, v_fx_r), 4);
+            v_distx = __lsx_vshuf_h(shuffleMask, v_distx, v_distx);
+
+            interpolate_4_pixels_16_lsx(tl, tr, bl, br, v_distx, v_disty, b);
+            b += 4;
+            v_fx = __lsx_vadd_w(v_fx, v_fdx);
+        }
+        fx = __lsx_vpickve2gr_w(v_fx, 0);
 #endif
         while (b < boundedEnd) {
             int x = (fx >> 16);
@@ -1852,6 +1948,50 @@ static void QT_FASTCALL fetchTransformedBilinearARGB32PM_fast_rotate_helper(uint
             v_fx = vaddq_s32(v_fx, v_fdx);
             v_fy = vaddq_s32(v_fy, v_fdy);
         }
+#elif defined(__loongarch_sx)
+        const __m128i v_fdx = __lsx_vreplgr2vr_w(fdx*4);
+        const __m128i v_fdy = __lsx_vreplgr2vr_w(fdy*4);
+        const __m128i v_fxy_r = __lsx_vreplgr2vr_w(0x8);
+        __m128i v_fx = (__m128i)(v4i32){fx, fx + fdx, fx + fdx + fdx, fx + fdx + fdx + fdx};
+        __m128i v_fy = (__m128i)(v4i32){fy, fy + fdy, fy + fdy + fdy, fy + fdy + fdy + fdy};
+
+        const uchar *textureData = image.imageData;
+        const qsizetype bytesPerLine = image.bytesPerLine;
+        const __m128i zero = __lsx_vldi(0);
+        const __m128i shuffleMask = (__m128i)(v8i16){0, 0, 0, 0, 4, 5, 6, 7};
+        const __m128i shuffleMask1 = (__m128i)(v8i16){0, 0, 2, 2, 4, 4, 6, 6};
+        const __m128i vbpl = __lsx_vshuf_h(shuffleMask, zero, __lsx_vinsgr2vr_w(zero, bytesPerLine/4, 0));
+
+        while (b < boundedEnd - 3) {
+            const __m128i vy = __lsx_vpickev_h(zero, __lsx_vsat_w(__lsx_vsrli_w(v_fy, 16), 15));
+            // 4x16bit * 4x16bit -> 4x32bit
+            __m128i offset = __lsx_vilvl_h(__lsx_vmuh_h(vy, vbpl), __lsx_vmul_h(vy, vbpl));
+            offset = __lsx_vadd_w(offset, __lsx_vsrli_w(v_fx, 16));
+            const int offset0 = __lsx_vpickve2gr_w(offset, 0);
+            const int offset1 = __lsx_vpickve2gr_w(offset, 1);
+            const int offset2 = __lsx_vpickve2gr_w(offset, 2);
+            const int offset3 = __lsx_vpickve2gr_w(offset, 3);
+            const uint *topData = (const uint *)(textureData);
+            const __m128i tl = (__m128i)(v4u32){topData[offset0], topData[offset1], topData[offset2], topData[offset3]};
+            const __m128i tr = (__m128i)(v4u32){topData[offset0 + 1], topData[offset1 + 1], topData[offset2 + 1], topData[offset3 + 1]};
+            const uint *bottomData = (const uint *)(textureData + bytesPerLine);
+            const __m128i bl = (__m128i)(v4u32){bottomData[offset0], bottomData[offset1], bottomData[offset2], bottomData[offset3]};
+            const __m128i br = (__m128i)(v4u32){bottomData[offset0 + 1], bottomData[offset1 + 1], bottomData[offset2 + 1], bottomData[offset3 + 1]};
+
+            __m128i v_distx = __lsx_vsrli_h(v_fx, 8);
+            __m128i v_disty = __lsx_vsrli_h(v_fy, 8);
+            v_distx = __lsx_vsrli_h(__lsx_vadd_w(v_distx, v_fxy_r), 4);
+            v_disty = __lsx_vsrli_h(__lsx_vadd_w(v_disty, v_fxy_r), 4);
+            v_distx = __lsx_vshuf_h(shuffleMask1, zero, v_distx);
+            v_disty = __lsx_vshuf_h(shuffleMask1, zero, v_disty);
+
+            interpolate_4_pixels_16_lsx(tl, tr, bl, br, v_distx, v_disty, b);
+            b += 4;
+            v_fx = __lsx_vadd_w(v_fx, v_fdx);
+            v_fy = __lsx_vadd_w(v_fy, v_fdy);
+        }
+        fx = __lsx_vpickve2gr_w(v_fx, 0);
+        fy = __lsx_vpickve2gr_w(v_fy, 0);
 #endif
         while (b < boundedEnd) {
             int x = (fx >> 16);
@@ -1141,6 +1141,108 @@ static inline void qConvertARGB32PMToRGBA64PM_neon(QRgba64 *buffer, const uint *
         *buffer++ = QRgba64::fromArgb32(s);
     }
 }
+#elif defined __loongarch_sx
+template<bool RGBA, bool maskAlpha>
+static inline void qConvertARGB32PMToRGBA64PM_lsx(QRgba64 *buffer, const uint *src, int count)
+{
+    if (count <= 0)
+        return;
+
+    const __m128i amask = __lsx_vreplgr2vr_w(0xff000000);
+    const __m128i shuffleMask = (__m128i)(v8i16){2, 1, 0, 3, 6, 5, 4, 7};
+    int i = 0;
+    for (; ((uintptr_t)buffer & 0xf) && i < count; ++i) {
+        uint s = *src++;
+        if (maskAlpha)
+            s = s | 0xff000000;
+        if (RGBA)
+            s = RGBA2ARGB(s);
+        *buffer++ = QRgba64::fromArgb32(s);
+    }
+    for (; i < count-3; i += 4) {
+        __m128i vs = __lsx_vld((const __m128i*)src, 0);
+        if (maskAlpha)
+            vs = __lsx_vor_v(vs, amask);
+        src += 4;
+        __m128i v1 = __lsx_vilvl_b(vs, vs);
+        __m128i v2 = __lsx_vilvh_b(vs, vs);
+        if (!RGBA) {
+            v1 = __lsx_vshuf_h(shuffleMask, v1, v1);
+            v2 = __lsx_vshuf_h(shuffleMask, v2, v2);
+        }
+        __lsx_vst(v1, buffer, 0);
+        buffer += 2;
+        __lsx_vst(v2, buffer, 0);
+        buffer += 2;
+    }
+
+    SIMD_EPILOGUE(i, count, 3) {
+        uint s = *src++;
+        if (maskAlpha)
+            s = s | 0xff000000;
+        if (RGBA)
+            s = RGBA2ARGB(s);
+        *buffer++ = QRgba64::fromArgb32(s);
+    }
+}
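The vilvl_b/vilvh_b pair above widens each 8-bit channel by interleaving every byte with itself. Scalar view of that trick (illustrative, not part of the patch):

    // Interleaving a byte with itself turns 0xAB into 0xABAB, i.e. multiplies
    // the channel by 257, mapping 0..255 exactly onto 0..65535, the same
    // per-channel expansion QRgba64::fromArgb32 performs.
    static inline quint16 widen8to16(quint8 c)
    {
        return quint16(c * 257);   // == (c << 8) | c
    }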
+
+template<QtPixelOrder PixelOrder>
+static inline void qConvertRGBA64PMToA2RGB30PM_lsx(uint *dest, const QRgba64 *buffer, int count)
+{
+    const __m128i gmask = __lsx_vreplgr2vr_w(0x000ffc00);
+    const __m128i cmask = __lsx_vreplgr2vr_w(0x000003ff);
+    int i = 0;
+    __m128i vr, vg, vb, va;
+    for (; i < count && uintptr_t(buffer) & 0xF; ++i) {
+        *dest++ = qConvertRgb64ToRgb30<PixelOrder>(*buffer++);
+    }
+
+    for (; i < count-15; i += 16) {
+        __m128i vOr = __lsx_vreplgr2vr_w(0);
+        __m128i vAnd = __lsx_vreplgr2vr_w(0xffffffff);
+        for (int j = 0; j < 16; j += 2) {
+            __m128i vs = __lsx_vld((const __m128i*)(buffer + j), 0);
+            vOr = __lsx_vor_v(vOr, vs);
+            vAnd = __lsx_vand_v(vAnd, vs);
+        }
+        const quint16 orAlpha = ((uint)__lsx_vpickve2gr_h(vOr, 3)) | ((uint)__lsx_vpickve2gr_h(vOr, 7));
+        const quint16 andAlpha = ((uint)__lsx_vpickve2gr_h(vAnd, 3)) & ((uint)__lsx_vpickve2gr_h(vAnd, 7));
+
+        if (andAlpha == 0xffff) {
+            for (int j = 0; j < 16; j += 2) {
+                __m128i vs = __lsx_vld((const __m128i*)buffer, 0);
+                buffer += 2;
+                vr = __lsx_vsrli_d(vs, 6);
+                vg = __lsx_vsrli_d(vs, 16 + 6 - 10);
+                vb = __lsx_vsrli_d(vs, 32 + 6);
+                vr = __lsx_vand_v(vr, cmask);
+                vg = __lsx_vand_v(vg, gmask);
+                vb = __lsx_vand_v(vb, cmask);
+                va = __lsx_vsrli_d(vs, 48 + 14);
+                if (PixelOrder == PixelOrderRGB)
+                    vr = __lsx_vslli_w(vr, 20);
+                else
+                    vb = __lsx_vslli_w(vb, 20);
+                va = __lsx_vslli_w(va, 30);
+                __m128i vd = __lsx_vor_v(__lsx_vor_v(vr, vg), __lsx_vor_v(vb, va));
+                vd = __lsx_vshuf4i_w(vd, 0b11011000);
+                __lsx_vstelm_d(vd, dest, 0, 0);
+                dest += 2;
+            }
+        } else if (orAlpha == 0) {
+            for (int j = 0; j < 16; ++j) {
+                *dest++ = 0;
+                buffer++;
+            }
+        } else {
+            for (int j = 0; j < 16; ++j)
+                *dest++ = qConvertRgb64ToRgb30<PixelOrder>(*buffer++);
+        }
+    }
+
+    SIMD_EPILOGUE(i, count, 15)
+        *dest++ = qConvertRgb64ToRgb30<PixelOrder>(*buffer++);
+}
 #endif
 
 static const QRgba64 *QT_FASTCALL convertRGB32ToRGB64(QRgba64 *buffer, const uint *src, int count,
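In the fully-opaque fast path above, the shifts by 6 pack each 16-bit channel into its top 10 bits and the shift by 14 packs alpha into its top 2, giving the 2+10+10+10 A2RGB30 layout. A truncating scalar sketch for the PixelOrderRGB case (illustrative; the BGR order swaps r and b, and Qt's qConvertRgb64ToRgb30 remains the authoritative scalar conversion):

    static inline uint packRgb30(quint16 r, quint16 g, quint16 b, quint16 a)
    {
        return (uint(a >> 14) << 30) | (uint(r >> 6) << 20)
             | (uint(g >> 6) << 10) |  uint(b >> 6);
    }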
@@ -1150,6 +1252,8 @@ static const QRgba64 *QT_FASTCALL convertRGB32ToRGB64(QRgba64 *buffer, const uin
     qConvertARGB32PMToRGBA64PM_sse2<false, true>(buffer, src, count);
 #elif defined(__ARM_NEON__)
     qConvertARGB32PMToRGBA64PM_neon<false, true>(buffer, src, count);
+#elif defined(__loongarch_sx)
+    qConvertARGB32PMToRGBA64PM_lsx<false, true>(buffer, src, count);
 #else
     for (int i = 0; i < count; ++i)
         buffer[i] = QRgba64::fromArgb32(0xff000000 | src[i]);
@@ -1184,6 +1288,8 @@ static const QRgba64 *QT_FASTCALL convertARGB32PMToRGBA64PM(QRgba64 *buffer, con
     qConvertARGB32PMToRGBA64PM_sse2<false, false>(buffer, src, count);
 #elif defined(__ARM_NEON__)
     qConvertARGB32PMToRGBA64PM_neon<false, false>(buffer, src, count);
+#elif defined(__loongarch_sx)
+    qConvertARGB32PMToRGBA64PM_lsx<false, false>(buffer, src, count);
 #else
     for (int i = 0; i < count; ++i)
         buffer[i] = QRgba64::fromArgb32(src[i]);
@@ -1238,6 +1344,8 @@ static const QRgba64 *QT_FASTCALL convertRGBA8888PMToRGBA64PM(QRgba64 *buffer, c
     qConvertARGB32PMToRGBA64PM_sse2<true, false>(buffer, src, count);
 #elif defined(__ARM_NEON__)
     qConvertARGB32PMToRGBA64PM_neon<true, false>(buffer, src, count);
+#elif defined(__loongarch_sx)
+    qConvertARGB32PMToRGBA64PM_lsx<true, false>(buffer, src, count);
 #else
     for (int i = 0; i < count; ++i)
         buffer[i] = QRgba64::fromArgb32(RGBA2ARGB(src[i]));
@@ -1348,6 +1456,48 @@ static inline void qConvertA2RGB30PMToRGBA64PM_sse2(QRgba64 *buffer, const uint
     SIMD_EPILOGUE(i, count, 3)
         *buffer++ = qConvertA2rgb30ToRgb64<PixelOrder>(*src++);
 }
+#elif defined(__loongarch_sx)
+template<QtPixelOrder PixelOrder>
+static inline void qConvertA2RGB30PMToRGBA64PM_lsx(QRgba64 *buffer, const uint *src, int count)
+{
+    if (count <= 0)
+        return;
+
+    const __m128i rmask = __lsx_vreplgr2vr_w(0x3ff00000);
+    const __m128i gmask = __lsx_vreplgr2vr_w(0x000ffc00);
+    const __m128i bmask = __lsx_vreplgr2vr_w(0x000003ff);
+    const __m128i afactor = __lsx_vreplgr2vr_h(0x5555);
+    int i = 0;
+
+    for (; ((uintptr_t)buffer & 0xf) && i < count; ++i)
+        *buffer++ = qConvertA2rgb30ToRgb64<PixelOrder>(*src++);
+
+    for (; i < count-3; i += 4) {
+        __m128i vs = __lsx_vld((const __m128i*)src, 0);
+        src += 4;
+        __m128i va = __lsx_vsrli_w(vs, 30);
+        __m128i vr = __lsx_vand_v(vs, rmask);
+        __m128i vb = __lsx_vand_v(vs, bmask);
+        __m128i vg = __lsx_vand_v(vs, gmask);
+        va = __lsx_vmul_h(va, afactor);
+        vr = __lsx_vor_v(__lsx_vsrli_w(vr, 14), __lsx_vsrli_w(vr, 24));
+        vg = __lsx_vor_v(__lsx_vsrli_w(vg, 4), __lsx_vsrli_w(vg, 14));
+        vb = __lsx_vor_v(__lsx_vslli_w(vb, 6), __lsx_vsrli_w(vb, 4));
+        __m128i vrb;
+        if (PixelOrder == PixelOrderRGB)
+            vrb = __lsx_vor_v(vr, __lsx_vbsll_v(vb, 2));
+        else
+            vrb = __lsx_vor_v(vb, __lsx_vbsll_v(vr, 2));
+        __m128i vga = __lsx_vor_v(vg, __lsx_vbsll_v(va, 2));
+        __lsx_vst(__lsx_vilvl_h(vga, vrb), buffer, 0);
+        buffer += 2;
+        __lsx_vst(__lsx_vilvh_h(vga, vrb), buffer, 0);
+        buffer += 2;
+    }
+
+    SIMD_EPILOGUE(i, count, 3)
+        *buffer++ = qConvertA2rgb30ToRgb64<PixelOrder>(*src++);
+}
 #endif
 
 template<QtPixelOrder PixelOrder>
@@ -1356,6 +1506,8 @@ static const QRgba64 *QT_FASTCALL convertA2RGB30PMToRGBA64PM(QRgba64 *buffer, co
 {
 #ifdef __SSE2__
     qConvertA2RGB30PMToRGBA64PM_sse2<PixelOrder>(buffer, src, count);
+#elif defined (__loongarch_sx)
+    qConvertA2RGB30PMToRGBA64PM_lsx<PixelOrder>(buffer, src, count);
 #else
     for (int i = 0; i < count; ++i)
         buffer[i] = qConvertA2rgb30ToRgb64<PixelOrder>(src[i]);
@@ -1466,6 +1618,37 @@ void qt_convertRGBA64ToARGB32(uint *dst, const QRgba64 *src, int count)
         _mm_storel_epi64((__m128i*)(dst), v1);
         dst += 2;
     }
+#elif defined(__loongarch_sx)
+    if (((uintptr_t)dst & 0x7) && count > 0) {
+        uint s = (*src++).toArgb32();
+        if (RGBA)
+            s = ARGB2RGBA(s);
+        *dst++ = s;
+        i++;
+    }
+    const __m128i vhalf = __lsx_vreplgr2vr_w(0x80);
+    const __m128i vzero = __lsx_vldi(0);
+    const __m128i shuffleMask = (__m128i)(v8i16){2, 1, 0, 3, 6, 5, 4, 7};
+    for (; i < count-1; i += 2) {
+        __m128i vs = __lsx_vld((const __m128i*)src, 0);
+        src += 2;
+        if (!RGBA) {
+            vs = __lsx_vshuf_h(shuffleMask, vzero, vs);
+        }
+        __m128i v1 = __lsx_vilvl_h(vzero, vs);
+        __m128i v2 = __lsx_vilvh_h(vzero, vs);
+        v1 = __lsx_vadd_w(v1, vhalf);
+        v2 = __lsx_vadd_w(v2, vhalf);
+        v1 = __lsx_vsub_w(v1, __lsx_vsrli_w(v1, 8));
+        v2 = __lsx_vsub_w(v2, __lsx_vsrli_w(v2, 8));
+        v1 = __lsx_vsrli_w(v1, 8);
+        v2 = __lsx_vsrli_w(v2, 8);
+        v1 = __lsx_vpickev_h(__lsx_vsat_w(v2, 15), __lsx_vsat_w(v1, 15));
+        v1 = __lsx_vmaxi_h(v1, 0);
+        v1 = __lsx_vpickev_b(vzero, __lsx_vsat_hu(v1, 7));
+        __lsx_vstelm_d(v1, dst, 0, 0);
+        dst += 2;
+    }
 #endif
     for (; i < count; i++) {
         uint s = (*src++).toArgb32();
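The add-128, subtract-high-byte, shift-by-8 sequence above is an exact rounded division by 257, narrowing 16-bit channels back onto 0..255; the same sequence reappears in toArgb32() in qrgba64_p.h further down. Stated as a scalar identity (this matches Qt's qt_div_257 helper):

    static inline quint8 div257(quint16 x)
    {
        const uint t = uint(x) + 128u;     // +0.5 in 8.8 fixed point
        return quint8((t - (t >> 8)) >> 8);
    }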
@@ -1902,6 +2085,8 @@ static void QT_FASTCALL storeRGB30FromRGBA64PM(uchar *dest, const QRgba64 *src,
     uint *d = (uint*)dest + index;
 #ifdef __SSE2__
     qConvertRGBA64PMToA2RGB30PM_sse2<PixelOrder>(d, src, count);
+#elif defined (__loongarch_sx)
+    qConvertRGBA64PMToA2RGB30PM_lsx<PixelOrder>(d, src, count);
 #else
     for (int i = 0; i < count; ++i)
         d[i] = qConvertRgb64ToRgb30<PixelOrder>(src[i]);
@@ -57,6 +57,24 @@ static inline uint16x4_t multiplyAlpha65535(uint16x4_t rgba64, uint alpha65535)
     vs32 = vsraq_n_u32(vs32, vs32, 16); // vs = vs + (vs >> 16)
     return vrshrn_n_u32(vs32, 16); // vs = (vs + 0x8000) >> 16
 }
+#elif defined(__loongarch_sx)
+static inline __m128i Q_DECL_VECTORCALL multiplyAlpha65535(__m128i rgba64, __m128i va)
+{
+    __m128i vs = rgba64;
+    vs = __lsx_vilvl_h(__lsx_vmuh_hu(vs, va), __lsx_vmul_h(vs, va));
+    vs = __lsx_vadd_w(vs, __lsx_vsrli_w(vs, 16));
+    vs = __lsx_vadd_w(vs, __lsx_vreplgr2vr_w(0x8000));
+    vs = __lsx_vsrai_w(vs, 16);
+    vs = __lsx_vpickev_h(__lsx_vsat_w(vs, 15), __lsx_vsat_w(vs, 15));
+    return vs;
+}
+
+static inline __m128i Q_DECL_VECTORCALL multiplyAlpha65535(__m128i rgba64, uint alpha65535)
+{
+    const __m128i shuffleMask = (__m128i)(v8i16){0, 0, 0, 0, 4, 5, 6, 7};
+    const __m128i va = __lsx_vshuf_h(shuffleMask, __lsx_vldi(0),
+                                     __lsx_vinsgr2vr_w(__lsx_vldi(0), alpha65535, 0));
+    return multiplyAlpha65535(rgba64, va);
+}
 #endif
 
 static inline QRgba64 multiplyAlpha65535(QRgba64 rgba64, uint alpha65535)
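The widen-multiply, add-high-half, add-0x8000, shift-by-16 chain above is the classic exact division of a 32-bit product by 65535. As a scalar identity (matching Qt's qt_div_65535 helper):

    static inline quint16 div65535(quint32 x)   // valid for x <= 65535 * 65535
    {
        return quint16((x + (x >> 16) + 0x8000u) >> 16);
    }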
@@ -73,6 +91,12 @@ static inline QRgba64 multiplyAlpha65535(QRgba64 rgba64, uint alpha65535)
     QRgba64 r;
     vst1_u64(reinterpret_cast<uint64_t *>(&r), vreinterpret_u64_u16(vr));
     return r;
+#elif defined(__loongarch_sx)
+    const __m128i v = __lsx_vldrepl_d(reinterpret_cast<const __m128i *>(&rgba64), 0);
+    const __m128i vr = multiplyAlpha65535(v, alpha65535);
+    QRgba64 r;
+    __lsx_vstelm_d(vr, reinterpret_cast<__m128i *>(&r), 0, 0);
+    return r;
 #else
     return QRgba64::fromRgba64(qt_div_65535(rgba64.red() * alpha65535),
                                qt_div_65535(rgba64.green() * alpha65535),
@@ -81,7 +105,7 @@ static inline QRgba64 multiplyAlpha65535(QRgba64 rgba64, uint alpha65535)
 #endif
 }
 
-#if defined(__SSE2__) || defined(__ARM_NEON__)
+#if defined(__SSE2__) || defined(__ARM_NEON__) || defined(__loongarch_sx)
 template<typename T>
 static inline T Q_DECL_VECTORCALL multiplyAlpha255(T rgba64, uint alpha255)
 {
@@ -112,6 +136,14 @@ inline uint16x4_t interpolate255(uint16x4_t x, uint alpha1, uint16x4_t y, uint a
 }
 #endif
 
+#if defined __loongarch_sx
+static inline __m128i Q_DECL_VECTORCALL
+interpolate255(__m128i x, uint alpha1, __m128i y, uint alpha2)
+{
+    return __lsx_vadd_h(multiplyAlpha255(x, alpha1), multiplyAlpha255(y, alpha2));
+}
+#endif
+
 static inline QRgba64 interpolate255(QRgba64 x, uint alpha1, QRgba64 y, uint alpha2)
 {
 #if defined(__SSE2__)
@@ -128,6 +160,13 @@ static inline QRgba64 interpolate255(QRgba64 x, uint alpha1, QRgba64 y, uint alp
     QRgba64 r;
     vst1_u64(reinterpret_cast<uint64_t *>(&r), vreinterpret_u64_u16(vr));
     return r;
+#elif defined(__loongarch_sx)
+    const __m128i vx = __lsx_vldrepl_d(reinterpret_cast<const __m128i *>(&x), 0);
+    const __m128i vy = __lsx_vldrepl_d(reinterpret_cast<const __m128i *>(&y), 0);
+    const __m128i vr = interpolate255(vx, alpha1, vy, alpha2);
+    QRgba64 r;
+    __lsx_vstelm_d(vr, reinterpret_cast<__m128i *>(&r), 0, 0);
+    return r;
 #else
     return QRgba64::fromRgba64(multiplyAlpha255(x, alpha1) + multiplyAlpha255(y, alpha2));
 #endif
@@ -156,6 +195,18 @@ inline uint16x4_t interpolate65535(uint16x4_t x, uint16x4_t alpha1, uint16x4_t y
 }
 #endif
 
+#if defined __loongarch_sx
+static inline __m128i Q_DECL_VECTORCALL interpolate65535(__m128i x, uint alpha1, __m128i y, uint alpha2)
+{
+    return __lsx_vadd_h(multiplyAlpha65535(x, alpha1), multiplyAlpha65535(y, alpha2));
+}
+
+static inline __m128i Q_DECL_VECTORCALL interpolate65535(__m128i x, __m128i alpha1, __m128i y, __m128i alpha2)
+{
+    return __lsx_vadd_h(multiplyAlpha65535(x, alpha1), multiplyAlpha65535(y, alpha2));
+}
+#endif
+
 static inline QRgba64 interpolate65535(QRgba64 x, uint alpha1, QRgba64 y, uint alpha2)
 {
 #if defined(__SSE2__)
@@ -172,6 +223,13 @@ static inline QRgba64 interpolate65535(QRgba64 x, uint alpha1, QRgba64 y, uint a
     QRgba64 r;
     vst1_u64(reinterpret_cast<uint64_t *>(&r), vreinterpret_u64_u16(vr));
     return r;
+#elif defined(__loongarch_sx)
+    const __m128i vx = __lsx_vldrepl_d(reinterpret_cast<const __m128i *>(&x), 0);
+    const __m128i vy = __lsx_vldrepl_d(reinterpret_cast<const __m128i *>(&y), 0);
+    const __m128i vr = interpolate65535(vx, alpha1, vy, alpha2);
+    QRgba64 r;
+    __lsx_vstelm_d(vr, reinterpret_cast<__m128i *>(&r), 0, 0);
+    return r;
 #else
     return QRgba64::fromRgba64(multiplyAlpha65535(x, alpha1) + multiplyAlpha65535(y, alpha2));
 #endif
@@ -192,6 +250,13 @@ static inline QRgba64 addWithSaturation(QRgba64 a, QRgba64 b)
     QRgba64 r;
     vst1_u64(reinterpret_cast<uint64_t *>(&r), vreinterpret_u64_u16(vqadd_u16(va, vb)));
     return r;
+#elif defined(__loongarch_sx)
+    const __m128i va = __lsx_vldrepl_d(reinterpret_cast<const __m128i *>(&a), 0);
+    const __m128i vb = __lsx_vldrepl_d(reinterpret_cast<const __m128i *>(&b), 0);
+    const __m128i vr = __lsx_vsadd_hu(va, vb);
+    QRgba64 r;
+    __lsx_vstelm_d(vr, reinterpret_cast<__m128i *>(&r), 0, 0);
+    return r;
 #else
 
     return QRgba64::fromRgba64(qMin(a.red() + b.red(), 65535),
@@ -221,6 +286,18 @@ static inline uint toArgb32(uint16x4_t v)
     uint8x8_t v8 = vmovn_u16(vcombine_u16(v, v));
     return vget_lane_u32(vreinterpret_u32_u8(v8), 0);
 }
+#elif defined __loongarch_sx
+static inline uint Q_DECL_VECTORCALL toArgb32(__m128i v)
+{
+    v = __lsx_vilvl_h(__lsx_vldi(0), v);
+    v = __lsx_vadd_w(v, __lsx_vreplgr2vr_w(128));
+    v = __lsx_vsub_w(v, __lsx_vsrli_w(v, 8));
+    v = __lsx_vsrli_w(v, 8);
+    v = __lsx_vpickev_h(__lsx_vsat_w(v, 15), __lsx_vsat_w(v, 15));
+    __m128i tmp = __lsx_vmaxi_h(v, 0);
+    v = __lsx_vpickev_b(__lsx_vsat_hu(tmp, 7), __lsx_vsat_hu(tmp, 7));
+    return __lsx_vpickve2gr_w(v, 0);
+}
 #endif
 
 static inline uint toArgb32(QRgba64 rgba64)
@@ -238,6 +315,11 @@ static inline uint toArgb32(QRgba64 rgba64)
     v = vext_u16(v, v, 3);
 #endif
     return toArgb32(v);
+#elif defined __loongarch_sx
+    __m128i v = __lsx_vldrepl_d(reinterpret_cast<const __m128i *>(&rgba64), 0);
+    const __m128i shuffleMask = (__m128i)(v8i16){2, 1, 0, 3, 4, 5, 6, 7};
+    v = __lsx_vshuf_h(shuffleMask, __lsx_vldi(0), v);
+    return toArgb32(v);
 #else
     return rgba64.toArgb32();
 #endif
@@ -251,6 +333,9 @@ static inline uint toRgba8888(QRgba64 rgba64)
 #elif defined __ARM_NEON__
     uint16x4_t v = vreinterpret_u16_u64(vld1_u64(reinterpret_cast<const uint64_t *>(&rgba64)));
     return toArgb32(v);
+#elif defined __loongarch_sx
+    __m128i v = __lsx_vldrepl_d(reinterpret_cast<const __m128i *>(&rgba64), 0);
+    return toArgb32(v);
 #else
     return ARGB2RGBA(toArgb32(rgba64));
 #endif
@@ -289,6 +374,23 @@ static inline QRgba64 rgbBlend(QRgba64 d, QRgba64 s, uint rgbAlpha)
     vd32 = vsraq_n_u32(vd32, vd32, 16);
     vd = vrshrn_n_u32(vd32, 16);
     vst1_u64(reinterpret_cast<uint64_t *>(&blend), vreinterpret_u64_u16(vd));
+#elif defined(__loongarch_sx)
+    __m128i vd = __lsx_vldrepl_d(reinterpret_cast<const __m128i *>(&d), 0);
+    __m128i vs = __lsx_vldrepl_d(reinterpret_cast<const __m128i *>(&s), 0);
+    __m128i va = __lsx_vinsgr2vr_w(__lsx_vldi(0), rgbAlpha, 0);
+    va = __lsx_vilvl_b(va, va);
+    const __m128i shuffleMask = (__m128i)(v8i16){2, 1, 0, 3, 4, 5, 6, 7};
+    va = __lsx_vshuf_h(shuffleMask, __lsx_vldi(0), va);
+    __m128i vb = __lsx_vxor_v(__lsx_vreplgr2vr_h(-1), va);
+
+    vs = __lsx_vilvl_h(__lsx_vmuh_hu(vs, va), __lsx_vmul_h(vs, va));
+    vd = __lsx_vilvl_h(__lsx_vmuh_hu(vd, vb), __lsx_vmul_h(vd, vb));
+    vd = __lsx_vadd_w(vd, vs);
+    vd = __lsx_vadd_w(vd, __lsx_vsrli_w(vd, 16));
+    vd = __lsx_vadd_w(vd, __lsx_vreplgr2vr_w(0x8000));
+    vd = __lsx_vsrai_w(vd, 16);
+    vd = __lsx_vpickev_h(__lsx_vsat_w(vd, 15), __lsx_vsat_w(vd, 15));
+    __lsx_vstelm_d(vd, reinterpret_cast<__m128i *>(&blend), 0, 0);
 #else
     const int mr = qRed(rgbAlpha);
     const int mg = qGreen(rgbAlpha);
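Per channel, the sequence above computes the weighted average blend = (s*m + d*(65535 - m)) / 65535, where the 8-bit mask channel is widened to 16 bits by byte duplication (the vilvl_b of va with itself). A scalar sketch (illustrative, not part of the patch):

    static inline quint16 rgbBlendChannel(quint16 d, quint16 s, quint8 m8)
    {
        const uint m = m8 * 257;                 // widen mask channel to 16 bit
        const uint x = s * m + d * (65535 - m);  // weights sum to 65535
        return quint16((x + (x >> 16) + 0x8000u) >> 16); // x / 65535, rounded
    }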
@@ -318,6 +420,13 @@ static inline void blend_pixel(QRgba64 &dst, QRgba64 src)
     const uint16x4_t via = veor_u16(vdup_n_u16(0xffff), vdup_lane_u16(vs, 3));
     const uint16x4_t vr = vadd_u16(vs, multiplyAlpha65535(vd, via));
     vst1_u64(reinterpret_cast<uint64_t *>(&dst), vreinterpret_u64_u16(vr));
+#elif defined(__loongarch_sx)
+    const __m128i vd = __lsx_vldrepl_d(reinterpret_cast<const __m128i *>(&dst), 0);
+    const __m128i vs = __lsx_vldrepl_d(reinterpret_cast<const __m128i *>(&src), 0);
+    const __m128i shuffleMask = (__m128i)(v8i16){3, 3, 3, 3, 4, 5, 6, 7};
+    const __m128i via = __lsx_vxor_v(__lsx_vreplgr2vr_h(-1), __lsx_vshuf_h(shuffleMask, __lsx_vldi(0), vs));
+    const __m128i vr = __lsx_vadd_h(vs, multiplyAlpha65535(vd, via));
+    __lsx_vstelm_d(vr, reinterpret_cast<__m128i *>(&dst), 0, 0);
 #else
     dst = src + multiplyAlpha65535(dst, 65535 - src.alpha());
 #endif
@@ -343,6 +452,14 @@ static inline void blend_pixel(QRgba64 &dst, QRgba64 src, const int const_alpha)
     const uint16x4_t via = veor_u16(vdup_n_u16(0xffff), vdup_lane_u16(vs, 3));
     const uint16x4_t vr = vadd_u16(vs, multiplyAlpha65535(vd, via));
     vst1_u64(reinterpret_cast<uint64_t *>(&dst), vreinterpret_u64_u16(vr));
+#elif defined(__loongarch_sx)
+    const __m128i vd = __lsx_vldrepl_d(reinterpret_cast<const __m128i *>(&dst), 0);
+    __m128i vs = __lsx_vldrepl_d(reinterpret_cast<const __m128i *>(&src), 0);
+    vs = multiplyAlpha255(vs, const_alpha);
+    const __m128i shuffleMask = (__m128i)(v8i16){3, 3, 3, 3, 4, 5, 6, 7};
+    const __m128i via = __lsx_vxor_v(__lsx_vreplgr2vr_h(-1), __lsx_vshuf_h(shuffleMask, __lsx_vldi(0), vs));
+    const __m128i vr = __lsx_vadd_h(vs, multiplyAlpha65535(vd, via));
+    __lsx_vstelm_d(vr, reinterpret_cast<__m128i *>(&dst), 0, 0);
 #else
     src = multiplyAlpha255(src, const_alpha);
     dst = src + multiplyAlpha65535(dst, 65535 - src.alpha());