From 9612547343098174e2317ded3f52cdab160aa0b6 Mon Sep 17 00:00:00 2001 From: Allan Sandfeld Jensen Date: Thu, 20 Jun 2024 12:50:30 +0200 Subject: [PATCH] Add NEON optimized versions where missing A few methods only had SSE2 versions; this adds NEON versions of the last optimized QRgba64 methods. Pick-to: 6.8 Change-Id: I61fc916183a08a7d3c41572af10ca92d73ef831a Reviewed-by: Eirik Aavitsland --- src/gui/painting/qrgba64_p.h | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/src/gui/painting/qrgba64_p.h b/src/gui/painting/qrgba64_p.h index ae8b6fd8cbc..6fd7d7ff2c1 100644 --- a/src/gui/painting/qrgba64_p.h +++ b/src/gui/painting/qrgba64_p.h @@ -186,7 +186,14 @@ static inline QRgba64 addWithSaturation(QRgba64 a, QRgba64 b) QRgba64 r; _mm_storel_epi64(reinterpret_cast<__m128i *>(&r), vr); return r; +#elif defined(__ARM_NEON__) + const uint16x4_t va = vreinterpret_u16_u64(vld1_u64(reinterpret_cast<const uint64_t *>(&a))); + const uint16x4_t vb = vreinterpret_u16_u64(vld1_u64(reinterpret_cast<const uint64_t *>(&b))); + QRgba64 r; + vst1_u64(reinterpret_cast<uint64_t *>(&r), vreinterpret_u64_u16(vqadd_u16(va, vb))); + return r; #else + return QRgba64::fromRgba64(qMin(a.red() + b.red(), 65535), qMin(a.green() + b.green(), 65535), qMin(a.blue() + b.blue(), 65535), @@ -274,8 +281,7 @@ static inline QRgba64 rgbBlend(QRgba64 d, QRgba64 s, uint rgbAlpha) uint16x4_t vs = vreinterpret_u16_u64(vmov_n_u64(s)); uint8x8_t va8 = vreinterpret_u8_u32(vmov_n_u32(ARGB2RGBA(rgbAlpha))); uint16x4_t va = vreinterpret_u16_u8(vzip_u8(va8, va8).val[0]); - uint16x4_t vb = vdup_n_u16(0xffff); - vb = vsub_u16(vb, va); + uint16x4_t vb = veor_u16(vdup_n_u16(0xffff), va); uint32x4_t vs32 = vmull_u16(vs, va); uint32x4_t vd32 = vmull_u16(vd, vb); @@ -306,6 +312,12 @@ static inline void blend_pixel(QRgba64 &dst, QRgba64 src) const __m128i via = _mm_xor_si128(_mm_set1_epi16(-1), _mm_shufflelo_epi16(vs, _MM_SHUFFLE(3, 3, 3, 3))); const __m128i vr = _mm_add_epi16(vs, multiplyAlpha65535(vd, via)); 
_mm_storel_epi64(reinterpret_cast<__m128i *>(&dst), vr); +#elif defined(__ARM_NEON__) + const uint16x4_t vd = vreinterpret_u16_u64(vld1_u64(reinterpret_cast<const uint64_t *>(&dst))); + const uint16x4_t vs = vreinterpret_u16_u64(vld1_u64(reinterpret_cast<const uint64_t *>(&src))); + const uint16x4_t via = veor_u16(vdup_n_u16(0xffff), vdup_lane_u16(vs, 3)); + const uint16x4_t vr = vadd_u16(vs, multiplyAlpha65535(vd, via)); + vst1_u64(reinterpret_cast<uint64_t *>(&dst), vreinterpret_u64_u16(vr)); #else dst = src + multiplyAlpha65535(dst, 65535 - src.alpha()); #endif @@ -324,6 +336,13 @@ static inline void blend_pixel(QRgba64 &dst, QRgba64 src, const int const_alpha) const __m128i via = _mm_xor_si128(_mm_set1_epi16(-1), _mm_shufflelo_epi16(vs, _MM_SHUFFLE(3, 3, 3, 3))); const __m128i vr = _mm_add_epi16(vs, multiplyAlpha65535(vd, via)); _mm_storel_epi64(reinterpret_cast<__m128i *>(&dst), vr); +#elif defined(__ARM_NEON__) + const uint16x4_t vd = vreinterpret_u16_u64(vld1_u64(reinterpret_cast<const uint64_t *>(&dst))); + uint16x4_t vs = vreinterpret_u16_u64(vld1_u64(reinterpret_cast<const uint64_t *>(&src))); + vs = multiplyAlpha255(vs, const_alpha); + const uint16x4_t via = veor_u16(vdup_n_u16(0xffff), vdup_lane_u16(vs, 3)); + const uint16x4_t vr = vadd_u16(vs, multiplyAlpha65535(vd, via)); + vst1_u64(reinterpret_cast<uint64_t *>(&dst), vreinterpret_u64_u16(vr)); #else src = multiplyAlpha255(src, const_alpha); dst = src + multiplyAlpha65535(dst, 65535 - src.alpha());