Clean-up MSVC ARM64 NEON support
Also fixing clang-cl support. Pick-to: 6.8.0 Change-Id: If2130091edfadc0cc4d4cecd95c2256522efc69a Reviewed-by: Maurice Kalinowski <maurice.kalinowski@qt.io> Reviewed-by: Eirik Aavitsland <eirik.aavitsland@qt.io> (cherry picked from commit 308bca94a72f83624e2e2c92449719e06940e77f) Reviewed-by: Qt Cherry-pick Bot <cherrypick_bot@qt-project.org>
This commit is contained in:
parent
0260c0d9f5
commit
4be72074a9
@ -283,6 +283,59 @@ inline uint8_t vaddv_u8(uint8x8_t v8)
|
||||
}
|
||||
#endif
|
||||
|
||||
// Missing NEON intrinsics, needed due different type definitions:
|
||||
inline uint16x8_t qvsetq_n_u16(uint16_t v1, uint16_t v2, uint16_t v3, uint16_t v4,
|
||||
uint16_t v5, uint16_t v6, uint16_t v7, uint16_t v8) {
|
||||
#if defined(Q_CC_MSVC) && !defined(Q_CC_CLANG)
|
||||
using u64 = uint64_t;
|
||||
const uint16x8_t vmask = {
|
||||
v1 | (v2 << 16) | (u64(v3) << 32) | (u64(v4) << 48),
|
||||
v5 | (v6 << 16) | (u64(v7) << 32) | (u64(v8) << 48)
|
||||
};
|
||||
#else
|
||||
const uint16x8_t vmask = { v1, v2, v3, v4, v5, v6, v7, v8 };
|
||||
#endif
|
||||
return vmask;
|
||||
}
|
||||
inline uint8x8_t qvset_n_u8(uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4,
|
||||
uint8_t v5, uint8_t v6, uint8_t v7, uint8_t v8) {
|
||||
#if defined(Q_CC_MSVC) && !defined(Q_CC_CLANG)
|
||||
using u64 = uint64_t;
|
||||
const uint8x8_t vmask = {
|
||||
v1 | (v2 << 8) | (v3 << 16) | (v4 << 24) |
|
||||
(u64(v5) << 32) | (u64(v6) << 40) | (u64(v7) << 48) | (u64(v8) << 56)
|
||||
};
|
||||
#else
|
||||
const uint8x8_t vmask = { v1, v2, v3, v4, v5, v6, v7, v8 };
|
||||
#endif
|
||||
return vmask;
|
||||
}
|
||||
inline uint8x16_t qvsetq_n_u8(uint8_t v1, uint8_t v2, uint8_t v3, uint8_t v4,
|
||||
uint8_t v5, uint8_t v6, uint8_t v7, uint8_t v8,
|
||||
uint8_t v9, uint8_t v10, uint8_t v11, uint8_t v12,
|
||||
uint8_t v13, uint8_t v14, uint8_t v15, uint8_t v16) {
|
||||
#if defined(Q_CC_MSVC) && !defined(Q_CC_CLANG)
|
||||
using u64 = uint64_t;
|
||||
const uint8x16_t vmask = {
|
||||
v1 | (v2 << 8) | (v3 << 16) | (v4 << 24) |
|
||||
(u64(v5) << 32) | (u64(v6) << 40) | (u64(v7) << 48) | (u64(v8) << 56),
|
||||
v9 | (v10 << 8) | (v11 << 16) | (v12 << 24) |
|
||||
(u64(v13) << 32) | (u64(v14) << 40) | (u64(v15) << 48) | (u64(v16) << 56)
|
||||
};
|
||||
#else
|
||||
const uint8x16_t vmask = { v1, v2, v3, v4, v5, v6, v7, v8,
|
||||
v9, v10, v11, v12, v13, v14, v15, v16};
|
||||
#endif
|
||||
return vmask;
|
||||
}
|
||||
inline uint32x4_t qvsetq_n_u32(uint32_t a, uint32_t b, uint32_t c, uint32_t d)
|
||||
{
|
||||
#if defined(Q_CC_MSVC) && !defined(Q_CC_CLANG)
|
||||
return uint32x4_t{ (uint64_t(b) << 32) | a, (uint64_t(d) << 32) | c };
|
||||
#else
|
||||
return uint32x4_t{ a, b, c, d };
|
||||
#endif
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(Q_PROCESSOR_ARM) && defined(__ARM_FEATURE_CRC32)
|
||||
|
@ -749,11 +749,7 @@ const char16_t *QtPrivate::qustrchr(QStringView str, char16_t c) noexcept
|
||||
[=](qsizetype i) { return n + i; });
|
||||
# endif
|
||||
#elif defined(__ARM_NEON__)
|
||||
#ifdef _MSC_VER
|
||||
const uint16x8_t vmask = { 0x0008000400020001ULL, 0x0080004000200010ULL };
|
||||
#else
|
||||
const uint16x8_t vmask = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 };
|
||||
#endif
|
||||
const uint16x8_t vmask = qvsetq_n_u16(1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7);
|
||||
const uint16x8_t ch_vec = vdupq_n_u16(c);
|
||||
for (const char16_t *next = n + 8; next <= e; n = next, next += 8) {
|
||||
uint16x8_t data = vld1q_u16(reinterpret_cast<const uint16_t *>(n));
|
||||
@ -1294,11 +1290,7 @@ static int ucstrncmp(const char16_t *a, const char16_t *b, size_t l)
|
||||
# elif defined(__ARM_NEON__)
|
||||
if (l >= 8) {
|
||||
const char16_t *end = a + l;
|
||||
#ifdef _MSC_VER
|
||||
const uint16x8_t mask = { 0x0008000400020001ULL, 0x0080004000200010ULL };
|
||||
#else
|
||||
const uint16x8_t mask = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 };
|
||||
#endif
|
||||
const uint16x8_t mask = qvsetq_n_u16( 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 );
|
||||
while (end - a > 7) {
|
||||
uint16x8_t da = vld1q_u16(reinterpret_cast<const uint16_t *>(a));
|
||||
uint16x8_t db = vld1q_u16(reinterpret_cast<const uint16_t *>(b));
|
||||
|
@ -351,12 +351,7 @@ static void simdCompareAscii(const qchar8_t *&src8, const qchar8_t *end8, const
|
||||
static inline bool simdEncodeAscii(uchar *&dst, const char16_t *&nextAscii, const char16_t *&src, const char16_t *end)
|
||||
{
|
||||
uint16x8_t maxAscii = vdupq_n_u16(0x7f);
|
||||
#ifdef _MSC_VER
|
||||
uint16_t mask1t[8] = { 1, 1 << 2, 1 << 4, 1 << 6, 1 << 8, 1 << 10, 1 << 12, 1 << 14 };
|
||||
uint16x8_t mask1 = vld1q_u16(mask1t);
|
||||
#else
|
||||
uint16x8_t mask1 = { 1, 1 << 2, 1 << 4, 1 << 6, 1 << 8, 1 << 10, 1 << 12, 1 << 14 };
|
||||
#endif
|
||||
uint16x8_t mask1 = qvsetq_n_u16(1, 1 << 2, 1 << 4, 1 << 6, 1 << 8, 1 << 10, 1 << 12, 1 << 14 );
|
||||
uint16x8_t mask2 = vshlq_n_u16(mask1, 1);
|
||||
|
||||
// do sixteen characters at a time
|
||||
@ -394,12 +389,7 @@ static inline bool simdDecodeAscii(char16_t *&dst, const uchar *&nextAscii, cons
|
||||
{
|
||||
// do eight characters at a time
|
||||
uint8x8_t msb_mask = vdup_n_u8(0x80);
|
||||
#ifdef _MSC_VER
|
||||
uint8_t add_maskt[8] = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 };
|
||||
uint8x8_t add_mask = vld1_u8(add_maskt);
|
||||
#else
|
||||
uint8x8_t add_mask = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 };
|
||||
#endif
|
||||
uint8x8_t add_mask = qvset_n_u8(1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 );
|
||||
for ( ; end - src >= 8; src += 8, dst += 8) {
|
||||
uint8x8_t c = vld1_u8(src);
|
||||
uint8_t n = vaddv_u8(vand_u8(vcge_u8(c, msb_mask), add_mask));
|
||||
@ -435,12 +425,7 @@ static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end,
|
||||
|
||||
// do eight characters at a time
|
||||
uint8x8_t msb_mask = vdup_n_u8(0x80);
|
||||
#ifdef _MSC_VER
|
||||
uint8_t add_maskt[8] = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 };
|
||||
uint8x8_t add_mask = vld1_u8(add_maskt);
|
||||
#else
|
||||
uint8x8_t add_mask = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 };
|
||||
#endif
|
||||
uint8x8_t add_mask = qvset_n_u8(1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7);
|
||||
for ( ; end - src >= 8; src += 8) {
|
||||
uint8x8_t c = vld1_u8(src);
|
||||
uint8_t n = vaddv_u8(vand_u8(vcge_u8(c, msb_mask), add_mask));
|
||||
|
@ -662,21 +662,12 @@ static inline bool test_all_zero(uint32x4_t p)
|
||||
#endif
|
||||
}
|
||||
|
||||
static inline uint32x4_t vsetq_u32(uint32_t a, uint32_t b, uint32_t c, uint32_t d)
|
||||
{
|
||||
#ifdef _MSC_VER
|
||||
return uint32x4_t{ (uint64_t(b) << 32) | a, (uint64_t(d) << 32) | c };
|
||||
#else
|
||||
return uint32x4_t{ a, b, c, d };
|
||||
#endif
|
||||
}
|
||||
|
||||
template<typename T>
|
||||
static void loadPremultiplied(QColorVector *buffer, const T *src, const qsizetype len, const QColorTransformPrivate *d_ptr)
|
||||
{
|
||||
constexpr bool isARGB = isArgb<T>();
|
||||
const float iFF00 = 1.0f / (255 * 256);
|
||||
const uint32x4_t vRangeMax = vsetq_u32(
|
||||
const uint32x4_t vRangeMax = qvsetq_n_u32(
|
||||
isARGB ? d_ptr->colorSpaceIn->lut[2]->m_unclampedToLinear
|
||||
: d_ptr->colorSpaceIn->lut[0]->m_unclampedToLinear,
|
||||
d_ptr->colorSpaceIn->lut[1]->m_unclampedToLinear,
|
||||
@ -749,7 +740,7 @@ void loadUnpremultiplied(QColorVector *buffer, const T *src, const qsizetype len
|
||||
{
|
||||
constexpr bool isARGB = isArgb<T>();
|
||||
const float iFF00 = 1.0f / (255 * 256);
|
||||
const uint32x4_t vRangeMax = vsetq_u32(
|
||||
const uint32x4_t vRangeMax = qvsetq_n_u32(
|
||||
isARGB ? d_ptr->colorSpaceIn->lut[2]->m_unclampedToLinear
|
||||
: d_ptr->colorSpaceIn->lut[0]->m_unclampedToLinear,
|
||||
d_ptr->colorSpaceIn->lut[1]->m_unclampedToLinear,
|
||||
|
@ -1069,12 +1069,10 @@ const uint * QT_FASTCALL qt_fetchUntransformed_888_neon(uint *buffer, const Oper
|
||||
#if Q_BYTE_ORDER == Q_LITTLE_ENDIAN
|
||||
static inline uint32x4_t vrgba2argb(uint32x4_t srcVector)
|
||||
{
|
||||
#if defined(Q_PROCESSOR_ARM_64) && defined(_MSC_VER)
|
||||
const uint8x16_t rgbaMask = { 0x0704050603000102ULL, 0x0F0C0D0E0B08090AULL };
|
||||
#elif defined(Q_PROCESSOR_ARM_64)
|
||||
const uint8x16_t rgbaMask = { 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15};
|
||||
#if defined(Q_PROCESSOR_ARM_64)
|
||||
const uint8x16_t rgbaMask = qvsetq_n_u8(2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15);
|
||||
#else
|
||||
const uint8x8_t rgbaMask = { 2, 1, 0, 3, 6, 5, 4, 7 };
|
||||
const uint8x8_t rgbaMask = qvset_n_u8(2, 1, 0, 3, 6, 5, 4, 7);
|
||||
#endif
|
||||
#if defined(Q_PROCESSOR_ARM_64)
|
||||
srcVector = vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(srcVector), rgbaMask));
|
||||
@ -1091,11 +1089,7 @@ template<bool RGBA>
|
||||
static inline void convertARGBToARGB32PM_neon(uint *buffer, const uint *src, int count)
|
||||
{
|
||||
int i = 0;
|
||||
#ifdef _MSC_VER
|
||||
const uint8x8_t shuffleMask = { 0x0707070703030303ULL };
|
||||
#else
|
||||
const uint8x8_t shuffleMask = { 3, 3, 3, 3, 7, 7, 7, 7};
|
||||
#endif
|
||||
const uint8x8_t shuffleMask = qvset_n_u8(3, 3, 3, 3, 7, 7, 7, 7);
|
||||
const uint32x4_t blendMask = vdupq_n_u32(0xff000000);
|
||||
|
||||
for (; i < count - 3; i += 4) {
|
||||
@ -1147,11 +1141,7 @@ static inline void convertARGB32ToRGBA64PM_neon(QRgba64 *buffer, const uint *src
|
||||
if (count <= 0)
|
||||
return;
|
||||
|
||||
#ifdef _MSC_VER
|
||||
const uint8x8_t shuffleMask = { 0x0707070703030303ULL };
|
||||
#else
|
||||
const uint8x8_t shuffleMask = { 3, 3, 3, 3, 7, 7, 7, 7};
|
||||
#endif
|
||||
const uint8x8_t shuffleMask = qvset_n_u8(3, 3, 3, 3, 7, 7, 7, 7);
|
||||
const uint64x2_t blendMask = vdupq_n_u64(Q_UINT64_C(0xffff000000000000));
|
||||
|
||||
int i = 0;
|
||||
|
@ -1087,12 +1087,10 @@ static inline void qConvertARGB32PMToRGBA64PM_neon(QRgba64 *buffer, const uint *
|
||||
return;
|
||||
|
||||
const uint32x4_t amask = vdupq_n_u32(0xff000000);
|
||||
#if defined(Q_PROCESSOR_ARM_64) && defined(_MSC_VER)
|
||||
const uint8x16_t rgbaMask = { 0x0704050603000102ULL, 0x0F0C0D0E0B08090AULL };
|
||||
#elif defined(Q_PROCESSOR_ARM_64)
|
||||
const uint8x16_t rgbaMask = { 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15};
|
||||
#if defined(Q_PROCESSOR_ARM_64)
|
||||
const uint8x16_t rgbaMask = qvsetq_n_u8(2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15);
|
||||
#else
|
||||
const uint8x8_t rgbaMask = { 2, 1, 0, 3, 6, 5, 4, 7 };
|
||||
const uint8x8_t rgbaMask = qvset_n_u8(2, 1, 0, 3, 6, 5, 4, 7);
|
||||
#endif
|
||||
int i = 0;
|
||||
for (; i < count-3; i += 4) {
|
||||
|
@ -232,11 +232,7 @@ static inline uint toArgb32(QRgba64 rgba64)
|
||||
#elif defined __ARM_NEON__
|
||||
uint16x4_t v = vreinterpret_u16_u64(vld1_u64(reinterpret_cast<const uint64_t *>(&rgba64)));
|
||||
#if Q_BYTE_ORDER == Q_LITTLE_ENDIAN
|
||||
#ifdef _MSC_VER
|
||||
const uint8x8_t shuffleMask = { 0x0706010003020504ULL };
|
||||
#else
|
||||
const uint8x8_t shuffleMask = { 4, 5, 2, 3, 0, 1, 6, 7 };
|
||||
#endif
|
||||
const uint8x8_t shuffleMask = qvset_n_u8(4, 5, 2, 3, 0, 1, 6, 7);
|
||||
v = vreinterpret_u16_u8(vtbl1_u8(vreinterpret_u8_u16(v), shuffleMask));
|
||||
#else
|
||||
v = vext_u16(v, v, 3);
|
||||
|
Loading…
x
Reference in New Issue
Block a user