diff --git a/config.tests/arch/arch.cpp b/config.tests/arch/arch.cpp index 44ec7214320..334e48f30d3 100644 --- a/config.tests/arch/arch.cpp +++ b/config.tests/arch/arch.cpp @@ -239,7 +239,7 @@ const char msg2[] = "==Qt=magic=Qt== Sub-architecture:" #endif // -- ARM -- -#if defined(__ARM_NEON) || defined(__ARM_NEON__) +#if defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(_M_ARM64) " neon" #endif #ifdef __IWMMXT__ diff --git a/src/corelib/global/qsimd.cpp b/src/corelib/global/qsimd.cpp index 2bd1aed573e..1d3214b273e 100644 --- a/src/corelib/global/qsimd.cpp +++ b/src/corelib/global/qsimd.cpp @@ -151,7 +151,7 @@ static inline quint64 detectProcessorFeatures() features |= CpuFeatureAES; return features; #endif -#if defined(__ARM_NEON__) || defined(__ARM_NEON) +#if defined(__ARM_NEON__) features |= CpuFeatureNEON; #endif #if defined(__ARM_FEATURE_CRC32) diff --git a/src/corelib/global/qsimd.h b/src/corelib/global/qsimd.h index 4ef925ca337..8ee3e9b15d8 100644 --- a/src/corelib/global/qsimd.h +++ b/src/corelib/global/qsimd.h @@ -34,7 +34,7 @@ #define QT_COMPILER_USES(feature) (1/QT_COMPILER_USES_##feature == 1) -#if defined(Q_PROCESSOR_ARM) && defined(__ARM_NEON) || defined(__ARM_NEON__) +#if defined(Q_PROCESSOR_ARM) && defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(_M_ARM64) # include # define QT_COMPILER_USES_neon 1 #else diff --git a/src/corelib/global/qsimd_p.h b/src/corelib/global/qsimd_p.h index 012eb6cf4f4..b9cd296c9e8 100644 --- a/src/corelib/global/qsimd_p.h +++ b/src/corelib/global/qsimd_p.h @@ -257,7 +257,7 @@ static_assert(ARCH_SKX_MACROS, "Undeclared identifiers indicate which features a // NEON intrinsics // note: as of GCC 4.9, does not support function targets for ARM -#if defined(__ARM_NEON) || defined(__ARM_NEON__) +#if defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(_M_ARM64) #if defined(Q_CC_CLANG) #define QT_FUNCTION_TARGET_STRING_NEON "neon" #else diff --git a/src/corelib/text/qstring.cpp b/src/corelib/text/qstring.cpp index 9a09675eb7e..c4c4c528ccd 100644 --- a/src/corelib/text/qstring.cpp +++ b/src/corelib/text/qstring.cpp @@ -749,7 +749,11 @@ const char16_t *QtPrivate::qustrchr(QStringView str, char16_t c) noexcept [=](qsizetype i) { return n + i; }); # endif #elif defined(__ARM_NEON__) +#ifdef _MSC_VER + const uint16x8_t vmask = { 0x0008000400020001ULL, 0x0080004000200010ULL }; +#else const uint16x8_t vmask = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 }; +#endif const uint16x8_t ch_vec = vdupq_n_u16(c); for (const char16_t *next = n + 8; next <= e; n = next, next += 8) { uint16x8_t data = vld1q_u16(reinterpret_cast(n)); @@ -1290,7 +1294,11 @@ static int ucstrncmp(const char16_t *a, const char16_t *b, size_t l) # elif defined(__ARM_NEON__) if (l >= 8) { const char16_t *end = a + l; +#ifdef _MSC_VER + const uint16x8_t mask = { 0x0008000400020001ULL, 0x0080004000200010ULL }; +#else const uint16x8_t mask = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 }; +#endif while (end - a > 7) { uint16x8_t da = vld1q_u16(reinterpret_cast(a)); uint16x8_t db = vld1q_u16(reinterpret_cast(b)); diff --git a/src/corelib/text/qstringconverter.cpp b/src/corelib/text/qstringconverter.cpp index 9fc3318d716..57e0e30d9c8 100644 --- a/src/corelib/text/qstringconverter.cpp +++ b/src/corelib/text/qstringconverter.cpp @@ -351,7 +351,12 @@ static void simdCompareAscii(const qchar8_t *&src8, const qchar8_t *end8, const static inline bool simdEncodeAscii(uchar *&dst, const char16_t *&nextAscii, const char16_t *&src, const char16_t *end) { uint16x8_t maxAscii = vdupq_n_u16(0x7f); - uint16x8_t mask1 = { 1, 1 << 2, 1 << 4, 1 << 6, 1 << 8, 1 << 10, 1 << 12, 1 << 14 }; +#ifdef _MSC_VER + uint16_t mask1t[8] = { 1, 1 << 2, 1 << 4, 1 << 6, 1 << 8, 1 << 10, 1 << 12, 1 << 14 }; + uint16x8_t mask1 = vld1q_u16(mask1t); +#else + uint16x8_t mask1 = { 1, 1 << 2, 1 << 4, 1 << 6, 1 << 8, 1 << 10, 1 << 12, 1 << 14 }; +#endif uint16x8_t mask2 = vshlq_n_u16(mask1, 1); // do sixteen characters at a time @@ -389,7 +394,12 @@ static inline bool simdDecodeAscii(char16_t *&dst, const uchar *&nextAscii, cons { // do eight characters at a time uint8x8_t msb_mask = vdup_n_u8(0x80); +#ifdef _MSC_VER + uint8_t add_maskt[8] = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 }; + uint8x8_t add_mask = vld1_u8(add_maskt); +#else uint8x8_t add_mask = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 }; +#endif for ( ; end - src >= 8; src += 8, dst += 8) { uint8x8_t c = vld1_u8(src); uint8_t n = vaddv_u8(vand_u8(vcge_u8(c, msb_mask), add_mask)); @@ -425,7 +435,12 @@ static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end, // do eight characters at a time uint8x8_t msb_mask = vdup_n_u8(0x80); +#ifdef _MSC_VER + uint8_t add_maskt[8] = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 }; + uint8x8_t add_mask = vld1_u8(add_maskt); +#else uint8x8_t add_mask = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 }; +#endif for ( ; end - src >= 8; src += 8) { uint8x8_t c = vld1_u8(src); uint8_t n = vaddv_u8(vand_u8(vcge_u8(c, msb_mask), add_mask)); diff --git a/src/gui/painting/qcolortransform.cpp b/src/gui/painting/qcolortransform.cpp index 52fbebfa5db..14200ccf9d5 100644 --- a/src/gui/painting/qcolortransform.cpp +++ b/src/gui/painting/qcolortransform.cpp @@ -662,18 +662,27 @@ static inline bool test_all_zero(uint32x4_t p) #endif } +static inline uint32x4_t vsetq_u32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) +{ +#ifdef _MSC_VER + return uint32x4_t{ (uint64_t(b) << 32) | a, (uint64_t(d) << 32) | c }; +#else + return uint32x4_t{ a, b, c, d }; +#endif +} + template static void loadPremultiplied(QColorVector *buffer, const T *src, const qsizetype len, const QColorTransformPrivate *d_ptr) { constexpr bool isARGB = isArgb(); const float iFF00 = 1.0f / (255 * 256); - const uint32x4_t vRangeMax = { - isARGB ? d_ptr->colorSpaceIn->lut[2]->m_unclampedToLinear - : d_ptr->colorSpaceIn->lut[0]->m_unclampedToLinear, - d_ptr->colorSpaceIn->lut[1]->m_unclampedToLinear, - isARGB ? d_ptr->colorSpaceIn->lut[0]->m_unclampedToLinear - : d_ptr->colorSpaceIn->lut[2]->m_unclampedToLinear, - QColorTrcLut::Resolution }; + const uint32x4_t vRangeMax = vsetq_u32( + isARGB ? d_ptr->colorSpaceIn->lut[2]->m_unclampedToLinear + : d_ptr->colorSpaceIn->lut[0]->m_unclampedToLinear, + d_ptr->colorSpaceIn->lut[1]->m_unclampedToLinear, + isARGB ? d_ptr->colorSpaceIn->lut[0]->m_unclampedToLinear + : d_ptr->colorSpaceIn->lut[2]->m_unclampedToLinear, + QColorTrcLut::Resolution); for (qsizetype i = 0; i < len; ++i) { uint32x4_t v; loadP(src[i], v); @@ -740,13 +749,13 @@ void loadUnpremultiplied(QColorVector *buffer, const T *src, const qsizetype len { constexpr bool isARGB = isArgb(); const float iFF00 = 1.0f / (255 * 256); - const uint32x4_t vRangeMax = { - isARGB ? d_ptr->colorSpaceIn->lut[2]->m_unclampedToLinear - : d_ptr->colorSpaceIn->lut[0]->m_unclampedToLinear, - d_ptr->colorSpaceIn->lut[1]->m_unclampedToLinear, - isARGB ? d_ptr->colorSpaceIn->lut[0]->m_unclampedToLinear - : d_ptr->colorSpaceIn->lut[2]->m_unclampedToLinear, - QColorTrcLut::Resolution }; + const uint32x4_t vRangeMax = vsetq_u32( + isARGB ? d_ptr->colorSpaceIn->lut[2]->m_unclampedToLinear + : d_ptr->colorSpaceIn->lut[0]->m_unclampedToLinear, + d_ptr->colorSpaceIn->lut[1]->m_unclampedToLinear, + isARGB ? d_ptr->colorSpaceIn->lut[0]->m_unclampedToLinear + : d_ptr->colorSpaceIn->lut[2]->m_unclampedToLinear, + QColorTrcLut::Resolution); for (qsizetype i = 0; i < len; ++i) { uint32x4_t v; loadPU(src[i], v); diff --git a/src/gui/painting/qcolortrclut_p.h b/src/gui/painting/qcolortrclut_p.h index 220b504a83e..da5e9e60793 100644 --- a/src/gui/painting/qcolortrclut_p.h +++ b/src/gui/painting/qcolortrclut_p.h @@ -18,13 +18,14 @@ #include #include #include +#include #include #include #if defined(__SSE2__) #include -#elif defined(__ARM_NEON__) || defined(__ARM_NEON) +#elif defined(__ARM_NEON__) #include #endif @@ -75,7 +76,7 @@ public: QRgba64 rgba64; _mm_storel_epi64(reinterpret_cast<__m128i *>(&rgba64), v); return rgba64; -#elif (defined(__ARM_NEON__) || defined(__ARM_NEON)) && Q_BYTE_ORDER == Q_LITTLE_ENDIAN +#elif defined(__ARM_NEON__) && Q_BYTE_ORDER == Q_LITTLE_ENDIAN uint8x8_t v8 = vreinterpret_u8_u32(vmov_n_u32(rgb32)); uint16x4_t v16 = vget_low_u16(vmovl_u8(v8)); const uint16x4_t vidx = vshl_n_u16(v16, ShiftUp); @@ -145,7 +146,7 @@ public: v = _mm_srli_epi16(v, 8); v = _mm_packus_epi16(v, v); return _mm_cvtsi128_si32(v); -#elif (defined(__ARM_NEON__) || defined(__ARM_NEON)) && Q_BYTE_ORDER == Q_LITTLE_ENDIAN +#elif defined(__ARM_NEON__) && Q_BYTE_ORDER == Q_LITTLE_ENDIAN uint16x4_t v = vreinterpret_u16_u64(vmov_n_u64(rgb64)); v = vsub_u16(v, vshr_n_u16(v, 8)); const uint16x4_t vidx = vshr_n_u16(v, ShiftDown); @@ -236,7 +237,7 @@ private: QRgba64 rgba64; _mm_storel_epi64(reinterpret_cast<__m128i *>(&rgba64), v); return rgba64; -#elif (defined(__ARM_NEON__) || defined(__ARM_NEON)) && Q_BYTE_ORDER == Q_LITTLE_ENDIAN +#elif defined(__ARM_NEON__) && Q_BYTE_ORDER == Q_LITTLE_ENDIAN uint16x4_t v = vreinterpret_u16_u64(vmov_n_u64(rgb64)); v = vsub_u16(v, vshr_n_u16(v, 8)); const uint16x4_t vidx = vshr_n_u16(v, ShiftDown); diff --git a/src/gui/painting/qdrawhelper_neon.cpp b/src/gui/painting/qdrawhelper_neon.cpp index 1fceb83710b..f8de0a85116 100644 --- a/src/gui/painting/qdrawhelper_neon.cpp +++ b/src/gui/painting/qdrawhelper_neon.cpp @@ -1008,8 +1008,18 @@ void qt_memrotate270_16_neon(const uchar *srcPixels, int w, int h, class QSimdNeon { public: - typedef int32x4_t Int32x4; - typedef float32x4_t Float32x4; + struct Int32x4 { + Int32x4() = default; + Int32x4(int32x4_t v) : v(v) {} + int32x4_t v; + operator int32x4_t() const { return v; } + }; + struct Float32x4 { + Float32x4() = default; + Float32x4(float32x4_t v) : v(v) {}; + float32x4_t v; + operator float32x4_t() const { return v; } + }; union Vect_buffer_i { Int32x4 v; int i[4]; }; union Vect_buffer_f { Float32x4 v; float f[4]; }; @@ -1059,7 +1069,9 @@ const uint * QT_FASTCALL qt_fetchUntransformed_888_neon(uint *buffer, const Oper #if Q_BYTE_ORDER == Q_LITTLE_ENDIAN static inline uint32x4_t vrgba2argb(uint32x4_t srcVector) { -#if defined(Q_PROCESSOR_ARM_64) +#if defined(Q_PROCESSOR_ARM_64) && defined(_MSC_VER) + const uint8x16_t rgbaMask = { 0x0704050603000102ULL, 0x0F0C0D0E0B08090AULL }; +#elif defined(Q_PROCESSOR_ARM_64) const uint8x16_t rgbaMask = { 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15}; #else const uint8x8_t rgbaMask = { 2, 1, 0, 3, 6, 5, 4, 7 }; @@ -1079,7 +1091,11 @@ template static inline void convertARGBToARGB32PM_neon(uint *buffer, const uint *src, int count) { int i = 0; +#ifdef _MSC_VER + const uint8x8_t shuffleMask = { 0x0707070703030303ULL }; +#else const uint8x8_t shuffleMask = { 3, 3, 3, 3, 7, 7, 7, 7}; +#endif const uint32x4_t blendMask = vdupq_n_u32(0xff000000); for (; i < count - 3; i += 4) { @@ -1131,7 +1147,11 @@ static inline void convertARGB32ToRGBA64PM_neon(QRgba64 *buffer, const uint *src if (count <= 0) return; +#ifdef _MSC_VER + const uint8x8_t shuffleMask = { 0x0707070703030303ULL }; +#else const uint8x8_t shuffleMask = { 3, 3, 3, 3, 7, 7, 7, 7}; +#endif const uint64x2_t blendMask = vdupq_n_u64(Q_UINT64_C(0xffff000000000000)); int i = 0; diff --git a/src/gui/painting/qpixellayout.cpp b/src/gui/painting/qpixellayout.cpp index 4f2f0ae13a0..31646d2f23d 100644 --- a/src/gui/painting/qpixellayout.cpp +++ b/src/gui/painting/qpixellayout.cpp @@ -1087,7 +1087,9 @@ static inline void qConvertARGB32PMToRGBA64PM_neon(QRgba64 *buffer, const uint * return; const uint32x4_t amask = vdupq_n_u32(0xff000000); -#if defined(Q_PROCESSOR_ARM_64) +#if defined(Q_PROCESSOR_ARM_64) && defined(_MSC_VER) + const uint8x16_t rgbaMask = { 0x0704050603000102ULL, 0x0F0C0D0E0B08090AULL }; +#elif defined(Q_PROCESSOR_ARM_64) const uint8x16_t rgbaMask = { 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15}; #else const uint8x8_t rgbaMask = { 2, 1, 0, 3, 6, 5, 4, 7 }; diff --git a/src/gui/painting/qrgba64_p.h b/src/gui/painting/qrgba64_p.h index 6fd7d7ff2c1..6809f1d52cf 100644 --- a/src/gui/painting/qrgba64_p.h +++ b/src/gui/painting/qrgba64_p.h @@ -232,7 +232,11 @@ static inline uint toArgb32(QRgba64 rgba64) #elif defined __ARM_NEON__ uint16x4_t v = vreinterpret_u16_u64(vld1_u64(reinterpret_cast(&rgba64))); #if Q_BYTE_ORDER == Q_LITTLE_ENDIAN +#ifdef _MSC_VER + const uint8x8_t shuffleMask = { 0x0706010003020504ULL }; +#else const uint8x8_t shuffleMask = { 4, 5, 2, 3, 0, 1, 6, 7 }; +#endif v = vreinterpret_u16_u8(vtbl1_u8(vreinterpret_u8_u16(v), shuffleMask)); #else v = vext_u16(v, v, 3);