Fix ARM NEON detection on MSVC arm64

MSVC doesn't define __ARM_NEON, but neon is a necessary part of arm64, so will always be there. At the same time fix the NEON code for MSVC which has a different idea of what the intrinsics types are, even if all the intrinsic functions are the same. This has two consequences: 1. Since NEON lacks construction intrinsics except duplication, NEON mask constants needs to be built differently. 2. Since MSVC has all the NEON types aliases of the same underlying type, QSimdNeon cant do the same type based dispatch as before. Fixes: QTBUG-127646 Change-Id: I8038bb6bb4557e8ce29e3844f2742a97b4489818 Reviewed-by: Oliver Wolff <oliver.wolff@qt.io> (cherry picked from commit 572aa7caa04e85cbc07b18e1d0c720038facbf83) Reviewed-by: Qt Cherry-pick Bot <cherrypick_bot@qt-project.org>
2024-08-07 14:33:51 +02:00 · 2024-08-07 14:33:51 +02:00 · 9017e95858
commit 9017e95858
parent 121639c53e
11 changed files with 86 additions and 27 deletions
--- a/config.tests/arch/arch.cpp
+++ b/config.tests/arch/arch.cpp
@ -239,7 +239,7 @@ const char msg2[] = "==Qt=magic=Qt== Sub-architecture:"
 #endif

 // -- ARM --
-#if defined(__ARM_NEON) || defined(__ARM_NEON__)
+#if defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(_M_ARM64)
 " neon"
 #endif
 #ifdef __IWMMXT__
--- a/src/corelib/global/qsimd.cpp
+++ b/src/corelib/global/qsimd.cpp
@ -151,7 +151,7 @@ static inline quint64 detectProcessorFeatures()
        features |= CpuFeatureAES;
    return features;
 #endif
-#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+#if defined(__ARM_NEON__)
    features |= CpuFeatureNEON;
 #endif
 #if defined(__ARM_FEATURE_CRC32)
--- a/src/corelib/global/qsimd.h
+++ b/src/corelib/global/qsimd.h
@ -34,7 +34,7 @@

 #define QT_COMPILER_USES(feature) (1/QT_COMPILER_USES_##feature == 1)

-#if defined(Q_PROCESSOR_ARM) && defined(__ARM_NEON) || defined(__ARM_NEON__)
+#if defined(Q_PROCESSOR_ARM) && defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(_M_ARM64)
 #  include <arm_neon.h>
 #  define QT_COMPILER_USES_neon 1
 #else
--- a/src/corelib/global/qsimd_p.h
+++ b/src/corelib/global/qsimd_p.h
@ -257,7 +257,7 @@ static_assert(ARCH_SKX_MACROS, "Undeclared identifiers indicate which features a

 // NEON intrinsics
 // note: as of GCC 4.9, does not support function targets for ARM
-#if defined(__ARM_NEON) || defined(__ARM_NEON__)
+#if defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(_M_ARM64)
 #if defined(Q_CC_CLANG)
 #define QT_FUNCTION_TARGET_STRING_NEON      "neon"
 #else
--- a/src/corelib/text/qstring.cpp
+++ b/src/corelib/text/qstring.cpp
@ -749,7 +749,11 @@ const char16_t *QtPrivate::qustrchr(QStringView str, char16_t c) noexcept
                                   [=](qsizetype i) { return n + i; });
 #  endif
 #elif defined(__ARM_NEON__)
+#ifdef _MSC_VER
+    const uint16x8_t vmask = { 0x0008000400020001ULL, 0x0080004000200010ULL };
+#else
    const uint16x8_t vmask = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 };
+#endif
    const uint16x8_t ch_vec = vdupq_n_u16(c);
    for (const char16_t *next = n + 8; next <= e; n = next, next += 8) {
        uint16x8_t data = vld1q_u16(reinterpret_cast<const uint16_t *>(n));
@ -1290,7 +1294,11 @@ static int ucstrncmp(const char16_t *a, const char16_t *b, size_t l)
 #  elif defined(__ARM_NEON__)
    if (l >= 8) {
        const char16_t *end = a + l;
+#ifdef _MSC_VER
+        const uint16x8_t mask = { 0x0008000400020001ULL, 0x0080004000200010ULL };
+#else
        const uint16x8_t mask = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 };
+#endif
        while (end - a > 7) {
            uint16x8_t da = vld1q_u16(reinterpret_cast<const uint16_t *>(a));
            uint16x8_t db = vld1q_u16(reinterpret_cast<const uint16_t *>(b));
--- a/src/corelib/text/qstringconverter.cpp
+++ b/src/corelib/text/qstringconverter.cpp
@ -351,7 +351,12 @@ static void simdCompareAscii(const qchar8_t *&src8, const qchar8_t *end8, const
 static inline bool simdEncodeAscii(uchar *&dst, const char16_t *&nextAscii, const char16_t *&src, const char16_t *end)
 {
    uint16x8_t maxAscii = vdupq_n_u16(0x7f);
-    uint16x8_t mask1 = { 1,      1 << 2, 1 << 4, 1 << 6, 1 << 8, 1 << 10, 1 << 12, 1 << 14 };
+#ifdef _MSC_VER
+    uint16_t mask1t[8] = { 1, 1 << 2, 1 << 4, 1 << 6, 1 << 8, 1 << 10, 1 << 12, 1 << 14 };
+    uint16x8_t mask1 = vld1q_u16(mask1t);
+#else
+    uint16x8_t mask1 = { 1, 1 << 2, 1 << 4, 1 << 6, 1 << 8, 1 << 10, 1 << 12, 1 << 14 };
+#endif
    uint16x8_t mask2 = vshlq_n_u16(mask1, 1);

    // do sixteen characters at a time
@ -389,7 +394,12 @@ static inline bool simdDecodeAscii(char16_t *&dst, const uchar *&nextAscii, cons
 {
    // do eight characters at a time
    uint8x8_t msb_mask = vdup_n_u8(0x80);
+#ifdef _MSC_VER
+    uint8_t add_maskt[8] = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 };
+    uint8x8_t add_mask = vld1_u8(add_maskt);
+#else
    uint8x8_t add_mask = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 };
+#endif
    for ( ; end - src >= 8; src += 8, dst += 8) {
        uint8x8_t c = vld1_u8(src);
        uint8_t n = vaddv_u8(vand_u8(vcge_u8(c, msb_mask), add_mask));
@ -425,7 +435,12 @@ static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end,

    // do eight characters at a time
    uint8x8_t msb_mask = vdup_n_u8(0x80);
+#ifdef _MSC_VER
+    uint8_t add_maskt[8] = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 };
+    uint8x8_t add_mask = vld1_u8(add_maskt);
+#else
    uint8x8_t add_mask = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 };
+#endif
    for ( ; end - src >= 8; src += 8) {
        uint8x8_t c = vld1_u8(src);
        uint8_t n = vaddv_u8(vand_u8(vcge_u8(c, msb_mask), add_mask));
--- a/src/gui/painting/qcolortransform.cpp
+++ b/src/gui/painting/qcolortransform.cpp
@ -662,18 +662,27 @@ static inline bool test_all_zero(uint32x4_t p)
 #endif
 }

+static inline uint32x4_t vsetq_u32(uint32_t a, uint32_t b, uint32_t c, uint32_t d)
+{
+#ifdef _MSC_VER
+    return uint32x4_t{ (uint64_t(b) << 32) | a, (uint64_t(d) << 32) | c };
+#else
+    return uint32x4_t{ a, b, c, d };
+#endif
+}
+
 template<typename T>
 static void loadPremultiplied(QColorVector *buffer, const T *src, const qsizetype len, const QColorTransformPrivate *d_ptr)
 {
    constexpr bool isARGB = isArgb<T>();
    const float iFF00 = 1.0f / (255 * 256);
-    const uint32x4_t vRangeMax = {
-        isARGB ? d_ptr->colorSpaceIn->lut[2]->m_unclampedToLinear
-              : d_ptr->colorSpaceIn->lut[0]->m_unclampedToLinear,
-        d_ptr->colorSpaceIn->lut[1]->m_unclampedToLinear,
-        isARGB ? d_ptr->colorSpaceIn->lut[0]->m_unclampedToLinear
-              : d_ptr->colorSpaceIn->lut[2]->m_unclampedToLinear,
-        QColorTrcLut::Resolution };
+    const uint32x4_t vRangeMax = vsetq_u32(
+            isARGB ? d_ptr->colorSpaceIn->lut[2]->m_unclampedToLinear
+                   : d_ptr->colorSpaceIn->lut[0]->m_unclampedToLinear,
+            d_ptr->colorSpaceIn->lut[1]->m_unclampedToLinear,
+            isARGB ? d_ptr->colorSpaceIn->lut[0]->m_unclampedToLinear
+                   : d_ptr->colorSpaceIn->lut[2]->m_unclampedToLinear,
+            QColorTrcLut::Resolution);
    for (qsizetype i = 0; i < len; ++i) {
        uint32x4_t v;
        loadP<T>(src[i], v);
@ -740,13 +749,13 @@ void loadUnpremultiplied(QColorVector *buffer, const T *src, const qsizetype len
 {
    constexpr bool isARGB = isArgb<T>();
    const float iFF00 = 1.0f / (255 * 256);
-    const uint32x4_t vRangeMax = {
-        isARGB ? d_ptr->colorSpaceIn->lut[2]->m_unclampedToLinear
-               : d_ptr->colorSpaceIn->lut[0]->m_unclampedToLinear,
-        d_ptr->colorSpaceIn->lut[1]->m_unclampedToLinear,
-        isARGB ? d_ptr->colorSpaceIn->lut[0]->m_unclampedToLinear
-               : d_ptr->colorSpaceIn->lut[2]->m_unclampedToLinear,
-        QColorTrcLut::Resolution };
+    const uint32x4_t vRangeMax = vsetq_u32(
+            isARGB ? d_ptr->colorSpaceIn->lut[2]->m_unclampedToLinear
+                   : d_ptr->colorSpaceIn->lut[0]->m_unclampedToLinear,
+            d_ptr->colorSpaceIn->lut[1]->m_unclampedToLinear,
+            isARGB ? d_ptr->colorSpaceIn->lut[0]->m_unclampedToLinear
+                   : d_ptr->colorSpaceIn->lut[2]->m_unclampedToLinear,
+            QColorTrcLut::Resolution);
    for (qsizetype i = 0; i < len; ++i) {
        uint32x4_t v;
        loadPU<T>(src[i], v);
--- a/src/gui/painting/qcolortrclut_p.h
+++ b/src/gui/painting/qcolortrclut_p.h
@ -18,13 +18,14 @@
 #include <QtGui/private/qtguiglobal_p.h>
 #include <QtGui/qrgb.h>
 #include <QtGui/qrgba64.h>
+#include <QtCore/private/qsimd_p.h>

 #include <cmath>
 #include <memory>

 #if defined(__SSE2__)
 #include <emmintrin.h>
-#elif defined(__ARM_NEON__) || defined(__ARM_NEON)
+#elif defined(__ARM_NEON__)
 #include <arm_neon.h>
 #endif

@ -75,7 +76,7 @@ public:
        QRgba64 rgba64;
        _mm_storel_epi64(reinterpret_cast<__m128i *>(&rgba64), v);
        return rgba64;
-#elif (defined(__ARM_NEON__) || defined(__ARM_NEON)) && Q_BYTE_ORDER == Q_LITTLE_ENDIAN
+#elif defined(__ARM_NEON__) && Q_BYTE_ORDER == Q_LITTLE_ENDIAN
        uint8x8_t v8 = vreinterpret_u8_u32(vmov_n_u32(rgb32));
        uint16x4_t v16 = vget_low_u16(vmovl_u8(v8));
        const uint16x4_t vidx = vshl_n_u16(v16, ShiftUp);
@ -145,7 +146,7 @@ public:
        v = _mm_srli_epi16(v, 8);
        v = _mm_packus_epi16(v, v);
        return _mm_cvtsi128_si32(v);
-#elif (defined(__ARM_NEON__) || defined(__ARM_NEON)) && Q_BYTE_ORDER == Q_LITTLE_ENDIAN
+#elif defined(__ARM_NEON__) && Q_BYTE_ORDER == Q_LITTLE_ENDIAN
        uint16x4_t v = vreinterpret_u16_u64(vmov_n_u64(rgb64));
        v = vsub_u16(v, vshr_n_u16(v, 8));
        const uint16x4_t vidx = vshr_n_u16(v, ShiftDown);
@ -236,7 +237,7 @@ private:
        QRgba64 rgba64;
        _mm_storel_epi64(reinterpret_cast<__m128i *>(&rgba64), v);
        return rgba64;
-#elif (defined(__ARM_NEON__) || defined(__ARM_NEON)) && Q_BYTE_ORDER == Q_LITTLE_ENDIAN
+#elif defined(__ARM_NEON__) && Q_BYTE_ORDER == Q_LITTLE_ENDIAN
        uint16x4_t v = vreinterpret_u16_u64(vmov_n_u64(rgb64));
        v = vsub_u16(v, vshr_n_u16(v, 8));
        const uint16x4_t vidx = vshr_n_u16(v, ShiftDown);
--- a/src/gui/painting/qdrawhelper_neon.cpp
+++ b/src/gui/painting/qdrawhelper_neon.cpp
@ -1008,8 +1008,18 @@ void qt_memrotate270_16_neon(const uchar *srcPixels, int w, int h,
 class QSimdNeon
 {
 public:
-    typedef int32x4_t Int32x4;
-    typedef float32x4_t Float32x4;
+    struct Int32x4 {
+        Int32x4() = default;
+        Int32x4(int32x4_t v) : v(v) {}
+        int32x4_t v;
+        operator int32x4_t() const { return v; }
+    };
+    struct Float32x4 {
+        Float32x4() = default;
+        Float32x4(float32x4_t v) : v(v) {};
+        float32x4_t v;
+        operator float32x4_t() const { return v; }
+    };

    union Vect_buffer_i { Int32x4 v; int i[4]; };
    union Vect_buffer_f { Float32x4 v; float f[4]; };
@ -1059,7 +1069,9 @@ const uint * QT_FASTCALL qt_fetchUntransformed_888_neon(uint *buffer, const Oper
 #if Q_BYTE_ORDER == Q_LITTLE_ENDIAN
 static inline uint32x4_t vrgba2argb(uint32x4_t srcVector)
 {
-#if defined(Q_PROCESSOR_ARM_64)
+#if defined(Q_PROCESSOR_ARM_64) && defined(_MSC_VER)
+    const uint8x16_t rgbaMask  = { 0x0704050603000102ULL, 0x0F0C0D0E0B08090AULL };
+#elif defined(Q_PROCESSOR_ARM_64)
    const uint8x16_t rgbaMask  = { 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15};
 #else
    const uint8x8_t rgbaMask  = { 2, 1, 0, 3, 6, 5, 4, 7 };
@ -1079,7 +1091,11 @@ template<bool RGBA>
 static inline void convertARGBToARGB32PM_neon(uint *buffer, const uint *src, int count)
 {
    int i = 0;
+#ifdef _MSC_VER
+    const uint8x8_t shuffleMask = { 0x0707070703030303ULL };
+#else
    const uint8x8_t shuffleMask = { 3, 3, 3, 3, 7, 7, 7, 7};
+#endif
    const uint32x4_t blendMask = vdupq_n_u32(0xff000000);

    for (; i < count - 3; i += 4) {
@ -1131,7 +1147,11 @@ static inline void convertARGB32ToRGBA64PM_neon(QRgba64 *buffer, const uint *src
    if (count <= 0)
        return;

+#ifdef _MSC_VER
+    const uint8x8_t shuffleMask = { 0x0707070703030303ULL };
+#else
    const uint8x8_t shuffleMask = { 3, 3, 3, 3, 7, 7, 7, 7};
+#endif
    const uint64x2_t blendMask = vdupq_n_u64(Q_UINT64_C(0xffff000000000000));

    int i = 0;
--- a/src/gui/painting/qpixellayout.cpp
+++ b/src/gui/painting/qpixellayout.cpp
@ -1087,7 +1087,9 @@ static inline void qConvertARGB32PMToRGBA64PM_neon(QRgba64 *buffer, const uint *
        return;

    const uint32x4_t amask = vdupq_n_u32(0xff000000);
-#if defined(Q_PROCESSOR_ARM_64)
+#if defined(Q_PROCESSOR_ARM_64) && defined(_MSC_VER)
+    const uint8x16_t rgbaMask  = { 0x0704050603000102ULL, 0x0F0C0D0E0B08090AULL };
+#elif defined(Q_PROCESSOR_ARM_64)
    const uint8x16_t rgbaMask  = { 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15};
 #else
    const uint8x8_t rgbaMask  = { 2, 1, 0, 3, 6, 5, 4, 7 };
--- a/src/gui/painting/qrgba64_p.h
+++ b/src/gui/painting/qrgba64_p.h
@ -232,7 +232,11 @@ static inline uint toArgb32(QRgba64 rgba64)
 #elif defined __ARM_NEON__
    uint16x4_t v = vreinterpret_u16_u64(vld1_u64(reinterpret_cast<const uint64_t *>(&rgba64)));
 #if Q_BYTE_ORDER == Q_LITTLE_ENDIAN
+#ifdef _MSC_VER
+    const uint8x8_t shuffleMask = { 0x0706010003020504ULL };
+#else
    const uint8x8_t shuffleMask = { 4, 5, 2, 3, 0, 1, 6, 7 };
+#endif
    v = vreinterpret_u16_u8(vtbl1_u8(vreinterpret_u8_u16(v), shuffleMask));
 #else
    v = vext_u16(v, v, 3);