Fix ARM NEON detection on MSVC arm64

MSVC doesn't define __ARM_NEON, but neon is a necessary part of arm64,
so will always be there.

At the same time fix the NEON code for MSVC which has a different
idea of what the intrinsics types are, even if all the intrinsic
functions are the same. This has two consequences:
1. Since NEON lacks construction intrinsics except duplication, NEON mask constants needs to be built differently.
2. Since MSVC has all the NEON types aliases of the same underlying type, QSimdNeon cant do the same type based dispatch as before.

Fixes: QTBUG-127646
Change-Id: I8038bb6bb4557e8ce29e3844f2742a97b4489818
Reviewed-by: Oliver Wolff <oliver.wolff@qt.io>
(cherry picked from commit 572aa7caa04e85cbc07b18e1d0c720038facbf83)
Reviewed-by: Qt Cherry-pick Bot <cherrypick_bot@qt-project.org>
This commit is contained in:
Allan Sandfeld Jensen 2024-08-07 14:33:51 +02:00 committed by Qt Cherry-pick Bot
parent 121639c53e
commit 9017e95858
11 changed files with 86 additions and 27 deletions

View File

@ -239,7 +239,7 @@ const char msg2[] = "==Qt=magic=Qt== Sub-architecture:"
#endif
// -- ARM --
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#if defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(_M_ARM64)
" neon"
#endif
#ifdef __IWMMXT__

View File

@ -151,7 +151,7 @@ static inline quint64 detectProcessorFeatures()
features |= CpuFeatureAES;
return features;
#endif
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#if defined(__ARM_NEON__)
features |= CpuFeatureNEON;
#endif
#if defined(__ARM_FEATURE_CRC32)

View File

@ -34,7 +34,7 @@
#define QT_COMPILER_USES(feature) (1/QT_COMPILER_USES_##feature == 1)
#if defined(Q_PROCESSOR_ARM) && defined(__ARM_NEON) || defined(__ARM_NEON__)
#if defined(Q_PROCESSOR_ARM) && defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(_M_ARM64)
# include <arm_neon.h>
# define QT_COMPILER_USES_neon 1
#else

View File

@ -257,7 +257,7 @@ static_assert(ARCH_SKX_MACROS, "Undeclared identifiers indicate which features a
// NEON intrinsics
// note: as of GCC 4.9, does not support function targets for ARM
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#if defined(__ARM_NEON) || defined(__ARM_NEON__) || defined(_M_ARM64)
#if defined(Q_CC_CLANG)
#define QT_FUNCTION_TARGET_STRING_NEON "neon"
#else

View File

@ -749,7 +749,11 @@ const char16_t *QtPrivate::qustrchr(QStringView str, char16_t c) noexcept
[=](qsizetype i) { return n + i; });
# endif
#elif defined(__ARM_NEON__)
#ifdef _MSC_VER
const uint16x8_t vmask = { 0x0008000400020001ULL, 0x0080004000200010ULL };
#else
const uint16x8_t vmask = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 };
#endif
const uint16x8_t ch_vec = vdupq_n_u16(c);
for (const char16_t *next = n + 8; next <= e; n = next, next += 8) {
uint16x8_t data = vld1q_u16(reinterpret_cast<const uint16_t *>(n));
@ -1290,7 +1294,11 @@ static int ucstrncmp(const char16_t *a, const char16_t *b, size_t l)
# elif defined(__ARM_NEON__)
if (l >= 8) {
const char16_t *end = a + l;
#ifdef _MSC_VER
const uint16x8_t mask = { 0x0008000400020001ULL, 0x0080004000200010ULL };
#else
const uint16x8_t mask = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 };
#endif
while (end - a > 7) {
uint16x8_t da = vld1q_u16(reinterpret_cast<const uint16_t *>(a));
uint16x8_t db = vld1q_u16(reinterpret_cast<const uint16_t *>(b));

View File

@ -351,7 +351,12 @@ static void simdCompareAscii(const qchar8_t *&src8, const qchar8_t *end8, const
static inline bool simdEncodeAscii(uchar *&dst, const char16_t *&nextAscii, const char16_t *&src, const char16_t *end)
{
uint16x8_t maxAscii = vdupq_n_u16(0x7f);
uint16x8_t mask1 = { 1, 1 << 2, 1 << 4, 1 << 6, 1 << 8, 1 << 10, 1 << 12, 1 << 14 };
#ifdef _MSC_VER
uint16_t mask1t[8] = { 1, 1 << 2, 1 << 4, 1 << 6, 1 << 8, 1 << 10, 1 << 12, 1 << 14 };
uint16x8_t mask1 = vld1q_u16(mask1t);
#else
uint16x8_t mask1 = { 1, 1 << 2, 1 << 4, 1 << 6, 1 << 8, 1 << 10, 1 << 12, 1 << 14 };
#endif
uint16x8_t mask2 = vshlq_n_u16(mask1, 1);
// do sixteen characters at a time
@ -389,7 +394,12 @@ static inline bool simdDecodeAscii(char16_t *&dst, const uchar *&nextAscii, cons
{
// do eight characters at a time
uint8x8_t msb_mask = vdup_n_u8(0x80);
#ifdef _MSC_VER
uint8_t add_maskt[8] = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 };
uint8x8_t add_mask = vld1_u8(add_maskt);
#else
uint8x8_t add_mask = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 };
#endif
for ( ; end - src >= 8; src += 8, dst += 8) {
uint8x8_t c = vld1_u8(src);
uint8_t n = vaddv_u8(vand_u8(vcge_u8(c, msb_mask), add_mask));
@ -425,7 +435,12 @@ static inline const uchar *simdFindNonAscii(const uchar *src, const uchar *end,
// do eight characters at a time
uint8x8_t msb_mask = vdup_n_u8(0x80);
#ifdef _MSC_VER
uint8_t add_maskt[8] = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 };
uint8x8_t add_mask = vld1_u8(add_maskt);
#else
uint8x8_t add_mask = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 };
#endif
for ( ; end - src >= 8; src += 8) {
uint8x8_t c = vld1_u8(src);
uint8_t n = vaddv_u8(vand_u8(vcge_u8(c, msb_mask), add_mask));

View File

@ -662,18 +662,27 @@ static inline bool test_all_zero(uint32x4_t p)
#endif
}
static inline uint32x4_t vsetq_u32(uint32_t a, uint32_t b, uint32_t c, uint32_t d)
{
#ifdef _MSC_VER
return uint32x4_t{ (uint64_t(b) << 32) | a, (uint64_t(d) << 32) | c };
#else
return uint32x4_t{ a, b, c, d };
#endif
}
template<typename T>
static void loadPremultiplied(QColorVector *buffer, const T *src, const qsizetype len, const QColorTransformPrivate *d_ptr)
{
constexpr bool isARGB = isArgb<T>();
const float iFF00 = 1.0f / (255 * 256);
const uint32x4_t vRangeMax = {
isARGB ? d_ptr->colorSpaceIn->lut[2]->m_unclampedToLinear
: d_ptr->colorSpaceIn->lut[0]->m_unclampedToLinear,
d_ptr->colorSpaceIn->lut[1]->m_unclampedToLinear,
isARGB ? d_ptr->colorSpaceIn->lut[0]->m_unclampedToLinear
: d_ptr->colorSpaceIn->lut[2]->m_unclampedToLinear,
QColorTrcLut::Resolution };
const uint32x4_t vRangeMax = vsetq_u32(
isARGB ? d_ptr->colorSpaceIn->lut[2]->m_unclampedToLinear
: d_ptr->colorSpaceIn->lut[0]->m_unclampedToLinear,
d_ptr->colorSpaceIn->lut[1]->m_unclampedToLinear,
isARGB ? d_ptr->colorSpaceIn->lut[0]->m_unclampedToLinear
: d_ptr->colorSpaceIn->lut[2]->m_unclampedToLinear,
QColorTrcLut::Resolution);
for (qsizetype i = 0; i < len; ++i) {
uint32x4_t v;
loadP<T>(src[i], v);
@ -740,13 +749,13 @@ void loadUnpremultiplied(QColorVector *buffer, const T *src, const qsizetype len
{
constexpr bool isARGB = isArgb<T>();
const float iFF00 = 1.0f / (255 * 256);
const uint32x4_t vRangeMax = {
isARGB ? d_ptr->colorSpaceIn->lut[2]->m_unclampedToLinear
: d_ptr->colorSpaceIn->lut[0]->m_unclampedToLinear,
d_ptr->colorSpaceIn->lut[1]->m_unclampedToLinear,
isARGB ? d_ptr->colorSpaceIn->lut[0]->m_unclampedToLinear
: d_ptr->colorSpaceIn->lut[2]->m_unclampedToLinear,
QColorTrcLut::Resolution };
const uint32x4_t vRangeMax = vsetq_u32(
isARGB ? d_ptr->colorSpaceIn->lut[2]->m_unclampedToLinear
: d_ptr->colorSpaceIn->lut[0]->m_unclampedToLinear,
d_ptr->colorSpaceIn->lut[1]->m_unclampedToLinear,
isARGB ? d_ptr->colorSpaceIn->lut[0]->m_unclampedToLinear
: d_ptr->colorSpaceIn->lut[2]->m_unclampedToLinear,
QColorTrcLut::Resolution);
for (qsizetype i = 0; i < len; ++i) {
uint32x4_t v;
loadPU<T>(src[i], v);

View File

@ -18,13 +18,14 @@
#include <QtGui/private/qtguiglobal_p.h>
#include <QtGui/qrgb.h>
#include <QtGui/qrgba64.h>
#include <QtCore/private/qsimd_p.h>
#include <cmath>
#include <memory>
#if defined(__SSE2__)
#include <emmintrin.h>
#elif defined(__ARM_NEON__) || defined(__ARM_NEON)
#elif defined(__ARM_NEON__)
#include <arm_neon.h>
#endif
@ -75,7 +76,7 @@ public:
QRgba64 rgba64;
_mm_storel_epi64(reinterpret_cast<__m128i *>(&rgba64), v);
return rgba64;
#elif (defined(__ARM_NEON__) || defined(__ARM_NEON)) && Q_BYTE_ORDER == Q_LITTLE_ENDIAN
#elif defined(__ARM_NEON__) && Q_BYTE_ORDER == Q_LITTLE_ENDIAN
uint8x8_t v8 = vreinterpret_u8_u32(vmov_n_u32(rgb32));
uint16x4_t v16 = vget_low_u16(vmovl_u8(v8));
const uint16x4_t vidx = vshl_n_u16(v16, ShiftUp);
@ -145,7 +146,7 @@ public:
v = _mm_srli_epi16(v, 8);
v = _mm_packus_epi16(v, v);
return _mm_cvtsi128_si32(v);
#elif (defined(__ARM_NEON__) || defined(__ARM_NEON)) && Q_BYTE_ORDER == Q_LITTLE_ENDIAN
#elif defined(__ARM_NEON__) && Q_BYTE_ORDER == Q_LITTLE_ENDIAN
uint16x4_t v = vreinterpret_u16_u64(vmov_n_u64(rgb64));
v = vsub_u16(v, vshr_n_u16(v, 8));
const uint16x4_t vidx = vshr_n_u16(v, ShiftDown);
@ -236,7 +237,7 @@ private:
QRgba64 rgba64;
_mm_storel_epi64(reinterpret_cast<__m128i *>(&rgba64), v);
return rgba64;
#elif (defined(__ARM_NEON__) || defined(__ARM_NEON)) && Q_BYTE_ORDER == Q_LITTLE_ENDIAN
#elif defined(__ARM_NEON__) && Q_BYTE_ORDER == Q_LITTLE_ENDIAN
uint16x4_t v = vreinterpret_u16_u64(vmov_n_u64(rgb64));
v = vsub_u16(v, vshr_n_u16(v, 8));
const uint16x4_t vidx = vshr_n_u16(v, ShiftDown);

View File

@ -1008,8 +1008,18 @@ void qt_memrotate270_16_neon(const uchar *srcPixels, int w, int h,
class QSimdNeon
{
public:
typedef int32x4_t Int32x4;
typedef float32x4_t Float32x4;
struct Int32x4 {
Int32x4() = default;
Int32x4(int32x4_t v) : v(v) {}
int32x4_t v;
operator int32x4_t() const { return v; }
};
struct Float32x4 {
Float32x4() = default;
Float32x4(float32x4_t v) : v(v) {};
float32x4_t v;
operator float32x4_t() const { return v; }
};
union Vect_buffer_i { Int32x4 v; int i[4]; };
union Vect_buffer_f { Float32x4 v; float f[4]; };
@ -1059,7 +1069,9 @@ const uint * QT_FASTCALL qt_fetchUntransformed_888_neon(uint *buffer, const Oper
#if Q_BYTE_ORDER == Q_LITTLE_ENDIAN
static inline uint32x4_t vrgba2argb(uint32x4_t srcVector)
{
#if defined(Q_PROCESSOR_ARM_64)
#if defined(Q_PROCESSOR_ARM_64) && defined(_MSC_VER)
const uint8x16_t rgbaMask = { 0x0704050603000102ULL, 0x0F0C0D0E0B08090AULL };
#elif defined(Q_PROCESSOR_ARM_64)
const uint8x16_t rgbaMask = { 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15};
#else
const uint8x8_t rgbaMask = { 2, 1, 0, 3, 6, 5, 4, 7 };
@ -1079,7 +1091,11 @@ template<bool RGBA>
static inline void convertARGBToARGB32PM_neon(uint *buffer, const uint *src, int count)
{
int i = 0;
#ifdef _MSC_VER
const uint8x8_t shuffleMask = { 0x0707070703030303ULL };
#else
const uint8x8_t shuffleMask = { 3, 3, 3, 3, 7, 7, 7, 7};
#endif
const uint32x4_t blendMask = vdupq_n_u32(0xff000000);
for (; i < count - 3; i += 4) {
@ -1131,7 +1147,11 @@ static inline void convertARGB32ToRGBA64PM_neon(QRgba64 *buffer, const uint *src
if (count <= 0)
return;
#ifdef _MSC_VER
const uint8x8_t shuffleMask = { 0x0707070703030303ULL };
#else
const uint8x8_t shuffleMask = { 3, 3, 3, 3, 7, 7, 7, 7};
#endif
const uint64x2_t blendMask = vdupq_n_u64(Q_UINT64_C(0xffff000000000000));
int i = 0;

View File

@ -1087,7 +1087,9 @@ static inline void qConvertARGB32PMToRGBA64PM_neon(QRgba64 *buffer, const uint *
return;
const uint32x4_t amask = vdupq_n_u32(0xff000000);
#if defined(Q_PROCESSOR_ARM_64)
#if defined(Q_PROCESSOR_ARM_64) && defined(_MSC_VER)
const uint8x16_t rgbaMask = { 0x0704050603000102ULL, 0x0F0C0D0E0B08090AULL };
#elif defined(Q_PROCESSOR_ARM_64)
const uint8x16_t rgbaMask = { 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15};
#else
const uint8x8_t rgbaMask = { 2, 1, 0, 3, 6, 5, 4, 7 };

View File

@ -232,7 +232,11 @@ static inline uint toArgb32(QRgba64 rgba64)
#elif defined __ARM_NEON__
uint16x4_t v = vreinterpret_u16_u64(vld1_u64(reinterpret_cast<const uint64_t *>(&rgba64)));
#if Q_BYTE_ORDER == Q_LITTLE_ENDIAN
#ifdef _MSC_VER
const uint8x8_t shuffleMask = { 0x0706010003020504ULL };
#else
const uint8x8_t shuffleMask = { 4, 5, 2, 3, 0, 1, 6, 7 };
#endif
v = vreinterpret_u16_u8(vtbl1_u8(vreinterpret_u8_u16(v), shuffleMask));
#else
v = vext_u16(v, v, 3);