MDEV-25870 followup : pmull support on Windows ARM64
Casting the result of vmull_p64 is possible on MSVC, although with much more verbose code. The reason is the missing NEON types (no compiler support for 128-bit ints).

commit 8c6cbb3360
parent fe10645eb7
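
The workaround the message describes ends up as the poly_mul() helper in the
diff below: GCC/Clang expose the 128-bit carry-less product as a poly128_t
that can simply be cast, while MSVC has no 128-bit integer type, so the
product has to be built as a NEON vector and its low lane extracted. A minimal
standalone sketch of the same split (assuming <arm_neon.h> resolves to the
MSVC arm64 NEON header with the spellings used in the diff):

    #include <arm_neon.h>
    #include <stdint.h>

    /* Low 64 bits of the 64x64 carry-less product, mirroring poly_mul(). */
    static inline uint64_t clmul_lo64(uint64_t a, uint64_t b)
    {
    #if defined _MSC_VER && !defined __clang__
      /* MSVC: no poly128_t, so multiply via NEON lanes and extract lane 0. */
      return vgetq_lane_u64(
          vreinterpretq_u64_p128(neon_pmull_64(vcreate_p64(a), vcreate_p64(b))),
          0);
    #else
      /* GCC/Clang: the poly128_t result can be cast directly. */
      return (uint64_t) vmull_p64(a, b);
    #endif
    }
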
mysys/CMakeLists.txt
@@ -68,12 +68,12 @@ IF(MSVC_INTEL)
   ENDIF()
 ELSEIF(MSVC_ARM64)
   SET (MYSYS_SOURCES ${MYSYS_SOURCES} crc32/crc32_arm64.c)
-  ADD_DEFINITIONS(-DHAVE_ARMV8_CRC -DHAVE_ARMV8_CRC_CRYPTO_INTRINSICS)
+  ADD_DEFINITIONS(-DHAVE_ARMV8_CRC -DHAVE_ARMV8_CRC_CRYPTO_INTRINSICS -DHAVE_ARMV8_CRYPTO)
   IF(CLANG_CL)
     SET_SOURCE_FILES_PROPERTIES(
       crc32/crc32_arm64.c
       PROPERTIES
-      COMPILE_FLAGS "-march=armv8-a+crc"
+      COMPILE_FLAGS "-march=armv8-a+crc+crypto"
     )
   ENDIF()
 ELSEIF(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64|i386|i686")

mysys/crc32/crc32_arm64.c
@@ -29,7 +29,9 @@ my_crc32_t crc32c_aarch64_available(void)
 {
   if (crc32_aarch64_available() == 0)
     return NULL;
-  /* TODO : pmull seems supported, but does not compile*/
+
+  if (IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE))
+    return crc32c_aarch64_pmull;
   return crc32c_aarch64;
 }
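
A note on the hunk above: IsProcessorFeaturePresent() is the documented Win32
way to probe CPU features on Windows ARM64, and
PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE covers the ARMv8 crypto extension that
includes PMULL. A hedged sketch of how a caller might consume the selector
(my_crc32_t and crc32c_aarch64_available() are from the diff; the caching
wrapper and software_crc32c() fallback are illustrative):

    #include <stddef.h>

    typedef unsigned (*my_crc32_t)(unsigned crc, const void *buf, size_t len);
    my_crc32_t crc32c_aarch64_available(void);  /* NULL if no HW CRC32 */
    unsigned software_crc32c(unsigned crc, const void *buf, size_t len); /* hypothetical */

    static unsigned my_crc32c(unsigned crc, const void *buf, size_t len)
    {
      static my_crc32_t impl;            /* resolved once, on first use */
      if (!impl && !(impl= crc32c_aarch64_available()))
        return software_crc32c(crc, buf, len);  /* no HW support at all */
      return impl(crc, buf, len);
    }
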
@@ -181,11 +183,19 @@ asm(".arch_extension crypto");
 CRC32C3X8(buffer, ((ITR) * 7 + 6)) \
 } while(0)
 
+#if defined _MSC_VER && !defined __clang__
+#define PREF4X64L1(buffer, offset, itr)\
+__prefetch(buffer + (offset) + ((itr) + 0)*64);\
+__prefetch(buffer + (offset) + ((itr) + 1)*64);\
+__prefetch(buffer + (offset) + ((itr) + 2)*64);\
+__prefetch(buffer + (offset) + ((itr) + 3)*64);
+#else
 #define PREF4X64L1(buffer, PREF_OFFSET, ITR) \
 __asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 0)*64));\
 __asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 1)*64));\
 __asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 2)*64));\
 __asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 3)*64));
+#endif
 
 #define PREF1KL1(buffer, PREF_OFFSET) \
 PREF4X64L1(buffer,(PREF_OFFSET), 0) \
@@ -193,11 +203,20 @@ asm(".arch_extension crypto");
 PREF4X64L1(buffer,(PREF_OFFSET), 8) \
 PREF4X64L1(buffer,(PREF_OFFSET), 12)
 
+#if defined _MSC_VER && !defined __clang__
+#define MY_PLDL2KEEP 2 /* PLDL2KEEP is 2 in ARMv8 */
+#define PREF4X64L2(buffer,offset,itr)\
+__prefetch2(buffer + offset + ((itr) + 0) * 64, MY_PLDL2KEEP);\
+__prefetch2(buffer + offset + ((itr) + 1) * 64, MY_PLDL2KEEP);\
+__prefetch2(buffer + offset + ((itr) + 2) * 64, MY_PLDL2KEEP);\
+__prefetch2(buffer + offset + ((itr) + 3) * 64, MY_PLDL2KEEP);
+#else
 #define PREF4X64L2(buffer, PREF_OFFSET, ITR) \
 __asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 0)*64));\
 __asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 1)*64));\
 __asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 2)*64));\
 __asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 3)*64));
+#endif
 
 #define PREF1KL2(buffer, PREF_OFFSET) \
 PREF4X64L2(buffer,(PREF_OFFSET), 0) \
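
The two hunks above swap the GCC inline-asm PRFM sequences for MSVC's
__prefetch()/__prefetch2() intrinsics. __prefetch2() takes the prfm operation
as an immediate second argument, which is why the diff defines MY_PLDL2KEEP as
2 (the ARMv8 encoding of PLDL2KEEP). A minimal sketch of the pattern, assuming
<intrin.h> declares both intrinsics on an MSVC ARM64 build:

    #include <intrin.h>

    #define MY_PLDL2KEEP 2  /* prfm operation immediate for PLDL2KEEP */

    /* Pull one line toward L1 and a farther one toward L2; the 1024-byte
       distance is illustrative, matching the 1KB blocks the CRC loop eats. */
    static inline void prefetch_pair(const unsigned char *p)
    {
      __prefetch(p);                        /* PRFM PLDL1KEEP, [p]      */
      __prefetch2(p + 1024, MY_PLDL2KEEP);  /* PRFM PLDL2KEEP, [p+1024] */
    }
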
@@ -240,6 +259,16 @@ static unsigned crc32c_aarch64(unsigned crc, const void *buf, size_t len)
 #endif
 
 #ifdef HAVE_ARMV8_CRYPTO
+
+static inline uint64_t poly_mul(uint64_t a, uint64_t b)
+{
+#if defined _MSC_VER && !defined __clang__
+  return vgetq_lane_u64(vreinterpretq_u64_p128(neon_pmull_64(vcreate_p64(a), vcreate_p64(b))),0);
+#else
+  return (uint64_t) vmull_p64(a, b);
+#endif
+}
+
 static unsigned crc32c_aarch64_pmull(unsigned crc, const void *buf, size_t len)
 {
   int64_t length= (int64_t)len;
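
For readers unfamiliar with PMULL: poly_mul() multiplies polynomials over
GF(2), a multiply whose internal additions are XORs, which is what the CRC
folding below needs. A portable reference for the low 64 bits it returns
(illustrative only, not part of the diff):

    #include <stdint.h>

    static uint64_t clmul_lo_ref(uint64_t a, uint64_t b)
    {
      uint64_t r= 0;
      for (int i= 0; i < 64; i++)
        if ((b >> i) & 1)
          r^= a << i;   /* shifted "adds" become XORs: no carries */
      /* e.g. clmul_lo_ref(0x3, 0x3) == 0x5: (x+1)^2 = x^2 + 1 over GF(2) */
      return r;
    }
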
@@ -286,8 +315,8 @@ static unsigned crc32c_aarch64_pmull(unsigned crc, const void *buf, size_t len)
  * crc1 multiply by K2
  * crc0 multiply by K1
  */
-  t1= (uint64_t)vmull_p64(crc1, k2);
-  t0= (uint64_t)vmull_p64(crc0, k1);
+  t1= poly_mul(crc1, k2);
+  t0= poly_mul(crc0, k1);
   crc= __crc32cd(crc2, *(const uint64_t *)buffer);
   crc1= __crc32cd(0, t1);
   crc^= crc1;