Armv8 CRC32 optimization (#772)
ARMv8 (AArch64) CPUs implement the CRC32 extension which is implemented by inline assembly , so they can also benefit from hardware acceleration in IO-intensive workloads. The patch optimizes crc32c calculate with the armv8 crypto instruction(Intrinsics) when available rather than original linear crc instructions. Change-Id: I05d36a64c726d910c47befad93390108f4e6567f Signed-off-by: Yuqi Gu <yuqi.gu@arm.com>
This commit is contained in:
parent
a74b01ea0e
commit
0928596a8b
@ -1,3 +1,32 @@
|
||||
IF(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64")
|
||||
include(CheckCXXSourceCompiles)
|
||||
|
||||
CHECK_CXX_SOURCE_COMPILES("
|
||||
#define CRC32CX(crc, value) __asm__(\"crc32cx %w[c], %w[c], %x[v]\":[c]\"+r\"(crc):[v]\"r\"(value))
|
||||
asm(\".arch_extension crc\");
|
||||
unsigned int foo(unsigned int ret) {
|
||||
CRC32CX(ret, 0);
|
||||
return ret;
|
||||
}
|
||||
int main() { foo(0); }" HAVE_ARMV8_CRC)
|
||||
|
||||
CHECK_CXX_SOURCE_COMPILES("
|
||||
asm(\".arch_extension crypto\");
|
||||
unsigned int foo(unsigned int ret) {
|
||||
__asm__(\"pmull v2.1q, v2.1d, v1.1d\");
|
||||
return ret;
|
||||
}
|
||||
int main() { foo(0); }" HAVE_ARMV8_CRYPTO)
|
||||
|
||||
CHECK_C_COMPILER_FLAG(-march=armv8-a+crc+crypto HAVE_ARMV8_CRC_CRYPTO_INTRINSICS)
|
||||
IF(HAVE_ARMV8_CRC_CRYPTO_INTRINSICS)
|
||||
SET(ARMV8_CRC_COMPILE_FLAGS "${ARMV8_CRC_COMPILE_FLAGS} -march=armv8-a+crc+crypto")
|
||||
ENDIF()
|
||||
|
||||
SET(CRC32_LIBRARY crc32_armv8_neon)
|
||||
ADD_SUBDIRECTORY(extra/crc32_armv8_neon)
|
||||
ENDIF()
|
||||
|
||||
IF(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64")
|
||||
SET(HAVE_CRC32_VPMSUM 1)
|
||||
SET(CRC32_LIBRARY crc32-vpmsum)
|
||||
|
@ -107,6 +107,11 @@
|
||||
#cmakedefine HAVE_SYSTEMD 1
|
||||
#cmakedefine HAVE_CRC32_VPMSUM 1
|
||||
|
||||
/* Support ARMv8 crc + crypto */
|
||||
#cmakedefine HAVE_ARMV8_CRC 1
|
||||
#cmakedefine HAVE_ARMV8_CRYPTO 1
|
||||
#cmakedefine HAVE_ARMV8_CRC_CRYPTO_INTRINSICS 1
|
||||
|
||||
/* Does "struct timespec" have a "sec" and "nsec" field? */
|
||||
#cmakedefine HAVE_TIMESPEC_TS_SEC 1
|
||||
|
||||
|
8
extra/crc32_armv8_neon/CMakeLists.txt
Normal file
8
extra/crc32_armv8_neon/CMakeLists.txt
Normal file
@ -0,0 +1,8 @@
|
||||
INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR}/include)
|
||||
INCLUDE_DIRECTORIES(${CMAKE_BINARY_DIR}/include)
|
||||
|
||||
ADD_CONVENIENCE_LIBRARY(${CRC32_LIBRARY} $<TARGET_OBJECTS:common_crc32c_armv8>)
|
||||
ADD_LIBRARY(common_crc32c_armv8 OBJECT crc32_armv8.c)
|
||||
|
||||
SET_TARGET_PROPERTIES(common_crc32c_armv8 PROPERTIES COMPILE_FLAGS "${ARMV8_CRC_COMPILE_FLAGS}")
|
||||
|
301
extra/crc32_armv8_neon/crc32_armv8.c
Normal file
301
extra/crc32_armv8_neon/crc32_armv8.c
Normal file
@ -0,0 +1,301 @@
|
||||
#include <my_global.h>
|
||||
#include <string.h>
|
||||
|
||||
|
||||
#if defined(__GNUC__) && defined(__linux__) && defined(HAVE_ARMV8_CRC)
|
||||
|
||||
#include <sys/auxv.h>
|
||||
#include <asm/hwcap.h>
|
||||
|
||||
#ifndef HWCAP_CRC32
|
||||
#define HWCAP_CRC32 (1 << 7)
|
||||
#endif
|
||||
|
||||
unsigned int crc32c_aarch64_available(void)
|
||||
{
|
||||
unsigned long auxv = getauxval(AT_HWCAP);
|
||||
return (auxv & HWCAP_CRC32) != 0;
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
#ifndef HAVE_ARMV8_CRC_CRYPTO_INTRINSICS
|
||||
|
||||
/* Request crc extension capabilities from the assembler */
|
||||
asm(".arch_extension crc");
|
||||
|
||||
#ifdef HAVE_ARMV8_CRYPTO
|
||||
/* crypto extension */
|
||||
asm(".arch_extension crypto");
|
||||
#endif
|
||||
|
||||
#define CRC32CX(crc, value) __asm__("crc32cx %w[c], %w[c], %x[v]":[c]"+r"(crc):[v]"r"(value))
|
||||
#define CRC32CW(crc, value) __asm__("crc32cw %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
|
||||
#define CRC32CH(crc, value) __asm__("crc32ch %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
|
||||
#define CRC32CB(crc, value) __asm__("crc32cb %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
|
||||
|
||||
#define CRC32C3X8(buffer, ITR) \
|
||||
__asm__("crc32cx %w[c1], %w[c1], %x[v]":[c1]"+r"(crc1):[v]"r"(*((const uint64_t *)buffer + 42*1 + (ITR))));\
|
||||
__asm__("crc32cx %w[c2], %w[c2], %x[v]":[c2]"+r"(crc2):[v]"r"(*((const uint64_t *)buffer + 42*2 + (ITR))));\
|
||||
__asm__("crc32cx %w[c0], %w[c0], %x[v]":[c0]"+r"(crc0):[v]"r"(*((const uint64_t *)buffer + 42*0 + (ITR))));
|
||||
|
||||
#define CRC32C3X8_ZERO \
|
||||
__asm__("crc32cx %w[c0], %w[c0], xzr":[c0]"+r"(crc0));
|
||||
|
||||
#else /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
|
||||
|
||||
/* Intrinsics header*/
|
||||
#include <arm_acle.h>
|
||||
#include <arm_neon.h>
|
||||
|
||||
#define CRC32CX(crc, value) (crc) = __crc32cd((crc), (value))
|
||||
#define CRC32CW(crc, value) (crc) = __crc32cw((crc), (value))
|
||||
#define CRC32CH(crc, value) (crc) = __crc32ch((crc), (value))
|
||||
#define CRC32CB(crc, value) (crc) = __crc32cb((crc), (value))
|
||||
|
||||
#define CRC32C3X8(buffer, ITR) \
|
||||
crc1 = __crc32cd(crc1, *((const uint64_t *)buffer + 42*1 + (ITR)));\
|
||||
crc2 = __crc32cd(crc2, *((const uint64_t *)buffer + 42*2 + (ITR)));\
|
||||
crc0 = __crc32cd(crc0, *((const uint64_t *)buffer + 42*0 + (ITR)));
|
||||
|
||||
#define CRC32C3X8_ZERO \
|
||||
crc0 = __crc32cd(crc0, (const uint64_t)0);
|
||||
|
||||
#endif /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
|
||||
|
||||
#define CRC32C7X3X8(buffer, ITR) do {\
|
||||
CRC32C3X8(buffer, ((ITR) * 7 + 0)) \
|
||||
CRC32C3X8(buffer, ((ITR) * 7 + 1)) \
|
||||
CRC32C3X8(buffer, ((ITR) * 7 + 2)) \
|
||||
CRC32C3X8(buffer, ((ITR) * 7 + 3)) \
|
||||
CRC32C3X8(buffer, ((ITR) * 7 + 4)) \
|
||||
CRC32C3X8(buffer, ((ITR) * 7 + 5)) \
|
||||
CRC32C3X8(buffer, ((ITR) * 7 + 6)) \
|
||||
} while(0)
|
||||
|
||||
#define CRC32C7X3X8_ZERO do {\
|
||||
CRC32C3X8_ZERO \
|
||||
CRC32C3X8_ZERO \
|
||||
CRC32C3X8_ZERO \
|
||||
CRC32C3X8_ZERO \
|
||||
CRC32C3X8_ZERO \
|
||||
CRC32C3X8_ZERO \
|
||||
CRC32C3X8_ZERO \
|
||||
} while(0)
|
||||
|
||||
#define PREF4X64L1(buffer, PREF_OFFSET, ITR) \
|
||||
__asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 0)*64));\
|
||||
__asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 1)*64));\
|
||||
__asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 2)*64));\
|
||||
__asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 3)*64));
|
||||
|
||||
#define PREF1KL1(buffer, PREF_OFFSET) \
|
||||
PREF4X64L1(buffer,(PREF_OFFSET), 0) \
|
||||
PREF4X64L1(buffer,(PREF_OFFSET), 4) \
|
||||
PREF4X64L1(buffer,(PREF_OFFSET), 8) \
|
||||
PREF4X64L1(buffer,(PREF_OFFSET), 12)
|
||||
|
||||
#define PREF4X64L2(buffer, PREF_OFFSET, ITR) \
|
||||
__asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 0)*64));\
|
||||
__asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 1)*64));\
|
||||
__asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 2)*64));\
|
||||
__asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 3)*64));
|
||||
|
||||
#define PREF1KL2(buffer, PREF_OFFSET) \
|
||||
PREF4X64L2(buffer,(PREF_OFFSET), 0) \
|
||||
PREF4X64L2(buffer,(PREF_OFFSET), 4) \
|
||||
PREF4X64L2(buffer,(PREF_OFFSET), 8) \
|
||||
PREF4X64L2(buffer,(PREF_OFFSET), 12)
|
||||
|
||||
|
||||
uint32_t crc32c_aarch64(uint32_t crc, const unsigned char *buffer, uint64_t len)
|
||||
{
|
||||
uint32_t crc0, crc1, crc2;
|
||||
int64_t length = (int64_t)len;
|
||||
|
||||
crc = 0xFFFFFFFFU;
|
||||
|
||||
if (buffer) {
|
||||
|
||||
/* Crypto extension Support
|
||||
* Process 1024 Bytes (per block)
|
||||
*/
|
||||
#ifdef HAVE_ARMV8_CRYPTO
|
||||
|
||||
/* Intrinsics Support */
|
||||
#ifdef HAVE_ARMV8_CRC_CRYPTO_INTRINSICS
|
||||
const poly64_t k1 = 0xe417f38a, k2 = 0x8f158014;
|
||||
uint64_t t0, t1;
|
||||
|
||||
/* Process per block size of 1024 Bytes
|
||||
* A block size = 8 + 42*3*sizeof(uint64_t) + 8
|
||||
*/
|
||||
while ((length -= 1024) >= 0) {
|
||||
/* Prefetch 3*1024 data for avoiding L2 cache miss */
|
||||
PREF1KL2(buffer, 1024*3);
|
||||
/* Do first 8 bytes here for better pipelining */
|
||||
crc0 = __crc32cd(crc, *(const uint64_t *)buffer);
|
||||
crc1 = 0;
|
||||
crc2 = 0;
|
||||
buffer += sizeof(uint64_t);
|
||||
|
||||
/* Process block inline
|
||||
* Process crc0 last to avoid dependency with above
|
||||
*/
|
||||
CRC32C7X3X8(buffer, 0);
|
||||
CRC32C7X3X8(buffer, 1);
|
||||
CRC32C7X3X8(buffer, 2);
|
||||
CRC32C7X3X8(buffer, 3);
|
||||
CRC32C7X3X8(buffer, 4);
|
||||
CRC32C7X3X8(buffer, 5);
|
||||
|
||||
buffer += 42*3*sizeof(uint64_t);
|
||||
/* Prefetch data for following block to avoid L1 cache miss */
|
||||
PREF1KL1(buffer, 1024);
|
||||
|
||||
/* Last 8 bytes
|
||||
* Merge crc0 and crc1 into crc2
|
||||
* crc1 multiply by K2
|
||||
* crc0 multiply by K1
|
||||
*/
|
||||
t1 = (uint64_t)vmull_p64(crc1, k2);
|
||||
t0 = (uint64_t)vmull_p64(crc0, k1);
|
||||
crc = __crc32cd(crc2, *(const uint64_t *)buffer);
|
||||
crc1 = __crc32cd(0, t1);
|
||||
crc ^= crc1;
|
||||
crc0 = __crc32cd(0, t0);
|
||||
crc ^= crc0;
|
||||
|
||||
buffer += sizeof(uint64_t);
|
||||
}
|
||||
|
||||
#else /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
|
||||
|
||||
/*No intrinsics*/
|
||||
__asm__("mov x16, #0xf38a \n\t"
|
||||
"movk x16, #0xe417, lsl 16 \n\t"
|
||||
"mov v1.2d[0], x16 \n\t"
|
||||
"mov x16, #0x8014 \n\t"
|
||||
"movk x16, #0x8f15, lsl 16 \n\t"
|
||||
"mov v0.2d[0], x16 \n\t"
|
||||
:::"x16");
|
||||
|
||||
while ((length -= 1024) >= 0) {
|
||||
PREF1KL2(buffer, 1024*3);
|
||||
__asm__("crc32cx %w[c0], %w[c], %x[v]\n\t"
|
||||
:[c0]"=r"(crc0):[c]"r"(crc), [v]"r"(*(const uint64_t *)buffer):);
|
||||
crc1 = 0;
|
||||
crc2 = 0;
|
||||
buffer += sizeof(uint64_t);
|
||||
|
||||
CRC32C7X3X8(buffer, 0);
|
||||
CRC32C7X3X8(buffer, 1);
|
||||
CRC32C7X3X8(buffer, 2);
|
||||
CRC32C7X3X8(buffer, 3);
|
||||
CRC32C7X3X8(buffer, 4);
|
||||
CRC32C7X3X8(buffer, 5);
|
||||
|
||||
buffer += 42*3*sizeof(uint64_t);
|
||||
PREF1KL1(buffer, 1024);
|
||||
__asm__("mov v2.2d[0], %x[c1] \n\t"
|
||||
"pmull v2.1q, v2.1d, v0.1d \n\t"
|
||||
"mov v3.2d[0], %x[c0] \n\t"
|
||||
"pmull v3.1q, v3.1d, v1.1d \n\t"
|
||||
"crc32cx %w[c], %w[c2], %x[v] \n\t"
|
||||
"mov %x[c1], v2.2d[0] \n\t"
|
||||
"crc32cx %w[c1], wzr, %x[c1] \n\t"
|
||||
"eor %w[c], %w[c], %w[c1] \n\t"
|
||||
"mov %x[c0], v3.2d[0] \n\t"
|
||||
"crc32cx %w[c0], wzr, %x[c0] \n\t"
|
||||
"eor %w[c], %w[c], %w[c0] \n\t"
|
||||
:[c1]"+r"(crc1), [c0]"+r"(crc0), [c2]"+r"(crc2), [c]"+r"(crc)
|
||||
:[v]"r"(*((const uint64_t *)buffer)));
|
||||
buffer += sizeof(uint64_t);
|
||||
}
|
||||
#endif /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
|
||||
|
||||
/* Done if Input data size is aligned with 1024 */
|
||||
if(!(length += 1024))
|
||||
return (~crc);
|
||||
|
||||
#endif /* HAVE_ARMV8_CRYPTO */
|
||||
|
||||
while ((length -= sizeof(uint64_t)) >= 0) {
|
||||
CRC32CX(crc, *(uint64_t *)buffer);
|
||||
buffer += sizeof(uint64_t);
|
||||
}
|
||||
/* The following is more efficient than the straight loop */
|
||||
if (length & sizeof(uint32_t)) {
|
||||
CRC32CW(crc, *(uint32_t *)buffer);
|
||||
buffer += sizeof(uint32_t);
|
||||
}
|
||||
if (length & sizeof(uint16_t)) {
|
||||
CRC32CH(crc, *(uint16_t *)buffer);
|
||||
buffer += sizeof(uint16_t);
|
||||
}
|
||||
if (length & sizeof(uint8_t))
|
||||
CRC32CB(crc, *buffer);
|
||||
|
||||
} else {
|
||||
#ifdef HAVE_ARMV8_CRYPTO
|
||||
#ifdef HAVE_ARMV8_CRC_CRYPTO_INTRINSICS
|
||||
const poly64_t k1 = 0xe417f38a;
|
||||
uint64_t t0;
|
||||
while ((length -= 1024) >= 0) {
|
||||
crc0 = __crc32cd(crc, 0);
|
||||
|
||||
CRC32C7X3X8_ZERO;
|
||||
CRC32C7X3X8_ZERO;
|
||||
CRC32C7X3X8_ZERO;
|
||||
CRC32C7X3X8_ZERO;
|
||||
CRC32C7X3X8_ZERO;
|
||||
CRC32C7X3X8_ZERO;
|
||||
|
||||
/* Merge crc0 into crc: crc0 multiply by K1 */
|
||||
t0 = (uint64_t)vmull_p64(crc0, k1);
|
||||
crc = __crc32cd(0, t0);
|
||||
}
|
||||
#else /* !HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
|
||||
__asm__("mov x16, #0xf38a \n\t"
|
||||
"movk x16, #0xe417, lsl 16 \n\t"
|
||||
"mov v1.2d[0], x16 \n\t"
|
||||
:::"x16");
|
||||
|
||||
while ((length -= 1024) >= 0) {
|
||||
__asm__("crc32cx %w[c0], %w[c], xzr\n\t"
|
||||
:[c0]"=r"(crc0):[c]"r"(crc));
|
||||
|
||||
CRC32C7X3X8_ZERO;
|
||||
CRC32C7X3X8_ZERO;
|
||||
CRC32C7X3X8_ZERO;
|
||||
CRC32C7X3X8_ZERO;
|
||||
CRC32C7X3X8_ZERO;
|
||||
CRC32C7X3X8_ZERO;
|
||||
|
||||
__asm__("mov v3.2d[0], %x[c0] \n\t"
|
||||
"pmull v3.1q, v3.1d, v1.1d \n\t"
|
||||
"mov %x[c0], v3.2d[0] \n\t"
|
||||
"crc32cx %w[c], wzr, %x[c0] \n\t"
|
||||
:[c]"=r"(crc)
|
||||
:[c0]"r"(crc0));
|
||||
}
|
||||
#endif /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
|
||||
if(!(length += 1024))
|
||||
return (~crc);
|
||||
#endif /* HAVE_ARMV8_CRYPTO */
|
||||
while ((length -= sizeof(uint64_t)) >= 0)
|
||||
CRC32CX(crc, 0);
|
||||
|
||||
/* The following is more efficient than the straight loop */
|
||||
if (length & sizeof(uint32_t))
|
||||
CRC32CW(crc, 0);
|
||||
|
||||
if (length & sizeof(uint16_t))
|
||||
CRC32CH(crc, 0);
|
||||
|
||||
if (length & sizeof(uint8_t))
|
||||
CRC32CB(crc, 0);
|
||||
}
|
||||
|
||||
return (~crc);
|
||||
}
|
@ -131,6 +131,28 @@ ut_crc32_func_t ut_crc32 = ut_crc32_sw;
|
||||
const char* ut_crc32_implementation = "Using generic crc32 instructions";
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_ARMV8_CRC
|
||||
extern "C" {
|
||||
uint32_t crc32c_aarch64(uint32_t crc, const unsigned char *buffer, uint64_t len);
|
||||
};
|
||||
static inline
|
||||
uint32_t
|
||||
ut_crc32_armv8(
|
||||
const byte* buf,
|
||||
ulint len)
|
||||
{
|
||||
return crc32c_aarch64(0, buf, len);
|
||||
}
|
||||
#endif
|
||||
|
||||
/* For runtime check */
|
||||
#if defined(__GNUC__) && defined(__linux__) && defined(HAVE_ARMV8_CRC)
|
||||
extern "C" {
|
||||
unsigned int crc32c_aarch64_available(void);
|
||||
};
|
||||
#endif
|
||||
|
||||
|
||||
#if (defined(__GNUC__) && defined(__x86_64__)) || defined(_MSC_VER)
|
||||
/********************************************************************//**
|
||||
Fetches CPU info */
|
||||
@ -561,4 +583,14 @@ ut_crc32_init()
|
||||
ut_crc32_implementation = "Using SSE2 crc32 instructions";
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
#if defined(__GNUC__) && defined(__linux__) && defined(HAVE_ARMV8_CRC)
|
||||
if (crc32c_aarch64_available()) {
|
||||
ut_crc32 = ut_crc32_armv8;
|
||||
ut_crc32_implementation = "Using Armv8 crc32 instructions";
|
||||
|
||||
}
|
||||
#endif
|
||||
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user