diff --git a/cmake/crc32.cmake b/cmake/crc32.cmake
index ee8afdb0e92..e81f4d1c273 100644
--- a/cmake/crc32.cmake
+++ b/cmake/crc32.cmake
@@ -1,3 +1,33 @@
+IF(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|AARCH64")
+  include(CheckCXXSourceCompiles)
+  include(CheckCCompilerFlag)
+
+  CHECK_CXX_SOURCE_COMPILES("
+  #define CRC32CX(crc, value) __asm__(\"crc32cx %w[c], %w[c], %x[v]\":[c]\"+r\"(crc):[v]\"r\"(value))
+  asm(\".arch_extension crc\");
+  unsigned int foo(unsigned int ret) {
+    CRC32CX(ret, 0);
+    return ret;
+  }
+  int main() { foo(0); }" HAVE_ARMV8_CRC)
+
+  CHECK_CXX_SOURCE_COMPILES("
+  asm(\".arch_extension crypto\");
+  unsigned int foo(unsigned int ret) {
+    __asm__(\"pmull v2.1q, v2.1d, v1.1d\");
+    return ret;
+  }
+  int main() { foo(0); }" HAVE_ARMV8_CRYPTO)
+
+  CHECK_C_COMPILER_FLAG(-march=armv8-a+crc+crypto HAVE_ARMV8_CRC_CRYPTO_INTRINSICS)
+  IF(HAVE_ARMV8_CRC_CRYPTO_INTRINSICS)
+    SET(ARMV8_CRC_COMPILE_FLAGS "${ARMV8_CRC_COMPILE_FLAGS} -march=armv8-a+crc+crypto")
+  ENDIF()
+
+  SET(CRC32_LIBRARY crc32_armv8_neon)
+  ADD_SUBDIRECTORY(extra/crc32_armv8_neon)
+ENDIF()
+
 IF(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc64")
   SET(HAVE_CRC32_VPMSUM 1)
   SET(CRC32_LIBRARY crc32-vpmsum)
diff --git a/config.h.cmake b/config.h.cmake
index b1092fa5df7..101dd767f86 100644
--- a/config.h.cmake
+++ b/config.h.cmake
@@ -107,6 +107,11 @@
 #cmakedefine HAVE_SYSTEMD 1
 #cmakedefine HAVE_CRC32_VPMSUM 1
 
+/* Support ARMv8 crc + crypto */
+#cmakedefine HAVE_ARMV8_CRC 1
+#cmakedefine HAVE_ARMV8_CRYPTO 1
+#cmakedefine HAVE_ARMV8_CRC_CRYPTO_INTRINSICS 1
+
 /* Does "struct timespec" have a "sec" and "nsec" field? */
 #cmakedefine HAVE_TIMESPEC_TS_SEC 1
 
diff --git a/extra/crc32_armv8_neon/CMakeLists.txt b/extra/crc32_armv8_neon/CMakeLists.txt
new file mode 100644
index 00000000000..ba1d34d7c2e
--- /dev/null
+++ b/extra/crc32_armv8_neon/CMakeLists.txt
@@ -0,0 +1,8 @@
+INCLUDE_DIRECTORIES(${CMAKE_SOURCE_DIR}/include)
+INCLUDE_DIRECTORIES(${CMAKE_BINARY_DIR}/include)
+
+ADD_CONVENIENCE_LIBRARY(${CRC32_LIBRARY} $<TARGET_OBJECTS:common_crc32c_armv8>)
+ADD_LIBRARY(common_crc32c_armv8 OBJECT crc32_armv8.c)
+
+SET_TARGET_PROPERTIES(common_crc32c_armv8 PROPERTIES COMPILE_FLAGS "${ARMV8_CRC_COMPILE_FLAGS}")
+
diff --git a/extra/crc32_armv8_neon/crc32_armv8.c b/extra/crc32_armv8_neon/crc32_armv8.c
new file mode 100644
index 00000000000..20f341552e2
--- /dev/null
+++ b/extra/crc32_armv8_neon/crc32_armv8.c
@@ -0,0 +1,301 @@
+#include <my_config.h>
+#include <stdint.h>
+
+
+#if defined(__GNUC__) && defined(__linux__) && defined(HAVE_ARMV8_CRC)
+
+#include <sys/auxv.h>
+#include <asm/hwcap.h>
+
+#ifndef HWCAP_CRC32
+#define HWCAP_CRC32 (1 << 7)
+#endif
+
+unsigned int crc32c_aarch64_available(void)
+{
+	unsigned long auxv = getauxval(AT_HWCAP);
+	return (auxv & HWCAP_CRC32) != 0;
+}
+
+#endif
+
+#ifndef HAVE_ARMV8_CRC_CRYPTO_INTRINSICS
+
+/* Request crc extension capabilities from the assembler */
+asm(".arch_extension crc");
+
+#ifdef HAVE_ARMV8_CRYPTO
+/* crypto extension */
+asm(".arch_extension crypto");
+#endif
+
+#define CRC32CX(crc, value) __asm__("crc32cx %w[c], %w[c], %x[v]":[c]"+r"(crc):[v]"r"(value))
+#define CRC32CW(crc, value) __asm__("crc32cw %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
+#define CRC32CH(crc, value) __asm__("crc32ch %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
+#define CRC32CB(crc, value) __asm__("crc32cb %w[c], %w[c], %w[v]":[c]"+r"(crc):[v]"r"(value))
+
+#define CRC32C3X8(buffer, ITR) \
+	__asm__("crc32cx %w[c1], %w[c1], %x[v]":[c1]"+r"(crc1):[v]"r"(*((const uint64_t *)buffer + 42*1 + (ITR))));\
+	__asm__("crc32cx %w[c2], %w[c2], %x[v]":[c2]"+r"(crc2):[v]"r"(*((const uint64_t *)buffer + 42*2 + (ITR))));\
+	__asm__("crc32cx %w[c0], %w[c0], %x[v]":[c0]"+r"(crc0):[v]"r"(*((const uint64_t *)buffer + 42*0 + (ITR))));
+
+#define CRC32C3X8_ZERO \
+	__asm__("crc32cx %w[c0], %w[c0], xzr":[c0]"+r"(crc0));
+
+#else /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
+
+/* Intrinsics headers */
+#include <arm_acle.h>
+#include <arm_neon.h>
+
+#define CRC32CX(crc, value) (crc) = __crc32cd((crc), (value))
+#define CRC32CW(crc, value) (crc) = __crc32cw((crc), (value))
+#define CRC32CH(crc, value) (crc) = __crc32ch((crc), (value))
+#define CRC32CB(crc, value) (crc) = __crc32cb((crc), (value))
+
+#define CRC32C3X8(buffer, ITR) \
+	crc1 = __crc32cd(crc1, *((const uint64_t *)buffer + 42*1 + (ITR)));\
+	crc2 = __crc32cd(crc2, *((const uint64_t *)buffer + 42*2 + (ITR)));\
+	crc0 = __crc32cd(crc0, *((const uint64_t *)buffer + 42*0 + (ITR)));
+
+#define CRC32C3X8_ZERO \
+	crc0 = __crc32cd(crc0, (const uint64_t)0);
+
+#endif /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
+
+#define CRC32C7X3X8(buffer, ITR) do {\
+	CRC32C3X8(buffer, ((ITR) * 7 + 0)) \
+	CRC32C3X8(buffer, ((ITR) * 7 + 1)) \
+	CRC32C3X8(buffer, ((ITR) * 7 + 2)) \
+	CRC32C3X8(buffer, ((ITR) * 7 + 3)) \
+	CRC32C3X8(buffer, ((ITR) * 7 + 4)) \
+	CRC32C3X8(buffer, ((ITR) * 7 + 5)) \
+	CRC32C3X8(buffer, ((ITR) * 7 + 6)) \
+	} while(0)
+
+#define CRC32C7X3X8_ZERO do {\
+	CRC32C3X8_ZERO \
+	CRC32C3X8_ZERO \
+	CRC32C3X8_ZERO \
+	CRC32C3X8_ZERO \
+	CRC32C3X8_ZERO \
+	CRC32C3X8_ZERO \
+	CRC32C3X8_ZERO \
+	} while(0)
+
+#define PREF4X64L1(buffer, PREF_OFFSET, ITR) \
+	__asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 0)*64));\
+	__asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 1)*64));\
+	__asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 2)*64));\
+	__asm__("PRFM PLDL1KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 3)*64));
+
+#define PREF1KL1(buffer, PREF_OFFSET) \
+	PREF4X64L1(buffer,(PREF_OFFSET), 0) \
+	PREF4X64L1(buffer,(PREF_OFFSET), 4) \
+	PREF4X64L1(buffer,(PREF_OFFSET), 8) \
+	PREF4X64L1(buffer,(PREF_OFFSET), 12)
+
+#define PREF4X64L2(buffer, PREF_OFFSET, ITR) \
+	__asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 0)*64));\
+	__asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 1)*64));\
+	__asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 2)*64));\
+	__asm__("PRFM PLDL2KEEP, [%x[v],%[c]]"::[v]"r"(buffer), [c]"I"((PREF_OFFSET) + ((ITR) + 3)*64));
+
+#define PREF1KL2(buffer, PREF_OFFSET) \
+	PREF4X64L2(buffer,(PREF_OFFSET), 0) \
+	PREF4X64L2(buffer,(PREF_OFFSET), 4) \
+	PREF4X64L2(buffer,(PREF_OFFSET), 8) \
+	PREF4X64L2(buffer,(PREF_OFFSET), 12)
+
+
+uint32_t crc32c_aarch64(uint32_t crc, const unsigned char *buffer, uint64_t len)
+{
+	uint32_t crc0, crc1, crc2;
+	int64_t length = (int64_t)len;
+
+	crc = 0xFFFFFFFFU;
+
+	if (buffer) {
+
+/* Crypto extension support:
+ * process 1024 bytes per block
+ */
+#ifdef HAVE_ARMV8_CRYPTO
+
+/* Intrinsics support */
+#ifdef HAVE_ARMV8_CRC_CRYPTO_INTRINSICS
+		const poly64_t k1 = 0xe417f38a, k2 = 0x8f158014;
+		uint64_t t0, t1;
+
+		/* Process blocks of 1024 bytes:
+		 * a block is 8 + 42*3*sizeof(uint64_t) + 8 bytes
+		 */
+		while ((length -= 1024) >= 0) {
+			/* Prefetch 3*1024 bytes ahead to avoid L2 cache misses */
+			PREF1KL2(buffer, 1024*3);
+			/* Do the first 8 bytes here for better pipelining */
+			crc0 = __crc32cd(crc, *(const uint64_t *)buffer);
+			crc1 = 0;
+			crc2 = 0;
+			buffer += sizeof(uint64_t);
+
+			/* Process the block inline;
+			 * process crc0 last to avoid a dependency on the
+			 * computation above
+			 */
+			CRC32C7X3X8(buffer, 0);
+			CRC32C7X3X8(buffer, 1);
+			CRC32C7X3X8(buffer, 2);
+			CRC32C7X3X8(buffer, 3);
+			CRC32C7X3X8(buffer, 4);
+			CRC32C7X3X8(buffer, 5);
+
+			buffer += 42*3*sizeof(uint64_t);
+			/* Prefetch data for the following block to avoid L1 cache misses */
+			PREF1KL1(buffer, 1024);
+
+			/* Last 8 bytes:
+			 * merge crc0 and crc1 into crc2;
+			 * crc1 is multiplied by K2, crc0 by K1
+			 */
+			t1 = (uint64_t)vmull_p64(crc1, k2);
+			t0 = (uint64_t)vmull_p64(crc0, k1);
+			crc = __crc32cd(crc2, *(const uint64_t *)buffer);
+			crc1 = __crc32cd(0, t1);
+			crc ^= crc1;
+			crc0 = __crc32cd(0, t0);
+			crc ^= crc0;
+
+			buffer += sizeof(uint64_t);
+		}
+
+#else /* !HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
+
+		/* No intrinsics */
+		__asm__("mov x16, #0xf38a \n\t"
+			"movk x16, #0xe417, lsl 16 \n\t"
+			"mov v1.2d[0], x16 \n\t"
+			"mov x16, #0x8014 \n\t"
+			"movk x16, #0x8f15, lsl 16 \n\t"
+			"mov v0.2d[0], x16 \n\t"
+			:::"x16", "v0", "v1");
+
+		while ((length -= 1024) >= 0) {
+			PREF1KL2(buffer, 1024*3);
+			__asm__("crc32cx %w[c0], %w[c], %x[v]\n\t"
+				:[c0]"=r"(crc0):[c]"r"(crc), [v]"r"(*(const uint64_t *)buffer):);
+			crc1 = 0;
+			crc2 = 0;
+			buffer += sizeof(uint64_t);
+
+			CRC32C7X3X8(buffer, 0);
+			CRC32C7X3X8(buffer, 1);
+			CRC32C7X3X8(buffer, 2);
+			CRC32C7X3X8(buffer, 3);
+			CRC32C7X3X8(buffer, 4);
+			CRC32C7X3X8(buffer, 5);
+
+			buffer += 42*3*sizeof(uint64_t);
+			PREF1KL1(buffer, 1024);
+			__asm__("mov v2.2d[0], %x[c1] \n\t"
+				"pmull v2.1q, v2.1d, v0.1d \n\t"
+				"mov v3.2d[0], %x[c0] \n\t"
+				"pmull v3.1q, v3.1d, v1.1d \n\t"
+				"crc32cx %w[c], %w[c2], %x[v] \n\t"
+				"mov %x[c1], v2.2d[0] \n\t"
+				"crc32cx %w[c1], wzr, %x[c1] \n\t"
+				"eor %w[c], %w[c], %w[c1] \n\t"
+				"mov %x[c0], v3.2d[0] \n\t"
+				"crc32cx %w[c0], wzr, %x[c0] \n\t"
+				"eor %w[c], %w[c], %w[c0] \n\t"
+				:[c1]"+r"(crc1), [c0]"+r"(crc0), [c2]"+r"(crc2), [c]"+r"(crc)
+				:[v]"r"(*((const uint64_t *)buffer))
+				:"v2", "v3");
+			buffer += sizeof(uint64_t);
+		}
+#endif /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
+
+		/* Done if the input size is a multiple of 1024 */
+		if (!(length += 1024))
+			return (~crc);
+
+#endif /* HAVE_ARMV8_CRYPTO */
+
+		while ((length -= sizeof(uint64_t)) >= 0) {
+			CRC32CX(crc, *(uint64_t *)buffer);
+			buffer += sizeof(uint64_t);
+		}
+
+		/* The following is more efficient than the straight loop */
+		if (length & sizeof(uint32_t)) {
+			CRC32CW(crc, *(uint32_t *)buffer);
+			buffer += sizeof(uint32_t);
+		}
+		if (length & sizeof(uint16_t)) {
+			CRC32CH(crc, *(uint16_t *)buffer);
+			buffer += sizeof(uint16_t);
+		}
+		if (length & sizeof(uint8_t))
+			CRC32CB(crc, *buffer);
+
+	} else {
+#ifdef HAVE_ARMV8_CRYPTO
+#ifdef HAVE_ARMV8_CRC_CRYPTO_INTRINSICS
+		const poly64_t k1 = 0xe417f38a;
+		uint64_t t0;
+
+		while ((length -= 1024) >= 0) {
+			crc0 = __crc32cd(crc, 0);
+
+			CRC32C7X3X8_ZERO;
+			CRC32C7X3X8_ZERO;
+			CRC32C7X3X8_ZERO;
+			CRC32C7X3X8_ZERO;
+			CRC32C7X3X8_ZERO;
+			CRC32C7X3X8_ZERO;
+
+			/* Merge crc0 into crc: multiply crc0 by K1 */
+			t0 = (uint64_t)vmull_p64(crc0, k1);
+			crc = __crc32cd(0, t0);
+		}
+#else /* !HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
+		__asm__("mov x16, #0xf38a \n\t"
+			"movk x16, #0xe417, lsl 16 \n\t"
+			"mov v1.2d[0], x16 \n\t"
+			:::"x16", "v1");
+
+		while ((length -= 1024) >= 0) {
+			__asm__("crc32cx %w[c0], %w[c], xzr\n\t"
+				:[c0]"=r"(crc0):[c]"r"(crc));
+
+			CRC32C7X3X8_ZERO;
+			CRC32C7X3X8_ZERO;
+			CRC32C7X3X8_ZERO;
+			CRC32C7X3X8_ZERO;
+			CRC32C7X3X8_ZERO;
+			CRC32C7X3X8_ZERO;
+
+			__asm__("mov v3.2d[0], %x[c0] \n\t"
+				"pmull v3.1q, v3.1d, v1.1d \n\t"
+				"mov %x[c0], v3.2d[0] \n\t"
+				"crc32cx %w[c], wzr, %x[c0] \n\t"
+				:[c]"=r"(crc), [c0]"+r"(crc0)
+				::"v3");
+		}
+#endif /* HAVE_ARMV8_CRC_CRYPTO_INTRINSICS */
+
+		if (!(length += 1024))
+			return (~crc);
+#endif /* HAVE_ARMV8_CRYPTO */
+
+		while ((length -= sizeof(uint64_t)) >= 0)
+			CRC32CX(crc, 0);
+
+		/* The following is more efficient than the straight loop */
+		if (length & sizeof(uint32_t))
+			CRC32CW(crc, 0);
+
+		if (length & sizeof(uint16_t))
+			CRC32CH(crc, 0);
+
+		if (length & sizeof(uint8_t))
+			CRC32CB(crc, 0);
+	}
+
+	return (~crc);
+}
diff --git a/storage/innobase/ut/ut0crc32.cc b/storage/innobase/ut/ut0crc32.cc
index bbc519ad92b..0403131f274 100644
--- a/storage/innobase/ut/ut0crc32.cc
+++ b/storage/innobase/ut/ut0crc32.cc
@@ -131,6 +131,28 @@ ut_crc32_func_t ut_crc32 = ut_crc32_sw;
 const char* ut_crc32_implementation = "Using generic crc32 instructions";
 #endif
 
+#ifdef HAVE_ARMV8_CRC
+extern "C" {
+uint32_t crc32c_aarch64(uint32_t crc, const unsigned char *buffer, uint64_t len);
+};
+static inline
+uint32_t
+ut_crc32_armv8(
+	const byte*	buf,
+	ulint		len)
+{
+	return crc32c_aarch64(0, buf, len);
+}
+#endif
+
+/* For the runtime check */
+#if defined(__GNUC__) && defined(__linux__) && defined(HAVE_ARMV8_CRC)
+extern "C" {
+unsigned int crc32c_aarch64_available(void);
+};
+#endif
+
+
 #if (defined(__GNUC__) && defined(__x86_64__)) || defined(_MSC_VER)
 /********************************************************************//**
 Fetches CPU info */
@@ -561,4 +583,14 @@ ut_crc32_init()
 	ut_crc32_implementation = "Using SSE2 crc32 instructions";
 	}
 #endif
+
+
+#if defined(__GNUC__) && defined(__linux__) && defined(HAVE_ARMV8_CRC)
+	if (crc32c_aarch64_available()) {
+		ut_crc32 = ut_crc32_armv8;
+		ut_crc32_implementation = "Using Armv8 crc32 instructions";
+
+	}
+#endif
+
 }
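
For reference, the sketch below shows how a standalone caller would exercise the two entry points this patch exports. It is illustrative only and not part of the patch: it assumes an aarch64 Linux build with HAVE_ARMV8_CRC defined and linkage against the new crc32_armv8_neon library, and it mirrors the runtime dispatch performed by ut_crc32_init().

/* Illustrative sketch only -- not part of the patch. Assumes aarch64 Linux,
 * HAVE_ARMV8_CRC defined, and linkage against the crc32_armv8_neon library. */
#include <stdint.h>
#include <stdio.h>

/* Exported by extra/crc32_armv8_neon/crc32_armv8.c */
unsigned int crc32c_aarch64_available(void);
uint32_t crc32c_aarch64(uint32_t crc, const unsigned char *buffer, uint64_t len);

int main(void)
{
	static const unsigned char buf[] = "123456789";

	/* Mirror ut_crc32_init(): take the hardware path only when
	 * getauxval() reports the CRC32 extension. */
	if (!crc32c_aarch64_available()) {
		puts("CRC32 extension unavailable; a caller would fall back to ut_crc32_sw");
		return 0;
	}

	/* The initial-crc argument is ignored: the function seeds the CRC
	 * with 0xFFFFFFFF internally and returns the inverted result, so
	 * pass 0 exactly as ut_crc32_armv8() does. */
	printf("crc32c(\"123456789\") = 0x%08x\n",
	       (unsigned) crc32c_aarch64(0, buf, sizeof(buf) - 1));
	return 0;
}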