From ad31a90319cf9d10bd9aadb7e6a18c8cbdc23d7e Mon Sep 17 00:00:00 2001 From: Thiago Macieira Date: Thu, 3 Oct 2024 21:06:36 -0700 Subject: [PATCH] QUtf8: simplify the AVX2 code in simdDecodeAscii Instead of having this extra code to deal with BitSpacing because we were getting the bit mask from the UTF-16 content, we can get it from the UTF-8 side. This means the compilers must emit one extra instruction, but the number of uops to be executed is roughly the same. See the LLVM-MCA analysis: https://analysis.godbolt.org/z/TP48jM9Tq Change-Id: I73641d6b9443d8216eadfffd71515d1f33b3d833 Reviewed-by: Allan Sandfeld Jensen --- src/corelib/text/qstringconverter.cpp | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/src/corelib/text/qstringconverter.cpp b/src/corelib/text/qstringconverter.cpp index 709fe75c24c..02021cb583e 100644 --- a/src/corelib/text/qstringconverter.cpp +++ b/src/corelib/text/qstringconverter.cpp @@ -136,23 +136,19 @@ static inline bool simdDecodeAscii(char16_t *&dst, const uchar *&nextAscii, cons for ( ; end - src >= 16; src += 16, dst += 16) { __m128i data = _mm_loadu_si128((const __m128i*)src); + // check if everything is ASCII + // movemask extracts the high bit of every byte, so n is non-zero if something isn't ASCII + uint n = _mm_movemask_epi8(data); + #ifdef __AVX2__ - const int BitSpacing = 2; // load and zero extend to an YMM register const __m256i extended = _mm256_cvtepu8_epi16(data); - - uint n = _mm256_movemask_epi8(extended); if (!n) { // store _mm256_storeu_si256((__m256i*)dst, extended); continue; } #else - const int BitSpacing = 1; - - // check if everything is ASCII - // movemask extracts the high bit of every byte, so n is non-zero if something isn't ASCII - uint n = _mm_movemask_epi8(data); if (!n) { // unpack _mm_storeu_si128((__m128i*)dst, _mm_unpacklo_epi8(data, _mm_setzero_si128())); @@ -164,14 +160,14 @@ static inline bool simdDecodeAscii(char16_t *&dst, const uchar *&nextAscii, cons // copy the front part that is still ASCII while (!(n & 1)) { *dst++ = *src++; - n >>= BitSpacing; + n >>= 1; } // find the next probable ASCII character // we don't want to load 16 bytes again in this loop if we know there are non-ASCII // characters still coming n = qBitScanReverse(n); - nextAscii = src + (n / BitSpacing) + 1; + nextAscii = src + n + 1; return false; }