QUtf8: simplify the AVX2 code in simdDecodeAscii

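Instead of keeping the extra BitSpacing code, which was only needed
because the AVX2 path took the bit mask from the zero-extended UTF-16
content (two mask bits per character), we can take the mask from the
UTF-8 side, where every character corresponds to exactly one bit.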

This means the compilers must emit one extra instruction, but the number
of uops to be executed is roughly the same. See the LLVM-MCA analysis:
https://analysis.godbolt.org/z/TP48jM9Tq

Change-Id: I73641d6b9443d8216eadfffd71515d1f33b3d833
Reviewed-by: Allan Sandfeld Jensen <allan.jensen@qt.io>
This commit is contained in:
Thiago Macieira 2024-10-03 21:06:36 -07:00
parent 2fcb905fef
commit ad31a90319

View File

@@ -136,23 +136,19 @@ static inline bool simdDecodeAscii(char16_t *&dst, const uchar *&nextAscii, cons
for ( ; end - src >= 16; src += 16, dst += 16) { for ( ; end - src >= 16; src += 16, dst += 16) {
__m128i data = _mm_loadu_si128((const __m128i*)src); __m128i data = _mm_loadu_si128((const __m128i*)src);
// check if everything is ASCII
// movemask extracts the high bit of every byte, so n is non-zero if something isn't ASCII
uint n = _mm_movemask_epi8(data);
#ifdef __AVX2__ #ifdef __AVX2__
const int BitSpacing = 2;
// load and zero extend to an YMM register // load and zero extend to an YMM register
const __m256i extended = _mm256_cvtepu8_epi16(data); const __m256i extended = _mm256_cvtepu8_epi16(data);
uint n = _mm256_movemask_epi8(extended);
if (!n) { if (!n) {
// store // store
_mm256_storeu_si256((__m256i*)dst, extended); _mm256_storeu_si256((__m256i*)dst, extended);
continue; continue;
} }
#else #else
const int BitSpacing = 1;
// check if everything is ASCII
// movemask extracts the high bit of every byte, so n is non-zero if something isn't ASCII
uint n = _mm_movemask_epi8(data);
if (!n) { if (!n) {
// unpack // unpack
_mm_storeu_si128((__m128i*)dst, _mm_unpacklo_epi8(data, _mm_setzero_si128())); _mm_storeu_si128((__m128i*)dst, _mm_unpacklo_epi8(data, _mm_setzero_si128()));
@@ -164,14 +160,14 @@ static inline bool simdDecodeAscii(char16_t *&dst, const uchar *&nextAscii, cons
// copy the front part that is still ASCII // copy the front part that is still ASCII
while (!(n & 1)) { while (!(n & 1)) {
*dst++ = *src++; *dst++ = *src++;
n >>= BitSpacing; n >>= 1;
} }
// find the next probable ASCII character // find the next probable ASCII character
// we don't want to load 16 bytes again in this loop if we know there are non-ASCII // we don't want to load 16 bytes again in this loop if we know there are non-ASCII
// characters still coming // characters still coming
n = qBitScanReverse(n); n = qBitScanReverse(n);
nextAscii = src + (n / BitSpacing) + 1; nextAscii = src + n + 1;
return false; return false;
} }