QUtf8: modernize simd{Encode,Decode}Ascii()

This rewrites the looping so we only enter that block if we have 16 or
more characters to process. Moreover, we only loop if there are more
than 16 characters in the first place, otherwise we execute a non-
looping block to handle exactly 16 characters. Unfortunately, the
presence of that block after the loop causes both Clang and GCC to emit
slightly worse code for the loop (LLVM-MCA simulations show no change
for long loops on ADL-P but cost 1 extra cycle on SKL).

We also process in vector mode anything with 4 characters or more,
leaving the scalar code to deal with only up to 3. Those two are
implemented by a pair of overlapping loads and stores. Like the loop
above, we only perform an overlapping store if the previous block had no
non-ASCII character.

For simdEncodeAscii(), We use a slightly ugly calculation of size in
bytes, to avoid the 1 byte gap caused by char16_t being 2 bytes. Clang
generates the exact same code for AVX and AXV2 for this code. The
LLVM-MCA analysis[1] shows it runs at 2.0 cycles/iteration for 16
characters per iteration (8 characters/cycle).

In this simdDecodeAscii() implementation, according to the LLVM-MCA
analysis[2] the non-AVX code (compiled with AVX) runs at 2.3
cycles/iteration, processing 16 characters (~7.0 characters/cycle),
while the AVX2 implementation only needs 1.5 cycles/iteration (10.5
characters/cycle).

Drive-by C++ modernization too.

[1] https://analysis.godbolt.org/z/h5K4xzzW4
[2] https://analysis.godbolt.org/z/3Pfnso787

Change-Id: I19ccdb6485843dd9f206fffd4bd468264c66ace4
Reviewed-by: Allan Sandfeld Jensen <allan.jensen@qt.io>
This commit is contained in:
Thiago Macieira 2024-10-05 08:32:18 -07:00
parent 5ca5b36b6c
commit 15e8014880

View File

@ -68,8 +68,10 @@ static Q_ALWAYS_INLINE uint qBitScanReverse(unsigned v) noexcept
#if defined(__SSE2__)
static inline bool simdEncodeAscii(uchar *&dst, const char16_t *&nextAscii, const char16_t *&src, const char16_t *end)
{
size_t sizeBytes = reinterpret_cast<const char *>(end) - reinterpret_cast<const char *>(src);
// do sixteen characters at a time
for ( ; end - src >= 16; src += 16, dst += 16) {
auto process16Chars = [](uchar *dst, const char16_t *src) {
# ifdef __AVX2__
__m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src));
__m128i data1 = _mm256_castsi256_si128(data);
@ -95,10 +97,15 @@ static inline bool simdEncodeAscii(uchar *&dst, const char16_t *&nextAscii, cons
// n will contain 1 bit set per character in [data1, data2] that is non-ASCII (or NUL)
ushort n = ~_mm_movemask_epi8(nonAscii);
return n;
};
auto maybeFoundNonAscii = [&](auto n, qptrdiff offset = 0) {
if (n) {
// find the next probable ASCII character
// we don't want to load 32 bytes again in this loop if we know there are non-ASCII
// characters still coming
src += offset;
dst += offset;
nextAscii = src + qBitScanReverse(n) + 1;
n = qCountTrailingZeroBits(n);
@ -106,11 +113,34 @@ static inline bool simdEncodeAscii(uchar *&dst, const char16_t *&nextAscii, cons
src += n;
return false;
}
return src == end;
};
auto adjustToEnd = [&] {
dst += sizeBytes / sizeof(char16_t);
src = end;
};
constexpr size_t Step = 16;
if (sizeBytes >= Step * sizeof(char16_t)) {
qptrdiff offset = 0;
for ( ; (offset + Step) * sizeof(char16_t) < sizeBytes; offset += Step) {
ushort n = process16Chars(dst + offset, src + offset);
if (n)
return maybeFoundNonAscii(n, offset);
}
// do sixteen characters again, possibly overlapping with the loop above
adjustToEnd();
ushort n = process16Chars(dst - Step, src - Step);
return maybeFoundNonAscii(n, -int(Step));
}
if (end - src >= 8) {
# if !defined(__OPTIMIZE_SIZE__)
if (sizeBytes >= 8 * sizeof(char16_t)) {
// do eight characters at a time
__m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));
__m128i data2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(end - 8));
__m128i packed = _mm_packus_epi16(data, data);
__m128i nonAscii = _mm_cmpgt_epi8(packed, _mm_setzero_si128());
@ -118,14 +148,37 @@ static inline bool simdEncodeAscii(uchar *&dst, const char16_t *&nextAscii, cons
_mm_storel_epi64(reinterpret_cast<__m128i *>(dst), packed);
uchar n = ~_mm_movemask_epi8(nonAscii);
if (n) {
nextAscii = src + qBitScanReverse(n) + 1;
n = qCountTrailingZeroBits(n);
dst += n;
src += n;
return false;
}
if (n)
return maybeFoundNonAscii(n);
adjustToEnd();
packed = _mm_packus_epi16(data2, data2);
nonAscii = _mm_cmpgt_epi8(packed, _mm_setzero_si128());
_mm_storel_epi64(reinterpret_cast<__m128i *>(dst - 8), packed);
n = ~_mm_movemask_epi8(nonAscii);
return maybeFoundNonAscii(n, -8);
} else if (sizeBytes >= 4 * sizeof(char16_t)) {
// do four characters at a time
__m128i data1 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(src));
__m128i data2 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(end - 4));
__m128i packed = _mm_packus_epi16(data1, data1);
__m128i nonAscii = _mm_cmpgt_epi8(packed, _mm_setzero_si128());
// store even non-ASCII
qToUnaligned(_mm_cvtsi128_si32(packed), dst);
uchar n = uchar(_mm_movemask_epi8(nonAscii) ^ 0xf);
if (n)
return maybeFoundNonAscii(n);
adjustToEnd();
packed = _mm_packus_epi16(data2, data2);
nonAscii = _mm_cmpgt_epi8(packed, _mm_setzero_si128());
qToUnaligned(_mm_cvtsi128_si32(packed), dst - 4);
n = uchar(_mm_movemask_epi8(nonAscii) ^ 0xf);
return maybeFoundNonAscii(n, -4);
}
#endif
return src == end;
}
@ -133,7 +186,7 @@ static inline bool simdEncodeAscii(uchar *&dst, const char16_t *&nextAscii, cons
static inline bool simdDecodeAscii(char16_t *&dst, const uchar *&nextAscii, const uchar *&src, const uchar *end)
{
// do sixteen characters at a time
for ( ; end - src >= 16; src += 16, dst += 16) {
auto process16Chars = [](char16_t *dst, const uchar *src) {
__m128i data = _mm_loadu_si128((const __m128i*)src);
// check if everything is ASCII
@ -151,32 +204,77 @@ static inline bool simdDecodeAscii(char16_t *&dst, const uchar *&nextAscii, cons
_mm_storeu_si128((__m128i*)dst, _mm_unpacklo_epi8(data, _mm_setzero_si128()));
_mm_storeu_si128(1+(__m128i*)dst, _mm_unpackhi_epi8(data, _mm_setzero_si128()));
#endif
return ushort(n);
};
auto maybeFoundNonAscii = [&](uint n, qptrdiff offset = 0) {
// find the next probable ASCII character
// we don't want to load 16 bytes again in this loop if we know there are non-ASCII
// characters still coming
if (n) {
uint c = qCountTrailingZeroBits(n);
src += offset;
dst += offset;
n = qBitScanReverse(n);
nextAscii = src + n + 1;
src += c;
dst += c;
}
return src == end;
};
auto adjustToEnd = [&] {
dst += end - src;
src = end;
};
constexpr qsizetype Step = 16;
if (end - src >= Step) {
qptrdiff offset = 0;
for ( ; offset + Step < end - src; offset += Step) {
ushort n = process16Chars(dst + offset, src + offset);
if (n)
return maybeFoundNonAscii(n, offset);
}
// do one chunk again, possibly overlapping with the loop above
adjustToEnd();
return maybeFoundNonAscii(process16Chars(dst - Step, src - Step), -Step);
}
# if !defined(__OPTIMIZE_SIZE__)
if (end - src >= 8) {
__m128i data = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(src));
__m128i data2 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(end - 8));
uint n = _mm_movemask_epi8(data) & 0xff;
// store everything, even mojibake
_mm_storeu_si128(reinterpret_cast<__m128i *>(dst), _mm_unpacklo_epi8(data, _mm_setzero_si128()));
if (n) {
uint c = qCountTrailingZeroBits(n);
n = qBitScanReverse(n);
nextAscii = src + n + 1;
src += c;
dst += c;
}
if (n)
return maybeFoundNonAscii(n);
// do one chunk again, possibly overlapping the above
adjustToEnd();
n = _mm_movemask_epi8(data2) & 0xff;
data2 = _mm_unpacklo_epi8(data2, _mm_setzero_si128());
_mm_storeu_si128(reinterpret_cast<__m128i *>(dst - 8), data2);
return maybeFoundNonAscii(n, -8);
}
if (end - src >= 4) {
__m128i data = _mm_cvtsi32_si128(qFromUnaligned<quint32>(src));
__m128i data2 = _mm_cvtsi32_si128(qFromUnaligned<quint32>(end - 4));
uchar n = uchar(_mm_movemask_epi8(data) & 0xf);
// store everything, even mojibake
data = _mm_unpacklo_epi8(data, _mm_setzero_si128());
_mm_storel_epi64(reinterpret_cast<__m128i *>(dst), data);
if (n)
return maybeFoundNonAscii(n);
// do one chunk again, possibly overlapping the above
adjustToEnd();
n = uchar(_mm_movemask_epi8(data2) & 0xf);
data2 = _mm_unpacklo_epi8(data2, _mm_setzero_si128());
_mm_storel_epi64(reinterpret_cast<__m128i *>(dst - 4), data2);
return maybeFoundNonAscii(n, -4);
}
#endif
return src == end;
}