QUtf8: add AVX512VL/AVX10.1-256 version of simd{Encode,Decode}Ascii()
We keep the AVX2 looping code and just add the code to perform short loads using masks. This means the SSE2 code for short content gets dead- code-eliminated. I also made a preference for this for exactly 32 characters. The best looping code I could come up with that used the VPMOVUSBW instruction [1] was much worse than the AVX2 code, for either function. Both functions may benefit from 512-bit support, but benchmarking on real hardware is required. [1] https://analysis.godbolt.org/z/scEa8bW1T Change-Id: Ie76ef558f52bb2cf1f60fffd192d947ecb011706 Reviewed-by: Allan Sandfeld Jensen <allan.jensen@qt.io>
This commit is contained in:
parent
8a8e91a7c1
commit
cabadef383
@ -134,6 +134,25 @@ simdEncodeAscii(uchar *&dst, const char16_t *&nextAscii, const char16_t *&src, c
|
||||
return ~_mm256_movemask_epi8(nonAscii);
|
||||
};
|
||||
|
||||
if constexpr (Cpu & CpuFeatureAVX512VL) {
|
||||
// with AVX512/AXV10, we always process everything
|
||||
if (sizeBytes <= Step * sizeof(char16_t)) {
|
||||
uint mask = _bzhi_u32(-1, uint(sizeBytes / 2));
|
||||
__m256i data1 = _mm256_maskz_loadu_epi16(mask, src);
|
||||
__m256i data2 = _mm256_maskz_loadu_epi16(mask >> 16, src + Step / 2);
|
||||
__m256i packed = _mm256_packus_epi16(data1, data2);
|
||||
__m256i permuted = _mm256_permute4x64_epi64(packed, _MM_SHUFFLE(3, 1, 2, 0));
|
||||
__mmask32 nonAscii = _mm256_mask_cmple_epi8_mask(mask, permuted, _mm256_setzero_si256());
|
||||
|
||||
// store, even if there are non-ASCII characters here
|
||||
_mm256_mask_storeu_epi8(dst, mask, permuted);
|
||||
if (nonAscii)
|
||||
return maybeFoundNonAscii(nonAscii);
|
||||
adjustToEnd();
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
if (sizeBytes >= Step * sizeof(char16_t)) {
|
||||
// do 32 characters at a time
|
||||
qptrdiff offset = 0;
|
||||
@ -279,6 +298,25 @@ simdDecodeAscii(char16_t *&dst, const uchar *&nextAscii, const uchar *&src, cons
|
||||
return R{ n1, n2, any };
|
||||
};
|
||||
|
||||
if constexpr (Cpu & CpuFeatureAVX512VL) {
|
||||
// with AVX512/AXV10, we always process everything
|
||||
if (end - src <= Step) {
|
||||
__mmask32 mask = _bzhi_u32(-1, uint(end - src));
|
||||
__m256i data = _mm256_maskz_loadu_epi8(mask, src);
|
||||
__mmask32 nonAscii = _mm256_mask_cmple_epi8_mask(mask, data, _mm256_setzero_si256());
|
||||
|
||||
// store everything, even mojibake
|
||||
__m256i extended1 = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(data));
|
||||
__m256i extended2 = _mm256_cvtepu8_epi16(_mm256_extracti64x2_epi64(data, 1));
|
||||
_mm256_mask_storeu_epi16(dst, mask, extended1);
|
||||
_mm256_mask_storeu_epi16(dst + Step/2, mask >> 16, extended2);
|
||||
if (nonAscii)
|
||||
return maybeFoundNonAscii(nonAscii);
|
||||
adjustToEnd();
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
if (end - src >= Step) {
|
||||
// do 32 characters at a time
|
||||
qptrdiff offset = 0;
|
||||
|
Loading…
x
Reference in New Issue
Block a user