diff --git a/src/corelib/text/qstringconverter.cpp b/src/corelib/text/qstringconverter.cpp index 04eb762ce5a..8b9d18f40a1 100644 --- a/src/corelib/text/qstringconverter.cpp +++ b/src/corelib/text/qstringconverter.cpp @@ -66,20 +66,15 @@ static Q_ALWAYS_INLINE uint qBitScanReverse(unsigned v) noexcept #endif #if defined(__SSE2__) -static inline bool simdEncodeAscii(uchar *&dst, const char16_t *&nextAscii, const char16_t *&src, const char16_t *end) +template static Q_ALWAYS_INLINE bool +simdEncodeAscii(uchar *&dst, const char16_t *&nextAscii, const char16_t *&src, const char16_t *end) { size_t sizeBytes = reinterpret_cast(end) - reinterpret_cast(src); // do sixteen characters at a time auto process16Chars = [](uchar *dst, const char16_t *src) { -# ifdef __AVX2__ - __m256i data = _mm256_loadu_si256(reinterpret_cast(src)); - __m128i data1 = _mm256_castsi256_si128(data); - __m128i data2 = _mm256_extracti128_si256(data, 1); -# else __m128i data1 = _mm_loadu_si128((const __m128i*)src); __m128i data2 = _mm_loadu_si128(1+(const __m128i*)src); -# endif // check if everything is ASCII // the highest ASCII value is U+007F @@ -120,6 +115,40 @@ static inline bool simdEncodeAscii(uchar *&dst, const char16_t *&nextAscii, cons src = end; }; + if constexpr (Cpu & CpuFeatureAVX2) { + // The 256-bit VPACKUSWB[1] instruction interleaves the two input + // operands, so we need an extra permutation to get them back in-order. + // VPERMW takes 2 cyles to run while VPERMQ takes only 1. + // [1] https://www.felixcloutier.com/x86/PACKUSWB.html + constexpr size_t Step = 32; + auto process32Chars = [](const char16_t *src, uchar *dst) { + __m256i data1 = _mm256_loadu_si256(reinterpret_cast(src)); + __m256i data2 = _mm256_loadu_si256(reinterpret_cast(src) + 1); + __m256i packed = _mm256_packus_epi16(data1, data2); // will be [A, B, A, B] + __m256i permuted = _mm256_permute4x64_epi64(packed, _MM_SHUFFLE(3, 1, 2, 0)); + __m256i nonAscii = _mm256_cmpgt_epi8(permuted, _mm256_setzero_si256()); + + // store, even if there are non-ASCII characters here + _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), permuted); + + return ~_mm256_movemask_epi8(nonAscii); + }; + + if (sizeBytes >= Step * sizeof(char16_t)) { + // do 32 characters at a time + qptrdiff offset = 0; + for ( ; (offset + Step) * sizeof(char16_t) < sizeBytes; offset += Step) { + if (uint n = process32Chars(src + offset, dst + offset)) + return maybeFoundNonAscii(n, offset); + } + + // do 32 characters again, possibly overlapping with the loop above + adjustToEnd(); + uint n = process32Chars(src - Step, dst - Step); + return maybeFoundNonAscii(n, -int(Step)); + } + } + constexpr size_t Step = 16; if (sizeBytes >= Step * sizeof(char16_t)) { @@ -128,6 +157,8 @@ static inline bool simdEncodeAscii(uchar *&dst, const char16_t *&nextAscii, cons ushort n = process16Chars(dst + offset, src + offset); if (n) return maybeFoundNonAscii(n, offset); + if (Cpu & CpuFeatureAVX2) + break; // we can only ever loop once because of the code above } // do sixteen characters again, possibly overlapping with the loop above @@ -183,7 +214,8 @@ static inline bool simdEncodeAscii(uchar *&dst, const char16_t *&nextAscii, cons return src == end; } -static inline bool simdDecodeAscii(char16_t *&dst, const uchar *&nextAscii, const uchar *&src, const uchar *end) +template static Q_ALWAYS_INLINE bool +simdDecodeAscii(char16_t *&dst, const uchar *&nextAscii, const uchar *&src, const uchar *end) { // do sixteen characters at a time auto process16Chars = [](char16_t *dst, const uchar *src) { @@ -193,17 +225,9 @@ static inline bool simdDecodeAscii(char16_t *&dst, const uchar *&nextAscii, cons // movemask extracts the high bit of every byte, so n is non-zero if something isn't ASCII uint n = _mm_movemask_epi8(data); -#ifdef __AVX2__ - // load and zero extend to an YMM register - const __m256i extended = _mm256_cvtepu8_epi16(data); - - // store everything, even mojibake - _mm256_storeu_si256((__m256i*)dst, extended); -#else // store everything, even mojibake _mm_storeu_si128((__m128i*)dst, _mm_unpacklo_epi8(data, _mm_setzero_si128())); _mm_storeu_si128(1+(__m128i*)dst, _mm_unpackhi_epi8(data, _mm_setzero_si128())); -#endif return ushort(n); }; auto maybeFoundNonAscii = [&](uint n, qptrdiff offset = 0) { @@ -226,6 +250,51 @@ static inline bool simdDecodeAscii(char16_t *&dst, const uchar *&nextAscii, cons src = end; }; + if constexpr (Cpu & CpuFeatureAVX2) { + constexpr qsizetype Step = 32; + auto process32Chars = [](char16_t *dst, const uchar *src) { + __m128i data1 = _mm_loadu_si128(reinterpret_cast(src)); + __m128i data2 = _mm_loadu_si128(reinterpret_cast(src) + 1); + + // the processor can execute this VPOR (dispatches 3/cycle) faster + // than waiting for the VPMOVMSKB (1/cycle) of both data to check + // their masks + __m128i ored = _mm_or_si128(data1, data2); + bool any = _mm_movemask_epi8(ored); + + // store everything, even mojibake + __m256i extended1 = _mm256_cvtepu8_epi16(data1); + __m256i extended2 = _mm256_cvtepu8_epi16(data2); + _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst), extended1); + _mm256_storeu_si256(reinterpret_cast<__m256i *>(dst) + 1, extended2); + + uint n1 = _mm_movemask_epi8(data1); + uint n2 = _mm_movemask_epi8(data2); + struct R { + uint n1, n2; + bool any; + operator bool() const { return any; } + operator uint() const { return n1|(n2 << 16); } + }; + return R{ n1, n2, any }; + }; + + if (end - src >= Step) { + // do 32 characters at a time + qptrdiff offset = 0; + for ( ; offset + Step < end - src; offset += Step) { + auto r = process32Chars(dst + offset, src + offset); + if (r) + return maybeFoundNonAscii(r, offset); + } + + // do 32 characters again, possibly overlapping with the loop above + adjustToEnd(); + auto r = process32Chars(dst - Step, src - Step); + return maybeFoundNonAscii(r, -Step); + } + } + constexpr qsizetype Step = 16; if (end - src >= Step) { qptrdiff offset = 0; @@ -233,6 +302,8 @@ static inline bool simdDecodeAscii(char16_t *&dst, const uchar *&nextAscii, cons ushort n = process16Chars(dst + offset, src + offset); if (n) return maybeFoundNonAscii(n, offset); + if (Cpu & CpuFeatureAVX2) + break; // we can only ever loop once because of the code above } // do one chunk again, possibly overlapping with the loop above