Improve a few string operations with AVX2

AVX2 brings the new PMOVZXBW instruction that extends from one 128-bit
SSE register to an 256-bit AVX register. With that, the main decoding
code is just two instructions (the loop requires a couple more to
maintain the offset counter and do the end-of-loop check).

This buys us another 4% performance improvement in the fromLatin1 code,
calculated on top of the VEX-encoded SSE2 code (which is already a little
better than plain SSE2).

Change-Id: I675fa24de4fa97683b662f19d146047251f77359
Reviewed-by: Allan Sandfeld Jensen <allan.jensen@digia.com>
This commit is contained in:
Thiago Macieira 2014-01-26 23:45:27 -08:00 committed by The Qt Project
parent 4dba08eebf
commit 309d3557ca
2 changed files with 55 additions and 17 deletions

View File

@ -103,27 +103,44 @@ static inline bool simdDecodeAscii(ushort *&dst, const uchar *&nextAscii, const
for ( ; end - src >= 16; src += 16, dst += 16) { for ( ; end - src >= 16; src += 16, dst += 16) {
__m128i data = _mm_loadu_si128((__m128i*)src); __m128i data = _mm_loadu_si128((__m128i*)src);
#ifdef __AVX2__
const int BitSpacing = 2;
// load and zero extend to an YMM register
const __m256i extended = _mm256_cvtepu8_epi16(data);
uint n = _mm256_movemask_epi8(extended);
if (!n) {
// store
_mm256_storeu_si256((__m256i*)dst, extended);
continue;
}
#else
const int BitSpacing = 1;
// check if everything is ASCII // check if everything is ASCII
// movemask extracts the high bit of every byte, so n is non-zero if something isn't ASCII // movemask extracts the high bit of every byte, so n is non-zero if something isn't ASCII
uint n = _mm_movemask_epi8(data); uint n = _mm_movemask_epi8(data);
if (n) { if (!n) {
// copy the front part that is still ASCII // unpack
while (!(n & 1)) { _mm_storeu_si128((__m128i*)dst, _mm_unpacklo_epi8(data, _mm_setzero_si128()));
*dst++ = *src++; _mm_storeu_si128(1+(__m128i*)dst, _mm_unpackhi_epi8(data, _mm_setzero_si128()));
n >>= 1; continue;
} }
#endif
// find the next probable ASCII character // copy the front part that is still ASCII
// we don't want to load 16 bytes again in this loop if we know there are non-ASCII while (!(n & 1)) {
// characters still coming *dst++ = *src++;
n = _bit_scan_reverse(n); n >>= BitSpacing;
nextAscii = src + n + 1;
return false;
} }
// unpack // find the next probable ASCII character
_mm_storeu_si128((__m128i*)dst, _mm_unpacklo_epi8(data, _mm_setzero_si128())); // we don't want to load 16 bytes again in this loop if we know there are non-ASCII
_mm_storeu_si128(1+(__m128i*)dst, _mm_unpackhi_epi8(data, _mm_setzero_si128())); // characters still coming
n = _bit_scan_reverse(n);
nextAscii = src + (n / BitSpacing) + 1;
return false;
} }
return src == end; return src == end;
} }

View File

@ -223,8 +223,15 @@ void qt_from_latin1(ushort *dst, const char *str, size_t size)
// we're going to read str[offset..offset+15] (16 bytes) // we're going to read str[offset..offset+15] (16 bytes)
for ( ; str + offset + 15 < e; offset += 16) { for ( ; str + offset + 15 < e; offset += 16) {
const __m128i nullMask = _mm_set1_epi32(0);
const __m128i chunk = _mm_loadu_si128((__m128i*)(str + offset)); // load const __m128i chunk = _mm_loadu_si128((__m128i*)(str + offset)); // load
#ifdef __AVX2__
// zero extend to an YMM register
const __m256i extended = _mm256_cvtepu8_epi16(chunk);
// store
_mm256_storeu_si256((__m256i*)(dst + offset), extended);
#else
const __m128i nullMask = _mm_set1_epi32(0);
// unpack the first 8 bytes, padding with zeros // unpack the first 8 bytes, padding with zeros
const __m128i firstHalf = _mm_unpacklo_epi8(chunk, nullMask); const __m128i firstHalf = _mm_unpacklo_epi8(chunk, nullMask);
@ -233,6 +240,7 @@ void qt_from_latin1(ushort *dst, const char *str, size_t size)
// unpack the last 8 bytes, padding with zeros // unpack the last 8 bytes, padding with zeros
const __m128i secondHalf = _mm_unpackhi_epi8 (chunk, nullMask); const __m128i secondHalf = _mm_unpackhi_epi8 (chunk, nullMask);
_mm_storeu_si128((__m128i*)(dst + offset + 8), secondHalf); // store _mm_storeu_si128((__m128i*)(dst + offset + 8), secondHalf); // store
#endif
} }
size = size % 16; size = size % 16;
@ -540,8 +548,20 @@ static int ucstrncmp(const QChar *a, const uchar *c, int l)
// and c[offset..offset+15] (16 bytes) // and c[offset..offset+15] (16 bytes)
for ( ; uc + offset + 15 < e; offset += 16) { for ( ; uc + offset + 15 < e; offset += 16) {
// similar to fromLatin1_helper: // similar to fromLatin1_helper:
// load Latin 1 data and expand to UTF-16 // load 16 bytes of Latin 1 data
__m128i chunk = _mm_loadu_si128((__m128i*)(c + offset)); __m128i chunk = _mm_loadu_si128((__m128i*)(c + offset));
# ifdef __AVX2__
// expand Latin 1 data via zero extension
__m256i ldata = _mm256_cvtepu8_epi16(chunk);
// load UTF-16 data and compare
__m256i ucdata = _mm256_loadu_si256((__m256i*)(uc + offset));
__m256i result = _mm256_cmpeq_epi16(ldata, ucdata);
uint mask = ~_mm256_movemask_epi8(result);
# else
// expand via unpacking
__m128i firstHalf = _mm_unpacklo_epi8(chunk, nullmask); __m128i firstHalf = _mm_unpacklo_epi8(chunk, nullmask);
__m128i secondHalf = _mm_unpackhi_epi8(chunk, nullmask); __m128i secondHalf = _mm_unpackhi_epi8(chunk, nullmask);
@ -552,6 +572,7 @@ static int ucstrncmp(const QChar *a, const uchar *c, int l)
__m128i result2 = _mm_cmpeq_epi16(secondHalf, ucdata2); __m128i result2 = _mm_cmpeq_epi16(secondHalf, ucdata2);
uint mask = ~(_mm_movemask_epi8(result1) | _mm_movemask_epi8(result2) << 16); uint mask = ~(_mm_movemask_epi8(result1) | _mm_movemask_epi8(result2) << 16);
# endif
if (mask) { if (mask) {
// found a different character // found a different character
uint idx = uint(_bit_scan_forward(mask)); uint idx = uint(_bit_scan_forward(mask));