Further optimize the loading of 8 Latin-1 characters
This is important when AVX is enabled, because the compiler can then combine the VMOVQ load and the VPMOVZXBW instruction into a single VPMOVZXBW with a direct memory operand. That form is guaranteed to read only 8 bytes, so it is safe even close to the end of a page. Clang and ICC already combine the instructions as desired, and I have filed a request for GCC to do the same [1]. AVX was first introduced in 2011, so plenty of computers today would benefit from this.

[1] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87317

Change-Id: I8f261579aad648fdb4f0fffd1553e08e90df3171
Reviewed-by: Allan Sandfeld Jensen <allan.jensen@qt.io>
This commit is contained in:
parent
2e715c31ed
commit
d36a4fc197
@ -415,6 +415,21 @@ static bool simdTestMask(const char *&ptr, const char *end, quint32 maskval)
|
|||||||
|
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Loads 8 bytes from ptr and zero-extends each byte to 16 bits, returning the
// eight resulting 16-bit lanes as one 128-bit vector. The memory access is an
// 8-byte load (low 64 bits only), never a full 16-byte one.
static Q_ALWAYS_INLINE __m128i mm_load8_zero_extend(const void *ptr)
{
    // MOVQ: fetch the 8 source bytes into the low half of the register;
    // both code paths below start from this same load
    const __m128i packed = _mm_loadl_epi64(static_cast<const __m128i *>(ptr));

#if defined(__SSE4_1__)
    // PMOVZXBW widens the low 8 bytes directly. When building with AVX the
    // compiler is expected to fold the load above into a single VPMOVZXBW
    // with a memory operand.
    return _mm_cvtepu8_epi16(packed);
#else
    // No SSE4.1: interleave with zero bytes instead (PUNPCKLBW)
    return _mm_unpacklo_epi8(packed, _mm_setzero_si128());
#endif
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// Note: ptr on output may be off by one and point to a preceding US-ASCII
|
// Note: ptr on output may be off by one and point to a preceding US-ASCII
|
||||||
@ -585,8 +600,7 @@ void qt_from_latin1(ushort *dst, const char *str, size_t size) Q_DECL_NOTHROW
|
|||||||
|
|
||||||
// we're going to read str[offset..offset+7] (8 bytes)
|
// we're going to read str[offset..offset+7] (8 bytes)
|
||||||
if (str + offset + 7 < e) {
|
if (str + offset + 7 < e) {
|
||||||
const __m128i chunk = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(str + offset));
|
const __m128i unpacked = mm_load8_zero_extend(str + offset);
|
||||||
const __m128i unpacked = _mm_unpacklo_epi8(chunk, _mm_setzero_si128());
|
|
||||||
_mm_storeu_si128(reinterpret_cast<__m128i *>(dst + offset), unpacked);
|
_mm_storeu_si128(reinterpret_cast<__m128i *>(dst + offset), unpacked);
|
||||||
offset += 8;
|
offset += 8;
|
||||||
}
|
}
|
||||||
@ -1044,8 +1058,7 @@ static int ucstrncmp(const QChar *a, const uchar *c, size_t l)
|
|||||||
// we'll read uc[offset..offset+7] (16 bytes) and c[offset..offset+7] (8 bytes)
|
// we'll read uc[offset..offset+7] (16 bytes) and c[offset..offset+7] (8 bytes)
|
||||||
if (uc + offset + 7 < e) {
|
if (uc + offset + 7 < e) {
|
||||||
// same, but we're using an 8-byte load
|
// same, but we're using an 8-byte load
|
||||||
__m128i chunk = _mm_loadl_epi64((const __m128i*)(c + offset));
|
__m128i secondHalf = mm_load8_zero_extend(c + offset);
|
||||||
__m128i secondHalf = _mm_unpacklo_epi8(chunk, nullmask);
|
|
||||||
|
|
||||||
__m128i ucdata = _mm_loadu_si128((const __m128i*)(uc + offset));
|
__m128i ucdata = _mm_loadu_si128((const __m128i*)(uc + offset));
|
||||||
__m128i result = _mm_cmpeq_epi16(secondHalf, ucdata);
|
__m128i result = _mm_cmpeq_epi16(secondHalf, ucdata);
|
||||||
|
Loading…
x
Reference in New Issue
Block a user