ucstrncmp: refactor with 32- and 8-byte loads
First of all, this removes the UB that used to try and calculate the distance between the two strings. That's a valid technique in assembly, but dangerous in C++ and totally unnecessary. The compiler is perfectly able to generate loops with a single induction variable all on its own. Second, this commit makes the main loop use 32-byte comparisons (16 characters at a time), which is a reasonable size for strings. We use AVX2 if that's available, or an unrolled pair of 16-byte loads otherwise. After the existing 16-byte comparison, this commit inserts an 8-byte (4-character) comparison and then reduces the final, unrolled comparison to just 3 characters. Change-Id: Ib48364abee9f464c96c6fffd152e474b39e1f293 Reviewed-by: Allan Sandfeld Jensen <allan.jensen@qt.io>
This commit is contained in:
parent
85278a6970
commit
6e2ad0c79c
@ -646,30 +646,70 @@ static int ucstrncmp(const QChar *a, const QChar *b, size_t l)
|
|||||||
}
|
}
|
||||||
#endif // __mips_dsp
|
#endif // __mips_dsp
|
||||||
#ifdef __SSE2__
|
#ifdef __SSE2__
|
||||||
const char *ptr = reinterpret_cast<const char*>(a);
|
const QChar *end = a + l;
|
||||||
qptrdiff distance = reinterpret_cast<const char*>(b) - ptr;
|
qptrdiff offset = 0;
|
||||||
a += l & ~7;
|
|
||||||
b += l & ~7;
|
|
||||||
l &= 7;
|
|
||||||
|
|
||||||
// we're going to read ptr[0..15] (16 bytes)
|
// we're going to read a[0..15] and b[0..15] (32 bytes)
|
||||||
for ( ; ptr + 15 < reinterpret_cast<const char *>(a); ptr += 16) {
|
for ( ; a + offset + 16 <= end; offset += 16) {
|
||||||
__m128i a_data = _mm_loadu_si128((const __m128i*)ptr);
|
#ifdef __AVX2__
|
||||||
__m128i b_data = _mm_loadu_si128((const __m128i*)(ptr + distance));
|
__m256i a_data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(a + offset));
|
||||||
|
__m256i b_data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(b + offset));
|
||||||
|
__m256i result = _mm256_cmpeq_epi16(a_data, b_data);
|
||||||
|
uint mask = _mm256_movemask_epi8(result);
|
||||||
|
#else
|
||||||
|
__m128i a_data1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(a + offset));
|
||||||
|
__m128i a_data2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(a + offset + 8));
|
||||||
|
__m128i b_data1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(b + offset));
|
||||||
|
__m128i b_data2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(b + offset + 8));
|
||||||
|
__m128i result1 = _mm_cmpeq_epi16(a_data1, b_data1);
|
||||||
|
__m128i result2 = _mm_cmpeq_epi16(a_data2, b_data2);
|
||||||
|
uint mask = _mm_movemask_epi8(result1) | (_mm_movemask_epi8(result2) << 16);
|
||||||
|
#endif
|
||||||
|
mask = ~mask;
|
||||||
|
if (mask) {
|
||||||
|
// found a different character
|
||||||
|
uint idx = qCountTrailingZeroBits(mask);
|
||||||
|
return a[offset + idx / 2].unicode() - b[offset + idx / 2].unicode();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// we're going to read a[0..7] and b[0..7] (16 bytes)
|
||||||
|
if (a + offset + 8 <= end) {
|
||||||
|
__m128i a_data = _mm_loadu_si128(reinterpret_cast<const __m128i *>(a + offset));
|
||||||
|
__m128i b_data = _mm_loadu_si128(reinterpret_cast<const __m128i *>(b + offset));
|
||||||
__m128i result = _mm_cmpeq_epi16(a_data, b_data);
|
__m128i result = _mm_cmpeq_epi16(a_data, b_data);
|
||||||
uint mask = ~_mm_movemask_epi8(result);
|
uint mask = ~_mm_movemask_epi8(result);
|
||||||
if (ushort(mask)) {
|
if (ushort(mask)) {
|
||||||
// found a different byte
|
// found a different character
|
||||||
uint idx = qCountTrailingZeroBits(mask);
|
uint idx = qCountTrailingZeroBits(mask);
|
||||||
return reinterpret_cast<const QChar *>(ptr + idx)->unicode()
|
return a[offset + idx / 2].unicode() - b[offset + idx / 2].unicode();
|
||||||
- reinterpret_cast<const QChar *>(ptr + distance + idx)->unicode();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
offset += 8;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// we're going to read a[0..3] and b[0..3] (8 bytes)
|
||||||
|
if (a + offset + 4 <= end) {
|
||||||
|
__m128i a_data = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(a + offset));
|
||||||
|
__m128i b_data = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(b + offset));
|
||||||
|
__m128i result = _mm_cmpeq_epi16(a_data, b_data);
|
||||||
|
uint mask = ~_mm_movemask_epi8(result);
|
||||||
|
if (uchar(mask)) {
|
||||||
|
// found a different character
|
||||||
|
uint idx = qCountTrailingZeroBits(mask);
|
||||||
|
return a[offset + idx / 2].unicode() - b[offset + idx / 2].unicode();
|
||||||
|
}
|
||||||
|
|
||||||
|
offset += 4;
|
||||||
|
}
|
||||||
|
|
||||||
|
// reset l
|
||||||
|
l &= 3;
|
||||||
|
|
||||||
const auto lambda = [=](size_t i) -> int {
|
const auto lambda = [=](size_t i) -> int {
|
||||||
return reinterpret_cast<const QChar *>(ptr)[i].unicode()
|
return a[offset + i].unicode() - b[offset + i].unicode();
|
||||||
- reinterpret_cast<const QChar *>(ptr + distance)[i].unicode();
|
|
||||||
};
|
};
|
||||||
return UnrollTailLoop<7>::exec(l, 0, lambda, lambda);
|
return UnrollTailLoop<3>::exec(l, 0, lambda, lambda);
|
||||||
#endif
|
#endif
|
||||||
#if defined(__ARM_NEON__) && defined(Q_PROCESSOR_ARM_64) // vaddv is only available on Aarch64
|
#if defined(__ARM_NEON__) && defined(Q_PROCESSOR_ARM_64) // vaddv is only available on Aarch64
|
||||||
if (l >= 8) {
|
if (l >= 8) {
|
||||||
|
Loading…
x
Reference in New Issue
Block a user