Improve the code generation for the Latin1 codec

This change does not modify the algorithm itself. It only restructures
the source code so that the compiler generates better code:
 - modify only one variable per loop iteration (the "offset" variable)
 - unroll the tail loop that converts the last (up to 15) characters

The Neon code for the toLatin1 codec would most likely benefit from the
same tail unrolling, but it is left as is because I cannot verify that
the change would not break anything there.

Change-Id: I8a92fd3c1aa700e6f8b0c8ebdb1978ade394757f
Reviewed-by: Olivier Goffart <ogoffart@woboq.com>
This commit is contained in:
Thiago Macieira 2014-01-16 15:25:50 -08:00 committed by The Qt Project
parent ab3637dd67
commit f7308e007e

View File

@ -190,6 +190,16 @@ template <uint MaxCount> struct UnrollTailLoop
return UnrollTailLoop<MaxCount - 1>::exec(count - 1, returnIfExited, loopCheck, returnIfFailed, i + 1); return UnrollTailLoop<MaxCount - 1>::exec(count - 1, returnIfExited, loopCheck, returnIfFailed, i + 1);
} }
// Convenience overload: invokes code(i) once for each i in [0, count) and
// returns nothing.
// NOTE(review): relies on the multi-argument exec() overload of this struct
// (only partly visible here) treating a loopCheck result of false as
// "keep iterating" -- confirm against that overload.
template <typename Functor>
static inline void exec(int count, Functor code)
{
/* equivalent to:
* for (int i = 0; i < count; ++i)
* code(i);
*/
// Adapt the void functor to the general overload: the wrapper lambda runs
// code(i) and always returns false; the literal 0 and the second lambda only
// supply the int values that overload's signature asks for, and the value it
// returns is discarded here.
exec(count, 0, [=](int i) -> bool { code(i); return false; }, [](int) { return 0; });
}
}; };
template <> template <typename RetType, typename Functor1, typename Functor2> template <> template <typename RetType, typename Functor1, typename Functor2>
inline RetType UnrollTailLoop<0>::exec(int, RetType returnIfExited, Functor1, Functor2, int) inline RetType UnrollTailLoop<0>::exec(int, RetType returnIfExited, Functor1, Functor2, int)
@ -207,25 +217,29 @@ static void qt_from_latin1(ushort *dst, const char *str, size_t size)
* The same method gives no improvement with NEON. * The same method gives no improvement with NEON.
*/ */
#if defined(__SSE2__) #if defined(__SSE2__)
if (size >= 16) { const char *e = str + size;
int chunkCount = size >> 4; // divided by 16 qptrdiff offset = 0;
// we're going to read str[offset..offset+15] (16 bytes)
for ( ; str + offset + 15 < e; offset += 16) {
const __m128i nullMask = _mm_set1_epi32(0); const __m128i nullMask = _mm_set1_epi32(0);
for (int i = 0; i < chunkCount; ++i) { const __m128i chunk = _mm_loadu_si128((__m128i*)(str + offset)); // load
const __m128i chunk = _mm_loadu_si128((__m128i*)str); // load
str += 16;
// unpack the first 8 bytes, padding with zeros // unpack the first 8 bytes, padding with zeros
const __m128i firstHalf = _mm_unpacklo_epi8(chunk, nullMask); const __m128i firstHalf = _mm_unpacklo_epi8(chunk, nullMask);
_mm_storeu_si128((__m128i*)dst, firstHalf); // store _mm_storeu_si128((__m128i*)(dst + offset), firstHalf); // store
dst += 8;
// unpack the last 8 bytes, padding with zeros // unpack the last 8 bytes, padding with zeros
const __m128i secondHalf = _mm_unpackhi_epi8 (chunk, nullMask); const __m128i secondHalf = _mm_unpackhi_epi8 (chunk, nullMask);
_mm_storeu_si128((__m128i*)dst, secondHalf); // store _mm_storeu_si128((__m128i*)(dst + offset + 8), secondHalf); // store
dst += 8;
} }
size = size % 16; size = size % 16;
} dst += offset;
str += offset;
# ifdef Q_COMPILER_LAMBDA
return UnrollTailLoop<15>::exec(size, [=](int i) { dst[i] = (uchar)str[i]; });
# endif
#endif #endif
#if defined(__mips_dsp) #if defined(__mips_dsp)
if (size > 20) if (size > 20)
@ -295,28 +309,30 @@ static inline __m128i mergeQuestionMarks(__m128i chunk)
static void qt_to_latin1(uchar *dst, const ushort *src, int length) static void qt_to_latin1(uchar *dst, const ushort *src, int length)
{ {
if (length) {
#if defined(__SSE2__) #if defined(__SSE2__)
if (length >= 16) { uchar *e = dst + length;
const int chunkCount = length >> 4; // divided by 16 qptrdiff offset = 0;
for (int i = 0; i < chunkCount; ++i) { // we're going to write to dst[offset..offset+15] (16 bytes)
__m128i chunk1 = _mm_loadu_si128((__m128i*)src); // load for ( ; dst + offset + 15 < e; offset += 16) {
__m128i chunk1 = _mm_loadu_si128((__m128i*)(src + offset)); // load
chunk1 = mergeQuestionMarks(chunk1); chunk1 = mergeQuestionMarks(chunk1);
src += 8;
__m128i chunk2 = _mm_loadu_si128((__m128i*)src); // load __m128i chunk2 = _mm_loadu_si128((__m128i*)(src + offset + 8)); // load
chunk2 = mergeQuestionMarks(chunk2); chunk2 = mergeQuestionMarks(chunk2);
src += 8;
// pack the two vector to 16 x 8bits elements // pack the two vector to 16 x 8bits elements
const __m128i result = _mm_packus_epi16(chunk1, chunk2); const __m128i result = _mm_packus_epi16(chunk1, chunk2);
_mm_storeu_si128((__m128i*)(dst + offset), result); // store
}
_mm_storeu_si128((__m128i*)dst, result); // store
dst += 16;
}
length = length % 16; length = length % 16;
} dst += offset;
src += offset;
# ifdef Q_COMPILER_LAMBDA
return UnrollTailLoop<15>::exec(length, [=](int i) { dst[i] = (src[i]>0xff) ? '?' : (uchar) src[i]; });
# endif
#elif defined(__ARM_NEON__) #elif defined(__ARM_NEON__)
// Refer to the documentation of the SSE2 implementation // Refer to the documentation of the SSE2 implementation
// this use eactly the same method as for SSE except: // this use eactly the same method as for SSE except:
@ -349,7 +365,6 @@ static void qt_to_latin1(uchar *dst, const ushort *src, int length)
++src; ++src;
} }
#endif #endif
}
} }
// Unicode case-insensitive comparison // Unicode case-insensitive comparison