Switch epilogues of AVX2 conversions to single step

Not only does it take fewer instructions, but all the logic except for
the load and store can be identical to the main loop.

Change-Id: I2caac0c7504d94e404bd8cfe5080aff07ba2d465
Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
Author:    Allan Sandfeld Jensen
Date:      2019-01-31 11:58:26 +01:00
Committer: Liang Qi
parent a6f25dedd8
commit 0d3b913da7

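For context before the diff (an illustrative sketch, not part of the patch): _mm256_maskload_epi32 and _mm256_maskstore_epi32 enable each 32-bit lane from the sign bit of the matching mask lane; disabled lanes read as zero on load and are left untouched on store. That is what allows a tail of 1 to 7 remaining pixels to be handled in one masked 8-pixel step instead of a second loop. The helper tailCopy below is hypothetical, a demonstration of the primitive rather than Qt code:

    #include <immintrin.h>

    // Copy an arbitrary tail of 1..8 uints with one masked AVX2 load/store.
    static void tailCopy(unsigned *dst, const unsigned *src, long long count)
    {
        // Lane i of the mask holds i - count: negative (sign bit set) iff i < count.
        const __m256i mask = _mm256_add_epi32(
                _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7),
                _mm256_set1_epi32(int(-count)));
        const __m256i v = _mm256_maskload_epi32(reinterpret_cast<const int *>(src), mask);
        _mm256_maskstore_epi32(reinterpret_cast<int *>(dst), mask, v);
    }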

@@ -995,16 +995,11 @@ void QT_FASTCALL fetchTransformedBilinearARGB32PM_fast_rotate_helper_avx2(uint *
     }
 }
 
-static inline __m128i maskFromCount(qsizetype count)
+static inline __m256i epilogueMaskFromCount(qsizetype count)
 {
     Q_ASSERT(count > 0);
-    static const qint64 data[] = { -1, -1, 0, 0 };
-    auto ptr = reinterpret_cast<const quint8 *>(data) + sizeof(__m128i);
-    if (count > int(sizeof(__m128i)))
-        return _mm_set1_epi8(-1);
-    return _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr - count));
+    static const __m256i offsetMask = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+    return _mm256_add_epi32(offsetMask, _mm256_set1_epi32(-count));
 }
 
 template<bool RGBA>
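Worked example (illustration only, not from the patch): with count = 3 the new helper returns lanes {0,1,2,3,4,5,6,7} + (-3) = {-3,-2,-1,0,1,2,3,4}, so only lanes 0..2 carry a sign bit and exactly three pixels get loaded and stored. A hypothetical self-check of that claim:

    #include <immintrin.h>
    #include <cassert>

    static void checkEpilogueMask()
    {
        const __m256i mask = _mm256_add_epi32(_mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7),
                                              _mm256_set1_epi32(-3));
        alignas(32) int lanes[8];
        _mm256_store_si256(reinterpret_cast<__m256i *>(lanes), mask);
        for (int i = 0; i < 8; ++i)
            assert((lanes[i] < 0) == (i < 3)); // sign bit set only for lanes 0..2
    }

Note also that the old maskFromCount took a byte count (hence the "* sizeof(*src)" at its call sites below), while the new helper takes a pixel count.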
@@ -1050,40 +1045,39 @@ static void convertARGBToARGB32PM_avx2(uint *buffer, const uint *src, qsizetype
         }
     }
 
-    for ( ; i < count; i += 4) {
-        __m128i maskedAlphaMask = _mm256_castsi256_si128(alphaMask);
-        __m128i mask = maskFromCount((count - i) * sizeof(*src));
-        maskedAlphaMask = _mm_and_si128(mask, maskedAlphaMask);
-        __m128i srcVector = _mm_maskload_epi32(reinterpret_cast<const int *>(src + i), mask);
-        if (!_mm_testz_si128(srcVector, maskedAlphaMask)) {
+    if (i < count) {
+        const __m256i epilogueMask = epilogueMaskFromCount(count - i);
+        __m256i srcVector = _mm256_maskload_epi32(reinterpret_cast<const int *>(src + i), epilogueMask);
+        const __m256i epilogueAlphaMask = _mm256_blendv_epi8(_mm256_setzero_si256(), alphaMask, epilogueMask);
+        if (!_mm256_testz_si256(srcVector, epilogueAlphaMask)) {
             // keep the two _mm_test[zc]_siXXX next to each other
-            bool cf = _mm_testc_si128(srcVector, maskedAlphaMask);
+            bool cf = _mm256_testc_si256(srcVector, epilogueAlphaMask);
             if (RGBA)
-                srcVector = _mm_shuffle_epi8(srcVector, _mm256_castsi256_si128(rgbaMask));
+                srcVector = _mm256_shuffle_epi8(srcVector, rgbaMask);
             if (!cf) {
-                __m128i src1 = _mm_unpacklo_epi8(srcVector, _mm256_castsi256_si128(zero));
-                __m128i src2 = _mm_unpackhi_epi8(srcVector, _mm256_castsi256_si128(zero));
-                __m128i alpha1 = _mm_shuffle_epi8(src1, _mm256_castsi256_si128(shuffleMask));
-                __m128i alpha2 = _mm_shuffle_epi8(src2, _mm256_castsi256_si128(shuffleMask));
-                src1 = _mm_mullo_epi16(src1, alpha1);
-                src2 = _mm_mullo_epi16(src2, alpha2);
-                src1 = _mm_add_epi16(src1, _mm_srli_epi16(src1, 8));
-                src2 = _mm_add_epi16(src2, _mm_srli_epi16(src2, 8));
-                src1 = _mm_add_epi16(src1, _mm256_castsi256_si128(half));
-                src2 = _mm_add_epi16(src2, _mm256_castsi256_si128(half));
-                src1 = _mm_srli_epi16(src1, 8);
-                src2 = _mm_srli_epi16(src2, 8);
-                src1 = _mm_blend_epi16(src1, alpha1, 0x88);
-                src2 = _mm_blend_epi16(src2, alpha2, 0x88);
-                srcVector = _mm_packus_epi16(src1, src2);
-                _mm_maskstore_epi32(reinterpret_cast<int *>(buffer + i), mask, srcVector);
+                __m256i src1 = _mm256_unpacklo_epi8(srcVector, zero);
+                __m256i src2 = _mm256_unpackhi_epi8(srcVector, zero);
+                __m256i alpha1 = _mm256_shuffle_epi8(src1, shuffleMask);
+                __m256i alpha2 = _mm256_shuffle_epi8(src2, shuffleMask);
+                src1 = _mm256_mullo_epi16(src1, alpha1);
+                src2 = _mm256_mullo_epi16(src2, alpha2);
+                src1 = _mm256_add_epi16(src1, _mm256_srli_epi16(src1, 8));
+                src2 = _mm256_add_epi16(src2, _mm256_srli_epi16(src2, 8));
+                src1 = _mm256_add_epi16(src1, half);
+                src2 = _mm256_add_epi16(src2, half);
+                src1 = _mm256_srli_epi16(src1, 8);
+                src2 = _mm256_srli_epi16(src2, 8);
+                src1 = _mm256_blend_epi16(src1, alpha1, 0x88);
+                src2 = _mm256_blend_epi16(src2, alpha2, 0x88);
+                srcVector = _mm256_packus_epi16(src1, src2);
+                _mm256_maskstore_epi32(reinterpret_cast<int *>(buffer + i), epilogueMask, srcVector);
             } else {
                 if (buffer != src || RGBA)
-                    _mm_maskstore_epi32(reinterpret_cast<int *>(buffer + i), mask, srcVector);
+                    _mm256_maskstore_epi32(reinterpret_cast<int *>(buffer + i), epilogueMask, srcVector);
             }
         } else {
-            _mm_maskstore_epi32(reinterpret_cast<int *>(buffer + i), mask, _mm256_castsi256_si128(zero));
+            _mm256_maskstore_epi32(reinterpret_cast<int *>(buffer + i), epilogueMask, zero);
         }
     }
 }
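The arithmetic in the rewritten epilogue now matches the main loop; per channel it computes a rounded x*a/255 without a division. A scalar model of the mullo/srli/add chain above (a sketch, not part of the patch):

    // q = (x * a) / 255 with rounding, matching the SIMD sequence:
    static unsigned char premultiply(unsigned char x, unsigned char a)
    {
        unsigned t = unsigned(x) * a;   // _mm256_mullo_epi16
        t += t >> 8;                    // add_epi16(t, srli_epi16(t, 8))
        t += 0x80;                      // the 'half' constant
        return (unsigned char)(t >> 8); // final srli_epi16(t, 8)
    }

The 0x88 blend afterwards copies the alpha words back over every fourth 16-bit element, so the alpha channel itself is left unmultiplied.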
@@ -1116,13 +1110,13 @@ template<bool RGBA>
 static void convertARGBToRGBA64PM_avx2(QRgba64 *buffer, const uint *src, qsizetype count)
 {
     qsizetype i = 0;
-    const __m256i alphaMask = _mm256_broadcastsi128_si256(_mm_set1_epi32(0xff000000));
+    const __m256i alphaMask = _mm256_set1_epi32(0xff000000);
     const __m256i rgbaMask = _mm256_broadcastsi128_si256(_mm_setr_epi8(2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15));
     const __m256i shuffleMask = _mm256_broadcastsi128_si256(_mm_setr_epi8(6, 7, 6, 7, 6, 7, 6, 7, 14, 15, 14, 15, 14, 15, 14, 15));
     const __m256i zero = _mm256_setzero_si256();
 
     for (; i < count - 7; i += 8) {
-        __m256i src1, src2;
+        __m256i dst1, dst2;
         __m256i srcVector = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + i));
         if (!_mm256_testz_si256(srcVector, alphaMask)) {
             // keep the two _mm_test[zc]_siXXX next to each other
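The testz/testc pair that this comment asks to keep adjacent classifies the whole vector with two flag-setting instructions: testz is true when every masked alpha byte is zero (all pixels fully transparent, so zeros are stored), and testc is true when every masked alpha byte is 0xff (all fully opaque, so premultiplication is a no-op). In the epilogues the alpha mask is first narrowed to the enabled lanes (epilogueAlphaMask) so out-of-range lanes cannot spoil either test. A scalar model of the two predicates (illustration only):

    // ~ _mm256_testz_si256(src, alphaMask): ZF set iff (src & mask) == 0
    static bool allTransparent(const unsigned *px, int n)
    {
        for (int i = 0; i < n; ++i)
            if (px[i] & 0xff000000u)
                return false;
        return true;
    }

    // ~ _mm256_testc_si256(src, alphaMask): CF set iff (~src & mask) == 0
    static bool allOpaque(const unsigned *px, int n)
    {
        for (int i = 0; i < n; ++i)
            if ((px[i] & 0xff000000u) != 0xff000000u)
                return false;
        return true;
    }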
@@ -1138,64 +1132,70 @@ static void convertARGBToRGBA64PM_avx2(QRgba64 *buffer, const uint *src, qsizety
             // after unpacklo/hi [ P1, P2; P3, P4 ] [ P5, P6; P7, P8 ]
             srcVector = _mm256_permute4x64_epi64(srcVector, _MM_SHUFFLE(3, 1, 2, 0));
+            const __m256i src1 = _mm256_unpacklo_epi8(srcVector, srcVector);
+            const __m256i src2 = _mm256_unpackhi_epi8(srcVector, srcVector);
             if (!cf) {
-                src1 = _mm256_unpacklo_epi8(srcVector, zero);
-                src2 = _mm256_unpackhi_epi8(srcVector, zero);
-                __m256i alpha1 = _mm256_shuffle_epi8(src1, shuffleMask);
-                __m256i alpha2 = _mm256_shuffle_epi8(src2, shuffleMask);
-                src1 = _mm256_mullo_epi16(src1, alpha1);
-                src2 = _mm256_mullo_epi16(src2, alpha2);
-                alpha1 = _mm256_unpacklo_epi8(srcVector, srcVector);
-                alpha2 = _mm256_unpackhi_epi8(srcVector, srcVector);
-                src1 = _mm256_add_epi16(src1, _mm256_srli_epi16(src1, 7));
-                src2 = _mm256_add_epi16(src2, _mm256_srli_epi16(src2, 7));
-                src1 = _mm256_blend_epi16(src1, alpha1, 0x88);
-                src2 = _mm256_blend_epi16(src2, alpha2, 0x88);
+                dst1 = _mm256_unpacklo_epi8(srcVector, zero);
+                dst2 = _mm256_unpackhi_epi8(srcVector, zero);
+                const __m256i alpha1 = _mm256_shuffle_epi8(dst1, shuffleMask);
+                const __m256i alpha2 = _mm256_shuffle_epi8(dst2, shuffleMask);
+                dst1 = _mm256_mullo_epi16(dst1, alpha1);
+                dst2 = _mm256_mullo_epi16(dst2, alpha2);
+                dst1 = _mm256_add_epi16(dst1, _mm256_srli_epi16(dst1, 7));
+                dst2 = _mm256_add_epi16(dst2, _mm256_srli_epi16(dst2, 7));
+                dst1 = _mm256_blend_epi16(dst1, src1, 0x88);
+                dst2 = _mm256_blend_epi16(dst2, src2, 0x88);
             } else {
-                src1 = _mm256_unpacklo_epi8(srcVector, srcVector);
-                src2 = _mm256_unpackhi_epi8(srcVector, srcVector);
+                dst1 = src1;
+                dst2 = src2;
             }
         } else {
-            src1 = src2 = zero;
+            dst1 = dst2 = zero;
         }
-        _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + i), src1);
-        _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + i) + 1, src2);
+        _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + i), dst1);
+        _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + i) + 1, dst2);
     }
 
-    for ( ; i < count; i += 4) {
-        __m128i maskedAlphaMask = _mm256_castsi256_si128(alphaMask);
-        __m128i mask = maskFromCount((count - i) * sizeof(*src));
-        maskedAlphaMask = _mm_and_si128(mask, maskedAlphaMask);
-        __m128i srcVector = _mm_maskload_epi32(reinterpret_cast<const int *>(src + i), mask);
-        __m256i src;
-        if (!_mm_testz_si128(srcVector, maskedAlphaMask)) {
+    if (i < count) {
+        __m256i epilogueMask = epilogueMaskFromCount(count - i);
+        const __m256i epilogueAlphaMask = _mm256_blendv_epi8(_mm256_setzero_si256(), alphaMask, epilogueMask);
+        __m256i dst1, dst2;
+        __m256i srcVector = _mm256_maskload_epi32(reinterpret_cast<const int *>(src + i), epilogueMask);
+        if (!_mm256_testz_si256(srcVector, epilogueAlphaMask)) {
             // keep the two _mm_test[zc]_siXXX next to each other
-            bool cf = _mm_testc_si128(srcVector, maskedAlphaMask);
+            bool cf = _mm256_testc_si256(srcVector, epilogueAlphaMask);
             if (!RGBA)
-                srcVector = _mm_shuffle_epi8(srcVector, _mm256_castsi256_si128(rgbaMask));
+                srcVector = _mm256_shuffle_epi8(srcVector, rgbaMask);
+            srcVector = _mm256_permute4x64_epi64(srcVector, _MM_SHUFFLE(3, 1, 2, 0));
+            const __m256i src1 = _mm256_unpacklo_epi8(srcVector, srcVector);
+            const __m256i src2 = _mm256_unpackhi_epi8(srcVector, srcVector);
             if (!cf) {
-                src = _mm256_cvtepu8_epi16(srcVector);
-                __m256i alpha = _mm256_shuffle_epi8(src, shuffleMask);
-                src = _mm256_mullo_epi16(src, alpha);
-
-                __m128i alpha1 = _mm_unpacklo_epi8(srcVector, srcVector);
-                __m128i alpha2 = _mm_unpackhi_epi8(srcVector, srcVector);
-                alpha = _mm256_inserti128_si256(_mm256_castsi128_si256(alpha1), alpha2, 1);
-                src = _mm256_add_epi16(src, _mm256_srli_epi16(src, 7));
-                src = _mm256_blend_epi16(src, alpha, 0x88);
+                dst1 = _mm256_unpacklo_epi8(srcVector, zero);
+                dst2 = _mm256_unpackhi_epi8(srcVector, zero);
+                const __m256i alpha1 = _mm256_shuffle_epi8(dst1, shuffleMask);
+                const __m256i alpha2 = _mm256_shuffle_epi8(dst2, shuffleMask);
+                dst1 = _mm256_mullo_epi16(dst1, alpha1);
+                dst2 = _mm256_mullo_epi16(dst2, alpha2);
+                dst1 = _mm256_add_epi16(dst1, _mm256_srli_epi16(dst1, 7));
+                dst2 = _mm256_add_epi16(dst2, _mm256_srli_epi16(dst2, 7));
+                dst1 = _mm256_blend_epi16(dst1, src1, 0x88);
+                dst2 = _mm256_blend_epi16(dst2, src2, 0x88);
             } else {
-                const __m128i src1 = _mm_unpacklo_epi8(srcVector, srcVector);
-                const __m128i src2 = _mm_unpackhi_epi8(srcVector, srcVector);
-                src = _mm256_castsi128_si256(src1);
-                src = _mm256_inserti128_si256(src, src2, 1);
+                dst1 = src1;
+                dst2 = src2;
             }
         } else {
-            src = zero;
+            dst1 = dst2 = zero;
         }
-        __m256i xmask = _mm256_cvtepi32_epi64(mask);
-        _mm256_maskstore_epi64(reinterpret_cast<qint64 *>(buffer + i), xmask, src);
-    };
+        epilogueMask = _mm256_permute4x64_epi64(epilogueMask, _MM_SHUFFLE(3, 1, 2, 0));
+        _mm256_maskstore_epi64(reinterpret_cast<qint64 *>(buffer + i),
+                               _mm256_unpacklo_epi32(epilogueMask, epilogueMask),
+                               dst1);
+        _mm256_maskstore_epi64(reinterpret_cast<qint64 *>(buffer + i + 4),
+                               _mm256_unpackhi_epi32(epilogueMask, epilogueMask),
+                               dst2);
+    }
 }
 
 const QRgba64 * QT_FASTCALL convertARGB32ToRGBA64PM_avx2(QRgba64 *buffer, const uint *src, int count,
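One subtlety in the RGBA64 epilogue above: the destination pixels are 64-bit, so the 32-bit lane mask must be widened before _mm256_maskstore_epi64. Since unpacklo/hi_epi32 interleave within each 128-bit half, the mask first goes through the same permute4x64 as the pixel data; duplicating each element then yields 64-bit lanes whose bit 63 is the original lane's sign bit, which is the bit maskstore_epi64 tests. A sketch of that step (illustration, not Qt code):

    #include <immintrin.h>

    static void widenEpilogueMask(__m256i mask32, __m256i *maskLo, __m256i *maskHi)
    {
        // Reorder to match the pixel order produced by the permute above.
        mask32 = _mm256_permute4x64_epi64(mask32, _MM_SHUFFLE(3, 1, 2, 0));
        // Duplicate each 32-bit element into a 64-bit lane.
        *maskLo = _mm256_unpacklo_epi32(mask32, mask32); // masks for pixels 0..3
        *maskHi = _mm256_unpackhi_epi32(mask32, mask32); // masks for pixels 4..7
    }

This also explains why two masked stores replace the old single _mm256_cvtepi32_epi64(mask) store: the old epilogue handled at most 4 pixels per iteration, while the new one writes up to 8 in one step.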