Optimize RGBA64->RGBA64PM for SSE2/AVX2
Also remove the direct RGBA64 -> RGBA64PM conversion so that both the SIMD optimization and threading are applied.

Change-Id: Id032ea91cc40c1cbf1c8a1da0386de35aa36cfb5
Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
This commit is contained in:
parent
ba6b29a367
commit
936d499ed4
@ -1259,49 +1259,6 @@ static bool convert_RGBA64_to_RGBx64_inplace(QImageData *data, Qt::ImageConversi
|
||||
return true;
|
||||
}
|
||||
|
||||
// Converts every pixel of a Format_RGBA64 image into a separate
// Format_RGBA64_Premultiplied image of the same dimensions by calling
// QRgba64::premultiplied() on each source pixel.
static void convert_RGBA64_to_RGBA64PM(QImageData *dest, const QImageData *src, Qt::ImageConversionFlags)
{
    Q_ASSERT(src->format == QImage::Format_RGBA64);
    Q_ASSERT(dest->format == QImage::Format_RGBA64_Premultiplied);
    Q_ASSERT(src->width == dest->width);
    Q_ASSERT(src->height == dest->height);

    const QRgba64 *in = reinterpret_cast<const QRgba64 *>(src->data);
    QRgba64 *out = reinterpret_cast<QRgba64 *>(dest->data);

    // Each pixel is 8 bytes, so bytes_per_line >> 3 is the scanline length in
    // pixels; the difference to the width is the padding skipped after a row.
    const int inSkip = (src->bytes_per_line >> 3) - src->width;
    const int outSkip = (dest->bytes_per_line >> 3) - dest->width;

    for (int row = 0; row < src->height; ++row) {
        for (const QRgba64 *rowEnd = in + src->width; in < rowEnd; ++in, ++out)
            *out = in->premultiplied();
        in += inSkip;
        out += outSkip;
    }
}
|
||||
|
||||
// Premultiplies a Format_RGBA64 image in place: every pixel is overwritten
// with its QRgba64::premultiplied() value, the image format is switched to
// Format_RGBA64_Premultiplied, and true is returned.
static bool convert_RGBA64_to_RGBA64PM_inplace(QImageData *data, Qt::ImageConversionFlags)
{
    Q_ASSERT(data->format == QImage::Format_RGBA64);

    QRgba64 *pixel = reinterpret_cast<QRgba64 *>(data->data);
    // Each pixel is 8 bytes, so bytes_per_line >> 3 is the scanline length in
    // pixels; the difference to the width is the padding skipped after a row.
    const int rowSkip = (data->bytes_per_line >> 3) - data->width;

    for (int row = 0; row < data->height; ++row) {
        for (const QRgba64 *rowEnd = pixel + data->width; pixel < rowEnd; ++pixel)
            *pixel = pixel->premultiplied();
        pixel += rowSkip;
    }

    data->format = QImage::Format_RGBA64_Premultiplied;
    return true;
}
|
||||
|
||||
template<bool MaskAlpha>
|
||||
static void convert_RGBA64PM_to_RGBA64(QImageData *dest, const QImageData *src, Qt::ImageConversionFlags)
|
||||
{
|
||||
@ -2368,7 +2325,6 @@ static void qInitImageConversions()
|
||||
qimage_converter_map[QImage::Format_RGBA64][QImage::Format_ARGB32] = convert_RGBA64_to_ARGB32<false>;
|
||||
qimage_converter_map[QImage::Format_RGBA64][QImage::Format_RGBA8888] = convert_RGBA64_to_ARGB32<true>;
|
||||
qimage_converter_map[QImage::Format_RGBA64][QImage::Format_RGBX64] = convert_RGBA64_to_RGBx64;
|
||||
qimage_converter_map[QImage::Format_RGBA64][QImage::Format_RGBA64_Premultiplied] = convert_RGBA64_to_RGBA64PM;
|
||||
|
||||
qimage_converter_map[QImage::Format_RGBA64_Premultiplied][QImage::Format_RGBX64] = convert_RGBA64PM_to_RGBA64<true>;
|
||||
qimage_converter_map[QImage::Format_RGBA64_Premultiplied][QImage::Format_RGBA64] = convert_RGBA64PM_to_RGBA64<false>;
|
||||
@ -2486,8 +2442,6 @@ static void qInitImageConversions()
|
||||
|
||||
qimage_inplace_converter_map[QImage::Format_RGBA64][QImage::Format_RGBX64] =
|
||||
convert_RGBA64_to_RGBx64_inplace;
|
||||
qimage_inplace_converter_map[QImage::Format_RGBA64][QImage::Format_RGBA64_Premultiplied] =
|
||||
convert_RGBA64_to_RGBA64PM_inplace;
|
||||
|
||||
qimage_inplace_converter_map[QImage::Format_RGBA64_Premultiplied][QImage::Format_RGBX64] =
|
||||
convert_RGBA64PM_to_RGBA64_inplace<true>;
|
||||
|
@ -5225,16 +5225,16 @@ static void qInitDrawhelperFunctions()
|
||||
qPixelLayouts[QImage::Format_RGBA8888].fetchToARGB32PM = fetchRGBA8888ToARGB32PM_avx2;
|
||||
qPixelLayouts[QImage::Format_RGBA8888].convertToARGB32PM = convertRGBA8888ToARGB32PM_avx2;
|
||||
|
||||
#if QT_CONFIG(raster_64bit)
|
||||
extern const QRgba64 * QT_FASTCALL convertARGB32ToRGBA64PM_avx2(QRgba64 *, const uint *, int, const QList<QRgb> *, QDitherInfo *);
|
||||
extern const QRgba64 * QT_FASTCALL convertRGBA8888ToRGBA64PM_avx2(QRgba64 *, const uint *, int count, const QList<QRgb> *, QDitherInfo *);
|
||||
extern const QRgba64 *QT_FASTCALL convertARGB32ToRGBA64PM_avx2(QRgba64 *, const uint *, int, const QList<QRgb> *, QDitherInfo *);
|
||||
extern const QRgba64 *QT_FASTCALL convertRGBA8888ToRGBA64PM_avx2(QRgba64 *, const uint *, int count, const QList<QRgb> *, QDitherInfo *);
|
||||
extern const QRgba64 *QT_FASTCALL fetchARGB32ToRGBA64PM_avx2(QRgba64 *, const uchar *, int, int, const QList<QRgb> *, QDitherInfo *);
|
||||
extern const QRgba64 *QT_FASTCALL fetchRGBA8888ToRGBA64PM_avx2(QRgba64 *, const uchar *, int, int, const QList<QRgb> *, QDitherInfo *);
|
||||
extern const QRgba64 *QT_FASTCALL fetchRGBA64ToRGBA64PM_avx2(QRgba64 *buffer, const uchar *src, int index, int count, const QList<QRgb> *, QDitherInfo *);
|
||||
qPixelLayouts[QImage::Format_ARGB32].convertToRGBA64PM = convertARGB32ToRGBA64PM_avx2;
|
||||
qPixelLayouts[QImage::Format_RGBX8888].convertToRGBA64PM = convertRGBA8888ToRGBA64PM_avx2;
|
||||
qPixelLayouts[QImage::Format_ARGB32].fetchToRGBA64PM = fetchARGB32ToRGBA64PM_avx2;
|
||||
qPixelLayouts[QImage::Format_RGBX8888].fetchToRGBA64PM = fetchRGBA8888ToRGBA64PM_avx2;
|
||||
#endif
|
||||
qPixelLayouts[QImage::Format_RGBA64].fetchToRGBA64PM = fetchRGBA64ToRGBA64PM_avx2;
|
||||
}
|
||||
#endif
|
||||
|
||||
|
@ -1229,6 +1229,38 @@ const QRgba64 *QT_FASTCALL fetchRGBA8888ToRGBA64PM_avx2(QRgba64 *buffer, const u
|
||||
return buffer;
|
||||
}
|
||||
|
||||
// Reads `count` QRgba64 pixels from `src`, starting at pixel `index`, and
// writes their alpha-premultiplied form to `buffer`, which is returned.
//
// Each pixel is four 16-bit words with alpha in the fourth word. The three
// colour words are scaled by alpha using the rounding approximation
// (p + (p >> 16) + 0x8000) >> 16 of p / 65535, where p = channel * alpha.
// The alpha word itself is copied through unchanged: multiplying it like the
// colour words would yield alpha * alpha / 65535 (e.g. 0x8000 -> 0x4000) and
// disagree with QRgba64::premultiplied(). The palette and dither arguments
// are unused.
const QRgba64 *QT_FASTCALL fetchRGBA64ToRGBA64PM_avx2(QRgba64 *buffer, const uchar *src, int index, int count,
                                                       const QList<QRgb> *, QDitherInfo *)
{
    const QRgba64 *s = reinterpret_cast<const QRgba64 *>(src) + index;
    int i = 0;
    const __m256i vh = _mm256_set1_epi32(0x8000);
    // Main loop: four pixels (256 bits) per iteration.
    for (; i < count - 3; i += 4) {
        __m256i vs256 = _mm256_loadu_si256((const __m256i *)(s + i));
        // Broadcast each pixel's alpha word across all four words of that pixel.
        __m256i va256 = _mm256_shufflelo_epi16(vs256, _MM_SHUFFLE(3, 3, 3, 3));
        va256 = _mm256_shufflehi_epi16(va256, _MM_SHUFFLE(3, 3, 3, 3));
        // Full 32-bit products channel * alpha, assembled from low/high halves.
        const __m256i vmullo = _mm256_mullo_epi16(vs256, va256);
        const __m256i vmulhi = _mm256_mulhi_epu16(vs256, va256);
        __m256i vslo = _mm256_unpacklo_epi16(vmullo, vmulhi);
        __m256i vshi = _mm256_unpackhi_epi16(vmullo, vmulhi);
        // Rounded division by 65535: (p + (p >> 16) + 0x8000) >> 16.
        vslo = _mm256_add_epi32(vslo, _mm256_srli_epi32(vslo, 16));
        vshi = _mm256_add_epi32(vshi, _mm256_srli_epi32(vshi, 16));
        vslo = _mm256_add_epi32(vslo, vh);
        vshi = _mm256_add_epi32(vshi, vh);
        vslo = _mm256_srli_epi32(vslo, 16);
        vshi = _mm256_srli_epi32(vshi, 16);
        vs256 = _mm256_packus_epi32(vslo, vshi);
        // Restore the original alpha: mask 0x88 selects words 3 and 7 of each
        // 128-bit lane (the alpha word of every pixel) from va256.
        vs256 = _mm256_blend_epi16(vs256, va256, 0x88);
        _mm256_storeu_si256((__m256i *)(buffer + i), vs256);
    }
    // Tail: remaining 0-3 pixels, one at a time.
    for (; i < count; ++i) {
        __m128i vs = _mm_loadl_epi64((const __m128i *)(s + i));
        __m128i va = _mm_shufflelo_epi16(vs, _MM_SHUFFLE(3, 3, 3, 3));
        vs = multiplyAlpha65535(vs, va);
        // Restore the original alpha word (word 3). _mm_blend_epi16 is SSE4.1,
        // which is available wherever this AVX2 code is compiled.
        vs = _mm_blend_epi16(vs, va, 0x08);
        _mm_storel_epi64((__m128i *)(buffer + i), vs);
    }
    return buffer;
}
|
||||
|
||||
QT_END_NAMESPACE
|
||||
|
||||
#endif
|
||||
|
@ -1045,8 +1045,17 @@ static const QRgba64 *QT_FASTCALL fetchRGBA64ToRGBA64PM(QRgba64 *buffer, const u
|
||||
const QList<QRgb> *, QDitherInfo *)
|
||||
{
|
||||
const QRgba64 *s = reinterpret_cast<const QRgba64 *>(src) + index;
|
||||
#ifdef __SSE2__
|
||||
for (int i = 0; i < count; ++i) {
|
||||
__m128i vs = _mm_loadl_epi64((const __m128i *)(s + i));
|
||||
__m128i va = _mm_shufflelo_epi16(vs, _MM_SHUFFLE(3, 3, 3, 3));
|
||||
vs = multiplyAlpha65535(vs, va);
|
||||
_mm_storel_epi64((__m128i *)(buffer + i), vs);
|
||||
}
|
||||
#else
|
||||
for (int i = 0; i < count; ++i)
|
||||
buffer[i] = QRgba64::fromRgba64(s[i]).premultiplied();
|
||||
#endif
|
||||
return buffer;
|
||||
}
|
||||
|
||||
|
@ -244,6 +244,7 @@ void tst_QImageConversion::convertGeneric_data()
|
||||
QImage a2rgb30 = argb32.convertToFormat(QImage::Format_A2RGB30_Premultiplied);
|
||||
QImage rgb666 = rgb32.convertToFormat(QImage::Format_RGB666);
|
||||
QImage argb4444 = argb32.convertToFormat(QImage::Format_ARGB4444_Premultiplied);
|
||||
QImage rgba64 = argb32.convertToFormat(QImage::Format_RGBA64);
|
||||
QImage rgba64pm = argb32.convertToFormat(QImage::Format_RGBA64_Premultiplied);
|
||||
QImage rgb888 = rgb32.convertToFormat(QImage::Format_RGB888);
|
||||
QImage bgr888 = rgb32.convertToFormat(QImage::Format_BGR888);
|
||||
@ -298,6 +299,10 @@ void tst_QImageConversion::convertGeneric_data()
|
||||
QTest::newRow("argb4444pm -> rgb30") << argb4444 << QImage::Format_RGB30;
|
||||
QTest::newRow("argb4444pm -> a2bgr30") << argb4444 << QImage::Format_A2BGR30_Premultiplied;
|
||||
|
||||
QTest::newRow("rgba64 -> argb32") << rgba64 << QImage::Format_ARGB32;
|
||||
QTest::newRow("rgba64 -> argb32pm") << rgba64 << QImage::Format_ARGB32_Premultiplied;
|
||||
QTest::newRow("rgba64 -> rgba64pm") << rgba64 << QImage::Format_RGBA64_Premultiplied;
|
||||
|
||||
QTest::newRow("rgba64pm -> argb32") << rgba64pm << QImage::Format_ARGB32;
|
||||
QTest::newRow("rgba64pm -> rgbx8888") << rgba64pm << QImage::Format_RGBX8888;
|
||||
QTest::newRow("rgba64pm -> rgba8888pm") << rgba64pm << QImage::Format_RGBA8888_Premultiplied;
|
||||
|
Loading…
x
Reference in New Issue
Block a user