Clean up convert_ARGB_to_ARGB_PM_inplace_sse2

Change the function to follow the standard SIMD pattern so that it can
use the ALIGNMENT_PROLOGUE_16BYTES and SIMD_EPILOGUE helpers.

This should also improve performance, because the vectorized main loop
now uses aligned loads and stores instead of unaligned ones.

Change-Id: I14a48b82e3f3de83bd7572aa82bed07f28ad944c
Reviewed-by: Erik Verbruggen <erik.verbruggen@qt.io>
This commit is contained in:
Allan Sandfeld Jensen 2016-11-16 16:25:11 +01:00
parent 0382bb2ab8
commit 506aa694a9

View File

@ -51,51 +51,66 @@ bool convert_ARGB_to_ARGB_PM_inplace_sse2(QImageData *data, Qt::ImageConversionF
{
Q_ASSERT(data->format == QImage::Format_ARGB32 || data->format == QImage::Format_RGBA8888);
// extra pixels on each line
const int spare = data->width & 3;
// width in pixels of the pad at the end of each line
const int pad = (data->bytes_per_line >> 2) - data->width;
const int iter = data->width >> 2;
int height = data->height;
const int width = data->width;
const int height = data->height;
const int bpl = data->bytes_per_line;
const __m128i alphaMask = _mm_set1_epi32(0xff000000);
const __m128i nullVector = _mm_setzero_si128();
const __m128i half = _mm_set1_epi16(0x80);
const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
__m128i *d = reinterpret_cast<__m128i*>(data->data);
while (height--) {
const __m128i *end = d + iter;
for (; d != end; ++d) {
const __m128i srcVector = _mm_loadu_si128(d);
uchar *d = data->data;
for (int y = 0; y < height; ++y) {
int i = 0;
quint32 *d32 = reinterpret_cast<quint32 *>(d);
ALIGNMENT_PROLOGUE_16BYTES(d, i, width) {
const quint32 p = d32[i];
if (p <= 0x00ffffff)
d32[i] = 0;
else if (p < 0xff000000)
d32[i] = qPremultiply(p);
}
__m128i *d128 = reinterpret_cast<__m128i *>(d32 + i);
for (; i < (width - 3); i += 4) {
const __m128i srcVector = _mm_load_si128(d128);
#ifdef __SSE4_1__
if (_mm_testc_si128(srcVector, alphaMask)) {
// opaque, data is unchanged
} else if (_mm_testz_si128(srcVector, alphaMask)) {
// fully transparent
_mm_store_si128(d128, nullVector);
} else {
const __m128i srcVectorAlpha = _mm_and_si128(srcVector, alphaMask);
#else
const __m128i srcVectorAlpha = _mm_and_si128(srcVector, alphaMask);
if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, alphaMask)) == 0xffff) {
// opaque, data is unchanged
} else if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, nullVector)) == 0xffff) {
// fully transparent
_mm_storeu_si128(d, nullVector);
_mm_store_si128(d128, nullVector);
} else {
#endif
__m128i alphaChannel = _mm_srli_epi32(srcVector, 24);
alphaChannel = _mm_or_si128(alphaChannel, _mm_slli_epi32(alphaChannel, 16));
__m128i result;
BYTE_MUL_SSE2(result, srcVector, alphaChannel, colorMask, half);
result = _mm_or_si128(_mm_andnot_si128(alphaMask, result), srcVectorAlpha);
_mm_storeu_si128(d, result);
_mm_store_si128(d128, result);
}
d128++;
}
QRgb *p = reinterpret_cast<QRgb*>(d);
QRgb *pe = p+spare;
for (; p != pe; ++p) {
if (*p < 0x00ffffff)
*p = 0;
else if (*p < 0xff000000)
*p = qPremultiply(*p);
SIMD_EPILOGUE(i, width, 3) {
const quint32 p = d32[i];
if (p <= 0x00ffffff)
d32[i] = 0;
else if (p < 0xff000000)
d32[i] = qPremultiply(p);
}
d = reinterpret_cast<__m128i*>(p+pad);
d += bpl;
}
if (data->format == QImage::Format_ARGB32)