Add NEON optimized ARGB32 unpremultiply routines

Mirroring similar routines recently added for SSE4.1

Change-Id: Ibb9d10cc34655ce1dc0e97fdff4e4f6a81d47d05
Reviewed-by: Erik Verbruggen <erik.verbruggen@qt.io>
Reviewed-by: Eirik Aavitsland <eirik.aavitsland@qt.io>
This commit is contained in:
Allan Sandfeld Jensen 2018-09-06 11:20:36 +02:00
parent 25830cf912
commit b7c5c2e65b
3 changed files with 158 additions and 27 deletions

View File

@ -119,6 +119,7 @@ void qGamma_correct_back_to_linear_cs(QImage *image)
*****************************************************************************/
// The drawhelper conversions from/to RGB32 are passthroughs which is not always correct for general image conversion
#if !defined(__ARM_NEON__)
static void QT_FASTCALL storeRGB32FromARGB32PM(uchar *dest, const uint *src, int index, int count,
const QVector<QRgb> *, QDitherInfo *)
{
@ -126,6 +127,7 @@ static void QT_FASTCALL storeRGB32FromARGB32PM(uchar *dest, const uint *src, int
for (int i = 0; i < count; ++i)
d[i] = 0xff000000 | qUnpremultiply(src[i]);
}
#endif
static void QT_FASTCALL storeRGB32FromARGB32(uchar *dest, const uint *src, int index, int count,
const QVector<QRgb> *, QDitherInfo *)
@ -147,6 +149,9 @@ static const uint *QT_FASTCALL fetchRGB32ToARGB32PM(uint *buffer, const uchar *s
#ifdef QT_COMPILER_SUPPORTS_SSE4_1
extern void QT_FASTCALL storeRGB32FromARGB32PM_sse4(uchar *dest, const uint *src, int index, int count,
const QVector<QRgb> *, QDitherInfo *);
#elif defined(__ARM_NEON__)
extern void QT_FASTCALL storeRGB32FromARGB32PM_neon(uchar *dest, const uint *src, int index, int count,
const QVector<QRgb> *, QDitherInfo *);
#endif
void convert_generic(QImageData *dest, const QImageData *src, Qt::ImageConversionFlags flags)
@ -175,8 +180,12 @@ void convert_generic(QImageData *dest, const QImageData *src, Qt::ImageConversio
if (qCpuHasFeature(SSE4_1))
store = storeRGB32FromARGB32PM_sse4;
else
#endif
store = storeRGB32FromARGB32PM;
#elif defined(__ARM_NEON__)
store = storeRGB32FromARGB32PM_neon;
#else
store = storeRGB32FromARGB32PM;
#endif
}
}
if (srcLayout->hasAlphaChannel && !srcLayout->premultiplied &&
@ -261,8 +270,12 @@ bool convert_generic_inplace(QImageData *data, QImage::Format dst_format, Qt::Im
if (qCpuHasFeature(SSE4_1))
store = storeRGB32FromARGB32PM_sse4;
else
#endif
store = storeRGB32FromARGB32PM;
#elif defined(__ARM_NEON__)
store = storeRGB32FromARGB32PM_neon;
#else
store = storeRGB32FromARGB32PM;
#endif
}
}
if (srcLayout->hasAlphaChannel && !srcLayout->premultiplied &&

View File

@ -6505,10 +6505,19 @@ static void qInitDrawhelperFunctions()
const QVector<QRgb> *, QDitherInfo *);
extern const uint *QT_FASTCALL fetchRGBA8888ToARGB32PM_neon(uint *buffer, const uchar *src, int index, int count,
const QVector<QRgb> *, QDitherInfo *);
extern void QT_FASTCALL storeARGB32FromARGB32PM_neon(uchar *dest, const uint *src, int index, int count,
const QVector<QRgb> *, QDitherInfo *);
extern void QT_FASTCALL storeRGBA8888FromARGB32PM_neon(uchar *dest, const uint *src, int index, int count,
const QVector<QRgb> *, QDitherInfo *);
extern void QT_FASTCALL storeRGBXFromARGB32PM_neon(uchar *dest, const uint *src, int index, int count,
const QVector<QRgb> *, QDitherInfo *);
qPixelLayouts[QImage::Format_ARGB32].fetchToARGB32PM = fetchARGB32ToARGB32PM_neon;
qPixelLayouts[QImage::Format_ARGB32].convertToARGB32PM = convertARGB32ToARGB32PM_neon;
qPixelLayouts[QImage::Format_ARGB32].storeFromARGB32PM = storeARGB32FromARGB32PM_neon;
qPixelLayouts[QImage::Format_RGBA8888].fetchToARGB32PM = fetchRGBA8888ToARGB32PM_neon;
qPixelLayouts[QImage::Format_RGBA8888].convertToARGB32PM = convertRGBA8888ToARGB32PM_neon;
qPixelLayouts[QImage::Format_RGBA8888].storeFromARGB32PM = storeRGBA8888FromARGB32PM_neon;
qPixelLayouts[QImage::Format_RGBX8888].storeFromARGB32PM = storeRGBXFromARGB32PM_neon;
#endif
#if defined(ENABLE_PIXMAN_DRAWHELPERS)

View File

@ -1081,15 +1081,28 @@ const uint * QT_FASTCALL qt_fetchUntransformed_888_neon(uint *buffer, const Oper
}
#if Q_BYTE_ORDER == Q_LITTLE_ENDIAN
template<bool RGBA>
static inline void convertARGBToARGB32PM_neon(uint *buffer, const uint *src, int count)
static inline uint32x4_t vrgba2argb(uint32x4_t srcVector)
{
int i = 0;
#if defined(Q_PROCESSOR_ARM_64)
const uint8x16_t rgbaMask = { 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15};
#else
const uint8x8_t rgbaMask = { 2, 1, 0, 3, 6, 5, 4, 7 };
#endif
#if defined(Q_PROCESSOR_ARM_64)
srcVector = vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(srcVector), rgbaMask));
#else
// no vqtbl1q_u8, so use two vtbl1_u8
const uint8x8_t low = vtbl1_u8(vreinterpret_u8_u32(vget_low_u32(srcVector)), rgbaMask);
const uint8x8_t high = vtbl1_u8(vreinterpret_u8_u32(vget_high_u32(srcVector)), rgbaMask);
srcVector = vcombine_u32(vreinterpret_u32_u8(low), vreinterpret_u32_u8(high));
#endif
return srcVector;
}
template<bool RGBA>
static inline void convertARGBToARGB32PM_neon(uint *buffer, const uint *src, int count)
{
int i = 0;
const uint8x8_t shuffleMask = { 3, 3, 3, 3, 7, 7, 7, 7};
const uint32x4_t blendMask = vdupq_n_u32(0xff000000);
@ -1105,16 +1118,8 @@ static inline void convertARGBToARGB32PM_neon(uint *buffer, const uint *src, int
#endif
if (alphaSum) {
if (alphaSum != 255 * 4) {
if (RGBA) {
#if defined(Q_PROCESSOR_ARM_64)
srcVector = vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(srcVector), rgbaMask));
#else
// no vqtbl1q_u8
const uint8x8_t low = vtbl1_u8(vreinterpret_u8_u32(vget_low_u32(srcVector)), rgbaMask);
const uint8x8_t high = vtbl1_u8(vreinterpret_u8_u32(vget_high_u32(srcVector)), rgbaMask);
srcVector = vcombine_u32(vreinterpret_u32_u8(low), vreinterpret_u32_u8(high));
#endif
}
if (RGBA)
srcVector = vrgba2argb(srcVector);
const uint8x8_t s1 = vreinterpret_u8_u32(vget_low_u32(srcVector));
const uint8x8_t s2 = vreinterpret_u8_u32(vget_high_u32(srcVector));
const uint8x8_t alpha1 = vtbl1_u8(s1, shuffleMask);
@ -1128,19 +1133,10 @@ static inline void convertARGBToARGB32PM_neon(uint *buffer, const uint *src, int
const uint32x4_t d = vbslq_u32(blendMask, srcVector, vreinterpretq_u32_u8(vcombine_u8(d1, d2)));
vst1q_u32(buffer + i, d);
} else {
if (RGBA) {
#if defined(Q_PROCESSOR_ARM_64)
srcVector = vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(srcVector), rgbaMask));
#else
// no vqtbl1q_u8
const uint8x8_t low = vtbl1_u8(vreinterpret_u8_u32(vget_low_u32(srcVector)), rgbaMask);
const uint8x8_t high = vtbl1_u8(vreinterpret_u8_u32(vget_high_u32(srcVector)), rgbaMask);
srcVector = vcombine_u32(vreinterpret_u32_u8(low), vreinterpret_u32_u8(high));
#endif
if (RGBA)
vst1q_u32(buffer + i, vrgba2argb(srcVector));
else if (buffer != src)
vst1q_u32(buffer + i, srcVector);
} else if (buffer != src) {
vst1q_u32(buffer + i, srcVector);
}
}
} else {
vst1q_u32(buffer + i, vdupq_n_u32(0));
@ -1153,6 +1149,91 @@ static inline void convertARGBToARGB32PM_neon(uint *buffer, const uint *src, int
}
}
static inline float32x4_t reciprocal_mul_ps(float32x4_t a, float mul)
{
float32x4_t ia = vrecpeq_f32(a); // estimate 1/a
ia = vmulq_f32(vrecpsq_f32(a, ia), vmulq_n_f32(ia, mul)); // estimate improvement step * mul
return ia;
}
template<bool RGBA, bool RGBx>
static inline void convertARGBFromARGB32PM_neon(uint *buffer, const uint *src, int count)
{
int i = 0;
const uint32x4_t alphaMask = vdupq_n_u32(0xff000000);
for (; i < count - 3; i += 4) {
uint32x4_t srcVector = vld1q_u32(src + i);
uint32x4_t alphaVector = vshrq_n_u32(srcVector, 24);
#if defined(Q_PROCESSOR_ARM_64)
uint32_t alphaSum = vaddvq_u32(alphaVector);
#else
// no vaddvq_u32
uint32x2_t tmp = vpadd_u32(vget_low_u32(alphaVector), vget_high_u32(alphaVector));
uint32_t alphaSum = vget_lane_u32(vpadd_u32(tmp, tmp), 0);
#endif
if (alphaSum) {
if (alphaSum != 255 * 4) {
if (RGBA)
srcVector = vrgba2argb(srcVector);
const float32x4_t a = vcvtq_f32_u32(alphaVector);
const float32x4_t ia = reciprocal_mul_ps(a, 255.0f);
// Convert 4x(4xU8) to 4x(4xF32)
uint16x8_t tmp1 = vmovl_u8(vget_low_u8(vreinterpretq_u8_u32(srcVector)));
uint16x8_t tmp3 = vmovl_u8(vget_high_u8(vreinterpretq_u8_u32(srcVector)));
float32x4_t src1 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(tmp1)));
float32x4_t src2 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(tmp1)));
float32x4_t src3 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(tmp3)));
float32x4_t src4 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(tmp3)));
src1 = vmulq_lane_f32(src1, vget_low_f32(ia), 0);
src2 = vmulq_lane_f32(src2, vget_low_f32(ia), 1);
src3 = vmulq_lane_f32(src3, vget_high_f32(ia), 0);
src4 = vmulq_lane_f32(src4, vget_high_f32(ia), 1);
// Convert 4x(4xF32) back to 4x(4xU8) (over a 8.1 fixed point format to get rounding)
tmp1 = vcombine_u16(vrshrn_n_u32(vcvtq_n_u32_f32(src1, 1), 1),
vrshrn_n_u32(vcvtq_n_u32_f32(src2, 1), 1));
tmp3 = vcombine_u16(vrshrn_n_u32(vcvtq_n_u32_f32(src3, 1), 1),
vrshrn_n_u32(vcvtq_n_u32_f32(src4, 1), 1));
uint32x4_t dstVector = vreinterpretq_u32_u8(vcombine_u8(vmovn_u16(tmp1), vmovn_u16(tmp3)));
// Overwrite any undefined results from alpha==0 with zeros:
#if defined(Q_PROCESSOR_ARM_64)
uint32x4_t srcVectorAlphaMask = vceqzq_u32(alphaVector);
#else
uint32x4_t srcVectorAlphaMask = vceqq_u32(alphaVector, vdupq_n_u32(0));
#endif
dstVector = vbicq_u32(dstVector, srcVectorAlphaMask);
// Restore or mask alpha values:
if (RGBx)
dstVector = vorrq_u32(alphaMask, dstVector);
else
dstVector = vbslq_u32(alphaMask, srcVector, dstVector);
vst1q_u32(&buffer[i], dstVector);
} else {
// 4xAlpha==255, no change except if we are doing RGBA->ARGB:
if (RGBA)
vst1q_u32(&buffer[i], vrgba2argb(srcVector));
else if (buffer != src)
vst1q_u32(&buffer[i], srcVector);
}
} else {
// 4xAlpha==0, always zero, except if output is RGBx:
if (RGBx)
vst1q_u32(&buffer[i], alphaMask);
else
vst1q_u32(&buffer[i], vdupq_n_u32(0));
}
}
SIMD_EPILOGUE(i, count, 3) {
uint v = qUnpremultiply(src[i]);
if (RGBx)
v = 0xff000000 | v;
if (RGBA)
v = ARGB2RGBA(v);
buffer[i] = v;
}
}
void QT_FASTCALL convertARGB32ToARGB32PM_neon(uint *buffer, int count, const QVector<QRgb> *)
{
convertARGBToARGB32PM_neon<false>(buffer, buffer, count);
@ -1177,6 +1258,34 @@ const uint *QT_FASTCALL fetchRGBA8888ToARGB32PM_neon(uint *buffer, const uchar *
return buffer;
}
void QT_FASTCALL storeRGB32FromARGB32PM_neon(uchar *dest, const uint *src, int index, int count,
const QVector<QRgb> *, QDitherInfo *)
{
uint *d = reinterpret_cast<uint *>(dest) + index;
convertARGBFromARGB32PM_neon<false,true>(d, src, count);
}
void QT_FASTCALL storeARGB32FromARGB32PM_neon(uchar *dest, const uint *src, int index, int count,
const QVector<QRgb> *, QDitherInfo *)
{
uint *d = reinterpret_cast<uint *>(dest) + index;
convertARGBFromARGB32PM_neon<false,false>(d, src, count);
}
void QT_FASTCALL storeRGBA8888FromARGB32PM_neon(uchar *dest, const uint *src, int index, int count,
const QVector<QRgb> *, QDitherInfo *)
{
uint *d = reinterpret_cast<uint *>(dest) + index;
convertARGBFromARGB32PM_neon<true,false>(d, src, count);
}
void QT_FASTCALL storeRGBXFromARGB32PM_neon(uchar *dest, const uint *src, int index, int count,
const QVector<QRgb> *, QDitherInfo *)
{
uint *d = reinterpret_cast<uint *>(dest) + index;
convertARGBFromARGB32PM_neon<true,true>(d, src, count);
}
#endif // Q_BYTE_ORDER == Q_LITTLE_ENDIAN
QT_END_NAMESPACE