From 1c8f9eb79da837db8e37cf6348de459088c3a20e Mon Sep 17 00:00:00 2001
From: Allan Sandfeld Jensen
Date: Thu, 2 Aug 2018 18:05:51 +0200
Subject: Add missing optimization for loading RGB32 to RGBA64 using NEON

The rest of the RGB64 routines were optimized, but the loading of RGB32
was not as it was originally not used much, but with ARGB32 using the
RGB64 backend, it is essential for decent performance.

Task-number: QTBUG-69724
Change-Id: I1c02411ed29d3d993427afde44dfa83689d117e0
Reviewed-by: Lars Knoll
---
 src/gui/painting/qdrawhelper.cpp | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 61 insertions(+)

(limited to 'src/gui/painting')

Reviewer notes (not part of the original commit message):
  * qConvertARGB32PMToRGBA64PM_neon is templated on <bool RGBA, bool maskAlpha>.
    With maskAlpha set, 0xff000000 is OR-ed into every source pixel before it
    is widened. With RGBA unset, the vector loop swaps bytes 0 and 2 of each
    pixel through the rgbaMask table lookup; with RGBA set it skips that
    shuffle, and the scalar tail applies RGBA2ARGB() before
    QRgba64::fromArgb32().
  * vzipq_u8(vs8, vs8) interleaves the register with itself, so each 8-bit
    channel is duplicated into both bytes of a 16-bit lane.
  * The vector loop consumes four pixels per iteration; the remaining pixels
    go through the SIMD_EPILOGUE block.
  * The explicit template arguments at each call site mirror the scalar
    #else branch of the same function (alpha forced to 0xff => maskAlpha,
    RGBA2ARGB() applied => RGBA).

diff --git a/src/gui/painting/qdrawhelper.cpp b/src/gui/painting/qdrawhelper.cpp
index 34847daf55..9bb1498ff0 100644
--- a/src/gui/painting/qdrawhelper.cpp
+++ b/src/gui/painting/qdrawhelper.cpp
@@ -618,6 +618,53 @@ static inline void qConvertARGB32PMToARGB64PM_sse2(QRgba64 *buffer, const uint *
         *buffer++ = QRgba64::fromArgb32(s);
     }
 }
+#elif defined(__ARM_NEON__)
+template<bool RGBA, bool maskAlpha>
+static inline void qConvertARGB32PMToRGBA64PM_neon(QRgba64 *buffer, const uint *src, int count)
+{
+    if (count <= 0)
+        return;
+
+    const uint32x4_t amask = vdupq_n_u32(0xff000000);
+#if defined(Q_PROCESSOR_ARM_64)
+    const uint8x16_t rgbaMask = { 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15};
+#else
+    const uint8x8_t rgbaMask = { 2, 1, 0, 3, 6, 5, 4, 7 };
+#endif
+    int i = 0;
+    for (; i < count-3; i += 4) {
+        uint32x4_t vs32 = vld1q_u32(src);
+        src += 4;
+        if (maskAlpha)
+            vs32 = vorrq_u32(vs32, amask);
+        uint8x16_t vs8 = vreinterpretq_u8_u32(vs32);
+        if (!RGBA) {
+#if defined(Q_PROCESSOR_ARM_64)
+            vs8 = vqtbl1q_u8(vs8, rgbaMask);
+#else
+            // no vqtbl1q_u8
+            const uint8x8_t vlo = vtbl1_u8(vget_low_u8(vs8), rgbaMask);
+            const uint8x8_t vhi = vtbl1_u8(vget_high_u8(vs8), rgbaMask);
+            vs8 = vcombine_u8(vlo, vhi);
+#endif
+        }
+        uint8x16x2_t v = vzipq_u8(vs8, vs8);
+
+        vst1q_u16((uint16_t *)buffer, vreinterpretq_u16_u8(v.val[0]));
+        buffer += 2;
+        vst1q_u16((uint16_t *)buffer, vreinterpretq_u16_u8(v.val[1]));
+        buffer += 2;
+    }
+
+    SIMD_EPILOGUE(i, count, 3) {
+        uint s = *src++;
+        if (maskAlpha)
+            s = s | 0xff000000;
+        if (RGBA)
+            s = RGBA2ARGB(s);
+        *buffer++ = QRgba64::fromArgb32(s);
+    }
+}
 #endif
 
 static const QRgba64 *QT_FASTCALL convertRGB32ToRGB64(QRgba64 *buffer, const uint *src, int count,
@@ -625,6 +672,8 @@ static const QRgba64 *QT_FASTCALL convertRGB32ToRGB64(QRgba64 *buffer, const uin
 {
 #ifdef __SSE2__
     qConvertARGB32PMToARGB64PM_sse2<false, true>(buffer, src, count);
+#elif defined(__ARM_NEON__)
+    qConvertARGB32PMToRGBA64PM_neon<false, true>(buffer, src, count);
 #else
     for (int i = 0; i < count; ++i)
         buffer[i] = QRgba64::fromArgb32(0xff000000 | src[i]);
@@ -639,6 +688,10 @@ static const QRgba64 *QT_FASTCALL convertARGB32ToARGB64PM(QRgba64 *buffer, const
     qConvertARGB32PMToARGB64PM_sse2<false, false>(buffer, src, count);
     for (int i = 0; i < count; ++i)
         buffer[i] = buffer[i].premultiplied();
+#elif defined(__ARM_NEON__)
+    qConvertARGB32PMToRGBA64PM_neon<false, false>(buffer, src, count);
+    for (int i = 0; i < count; ++i)
+        buffer[i] = buffer[i].premultiplied();
 #else
     for (int i = 0; i < count; ++i)
         buffer[i] = QRgba64::fromArgb32(src[i]).premultiplied();
@@ -651,6 +704,8 @@ static const QRgba64 *QT_FASTCALL convertARGB32PMToARGB64PM(QRgba64 *buffer, con
 {
 #ifdef __SSE2__
     qConvertARGB32PMToARGB64PM_sse2<false, false>(buffer, src, count);
+#elif defined(__ARM_NEON__)
+    qConvertARGB32PMToRGBA64PM_neon<false, false>(buffer, src, count);
 #else
     for (int i = 0; i < count; ++i)
         buffer[i] = QRgba64::fromArgb32(src[i]);
@@ -665,6 +720,10 @@ static const QRgba64 *QT_FASTCALL convertRGBA8888ToARGB64PM(QRgba64 *buffer, con
     qConvertARGB32PMToARGB64PM_sse2<true, false>(buffer, src, count);
     for (int i = 0; i < count; ++i)
         buffer[i] = buffer[i].premultiplied();
+#elif defined(__ARM_NEON__)
+    qConvertARGB32PMToRGBA64PM_neon<true, false>(buffer, src, count);
+    for (int i = 0; i < count; ++i)
+        buffer[i] = buffer[i].premultiplied();
 #else
     for (int i = 0; i < count; ++i)
         buffer[i] = QRgba64::fromArgb32(RGBA2ARGB(src[i])).premultiplied();
@@ -677,6 +736,8 @@ static const QRgba64 *QT_FASTCALL convertRGBA8888PMToARGB64PM(QRgba64 *buffer, c
 {
 #ifdef __SSE2__
     qConvertARGB32PMToARGB64PM_sse2<true, false>(buffer, src, count);
+#elif defined(__ARM_NEON__)
+    qConvertARGB32PMToRGBA64PM_neon<true, false>(buffer, src, count);
 #else
     for (int i = 0; i < count; ++i)
         buffer[i] = QRgba64::fromArgb32(RGBA2ARGB(src[i]));
-- 
cgit v1.2.3