Manually vectorize ARGB32toARGB32PM for SSE4.1 and NEON

Manually vectorizing is significantly faster because we can optimize for common cases like long stretches of opaque or transparent pixels. The result is both smaller and faster than the auto-vectorized version; it is also much faster than the auto-vectorized AVX2 version, which can therefore be removed.
Change-Id: I0fa80ce273a8387cc6cd084879822ad9bade385c
Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
author: Allan Sandfeld Jensen <allan.jensen@qt.io> 2016-12-20 16:17:58 +0100
committer: Allan Sandfeld Jensen <allan.jensen@qt.io> 2017-01-31 00:14:11 +0000
commit: 85468f7bccb276c2be5801481a6ce10f07581cdb (patch)
tree: f96ef309303ed0caf91b0c37cabee4d295cb19d3 /src/gui/painting/qdrawhelper_neon.cpp
parent: ad4f7b59ead6c4eb17e787bce25a7211b866063f (diff)
1 files changed, 61 insertions, 0 deletions
diff --git a/src/gui/painting/qdrawhelper_neon.cpp b/src/gui/painting/qdrawhelper_neon.cpp
index cdb374f823..643c570f65 100644
--- a/src/gui/painting/qdrawhelper_neon.cpp
+++ b/src/gui/painting/qdrawhelper_neon.cpp
@@ -1069,6 +1069,67 @@ const uint * QT_FASTCALL qt_fetchUntransformed_888_neon(uint *buffer, const Oper
     return buffer;
 }
 
+#if defined(Q_PROCESSOR_ARM_64) && Q_BYTE_ORDER == Q_LITTLE_ENDIAN
+template<bool RGBA> // RGBA == true additionally swaps bytes 0 and 2 of every output pixel (see rgbaMask)
+static inline void convertARGBToARGB32PM_neon(uint *buffer, const uint *src, int count) // writes count alpha-premultiplied pixels from src into buffer
+{
+    int i = 0;
+    const uint8x16_t rgbaMask  = { 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15}; // table indices: swap bytes 0 and 2 inside each 4-byte pixel
+    const uint8x8_t shuffleMask = { 3, 3, 3, 3, 7, 7, 7, 7}; // table indices: replicate byte 3 of each pixel over that pixel's four bytes
+    const uint32x4_t blendMask = vdupq_n_u32(0xff000000); // selects the top byte (alpha) of every 32-bit lane
+
+    for (; i < count - 3; i += 4) { // four pixels per iteration; leftovers are handled after the loop
+        uint32x4_t srcVector = vld1q_u32(src + i);
+        uint32x4_t alphaVector = vshrq_n_u32(srcVector, 24); // top byte (alpha) of each pixel
+        uint32_t alphaSum = vaddvq_u32(alphaVector); // horizontal sum of the four alpha values
+        if (alphaSum) { // at least one of the four pixels is not fully transparent
+            if (alphaSum != 255 * 4) { // not all opaque (the sum reaches 255 * 4 only when every alpha is 255): full multiply
+                if (RGBA)
+                    srcVector = vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(srcVector), rgbaMask)); // byte swap first, so the result is stored in output order
+                const uint8x8_t s1 = vreinterpret_u8_u32(vget_low_u32(srcVector)); // pixels 0-1 as bytes
+                const uint8x8_t s2 = vreinterpret_u8_u32(vget_high_u32(srcVector)); // pixels 2-3 as bytes
+                const uint8x8_t alpha1 = vtbl1_u8(s1, shuffleMask); // each pixel's alpha replicated over its four bytes
+                const uint8x8_t alpha2 = vtbl1_u8(s2, shuffleMask);
+                uint16x8_t src1 = vmull_u8(s1, alpha1); // widening 8-bit x 8-bit -> 16-bit product: channel * alpha
+                uint16x8_t src2 = vmull_u8(s2, alpha2);
+                src1 = vsraq_n_u16(src1, src1, 8); // x += x >> 8
+                src2 = vsraq_n_u16(src2, src2, 8);
+                const uint8x8_t d1 = vrshrn_n_u16(src1, 8); // rounding >> 8 and narrow; with the step above: (x + (x >> 8) + 0x80) >> 8, a rounded x / 255
+                const uint8x8_t d2 = vrshrn_n_u16(src2, 8);
+                const uint32x4_t d = vbslq_u32(blendMask, srcVector, vreinterpretq_u32_u8(vcombine_u8(d1, d2))); // alpha byte from srcVector, remaining bytes from the multiplied result
+                vst1q_u32(buffer + i, d);
+            } else { // all four pixels fully opaque: no multiplication is performed
+                if (RGBA)
+                    vst1q_u32(buffer + i, vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(srcVector), rgbaMask))); // only the byte swap is applied
+                else if (buffer != src) // when buffer and src are the same pointer the data is already in place, so the store is skipped
+                    vst1q_u32(buffer + i, srcVector);
+            }
+        } else { // all four alphas are zero: the output pixels are all zero
+            vst1q_u32(buffer + i, vdupq_n_u32(0));
+        }
+    }
+
+    SIMD_EPILOGUE(i, count, 3) { // NOTE(review): macro defined elsewhere; presumably iterates over the at most 3 leftover pixels - confirm
+        uint v = qPremultiply(src[i]); // scalar path; qPremultiply is defined elsewhere
+        buffer[i] = RGBA ? RGBA2ARGB(v) : v; // NOTE(review): RGBA2ARGB is defined elsewhere; presumably the same byte swap as rgbaMask - confirm
+    }
+}
+
+const uint *QT_FASTCALL convertARGB32ToARGB32PM_neon(uint *buffer, const uint *src, int count, // converts count pixels from src into buffer
+                                                     const QVector<QRgb> *, QDitherInfo *) // both trailing parameters are unnamed and unused
+{
+    convertARGBToARGB32PM_neon<false>(buffer, src, count); // RGBA = false: premultiply only, no byte swap
+    return buffer; // always returns the destination pointer
+}
+
+const uint *QT_FASTCALL convertRGBA8888ToARGB32PM_neon(uint *buffer, const uint *src, int count, // converts count pixels from src into buffer
+                                                       const QVector<QRgb> *, QDitherInfo *) // both trailing parameters are unnamed and unused
+{
+    convertARGBToARGB32PM_neon<true>(buffer, src, count); // RGBA = true: premultiply and swap bytes 0 and 2 of each pixel
+    return buffer; // always returns the destination pointer
+}
+#endif
+
 QT_END_NAMESPACE
 
 #endif // __ARM_NEON__
author	Allan Sandfeld Jensen <allan.jensen@qt.io>	2016-12-20 16:17:58 +0100
committer	Allan Sandfeld Jensen <allan.jensen@qt.io>	2017-01-31 00:14:11 +0000
commit	85468f7bccb276c2be5801481a6ce10f07581cdb (patch)
tree	f96ef309303ed0caf91b0c37cabee4d295cb19d3 /src/gui/painting/qdrawhelper_neon.cpp
parent	ad4f7b59ead6c4eb17e787bce25a7211b866063f (diff)