Optimize ARGB32->RGBA64PM better

This conversion is critical for ARGB32 painting, and no compiler optimized the premultiplication efficiently. Change-Id: Iee137c2f7020246478d09e880a7a1bf2ed3c6fd4 Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
author: Allan Sandfeld Jensen <allan.jensen@qt.io> 2018-10-12 16:52:54 +0200
committer: Allan Sandfeld Jensen <allan.jensen@qt.io> 2019-01-08 08:24:40 +0000
commit: 4e6a42cdd050c8714686fa53b3246c4fc66d18fd (patch)
tree: c6de5e4c67cee1d9db6edbfefde8acb042a0c6a4 /src/gui/painting/qdrawhelper_neon.cpp
parent: 1f2c23a7ca1699b345578aeb52fbd97b612e079a (diff)
1 files changed, 94 insertions, 0 deletions
diff --git a/src/gui/painting/qdrawhelper_neon.cpp b/src/gui/painting/qdrawhelper_neon.cpp
index e33af3b784..3fcbcfd053 100644
--- a/src/gui/painting/qdrawhelper_neon.cpp
+++ b/src/gui/painting/qdrawhelper_neon.cpp
@@ -1149,6 +1149,72 @@ static inline void convertARGBToARGB32PM_neon(uint *buffer, const uint *src, int
     }
 }
 
+template<bool RGBA>
+static inline void convertARGB32ToRGBA64PM_neon(QRgba64 *buffer, const uint *src, int count)
+{
+    if (count <= 0)
+        return;
+
+    const uint8x8_t shuffleMask = { 3, 3, 3, 3, 7, 7, 7, 7};
+    const uint64x2_t blendMask = vdupq_n_u64(Q_UINT64_C(0xffff000000000000));
+
+    int i = 0;
+    for (; i < count-3; i += 4) {
+        uint32x4_t vs32 = vld1q_u32(src + i);
+        uint32x4_t alphaVector = vshrq_n_u32(vs32, 24);
+#if defined(Q_PROCESSOR_ARM_64)
+        uint32_t alphaSum = vaddvq_u32(alphaVector);
+#else
+        // no vaddvq_u32
+        uint32x2_t tmp = vpadd_u32(vget_low_u32(alphaVector), vget_high_u32(alphaVector));
+        uint32_t alphaSum = vget_lane_u32(vpadd_u32(tmp, tmp), 0);
+#endif
+        if (alphaSum) {
+            if (!RGBA)
+                vs32 = vrgba2argb(vs32);
+            const uint8x16_t vs8 = vreinterpretq_u8_u32(vs32);
+            const uint8x16x2_t v = vzipq_u8(vs8, vs8);
+            if (alphaSum != 255 * 4) {
+                const uint8x8_t s1 = vreinterpret_u8_u32(vget_low_u32(vs32));
+                const uint8x8_t s2 = vreinterpret_u8_u32(vget_high_u32(vs32));
+                const uint8x8_t alpha1 = vtbl1_u8(s1, shuffleMask);
+                const uint8x8_t alpha2 = vtbl1_u8(s2, shuffleMask);
+                uint16x8_t src1 = vmull_u8(s1, alpha1);
+                uint16x8_t src2 = vmull_u8(s2, alpha2);
+                // convert from 0->(255x255) to 0->(255x257)
+                src1 = vsraq_n_u16(src1, src1, 7);
+                src2 = vsraq_n_u16(src2, src2, 7);
+
+                // now restore alpha from the trivial conversion
+                const uint64x2_t d1 = vbslq_u64(blendMask, vreinterpretq_u64_u8(v.val[0]), vreinterpretq_u64_u16(src1));
+                const uint64x2_t d2 = vbslq_u64(blendMask, vreinterpretq_u64_u8(v.val[1]), vreinterpretq_u64_u16(src2));
+
+                vst1q_u16((uint16_t *)buffer, vreinterpretq_u16_u64(d1));
+                buffer += 2;
+                vst1q_u16((uint16_t *)buffer, vreinterpretq_u16_u64(d2));
+                buffer += 2;
+            } else {
+                vst1q_u16((uint16_t *)buffer, vreinterpretq_u16_u8(v.val[0]));
+                buffer += 2;
+                vst1q_u16((uint16_t *)buffer, vreinterpretq_u16_u8(v.val[1]));
+                buffer += 2;
+            }
+        } else {
+            vst1q_u16((uint16_t *)buffer, vdupq_n_u16(0));
+            buffer += 2;
+            vst1q_u16((uint16_t *)buffer, vdupq_n_u16(0));
+            buffer += 2;
+        }
+    }
+
+    SIMD_EPILOGUE(i, count, 3) {
+        uint s = src[i];
+        if (RGBA)
+            s = RGBA2ARGB(s);
+        *buffer++ = QRgba64::fromArgb32(s).premultiplied();
+    }
+}
+
 static inline float32x4_t reciprocal_mul_ps(float32x4_t a, float mul)
 {
     float32x4_t ia = vrecpeq_f32(a); // estimate 1/a
@@ -1258,6 +1324,34 @@ const uint *QT_FASTCALL fetchRGBA8888ToARGB32PM_neon(uint *buffer, const uchar *
     return buffer;
 }
 
+const QRgba64 * QT_FASTCALL convertARGB32ToRGBA64PM_neon(QRgba64 *buffer, const uint *src, int count,
+                                                         const QVector<QRgb> *, QDitherInfo *)
+{
+    convertARGB32ToRGBA64PM_neon<false>(buffer, src, count);
+    return buffer;
+}
+
+const QRgba64 * QT_FASTCALL convertRGBA8888ToRGBA64PM_neon(QRgba64 *buffer, const uint *src, int count,
+                                                           const QVector<QRgb> *, QDitherInfo *)
+{
+    convertARGB32ToRGBA64PM_neon<true>(buffer, src, count);
+    return buffer;
+}
+
+const QRgba64 *QT_FASTCALL fetchARGB32ToRGBA64PM_neon(QRgba64 *buffer, const uchar *src, int index, int count,
+                                                      const QVector<QRgb> *, QDitherInfo *)
+{
+    convertARGB32ToRGBA64PM_neon<false>(buffer, reinterpret_cast<const uint *>(src) + index, count);
+    return buffer;
+}
+
+const QRgba64 *QT_FASTCALL fetchRGBA8888ToRGBA64PM_neon(QRgba64 *buffer, const uchar *src, int index, int count,
+                                                        const QVector<QRgb> *, QDitherInfo *)
+{
+    convertARGB32ToRGBA64PM_neon<true>(buffer, reinterpret_cast<const uint *>(src) + index, count);
+    return buffer;
+}
+
 void QT_FASTCALL storeRGB32FromARGB32PM_neon(uchar *dest, const uint *src, int index, int count,
                                              const QVector<QRgb> *, QDitherInfo *)
 {
author	Allan Sandfeld Jensen <allan.jensen@qt.io>	2018-10-12 16:52:54 +0200
committer	Allan Sandfeld Jensen <allan.jensen@qt.io>	2019-01-08 08:24:40 +0000
commit	4e6a42cdd050c8714686fa53b3246c4fc66d18fd (patch)
tree	c6de5e4c67cee1d9db6edbfefde8acb042a0c6a4 /src/gui/painting/qdrawhelper_neon.cpp
parent	1f2c23a7ca1699b345578aeb52fbd97b612e079a (diff)