1 files changed, 150 insertions, 31 deletions
diff --git a/src/gui/painting/qdrawhelper_neon.cpp b/src/gui/painting/qdrawhelper_neon.cpp
index e126f4b670..629dfe2358 100644
--- a/src/gui/painting/qdrawhelper_neon.cpp
+++ b/src/gui/painting/qdrawhelper_neon.cpp
@@ -1081,15 +1081,28 @@ const uint * QT_FASTCALL qt_fetchUntransformed_888_neon(uint *buffer, const Oper
 }
 
 #if Q_BYTE_ORDER == Q_LITTLE_ENDIAN
-template<bool RGBA>
-static inline void convertARGBToARGB32PM_neon(uint *buffer, const uint *src, int count)
+static inline uint32x4_t vrgba2argb(uint32x4_t srcVector)
 {
-    int i = 0;
 #if defined(Q_PROCESSOR_ARM_64)
     const uint8x16_t rgbaMask  = { 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15};
 #else
     const uint8x8_t rgbaMask  = { 2, 1, 0, 3, 6, 5, 4, 7 };
 #endif
+#if defined(Q_PROCESSOR_ARM_64)
+    srcVector = vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(srcVector), rgbaMask));
+#else
+    // no vqtbl1q_u8, so use two vtbl1_u8
+    const uint8x8_t low = vtbl1_u8(vreinterpret_u8_u32(vget_low_u32(srcVector)), rgbaMask);
+    const uint8x8_t high = vtbl1_u8(vreinterpret_u8_u32(vget_high_u32(srcVector)), rgbaMask);
+    srcVector = vcombine_u32(vreinterpret_u32_u8(low), vreinterpret_u32_u8(high));
+#endif
+    return srcVector;
+}
+
+template<bool RGBA>
+static inline void convertARGBToARGB32PM_neon(uint *buffer, const uint *src, int count)
+{
+    int i = 0;
     const uint8x8_t shuffleMask = { 3, 3, 3, 3, 7, 7, 7, 7};
     const uint32x4_t blendMask = vdupq_n_u32(0xff000000);
 
@@ -1105,16 +1118,8 @@ static inline void convertARGBToARGB32PM_neon(uint *buffer, const uint *src, int
 #endif
         if (alphaSum) {
             if (alphaSum != 255 * 4) {
-                if (RGBA) {
-#if defined(Q_PROCESSOR_ARM_64)
-                    srcVector = vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(srcVector), rgbaMask));
-#else
-                    // no vqtbl1q_u8
-                    const uint8x8_t low = vtbl1_u8(vreinterpret_u8_u32(vget_low_u32(srcVector)), rgbaMask);
-                    const uint8x8_t high = vtbl1_u8(vreinterpret_u8_u32(vget_high_u32(srcVector)), rgbaMask);
-                    srcVector = vcombine_u32(vreinterpret_u32_u8(low), vreinterpret_u32_u8(high));
-#endif
-                }
+                if (RGBA)
+                    srcVector = vrgba2argb(srcVector);
                 const uint8x8_t s1 = vreinterpret_u8_u32(vget_low_u32(srcVector));
                 const uint8x8_t s2 = vreinterpret_u8_u32(vget_high_u32(srcVector));
                 const uint8x8_t alpha1 = vtbl1_u8(s1, shuffleMask);
@@ -1128,19 +1133,10 @@ static inline void convertARGBToARGB32PM_neon(uint *buffer, const uint *src, int
                 const uint32x4_t d = vbslq_u32(blendMask, srcVector, vreinterpretq_u32_u8(vcombine_u8(d1, d2)));
                 vst1q_u32(buffer + i, d);
             } else {
-                if (RGBA) {
-#if defined(Q_PROCESSOR_ARM_64)
-                    srcVector = vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(srcVector), rgbaMask));
-#else
-                    // no vqtbl1q_u8
-                    const uint8x8_t low = vtbl1_u8(vreinterpret_u8_u32(vget_low_u32(srcVector)), rgbaMask);
-                    const uint8x8_t high = vtbl1_u8(vreinterpret_u8_u32(vget_high_u32(srcVector)), rgbaMask);
-                    srcVector = vcombine_u32(vreinterpret_u32_u8(low), vreinterpret_u32_u8(high));
-#endif
-                    vst1q_u32(buffer + i, srcVector);
-                } else if (buffer != src) {
+                if (RGBA)
+                    vst1q_u32(buffer + i, vrgba2argb(srcVector));
+                else if (buffer != src)
                     vst1q_u32(buffer + i, srcVector);
-                }
             }
         } else {
             vst1q_u32(buffer + i, vdupq_n_u32(0));
@@ -1153,20 +1149,143 @@ static inline void convertARGBToARGB32PM_neon(uint *buffer, const uint *src, int
     }
 }
 
-const uint *QT_FASTCALL convertARGB32ToARGB32PM_neon(uint *buffer, const uint *src, int count,
-                                                     const QVector<QRgb> *, QDitherInfo *)
+static inline float32x4_t reciprocal_mul_ps(float32x4_t a, float mul)
+{
+    float32x4_t ia = vrecpeq_f32(a); // estimate 1/a
+    ia = vmulq_f32(vrecpsq_f32(a, ia), vmulq_n_f32(ia, mul)); // estimate improvement step * mul
+    return ia;
+}
+
+template<bool RGBA, bool RGBx>
+static inline void convertARGBFromARGB32PM_neon(uint *buffer, const uint *src, int count)
+{
+    int i = 0;
+    const uint32x4_t alphaMask = vdupq_n_u32(0xff000000);
+
+    for (; i < count - 3; i += 4) {
+        uint32x4_t srcVector = vld1q_u32(src + i);
+        uint32x4_t alphaVector = vshrq_n_u32(srcVector, 24);
+#if defined(Q_PROCESSOR_ARM_64)
+        uint32_t alphaSum = vaddvq_u32(alphaVector);
+#else
+        // no vaddvq_u32
+        uint32x2_t tmp = vpadd_u32(vget_low_u32(alphaVector), vget_high_u32(alphaVector));
+        uint32_t alphaSum = vget_lane_u32(vpadd_u32(tmp, tmp), 0);
+#endif
+        if (alphaSum) {
+            if (alphaSum != 255 * 4) {
+                if (RGBA)
+                    srcVector = vrgba2argb(srcVector);
+                const float32x4_t a = vcvtq_f32_u32(alphaVector);
+                const float32x4_t ia = reciprocal_mul_ps(a, 255.0f);
+                // Convert 4x(4xU8) to 4x(4xF32)
+                uint16x8_t tmp1 = vmovl_u8(vget_low_u8(vreinterpretq_u8_u32(srcVector)));
+                uint16x8_t tmp3 = vmovl_u8(vget_high_u8(vreinterpretq_u8_u32(srcVector)));
+                float32x4_t src1 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(tmp1)));
+                float32x4_t src2 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(tmp1)));
+                float32x4_t src3 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(tmp3)));
+                float32x4_t src4 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(tmp3)));
+                src1 = vmulq_lane_f32(src1, vget_low_f32(ia), 0);
+                src2 = vmulq_lane_f32(src2, vget_low_f32(ia), 1);
+                src3 = vmulq_lane_f32(src3, vget_high_f32(ia), 0);
+                src4 = vmulq_lane_f32(src4, vget_high_f32(ia), 1);
+                // Convert 4x(4xF32) back to 4x(4xU8) (over a 8.1 fixed point format to get rounding)
+                tmp1 = vcombine_u16(vrshrn_n_u32(vcvtq_n_u32_f32(src1, 1), 1),
+                                    vrshrn_n_u32(vcvtq_n_u32_f32(src2, 1), 1));
+                tmp3 = vcombine_u16(vrshrn_n_u32(vcvtq_n_u32_f32(src3, 1), 1),
+                                    vrshrn_n_u32(vcvtq_n_u32_f32(src4, 1), 1));
+                uint32x4_t dstVector = vreinterpretq_u32_u8(vcombine_u8(vmovn_u16(tmp1), vmovn_u16(tmp3)));
+                // Overwrite any undefined results from alpha==0 with zeros:
+#if defined(Q_PROCESSOR_ARM_64)
+                uint32x4_t srcVectorAlphaMask = vceqzq_u32(alphaVector);
+#else
+                uint32x4_t srcVectorAlphaMask = vceqq_u32(alphaVector, vdupq_n_u32(0));
+#endif
+                dstVector = vbicq_u32(dstVector, srcVectorAlphaMask);
+                // Restore or mask alpha values:
+                if (RGBx)
+                    dstVector = vorrq_u32(alphaMask, dstVector);
+                else
+                    dstVector = vbslq_u32(alphaMask, srcVector, dstVector);
+                vst1q_u32(&buffer[i], dstVector);
+            } else {
+                // 4xAlpha==255, no change except if we are doing RGBA->ARGB:
+                if (RGBA)
+                    vst1q_u32(&buffer[i], vrgba2argb(srcVector));
+                else if (buffer != src)
+                    vst1q_u32(&buffer[i], srcVector);
+            }
+        } else {
+            // 4xAlpha==0, always zero, except if output is RGBx:
+            if (RGBx)
+                vst1q_u32(&buffer[i], alphaMask);
+            else
+                vst1q_u32(&buffer[i], vdupq_n_u32(0));
+        }
+    }
+
+    SIMD_EPILOGUE(i, count, 3) {
+        uint v = qUnpremultiply(src[i]);
+        if (RGBx)
+            v = 0xff000000 | v;
+        if (RGBA)
+            v = ARGB2RGBA(v);
+        buffer[i] = v;
+    }
+}
+
+void QT_FASTCALL convertARGB32ToARGB32PM_neon(uint *buffer, int count, const QVector<QRgb> *)
+{
+    convertARGBToARGB32PM_neon<false>(buffer, buffer, count);
+}
+
+void QT_FASTCALL convertRGBA8888ToARGB32PM_neon(uint *buffer, int count, const QVector<QRgb> *)
+{
+    convertARGBToARGB32PM_neon<true>(buffer, buffer, count);
+}
+
+const uint *QT_FASTCALL fetchARGB32ToARGB32PM_neon(uint *buffer, const uchar *src, int index, int count,
+                                                   const QVector<QRgb> *, QDitherInfo *)
 {
-    convertARGBToARGB32PM_neon<false>(buffer, src, count);
+    convertARGBToARGB32PM_neon<false>(buffer, reinterpret_cast<const uint *>(src) + index, count);
     return buffer;
 }
 
-const uint *QT_FASTCALL convertRGBA8888ToARGB32PM_neon(uint *buffer, const uint *src, int count,
-                                                       const QVector<QRgb> *, QDitherInfo *)
+const uint *QT_FASTCALL fetchRGBA8888ToARGB32PM_neon(uint *buffer, const uchar *src, int index, int count,
+                                                     const QVector<QRgb> *, QDitherInfo *)
 {
-    convertARGBToARGB32PM_neon<true>(buffer, src, count);
+    convertARGBToARGB32PM_neon<true>(buffer, reinterpret_cast<const uint *>(src) + index, count);
     return buffer;
 }
 
+void QT_FASTCALL storeRGB32FromARGB32PM_neon(uchar *dest, const uint *src, int index, int count,
+                                             const QVector<QRgb> *, QDitherInfo *)
+{
+    uint *d = reinterpret_cast<uint *>(dest) + index;
+    convertARGBFromARGB32PM_neon<false,true>(d, src, count);
+}
+
+void QT_FASTCALL storeARGB32FromARGB32PM_neon(uchar *dest, const uint *src, int index, int count,
+                                              const QVector<QRgb> *, QDitherInfo *)
+{
+    uint *d = reinterpret_cast<uint *>(dest) + index;
+    convertARGBFromARGB32PM_neon<false,false>(d, src, count);
+}
+
+void QT_FASTCALL storeRGBA8888FromARGB32PM_neon(uchar *dest, const uint *src, int index, int count,
+                                                const QVector<QRgb> *, QDitherInfo *)
+{
+    uint *d = reinterpret_cast<uint *>(dest) + index;
+    convertARGBFromARGB32PM_neon<true,false>(d, src, count);
+}
+
+void QT_FASTCALL storeRGBXFromARGB32PM_neon(uchar *dest, const uint *src, int index, int count,
+                                            const QVector<QRgb> *, QDitherInfo *)
+{
+    uint *d = reinterpret_cast<uint *>(dest) + index;
+    convertARGBFromARGB32PM_neon<true,true>(d, src, count);
+}
+
 #endif // Q_BYTE_ORDER == Q_LITTLE_ENDIAN
 
 QT_END_NAMESPACE