Manually vectorize ARGB32toARGB32PM for SSE4.1 and NEON

Manually vectorizing is significantly faster because we can optimize for common cases like long stretches of opaque or transparent pixels. This is both smaller and faster than the auto-vectorized version, it is also much faster than the autovectorized version for AVX2 which then can be removed. Change-Id: I0fa80ce273a8387cc6cd084879822ad9bade385c Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
author: Allan Sandfeld Jensen <allan.jensen@qt.io> 2016-12-20 16:17:58 +0100
committer: Allan Sandfeld Jensen <allan.jensen@qt.io> 2017-01-31 00:14:11 +0000
commit: 85468f7bccb276c2be5801481a6ce10f07581cdb (patch)
tree: f96ef309303ed0caf91b0c37cabee4d295cb19d3 /src/gui/painting
parent: ad4f7b59ead6c4eb17e787bce25a7211b866063f (diff)
4 files changed, 125 insertions, 27 deletions
diff --git a/src/gui/painting/qdrawhelper.cpp b/src/gui/painting/qdrawhelper.cpp
index 9b5f15470e..4ea3b37d5f 100644
--- a/src/gui/painting/qdrawhelper.cpp
+++ b/src/gui/painting/qdrawhelper.cpp
@@ -6196,20 +6196,18 @@ static void qInitDrawhelperFunctions()
 
 #if defined(QT_COMPILER_SUPPORTS_SSE4_1)
     if (qCpuHasFeature(SSE4_1)) {
-#if !defined(__SSE4_1__)
         extern const uint *QT_FASTCALL convertARGB32ToARGB32PM_sse4(uint *buffer, const uint *src, int count,
                                                                     const QVector<QRgb> *, QDitherInfo *);
         extern const uint *QT_FASTCALL convertRGBA8888ToARGB32PM_sse4(uint *buffer, const uint *src, int count,
                                                                       const QVector<QRgb> *, QDitherInfo *);
-        qPixelLayouts[QImage::Format_ARGB32].convertToARGB32PM = convertARGB32ToARGB32PM_sse4;
-        qPixelLayouts[QImage::Format_RGBA8888].convertToARGB32PM = convertRGBA8888ToARGB32PM_sse4;
-#endif
         extern const uint *QT_FASTCALL convertARGB32FromARGB32PM_sse4(uint *buffer, const uint *src, int count,
                                                                       const QVector<QRgb> *, QDitherInfo *);
         extern const uint *QT_FASTCALL convertRGBA8888FromARGB32PM_sse4(uint *buffer, const uint *src, int count,
                                                                         const QVector<QRgb> *, QDitherInfo *);
         extern const uint *QT_FASTCALL convertRGBXFromARGB32PM_sse4(uint *buffer, const uint *src, int count,
                                                                     const QVector<QRgb> *, QDitherInfo *);
+        qPixelLayouts[QImage::Format_ARGB32].convertToARGB32PM = convertARGB32ToARGB32PM_sse4;
+        qPixelLayouts[QImage::Format_RGBA8888].convertToARGB32PM = convertRGBA8888ToARGB32PM_sse4;
         qPixelLayouts[QImage::Format_ARGB32].convertFromARGB32PM = convertARGB32FromARGB32PM_sse4;
         qPixelLayouts[QImage::Format_RGBA8888].convertFromARGB32PM = convertRGBA8888FromARGB32PM_sse4;
         qPixelLayouts[QImage::Format_RGBX8888].convertFromARGB32PM = convertRGBXFromARGB32PM_sse4;
@@ -6220,14 +6218,6 @@ static void qInitDrawhelperFunctions()
 
 #if defined(QT_COMPILER_SUPPORTS_AVX2)
     if (qCpuHasFeature(AVX2)) {
-#if !defined(__AVX2__)
-        extern const uint *QT_FASTCALL convertARGB32ToARGB32PM_avx2(uint *buffer, const uint *src, int count,
-                                                                    const QVector<QRgb> *, QDitherInfo *);
-        extern const uint *QT_FASTCALL convertRGBA8888ToARGB32PM_avx2(uint *buffer, const uint *src, int count,
-                                                                      const QVector<QRgb> *, QDitherInfo *);
-        qPixelLayouts[QImage::Format_ARGB32].convertToARGB32PM = convertARGB32ToARGB32PM_avx2;
-        qPixelLayouts[QImage::Format_RGBA8888].convertToARGB32PM = convertRGBA8888ToARGB32PM_avx2;
-#endif
         extern void qt_blend_rgb32_on_rgb32_avx2(uchar *destPixels, int dbpl,
                                                  const uchar *srcPixels, int sbpl,
                                                  int w, int h, int const_alpha);
@@ -6277,6 +6267,15 @@ static void qInitDrawhelperFunctions()
 
     sourceFetchUntransformed[QImage::Format_RGB888] = qt_fetchUntransformed_888_neon;
 
+#if defined(Q_PROCESSOR_ARM_64) && Q_BYTE_ORDER == Q_LITTLE_ENDIAN
+    extern const uint *QT_FASTCALL convertARGB32ToARGB32PM_neon(uint *buffer, const uint *src, int count,
+                                                                const QVector<QRgb> *, QDitherInfo *);
+    extern const uint *QT_FASTCALL convertRGBA8888ToARGB32PM_neon(uint *buffer, const uint *src, int count,
+                                                                  const QVector<QRgb> *, QDitherInfo *);
+    qPixelLayouts[QImage::Format_ARGB32].convertToARGB32PM = convertARGB32ToARGB32PM_neon;
+    qPixelLayouts[QImage::Format_RGBA8888].convertToARGB32PM = convertRGBA8888ToARGB32PM_neon;
+#endif
+
 #if defined(ENABLE_PIXMAN_DRAWHELPERS)
     // The RGB16 helpers are using Arm32 assemblythat has not been ported to AArch64
     qBlendFunctions[QImage::Format_RGB16][QImage::Format_ARGB32_Premultiplied] = qt_blend_argb32_on_rgb16_neon;
diff --git a/src/gui/painting/qdrawhelper_avx2.cpp b/src/gui/painting/qdrawhelper_avx2.cpp
index 9c1335298e..5e17e8abec 100644
--- a/src/gui/painting/qdrawhelper_avx2.cpp
+++ b/src/gui/painting/qdrawhelper_avx2.cpp
@@ -44,19 +44,6 @@
 
 QT_BEGIN_NAMESPACE
 
-// Autovectorized premultiply functions:
-const uint *QT_FASTCALL convertARGB32ToARGB32PM_avx2(uint *buffer, const uint *src, int count,
-                                                     const QVector<QRgb> *, QDitherInfo *)
-{
-    return qt_convertARGB32ToARGB32PM(buffer, src, count);
-}
-
-const uint *QT_FASTCALL convertRGBA8888ToARGB32PM_avx2(uint *buffer, const uint *src, int count,
-                                                       const QVector<QRgb> *, QDitherInfo *)
-{
-    return qt_convertRGBA8888ToARGB32PM(buffer, src, count);
-}
-
 // Vectorized blend functions:
 
 // See BYTE_MUL_SSE2 for details.
diff --git a/src/gui/painting/qdrawhelper_neon.cpp b/src/gui/painting/qdrawhelper_neon.cpp
index cdb374f823..643c570f65 100644
--- a/src/gui/painting/qdrawhelper_neon.cpp
+++ b/src/gui/painting/qdrawhelper_neon.cpp
@@ -1069,6 +1069,67 @@ const uint * QT_FASTCALL qt_fetchUntransformed_888_neon(uint *buffer, const Oper
     return buffer;
 }
 
+#if defined(Q_PROCESSOR_ARM_64) && Q_BYTE_ORDER == Q_LITTLE_ENDIAN
+template<bool RGBA>
+static inline void convertARGBToARGB32PM_neon(uint *buffer, const uint *src, int count)
+{
+    int i = 0;
+    const uint8x16_t rgbaMask  = { 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15};
+    const uint8x8_t shuffleMask = { 3, 3, 3, 3, 7, 7, 7, 7};
+    const uint32x4_t blendMask = vdupq_n_u32(0xff000000);
+
+    for (; i < count - 3; i += 4) {
+        uint32x4_t srcVector = vld1q_u32(src + i);
+        uint32x4_t alphaVector = vshrq_n_u32(srcVector, 24);
+        uint32_t alphaSum = vaddvq_u32(alphaVector);
+        if (alphaSum) {
+            if (alphaSum != 255 * 4) {
+                if (RGBA)
+                    srcVector = vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(srcVector), rgbaMask));
+                const uint8x8_t s1 = vreinterpret_u8_u32(vget_low_u32(srcVector));
+                const uint8x8_t s2 = vreinterpret_u8_u32(vget_high_u32(srcVector));
+                const uint8x8_t alpha1 = vtbl1_u8(s1, shuffleMask);
+                const uint8x8_t alpha2 = vtbl1_u8(s2, shuffleMask);
+                uint16x8_t src1 = vmull_u8(s1, alpha1);
+                uint16x8_t src2 = vmull_u8(s2, alpha2);
+                src1 = vsraq_n_u16(src1, src1, 8);
+                src2 = vsraq_n_u16(src2, src2, 8);
+                const uint8x8_t d1 = vrshrn_n_u16(src1, 8);
+                const uint8x8_t d2 = vrshrn_n_u16(src2, 8);
+                const uint32x4_t d = vbslq_u32(blendMask, srcVector, vreinterpretq_u32_u8(vcombine_u8(d1, d2)));
+                vst1q_u32(buffer + i, d);
+            } else {
+                if (RGBA)
+                    vst1q_u32(buffer + i, vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(srcVector), rgbaMask)));
+                else if (buffer != src)
+                    vst1q_u32(buffer + i, srcVector);
+            }
+        } else {
+            vst1q_u32(buffer + i, vdupq_n_u32(0));
+        }
+    }
+
+    SIMD_EPILOGUE(i, count, 3) {
+        uint v = qPremultiply(src[i]);
+        buffer[i] = RGBA ? RGBA2ARGB(v) : v;
+    }
+}
+
+const uint *QT_FASTCALL convertARGB32ToARGB32PM_neon(uint *buffer, const uint *src, int count,
+                                                     const QVector<QRgb> *, QDitherInfo *)
+{
+    convertARGBToARGB32PM_neon<false>(buffer, src, count);
+    return buffer;
+}
+
+const uint *QT_FASTCALL convertRGBA8888ToARGB32PM_neon(uint *buffer, const uint *src, int count,
+                                                       const QVector<QRgb> *, QDitherInfo *)
+{
+    convertARGBToARGB32PM_neon<true>(buffer, src, count);
+    return buffer;
+}
+#endif
+
 QT_END_NAMESPACE
 
 #endif // __ARM_NEON__
diff --git a/src/gui/painting/qdrawhelper_sse4.cpp b/src/gui/painting/qdrawhelper_sse4.cpp
index 257bad9eca..14bfaabf09 100644
--- a/src/gui/painting/qdrawhelper_sse4.cpp
+++ b/src/gui/painting/qdrawhelper_sse4.cpp
@@ -44,16 +44,67 @@
 
 QT_BEGIN_NAMESPACE
 
+template<bool RGBA>
+static inline void convertARGBToARGB32PM_sse4(uint *buffer, const uint *src, int count)
+{
+    int i = 0;
+    const __m128i alphaMask = _mm_set1_epi32(0xff000000);
+    const __m128i rgbaMask = _mm_setr_epi8(2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15);
+    const __m128i shuffleMask = _mm_setr_epi8(6, 7, 6, 7, 6, 7, 6, 7, 14, 15, 14, 15, 14, 15, 14, 15);
+    const __m128i half = _mm_set1_epi16(0x0080);
+    const __m128i zero = _mm_setzero_si128();
+
+    for (; i < count - 3; i += 4) {
+        __m128i srcVector = _mm_loadu_si128((const __m128i *)&src[i]);
+        if (!_mm_testz_si128(srcVector, alphaMask)) {
+            if (!_mm_testc_si128(srcVector, alphaMask)) {
+                if (RGBA)
+                    srcVector = _mm_shuffle_epi8(srcVector, rgbaMask);
+                __m128i src1 = _mm_unpacklo_epi8(srcVector, zero);
+                __m128i src2 = _mm_unpackhi_epi8(srcVector, zero);
+                __m128i alpha1 = _mm_shuffle_epi8(src1, shuffleMask);
+                __m128i alpha2 = _mm_shuffle_epi8(src2, shuffleMask);
+                src1 = _mm_mullo_epi16(src1, alpha1);
+                src2 = _mm_mullo_epi16(src2, alpha2);
+                src1 = _mm_add_epi16(src1, _mm_srli_epi16(src1, 8));
+                src2 = _mm_add_epi16(src2, _mm_srli_epi16(src2, 8));
+                src1 = _mm_add_epi16(src1, half);
+                src2 = _mm_add_epi16(src2, half);
+                src1 = _mm_srli_epi16(src1, 8);
+                src2 = _mm_srli_epi16(src2, 8);
+                src1 = _mm_blend_epi16(src1, alpha1, 0x88);
+                src2 = _mm_blend_epi16(src2, alpha2, 0x88);
+                srcVector = _mm_packus_epi16(src1, src2);
+                _mm_storeu_si128((__m128i *)&buffer[i], srcVector);
+            } else {
+                if (RGBA)
+                    _mm_storeu_si128((__m128i *)&buffer[i], _mm_shuffle_epi8(srcVector, rgbaMask));
+                else if (buffer != src)
+                    _mm_storeu_si128((__m128i *)&buffer[i], srcVector);
+            }
+        } else {
+            _mm_storeu_si128((__m128i *)&buffer[i], _mm_setzero_si128());
+        }
+    }
+
+    SIMD_EPILOGUE(i, count, 3) {
+        uint v = qPremultiply(src[i]);
+        buffer[i] = RGBA ? RGBA2ARGB(v) : v;
+    }
+}
+
 const uint *QT_FASTCALL convertARGB32ToARGB32PM_sse4(uint *buffer, const uint *src, int count,
                                                      const QVector<QRgb> *, QDitherInfo *)
 {
-    return qt_convertARGB32ToARGB32PM(buffer, src, count);
+    convertARGBToARGB32PM_sse4<false>(buffer, src, count);
+    return buffer;
 }
 
 const uint *QT_FASTCALL convertRGBA8888ToARGB32PM_sse4(uint *buffer, const uint *src, int count,
                                                        const QVector<QRgb> *, QDitherInfo *)
 {
-    return qt_convertRGBA8888ToARGB32PM(buffer, src, count);
+    convertARGBToARGB32PM_sse4<true>(buffer, src, count);
+    return buffer;
 }
 
 const uint *QT_FASTCALL convertARGB32FromARGB32PM_sse4(uint *buffer, const uint *src, int count,
author	Allan Sandfeld Jensen <allan.jensen@qt.io>	2016-12-20 16:17:58 +0100
committer	Allan Sandfeld Jensen <allan.jensen@qt.io>	2017-01-31 00:14:11 +0000
commit	85468f7bccb276c2be5801481a6ce10f07581cdb (patch)
tree	f96ef309303ed0caf91b0c37cabee4d295cb19d3 /src/gui/painting
parent	ad4f7b59ead6c4eb17e787bce25a7211b866063f (diff)