author    Erik Verbruggen <erik.verbruggen@theqtcompany.com>  2015-12-18 14:32:06 +0100
committer Erik Verbruggen <erik.verbruggen@qt.io>  2016-05-24 11:32:08 +0000
commit    06f920399bcb1e5984d9efc2ae23830d8f5e1e6f (patch)
tree      52fd8482e3cf635430046ec21f75bc25dbad06a0 /src
parent    36aaf851ff2814e9e5c024e21b866c403137ff26 (diff)
ARM: Vectorize common QRgba64 compositing functions.
Adds overloaded functions for some common operations used for compositing, and uses those together with specialized loading/storing.

QRgba64 is represented as a quint64, so most of the time it lives in the integer registers. By overloading functions to return SIMD data types, the temporary values are kept in SIMD registers (so no more unnecessary transfers between the registers, which can cost a lot on ARM). It also allows the compiler to do better TBAA and optimizations.

Another benefit is that the loop bodies are smaller, giving them a higher chance to fit in the micro-op cache of modern CPUs.

Change-Id: I5f13cd0677176b1162425fe59d42868d0d20f6e2
Reviewed-by: Allan Sandfeld Jensen <allan.jensen@qt.io>
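For illustration, here is a minimal standalone sketch of the pattern this change applies (not the Qt source; the names Pixel64, mulAlpha65535 and sourceOverSolid are hypothetical): give a helper an overload that takes and returns the SIMD register type, so per-pixel temporaries stay in NEON registers instead of round-tripping through the integer register file on every iteration. A scalar fallback keeps the same semantics when NEON is unavailable.

// Sketch of the register-type-overload pattern (assumed names, not Qt API).
// Four 16-bit channels packed into a 64-bit value, like QRgba64.
#include <cstdint>
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>

typedef uint64_t Pixel64;

// Overload on the NEON register type: input and result never leave
// the SIMD register file.
static inline uint16x4_t mulAlpha65535(uint16x4_t c, uint16x4_t a)
{
    uint32x4_t v = vmull_u16(c, a); // widen to 32 bits: c * a
    v = vsraq_n_u32(v, v, 16);      // v += v >> 16
    return vrshrn_n_u32(v, 16);     // (v + 0x8000) >> 16, narrowed to 16 bits
}

// SourceOver with a premultiplied solid color: d' = c + d * (1 - c.alpha)
static void sourceOverSolid(Pixel64 *dest, int length, Pixel64 color)
{
    const uint16x4_t c = vreinterpret_u16_u64(vmov_n_u64(color));
    const uint16x4_t ia = vmvn_u16(vdup_lane_u16(c, 3)); // 65535 - alpha
    for (int i = 0; i < length; ++i) {
        uint16x4_t d = vreinterpret_u16_u64(vld1_u64(&dest[i]));
        d = vadd_u16(c, mulAlpha65535(d, ia));
        vst1_u64(&dest[i], vreinterpret_u64_u16(d));
    }
}
#else
typedef uint64_t Pixel64;

// Scalar fallback with the same rounding behaviour, channel by channel.
static inline uint16_t mulAlpha65535(uint16_t c, uint16_t a)
{
    uint32_t v = uint32_t(c) * a;
    return uint16_t((v + (v >> 16) + 0x8000u) >> 16);
}

static void sourceOverSolid(Pixel64 *dest, int length, Pixel64 color)
{
    const uint16_t ia = uint16_t(~uint16_t(color >> 48)); // 65535 - alpha
    for (int i = 0; i < length; ++i) {
        Pixel64 out = 0;
        for (int ch = 0; ch < 4; ++ch) {
            uint16_t s = uint16_t(color >> (16 * ch));
            uint16_t d = uint16_t(dest[i] >> (16 * ch));
            out |= Pixel64(uint16_t(s + mulAlpha65535(d, ia))) << (16 * ch);
        }
        dest[i] = out;
    }
}
#endif

With the overload in place the compiler can keep c, ia and d in NEON registers across the loop; without it, each call boundary would force the pixel value back into a general-purpose register pair.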
Diffstat (limited to 'src')
-rw-r--r--  src/gui/painting/qcompositionfunctions.cpp | 100
-rw-r--r--  src/gui/painting/qrgba64_p.h               | 100
2 files changed, 153 insertions(+), 47 deletions(-)
diff --git a/src/gui/painting/qcompositionfunctions.cpp b/src/gui/painting/qcompositionfunctions.cpp
index 9312ee9540..641550766a 100644
--- a/src/gui/painting/qcompositionfunctions.cpp
+++ b/src/gui/painting/qcompositionfunctions.cpp
@@ -87,6 +87,32 @@ QT_BEGIN_NAMESPACE
}\
}
+#if defined __SSE2__
+# define LOAD(ptr) _mm_loadl_epi64((__m128i*)(ptr))
+# define CONVERT(value) _mm_shufflelo_epi16(_mm_cvtsi32_si128(value), _MM_SHUFFLE(0, 0, 0, 0))
+# define STORE(ptr, value) _mm_storel_epi64((__m128i*)(ptr), value)
+# define ADD(p, q) _mm_add_epi32(p, q)
+# define ALPHA(c) _mm_shufflelo_epi16(c, _MM_SHUFFLE(3, 3, 3, 3))
+# define CONST(n) CONVERT(n)
+# define INVALPHA(c) _mm_sub_epi32(CONST(65535), ALPHA(c))
+#elif defined __ARM_NEON__
+# define LOAD(ptr) vreinterpret_u16_u64(vld1_u64((quint64*)(ptr)))
+# define CONVERT(value) vreinterpret_u16_u64(vmov_n_u64(value))
+# define STORE(ptr, value) vst1_u64((quint64*)(ptr), vreinterpret_u64_u16(value))
+# define ADD(p, q) vadd_u16(p, q)
+# define ALPHA(c) vdup_lane_u16(c, 3)
+# define CONST(n) vdup_n_u16(n)
+# define INVALPHA(c) vmvn_u16(ALPHA(c))
+#else
+# define LOAD(ptr) *ptr
+# define CONVERT(value) value
+# define STORE(ptr, value) *ptr = value
+# define ADD(p, q) (p + q)
+# define ALPHA(c) (c).alpha()
+# define CONST(n) n
+# define INVALPHA(c) (65535 - ALPHA(c))
+#endif
+
void QT_FASTCALL comp_func_solid_Clear(uint *dest, int length, uint, uint const_alpha)
{
comp_func_Clear_impl(dest, length, const_alpha);
@@ -99,7 +125,7 @@ void QT_FASTCALL comp_func_solid_Clear_rgb64(QRgba64 *dest, int length, QRgba64,
else {
int ialpha = 255 - const_alpha;
for (int i = 0; i < length; ++i) {
- dest[i] = multiplyAlpha255(dest[i], ialpha);
+ STORE(&dest[i], multiplyAlpha255(LOAD(&dest[i]), ialpha));
}
}
}
@@ -116,7 +142,7 @@ void QT_FASTCALL comp_func_Clear_rgb64(QRgba64 *dest, const QRgba64 *, int lengt
else {
int ialpha = 255 - const_alpha;
for (int i = 0; i < length; ++i) {
- dest[i] = multiplyAlpha255(dest[i], ialpha);
+ STORE(&dest[i], multiplyAlpha255(LOAD(&dest[i]), ialpha));
}
}
}
@@ -146,9 +172,9 @@ void QT_FASTCALL comp_func_solid_Source_rgb64(QRgba64 *dest, int length, QRgba64
qt_memfill64((quint64*)dest, color, length);
else {
int ialpha = 255 - const_alpha;
- color = multiplyAlpha255(color, const_alpha);
+ auto c = multiplyAlpha255(CONVERT(color), const_alpha);
for (int i = 0; i < length; ++i) {
- dest[i] = color + multiplyAlpha255(dest[i], ialpha);
+ STORE(&dest[i], ADD(c, multiplyAlpha255(LOAD(&dest[i]), ialpha)));
}
}
}
@@ -174,7 +200,7 @@ void QT_FASTCALL comp_func_Source_rgb64(QRgba64 *Q_DECL_RESTRICT dest, const QRg
else {
int ialpha = 255 - const_alpha;
for (int i = 0; i < length; ++i) {
- dest[i] = interpolate255(src[i], const_alpha, dest[i], ialpha);
+ STORE(&dest[i], interpolate255(LOAD(&src[i]), const_alpha, LOAD(&dest[i]), ialpha));
}
}
}
@@ -221,10 +247,12 @@ void QT_FASTCALL comp_func_solid_SourceOver_rgb64(QRgba64 *dest, int length, QRg
if (const_alpha == 255 && color.isOpaque()) {
qt_memfill64((quint64*)dest, color, length);
} else {
+ auto c = CONVERT(color);
if (const_alpha != 255)
- color = multiplyAlpha255(color, const_alpha);
+ c = multiplyAlpha255(c, const_alpha);
+ auto cAlpha = INVALPHA(c);
for (int i = 0; i < length; ++i) {
- dest[i] = color + multiplyAlpha65535(dest[i], 65535 - color.alpha());
+ STORE(&dest[i], ADD(c, multiplyAlpha65535(LOAD(&dest[i]), cAlpha)));
}
}
}
@@ -258,12 +286,12 @@ void QT_FASTCALL comp_func_SourceOver_rgb64(QRgba64 *Q_DECL_RESTRICT dest, const
if (s.isOpaque())
dest[i] = s;
else if (!s.isTransparent())
- dest[i] = s + multiplyAlpha65535(dest[i], 65535 - s.alpha());
+ STORE(&dest[i], ADD(CONVERT(s), multiplyAlpha65535(LOAD(&dest[i]), 65535 - s.alpha())));
}
} else {
for (int i = 0; i < length; ++i) {
- QRgba64 s = multiplyAlpha255(src[i], const_alpha);
- dest[i] = s + multiplyAlpha65535(dest[i], 65535 - s.alpha());
+ auto s = multiplyAlpha255(LOAD(&src[i]), const_alpha);
+ STORE(&dest[i], ADD(s, multiplyAlpha65535(LOAD(&dest[i]), INVALPHA(s))));
}
}
}
@@ -287,11 +315,12 @@ void QT_FASTCALL comp_func_solid_DestinationOver(uint *dest, int length, uint co
void QT_FASTCALL comp_func_solid_DestinationOver_rgb64(QRgba64 *dest, int length, QRgba64 color, uint const_alpha)
{
+ auto c = CONVERT(color);
if (const_alpha != 255)
- color = multiplyAlpha255(color, const_alpha);
+ c = multiplyAlpha255(c, const_alpha);
for (int i = 0; i < length; ++i) {
- QRgba64 d = dest[i];
- dest[i] = d + multiplyAlpha65535(color, 65535 - d.alpha());
+ auto d = LOAD(&dest[i]);
+ STORE(&dest[i], ADD(d, multiplyAlpha65535(c, INVALPHA(d))));
}
}
@@ -318,14 +347,14 @@ void QT_FASTCALL comp_func_DestinationOver_rgb64(QRgba64 *Q_DECL_RESTRICT dest,
{
if (const_alpha == 255) {
for (int i = 0; i < length; ++i) {
- QRgba64 d = dest[i];
- dest[i] = d + multiplyAlpha65535(src[i], 65535 - d.alpha());
+ auto d = LOAD(&dest[i]);
+ STORE(&dest[i], ADD(d, multiplyAlpha65535(LOAD(&src[i]), INVALPHA(d))));
}
} else {
for (int i = 0; i < length; ++i) {
- QRgba64 d = dest[i];
- QRgba64 s = multiplyAlpha255(src[i], const_alpha);
- dest[i] = d + multiplyAlpha65535(s, 65535 - d.alpha());
+ auto d = LOAD(&dest[i]);
+ auto s = multiplyAlpha255(LOAD(&src[i]), const_alpha);
+ STORE(&dest[i], ADD(d, multiplyAlpha65535(s, INVALPHA(d))));
}
}
}
@@ -393,15 +422,15 @@ void QT_FASTCALL comp_func_SourceIn_rgb64(QRgba64 *Q_DECL_RESTRICT dest, const Q
{
if (const_alpha == 255) {
for (int i = 0; i < length; ++i) {
- dest[i] = multiplyAlpha65535(src[i], dest[i].alpha());
+ STORE(&dest[i], multiplyAlpha65535(LOAD(&src[i]), dest[i].alpha()));
}
} else {
uint ca = const_alpha * 257;
- uint cia = 65535 - ca;
+ auto cia = CONST(65535 - ca);
for (int i = 0; i < length; ++i) {
- QRgba64 d = dest[i];
- QRgba64 s = multiplyAlpha65535(src[i], ca);
- dest[i] = interpolate65535(s, d.alpha(), d, cia);
+ auto d = LOAD(&dest[i]);
+ auto s = multiplyAlpha65535(LOAD(&src[i]), ca);
+ STORE(&dest[i], interpolate65535(s, ALPHA(d), d, cia));
}
}
}
@@ -431,7 +460,7 @@ void QT_FASTCALL comp_func_solid_DestinationIn_rgb64(QRgba64 *dest, int length,
if (const_alpha != 255)
a = qt_div_65535(a * ca64k) + 65535 - ca64k;
for (int i = 0; i < length; ++i) {
- dest[i] = multiplyAlpha65535(dest[i], a);
+ STORE(&dest[i], multiplyAlpha65535(LOAD(&dest[i]), a));
}
}
@@ -885,14 +914,19 @@ void QT_FASTCALL comp_func_solid_Plus(uint *dest, int length, uint color, uint c
void QT_FASTCALL comp_func_solid_Plus_rgb64(QRgba64 *dest, int length, QRgba64 color, uint const_alpha)
{
+ auto b = CONVERT(color);
if (const_alpha == 255) {
for (int i = 0; i < length; ++i) {
- dest[i] = addWithSaturation(dest[i], color);
+ auto a = LOAD(&dest[i]);
+ a = addWithSaturation(a, b);
+ STORE(&dest[i], a);
}
} else {
for (int i = 0; i < length; ++i) {
- QRgba64 d = addWithSaturation(dest[i], color);
- dest[i] = interpolate255(d, const_alpha, dest[i], 255 - const_alpha);
+ auto a = LOAD(&dest[i]);
+ auto d = addWithSaturation(a, b);
+ a = interpolate255(d, const_alpha, a, 255 - const_alpha);
+ STORE(&dest[i], a);
}
}
}
@@ -924,12 +958,18 @@ void QT_FASTCALL comp_func_Plus_rgb64(QRgba64 *Q_DECL_RESTRICT dest, const QRgba
{
if (const_alpha == 255) {
for (int i = 0; i < length; ++i) {
- dest[i] = addWithSaturation(dest[i], src[i]);
+ auto a = LOAD(&dest[i]);
+ auto b = LOAD(&src[i]);
+ a = addWithSaturation(a, b);
+ STORE(&dest[i], a);
}
} else {
for (int i = 0; i < length; ++i) {
- QRgba64 d = addWithSaturation(dest[i], src[i]);
- dest[i] = interpolate255(d, const_alpha, dest[i], 255 - const_alpha);
+ auto a = LOAD(&dest[i]);
+ auto b = LOAD(&src[i]);
+ auto d = addWithSaturation(a, b);
+ a = interpolate255(d, const_alpha, a, 255 - const_alpha);
+ STORE(&dest[i], a);
}
}
}
diff --git a/src/gui/painting/qrgba64_p.h b/src/gui/painting/qrgba64_p.h
index cf3dad5f90..b88a5d4c92 100644
--- a/src/gui/painting/qrgba64_p.h
+++ b/src/gui/painting/qrgba64_p.h
@@ -72,27 +72,49 @@ inline QRgba64 multiplyAlpha256(QRgba64 rgba64, uint alpha256)
inline QRgba64 multiplyAlpha65535(QRgba64 rgba64, uint alpha65535)
{
+ return QRgba64::fromRgba64(qt_div_65535(rgba64.red() * alpha65535),
+ qt_div_65535(rgba64.green() * alpha65535),
+ qt_div_65535(rgba64.blue() * alpha65535),
+ qt_div_65535(rgba64.alpha() * alpha65535));
+}
+
#ifdef __SSE2__
- const __m128i va = _mm_shufflelo_epi16(_mm_cvtsi32_si128(alpha65535), _MM_SHUFFLE(0, 0, 0, 0));
- __m128i vs = _mm_loadl_epi64((__m128i*)&rgba64);
+Q_ALWAYS_INLINE __m128i multiplyAlpha65535(__m128i rgba64, __m128i va)
+{
+ __m128i vs = rgba64;
vs = _mm_unpacklo_epi16(_mm_mullo_epi16(vs, va), _mm_mulhi_epu16(vs, va));
vs = _mm_add_epi32(vs, _mm_srli_epi32(vs, 16));
vs = _mm_add_epi32(vs, _mm_set1_epi32(0x8000));
vs = _mm_srai_epi32(vs, 16);
vs = _mm_packs_epi32(vs, _mm_setzero_si128());
- _mm_storel_epi64((__m128i*)&rgba64, vs);
- return rgba64;
-#else
- return QRgba64::fromRgba64(qt_div_65535(rgba64.red() * alpha65535),
- qt_div_65535(rgba64.green() * alpha65535),
- qt_div_65535(rgba64.blue() * alpha65535),
- qt_div_65535(rgba64.alpha() * alpha65535));
+ return vs;
+}
+Q_ALWAYS_INLINE __m128i multiplyAlpha65535(__m128i rgba64, uint alpha65535)
+{
+ const __m128i va = _mm_shufflelo_epi16(_mm_cvtsi32_si128(alpha65535), _MM_SHUFFLE(0, 0, 0, 0));
+ return multiplyAlpha65535(rgba64, va);
+}
#endif
+
+#if defined(__ARM_NEON__)
+Q_ALWAYS_INLINE uint16x4_t multiplyAlpha65535(uint16x4_t rgba64, uint16x4_t alpha65535)
+{
+ uint32x4_t vs32 = vmull_u16(rgba64, alpha65535); // vs = vs * alpha
+ vs32 = vsraq_n_u32(vs32, vs32, 16); // vs = vs + (vs >> 16)
+ return vrshrn_n_u32(vs32, 16); // vs = (vs + 0x8000) >> 16
}
+Q_ALWAYS_INLINE uint16x4_t multiplyAlpha65535(uint16x4_t rgba64, uint alpha65535)
+{
+ uint32x4_t vs32 = vmull_n_u16(rgba64, alpha65535); // vs = vs * alpha
+ vs32 = vsraq_n_u32(vs32, vs32, 16); // vs = vs + (vs >> 16)
+ return vrshrn_n_u32(vs32, 16); // vs = (vs + 0x8000) >> 16
+}
+#endif
-inline QRgba64 multiplyAlpha255(QRgba64 rgba64, uint alpha255)
+template<typename T>
+inline T multiplyAlpha255(T rgba64, uint alpha255)
{
-#ifdef __SSE2__
+#if defined(__SSE2__) || defined(__ARM_NEON__)
return multiplyAlpha65535(rgba64, alpha255 * 257);
#else
return QRgba64::fromRgba64(qt_div_255(rgba64.red() * alpha255),
@@ -112,25 +134,69 @@ inline QRgba64 interpolate255(QRgba64 x, uint alpha1, QRgba64 y, uint alpha2)
return QRgba64::fromRgba64(multiplyAlpha255(x, alpha1) + multiplyAlpha255(y, alpha2));
}
+#if defined __SSE2__
+Q_ALWAYS_INLINE __m128i interpolate255(__m128i x, uint alpha1, __m128i y, uint alpha2)
+{
+ return _mm_add_epi32(multiplyAlpha255(x, alpha1), multiplyAlpha255(y, alpha2));
+}
+#endif
+
+#if defined __ARM_NEON__
+Q_ALWAYS_INLINE uint16x4_t interpolate255(uint16x4_t x, uint alpha1, uint16x4_t y, uint alpha2)
+{
+ return vadd_u16(multiplyAlpha255(x, alpha1), multiplyAlpha255(y, alpha2));
+}
+#endif
+
inline QRgba64 interpolate65535(QRgba64 x, uint alpha1, QRgba64 y, uint alpha2)
{
return QRgba64::fromRgba64(multiplyAlpha65535(x, alpha1) + multiplyAlpha65535(y, alpha2));
}
+#if defined __SSE2__
+Q_ALWAYS_INLINE __m128i interpolate65535(__m128i x, uint alpha1, __m128i y, uint alpha2)
+{
+ return _mm_add_epi32(multiplyAlpha65535(x, alpha1), multiplyAlpha65535(y, alpha2));
+}
+// alpha2 below is const-ref because otherwise MSVC2013 complains that it can't 16-byte align the argument.
+Q_ALWAYS_INLINE __m128i interpolate65535(__m128i x, __m128i alpha1, __m128i y, const __m128i &alpha2)
+{
+ return _mm_add_epi32(multiplyAlpha65535(x, alpha1), multiplyAlpha65535(y, alpha2));
+}
+#endif
+
+#if defined __ARM_NEON__
+Q_ALWAYS_INLINE uint16x4_t interpolate65535(uint16x4_t x, uint alpha1, uint16x4_t y, uint alpha2)
+{
+ return vadd_u16(multiplyAlpha65535(x, alpha1), multiplyAlpha65535(y, alpha2));
+}
+Q_ALWAYS_INLINE uint16x4_t interpolate65535(uint16x4_t x, uint16x4_t alpha1, uint16x4_t y, uint16x4_t alpha2)
+{
+ return vadd_u16(multiplyAlpha65535(x, alpha1), multiplyAlpha65535(y, alpha2));
+}
+#endif
+
inline QRgba64 addWithSaturation(QRgba64 a, QRgba64 b)
{
-#if defined(__SSE2__) && defined(Q_PROCESSOR_X86_64)
- __m128i va = _mm_cvtsi64_si128((quint64)a);
- __m128i vb = _mm_cvtsi64_si128((quint64)b);
- va = _mm_adds_epu16(va, vb);
- return QRgba64::fromRgba64(_mm_cvtsi128_si64(va));
-#else
return QRgba64::fromRgba64(qMin(a.red() + b.red(), 65535),
qMin(a.green() + b.green(), 65535),
qMin(a.blue() + b.blue(), 65535),
qMin(a.alpha() + b.alpha(), 65535));
+}
+
+#if defined(__SSE2__)
+Q_ALWAYS_INLINE __m128i addWithSaturation(__m128i a, __m128i b)
+{
+ return _mm_adds_epu16(a, b);
+}
#endif
+
+#if defined(__ARM_NEON__)
+Q_ALWAYS_INLINE uint16x4_t addWithSaturation(uint16x4_t a, uint16x4_t b)
+{
+ return vqmovn_u32(vaddl_u16(a, b));
}
+#endif
QT_END_NAMESPACE
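A note on the rounding sequence the NEON comments above describe (vs += vs >> 16, followed by a rounding narrowing shift by 16): it approximates division by 65535 using only shifts, since 1/65535 is close to (1 + 2^-16) * 2^-16. The standalone check below (plain C++, no Qt; the one-LSB tolerance is my characterization of the approximation, not a claim made by the patch) verifies that the sequence stays within one least-significant bit of exact rounded division for sampled 16-bit products.

// Compares the shift-based division by 65535 used in multiplyAlpha65535,
// (x + (x >> 16) + 0x8000) >> 16, against exact rounded division.
#include <cstdint>
#include <cstdio>

static uint32_t shiftDiv65535(uint32_t x)
{
    return (x + (x >> 16) + 0x8000u) >> 16;
}

int main()
{
    for (uint32_t c = 0; c <= 65535; c += 31) {
        for (uint32_t a = 0; a <= 65535; a += 29) {
            uint32_t x = c * a;                     // fits: 65535^2 < 2^32
            uint32_t exact = (x + 32767u) / 65535u; // round-to-nearest (no ties: 65535 is odd)
            uint32_t approx = shiftDiv65535(x);
            if (approx != exact && approx + 1 != exact) {
                std::printf("off by more than 1 at c=%u a=%u\n", c, a);
                return 1;
            }
        }
    }
    std::printf("within 1 LSB of rounded division for all sampled products\n");
    return 0;
}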