From e05443367f60e591556ae8854ecb634a7cf6ea33 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Samuel=20R=C3=B8dal?= <samuel.rodal@nokia.com>
Date: Wed, 13 Apr 2011 10:15:06 +0200
Subject: Improved gradient table generation performance for two-stop
 gradients.

Two stops is a fairly common case so we gain quite a bit by special
casing it. Improves performance by 10 % in parcycle benchmark, and
by 90 % in a synthetic benchmark.

Reviewed-by: Andreas Kling
(cherry picked from commit 5b74a70ac630073582be56f8a0539624a1080185)
---
 src/gui/painting/qdrawhelper_neon.cpp | 38 +++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

(limited to 'src/gui/painting/qdrawhelper_neon.cpp')
diff --git a/src/gui/painting/qdrawhelper_neon.cpp b/src/gui/painting/qdrawhelper_neon.cpp
index debca37486..7eb2f09d7a 100644
--- a/src/gui/painting/qdrawhelper_neon.cpp
+++ b/src/gui/painting/qdrawhelper_neon.cpp
@@ -955,6 +955,44 @@ void qt_memrotate270_16_neon(const uchar *srcPixels, int w, int h,
     }
 }
 
+class QSimdNeon
+{
+public:
+    typedef int32x4_t Int32x4;
+    typedef float32x4_t Float32x4;
+
+    union Vect_buffer_i { Int32x4 v; int i[4]; };
+    union Vect_buffer_f { Float32x4 v; float f[4]; };
+
+    static inline Float32x4 v_dup(float x) { return vdupq_n_f32(x); }
+    static inline Int32x4 v_dup(int x) { return vdupq_n_s32(x); }
+    static inline Int32x4 v_dup(uint x) { return vdupq_n_s32(x); }
+
+    static inline Float32x4 v_add(Float32x4 a, Float32x4 b) { return vaddq_f32(a, b); }
+    static inline Int32x4 v_add(Int32x4 a, Int32x4 b) { return vaddq_s32(a, b); }
+
+    static inline Float32x4 v_max(Float32x4 a, Float32x4 b) { return vmaxq_f32(a, b); }
+    static inline Float32x4 v_min(Float32x4 a, Float32x4 b) { return vminq_f32(a, b); }
+    static inline Int32x4 v_min_16(Int32x4 a, Int32x4 b) { return vminq_s32(a, b); }
+
+    static inline Int32x4 v_and(Int32x4 a, Int32x4 b) { return vandq_s32(a, b); }
+
+    static inline Float32x4 v_sub(Float32x4 a, Float32x4 b) { return vsubq_f32(a, b); }
+    static inline Int32x4 v_sub(Int32x4 a, Int32x4 b) { return vsubq_s32(a, b); }
+
+    static inline Float32x4 v_mul(Float32x4 a, Float32x4 b) { return vmulq_f32(a, b); }
+
+    static inline Float32x4 v_sqrt(Float32x4 x) { Float32x4 y = vrsqrteq_f32(x); y = vmulq_f32(y, vrsqrtsq_f32(x, vmulq_f32(y, y))); return vmulq_f32(x, y); }
+
+    static inline Int32x4 v_toInt(Float32x4 x) { return vcvtq_s32_f32(x); }
+};
+
+const uint * QT_FASTCALL qt_fetch_radial_gradient_neon(uint *buffer, const Operator *op, const QSpanData *data,
+                                                       int y, int x, int length)
+{
+    return qt_fetch_radial_gradient_template<QRadialFetchSimd<QSimdNeon> >(buffer, op, data, y, x, length);
+}
+
 QT_END_NAMESPACE
 
 #endif // QT_HAVE_NEON
-- 
cgit v1.2.3