Optimize Newton-Raphson cuberoot with SSE2/SSE4.1

Do all colors in parallel using SIMD. Change-Id: I36cb47888d92c4244b5ea7a91c8d84ac3656c56a Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
author: Allan Sandfeld Jensen <allan.jensen@qt.io> 2024-03-14 11:07:44 +0100
committer: Allan Sandfeld Jensen <allan.jensen@qt.io> 2024-03-15 19:23:19 +0100
commit: f944651e3db01a73b10212926a7b1c7aad5eb83e (patch)
tree: 7f9c3414fb74e2f423ff76594fb3c8d90dfb0233 /src/gui/painting
parent: 27a3d3ac9001eb467829397f4ad02355a8d51b2a (diff)
1 files changed, 36 insertions, 0 deletions
diff --git a/src/gui/painting/qcolormatrix_p.h b/src/gui/painting/qcolormatrix_p.h
index c2ecd4617d..e1d65bb6fe 100644
--- a/src/gui/painting/qcolormatrix_p.h
+++ b/src/gui/painting/qcolormatrix_p.h
@@ -18,6 +18,7 @@
 #include <QtGui/qtguiglobal.h>
 #include <QtCore/qpoint.h>
 #include <QtCore/private/qglobal_p.h>
+#include <QtCore/private/qsimd_p.h>
 #include <cmath>
 
 QT_BEGIN_NAMESPACE
@@ -74,6 +75,40 @@ public:
         constexpr QColorVector ref = D50();
         constexpr float eps = 0.008856f;
         constexpr float kap = 903.3f;
+#if defined(__SSE2__)
+        const __m128 iref = _mm_setr_ps(1.f / ref.x, 1.f / ref.y, 1.f / ref.z, 0.f);
+        __m128 v = _mm_loadu_ps(&x);
+        v = _mm_mul_ps(v, iref);
+
+        const __m128 f3 = _mm_set1_ps(3.f);
+        __m128 est = _mm_add_ps(_mm_set1_ps(0.25f), _mm_mul_ps(v, _mm_set1_ps(0.75f))); // float est = 0.25f + (x * 0.75f);
+        __m128 estsq = _mm_mul_ps(est, est);
+        est = _mm_sub_ps(est, _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(estsq, est), v),
+                                         _mm_rcp_ps(_mm_mul_ps(estsq, f3)))); // est -= ((est * est * est) - x) / (3.f * (est * est));
+        estsq = _mm_mul_ps(est, est);
+        est = _mm_sub_ps(est, _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(estsq, est), v),
+                                         _mm_rcp_ps(_mm_mul_ps(estsq, f3)))); // est -= ((est * est * est) - x) / (3.f * (est * est));
+        estsq = _mm_mul_ps(est, est);
+        est = _mm_sub_ps(est, _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(estsq, est), v),
+                                         _mm_rcp_ps(_mm_mul_ps(estsq, f3)))); // est -= ((est * est * est) - x) / (3.f * (est * est));
+        estsq = _mm_mul_ps(est, est);
+        est = _mm_sub_ps(est, _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(estsq, est), v),
+                                         _mm_rcp_ps(_mm_mul_ps(estsq, f3)))); // est -= ((est * est * est) - x) / (3.f * (est * est));
+
+        __m128 kapmul = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(v, _mm_set1_ps(kap)), _mm_set1_ps(16.f)),
+                                   _mm_set1_ps(1.f / 116.f)); // f_ = (kap * f_ + 16.f) * (1.f / 116.f);
+        __m128 cmpgt = _mm_cmpgt_ps(v, _mm_set1_ps(eps)); // if (f_ > eps)
+#if defined(__SSE4_1__)
+        v = _mm_blendv_ps(kapmul, est, cmpgt); // if (..) f_ =..  else f_ =..
+#else
+        v = _mm_or_ps(_mm_and_ps(cmpgt, est), _mm_andnot_ps(cmpgt, kapmul));
+#endif
+        QColorVector out;
+        _mm_store_ps(&out.x, v);
+        const float L = 116.f * out.y - 16.f;
+        const float a = 500.f * (out.x - out.y);
+        const float b = 200.f * (out.y - out.z);
+#else
         float xr = x * (1.f / ref.x);
         float yr = y * (1.f / ref.y);
         float zr = z * (1.f / ref.z);
@@ -95,6 +130,7 @@ public:
         const float L = 116.f * fy - 16.f;
         const float a = 500.f * (fx - fy);
         const float b = 200.f * (fy - fz);
+#endif
         // We output Lab values that has been scaled to 0.0->1.0 values, see also labToXyz.
         return QColorVector(L * (1.f / 100.f), (a + 128.f) * (1.f / 255.f), (b + 128.f) * (1.f / 255.f));
     }
author	Allan Sandfeld Jensen <allan.jensen@qt.io>	2024-03-14 11:07:44 +0100
committer	Allan Sandfeld Jensen <allan.jensen@qt.io>	2024-03-15 19:23:19 +0100
commit	f944651e3db01a73b10212926a7b1c7aad5eb83e (patch)
tree	7f9c3414fb74e2f423ff76594fb3c8d90dfb0233 /src/gui/painting
parent	27a3d3ac9001eb467829397f4ad02355a8d51b2a (diff)