diff options
author | Allan Sandfeld Jensen <allan.jensen@qt.io> | 2024-03-14 11:07:44 +0100 |
---|---|---|
committer | Allan Sandfeld Jensen <allan.jensen@qt.io> | 2024-03-15 19:23:19 +0100 |
commit | f944651e3db01a73b10212926a7b1c7aad5eb83e (patch) | |
tree | 7f9c3414fb74e2f423ff76594fb3c8d90dfb0233 /src/gui/painting | |
parent | 27a3d3ac9001eb467829397f4ad02355a8d51b2a (diff) |
Optimize Newton-Raphson cuberoot with SSE2/SSE4.1
Do all colors in parallel using SIMD.
Change-Id: I36cb47888d92c4244b5ea7a91c8d84ac3656c56a
Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
Diffstat (limited to 'src/gui/painting')
-rw-r--r-- | src/gui/painting/qcolormatrix_p.h | 36 |
1 files changed, 36 insertions, 0 deletions
diff --git a/src/gui/painting/qcolormatrix_p.h b/src/gui/painting/qcolormatrix_p.h index c2ecd4617d..e1d65bb6fe 100644 --- a/src/gui/painting/qcolormatrix_p.h +++ b/src/gui/painting/qcolormatrix_p.h @@ -18,6 +18,7 @@ #include <QtGui/qtguiglobal.h> #include <QtCore/qpoint.h> #include <QtCore/private/qglobal_p.h> +#include <QtCore/private/qsimd_p.h> #include <cmath> QT_BEGIN_NAMESPACE @@ -74,6 +75,40 @@ public: constexpr QColorVector ref = D50(); constexpr float eps = 0.008856f; constexpr float kap = 903.3f; +#if defined(__SSE2__) + const __m128 iref = _mm_setr_ps(1.f / ref.x, 1.f / ref.y, 1.f / ref.z, 0.f); + __m128 v = _mm_loadu_ps(&x); + v = _mm_mul_ps(v, iref); + + const __m128 f3 = _mm_set1_ps(3.f); + __m128 est = _mm_add_ps(_mm_set1_ps(0.25f), _mm_mul_ps(v, _mm_set1_ps(0.75f))); // float est = 0.25f + (x * 0.75f); + __m128 estsq = _mm_mul_ps(est, est); + est = _mm_sub_ps(est, _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(estsq, est), v), + _mm_rcp_ps(_mm_mul_ps(estsq, f3)))); // est -= ((est * est * est) - x) / (3.f * (est * est)); + estsq = _mm_mul_ps(est, est); + est = _mm_sub_ps(est, _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(estsq, est), v), + _mm_rcp_ps(_mm_mul_ps(estsq, f3)))); // est -= ((est * est * est) - x) / (3.f * (est * est)); + estsq = _mm_mul_ps(est, est); + est = _mm_sub_ps(est, _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(estsq, est), v), + _mm_rcp_ps(_mm_mul_ps(estsq, f3)))); // est -= ((est * est * est) - x) / (3.f * (est * est)); + estsq = _mm_mul_ps(est, est); + est = _mm_sub_ps(est, _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(estsq, est), v), + _mm_rcp_ps(_mm_mul_ps(estsq, f3)))); // est -= ((est * est * est) - x) / (3.f * (est * est)); + + __m128 kapmul = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(v, _mm_set1_ps(kap)), _mm_set1_ps(16.f)), + _mm_set1_ps(1.f / 116.f)); // f_ = (kap * f_ + 16.f) * (1.f / 116.f); + __m128 cmpgt = _mm_cmpgt_ps(v, _mm_set1_ps(eps)); // if (f_ > eps) +#if defined(__SSE4_1__) + v = _mm_blendv_ps(kapmul, est, cmpgt); // if (..) f_ =.. else f_ =.. +#else + v = _mm_or_ps(_mm_and_ps(cmpgt, est), _mm_andnot_ps(cmpgt, kapmul)); +#endif + QColorVector out; + _mm_store_ps(&out.x, v); + const float L = 116.f * out.y - 16.f; + const float a = 500.f * (out.x - out.y); + const float b = 200.f * (out.y - out.z); +#else float xr = x * (1.f / ref.x); float yr = y * (1.f / ref.y); float zr = z * (1.f / ref.z); @@ -95,6 +130,7 @@ public: const float L = 116.f * fy - 16.f; const float a = 500.f * (fx - fy); const float b = 200.f * (fy - fz); +#endif // We output Lab values that has been scaled to 0.0->1.0 values, see also labToXyz. return QColorVector(L * (1.f / 100.f), (a + 128.f) * (1.f / 255.f), (b + 128.f) * (1.f / 255.f)); } |