From 4b6542c9ffa59eba6e82c0ecbb04dab361e3632f Mon Sep 17 00:00:00 2001 From: Allan Sandfeld Jensen Date: Fri, 23 Mar 2018 10:19:11 +0100 Subject: Remove last uses of interpolate_4_pixels_16 on SSE2 and NEON With SSE2 or NEON, interpolate_4_pixels is faster than interpolate_4_pixels_16, and using it saves a branch of duplicated code. Similar changes had already been made in other places where it was used; those have been updated to follow a similar logic. Change-Id: I040d96480f7f925f659602f66f931d28b59312a5 Reviewed-by: Lars Knoll Reviewed-by: Eirik Aavitsland --- src/gui/painting/qdrawhelper.cpp | 96 ++++++++++++++++++---------------------- src/gui/painting/qdrawhelper_p.h | 9 ++++ 2 files changed, 52 insertions(+), 53 deletions(-) diff --git a/src/gui/painting/qdrawhelper.cpp b/src/gui/painting/qdrawhelper.cpp index f08038e34b..f3df62b855 100644 --- a/src/gui/painting/qdrawhelper.cpp +++ b/src/gui/painting/qdrawhelper.cpp @@ -2249,17 +2249,13 @@ static void QT_FASTCALL fetchTransformedBilinearARGB32PM_downscale_helper(uint * #endif while (b < boundedEnd) { int x = (fx >> 16); -#if defined(__SSE2__) || defined(__ARM_NEON__) - int distx8 = (fx & 0x0000ffff) >> 8; - *b = interpolate_4_pixels(s1 + x, s2 + x, distx8, disty8); -#else - uint tl = s1[x]; - uint tr = s1[x + 1]; - uint bl = s2[x]; - uint br = s2[x + 1]; - int distx4 = ((fx & 0x0000ffff) + 0x0800) >> 12; - *b = interpolate_4_pixels_16(tl, tr, bl, br, distx4, disty4); -#endif + if (hasFastInterpolate4()) { + int distx8 = (fx & 0x0000ffff) >> 8; + *b = interpolate_4_pixels(s1 + x, s2 + x, distx8, disty8); + } else { + int distx4 = ((fx & 0x0000ffff) + 0x0800) >> 12; + *b = interpolate_4_pixels_16(s1[x], s1[x + 1], s2[x], s2[x + 1], distx4, disty4); + } fx += fdx; ++b; } @@ -2273,14 +2269,13 @@ static void QT_FASTCALL fetchTransformedBilinearARGB32PM_downscale_helper(uint * uint tr = s1[x2]; uint bl = s2[x1]; uint br = s2[x2]; -#if defined(__SSE2__) || defined(__ARM_NEON__) - // The optimized interpolate_4_pixels are 
faster than interpolate_4_pixels_16. - int distx8 = (fx & 0x0000ffff) >> 8; - *b = interpolate_4_pixels(tl, tr, bl, br, distx8, disty8); -#else - int distx4 = ((fx & 0x0000ffff) + 0x0800) >> 12; - *b = interpolate_4_pixels_16(tl, tr, bl, br, distx4, disty4); -#endif + if (hasFastInterpolate4()) { + int distx8 = (fx & 0x0000ffff) >> 8; + *b = interpolate_4_pixels(tl, tr, bl, br, distx8, disty8); + } else { + int distx4 = ((fx & 0x0000ffff) + 0x0800) >> 12; + *b = interpolate_4_pixels_16(tl, tr, bl, br, distx4, disty4); + } fx += fdx; ++b; } @@ -2345,15 +2340,15 @@ static void QT_FASTCALL fetchTransformedBilinearARGB32PM_fast_rotate_helper(uint uint tr = s1[x2]; uint bl = s2[x1]; uint br = s2[x2]; -#if defined(__SSE2__) || defined(__ARM_NEON__) - int distx = (fx & 0x0000ffff) >> 8; - int disty = (fy & 0x0000ffff) >> 8; - *b = interpolate_4_pixels(tl, tr, bl, br, distx, disty); -#else - int distx = ((fx & 0x0000ffff) + 0x0800) >> 12; - int disty = ((fy & 0x0000ffff) + 0x0800) >> 12; - *b = interpolate_4_pixels_16(tl, tr, bl, br, distx, disty); -#endif + if (hasFastInterpolate4()) { + int distx = (fx & 0x0000ffff) >> 8; + int disty = (fy & 0x0000ffff) >> 8; + *b = interpolate_4_pixels(tl, tr, bl, br, distx, disty); + } else { + int distx = ((fx & 0x0000ffff) + 0x0800) >> 12; + int disty = ((fy & 0x0000ffff) + 0x0800) >> 12; + *b = interpolate_4_pixels_16(tl, tr, bl, br, distx, disty); + } fx += fdx; fy += fdy; ++b; @@ -2495,19 +2490,15 @@ static void QT_FASTCALL fetchTransformedBilinearARGB32PM_fast_rotate_helper(uint const uint *s1 = (const uint *)image.scanLine(y); const uint *s2 = (const uint *)image.scanLine(y + 1); -#if defined(__SSE2__) || defined(__ARM_NEON__) - int distx = (fx & 0x0000ffff) >> 8; - int disty = (fy & 0x0000ffff) >> 8; - *b = interpolate_4_pixels(s1 + x, s2 + x, distx, disty); -#else - uint tl = s1[x]; - uint tr = s1[x + 1]; - uint bl = s2[x]; - uint br = s2[x + 1]; - int distx = ((fx & 0x0000ffff) + 0x0800) >> 12; - int disty = ((fy & 
0x0000ffff) + 0x0800) >> 12; - *b = interpolate_4_pixels_16(tl, tr, bl, br, distx, disty); -#endif + if (hasFastInterpolate4()) { + int distx = (fx & 0x0000ffff) >> 8; + int disty = (fy & 0x0000ffff) >> 8; + *b = interpolate_4_pixels(s1 + x, s2 + x, distx, disty); + } else { + int distx = ((fx & 0x0000ffff) + 0x0800) >> 12; + int disty = ((fy & 0x0000ffff) + 0x0800) >> 12; + *b = interpolate_4_pixels_16(s1[x], s1[x + 1], s2[x], s2[x + 1], distx, disty); + } fx += fdx; fy += fdy; @@ -2532,16 +2523,15 @@ static void QT_FASTCALL fetchTransformedBilinearARGB32PM_fast_rotate_helper(uint uint bl = s2[x1]; uint br = s2[x2]; -#if defined(__SSE2__) || defined(__ARM_NEON__) - // The optimized interpolate_4_pixels are faster than interpolate_4_pixels_16. - int distx = (fx & 0x0000ffff) >> 8; - int disty = (fy & 0x0000ffff) >> 8; - *b = interpolate_4_pixels(tl, tr, bl, br, distx, disty); -#else - int distx = ((fx & 0x0000ffff) + 0x0800) >> 12; - int disty = ((fy & 0x0000ffff) + 0x0800) >> 12; - *b = interpolate_4_pixels_16(tl, tr, bl, br, distx, disty); -#endif + if (hasFastInterpolate4()) { + int distx = (fx & 0x0000ffff) >> 8; + int disty = (fy & 0x0000ffff) >> 8; + *b = interpolate_4_pixels(tl, tr, bl, br, distx, disty); + } else { + int distx = ((fx & 0x0000ffff) + 0x0800) >> 12; + int disty = ((fy & 0x0000ffff) + 0x0800) >> 12; + *b = interpolate_4_pixels_16(tl, tr, bl, br, distx, disty); + } fx += fdx; fy += fdy; @@ -2939,7 +2929,7 @@ static const uint *QT_FASTCALL fetchTransformedBilinear(uint *buffer, const Oper layout->convertToARGB32PM(buf1, buf1, len * 2, clut, 0); layout->convertToARGB32PM(buf2, buf2, len * 2, clut, 0); - if (qAbs(data->m22) < qreal(1./8.)) { // scale up more than 8x (on Y) + if (hasFastInterpolate4() || qAbs(data->m22) < qreal(1./8.)) { // scale up more than 8x (on Y) int disty = (fy & 0x0000ffff) >> 8; for (int i = 0; i < len; ++i) { int distx = (fx & 0x0000ffff) >> 8; @@ -2974,7 +2964,7 @@ static const uint *QT_FASTCALL 
fetchTransformedBilinear(uint *buffer, const Oper layout->convertToARGB32PM(buf1, buf1, len * 2, clut, 0); layout->convertToARGB32PM(buf2, buf2, len * 2, clut, 0); - if (qAbs(data->m11) < qreal(1./8.)|| qAbs(data->m22) < qreal(1./8.)) { + if (hasFastInterpolate4() || qAbs(data->m11) < qreal(1./8.) || qAbs(data->m22) < qreal(1./8.)) { // If we are zooming more than 8 times, we use 8bit precision for the position. for (int i = 0; i < len; ++i) { int distx = (fx & 0x0000ffff) >> 8; diff --git a/src/gui/painting/qdrawhelper_p.h b/src/gui/painting/qdrawhelper_p.h index ebf215a3eb..b94fd34b51 100644 --- a/src/gui/painting/qdrawhelper_p.h +++ b/src/gui/painting/qdrawhelper_p.h @@ -685,6 +685,9 @@ static inline uint interpolate_4_pixels(const uint t[], const uint b[], uint dis __m128i vb = _mm_loadl_epi64((const __m128i*)b); return interpolate_4_pixels_sse2(vt, vb, distx, disty); } + +static constexpr inline bool hasFastInterpolate4() { return true; } + #elif defined(__ARM_NEON__) static Q_ALWAYS_INLINE uint interpolate_4_pixels_neon(uint32x2_t vt32, uint32x2_t vb32, uint distx, uint disty) { @@ -717,6 +720,9 @@ static inline uint interpolate_4_pixels(const uint t[], const uint b[], uint dis uint32x2_t vb32 = vld1_u32(b); return interpolate_4_pixels_neon(vt32, vb32, distx, disty); } + +static constexpr inline bool hasFastInterpolate4() { return true; } + #else static inline uint interpolate_4_pixels(uint tl, uint tr, uint bl, uint br, uint distx, uint disty) { @@ -731,6 +737,9 @@ static inline uint interpolate_4_pixels(const uint t[], const uint b[], uint dis { return interpolate_4_pixels(t[0], t[1], b[0], b[1], distx, disty); } + +static constexpr inline bool hasFastInterpolate4() { return false; } + #endif #if Q_BYTE_ORDER == Q_BIG_ENDIAN -- cgit v1.2.3