Improve vectorized bound calculations

Adds a proper bound calculation to vectorized rotated sampling to avoid checking inside the loop, and fixes the existing bounds check to use the incrementor fdx instead of the float value it was calculated from, which may round differently. Change-Id: I5226926a142573c026db5504414204b6ee8dd8a7 Reviewed-by: Olivier Goffart (Woboq GmbH) <ogoffart@woboq.com>
author: Allan Sandfeld Jensen <allan.jensen@theqtcompany.com> 2016-08-10 11:58:24 +0200
committer: Allan Sandfeld Jensen <allan.jensen@qt.io> 2016-08-12 22:27:17 +0000
commit: fcd3c11945c20c1b314fee57f7bbe457412844bc (patch)
tree: 1d71ae3e0d75b203a77f70623abf4d7233063ccb /src
parent: 4a1bafcc4ee5b7d968620808e155c1617aa6f273 (diff)
1 files changed, 19 insertions, 14 deletions
diff --git a/src/gui/painting/qdrawhelper.cpp b/src/gui/painting/qdrawhelper.cpp
index 4812cee9bb..affbc43afe 100644
--- a/src/gui/painting/qdrawhelper.cpp
+++ b/src/gui/painting/qdrawhelper.cpp
@@ -2180,6 +2180,8 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c
 
                 if (blendType != BlendTransformedBilinearTiled) {
 #define BILINEAR_DOWNSCALE_BOUNDS_PROLOG \
+                    const qint64 min_fx = qint64(image_x1) * fixed_scale; \
+                    const qint64 max_fx = qint64(image_x2) * fixed_scale; \
                     while (b < end) { \
                         int x1 = (fx >> 16); \
                         int x2; \
@@ -2195,11 +2197,11 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c
                         fx += fdx; \
                         ++b; \
                     } \
-                    uint *boundedEnd; \
+                    uint *boundedEnd = end; \
                     if (fdx > 0) \
-                        boundedEnd = qMin(end, buffer + uint((image_x2 - (fx >> 16)) / data->m11)); \
-                    else \
-                        boundedEnd = qMin(end, buffer + uint((image_x1 - (fx >> 16)) / data->m11)); \
+                        boundedEnd = qMin(boundedEnd, b + (max_fx - fx) / fdx); \
+                    else if (fdx < 0) \
+                        boundedEnd = qMin(boundedEnd, b + (min_fx - fx) / fdx); \
                     boundedEnd -= 3;
 
 #if defined(__SSE2__)
@@ -2333,6 +2335,10 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c
 
                 if (blendType != BlendTransformedBilinearTiled) {
 #define BILINEAR_ROTATE_BOUNDS_PROLOG \
+                    const qint64 min_fx = qint64(image_x1) * fixed_scale; \
+                    const qint64 max_fx = qint64(image_x2) * fixed_scale; \
+                    const qint64 min_fy = qint64(image_y1) * fixed_scale; \
+                    const qint64 max_fy = qint64(image_y2) * fixed_scale; \
                     while (b < end) { \
                         int x1 = (fx >> 16); \
                         int x2; \
@@ -2355,7 +2361,15 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c
                         fy += fdy; \
                         ++b; \
                     } \
-                    uint *boundedEnd = end - 3; \
+                    uint *boundedEnd = end; \
+                    if (fdx > 0) \
+                        boundedEnd = qMin(boundedEnd, b + (max_fx - fx) / fdx); \
+                    else if (fdx < 0) \
+                        boundedEnd = qMin(boundedEnd, b + (min_fx - fx) / fdx); \
+                    if (fdy > 0) \
+                        boundedEnd = qMin(boundedEnd, b + (max_fy - fy) / fdy); \
+                    else if (fdy < 0) \
+                        boundedEnd = qMin(boundedEnd, b + (min_fy - fy) / fdy); \
                     boundedEnd -= 3;
 
 #if defined(__SSE2__)
@@ -2374,15 +2388,6 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c
                     const __m128i vbpl = _mm_shufflelo_epi16(_mm_cvtsi32_si128(bytesPerLine/4), _MM_SHUFFLE(0, 0, 0, 0));
 
                     while (b < boundedEnd) {
-                        if (fdx > 0 && (short)_mm_extract_epi16(v_fx, 7) >= image_x2)
-                            break;
-                        if (fdx < 0 && (short)_mm_extract_epi16(v_fx, 7) < image_x1)
-                            break;
-                        if (fdy > 0 && (short)_mm_extract_epi16(v_fy, 7) >= image_y2)
-                            break;
-                        if (fdy < 0 && (short)_mm_extract_epi16(v_fy, 7) < image_y1)
-                            break;
-
                         const __m128i vy = _mm_packs_epi32(_mm_srli_epi32(v_fy, 16), _mm_setzero_si128());
                         // 4x16bit * 4x16bit -> 4x32bit
                         __m128i offset = _mm_unpacklo_epi16(_mm_mullo_epi16(vy, vbpl), _mm_mulhi_epi16(vy, vbpl));
author	Allan Sandfeld Jensen <allan.jensen@theqtcompany.com>	2016-08-10 11:58:24 +0200
committer	Allan Sandfeld Jensen <allan.jensen@qt.io>	2016-08-12 22:27:17 +0000
commit	fcd3c11945c20c1b314fee57f7bbe457412844bc (patch)
tree	1d71ae3e0d75b203a77f70623abf4d7233063ccb /src
parent	4a1bafcc4ee5b7d968620808e155c1617aa6f273 (diff)