Vectorize bilinear rotating transforms

This patch reuses the code and pattern used to vectorize scaling, to also vectorize rotating transforms in fetchTransformedBilinearARGB32PM. This provides significant improvements in QtWebKit on benchmarks using rotating transforms. Change-Id: If250e0f5dae1ff0f954301f96cc8970a99ae3e9d Reviewed-by: Thiago Macieira <thiago.macieira@intel.com> Reviewed-by: Gunnar Sletta <gunnar.sletta@jollamobile.com>
author: Allan Sandfeld Jensen <allan.jensen@digia.com> 2014-03-07 14:30:13 +0100
committer: The Qt Project <gerrit-noreply@qt-project.org> 2014-05-05 14:13:50 +0200
commit: 89b70f3910dfec7108d6e95cc96f904fce01da29 (patch)
tree: e1821e7ec8db1caf00768c5e9d6354e7aedaad67 /src/gui/painting
parent: 1f278c6e1126a41e83aec83e2eeacb96ee08e74b (diff)
1 files changed, 157 insertions, 0 deletions
diff --git a/src/gui/painting/qdrawhelper.cpp b/src/gui/painting/qdrawhelper.cpp
index 7a5c5dc660..b5ccafdf9a 100644
--- a/src/gui/painting/qdrawhelper.cpp
+++ b/src/gui/painting/qdrawhelper.cpp
@@ -1709,6 +1709,163 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c
                 }
             } else {
                 //we are zooming less than 8x, use 4bit precision
+
+                if (blendType != BlendTransformedBilinearTiled) {
+#define BILINEAR_ROTATE_BOUNDS_PROLOG \
+                    while (b < end) { \
+                        int x1 = (fx >> 16); \
+                        int x2; \
+                        int y1 = (fy >> 16); \
+                        int y2; \
+                        fetchTransformedBilinear_pixelBounds<blendType>(image_width, image_x1, image_x2, x1, x2); \
+                        fetchTransformedBilinear_pixelBounds<blendType>(image_height, image_y1, image_y2, y1, y2); \
+                        if (x1 != x2 && y1 != y2) \
+                            break; \
+                        const uint *s1 = (const uint *)data->texture.scanLine(y1); \
+                        const uint *s2 = (const uint *)data->texture.scanLine(y2); \
+                        uint tl = s1[x1]; \
+                        uint tr = s1[x2]; \
+                        uint bl = s2[x1]; \
+                        uint br = s2[x2]; \
+                        int distx = (fx & 0x0000ffff) >> 12; \
+                        int disty = (fy & 0x0000ffff) >> 12; \
+                        *b = interpolate_4_pixels_16(tl, tr, bl, br, distx, disty); \
+                        fx += fdx; \
+                        fy += fdy; \
+                        ++b; \
+                    } \
+                    uint *boundedEnd = end - 3; \
+                    boundedEnd -= 3;
+
+#if defined(__SSE2__)
+                    BILINEAR_ROTATE_BOUNDS_PROLOG
+
+                    const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
+                    const __m128i v_256 = _mm_set1_epi16(256);
+                    __m128i v_fdx = _mm_set1_epi32(fdx*4);
+                    __m128i v_fdy = _mm_set1_epi32(fdy*4);
+
+                    const uchar *textureData = data->texture.imageData;
+                    const int bytesPerLine = data->texture.bytesPerLine;
+
+                    union Vect_buffer { __m128i vect; qint32 i[4]; };
+                    Vect_buffer v_fx, v_fy;
+
+                    for (int i = 0; i < 4; i++) {
+                        v_fx.i[i] = fx;
+                        v_fy.i[i] = fy;
+                        fx += fdx;
+                        fy += fdy;
+                    }
+
+                    while (b < boundedEnd) {
+                        if (fdx > 0 && (v_fx.i[3] >> 16) >= image_x2)
+                            break;
+                        if (fdx < 0 && (v_fx.i[3] >> 16) < image_x1)
+                            break;
+                        if (fdy > 0 && (v_fy.i[3] >> 16) >= image_y2)
+                            break;
+                        if (fdy < 0 && (v_fy.i[3] >> 16) < image_y1)
+                            break;
+
+                        Vect_buffer tl, tr, bl, br;
+                        Vect_buffer v_fx_shifted, v_fy_shifted;
+                        v_fx_shifted.vect = _mm_srli_epi32(v_fx.vect, 16);
+                        v_fy_shifted.vect = _mm_srli_epi32(v_fy.vect, 16);
+
+                        for (int i = 0; i < 4; i++) {
+                            const int x1 = v_fx_shifted.i[i];
+                            const int y1 = v_fy_shifted.i[i];
+                            const uchar *sl = textureData + bytesPerLine * y1;
+                            const uint *s1 = (const uint *)sl;
+                            const uint *s2 = (const uint *)(sl + bytesPerLine);
+                            tl.i[i] = s1[x1];
+                            tr.i[i] = s1[x1+1];
+                            bl.i[i] = s2[x1];
+                            br.i[i] = s2[x1+1];
+                        }
+                        __m128i v_distx = _mm_srli_epi16(v_fx.vect, 12);
+                        __m128i v_disty = _mm_srli_epi16(v_fy.vect, 12);
+                        v_distx = _mm_shufflehi_epi16(v_distx, _MM_SHUFFLE(2,2,0,0));
+                        v_distx = _mm_shufflelo_epi16(v_distx, _MM_SHUFFLE(2,2,0,0));
+                        v_disty = _mm_shufflehi_epi16(v_disty, _MM_SHUFFLE(2,2,0,0));
+                        v_disty = _mm_shufflelo_epi16(v_disty, _MM_SHUFFLE(2,2,0,0));
+
+                        interpolate_4_pixels_16_sse2(tl.vect, tr.vect, bl.vect, br.vect, v_distx, v_disty, colorMask, v_256, b);
+                        b+=4;
+                        v_fx.vect = _mm_add_epi32(v_fx.vect, v_fdx);
+                        v_fy.vect = _mm_add_epi32(v_fy.vect, v_fdy);
+                    }
+                    fx = v_fx.i[0];
+                    fy = v_fy.i[0];
+#elif defined(__ARM_NEON__)
+                    BILINEAR_ROTATE_BOUNDS_PROLOG
+
+                    const int16x8_t colorMask = vdupq_n_s16(0x00ff);
+                    const int16x8_t invColorMask = vmvnq_s16(colorMask);
+                    const int16x8_t v_256 = vdupq_n_s16(256);
+                    int32x4_t v_fdx = vdupq_n_s32(fdx*4);
+                    int32x4_t v_fdy = vdupq_n_s32(fdy*4);
+
+                    const uchar *textureData = data->texture.imageData;
+                    const int bytesPerLine = data->texture.bytesPerLine;
+
+                    union Vect_buffer { int32x4_t vect; quint32 i[4]; };
+                    Vect_buffer v_fx, v_fy;
+
+                    for (int i = 0; i < 4; i++) {
+                        v_fx.i[i] = fx;
+                        v_fy.i[i] = fy;
+                        fx += fdx;
+                        fy += fdy;
+                    }
+
+                    const int32x4_t v_ffff_mask = vdupq_n_s32(0x0000ffff);
+
+                    while (b < boundedEnd) {
+                        if (fdx > 0 && (v_fx.i[3] >> 16) >= image_x2)
+                            break;
+                        if (fdx < 0 && (v_fx.i[3] >> 16) < image_x1)
+                            break;
+                        if (fdy > 0 && (v_fy.i[3] >> 16) >= image_y2)
+                            break;
+                        if (fdy < 0 && (v_fy.i[3] >> 16) < image_y1)
+                            break;
+
+                        Vect_buffer tl, tr, bl, br;
+
+                        Vect_buffer v_fx_shifted, v_fy_shifted;
+                        v_fx_shifted.vect = vshrq_n_s32(v_fx.vect, 16);
+                        v_fy_shifted.vect = vshrq_n_s32(v_fy.vect, 16);
+
+                        for (int i = 0; i < 4; i++) {
+                            const int x1 = v_fx_shifted.i[i];
+                            const int y1 = v_fy_shifted.i[i];
+                            const uchar *sl = textureData + bytesPerLine * y1;
+                            const uint *s1 = (const uint *)sl;
+                            const uint *s2 = (const uint *)(sl + bytesPerLine);
+                            tl.i[i] = s1[x1];
+                            tr.i[i] = s1[x1+1];
+                            bl.i[i] = s2[x1];
+                            br.i[i] = s2[x1+1];
+                        }
+
+                        int32x4_t v_distx = vshrq_n_s32(vandq_s32(v_fx.vect, v_ffff_mask), 12);
+                        int32x4_t v_disty = vshrq_n_s32(vandq_s32(v_fy.vect, v_ffff_mask), 12);
+                        v_distx = vorrq_s32(v_distx, vshlq_n_s32(v_distx, 16));
+                        v_disty = vorrq_s32(v_disty, vshlq_n_s32(v_disty, 16));
+                        int16x8_t v_disty_ = vshlq_n_s16(v_disty, 4);
+
+                        interpolate_4_pixels_16_neon(vreinterpretq_s16_s32(tl.vect), vreinterpretq_s16_s32(tr.vect), vreinterpretq_s16_s32(bl.vect), vreinterpretq_s16_s32(br.vect), vreinterpretq_s16_s32(v_distx), v_disty, v_disty_, colorMask, invColorMask, v_256, b);
+                        b+=4;
+                        v_fx.vect = vaddq_s32(v_fx.vect, v_fdx);
+                        v_fy.vect = vaddq_s32(v_fy.vect, v_fdy);
+                    }
+                    fx = v_fx.i[0];
+                    fy = v_fy.i[0];
+#endif
+                }
+
                 while (b < end) {
                     int x1 = (fx >> 16);
                     int x2;
author	Allan Sandfeld Jensen <allan.jensen@digia.com>	2014-03-07 14:30:13 +0100
committer	The Qt Project <gerrit-noreply@qt-project.org>	2014-05-05 14:13:50 +0200
commit	89b70f3910dfec7108d6e95cc96f904fce01da29 (patch)
tree	e1821e7ec8db1caf00768c5e9d6354e7aedaad67 /src/gui/painting
parent	1f278c6e1126a41e83aec83e2eeacb96ee08e74b (diff)