1 files changed, 90 insertions, 94 deletions
diff --git a/src/gui/painting/qdrawhelper.cpp b/src/gui/painting/qdrawhelper.cpp
index ac11b0f324..c697aceaf3 100644
--- a/src/gui/painting/qdrawhelper.cpp
+++ b/src/gui/painting/qdrawhelper.cpp
@@ -2176,7 +2176,8 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c
                 fetchTransformedBilinear_pixelBounds<blendType>(image_height, image_y1, image_y2, y1, y2);
                 const uint *s1 = (const uint *)data->texture.scanLine(y1);
                 const uint *s2 = (const uint *)data->texture.scanLine(y2);
-                int disty = ((fy & 0x0000ffff) + 0x0800) >> 12;
+                const int disty8 = (fy & 0x0000ffff) >> 8;
+                const int disty4 = (disty8 + 0x08) >> 4;
 
                 if (blendType != BlendTransformedBilinearTiled) {
 #define BILINEAR_DOWNSCALE_BOUNDS_PROLOG \
@@ -2188,12 +2189,9 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c
                         fetchTransformedBilinear_pixelBounds<blendType>(image_width, image_x1, image_x2, x1, x2); \
                         if (x1 != x2) \
                             break; \
-                        uint tl = s1[x1]; \
-                        uint tr = s1[x2]; \
-                        uint bl = s2[x1]; \
-                        uint br = s2[x2]; \
-                        int distx = ((fx & 0x0000ffff) + 0x0800) >> 12; \
-                        *b = interpolate_4_pixels_16(tl, tr, bl, br, distx, disty); \
+                        uint top = s1[x1]; \
+                        uint bot = s2[x1]; \
+                        *b = INTERPOLATE_PIXEL_256(top, 256 - disty8, bot, disty8); \
                         fx += fdx; \
                         ++b; \
                     } \
@@ -2209,7 +2207,7 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c
 
                     const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
                     const __m128i v_256 = _mm_set1_epi16(256);
-                    const __m128i v_disty = _mm_set1_epi16(disty);
+                    const __m128i v_disty = _mm_set1_epi16(disty4);
                     const __m128i v_fdx = _mm_set1_epi32(fdx*4);
                     const __m128i v_fx_r = _mm_set1_epi32(0x8);
                     __m128i v_fx = _mm_setr_epi32(fx, fx + fdx, fx + fdx + fdx, fx + fdx + fdx + fdx);
@@ -2241,18 +2239,14 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c
                     const int16x8_t colorMask = vdupq_n_s16(0x00ff);
                     const int16x8_t invColorMask = vmvnq_s16(colorMask);
                     const int16x8_t v_256 = vdupq_n_s16(256);
-                    const int16x8_t v_disty = vdupq_n_s16(disty);
+                    const int16x8_t v_disty = vdupq_n_s16(disty4);
                     const int16x8_t v_disty_ = vshlq_n_s16(v_disty, 4);
                     int32x4_t v_fdx = vdupq_n_s32(fdx*4);
 
                     int32x4_t v_fx = vmovq_n_s32(fx);
-                    fx += fdx;
-                    v_fx = vsetq_lane_s32(fx, v_fx, 1);
-                    fx += fdx;
-                    v_fx = vsetq_lane_s32(fx, v_fx, 2);
-                    fx += fdx;
-                    v_fx = vsetq_lane_s32(fx, v_fx, 3);
-                    fx += fdx;
+                    v_fx = vsetq_lane_s32(fx + fdx, v_fx, 1);
+                    v_fx = vsetq_lane_s32(fx + fdx * 2, v_fx, 2);
+                    v_fx = vsetq_lane_s32(fx + fdx * 3, v_fx, 3);
 
                     const int32x4_t v_ffff_mask = vdupq_n_s32(0x0000ffff);
                     const int32x4_t v_fx_r = vdupq_n_s32(0x0800);
@@ -2260,18 +2254,20 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c
                     while (b < boundedEnd) {
                         uint32x4x2_t v_top, v_bot;
 
-                        int32x4_t v_fx_shifted = vshrq_n_s32(v_fx, 16);
-
-                        int x1 = vgetq_lane_s32(v_fx_shifted, 0);
+                        int x1 = (fx >> 16);
+                        fx += fdx;
                         v_top = vld2q_lane_u32(s1 + x1, v_top, 0);
                         v_bot = vld2q_lane_u32(s2 + x1, v_bot, 0);
-                        x1 = vgetq_lane_s32(v_fx_shifted, 1);
+                        x1 = (fx >> 16);
+                        fx += fdx;
                         v_top = vld2q_lane_u32(s1 + x1, v_top, 1);
                         v_bot = vld2q_lane_u32(s2 + x1, v_bot, 1);
-                        x1 = vgetq_lane_s32(v_fx_shifted, 2);
+                        x1 = (fx >> 16);
+                        fx += fdx;
                         v_top = vld2q_lane_u32(s1 + x1, v_top, 2);
                         v_bot = vld2q_lane_u32(s2 + x1, v_bot, 2);
-                        x1 = vgetq_lane_s32(v_fx_shifted, 3);
+                        x1 = (fx >> 16);
+                        fx += fdx;
                         v_top = vld2q_lane_u32(s1 + x1, v_top, 3);
                         v_bot = vld2q_lane_u32(s2 + x1, v_bot, 3);
 
@@ -2286,7 +2282,6 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c
                         b+=4;
                         v_fx = vaddq_s32(v_fx, v_fdx);
                     }
-                    fx = vgetq_lane_s32(v_fx, 0);
 #endif
                 }
 
@@ -2298,8 +2293,14 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c
                     uint tr = s1[x2];
                     uint bl = s2[x1];
                     uint br = s2[x2];
-                    int distx = ((fx & 0x0000ffff) + 0x0800) >> 12;
-                    *b = interpolate_4_pixels_16(tl, tr, bl, br, distx, disty);
+#if defined(__SSE2__) || defined(__ARM_NEON__)
+                    // The SSE2/NEON-optimized versions of interpolate_4_pixels are faster than interpolate_4_pixels_16.
+                    int distx8 = (fx & 0x0000ffff) >> 8;
+                    *b = interpolate_4_pixels(tl, tr, bl, br, distx8, disty8);
+#else
+                    int distx4 = ((fx & 0x0000ffff) + 0x0800) >> 12;
+                    *b = interpolate_4_pixels_16(tl, tr, bl, br, distx4, disty4);
+#endif
                     fx += fdx;
                     ++b;
                 }
@@ -2436,16 +2437,12 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c
 
                     int32x4_t v_fx = vmovq_n_s32(fx);
                     int32x4_t v_fy = vmovq_n_s32(fy);
-                    fx += fdx; fy += fdy;
-                    v_fx = vsetq_lane_s32(fx, v_fx, 1);
-                    v_fy = vsetq_lane_s32(fy, v_fy, 1);
-                    fx += fdx; fy += fdy;
-                    v_fx = vsetq_lane_s32(fx, v_fx, 2);
-                    v_fy = vsetq_lane_s32(fy, v_fy, 2);
-                    fx += fdx; fy += fdy;
-                    v_fx = vsetq_lane_s32(fx, v_fx, 3);
-                    v_fy = vsetq_lane_s32(fy, v_fy, 3);
-                    fx += fdx; fy += fdy;
+                    v_fx = vsetq_lane_s32(fx + fdx, v_fx, 1);
+                    v_fy = vsetq_lane_s32(fy + fdy, v_fy, 1);
+                    v_fx = vsetq_lane_s32(fx + fdx * 2, v_fx, 2);
+                    v_fy = vsetq_lane_s32(fy + fdy * 2, v_fy, 2);
+                    v_fx = vsetq_lane_s32(fx + fdx * 3, v_fx, 3);
+                    v_fy = vsetq_lane_s32(fy + fdy * 3, v_fy, 3);
 
                     const int32x4_t v_ffff_mask = vdupq_n_s32(0x0000ffff);
                     const int32x4_t v_round = vdupq_n_s32(0x0800);
@@ -2453,33 +2450,33 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c
                     while (b < boundedEnd) {
                         uint32x4x2_t v_top, v_bot;
 
-                        int32x4_t v_fx_shifted, v_fy_shifted;
-                        v_fx_shifted = vshrq_n_s32(v_fx, 16);
-                        v_fy_shifted = vshrq_n_s32(v_fy, 16);
-
-                        int x1 = vgetq_lane_s32(v_fx_shifted, 0);
-                        int y1 = vgetq_lane_s32(v_fy_shifted, 0);
+                        int x1 = (fx >> 16);
+                        int y1 = (fy >> 16);
+                        fx += fdx; fy += fdy;
                         const uchar *sl = textureData + bytesPerLine * y1;
                         const uint *s1 = reinterpret_cast<const uint *>(sl);
                         const uint *s2 = reinterpret_cast<const uint *>(sl + bytesPerLine);
                         v_top = vld2q_lane_u32(s1 + x1, v_top, 0);
                         v_bot = vld2q_lane_u32(s2 + x1, v_bot, 0);
-                        x1 = vgetq_lane_s32(v_fx_shifted, 1);
-                        y1 = vgetq_lane_s32(v_fy_shifted, 1);
+                        x1 = (fx >> 16);
+                        y1 = (fy >> 16);
+                        fx += fdx; fy += fdy;
                         sl = textureData + bytesPerLine * y1;
                         s1 = reinterpret_cast<const uint *>(sl);
                         s2 = reinterpret_cast<const uint *>(sl + bytesPerLine);
                         v_top = vld2q_lane_u32(s1 + x1, v_top, 1);
                         v_bot = vld2q_lane_u32(s2 + x1, v_bot, 1);
-                        x1 = vgetq_lane_s32(v_fx_shifted, 2);
-                        y1 = vgetq_lane_s32(v_fy_shifted, 2);
+                        x1 = (fx >> 16);
+                        y1 = (fy >> 16);
+                        fx += fdx; fy += fdy;
                         sl = textureData + bytesPerLine * y1;
                         s1 = reinterpret_cast<const uint *>(sl);
                         s2 = reinterpret_cast<const uint *>(sl + bytesPerLine);
                         v_top = vld2q_lane_u32(s1 + x1, v_top, 2);
                         v_bot = vld2q_lane_u32(s2 + x1, v_bot, 2);
-                        x1 = vgetq_lane_s32(v_fx_shifted, 3);
-                        y1 = vgetq_lane_s32(v_fy_shifted, 3);
+                        x1 = (fx >> 16);
+                        y1 = (fy >> 16);
+                        fx += fdx; fy += fdy;
                         sl = textureData + bytesPerLine * y1;
                         s1 = reinterpret_cast<const uint *>(sl);
                         s2 = reinterpret_cast<const uint *>(sl + bytesPerLine);
@@ -2501,8 +2498,6 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c
                         v_fx = vaddq_s32(v_fx, v_fdx);
                         v_fy = vaddq_s32(v_fy, v_fdy);
                     }
-                    fx = vgetq_lane_s32(v_fx, 0);
-                    fy = vgetq_lane_s32(v_fy, 0);
 #endif
                 }
 
@@ -2980,10 +2975,8 @@ static const QRgba64 *QT_FASTCALL fetchTransformedBilinear64(QRgba64 *buffer, co
                         fetchTransformedBilinear_pixelBounds<blendType>(image_width, image_x1, image_x2, x1, x2);
                         if (x1 != x2)
                             break;
-                        sbuf1[i * 2 + 0] = ((const uint*)s1)[x1];
-                        sbuf1[i * 2 + 1] = ((const uint*)s1)[x2];
-                        sbuf2[i * 2 + 0] = ((const uint*)s2)[x1];
-                        sbuf2[i * 2 + 1] = ((const uint*)s2)[x2];
+                        sbuf1[i * 2 + 0] = sbuf1[i * 2 + 1] = ((const uint*)s1)[x1];
+                        sbuf2[i * 2 + 0] = sbuf2[i * 2 + 1] = ((const uint*)s2)[x1];
                         fx += fdx;
                     }
                     int fastLen;
@@ -3102,6 +3095,16 @@ static const QRgba64 *QT_FASTCALL fetchTransformedBilinear64(QRgba64 *buffer, co
                         fx += fdx;
                         fy += fdy;
                     }
+                    int fastLen = len;
+                    if (fdx > 0)
+                        fastLen = qMin(fastLen, int((qint64(image_x2) * fixed_scale - fx) / fdx));
+                    else if (fdx < 0)
+                        fastLen = qMin(fastLen, int((qint64(image_x1) * fixed_scale - fx) / fdx));
+                    if (fdy > 0)
+                        fastLen = qMin(fastLen, int((qint64(image_y2) * fixed_scale - fy) / fdy));
+                    else if (fdy < 0)
+                        fastLen = qMin(fastLen, int((qint64(image_y1) * fixed_scale - fy) / fdy));
+                    fastLen -= 3;
 
                     const __m128i v_fdx = _mm_set1_epi32(fdx*4);
                     const __m128i v_fdy = _mm_set1_epi32(fdy*4);
@@ -3111,15 +3114,7 @@ static const QRgba64 *QT_FASTCALL fetchTransformedBilinear64(QRgba64 *buffer, co
                     const uchar *s1 = data->texture.imageData;
                     const uchar *s2 = s1 + bytesPerLine;
                     const __m128i vbpl = _mm_shufflelo_epi16(_mm_cvtsi32_si128(bytesPerLine/4), _MM_SHUFFLE(0, 0, 0, 0));
-                    for (; i < len-3; i+=4) {
-                        if (fdx > 0 && (short)_mm_extract_epi16(v_fx, 7) >= image_x2)
-                            break;
-                        if (fdx < 0 && (short)_mm_extract_epi16(v_fx, 7) < image_x1)
-                            break;
-                        if (fdy > 0 && (short)_mm_extract_epi16(v_fy, 7) >= image_y2)
-                            break;
-                        if (fdy < 0 && (short)_mm_extract_epi16(v_fy, 7) < image_y1)
-                            break;
+                    for (; i < fastLen; i += 4) {
                         const __m128i vy = _mm_packs_epi32(_mm_srai_epi32(v_fy, 16), _mm_setzero_si128());
                         __m128i voffset = _mm_unpacklo_epi16(_mm_mullo_epi16(vy, vbpl), _mm_mulhi_epu16(vy, vbpl));
                         voffset = _mm_add_epi32(voffset, _mm_srli_epi32(v_fx, 16));
@@ -5555,17 +5550,17 @@ static const ProcessSpans processTextureSpans[NBlendTypes][QImage::NImageFormats
         blend_src_generic, // ARGB32
         blend_transformed_argb, // ARGB32_Premultiplied
         blend_transformed_rgb565,
-        blend_src_generic,
-        blend_src_generic,
-        blend_src_generic,
-        blend_src_generic,
-        blend_src_generic,
-        blend_src_generic,
-        blend_src_generic,
-        blend_src_generic,
-        blend_src_generic,
-        blend_src_generic,
-        blend_src_generic,
+        blend_src_generic, // ARGB8565_Premultiplied
+        blend_src_generic, // RGB666
+        blend_src_generic, // ARGB6666_Premultiplied
+        blend_src_generic, // RGB555
+        blend_src_generic, // ARGB8555_Premultiplied
+        blend_src_generic, // RGB888
+        blend_src_generic, // RGB444
+        blend_src_generic, // ARGB4444_Premultiplied
+        blend_src_generic, // RGBX8888
+        blend_src_generic, // RGBA8888
+        blend_src_generic, // RGBA8888_Premultiplied
         blend_src_generic_rgb64,
         blend_src_generic_rgb64,
         blend_src_generic_rgb64,
@@ -5583,16 +5578,17 @@ static const ProcessSpans processTextureSpans[NBlendTypes][QImage::NImageFormats
         blend_src_generic, // ARGB32
         blend_transformed_tiled_argb, // ARGB32_Premultiplied
         blend_transformed_tiled_rgb565,
-        blend_src_generic,
-        blend_src_generic,
-        blend_src_generic,
-        blend_src_generic,
-        blend_src_generic,
-        blend_src_generic,
-        blend_src_generic,
-        blend_src_generic,
-        blend_src_generic,
-        blend_src_generic,
+        blend_src_generic, // ARGB8565_Premultiplied
+        blend_src_generic, // RGB666
+        blend_src_generic, // ARGB6666_Premultiplied
+        blend_src_generic, // RGB555
+        blend_src_generic, // ARGB8555_Premultiplied
+        blend_src_generic, // RGB888
+        blend_src_generic, // RGB444
+        blend_src_generic, // ARGB4444_Premultiplied
+        blend_src_generic, // RGBX8888
+        blend_src_generic, // RGBA8888
+        blend_src_generic, // RGBA8888_Premultiplied
         blend_src_generic_rgb64,
         blend_src_generic_rgb64,
         blend_src_generic_rgb64,
@@ -5610,17 +5606,17 @@ static const ProcessSpans processTextureSpans[NBlendTypes][QImage::NImageFormats
         blend_src_generic, // ARGB32
         blend_src_generic, // ARGB32_Premultiplied
         blend_transformed_bilinear_rgb565,
-        blend_src_generic,
-        blend_src_generic,
-        blend_src_generic,
-        blend_src_generic,
-        blend_src_generic,
-        blend_src_generic,
-        blend_src_generic,
-        blend_src_generic,
-        blend_src_generic,
-        blend_src_generic,
-        blend_src_generic,
+        blend_src_generic, // ARGB8565_Premultiplied
+        blend_src_generic, // RGB666
+        blend_src_generic, // ARGB6666_Premultiplied
+        blend_src_generic, // RGB555
+        blend_src_generic, // ARGB8555_Premultiplied
+        blend_src_generic, // RGB888
+        blend_src_generic, // RGB444
+        blend_src_generic, // ARGB4444_Premultiplied
+        blend_src_generic, // RGBX8888
+        blend_src_generic, // RGBA8888
+        blend_src_generic, // RGBA8888_Premultiplied
         blend_src_generic_rgb64,
         blend_src_generic_rgb64,
         blend_src_generic_rgb64,