Merge remote-tracking branch 'origin/5.5' into dev

Change-Id: If9fd98525b6b4ca07e5e006fc98bf372a73b8a21
author: Liang Qi <liang.qi@theqtcompany.com> 2015-04-06 19:10:10 +0200
committer: Liang Qi <liang.qi@theqtcompany.com> 2015-04-06 19:10:25 +0200
commit: 20cac3d9c9c22153e9e316daff32b6050ff6be6b (patch)
tree: b563a89475df9afb4f40841ec371be9488d5b1ed /src/gui/painting
parent: 8ce85d74b692392a4ea0785360156f37418cff13 (diff)
parent: 9eb0b09abce28b11e4915fc9c3b3e996eb19cef2 (diff)
2 files changed, 67 insertions, 84 deletions
diff --git a/src/gui/painting/qcosmeticstroker.cpp b/src/gui/painting/qcosmeticstroker.cpp
index f82b098012..8fb5f4fd3f 100644
--- a/src/gui/painting/qcosmeticstroker.cpp
+++ b/src/gui/painting/qcosmeticstroker.cpp
@@ -602,8 +602,7 @@ void QCosmeticStroker::drawPath(const QVectorPath &path)
             if (!closed && drawCaps && points == end - 2)
                 caps |= CapEnd;
 
-            QCosmeticStroker::Point last = this->lastPixel;
-            bool unclipped = stroke(this, p.x(), p.y(), p2.x(), p2.y(), caps);
+            bool moveNextStart = stroke(this, p.x(), p.y(), p2.x(), p2.y(), caps);
 
             /* fix for gaps in polylines with fastpen and aliased in a sequence
                of points with small distances: if current point p2 has been dropped
@@ -613,14 +612,8 @@ void QCosmeticStroker::drawPath(const QVectorPath &path)
                still need to update p to avoid drawing the line after this one from
                a bad starting position.
             */
-            if (fastPenAliased && unclipped) {
-                if (last.x != lastPixel.x || last.y != lastPixel.y
-                    || points == begin + 2 || points == end - 2) {
-                    p = p2;
-                }
-            } else {
+            if (!fastPenAliased || moveNextStart || points == begin + 2 || points == end - 2)
                 p = p2;
-            }
             points += 2;
             caps = NoCaps;
         }
@@ -727,8 +720,9 @@ template<DrawPixel drawPixel, class Dasher>
 static bool drawLine(QCosmeticStroker *stroker, qreal rx1, qreal ry1, qreal rx2, qreal ry2, int caps)
 {
     if (stroker->clipLine(rx1, ry1, rx2, ry2))
-        return false;
+        return true;
 
+    bool didDraw = false;
     const int half = stroker->legacyRounding ? 31 : 0;
     int x1 = toF26Dot6(rx1) + half;
     int y1 = toF26Dot6(ry1) + half;
@@ -814,6 +808,7 @@ static bool drawLine(QCosmeticStroker *stroker, qreal rx1, qreal ry1, qreal rx2,
                 dasher.adjust();
                 x += xinc;
             } while (++y < ys);
+            didDraw = true;
         }
     } else {
         // horizontal
@@ -889,10 +884,11 @@ static bool drawLine(QCosmeticStroker *stroker, qreal rx1, qreal ry1, qreal rx2,
                 dasher.adjust();
                 y += yinc;
             } while (++x < xs);
+            didDraw = true;
         }
     }
     stroker->lastPixel = last;
-    return true;
+    return didDraw;
 }
 
 
@@ -900,7 +896,7 @@ template<DrawPixel drawPixel, class Dasher>
 static bool drawLineAA(QCosmeticStroker *stroker, qreal rx1, qreal ry1, qreal rx2, qreal ry2, int caps)
 {
     if (stroker->clipLine(rx1, ry1, rx2, ry2))
-        return false;
+        return true;
 
     int x1 = toF26Dot6(rx1);
     int y1 = toF26Dot6(ry1);
diff --git a/src/gui/painting/qdrawhelper.cpp b/src/gui/painting/qdrawhelper.cpp
index 538389f15f..57bb111538 100644
--- a/src/gui/painting/qdrawhelper.cpp
+++ b/src/gui/painting/qdrawhelper.cpp
@@ -1539,40 +1539,29 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c
                     const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
                     const __m128i v_256 = _mm_set1_epi16(256);
                     const __m128i v_disty = _mm_set1_epi16(disty);
-                    __m128i v_fdx = _mm_set1_epi32(fdx*4);
-
-                    ptrdiff_t secondLine = reinterpret_cast<const uint *>(s2) - reinterpret_cast<const uint *>(s1);
-
-                    union Vect_buffer { __m128i vect; quint32 i[4]; };
-                    Vect_buffer v_fx;
-
-                    for (int i = 0; i < 4; i++) {
-                        v_fx.i[i] = fx;
-                        fx += fdx;
-                    }
+                    const __m128i v_fdx = _mm_set1_epi32(fdx*4);
+                    __m128i v_fx = _mm_setr_epi32(fx, fx + fdx, fx + fdx + fdx, fx + fdx + fdx + fdx);
 
                     while (b < boundedEnd) {
-
-                        Vect_buffer tl, tr, bl, br;
-
-                        for (int i = 0; i < 4; i++) {
-                            int x1 = v_fx.i[i] >> 16;
-                            const uint *addr_tl = reinterpret_cast<const uint *>(s1) + x1;
-                            const uint *addr_tr = addr_tl + 1;
-                            tl.i[i] = *addr_tl;
-                            tr.i[i] = *addr_tr;
-                            bl.i[i] = *(addr_tl+secondLine);
-                            br.i[i] = *(addr_tr+secondLine);
-                        }
-                        __m128i v_distx = _mm_srli_epi16(v_fx.vect, 12);
+                        __m128i offset = _mm_srli_epi32(v_fx, 16);
+                        const int offset0 = _mm_cvtsi128_si32(offset); offset = _mm_srli_si128(offset, 4);
+                        const int offset1 = _mm_cvtsi128_si32(offset); offset = _mm_srli_si128(offset, 4);
+                        const int offset2 = _mm_cvtsi128_si32(offset); offset = _mm_srli_si128(offset, 4);
+                        const int offset3 = _mm_cvtsi128_si32(offset);
+                        const __m128i tl = _mm_setr_epi32(s1[offset0], s1[offset1], s1[offset2], s1[offset3]);
+                        const __m128i tr = _mm_setr_epi32(s1[offset0 + 1], s1[offset1 + 1], s1[offset2 + 1], s1[offset3 + 1]);
+                        const __m128i bl = _mm_setr_epi32(s2[offset0], s2[offset1], s2[offset2], s2[offset3]);
+                        const __m128i br = _mm_setr_epi32(s2[offset0 + 1], s2[offset1 + 1], s2[offset2 + 1], s2[offset3 + 1]);
+
+                        __m128i v_distx = _mm_srli_epi16(v_fx, 12);
                         v_distx = _mm_shufflehi_epi16(v_distx, _MM_SHUFFLE(2,2,0,0));
                         v_distx = _mm_shufflelo_epi16(v_distx, _MM_SHUFFLE(2,2,0,0));
 
-                        interpolate_4_pixels_16_sse2(tl.vect, tr.vect, bl.vect, br.vect, v_distx, v_disty, colorMask, v_256, b);
-                        b+=4;
-                        v_fx.vect = _mm_add_epi32(v_fx.vect, v_fdx);
+                        interpolate_4_pixels_16_sse2(tl, tr, bl, br, v_distx, v_disty, colorMask, v_256, b);
+                        b += 4;
+                        v_fx = _mm_add_epi32(v_fx, v_fdx);
                     }
-                    fx = v_fx.i[0];
+                    fx = _mm_cvtsi128_si32(v_fx);
 #elif defined(__ARM_NEON__)
                     BILINEAR_DOWNSCALE_BOUNDS_PROLOG
 
@@ -1687,9 +1676,9 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c
                         uint tr = s1[x2]; \
                         uint bl = s2[x1]; \
                         uint br = s2[x2]; \
-                        int distx = (fx & 0x0000ffff) >> 12; \
-                        int disty = (fy & 0x0000ffff) >> 12; \
-                        *b = interpolate_4_pixels_16(tl, tr, bl, br, distx, disty); \
+                        int distx = (fx & 0x0000ffff) >> 8; \
+                        int disty = (fy & 0x0000ffff) >> 8; \
+                        *b = interpolate_4_pixels(tl, tr, bl, br, distx, disty); \
                         fx += fdx; \
                         fy += fdy; \
                         ++b; \
@@ -1702,62 +1691,54 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c
 
                     const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
                     const __m128i v_256 = _mm_set1_epi16(256);
-                    __m128i v_fdx = _mm_set1_epi32(fdx*4);
-                    __m128i v_fdy = _mm_set1_epi32(fdy*4);
+                    const __m128i v_fdx = _mm_set1_epi32(fdx*4);
+                    const __m128i v_fdy = _mm_set1_epi32(fdy*4);
+                    __m128i v_fx = _mm_setr_epi32(fx, fx + fdx, fx + fdx + fdx, fx + fdx + fdx + fdx);
+                    __m128i v_fy = _mm_setr_epi32(fy, fy + fdy, fy + fdy + fdy, fy + fdy + fdy + fdy);
 
                     const uchar *textureData = data->texture.imageData;
                     const int bytesPerLine = data->texture.bytesPerLine;
-
-                    union Vect_buffer { __m128i vect; qint32 i[4]; };
-                    Vect_buffer v_fx, v_fy;
-
-                    for (int i = 0; i < 4; i++) {
-                        v_fx.i[i] = fx;
-                        v_fy.i[i] = fy;
-                        fx += fdx;
-                        fy += fdy;
-                    }
+                    const __m128i vbpl = _mm_shufflelo_epi16(_mm_cvtsi32_si128(bytesPerLine/4), _MM_SHUFFLE(0, 0, 0, 0));
 
                     while (b < boundedEnd) {
-                        if (fdx > 0 && (v_fx.i[3] >> 16) >= image_x2)
+                        if (fdx > 0 && (short)_mm_extract_epi16(v_fx, 7) >= image_x2)
                             break;
-                        if (fdx < 0 && (v_fx.i[3] >> 16) < image_x1)
+                        if (fdx < 0 && (short)_mm_extract_epi16(v_fx, 7) < image_x1)
                             break;
-                        if (fdy > 0 && (v_fy.i[3] >> 16) >= image_y2)
+                        if (fdy > 0 && (short)_mm_extract_epi16(v_fy, 7) >= image_y2)
                             break;
-                        if (fdy < 0 && (v_fy.i[3] >> 16) < image_y1)
+                        if (fdy < 0 && (short)_mm_extract_epi16(v_fy, 7) < image_y1)
                             break;
 
-                        Vect_buffer tl, tr, bl, br;
-                        Vect_buffer v_fx_shifted, v_fy_shifted;
-                        v_fx_shifted.vect = _mm_srli_epi32(v_fx.vect, 16);
-                        v_fy_shifted.vect = _mm_srli_epi32(v_fy.vect, 16);
-
-                        for (int i = 0; i < 4; i++) {
-                            const int x1 = v_fx_shifted.i[i];
-                            const int y1 = v_fy_shifted.i[i];
-                            const uchar *sl = textureData + bytesPerLine * y1;
-                            const uint *s1 = (const uint *)sl;
-                            const uint *s2 = (const uint *)(sl + bytesPerLine);
-                            tl.i[i] = s1[x1];
-                            tr.i[i] = s1[x1+1];
-                            bl.i[i] = s2[x1];
-                            br.i[i] = s2[x1+1];
-                        }
-                        __m128i v_distx = _mm_srli_epi16(v_fx.vect, 12);
-                        __m128i v_disty = _mm_srli_epi16(v_fy.vect, 12);
+                        const __m128i vy = _mm_packs_epi32(_mm_srli_epi32(v_fy, 16), _mm_setzero_si128());
+                        // 4x16bit * 4x16bit -> 4x32bit
+                        __m128i offset = _mm_unpacklo_epi16(_mm_mullo_epi16(vy, vbpl), _mm_mulhi_epi16(vy, vbpl));
+                        offset = _mm_add_epi32(offset, _mm_srli_epi32(v_fx, 16));
+                        const int offset0 = _mm_cvtsi128_si32(offset); offset = _mm_srli_si128(offset, 4);
+                        const int offset1 = _mm_cvtsi128_si32(offset); offset = _mm_srli_si128(offset, 4);
+                        const int offset2 = _mm_cvtsi128_si32(offset); offset = _mm_srli_si128(offset, 4);
+                        const int offset3 = _mm_cvtsi128_si32(offset);
+                        const uint *topData = (const uint *)(textureData);
+                        const __m128i tl = _mm_setr_epi32(topData[offset0], topData[offset1], topData[offset2], topData[offset3]);
+                        const __m128i tr = _mm_setr_epi32(topData[offset0 + 1], topData[offset1 + 1], topData[offset2 + 1], topData[offset3 + 1]);
+                        const uint *bottomData = (const uint *)(textureData + bytesPerLine);
+                        const __m128i bl = _mm_setr_epi32(bottomData[offset0], bottomData[offset1], bottomData[offset2], bottomData[offset3]);
+                        const __m128i br = _mm_setr_epi32(bottomData[offset0 + 1], bottomData[offset1 + 1], bottomData[offset2 + 1], bottomData[offset3 + 1]);
+
+                        __m128i v_distx = _mm_srli_epi16(v_fx, 12);
+                        __m128i v_disty = _mm_srli_epi16(v_fy, 12);
                         v_distx = _mm_shufflehi_epi16(v_distx, _MM_SHUFFLE(2,2,0,0));
                         v_distx = _mm_shufflelo_epi16(v_distx, _MM_SHUFFLE(2,2,0,0));
                         v_disty = _mm_shufflehi_epi16(v_disty, _MM_SHUFFLE(2,2,0,0));
                         v_disty = _mm_shufflelo_epi16(v_disty, _MM_SHUFFLE(2,2,0,0));
 
-                        interpolate_4_pixels_16_sse2(tl.vect, tr.vect, bl.vect, br.vect, v_distx, v_disty, colorMask, v_256, b);
-                        b+=4;
-                        v_fx.vect = _mm_add_epi32(v_fx.vect, v_fdx);
-                        v_fy.vect = _mm_add_epi32(v_fy.vect, v_fdy);
+                        interpolate_4_pixels_16_sse2(tl, tr, bl, br, v_distx, v_disty, colorMask, v_256, b);
+                        b += 4;
+                        v_fx = _mm_add_epi32(v_fx, v_fdx);
+                        v_fy = _mm_add_epi32(v_fy, v_fdy);
                     }
-                    fx = v_fx.i[0];
-                    fy = v_fy.i[0];
+                    fx = _mm_cvtsi128_si32(v_fx);
+                    fy = _mm_cvtsi128_si32(v_fy);
 #endif
                 }
 
@@ -1778,10 +1759,16 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c
                     uint bl = s2[x1];
                     uint br = s2[x2];
 
+#if defined(__SSE2__)
+                    // The SSE2 optimized interpolate_4_pixels is faster than interpolate_4_pixels_16.
+                    int distx = (fx & 0x0000ffff) >> 8;
+                    int disty = (fy & 0x0000ffff) >> 8;
+                    *b = interpolate_4_pixels(tl, tr, bl, br, distx, disty);
+#else
                     int distx = (fx & 0x0000ffff) >> 12;
                     int disty = (fy & 0x0000ffff) >> 12;
-
                     *b = interpolate_4_pixels_16(tl, tr, bl, br, distx, disty);
+#endif
 
                     fx += fdx;
                     fy += fdy;
author	Liang Qi <liang.qi@theqtcompany.com>	2015-04-06 19:10:10 +0200
committer	Liang Qi <liang.qi@theqtcompany.com>	2015-04-06 19:10:25 +0200
commit	20cac3d9c9c22153e9e316daff32b6050ff6be6b (patch)
tree	b563a89475df9afb4f40841ec371be9488d5b1ed /src/gui/painting
parent	8ce85d74b692392a4ea0785360156f37418cff13 (diff)
parent	9eb0b09abce28b11e4915fc9c3b3e996eb19cef2 (diff)