Improve drawing of scaled images with the raster engine using SSE2

That codepath is taken in QML when an Image specifies a width and height, is not smooth, and contains alpha content. The changes in qdrawingprimitive_sse2_p.h just move some code from the BLEND_SOURCE_OVER_ARGB32_SSE2 macro into a sub-macro to allow its reuse. The non-SSE2 code in qt_scale_image_argb32_on_argb32_sse2 comes from qt_scale_image_argb32_on_argb32 in qblendfunctions.cpp. Change-Id: I071a040af4514fb21777dead9f7c5baf16071d59 Reviewed-by: Samuel Rødal <samuel.rodal@nokia.com>
author: Olivier Goffart <ogoffart@kde.org> 2011-10-24 08:02:10 +0200
committer: Qt by Nokia <qt-info@nokia.com> 2011-10-24 13:56:59 +0200
commit: 4bff8ea4d48851fbea078bd93226888bdd05d8dc (patch)
tree: 27f8c6fab968a356fc31e3d76148fec038ac2e4c /src
parent: cea8e41dc838bcdc2ec63eefac6441ddc608e390 (diff)
3 files changed, 153 insertions, 22 deletions
diff --git a/src/gui/painting/qdrawhelper.cpp b/src/gui/painting/qdrawhelper.cpp
index c4c5846ed9..88c237b59b 100644
--- a/src/gui/painting/qdrawhelper.cpp
+++ b/src/gui/painting/qdrawhelper.cpp
@@ -7050,6 +7050,15 @@ void qInitDrawhelperAsm()
         qDrawHelper[QImage::Format_ARGB32].bitmapBlit = qt_bitmapblit32_sse2;
         qDrawHelper[QImage::Format_ARGB32_Premultiplied].bitmapBlit = qt_bitmapblit32_sse2;
         qDrawHelper[QImage::Format_RGB16].bitmapBlit = qt_bitmapblit16_sse2;
+
+        extern void qt_scale_image_argb32_on_argb32_sse2(uchar *destPixels, int dbpl,
+                                                         const uchar *srcPixels, int sbpl,
+                                                         const QRectF &targetRect,
+                                                         const QRectF &sourceRect,
+                                                         const QRect &clip,
+                                                         int const_alpha);
+        qScaleFunctions[QImage::Format_ARGB32_Premultiplied][QImage::Format_ARGB32_Premultiplied] = qt_scale_image_argb32_on_argb32_sse2;
+        qScaleFunctions[QImage::Format_RGB32][QImage::Format_ARGB32_Premultiplied] = qt_scale_image_argb32_on_argb32_sse2;
 #endif
 #ifdef QT_HAVE_SSE
     } else if (features & SSE) {
diff --git a/src/gui/painting/qdrawhelper_sse2.cpp b/src/gui/painting/qdrawhelper_sse2.cpp
index 340cd7100b..3bbdae00bd 100644
--- a/src/gui/painting/qdrawhelper_sse2.cpp
+++ b/src/gui/painting/qdrawhelper_sse2.cpp
@@ -538,6 +538,122 @@ const uint * QT_FASTCALL qt_fetch_radial_gradient_sse2(uint *buffer, const Opera
     return qt_fetch_radial_gradient_template<QRadialFetchSimd<QSimdSse2> >(buffer, op, data, y, x, length);
 }
 
+void qt_scale_image_argb32_on_argb32_sse2(uchar *destPixels, int dbpl,
+                                          const uchar *srcPixels, int sbpl,
+                                          const QRectF &targetRect,
+                                          const QRectF &sourceRect,
+                                          const QRect &clip,
+                                          int const_alpha)
+{
+    if (const_alpha != 256) {
+        // from qblendfunctions.cpp
+        extern void qt_scale_image_argb32_on_argb32(uchar *destPixels, int dbpl,
+                                               const uchar *srcPixels, int sbpl,
+                                               const QRectF &targetRect,
+                                               const QRectF &sourceRect,
+                                               const QRect &clip,
+                                               int const_alpha);
+        return qt_scale_image_argb32_on_argb32(destPixels, dbpl, srcPixels, sbpl, targetRect, sourceRect, clip, const_alpha);
+    }
+
+    qreal sx = targetRect.width() / (qreal) sourceRect.width();
+    qreal sy = targetRect.height() / (qreal) sourceRect.height();
+
+    int ix = 0x00010000 / sx;
+    int iy = 0x00010000 / sy;
+
+    int cx1 = clip.x();
+    int cx2 = clip.x() + clip.width();
+    int cy1 = clip.top();
+    int cy2 = clip.y() + clip.height();
+
+    int tx1 = qRound(targetRect.left());
+    int tx2 = qRound(targetRect.right());
+    int ty1 = qRound(targetRect.top());
+    int ty2 = qRound(targetRect.bottom());
+
+    if (tx2 < tx1)
+        qSwap(tx2, tx1);
+    if (ty2 < ty1)
+        qSwap(ty2, ty1);
+
+    if (tx1 < cx1)
+        tx1 = cx1;
+    if (tx2 >= cx2)
+        tx2 = cx2;
+
+    if (tx1 >= tx2)
+        return;
+
+    if (ty1 < cy1)
+        ty1 = cy1;
+    if (ty2 >= cy2)
+       ty2 = cy2;
+    if (ty1 >= ty2)
+        return;
+
+    int h = ty2 - ty1;
+    int w = tx2 - tx1;
+
+    quint32 basex;
+    quint32 srcy;
+
+    if (sx < 0) {
+        int dstx = qFloor((tx1 + qreal(0.5) - targetRect.right()) * ix) + 1;
+        basex = quint32(sourceRect.right() * 65536) + dstx;
+    } else {
+        int dstx = qCeil((tx1 + qreal(0.5) - targetRect.left()) * ix) - 1;
+        basex = quint32(sourceRect.left() * 65536) + dstx;
+    }
+    if (sy < 0) {
+        int dsty = qFloor((ty1 + qreal(0.5) - targetRect.bottom()) * iy) + 1;
+        srcy = quint32(sourceRect.bottom() * 65536) + dsty;
+    } else {
+        int dsty = qCeil((ty1 + qreal(0.5) - targetRect.top()) * iy) - 1;
+        srcy = quint32(sourceRect.top() * 65536) + dsty;
+    }
+
+    quint32 *dst = ((quint32 *) (destPixels + ty1 * dbpl)) + tx1;
+
+    const __m128i nullVector = _mm_set1_epi32(0);
+    const __m128i half = _mm_set1_epi16(0x80);
+    const __m128i one = _mm_set1_epi16(0xff);
+    const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
+    const __m128i alphaMask = _mm_set1_epi32(0xff000000);
+    const __m128i ixVector = _mm_set1_epi32(4*ix);
+
+
+    while (h--) {
+        const uint *src = (const quint32 *) (srcPixels + (srcy >> 16) * sbpl);
+        int srcx = basex;
+        int x = 0;
+
+        ALIGNMENT_PROLOGUE_16BYTES(dst, x, w) {
+            uint s = src[(srcx + x*ix) >> 16];
+            dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s));
+        }
+
+        __m128i srcxVector = _mm_set_epi32(srcx, srcx + ix, srcx + ix + ix, srcx + ix + ix + ix);
+
+        for (; x<w - 3; x += 4) {
+            union Vect_buffer { __m128i vect; quint32 i[4]; };
+            Vect_buffer addr;
+            addr.vect = _mm_srli_epi32(srcxVector, 16);
+            srcxVector = _mm_add_epi32(srcxVector, ixVector);
+
+            const __m128i srcVector = _mm_set_epi32(src[addr.i[0]], src[addr.i[1]], src[addr.i[2]], src[addr.i[3]]);
+            BLEND_SOURCE_OVER_ARGB32_SSE2_helper(dst, srcVector, nullVector, half, one, colorMask, alphaMask);
+        }
+
+        for (; x<w; x++) {
+            uint s = src[(srcx + x*ix) >> 16];
+            dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s));
+        }
+        dst = (quint32 *)(((uchar *) dst) + dbpl);
+        srcy += iy;
+    }
+}
+
 
 QT_END_NAMESPACE
 
diff --git a/src/gui/painting/qdrawingprimitive_sse2_p.h b/src/gui/painting/qdrawingprimitive_sse2_p.h
index 4c66d90bf1..dad8f6cb5d 100644
--- a/src/gui/painting/qdrawingprimitive_sse2_p.h
+++ b/src/gui/painting/qdrawingprimitive_sse2_p.h
@@ -128,6 +128,33 @@ QT_BEGIN_NAMESPACE
     result = _mm_or_si128(finalAG, finalRB); \
 }
 
+// same as BLEND_SOURCE_OVER_ARGB32_SSE2, but for one vector srcVector
+#define BLEND_SOURCE_OVER_ARGB32_SSE2_helper(dst, srcVector, nullVector, half, one, colorMask, alphaMask) { \
+        const __m128i srcVectorAlpha = _mm_and_si128(srcVector, alphaMask); \
+        if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, alphaMask)) == 0xffff) { \
+            /* all opaque */ \
+            _mm_store_si128((__m128i *)&dst[x], srcVector); \
+        } else if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, nullVector)) != 0xffff) { \
+            /* not fully transparent */ \
+            /* extract the alpha channel on 2 x 16 bits */ \
+            /* so we have room for the multiplication */ \
+            /* each 32 bits will be in the form 0x00AA00AA */ \
+            /* with A being the 1 - alpha */ \
+            __m128i alphaChannel = _mm_srli_epi32(srcVector, 24); \
+            alphaChannel = _mm_or_si128(alphaChannel, _mm_slli_epi32(alphaChannel, 16)); \
+            alphaChannel = _mm_sub_epi16(one, alphaChannel); \
+ \
+            const __m128i dstVector = _mm_load_si128((__m128i *)&dst[x]); \
+            __m128i destMultipliedByOneMinusAlpha; \
+            BYTE_MUL_SSE2(destMultipliedByOneMinusAlpha, dstVector, alphaChannel, colorMask, half); \
+ \
+            /* result = s + d * (1-alpha) */\
+            const __m128i result = _mm_add_epi8(srcVector, destMultipliedByOneMinusAlpha); \
+            _mm_store_si128((__m128i *)&dst[x], result); \
+        } \
+    }
+
+
 // Basically blend src over dst with the const alpha defined as constAlphaVector.
 // nullVector, half, one, colorMask are constant across the whole image/texture, and should be defined as:
 //const __m128i nullVector = _mm_set1_epi32(0);
@@ -153,28 +180,7 @@ QT_BEGIN_NAMESPACE
 \
     for (; x < length-3; x += 4) { \
         const __m128i srcVector = _mm_loadu_si128((__m128i *)&src[x]); \
-        const __m128i srcVectorAlpha = _mm_and_si128(srcVector, alphaMask); \
-        if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, alphaMask)) == 0xffff) { \
-            /* all opaque */ \
-            _mm_store_si128((__m128i *)&dst[x], srcVector); \
-        } else if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, nullVector)) != 0xffff) { \
-            /* not fully transparent */ \
-            /* extract the alpha channel on 2 x 16 bits */ \
-            /* so we have room for the multiplication */ \
-            /* each 32 bits will be in the form 0x00AA00AA */ \
-            /* with A being the 1 - alpha */ \
-            __m128i alphaChannel = _mm_srli_epi32(srcVector, 24); \
-            alphaChannel = _mm_or_si128(alphaChannel, _mm_slli_epi32(alphaChannel, 16)); \
-            alphaChannel = _mm_sub_epi16(one, alphaChannel); \
- \
-            const __m128i dstVector = _mm_load_si128((__m128i *)&dst[x]); \
-            __m128i destMultipliedByOneMinusAlpha; \
-            BYTE_MUL_SSE2(destMultipliedByOneMinusAlpha, dstVector, alphaChannel, colorMask, half); \
- \
-            /* result = s + d * (1-alpha) */\
-            const __m128i result = _mm_add_epi8(srcVector, destMultipliedByOneMinusAlpha); \
-            _mm_store_si128((__m128i *)&dst[x], result); \
-        } \
+        BLEND_SOURCE_OVER_ARGB32_SSE2_helper(dst, srcVector, nullVector, half, one, colorMask, alphaMask) \
     } \
     for (; x < length; ++x) { \
         uint s = src[x]; \
author	Olivier Goffart <ogoffart@kde.org>	2011-10-24 08:02:10 +0200
committer	Qt by Nokia <qt-info@nokia.com>	2011-10-24 13:56:59 +0200
commit	4bff8ea4d48851fbea078bd93226888bdd05d8dc (patch)
tree	27f8c6fab968a356fc31e3d76148fec038ac2e4c /src
parent	cea8e41dc838bcdc2ec63eefac6441ddc608e390 (diff)