From 506aa694a9e466f04c392d814b02c2130611dce6 Mon Sep 17 00:00:00 2001
From: Allan Sandfeld Jensen <allan.jensen@qt.io>
Date: Wed, 16 Nov 2016 16:25:11 +0100
Subject: Clean up convert_ARGB_to_ARGB_PM_inplace_sse2

Changes the function to follow the standard SIMD loop pattern so that it
can use the ALIGNMENT_PROLOGUE_16BYTES and SIMD_EPILOGUE helpers.

This should also improve performance, because the main loop now uses
aligned loads and stores.

Change-Id: I14a48b82e3f3de83bd7572aa82bed07f28ad944c
Reviewed-by: Erik Verbruggen <erik.verbruggen@qt.io>
---
 src/gui/image/qimage_sse2.cpp | 59 +++++++++++++++++++++++++++----------------
 1 file changed, 37 insertions(+), 22 deletions(-)

(limited to 'src')
diff --git a/src/gui/image/qimage_sse2.cpp b/src/gui/image/qimage_sse2.cpp
index 0fb92e9d43..8f7195e0b5 100644
--- a/src/gui/image/qimage_sse2.cpp
+++ b/src/gui/image/qimage_sse2.cpp
@@ -51,51 +51,66 @@ bool convert_ARGB_to_ARGB_PM_inplace_sse2(QImageData *data, Qt::ImageConversionF
 {
     Q_ASSERT(data->format == QImage::Format_ARGB32 || data->format == QImage::Format_RGBA8888);
 
-    // extra pixels on each line
-    const int spare = data->width & 3;
-    // width in pixels of the pad at the end of each line
-    const int pad = (data->bytes_per_line >> 2) - data->width;
-    const int iter = data->width >> 2;
-    int height = data->height;
+    const int width = data->width;
+    const int height = data->height;
+    const int bpl = data->bytes_per_line;
 
     const __m128i alphaMask = _mm_set1_epi32(0xff000000);
     const __m128i nullVector = _mm_setzero_si128();
     const __m128i half = _mm_set1_epi16(0x80);
     const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
 
-    __m128i *d = reinterpret_cast<__m128i*>(data->data);
-    while (height--) {
-        const __m128i *end = d + iter;
-
-        for (; d != end; ++d) {
-            const __m128i srcVector = _mm_loadu_si128(d);
+    uchar *d = data->data;
+    for (int y = 0; y < height; ++y) {
+        int i = 0;
+        quint32 *d32 = reinterpret_cast<quint32 *>(d);
+        ALIGNMENT_PROLOGUE_16BYTES(d, i, width) {
+            const quint32 p = d32[i];
+            if (p <= 0x00ffffff)
+                d32[i] = 0;
+            else if (p < 0xff000000)
+                d32[i] = qPremultiply(p);
+        }
+        __m128i *d128 = reinterpret_cast<__m128i *>(d32 + i);
+        for (; i < (width - 3); i += 4) {
+            const __m128i srcVector = _mm_load_si128(d128);
+#ifdef __SSE4_1__
+            if (_mm_testc_si128(srcVector, alphaMask)) {
+                // opaque, data is unchanged
+            } else if (_mm_testz_si128(srcVector, alphaMask)) {
+                // fully transparent
+                _mm_store_si128(d128, nullVector);
+            } else {
+                const __m128i srcVectorAlpha = _mm_and_si128(srcVector, alphaMask);
+#else
             const __m128i srcVectorAlpha = _mm_and_si128(srcVector, alphaMask);
             if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, alphaMask)) == 0xffff) {
                 // opaque, data is unchanged
             } else if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, nullVector)) == 0xffff) {
                 // fully transparent
-                _mm_storeu_si128(d, nullVector);
+                _mm_store_si128(d128, nullVector);
             } else {
+#endif
                 __m128i alphaChannel = _mm_srli_epi32(srcVector, 24);
                 alphaChannel = _mm_or_si128(alphaChannel, _mm_slli_epi32(alphaChannel, 16));
 
                 __m128i result;
                 BYTE_MUL_SSE2(result, srcVector, alphaChannel, colorMask, half);
                 result = _mm_or_si128(_mm_andnot_si128(alphaMask, result), srcVectorAlpha);
-                _mm_storeu_si128(d, result);
+                _mm_store_si128(d128, result);
             }
+            d128++;
         }
 
-        QRgb *p = reinterpret_cast<QRgb*>(d);
-        QRgb *pe = p+spare;
-        for (; p != pe; ++p) {
-            if (*p < 0x00ffffff)
-                *p = 0;
-            else if (*p < 0xff000000)
-                *p = qPremultiply(*p);
+        SIMD_EPILOGUE(i, width, 3) {
+            const quint32 p = d32[i];
+            if (p <= 0x00ffffff)
+                d32[i] = 0;
+            else if (p < 0xff000000)
+                d32[i] = qPremultiply(p);
         }
 
-        d = reinterpret_cast<__m128i*>(p+pad);
+        d += bpl;
     }
 
     if (data->format == QImage::Format_ARGB32)
-- 
cgit v1.2.3