Fix repremultiply from RGB64 to RGB30

Just like from RGB32 to RGB30 we must also repremultiply when converting from RGB64 because the alpha channel loses more precision than the other color channels. Since this is not approximated accurately in the simple blending functions and the functions are no longer needed now the main render engine supports higher accuracy, the simple blending routines for RGB30 have been removed. Change-Id: I2b7b8eb015e330a487848fc4370ad3a1e966be91 Reviewed-by: Gunnar Sletta <gunnar@sletta.org>
author: Allan Sandfeld Jensen <allan.jensen@theqtcompany.com> 2015-06-30 13:26:25 +0200
committer: Allan Sandfeld Jensen <allan.jensen@theqtcompany.com> 2015-08-11 09:33:06 +0000
commit: 55512950281260b85ea40c08239e459c76b696d2 (patch)
tree: 98faefc98c2d7e81b65517d36ccfecfed9a79240 /src/gui/painting/qdrawhelper.cpp
parent: 49ea02f435876d514898d587700a21e1a8972f09 (diff)
1 files changed, 48 insertions, 19 deletions
diff --git a/src/gui/painting/qdrawhelper.cpp b/src/gui/painting/qdrawhelper.cpp
index 07e5a3d19b..0cf7e20605 100644
--- a/src/gui/painting/qdrawhelper.cpp
+++ b/src/gui/painting/qdrawhelper.cpp
@@ -1252,25 +1252,54 @@ static inline void qConvertARGB64PMToA2RGB30PM_sse2(uint *dest, const QRgba64 *b
     const __m128i cmask = _mm_set1_epi32(0x000003ff);
     int i = 0;
     __m128i vr, vg, vb, va;
-    for (; i < count-1; i += 2) {
-        __m128i vs = _mm_loadu_si128((const __m128i*)buffer);
-        buffer += 2;
-        vr = _mm_srli_epi64(vs, 6);
-        vg = _mm_srli_epi64(vs, 16 + 6 - 10);
-        vb = _mm_srli_epi64(vs, 32 + 6);
-        vr = _mm_and_si128(vr, cmask);
-        vg = _mm_and_si128(vg, gmask);
-        vb = _mm_and_si128(vb, cmask);
-        va = _mm_srli_epi64(vs, 48 + 14);
-        if (PixelOrder == PixelOrderRGB)
-            vr = _mm_slli_epi32(vr, 20);
-        else
-            vb = _mm_slli_epi32(vb, 20);
-        va = _mm_slli_epi32(va, 30);
-        __m128i vd = _mm_or_si128(_mm_or_si128(vr, vg), _mm_or_si128(vb, va));
-        vd = _mm_shuffle_epi32(vd, _MM_SHUFFLE(3, 1, 2, 0));
-        _mm_storel_epi64((__m128i*)dest, vd);
-        dest += 2;
+    if (i < count && (const uintptr_t)buffer & 0x8) {
+        *dest++ = qConvertRgb64ToRgb30<PixelOrder>(*buffer++);
+        ++i;
+    }
+
+    for (; i < count-15; i += 16) {
+        // Repremultiplying is really expensive and hard to do in SIMD without AVX2,
+        // so we try to avoid it by checking if it is needed 16 samples at a time.
+        __m128i vOr = _mm_set1_epi32(0);
+        __m128i vAnd = _mm_set1_epi32(0xffffffff);
+        for (int j = 0; j < 16; j += 2) {
+            __m128i vs = _mm_load_si128((const __m128i*)(buffer + j));
+            vOr = _mm_or_si128(vOr, vs);
+            vAnd = _mm_and_si128(vAnd, vs);
+        }
+        const quint16 orAlpha = ((uint)_mm_extract_epi16(vOr, 3)) | ((uint)_mm_extract_epi16(vOr, 7));
+        const quint16 andAlpha = ((uint)_mm_extract_epi16(vAnd, 3)) & ((uint)_mm_extract_epi16(vAnd, 7));
+
+        if (andAlpha == 0xffff) {
+            for (int j = 0; j < 16; j += 2) {
+                __m128i vs = _mm_load_si128((const __m128i*)buffer);
+                buffer += 2;
+                vr = _mm_srli_epi64(vs, 6);
+                vg = _mm_srli_epi64(vs, 16 + 6 - 10);
+                vb = _mm_srli_epi64(vs, 32 + 6);
+                vr = _mm_and_si128(vr, cmask);
+                vg = _mm_and_si128(vg, gmask);
+                vb = _mm_and_si128(vb, cmask);
+                va = _mm_srli_epi64(vs, 48 + 14);
+                if (PixelOrder == PixelOrderRGB)
+                    vr = _mm_slli_epi32(vr, 20);
+                else
+                    vb = _mm_slli_epi32(vb, 20);
+                va = _mm_slli_epi32(va, 30);
+                __m128i vd = _mm_or_si128(_mm_or_si128(vr, vg), _mm_or_si128(vb, va));
+                vd = _mm_shuffle_epi32(vd, _MM_SHUFFLE(3, 1, 2, 0));
+                _mm_storel_epi64((__m128i*)dest, vd);
+                dest += 2;
+            }
+        } else if (orAlpha == 0) {
+            for (int j = 0; j < 16; ++j) {
+                *dest++ = 0;
+                buffer++;
+            }
+        } else {
+            for (int j = 0; j < 16; ++j)
+                *dest++ = qConvertRgb64ToRgb30<PixelOrder>(*buffer++);
+        }
     }
 
     for (; i < count; ++i)
author	Allan Sandfeld Jensen <allan.jensen@theqtcompany.com>	2015-06-30 13:26:25 +0200
committer	Allan Sandfeld Jensen <allan.jensen@theqtcompany.com>	2015-08-11 09:33:06 +0000
commit	55512950281260b85ea40c08239e459c76b696d2 (patch)
tree	98faefc98c2d7e81b65517d36ccfecfed9a79240 /src/gui/painting/qdrawhelper.cpp
parent	49ea02f435876d514898d587700a21e1a8972f09 (diff)