SSSE3 optimized store of 24-bit formats

By using the SSSE3 shuffle and align instructions, storing our quint24 format can be done much faster. This particularly improves conversions to RGB888. Change-Id: I179748706a33a43fd6f60f5c40287317418c8867 Reviewed-by: Gunnar Sletta <gunnar@sletta.org>
author: Allan Sandfeld Jensen <allan.jensen@theqtcompany.com> 2016-02-26 16:53:24 +0100
committer: Allan Sandfeld Jensen <allan.jensen@theqtcompany.com> 2016-02-29 19:39:27 +0000
commit: 1dd0c4bf1af1c90fde1449a81d41acbc62cf1934 (patch)
tree: ba289b9d840d327879a8399bb703e64440a96402 /src/gui/painting/qdrawhelper_ssse3.cpp
parent: 31a880f1f37412abf930bb7427c8538e9d09e765 (diff)
1 files changed, 57 insertions, 0 deletions
diff --git a/src/gui/painting/qdrawhelper_ssse3.cpp b/src/gui/painting/qdrawhelper_ssse3.cpp
index 09b25d30fd..e0d1bac6b1 100644
--- a/src/gui/painting/qdrawhelper_ssse3.cpp
+++ b/src/gui/painting/qdrawhelper_ssse3.cpp
@@ -176,6 +176,63 @@ void qt_blend_argb32_on_argb32_ssse3(uchar *destPixels, int dbpl,
     }
 }
 
+static inline void store_uint24_ssse3(uchar *dst, const uint *src, int len)
+{
+    int i = 0; // index of the next pixel to convert; shared by all three loops below
+    // Converts len 32-bit values from src into quint24 values at dst in three phases: scalar head, 16-pixel vector loop, scalar tail.
+    quint24 *dst24 = reinterpret_cast<quint24*>(dst);
+    // Scalar head: convert one pixel at a time until dst24 is 16-byte aligned (low four address bits zero) or len is exhausted.
+    for (; i < len && (reinterpret_cast<quintptr>(dst24) & 0xf); ++i)
+        *dst24++ = quint24(*src++); // quint24 is defined elsewhere; presumably it keeps the same three bytes as the masks below -- TODO confirm
+
+    // Shuffle masks: from each 32-bit element keep bytes 2, 1, 0 (in that order) and drop byte 3; control value 0x80 zeroes an output byte. The zeroed bytes never reach an output because every _mm_alignr_epi8 below shifts them out.
+    const __m128i shuffleMask1 = _mm_setr_epi8(char(0x80), char(0x80), char(0x80), char(0x80), 2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12); // zeros in bytes 0-3, 12 data bytes in bytes 4-15
+    const __m128i shuffleMask2 = _mm_setr_epi8(2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, char(0x80), char(0x80), char(0x80), char(0x80)); // 12 data bytes in bytes 0-11, zeros in bytes 12-15
+
+    const __m128i *inVectorPtr = (const __m128i *)src; // continues from where the scalar head stopped reading
+    __m128i *dstVectorPtr = (__m128i *)dst24; // 16-byte aligned whenever the vector loop below actually runs
+
+    for (; i < (len - 15); i += 16) { // runs only while at least 16 pixels remain
+        // Each iteration loads four 16-byte vectors (64 input bytes) and stores three (48 output bytes); assumes 4-byte uint and 3-byte quint24.
+        // Each output vector joins the unwritten tail of one shuffled input with the head of the next shuffled input via _mm_alignr_epi8.
+        __m128i srcVector1 = _mm_loadu_si128(inVectorPtr); // unaligned load: no alignment is assumed for src
+        ++inVectorPtr;
+        __m128i srcVector2 = _mm_loadu_si128(inVectorPtr);
+        ++inVectorPtr;
+        __m128i outputVector1 = _mm_shuffle_epi8(srcVector1, shuffleMask1);
+        __m128i outputVector2 = _mm_shuffle_epi8(srcVector2, shuffleMask2);
+        __m128i outputVector = _mm_alignr_epi8(outputVector2, outputVector1, 4); // all 12 data bytes of input 1, then the first 4 of input 2
+        _mm_store_si128(dstVectorPtr, outputVector); // aligned store; relies on the scalar head having aligned dst
+        ++dstVectorPtr;
+
+        srcVector1 = _mm_loadu_si128(inVectorPtr); // srcVector1 now holds input 3
+        ++inVectorPtr;
+        outputVector1 = _mm_shuffle_epi8(srcVector2, shuffleMask1); // input 2 again, this time with its data in the high bytes
+        outputVector2 = _mm_shuffle_epi8(srcVector1, shuffleMask2);
+        outputVector = _mm_alignr_epi8(outputVector2, outputVector1, 8); // the last 8 data bytes of input 2, then the first 8 of input 3
+        _mm_store_si128(dstVectorPtr, outputVector);
+        ++dstVectorPtr;
+
+        srcVector2 = _mm_loadu_si128(inVectorPtr); // srcVector2 now holds input 4
+        ++inVectorPtr;
+        outputVector1 = _mm_shuffle_epi8(srcVector1, shuffleMask1); // input 3 again, this time with its data in the high bytes
+        outputVector2 = _mm_shuffle_epi8(srcVector2, shuffleMask2);
+        outputVector = _mm_alignr_epi8(outputVector2, outputVector1, 12); // the last 4 data bytes of input 3, then all 12 of input 4
+        _mm_store_si128(dstVectorPtr, outputVector);
+        ++dstVectorPtr;
+    }
+    dst24 = reinterpret_cast<quint24*>(dstVectorPtr); // resume the scalar output pointer where the vector loop stopped
+    src = reinterpret_cast<const uint*>(inVectorPtr); // resume the scalar input pointer where the vector loop stopped
+
+    for (; i < len; ++i) // scalar tail: at most 15 pixels are left at this point
+        *dst24++ = quint24(*src++);
+}
+
+void QT_FASTCALL storePixelsBPP24_ssse3(uchar *dest, const uint *src, int index, int count) // non-static entry point that forwards to store_uint24_ssse3
+{
+    store_uint24_ssse3(dest + index * 3, src, count); // output starts index * 3 elements into dest (3 bytes per pixel, assuming one-byte uchar); src is read from its start
+}
+
 QT_END_NAMESPACE
 
 #endif // QT_COMPILER_SUPPORTS_SSSE3
author	Allan Sandfeld Jensen <allan.jensen@theqtcompany.com>	2016-02-26 16:53:24 +0100
committer	Allan Sandfeld Jensen <allan.jensen@theqtcompany.com>	2016-02-29 19:39:27 +0000
commit	1dd0c4bf1af1c90fde1449a81d41acbc62cf1934 (patch)
tree	ba289b9d840d327879a8399bb703e64440a96402 /src/gui/painting/qdrawhelper_ssse3.cpp
parent	31a880f1f37412abf930bb7427c8538e9d09e765 (diff)