diff options
author | Allan Sandfeld Jensen <allan.jensen@theqtcompany.com> | 2016-02-26 16:53:24 +0100 |
---|---|---|
committer | Allan Sandfeld Jensen <allan.jensen@theqtcompany.com> | 2016-02-29 19:39:27 +0000 |
commit | 1dd0c4bf1af1c90fde1449a81d41acbc62cf1934 (patch) | |
tree | ba289b9d840d327879a8399bb703e64440a96402 /src/gui | |
parent | 31a880f1f37412abf930bb7427c8538e9d09e765 (diff) |
SSSE3 optimized store of 24-bit formats
Using the SSSE3 shuffle and align instructions, storing our quint24 format
can be done much faster. In particular, this improves conversions to RGB888.
Change-Id: I179748706a33a43fd6f60f5c40287317418c8867
Reviewed-by: Gunnar Sletta <gunnar@sletta.org>
Diffstat (limited to 'src/gui')
-rw-r--r-- | src/gui/painting/qdrawhelper.cpp | 4 | ||||
-rw-r--r-- | src/gui/painting/qdrawhelper_p.h | 2 | ||||
-rw-r--r-- | src/gui/painting/qdrawhelper_ssse3.cpp | 57 |
3 files changed, 61 insertions, 2 deletions
diff --git a/src/gui/painting/qdrawhelper.cpp b/src/gui/painting/qdrawhelper.cpp index 1143123717..112dbd4738 100644 --- a/src/gui/painting/qdrawhelper.cpp +++ b/src/gui/painting/qdrawhelper.cpp @@ -961,7 +961,7 @@ const FetchPixelsFunc qFetchPixels[QPixelLayout::BPPCount] = { fetchPixels<QPixelLayout::BPP32> // BPP32 }; -const StorePixelsFunc qStorePixels[QPixelLayout::BPPCount] = { +StorePixelsFunc qStorePixels[QPixelLayout::BPPCount] = { 0, // BPPNone storePixels<QPixelLayout::BPP1MSB>, // BPP1MSB storePixels<QPixelLayout::BPP1LSB>, // BPP1LSB @@ -6375,10 +6375,12 @@ static void qInitDrawhelperFunctions() int w, int h, int const_alpha); + extern void QT_FASTCALL storePixelsBPP24_ssse3(uchar *dest, const uint *src, int index, int count); qBlendFunctions[QImage::Format_RGB32][QImage::Format_ARGB32_Premultiplied] = qt_blend_argb32_on_argb32_ssse3; qBlendFunctions[QImage::Format_ARGB32_Premultiplied][QImage::Format_ARGB32_Premultiplied] = qt_blend_argb32_on_argb32_ssse3; qBlendFunctions[QImage::Format_RGBX8888][QImage::Format_RGBA8888_Premultiplied] = qt_blend_argb32_on_argb32_ssse3; qBlendFunctions[QImage::Format_RGBA8888_Premultiplied][QImage::Format_RGBA8888_Premultiplied] = qt_blend_argb32_on_argb32_ssse3; + qStorePixels[QPixelLayout::BPP24] = storePixelsBPP24_ssse3; } #endif // SSSE3 diff --git a/src/gui/painting/qdrawhelper_p.h b/src/gui/painting/qdrawhelper_p.h index af52ed0b43..21af6039f8 100644 --- a/src/gui/painting/qdrawhelper_p.h +++ b/src/gui/painting/qdrawhelper_p.h @@ -1213,7 +1213,7 @@ typedef void (QT_FASTCALL *StorePixelsFunc)(uchar *dest, const uint *src, int in extern QPixelLayout qPixelLayouts[QImage::NImageFormats]; extern const FetchPixelsFunc qFetchPixels[QPixelLayout::BPPCount]; -extern const StorePixelsFunc qStorePixels[QPixelLayout::BPPCount]; +extern StorePixelsFunc qStorePixels[QPixelLayout::BPPCount]; diff --git a/src/gui/painting/qdrawhelper_ssse3.cpp b/src/gui/painting/qdrawhelper_ssse3.cpp index 09b25d30fd..e0d1bac6b1 100644 --- 
a/src/gui/painting/qdrawhelper_ssse3.cpp +++ b/src/gui/painting/qdrawhelper_ssse3.cpp @@ -176,6 +176,63 @@ void qt_blend_argb32_on_argb32_ssse3(uchar *destPixels, int dbpl, } } +static inline void store_uint24_ssse3(uchar *dst, const uint *src, int len) +{ + int i = 0; + + quint24 *dst24 = reinterpret_cast<quint24*>(dst); + // Align dst on 16 bytes + for (; i < len && (reinterpret_cast<quintptr>(dst24) & 0xf); ++i) + *dst24++ = quint24(*src++); + + // Shuffle masks for first and second half of every output, all outputs are aligned so the shuffled ends are not used. + const __m128i shuffleMask1 = _mm_setr_epi8(char(0x80), char(0x80), char(0x80), char(0x80), 2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12); + const __m128i shuffleMask2 = _mm_setr_epi8(2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12, char(0x80), char(0x80), char(0x80), char(0x80)); + + const __m128i *inVectorPtr = (const __m128i *)src; + __m128i *dstVectorPtr = (__m128i *)dst24; + + for (; i < (len - 15); i += 16) { + // Load four vectors, store three. + // Create each output vector by combining two shuffled input vectors. 
+ __m128i srcVector1 = _mm_loadu_si128(inVectorPtr); + ++inVectorPtr; + __m128i srcVector2 = _mm_loadu_si128(inVectorPtr); + ++inVectorPtr; + __m128i outputVector1 = _mm_shuffle_epi8(srcVector1, shuffleMask1); + __m128i outputVector2 = _mm_shuffle_epi8(srcVector2, shuffleMask2); + __m128i outputVector = _mm_alignr_epi8(outputVector2, outputVector1, 4); + _mm_store_si128(dstVectorPtr, outputVector); + ++dstVectorPtr; + + srcVector1 = _mm_loadu_si128(inVectorPtr); + ++inVectorPtr; + outputVector1 = _mm_shuffle_epi8(srcVector2, shuffleMask1); + outputVector2 = _mm_shuffle_epi8(srcVector1, shuffleMask2); + outputVector = _mm_alignr_epi8(outputVector2, outputVector1, 8); + _mm_store_si128(dstVectorPtr, outputVector); + ++dstVectorPtr; + + srcVector2 = _mm_loadu_si128(inVectorPtr); + ++inVectorPtr; + outputVector1 = _mm_shuffle_epi8(srcVector1, shuffleMask1); + outputVector2 = _mm_shuffle_epi8(srcVector2, shuffleMask2); + outputVector = _mm_alignr_epi8(outputVector2, outputVector1, 12); + _mm_store_si128(dstVectorPtr, outputVector); + ++dstVectorPtr; + } + dst24 = reinterpret_cast<quint24*>(dstVectorPtr); + src = reinterpret_cast<const uint*>(inVectorPtr); + + for (; i < len; ++i) + *dst24++ = quint24(*src++); +} + +void QT_FASTCALL storePixelsBPP24_ssse3(uchar *dest, const uint *src, int index, int count) +{ + store_uint24_ssse3(dest + index * 3, src, count); +} + QT_END_NAMESPACE #endif // QT_COMPILER_SUPPORTS_SSSE3 |