From 38aafe1a17c5d3be81edf798dd4dbe745727f4fd Mon Sep 17 00:00:00 2001 From: Allan Sandfeld Jensen Date: Thu, 29 Jan 2015 12:25:06 +0100 Subject: Optimize unpremultiply on SSE4.1 Adds an SSE4.1 optimized version of qUnpremultiply and uses it in the most drawing conversions methods. This gives a speed-up of little over 2x. Change-Id: Ieb858a94ada1eb86d7af715ac1a100f1587f360d Reviewed-by: Gunnar Sletta --- src/gui/painting/qdrawhelper.cpp | 42 ++++++++++++++++++++++++++++- src/gui/painting/qdrawingprimitive_sse2_p.h | 22 +++++++++++++++ 2 files changed, 63 insertions(+), 1 deletion(-) (limited to 'src/gui/painting') diff --git a/src/gui/painting/qdrawhelper.cpp b/src/gui/painting/qdrawhelper.cpp index ac73d78afe..83372b198d 100644 --- a/src/gui/painting/qdrawhelper.cpp +++ b/src/gui/painting/qdrawhelper.cpp @@ -45,6 +45,7 @@ #include #include #include +#include #include #if defined(QT_COMPILER_SUPPORTS_MIPS_DSP) || defined(QT_COMPILER_SUPPORTS_MIPS_DSPR2) #include @@ -572,6 +573,18 @@ static const uint *QT_FASTCALL convertARGB32FromARGB32PM(uint *buffer, const uin return buffer; } +#if QT_COMPILER_SUPPORTS_HERE(SSE4_1) +QT_FUNCTION_TARGET(SSE4_1) +static const uint *QT_FASTCALL convertARGB32FromARGB32PM_sse4(uint *buffer, const uint *src, int count, + const QPixelLayout *, const QRgb *) +{ + for (int i = 0; i < count; ++i) + buffer[i] = qUnpremultiply_sse4(src[i]); + return buffer; +} +#endif + + static const uint *QT_FASTCALL convertRGBA8888PMFromARGB32PM(uint *buffer, const uint *src, int count, const QPixelLayout *, const QRgb *) { @@ -588,6 +601,17 @@ static const uint *QT_FASTCALL convertRGBA8888FromARGB32PM(uint *buffer, const u return buffer; } +#if QT_COMPILER_SUPPORTS_HERE(SSE4_1) +QT_FUNCTION_TARGET(SSE4_1) +static const uint *QT_FASTCALL convertRGBA8888FromARGB32PM_sse4(uint *buffer, const uint *src, int count, + const QPixelLayout *, const QRgb *) +{ + for (int i = 0; i < count; ++i) + buffer[i] = ARGB2RGBA(qUnpremultiply_sse4(src[i])); + return buffer; +} +#endif + static const uint *QT_FASTCALL convertRGBXFromRGB32(uint *buffer, const uint *src, int count, const QPixelLayout *, const QRgb *) { @@ -604,6 +628,17 @@ static const uint *QT_FASTCALL convertRGBXFromARGB32PM(uint *buffer, const uint return buffer; } +#if QT_COMPILER_SUPPORTS_HERE(SSE4_1) +QT_FUNCTION_TARGET(SSE4_1) +static const uint *QT_FASTCALL convertRGBXFromARGB32PM_sse4(uint *buffer, const uint *src, int count, + const QPixelLayout *, const QRgb *) +{ + for (int i = 0; i < count; ++i) + buffer[i] = ARGB2RGBA(0xff000000 | qUnpremultiply_sse4(src[i])); + return buffer; +} +#endif + template static const uint *QT_FASTCALL convertA2RGB30PMToARGB32PM(uint *buffer, const uint *src, int count, const QPixelLayout *, const QRgb *) @@ -6879,10 +6914,15 @@ void qInitDrawhelperAsm() } #endif // SSSE3 -#if QT_COMPILER_SUPPORTS_HERE(SSE4_1) && !defined(__SSE4_1__) +#if QT_COMPILER_SUPPORTS_HERE(SSE4_1) if (qCpuHasFeature(SSE4_1)) { +#if !defined(__SSE4_1__) qPixelLayouts[QImage::Format_ARGB32].convertToARGB32PM = convertARGB32ToARGB32PM_sse4; qPixelLayouts[QImage::Format_RGBA8888].convertToARGB32PM = convertRGBA8888ToARGB32PM_sse4; +#endif + qPixelLayouts[QImage::Format_ARGB32].convertFromARGB32PM = convertARGB32FromARGB32PM_sse4; + qPixelLayouts[QImage::Format_RGBA8888].convertFromARGB32PM = convertRGBA8888FromARGB32PM_sse4; + qPixelLayouts[QImage::Format_RGBX8888].convertFromARGB32PM = convertRGBXFromARGB32PM_sse4; } #endif diff --git a/src/gui/painting/qdrawingprimitive_sse2_p.h b/src/gui/painting/qdrawingprimitive_sse2_p.h index 2b4cdc286e..deac31b797 100644 --- a/src/gui/painting/qdrawingprimitive_sse2_p.h +++ b/src/gui/painting/qdrawingprimitive_sse2_p.h @@ -236,4 +236,26 @@ QT_END_NAMESPACE #endif // __SSE2__ +QT_BEGIN_NAMESPACE +#if QT_COMPILER_SUPPORTS_HERE(SSE4_1) +QT_FUNCTION_TARGET(SSE4_1) +inline QRgb qUnpremultiply_sse4(QRgb p) +{ + const uint alpha = qAlpha(p); + const uint invAlpha = qt_inv_premul_factor[alpha]; + const __m128i via = _mm_set1_epi32(invAlpha); + const __m128i vr = _mm_set1_epi32(0x8000); + __m128i vl = _mm_unpacklo_epi8(_mm_cvtsi32_si128(p), _mm_setzero_si128()); + vl = _mm_unpacklo_epi16(vl, _mm_setzero_si128()); + vl = _mm_mullo_epi32(vl, via); + vl = _mm_add_epi32(vl, vr); + vl = _mm_srai_epi32(vl, 16); + vl = _mm_insert_epi32(vl, alpha, 3); + vl = _mm_packus_epi32(vl, _mm_setzero_si128()); + vl = _mm_packus_epi16(vl, _mm_setzero_si128()); + return _mm_cvtsi128_si32(vl); +} +#endif +QT_END_NAMESPACE + #endif // QDRAWINGPRIMITIVE_SSE2_P_H -- cgit v1.2.3