From 38aafe1a17c5d3be81edf798dd4dbe745727f4fd Mon Sep 17 00:00:00 2001 From: Allan Sandfeld Jensen Date: Thu, 29 Jan 2015 12:25:06 +0100 Subject: Optimize unpremultiply on SSE4.1 Adds an SSE4.1-optimized version of qUnpremultiply and uses it in most of the drawing conversion methods. This gives a speed-up of a little over 2x. Change-Id: Ieb858a94ada1eb86d7af715ac1a100f1587f360d Reviewed-by: Gunnar Sletta --- src/gui/painting/qdrawhelper.cpp | 42 ++++++++++++++++++++++++++- src/gui/painting/qdrawingprimitive_sse2_p.h | 22 ++++++++++++++ tests/auto/gui/painting/qcolor/qcolor.pro | 2 +- tests/auto/gui/painting/qcolor/tst_qcolor.cpp | 32 ++++++++++++++++++++ 4 files changed, 96 insertions(+), 2 deletions(-) diff --git a/src/gui/painting/qdrawhelper.cpp b/src/gui/painting/qdrawhelper.cpp index ac73d78afe..83372b198d 100644 --- a/src/gui/painting/qdrawhelper.cpp +++ b/src/gui/painting/qdrawhelper.cpp @@ -45,6 +45,7 @@ #include #include #include +#include #include #if defined(QT_COMPILER_SUPPORTS_MIPS_DSP) || defined(QT_COMPILER_SUPPORTS_MIPS_DSPR2) #include @@ -572,6 +573,18 @@ static const uint *QT_FASTCALL convertARGB32FromARGB32PM(uint *buffer, const uin return buffer; } +#if QT_COMPILER_SUPPORTS_HERE(SSE4_1) +QT_FUNCTION_TARGET(SSE4_1) +static const uint *QT_FASTCALL convertARGB32FromARGB32PM_sse4(uint *buffer, const uint *src, int count, + const QPixelLayout *, const QRgb *) +{ + for (int i = 0; i < count; ++i) + buffer[i] = qUnpremultiply_sse4(src[i]); + return buffer; +} +#endif + + static const uint *QT_FASTCALL convertRGBA8888PMFromARGB32PM(uint *buffer, const uint *src, int count, const QPixelLayout *, const QRgb *) { @@ -588,6 +601,17 @@ static const uint *QT_FASTCALL convertRGBA8888FromARGB32PM(uint *buffer, const u return buffer; } +#if QT_COMPILER_SUPPORTS_HERE(SSE4_1) +QT_FUNCTION_TARGET(SSE4_1) +static const uint *QT_FASTCALL convertRGBA8888FromARGB32PM_sse4(uint *buffer, const uint *src, int count, + const QPixelLayout *, const QRgb *) +{ + for (int i = 0; i 
< count; ++i) + buffer[i] = ARGB2RGBA(qUnpremultiply_sse4(src[i])); + return buffer; +} +#endif + static const uint *QT_FASTCALL convertRGBXFromRGB32(uint *buffer, const uint *src, int count, const QPixelLayout *, const QRgb *) { @@ -604,6 +628,17 @@ static const uint *QT_FASTCALL convertRGBXFromARGB32PM(uint *buffer, const uint return buffer; } +#if QT_COMPILER_SUPPORTS_HERE(SSE4_1) +QT_FUNCTION_TARGET(SSE4_1) +static const uint *QT_FASTCALL convertRGBXFromARGB32PM_sse4(uint *buffer, const uint *src, int count, + const QPixelLayout *, const QRgb *) +{ + for (int i = 0; i < count; ++i) + buffer[i] = ARGB2RGBA(0xff000000 | qUnpremultiply_sse4(src[i])); + return buffer; +} +#endif + template static const uint *QT_FASTCALL convertA2RGB30PMToARGB32PM(uint *buffer, const uint *src, int count, const QPixelLayout *, const QRgb *) @@ -6879,10 +6914,15 @@ void qInitDrawhelperAsm() } #endif // SSSE3 -#if QT_COMPILER_SUPPORTS_HERE(SSE4_1) && !defined(__SSE4_1__) +#if QT_COMPILER_SUPPORTS_HERE(SSE4_1) if (qCpuHasFeature(SSE4_1)) { +#if !defined(__SSE4_1__) qPixelLayouts[QImage::Format_ARGB32].convertToARGB32PM = convertARGB32ToARGB32PM_sse4; qPixelLayouts[QImage::Format_RGBA8888].convertToARGB32PM = convertRGBA8888ToARGB32PM_sse4; +#endif + qPixelLayouts[QImage::Format_ARGB32].convertFromARGB32PM = convertARGB32FromARGB32PM_sse4; + qPixelLayouts[QImage::Format_RGBA8888].convertFromARGB32PM = convertRGBA8888FromARGB32PM_sse4; + qPixelLayouts[QImage::Format_RGBX8888].convertFromARGB32PM = convertRGBXFromARGB32PM_sse4; } #endif diff --git a/src/gui/painting/qdrawingprimitive_sse2_p.h b/src/gui/painting/qdrawingprimitive_sse2_p.h index 2b4cdc286e..deac31b797 100644 --- a/src/gui/painting/qdrawingprimitive_sse2_p.h +++ b/src/gui/painting/qdrawingprimitive_sse2_p.h @@ -236,4 +236,26 @@ QT_END_NAMESPACE #endif // __SSE2__ +QT_BEGIN_NAMESPACE +#if QT_COMPILER_SUPPORTS_HERE(SSE4_1) +QT_FUNCTION_TARGET(SSE4_1) +inline QRgb qUnpremultiply_sse4(QRgb p) +{ + const uint alpha = qAlpha(p); + 
const uint invAlpha = qt_inv_premul_factor[alpha]; + const __m128i via = _mm_set1_epi32(invAlpha); + const __m128i vr = _mm_set1_epi32(0x8000); + __m128i vl = _mm_unpacklo_epi8(_mm_cvtsi32_si128(p), _mm_setzero_si128()); + vl = _mm_unpacklo_epi16(vl, _mm_setzero_si128()); + vl = _mm_mullo_epi32(vl, via); + vl = _mm_add_epi32(vl, vr); + vl = _mm_srai_epi32(vl, 16); + vl = _mm_insert_epi32(vl, alpha, 3); + vl = _mm_packus_epi32(vl, _mm_setzero_si128()); + vl = _mm_packus_epi16(vl, _mm_setzero_si128()); + return _mm_cvtsi128_si32(vl); +} +#endif +QT_END_NAMESPACE + #endif // QDRAWINGPRIMITIVE_SSE2_P_H diff --git a/tests/auto/gui/painting/qcolor/qcolor.pro b/tests/auto/gui/painting/qcolor/qcolor.pro index f7439c243c..44d65bb50b 100644 --- a/tests/auto/gui/painting/qcolor/qcolor.pro +++ b/tests/auto/gui/painting/qcolor/qcolor.pro @@ -2,4 +2,4 @@ CONFIG += testcase CONFIG += parallel_test TARGET = tst_qcolor SOURCES += tst_qcolor.cpp -QT += testlib +QT += testlib gui-private core-private diff --git a/tests/auto/gui/painting/qcolor/tst_qcolor.cpp b/tests/auto/gui/painting/qcolor/tst_qcolor.cpp index 3b3334ba1f..95f1da1354 100644 --- a/tests/auto/gui/painting/qcolor/tst_qcolor.cpp +++ b/tests/auto/gui/painting/qcolor/tst_qcolor.cpp @@ -38,6 +38,7 @@ #include #include +#include class tst_QColor : public QObject { @@ -102,6 +103,9 @@ private slots: void achromaticHslHue(); + void premultiply(); + void unpremultiply_sse4(); + #ifdef Q_DEAD_CODE_FROM_QT4_X11 void setallowX11ColorNames(); #endif @@ -1432,5 +1436,33 @@ void tst_QColor::setallowX11ColorNames() } #endif +void tst_QColor::premultiply() +{ + // Tests that qPremultiply(qUnpremultiply(x)) returns x. + for (uint a = 0; a < 256; a++) { + for (uint c = 0; c <= a; c++) { + QRgb p = qRgba(c, a-c, c, a); + QCOMPARE(p, qPremultiply(qUnpremultiply(p))); + } + } +} + +void tst_QColor::unpremultiply_sse4() +{ + // Tests that qUnpremultiply_sse4 returns the same as qUnpremultiply. 
+#if QT_COMPILER_SUPPORTS_HERE(SSE4_1) + if (qCpuHasFeature(SSE4_1)) { + for (uint a = 0; a < 256; a++) { + for (uint c = 0; c <= a; c++) { + QRgb p = qRgba(c, a-c, c, a); + QCOMPARE(qUnpremultiply(p), qUnpremultiply_sse4(p)); + } + } + return; + } +#endif + QSKIP("SSE4 not supported on this CPU."); +} + QTEST_MAIN(tst_QColor) #include "tst_qcolor.moc" -- cgit v1.2.3