From dfa434a9799618eba48a84cbad279262679aa108 Mon Sep 17 00:00:00 2001
From: Allan Sandfeld Jensen
Date: Fri, 27 Apr 2018 10:37:27 +0200
Subject: Optimize unpremultiply using SSE rcp

Change-Id: I255031d354b0fde7abe8366ea2c86a35f9f24afd
Reviewed-by: Thiago Macieira
---
 src/gui/painting/qdrawhelper.cpp            |   4 +
 src/gui/painting/qdrawhelper_sse4.cpp       | 190 ++++++++++++++++++++++++++--
 src/gui/painting/qdrawingprimitive_sse2_p.h |  69 ++++++----
 src/gui/painting/qrgba64.h                  |   4 +
 src/gui/painting/qrgba64_p.h                |   3 +-
 5 files changed, 239 insertions(+), 31 deletions(-)

(limited to 'src/gui/painting')

diff --git a/src/gui/painting/qdrawhelper.cpp b/src/gui/painting/qdrawhelper.cpp
index 3be1af026c..fb9b500941 100644
--- a/src/gui/painting/qdrawhelper.cpp
+++ b/src/gui/painting/qdrawhelper.cpp
@@ -6293,6 +6293,8 @@ static void qInitDrawhelperFunctions()
                                                                const QVector<QRgb> *, QDitherInfo *);
         extern void QT_FASTCALL storeRGBXFromARGB32PM_sse4(uchar *dest, const uint *src, int index, int count,
                                                            const QVector<QRgb> *, QDitherInfo *);
+        extern void QT_FASTCALL destStore64ARGB32_sse4(QRasterBuffer *rasterBuffer, int x, int y, const QRgba64 *buffer, int length);
+        extern void QT_FASTCALL destStore64RGBA8888_sse4(QRasterBuffer *rasterBuffer, int x, int y, const QRgba64 *buffer, int length);
         qPixelLayouts[QImage::Format_ARGB32].fetchToARGB32PM = fetchARGB32ToARGB32PM_sse4;
         qPixelLayouts[QImage::Format_ARGB32].convertToARGB32PM = convertARGB32ToARGB32PM_sse4;
         qPixelLayouts[QImage::Format_RGBA8888].fetchToARGB32PM = fetchRGBA8888ToARGB32PM_sse4;
@@ -6302,6 +6304,8 @@ static void qInitDrawhelperFunctions()
         qPixelLayouts[QImage::Format_RGBX8888].storeFromARGB32PM = storeRGBXFromARGB32PM_sse4;
         qPixelLayouts[QImage::Format_A2BGR30_Premultiplied].storeFromARGB32PM = storeA2RGB30PMFromARGB32PM_sse4<PixelOrderBGR>;
         qPixelLayouts[QImage::Format_A2RGB30_Premultiplied].storeFromARGB32PM = storeA2RGB30PMFromARGB32PM_sse4<PixelOrderRGB>;
+        destStoreProc64[QImage::Format_ARGB32] = destStore64ARGB32_sse4;
+        destStoreProc64[QImage::Format_RGBA8888] = destStore64RGBA8888_sse4;
     }
 #endif
 
diff --git a/src/gui/painting/qdrawhelper_sse4.cpp b/src/gui/painting/qdrawhelper_sse4.cpp
index 4696493715..0b6f963168 100644
--- a/src/gui/painting/qdrawhelper_sse4.cpp
+++ b/src/gui/painting/qdrawhelper_sse4.cpp
@@ -39,6 +39,7 @@
 
 #include <private/qdrawhelper_p.h>
 #include <private/qdrawingprimitive_sse2_p.h>
+#include <private/qpaintengine_raster_p.h>
 
 #if defined(QT_COMPILER_SUPPORTS_SSE4_1)
 
@@ -93,6 +94,171 @@ static void convertARGBToARGB32PM_sse4(uint *buffer, const uint *src, int count)
     }
 }
 
+static inline __m128 reciprocal_mul_ps(__m128 a, float mul)
+{
+    __m128 ia = _mm_rcp_ps(a); // Approximate 1/a
+    // Improve precision of ia using Newton-Raphson
+    ia = _mm_sub_ps(_mm_add_ps(ia, ia), _mm_mul_ps(ia, _mm_mul_ps(ia, a)));
+    ia = _mm_mul_ps(ia, _mm_set1_ps(mul));
+    return ia;
+}
+
+template<bool RGBA, bool RGBx>
+static inline void convertARGBFromARGB32PM_sse4(uint *buffer, const uint *src, int count)
+{
+    int i = 0;
+    const __m128i alphaMask = _mm_set1_epi32(0xff000000);
+    const __m128i rgbaMask = _mm_setr_epi8(2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15);
+    const __m128i zero = _mm_setzero_si128();
+
+    for (; i < count - 3; i += 4) {
+        __m128i srcVector = _mm_loadu_si128((const __m128i *)&src[i]);
+        if (!_mm_testz_si128(srcVector, alphaMask)) {
+            if (!_mm_testc_si128(srcVector, alphaMask)) {
+                __m128i srcVectorAlpha = _mm_srli_epi32(srcVector, 24);
+                if (RGBA)
+                    srcVector = _mm_shuffle_epi8(srcVector, rgbaMask);
+                const __m128 a = _mm_cvtepi32_ps(srcVectorAlpha);
+                const __m128 ia = reciprocal_mul_ps(a, 255.0f);
+                __m128i src1 = _mm_unpacklo_epi8(srcVector, zero);
+                __m128i src3 = _mm_unpackhi_epi8(srcVector, zero);
+                __m128i src2 = _mm_unpackhi_epi16(src1, zero);
+                __m128i src4 = _mm_unpackhi_epi16(src3, zero);
+                src1 = _mm_unpacklo_epi16(src1, zero);
+                src3 = _mm_unpacklo_epi16(src3, zero);
+                __m128 ia1 = _mm_shuffle_ps(ia, ia, _MM_SHUFFLE(0, 0, 0, 0));
+                __m128 ia2 = _mm_shuffle_ps(ia, ia, _MM_SHUFFLE(1, 1, 1, 1));
+                __m128 ia3 = _mm_shuffle_ps(ia, ia, _MM_SHUFFLE(2, 2, 2, 2));
+                __m128 ia4 = _mm_shuffle_ps(ia, ia, _MM_SHUFFLE(3, 3, 3, 3));
+                src1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(src1), ia1));
+                src2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(src2), ia2));
+                src3 = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(src3), ia3));
+                src4 = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(src4), ia4));
+                src1 = _mm_packus_epi32(src1, src2);
+                src3 = _mm_packus_epi32(src3, src4);
+                src1 = _mm_packus_epi16(src1, src3);
+                // Handle potential alpha == 0 values:
+                __m128i srcVectorAlphaMask = _mm_cmpeq_epi32(srcVectorAlpha, zero);
+                src1 = _mm_andnot_si128(srcVectorAlphaMask, src1);
+                // Fixup alpha values:
+                if (RGBx)
+                    srcVector = _mm_or_si128(src1, alphaMask);
+                else
+                    srcVector = _mm_blendv_epi8(src1, srcVector, alphaMask);
+                _mm_storeu_si128((__m128i *)&buffer[i], srcVector);
+            } else {
+                if (RGBA)
+                    _mm_storeu_si128((__m128i *)&buffer[i], _mm_shuffle_epi8(srcVector, rgbaMask));
+                else if (buffer != src)
+                    _mm_storeu_si128((__m128i *)&buffer[i], srcVector);
+            }
+        } else {
+            if (RGBx)
+                _mm_storeu_si128((__m128i *)&buffer[i], alphaMask);
+            else
+                _mm_storeu_si128((__m128i *)&buffer[i], zero);
+        }
+    }
+
+    SIMD_EPILOGUE(i, count, 3) {
+        uint v = qUnpremultiply_sse4(src[i]);
+        if (RGBx)
+            v = 0xff000000 | v;
+        if (RGBA)
+            v = ARGB2RGBA(v);
+        buffer[i] = v;
+    }
+}
+
+template<bool RGBA>
+static inline void convertARGBFromRGBA64PM_sse4(uint *buffer, const QRgba64 *src, int count)
+{
+    int i = 0;
+    const __m128i alphaMask = _mm_set1_epi64x(Q_UINT64_C(0xffff) << 48);
+    const __m128i alphaMask32 = _mm_set1_epi32(0xff000000);
+    const __m128i rgbaMask = _mm_setr_epi8(2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15);
+    const __m128i zero = _mm_setzero_si128();
+
+    for (; i < count - 3; i += 4) {
+        __m128i srcVector1 = _mm_loadu_si128((const __m128i *)&src[i]);
+        __m128i srcVector2 = _mm_loadu_si128((const __m128i *)&src[i + 2]);
+        bool transparent1 = _mm_testz_si128(srcVector1, alphaMask);
+        bool opaque1 = _mm_testc_si128(srcVector1, alphaMask);
+        bool transparent2 = _mm_testz_si128(srcVector2, alphaMask);
+        bool opaque2 = _mm_testc_si128(srcVector2, alphaMask);
+
+        if (!(transparent1 && transparent2)) {
+            if (!(opaque1 && opaque2)) {
+                __m128i srcVector1Alpha = _mm_srli_epi64(srcVector1, 48);
+                __m128i srcVector2Alpha = _mm_srli_epi64(srcVector2, 48);
+                __m128i srcVectorAlpha = _mm_packus_epi32(srcVector1Alpha, srcVector2Alpha);
+                const __m128 a = _mm_cvtepi32_ps(srcVectorAlpha);
+                // Convert srcVectorAlpha to final 8-bit alpha channel
+                srcVectorAlpha = _mm_add_epi32(srcVectorAlpha, _mm_set1_epi32(128));
+                srcVectorAlpha = _mm_sub_epi32(srcVectorAlpha, _mm_srli_epi32(srcVectorAlpha, 8));
+                srcVectorAlpha = _mm_srli_epi32(srcVectorAlpha, 8);
+                srcVectorAlpha = _mm_slli_epi32(srcVectorAlpha, 24);
+                const __m128 ia = reciprocal_mul_ps(a, 255.0f);
+                __m128i src1 = _mm_unpacklo_epi16(srcVector1, zero);
+                __m128i src2 = _mm_unpackhi_epi16(srcVector1, zero);
+                __m128i src3 = _mm_unpacklo_epi16(srcVector2, zero);
+                __m128i src4 = _mm_unpackhi_epi16(srcVector2, zero);
+                __m128 ia1 = _mm_shuffle_ps(ia, ia, _MM_SHUFFLE(0, 0, 0, 0));
+                __m128 ia2 = _mm_shuffle_ps(ia, ia, _MM_SHUFFLE(1, 1, 1, 1));
+                __m128 ia3 = _mm_shuffle_ps(ia, ia, _MM_SHUFFLE(2, 2, 2, 2));
+                __m128 ia4 = _mm_shuffle_ps(ia, ia, _MM_SHUFFLE(3, 3, 3, 3));
+                src1 = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(src1), ia1));
+                src2 = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(src2), ia2));
+                src3 = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(src3), ia3));
+                src4 = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(src4), ia4));
+                src1 = _mm_packus_epi32(src1, src2);
+                src3 = _mm_packus_epi32(src3, src4);
+                // Handle potential alpha == 0 values:
+                __m128i srcVector1AlphaMask = _mm_cmpeq_epi64(srcVector1Alpha, zero);
+                __m128i srcVector2AlphaMask = _mm_cmpeq_epi64(srcVector2Alpha, zero);
+                src1 = _mm_andnot_si128(srcVector1AlphaMask, src1);
+                src3 = _mm_andnot_si128(srcVector2AlphaMask, src3);
+                src1 = _mm_packus_epi16(src1, src3);
+                // Fixup alpha values:
+                src1 = _mm_blendv_epi8(src1, srcVectorAlpha, alphaMask32);
+                // Fix RGB order
+                if (!RGBA)
+                    src1 = _mm_shuffle_epi8(src1, rgbaMask);
+                _mm_storeu_si128((__m128i *)&buffer[i], src1);
+            } else {
+                __m128i src1 = _mm_unpacklo_epi16(srcVector1, zero);
+                __m128i src2 = _mm_unpackhi_epi16(srcVector1, zero);
+                __m128i src3 = _mm_unpacklo_epi16(srcVector2, zero);
+                __m128i src4 = _mm_unpackhi_epi16(srcVector2, zero);
+                src1 = _mm_add_epi32(src1, _mm_set1_epi32(128));
+                src2 = _mm_add_epi32(src2, _mm_set1_epi32(128));
+                src3 = _mm_add_epi32(src3, _mm_set1_epi32(128));
+                src4 = _mm_add_epi32(src4, _mm_set1_epi32(128));
+                src1 = _mm_sub_epi32(src1, _mm_srli_epi32(src1, 8));
+                src2 = _mm_sub_epi32(src2, _mm_srli_epi32(src2, 8));
+                src3 = _mm_sub_epi32(src3, _mm_srli_epi32(src3, 8));
+                src4 = _mm_sub_epi32(src4, _mm_srli_epi32(src4, 8));
+                src1 = _mm_srli_epi32(src1, 8);
+                src2 = _mm_srli_epi32(src2, 8);
+                src3 = _mm_srli_epi32(src3, 8);
+                src4 = _mm_srli_epi32(src4, 8);
+                src1 = _mm_packus_epi32(src1, src2);
+                src3 = _mm_packus_epi32(src3, src4);
+                src1 = _mm_packus_epi16(src1, src3);
+                if (!RGBA)
+                    src1 = _mm_shuffle_epi8(src1, rgbaMask);
+                _mm_storeu_si128((__m128i *)&buffer[i], src1);
+            }
+        } else {
+            _mm_storeu_si128((__m128i *)&buffer[i], zero);
+        }
+    }
+
+    SIMD_EPILOGUE(i, count, 3) {
+        buffer[i] = qConvertRgba64ToRgb32_sse4<RGBA ? PixelOrderRGB : PixelOrderBGR>(src[i]);
+    }
+}
+
 void QT_FASTCALL convertARGB32ToARGB32PM_sse4(uint *buffer, int count, const QVector<QRgb> *)
 {
     convertARGBToARGB32PM_sse4<false>(buffer, buffer, count);
@@ -121,32 +287,28 @@ void QT_FASTCALL storeRGB32FromARGB32PM_sse4(uchar *dest, const uint *src, int i
                                              const QVector<QRgb> *, QDitherInfo *)
 {
     uint *d = reinterpret_cast<uint *>(dest) + index;
-    for (int i = 0; i < count; ++i)
-        d[i] = 0xff000000 | qUnpremultiply_sse4(src[i]);
+    convertARGBFromARGB32PM_sse4<false, true>(d, src, count);
 }
 
 void QT_FASTCALL storeARGB32FromARGB32PM_sse4(uchar *dest, const uint *src, int index, int count,
                                               const QVector<QRgb> *, QDitherInfo *)
 {
     uint *d = reinterpret_cast<uint *>(dest) + index;
-    for (int i = 0; i < count; ++i)
-        d[i] = qUnpremultiply_sse4(src[i]);
+    convertARGBFromARGB32PM_sse4<false, false>(d, src, count);
 }
 
 void QT_FASTCALL storeRGBA8888FromARGB32PM_sse4(uchar *dest, const uint *src, int index, int count,
                                                 const QVector<QRgb> *, QDitherInfo *)
 {
     uint *d = reinterpret_cast<uint *>(dest) + index;
-    for (int i = 0; i < count; ++i)
-        d[i] = ARGB2RGBA(qUnpremultiply_sse4(src[i]));
+    convertARGBFromARGB32PM_sse4<true, false>(d, src, count);
 }
 
 void QT_FASTCALL storeRGBXFromARGB32PM_sse4(uchar *dest, const uint *src, int index, int count,
                                             const QVector<QRgb> *, QDitherInfo *)
 {
     uint *d = reinterpret_cast<uint *>(dest) + index;
-    for (int i = 0; i < count; ++i)
-        d[i] = ARGB2RGBA(0xff000000 | qUnpremultiply_sse4(src[i]));
+    convertARGBFromARGB32PM_sse4<true, true>(d, src, count);
 }
 
 template<QtPixelOrder PixelOrder>
@@ -158,6 +320,18 @@ void QT_FASTCALL storeA2RGB30PMFromARGB32PM_sse4(uchar *dest, const uint *src, i
         d[i] = qConvertArgb32ToA2rgb30_sse4<PixelOrder>(src[i]);
 }
 
+void QT_FASTCALL destStore64ARGB32_sse4(QRasterBuffer *rasterBuffer, int x, int y, const QRgba64 *buffer, int length)
+{
+    uint *dest = (uint*)rasterBuffer->scanLine(y) + x;
+    convertARGBFromRGBA64PM_sse4<false>(dest, buffer, length);
+}
+
+void QT_FASTCALL destStore64RGBA8888_sse4(QRasterBuffer *rasterBuffer, int x, int y, const QRgba64 *buffer, int length)
+{
+    uint *dest = (uint*)rasterBuffer->scanLine(y) + x;
+    convertARGBFromRGBA64PM_sse4<true>(dest, buffer, length);
+}
+
 template
 void QT_FASTCALL storeA2RGB30PMFromARGB32PM_sse4<PixelOrderBGR>(uchar *dest, const uint *src, int index, int count,
                                                                 const QVector<QRgb> *, QDitherInfo *);
diff --git a/src/gui/painting/qdrawingprimitive_sse2_p.h b/src/gui/painting/qdrawingprimitive_sse2_p.h
index 93e4b9f572..b237ea1611 100644
--- a/src/gui/painting/qdrawingprimitive_sse2_p.h
+++ b/src/gui/painting/qdrawingprimitive_sse2_p.h
@@ -43,6 +43,7 @@
 #include <QtGui/private/qtguiglobal_p.h>
 #include <private/qsimd_p.h>
 #include "qdrawhelper_p.h"
+#include "qrgba64_p.h"
 
 #ifdef __SSE2__
 
@@ -230,21 +231,31 @@ QT_END_NAMESPACE
 
 QT_BEGIN_NAMESPACE
 #if QT_COMPILER_SUPPORTS_HERE(SSE4_1)
+QT_FUNCTION_TARGET(SSE2)
+Q_ALWAYS_INLINE void reciprocal_mul_ss(__m128 &ia, const __m128 a, float mul)
+{
+    ia = _mm_rcp_ss(a); // Approximate 1/a
+    // Improve precision of ia using Newton-Raphson
+    ia = _mm_sub_ss(_mm_add_ss(ia, ia), _mm_mul_ss(ia, _mm_mul_ss(ia, a)));
+    ia = _mm_mul_ss(ia, _mm_set_ss(mul));
+    ia = _mm_shuffle_ps(ia, ia, _MM_SHUFFLE(0,0,0,0));
+}
+
 QT_FUNCTION_TARGET(SSE4_1)
 inline QRgb qUnpremultiply_sse4(QRgb p)
 {
     const uint alpha = qAlpha(p);
-    if (alpha == 255 || alpha == 0)
+    if (alpha == 255)
         return p;
-    const uint invAlpha = qt_inv_premul_factor[alpha];
-    const __m128i via = _mm_set1_epi32(invAlpha);
-    const __m128i vr = _mm_set1_epi32(0x8000);
+    if (alpha == 0)
+        return 0;
+    const __m128 va = _mm_set1_ps(alpha);
+    __m128 via;
+    reciprocal_mul_ss(via, va, 255.0f); // Approximate 1/a
     __m128i vl = _mm_cvtepu8_epi32(_mm_cvtsi32_si128(p));
-    vl = _mm_mullo_epi32(vl, via);
-    vl = _mm_add_epi32(vl, vr);
-    vl = _mm_srai_epi32(vl, 16);
-    vl = _mm_insert_epi32(vl, alpha, 3);
+    vl = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(vl), via));
     vl = _mm_packus_epi32(vl, vl);
+    vl = _mm_insert_epi16(vl, alpha, 3);
     vl = _mm_packus_epi16(vl, vl);
     return _mm_cvtsi128_si32(vl);
 }
@@ -258,21 +269,14 @@ inline uint qConvertArgb32ToA2rgb30_sse4(QRgb p)
         return qConvertRgb32ToRgb30<PixelOrder>(p);
     if (alpha == 0)
         return 0;
-    Q_CONSTEXPR uint mult = 255 / (255 >> 6);
-    const uint invAlpha = qt_inv_premul_factor[alpha];
+    Q_CONSTEXPR float mult = 1023.0f / (255 >> 6);
     const uint newalpha = (alpha >> 6);
-    const __m128i via = _mm_set1_epi32(invAlpha);
-    const __m128i vna = _mm_set1_epi32(mult * newalpha);
-    const __m128i vr1 = _mm_set1_epi32(0x1000);
-    const __m128i vr2 = _mm_set1_epi32(0x80);
-    __m128i vl = _mm_cvtepu8_epi32(_mm_cvtsi32_si128(p));
-    vl = _mm_mullo_epi32(vl, via);
-    vl = _mm_add_epi32(vl, vr1);
-    vl = _mm_srli_epi32(vl, 14);
-    vl = _mm_mullo_epi32(vl, vna);
-    vl = _mm_add_epi32(vl, _mm_srli_epi32(vl, 8));
-    vl = _mm_add_epi32(vl, vr2);
-    vl = _mm_srli_epi32(vl, 8);
+    const __m128 va = _mm_set1_ps(alpha);
+    __m128 via;
+    reciprocal_mul_ss(via, va, mult * newalpha);
+    __m128i vl = _mm_cvtsi32_si128(p);
+    vl = _mm_cvtepu8_epi32(vl);
+    vl = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(vl), via));
     vl = _mm_packus_epi32(vl, vl);
     uint rgb30 = (newalpha << 30);
     rgb30 |= ((uint)_mm_extract_epi16(vl, 1)) << 10;
@@ -285,6 +289,27 @@ inline uint qConvertArgb32ToA2rgb30_sse4(QRgb p)
     }
     return rgb30;
 }
+
+template<QtPixelOrder PixelOrder>
+QT_FUNCTION_TARGET(SSE4_1)
+inline uint qConvertRgba64ToRgb32_sse4(QRgba64 p)
+{
+    if (p.isTransparent())
+        return 0;
+    __m128i vl = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&p));
+    if (!p.isOpaque()) {
+        const __m128 va = _mm_set1_ps(p.alpha());
+        __m128 via;
+        reciprocal_mul_ss(via, va, 65535.0f);
+        vl = _mm_unpacklo_epi16(vl, _mm_setzero_si128());
+        vl = _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(vl) , via));
+        vl = _mm_packus_epi32(vl, vl);
+        vl = _mm_insert_epi16(vl, p.alpha(), 3);
+    }
+    if (PixelOrder == PixelOrderBGR)
+        vl = _mm_shufflelo_epi16(vl, _MM_SHUFFLE(3, 0, 1, 2));
+    return toArgb32(vl);
+}
 #endif
 
 QT_END_NAMESPACE
diff --git a/src/gui/painting/qrgba64.h b/src/gui/painting/qrgba64.h
index 2c8f8fa8c4..0e5344cacb 100644
--- a/src/gui/painting/qrgba64.h
+++ b/src/gui/painting/qrgba64.h
@@ -127,6 +127,10 @@ public:
 
     Q_DECL_RELAXED_CONSTEXPR QRgba64 premultiplied() const
     {
+        if (isOpaque())
+            return *this;
+        if (isTransparent())
+            return QRgba64::fromRgba64(0);
         const quint32 a = alpha();
         const quint16 r = div_65535(red() * a);
         const quint16 g = div_65535(green() * a);
diff --git a/src/gui/painting/qrgba64_p.h b/src/gui/painting/qrgba64_p.h
index 213ee58ef0..1ed0e82182 100644
--- a/src/gui/painting/qrgba64_p.h
+++ b/src/gui/painting/qrgba64_p.h
@@ -186,7 +186,8 @@ inline QRgba64 addWithSaturation(QRgba64 a, QRgba64 b)
                               qMin(a.alpha() + b.alpha(), 65535));
 }
 
-#if defined __SSE2__
+#if QT_COMPILER_SUPPORTS_HERE(SSE2)
+QT_FUNCTION_TARGET(SSE2)
 Q_ALWAYS_INLINE uint toArgb32(__m128i v)
 {
     v = _mm_unpacklo_epi16(v, _mm_setzero_si128());
-- 
cgit v1.2.3