From f370410097f8cb8d8fdf6174b799497fe7fe0adf Mon Sep 17 00:00:00 2001 From: Thiago Macieira Date: Tue, 8 Jan 2019 12:27:31 -0800 Subject: Add AVX2 version of ARGB->ARGB32PM Similar to the previous commit. This also removes the SSE4 implementations from Qt builds that use AVX2 throughout. Change-Id: I251f00d706d646ed87b4fffd1577f96ed52a4cf4 Reviewed-by: Allan Sandfeld Jensen --- src/gui/painting/qdrawhelper.cpp | 17 ++++- src/gui/painting/qdrawhelper_avx2.cpp | 138 ++++++++++++++++++++++++++++++++++ src/gui/painting/qdrawhelper_sse4.cpp | 4 + 3 files changed, 157 insertions(+), 2 deletions(-) diff --git a/src/gui/painting/qdrawhelper.cpp b/src/gui/painting/qdrawhelper.cpp index f1c0852d82..2dd18f6dfc 100644 --- a/src/gui/painting/qdrawhelper.cpp +++ b/src/gui/painting/qdrawhelper.cpp @@ -6514,18 +6514,20 @@ static void qInitDrawhelperFunctions() const QVector *, QDitherInfo *); extern void QT_FASTCALL destStore64ARGB32_sse4(QRasterBuffer *rasterBuffer, int x, int y, const QRgba64 *buffer, int length); extern void QT_FASTCALL destStore64RGBA8888_sse4(QRasterBuffer *rasterBuffer, int x, int y, const QRgba64 *buffer, int length); +# ifndef __AVX2__ qPixelLayouts[QImage::Format_ARGB32].fetchToARGB32PM = fetchARGB32ToARGB32PM_sse4; qPixelLayouts[QImage::Format_ARGB32].convertToARGB32PM = convertARGB32ToARGB32PM_sse4; qPixelLayouts[QImage::Format_RGBA8888].fetchToARGB32PM = fetchRGBA8888ToARGB32PM_sse4; qPixelLayouts[QImage::Format_RGBA8888].convertToARGB32PM = convertRGBA8888ToARGB32PM_sse4; qPixelLayouts[QImage::Format_ARGB32].fetchToRGBA64PM = fetchARGB32ToRGBA64PM_sse4; qPixelLayouts[QImage::Format_ARGB32].convertToRGBA64PM = convertARGB32ToRGBA64PM_sse4; - qPixelLayouts[QImage::Format_ARGB32].storeFromARGB32PM = storeARGB32FromARGB32PM_sse4; qPixelLayouts[QImage::Format_RGBA8888].fetchToRGBA64PM = fetchRGBA8888ToRGBA64PM_sse4; qPixelLayouts[QImage::Format_RGBA8888].convertToRGBA64PM = convertRGBA8888ToRGBA64PM_sse4; - qPixelLayouts[QImage::Format_RGBA8888].storeFromARGB32PM = storeRGBA8888FromARGB32PM_sse4; qPixelLayouts[QImage::Format_RGBX8888].fetchToRGBA64PM = fetchRGBA8888ToRGBA64PM_sse4; qPixelLayouts[QImage::Format_RGBX8888].convertToRGBA64PM = convertRGBA8888ToRGBA64PM_sse4; +# endif + qPixelLayouts[QImage::Format_ARGB32].storeFromARGB32PM = storeARGB32FromARGB32PM_sse4; + qPixelLayouts[QImage::Format_RGBA8888].storeFromARGB32PM = storeRGBA8888FromARGB32PM_sse4; qPixelLayouts[QImage::Format_RGBX8888].storeFromARGB32PM = storeRGBXFromARGB32PM_sse4; qPixelLayouts[QImage::Format_A2BGR30_Premultiplied].storeFromARGB32PM = storeA2RGB30PMFromARGB32PM_sse4; qPixelLayouts[QImage::Format_A2RGB30_Premultiplied].storeFromARGB32PM = storeA2RGB30PMFromARGB32PM_sse4; @@ -6580,6 +6582,17 @@ static void qInitDrawhelperFunctions() bilinearFastTransformHelperARGB32PM[0][DownscaleTransform] = fetchTransformedBilinearARGB32PM_downscale_helper_avx2; bilinearFastTransformHelperARGB32PM[0][FastRotateTransform] = fetchTransformedBilinearARGB32PM_fast_rotate_helper_avx2; + extern void QT_FASTCALL convertARGB32ToARGB32PM_avx2(uint *buffer, int count, const QVector *); + extern void QT_FASTCALL convertRGBA8888ToARGB32PM_avx2(uint *buffer, int count, const QVector *); + extern const uint *QT_FASTCALL fetchARGB32ToARGB32PM_avx2(uint *buffer, const uchar *src, int index, int count, + const QVector *, QDitherInfo *); + extern const uint *QT_FASTCALL fetchRGBA8888ToARGB32PM_avx2(uint *buffer, const uchar *src, int index, int count, + const QVector *, QDitherInfo *); + qPixelLayouts[QImage::Format_ARGB32].fetchToARGB32PM = fetchARGB32ToARGB32PM_avx2; + qPixelLayouts[QImage::Format_ARGB32].convertToARGB32PM = convertARGB32ToARGB32PM_avx2; + qPixelLayouts[QImage::Format_RGBA8888].fetchToARGB32PM = fetchRGBA8888ToARGB32PM_avx2; + qPixelLayouts[QImage::Format_RGBA8888].convertToARGB32PM = convertRGBA8888ToARGB32PM_avx2; + extern const QRgba64 * QT_FASTCALL convertARGB32ToRGBA64PM_avx2(QRgba64 *, const uint *, int, const QVector *, QDitherInfo *); extern const QRgba64 * QT_FASTCALL convertRGBA8888ToRGBA64PM_avx2(QRgba64 *, const uint *, int count, const QVector *, QDitherInfo *); extern const QRgba64 *QT_FASTCALL fetchARGB32ToRGBA64PM_avx2(QRgba64 *, const uchar *, int, int, const QVector *, QDitherInfo *); diff --git a/src/gui/painting/qdrawhelper_avx2.cpp b/src/gui/painting/qdrawhelper_avx2.cpp index 7829e476e5..d8732dc29f 100644 --- a/src/gui/painting/qdrawhelper_avx2.cpp +++ b/src/gui/painting/qdrawhelper_avx2.cpp @@ -995,6 +995,144 @@ void QT_FASTCALL fetchTransformedBilinearARGB32PM_fast_rotate_helper_avx2(uint * } } +template +static void convertARGBToARGB32PM_avx2(uint *buffer, const uint *src, qsizetype count) +{ + qsizetype i = 0; + const __m256i alphaMask = _mm256_set1_epi32(0xff000000); + const __m256i rgbaMask = _mm256_broadcastsi128_si256(_mm_setr_epi8(2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15)); + const __m256i shuffleMask = _mm256_broadcastsi128_si256(_mm_setr_epi8(6, 7, 6, 7, 6, 7, 6, 7, 14, 15, 14, 15, 14, 15, 14, 15)); + const __m256i half = _mm256_set1_epi16(0x0080); + const __m256i zero = _mm256_setzero_si256(); + + for (; i < count - 7; i += 8) { + __m256i srcVector = _mm256_loadu_si256(reinterpret_cast(src + i)); + if (!_mm256_testz_si256(srcVector, alphaMask)) { + if (!_mm256_testc_si256(srcVector, alphaMask)) { + if (RGBA) + srcVector = _mm256_shuffle_epi8(srcVector, rgbaMask); + __m256i src1 = _mm256_unpacklo_epi8(srcVector, zero); + __m256i src2 = _mm256_unpackhi_epi8(srcVector, zero); + __m256i alpha1 = _mm256_shuffle_epi8(src1, shuffleMask); + __m256i alpha2 = _mm256_shuffle_epi8(src2, shuffleMask); + src1 = _mm256_mullo_epi16(src1, alpha1); + src2 = _mm256_mullo_epi16(src2, alpha2); + src1 = _mm256_add_epi16(src1, _mm256_srli_epi16(src1, 8)); + src2 = _mm256_add_epi16(src2, _mm256_srli_epi16(src2, 8)); + src1 = _mm256_add_epi16(src1, half); + src2 = _mm256_add_epi16(src2, half); + src1 = _mm256_srli_epi16(src1, 8); + src2 = _mm256_srli_epi16(src2, 8); + src1 = _mm256_blend_epi16(src1, alpha1, 0x88); + src2 = _mm256_blend_epi16(src2, alpha2, 0x88); + srcVector = _mm256_packus_epi16(src1, src2); + _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + i), srcVector); + } else { + if (RGBA) + srcVector = _mm256_shuffle_epi8(srcVector, rgbaMask); + if (buffer != src || RGBA) + _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + i), srcVector); + } + } else { + _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + i), zero); + } + } + + if (i < count - 3) { + __m128i srcVector = _mm_loadu_si128(reinterpret_cast(src + i)); + if (!_mm_testz_si128(srcVector, _mm256_castsi256_si128(alphaMask))) { + if (!_mm_testc_si128(srcVector, _mm256_castsi256_si128(alphaMask))) { + if (RGBA) + srcVector = _mm_shuffle_epi8(srcVector, _mm256_castsi256_si128(rgbaMask)); + __m128i src1 = _mm_unpacklo_epi8(srcVector, _mm256_castsi256_si128(zero)); + __m128i src2 = _mm_unpackhi_epi8(srcVector, _mm256_castsi256_si128(zero)); + __m128i alpha1 = _mm_shuffle_epi8(src1, _mm256_castsi256_si128(shuffleMask)); + __m128i alpha2 = _mm_shuffle_epi8(src2, _mm256_castsi256_si128(shuffleMask)); + src1 = _mm_mullo_epi16(src1, alpha1); + src2 = _mm_mullo_epi16(src2, alpha2); + src1 = _mm_add_epi16(src1, _mm_srli_epi16(src1, 8)); + src2 = _mm_add_epi16(src2, _mm_srli_epi16(src2, 8)); + src1 = _mm_add_epi16(src1, _mm256_castsi256_si128(half)); + src2 = _mm_add_epi16(src2, _mm256_castsi256_si128(half)); + src1 = _mm_srli_epi16(src1, 8); + src2 = _mm_srli_epi16(src2, 8); + src1 = _mm_blend_epi16(src1, alpha1, 0x88); + src2 = _mm_blend_epi16(src2, alpha2, 0x88); + _mm_storeu_si128(reinterpret_cast<__m128i *>(buffer + i), srcVector); + } else { + if (RGBA) + srcVector = _mm_shuffle_epi8(srcVector, _mm256_castsi256_si128(rgbaMask)); + if (buffer != src || RGBA) + _mm_storeu_si128(reinterpret_cast<__m128i *>(buffer + i), srcVector); + } + } else { + _mm_storeu_si128(reinterpret_cast<__m128i *>(buffer + i), _mm256_castsi256_si128(zero)); + } + i += 4; + } + + auto convert_half = [=](__m128i &srcVector) { + if (!_mm_testz_si128(srcVector, _mm256_castsi256_si128(alphaMask))) { + if (!_mm_testc_si128(srcVector, _mm256_castsi256_si128(alphaMask))) { + if (RGBA) + srcVector = _mm_shuffle_epi8(srcVector, _mm256_castsi256_si128(rgbaMask)); + __m128i src1 = _mm_unpacklo_epi8(srcVector, _mm256_castsi256_si128(zero)); + __m128i alpha1 = _mm_shuffle_epi8(src1, _mm256_castsi256_si128(shuffleMask)); + src1 = _mm_mullo_epi16(src1, alpha1); + src1 = _mm_add_epi16(src1, _mm_srli_epi16(src1, 8)); + src1 = _mm_add_epi16(src1, _mm256_castsi256_si128(half)); + src1 = _mm_srli_epi16(src1, 8); + src1 = _mm_blend_epi16(src1, alpha1, 0x88); + srcVector = _mm_packus_epi16(src1, src1); + return true; + } else { + if (RGBA) + srcVector = _mm_shuffle_epi8(srcVector, _mm256_castsi256_si128(rgbaMask)); + return buffer != src || RGBA; + } + } else { + srcVector = _mm256_castsi256_si128(zero); + return true; + } + }; + if (i < count - 1) { + __m128i srcVector = _mm_loadl_epi64(reinterpret_cast(src + i)); + if (convert_half(srcVector)) + _mm_storel_epi64(reinterpret_cast<__m128i *>(buffer + i), srcVector); + i += 2; + } + + if (i != count) { + __m128i srcVector = _mm_cvtsi32_si128(src[i]); + if (convert_half(srcVector)) + buffer[i] = _mm_cvtsi128_si32(srcVector); + } +} + +void QT_FASTCALL convertARGB32ToARGB32PM_avx2(uint *buffer, int count, const QVector *) +{ + convertARGBToARGB32PM_avx2(buffer, buffer, count); +} + +void QT_FASTCALL convertRGBA8888ToARGB32PM_avx2(uint *buffer, int count, const QVector *) +{ + convertARGBToARGB32PM_avx2(buffer, buffer, count); +} + +const uint *QT_FASTCALL fetchARGB32ToARGB32PM_avx2(uint *buffer, const uchar *src, int index, int count, + const QVector *, QDitherInfo *) +{ + convertARGBToARGB32PM_avx2(buffer, reinterpret_cast(src) + index, count); + return buffer; +} + +const uint *QT_FASTCALL fetchRGBA8888ToARGB32PM_avx2(uint *buffer, const uchar *src, int index, int count, + const QVector *, QDitherInfo *) +{ + convertARGBToARGB32PM_avx2(buffer, reinterpret_cast(src) + index, count); + return buffer; +} + template static void convertARGBToRGBA64PM_avx2(QRgba64 *buffer, const uint *src, qsizetype count) { diff --git a/src/gui/painting/qdrawhelper_sse4.cpp b/src/gui/painting/qdrawhelper_sse4.cpp index 9e3e6682bb..e4b93e8511 100644 --- a/src/gui/painting/qdrawhelper_sse4.cpp +++ b/src/gui/painting/qdrawhelper_sse4.cpp @@ -45,6 +45,7 @@ QT_BEGIN_NAMESPACE +#ifndef __AVX2__ template static void convertARGBToARGB32PM_sse4(uint *buffer, const uint *src, int count) { @@ -142,6 +143,7 @@ static void convertARGBToRGBA64PM_sse4(QRgba64 *buffer, const uint *src, int cou buffer[i] = QRgba64::fromArgb32(s).premultiplied(); } } +#endif // __AVX2__ static inline __m128 Q_DECL_VECTORCALL reciprocal_mul_ps(__m128 a, float mul) { @@ -308,6 +310,7 @@ static inline void convertARGBFromRGBA64PM_sse4(uint *buffer, const QRgba64 *src } } +#ifndef __AVX2__ void QT_FASTCALL convertARGB32ToARGB32PM_sse4(uint *buffer, int count, const QVector *) { convertARGBToARGB32PM_sse4(buffer, buffer, count); @@ -359,6 +362,7 @@ const QRgba64 *QT_FASTCALL fetchRGBA8888ToRGBA64PM_sse4(QRgba64 *buffer, const u convertARGBToRGBA64PM_sse4(buffer, reinterpret_cast(src) + index, count); return buffer; } +#endif // __AVX2__ void QT_FASTCALL storeRGB32FromARGB32PM_sse4(uchar *dest, const uint *src, int index, int count, const QVector *, QDitherInfo *) -- cgit v1.2.3