diff options
Diffstat (limited to 'src/gui/painting/qdrawhelper_avx2.cpp')
-rw-r--r-- | src/gui/painting/qdrawhelper_avx2.cpp | 308 |
1 files changed, 301 insertions, 7 deletions
diff --git a/src/gui/painting/qdrawhelper_avx2.cpp b/src/gui/painting/qdrawhelper_avx2.cpp index ec6643deed..2b3cc9b226 100644 --- a/src/gui/painting/qdrawhelper_avx2.cpp +++ b/src/gui/painting/qdrawhelper_avx2.cpp @@ -1,6 +1,7 @@ /**************************************************************************** ** -** Copyright (C) 2016 The Qt Company Ltd. +** Copyright (C) 2018 The Qt Company Ltd. +** Copyright (C) 2018 Intel Corporation. ** Contact: https://www.qt.io/licensing/ ** ** This file is part of the QtGui module of the Qt Toolkit. @@ -38,6 +39,7 @@ ****************************************************************************/ #include "qdrawhelper_p.h" +#include "qdrawhelper_x86_p.h" #include "qdrawingprimitive_sse2_p.h" #include "qrgba64_p.h" @@ -53,7 +55,8 @@ enum { // Vectorized blend functions: // See BYTE_MUL_SSE2 for details. -inline static void BYTE_MUL_AVX2(__m256i &pixelVector, const __m256i &alphaChannel, const __m256i &colorMask, const __m256i &half) +inline static void Q_DECL_VECTORCALL +BYTE_MUL_AVX2(__m256i &pixelVector, __m256i alphaChannel, __m256i colorMask, __m256i half) { __m256i pixelVectorAG = _mm256_srli_epi16(pixelVector, 8); __m256i pixelVectorRB = _mm256_and_si256(pixelVector, colorMask); @@ -72,7 +75,8 @@ inline static void BYTE_MUL_AVX2(__m256i &pixelVector, const __m256i &alphaChann pixelVector = _mm256_or_si256(pixelVectorAG, pixelVectorRB); } -inline static void BYTE_MUL_RGB64_AVX2(__m256i &pixelVector, const __m256i &alphaChannel, const __m256i &colorMask, const __m256i &half) +inline static void Q_DECL_VECTORCALL +BYTE_MUL_RGB64_AVX2(__m256i &pixelVector, __m256i alphaChannel, __m256i colorMask, __m256i half) { __m256i pixelVectorAG = _mm256_srli_epi32(pixelVector, 16); __m256i pixelVectorRB = _mm256_and_si256(pixelVector, colorMask); @@ -92,7 +96,8 @@ inline static void BYTE_MUL_RGB64_AVX2(__m256i &pixelVector, const __m256i &alph } // See INTERPOLATE_PIXEL_255_SSE2 for details. -inline static void INTERPOLATE_PIXEL_255_AVX2(const __m256i &srcVector, __m256i &dstVector, const __m256i &alphaChannel, const __m256i &oneMinusAlphaChannel, const __m256i &colorMask, const __m256i &half) +inline static void Q_DECL_VECTORCALL +INTERPOLATE_PIXEL_255_AVX2(__m256i srcVector, __m256i &dstVector, __m256i alphaChannel, __m256i oneMinusAlphaChannel, __m256i colorMask, __m256i half) { const __m256i srcVectorAG = _mm256_srli_epi16(srcVector, 8); const __m256i dstVectorAG = _mm256_srli_epi16(dstVector, 8); @@ -114,7 +119,8 @@ inline static void INTERPOLATE_PIXEL_255_AVX2(const __m256i &srcVector, __m256i dstVector = _mm256_or_si256(finalAG, finalRB); } -inline static void INTERPOLATE_PIXEL_RGB64_AVX2(const __m256i &srcVector, __m256i &dstVector, const __m256i &alphaChannel, const __m256i &oneMinusAlphaChannel, const __m256i &colorMask, const __m256i &half) +inline static void Q_DECL_VECTORCALL +INTERPOLATE_PIXEL_RGB64_AVX2(__m256i srcVector, __m256i &dstVector, __m256i alphaChannel, __m256i oneMinusAlphaChannel, __m256i colorMask, __m256i half) { const __m256i srcVectorAG = _mm256_srli_epi32(srcVector, 16); const __m256i dstVectorAG = _mm256_srli_epi32(dstVector, 16); @@ -138,7 +144,7 @@ inline static void INTERPOLATE_PIXEL_RGB64_AVX2(const __m256i &srcVector, __m256 // See BLEND_SOURCE_OVER_ARGB32_SSE2 for details. -inline static void BLEND_SOURCE_OVER_ARGB32_AVX2(quint32 *dst, const quint32 *src, const int length) +inline static void Q_DECL_VECTORCALL BLEND_SOURCE_OVER_ARGB32_AVX2(quint32 *dst, const quint32 *src, const int length) { const __m256i half = _mm256_set1_epi16(0x80); const __m256i one = _mm256_set1_epi16(0xff); @@ -209,7 +215,8 @@ inline static void BLEND_SOURCE_OVER_ARGB32_AVX2(quint32 *dst, const quint32 *sr // See BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_SSE2 for details. -inline static void BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_AVX2(quint32 *dst, const quint32 *src, const int length, const int const_alpha) +inline static void Q_DECL_VECTORCALL +BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_AVX2(quint32 *dst, const quint32 *src, const int length, const int const_alpha) { int x = 0; @@ -316,6 +323,66 @@ void qt_blend_rgb32_on_rgb32_avx2(uchar *destPixels, int dbpl, } } +static Q_NEVER_INLINE +void Q_DECL_VECTORCALL qt_memfillXX_avx2(uchar *dest, __m256i value256, qsizetype bytes) +{ + __m128i value128 = _mm256_castsi256_si128(value256); + + // main body + __m256i *dst256 = reinterpret_cast<__m256i *>(dest); + uchar *end = dest + bytes; + while (reinterpret_cast<uchar *>(dst256 + 4) <= end) { + _mm256_storeu_si256(dst256 + 0, value256); + _mm256_storeu_si256(dst256 + 1, value256); + _mm256_storeu_si256(dst256 + 2, value256); + _mm256_storeu_si256(dst256 + 3, value256); + dst256 += 4; + } + + // first epilogue: fewer than 128 bytes / 32 entries + bytes = end - reinterpret_cast<uchar *>(dst256); + switch (bytes / sizeof(value256)) { + case 3: _mm256_storeu_si256(dst256++, value256); Q_FALLTHROUGH(); + case 2: _mm256_storeu_si256(dst256++, value256); Q_FALLTHROUGH(); + case 1: _mm256_storeu_si256(dst256++, value256); + } + + // second epilogue: fewer than 32 bytes + __m128i *dst128 = reinterpret_cast<__m128i *>(dst256); + if (bytes & sizeof(value128)) + _mm_storeu_si128(dst128++, value128); + + // third epilogue: fewer than 16 bytes + if (bytes & 8) + _mm_storel_epi64(reinterpret_cast<__m128i *>(end - 8), value128); +} + +void qt_memfill64_avx2(quint64 *dest, quint64 value, qsizetype count) +{ +#if defined(Q_CC_GNU) && !defined(Q_CC_CLANG) && !defined(Q_CC_INTEL) + // work around https://gcc.gnu.org/bugzilla/show_bug.cgi?id=80820 + __m128i value64 = _mm_set_epi64x(0, value); // _mm_cvtsi64_si128(value); +# ifdef Q_PROCESSOR_X86_64 + asm ("" : "+x" (value64)); +# endif + __m256i value256 = _mm256_broadcastq_epi64(value64); +#else + __m256i value256 = _mm256_set1_epi64x(value); +#endif + + qt_memfillXX_avx2(reinterpret_cast<uchar *>(dest), value256, count * sizeof(quint64)); +} + +void qt_memfill32_avx2(quint32 *dest, quint32 value, qsizetype count) +{ + if (count % 2) { + // odd number of pixels, round to even + *dest++ = value; + --count; + } + qt_memfillXX_avx2(reinterpret_cast<uchar *>(dest), _mm256_set1_epi32(value), count * sizeof(quint32)); +} + void QT_FASTCALL comp_func_SourceOver_avx2(uint *destPixels, const uint *srcPixels, int length, uint const_alpha) { Q_ASSERT(const_alpha < 256); @@ -928,6 +995,233 @@ void QT_FASTCALL fetchTransformedBilinearARGB32PM_fast_rotate_helper_avx2(uint * } } +static inline __m256i epilogueMaskFromCount(qsizetype count) +{ + Q_ASSERT(count > 0); + static const __m256i offsetMask = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); + return _mm256_add_epi32(offsetMask, _mm256_set1_epi32(-count)); +} + +template<bool RGBA> +static void convertARGBToARGB32PM_avx2(uint *buffer, const uint *src, qsizetype count) +{ + qsizetype i = 0; + const __m256i alphaMask = _mm256_set1_epi32(0xff000000); + const __m256i rgbaMask = _mm256_broadcastsi128_si256(_mm_setr_epi8(2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15)); + const __m256i shuffleMask = _mm256_broadcastsi128_si256(_mm_setr_epi8(6, 7, 6, 7, 6, 7, 6, 7, 14, 15, 14, 15, 14, 15, 14, 15)); + const __m256i half = _mm256_set1_epi16(0x0080); + const __m256i zero = _mm256_setzero_si256(); + + for (; i < count - 7; i += 8) { + __m256i srcVector = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + i)); + if (!_mm256_testz_si256(srcVector, alphaMask)) { + // keep the two _mm_test[zc]_siXXX next to each other + bool cf = _mm256_testc_si256(srcVector, alphaMask); + if (RGBA) + srcVector = _mm256_shuffle_epi8(srcVector, rgbaMask); + if (!cf) { + __m256i src1 = _mm256_unpacklo_epi8(srcVector, zero); + __m256i src2 = _mm256_unpackhi_epi8(srcVector, zero); + __m256i alpha1 = _mm256_shuffle_epi8(src1, shuffleMask); + __m256i alpha2 = _mm256_shuffle_epi8(src2, shuffleMask); + src1 = _mm256_mullo_epi16(src1, alpha1); + src2 = _mm256_mullo_epi16(src2, alpha2); + src1 = _mm256_add_epi16(src1, _mm256_srli_epi16(src1, 8)); + src2 = _mm256_add_epi16(src2, _mm256_srli_epi16(src2, 8)); + src1 = _mm256_add_epi16(src1, half); + src2 = _mm256_add_epi16(src2, half); + src1 = _mm256_srli_epi16(src1, 8); + src2 = _mm256_srli_epi16(src2, 8); + src1 = _mm256_blend_epi16(src1, alpha1, 0x88); + src2 = _mm256_blend_epi16(src2, alpha2, 0x88); + srcVector = _mm256_packus_epi16(src1, src2); + _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + i), srcVector); + } else { + if (buffer != src || RGBA) + _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + i), srcVector); + } + } else { + _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + i), zero); + } + } + + if (i < count) { + const __m256i epilogueMask = epilogueMaskFromCount(count - i); + __m256i srcVector = _mm256_maskload_epi32(reinterpret_cast<const int *>(src + i), epilogueMask); + const __m256i epilogueAlphaMask = _mm256_blendv_epi8(_mm256_setzero_si256(), alphaMask, epilogueMask); + + if (!_mm256_testz_si256(srcVector, epilogueAlphaMask)) { + // keep the two _mm_test[zc]_siXXX next to each other + bool cf = _mm256_testc_si256(srcVector, epilogueAlphaMask); + if (RGBA) + srcVector = _mm256_shuffle_epi8(srcVector, rgbaMask); + if (!cf) { + __m256i src1 = _mm256_unpacklo_epi8(srcVector, zero); + __m256i src2 = _mm256_unpackhi_epi8(srcVector, zero); + __m256i alpha1 = _mm256_shuffle_epi8(src1, shuffleMask); + __m256i alpha2 = _mm256_shuffle_epi8(src2, shuffleMask); + src1 = _mm256_mullo_epi16(src1, alpha1); + src2 = _mm256_mullo_epi16(src2, alpha2); + src1 = _mm256_add_epi16(src1, _mm256_srli_epi16(src1, 8)); + src2 = _mm256_add_epi16(src2, _mm256_srli_epi16(src2, 8)); + src1 = _mm256_add_epi16(src1, half); + src2 = _mm256_add_epi16(src2, half); + src1 = _mm256_srli_epi16(src1, 8); + src2 = _mm256_srli_epi16(src2, 8); + src1 = _mm256_blend_epi16(src1, alpha1, 0x88); + src2 = _mm256_blend_epi16(src2, alpha2, 0x88); + srcVector = _mm256_packus_epi16(src1, src2); + _mm256_maskstore_epi32(reinterpret_cast<int *>(buffer + i), epilogueMask, srcVector); + } else { + if (buffer != src || RGBA) + _mm256_maskstore_epi32(reinterpret_cast<int *>(buffer + i), epilogueMask, srcVector); + } + } else { + _mm256_maskstore_epi32(reinterpret_cast<int *>(buffer + i), epilogueMask, zero); + } + } +} + +void QT_FASTCALL convertARGB32ToARGB32PM_avx2(uint *buffer, int count, const QVector<QRgb> *) +{ + convertARGBToARGB32PM_avx2<false>(buffer, buffer, count); +} + +void QT_FASTCALL convertRGBA8888ToARGB32PM_avx2(uint *buffer, int count, const QVector<QRgb> *) +{ + convertARGBToARGB32PM_avx2<true>(buffer, buffer, count); +} + +const uint *QT_FASTCALL fetchARGB32ToARGB32PM_avx2(uint *buffer, const uchar *src, int index, int count, + const QVector<QRgb> *, QDitherInfo *) +{ + convertARGBToARGB32PM_avx2<false>(buffer, reinterpret_cast<const uint *>(src) + index, count); + return buffer; +} + +const uint *QT_FASTCALL fetchRGBA8888ToARGB32PM_avx2(uint *buffer, const uchar *src, int index, int count, + const QVector<QRgb> *, QDitherInfo *) +{ + convertARGBToARGB32PM_avx2<true>(buffer, reinterpret_cast<const uint *>(src) + index, count); + return buffer; +} + +template<bool RGBA> +static void convertARGBToRGBA64PM_avx2(QRgba64 *buffer, const uint *src, qsizetype count) +{ + qsizetype i = 0; + const __m256i alphaMask = _mm256_set1_epi32(0xff000000); + const __m256i rgbaMask = _mm256_broadcastsi128_si256(_mm_setr_epi8(2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15)); + const __m256i shuffleMask = _mm256_broadcastsi128_si256(_mm_setr_epi8(6, 7, 6, 7, 6, 7, 6, 7, 14, 15, 14, 15, 14, 15, 14, 15)); + const __m256i zero = _mm256_setzero_si256(); + + for (; i < count - 7; i += 8) { + __m256i dst1, dst2; + __m256i srcVector = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + i)); + if (!_mm256_testz_si256(srcVector, alphaMask)) { + // keep the two _mm_test[zc]_siXXX next to each other + bool cf = _mm256_testc_si256(srcVector, alphaMask); + if (!RGBA) + srcVector = _mm256_shuffle_epi8(srcVector, rgbaMask); + + // The two unpack instructions unpack the low and upper halves of + // each 128-bit half of the 256-bit register. Here's the tracking + // of what's where: (p is 32-bit, P is 64-bit) + // as loaded: [ p1, p2, p3, p4; p5, p6, p7, p8 ] + // after permute4x64 [ p1, p2, p5, p6; p3, p4, p7, p8 ] + // after unpacklo/hi [ P1, P2; P3, P4 ] [ P5, P6; P7, P8 ] + srcVector = _mm256_permute4x64_epi64(srcVector, _MM_SHUFFLE(3, 1, 2, 0)); + + const __m256i src1 = _mm256_unpacklo_epi8(srcVector, srcVector); + const __m256i src2 = _mm256_unpackhi_epi8(srcVector, srcVector); + if (!cf) { + const __m256i alpha1 = _mm256_shuffle_epi8(src1, shuffleMask); + const __m256i alpha2 = _mm256_shuffle_epi8(src2, shuffleMask); + dst1 = _mm256_mulhi_epu16(src1, alpha1); + dst2 = _mm256_mulhi_epu16(src2, alpha2); + dst1 = _mm256_add_epi16(dst1, _mm256_srli_epi16(dst1, 15)); + dst2 = _mm256_add_epi16(dst2, _mm256_srli_epi16(dst2, 15)); + dst1 = _mm256_blend_epi16(dst1, src1, 0x88); + dst2 = _mm256_blend_epi16(dst2, src2, 0x88); + } else { + dst1 = src1; + dst2 = src2; + } + } else { + dst1 = dst2 = zero; + } + _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + i), dst1); + _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + i) + 1, dst2); + } + + if (i < count) { + __m256i epilogueMask = epilogueMaskFromCount(count - i); + const __m256i epilogueAlphaMask = _mm256_blendv_epi8(_mm256_setzero_si256(), alphaMask, epilogueMask); + __m256i dst1, dst2; + __m256i srcVector = _mm256_maskload_epi32(reinterpret_cast<const int *>(src + i), epilogueMask); + + if (!_mm256_testz_si256(srcVector, epilogueAlphaMask)) { + // keep the two _mm_test[zc]_siXXX next to each other + bool cf = _mm256_testc_si256(srcVector, epilogueAlphaMask); + if (!RGBA) + srcVector = _mm256_shuffle_epi8(srcVector, rgbaMask); + srcVector = _mm256_permute4x64_epi64(srcVector, _MM_SHUFFLE(3, 1, 2, 0)); + const __m256i src1 = _mm256_unpacklo_epi8(srcVector, srcVector); + const __m256i src2 = _mm256_unpackhi_epi8(srcVector, srcVector); + if (!cf) { + const __m256i alpha1 = _mm256_shuffle_epi8(src1, shuffleMask); + const __m256i alpha2 = _mm256_shuffle_epi8(src2, shuffleMask); + dst1 = _mm256_mulhi_epu16(src1, alpha1); + dst2 = _mm256_mulhi_epu16(src2, alpha2); + dst1 = _mm256_add_epi16(dst1, _mm256_srli_epi16(dst1, 15)); + dst2 = _mm256_add_epi16(dst2, _mm256_srli_epi16(dst2, 15)); + dst1 = _mm256_blend_epi16(dst1, src1, 0x88); + dst2 = _mm256_blend_epi16(dst2, src2, 0x88); + } else { + dst1 = src1; + dst2 = src2; + } + } else { + dst1 = dst2 = zero; + } + epilogueMask = _mm256_permute4x64_epi64(epilogueMask, _MM_SHUFFLE(3, 1, 2, 0)); + _mm256_maskstore_epi64(reinterpret_cast<qint64 *>(buffer + i), + _mm256_unpacklo_epi32(epilogueMask, epilogueMask), + dst1); + _mm256_maskstore_epi64(reinterpret_cast<qint64 *>(buffer + i + 4), + _mm256_unpackhi_epi32(epilogueMask, epilogueMask), + dst2); + } +} + +const QRgba64 * QT_FASTCALL convertARGB32ToRGBA64PM_avx2(QRgba64 *buffer, const uint *src, int count, + const QVector<QRgb> *, QDitherInfo *) +{ + convertARGBToRGBA64PM_avx2<false>(buffer, src, count); + return buffer; +} + +const QRgba64 * QT_FASTCALL convertRGBA8888ToRGBA64PM_avx2(QRgba64 *buffer, const uint *src, int count, + const QVector<QRgb> *, QDitherInfo *) +{ + convertARGBToRGBA64PM_avx2<true>(buffer, src, count); + return buffer; +} + +const QRgba64 *QT_FASTCALL fetchARGB32ToRGBA64PM_avx2(QRgba64 *buffer, const uchar *src, int index, int count, + const QVector<QRgb> *, QDitherInfo *) +{ + convertARGBToRGBA64PM_avx2<false>(buffer, reinterpret_cast<const uint *>(src) + index, count); + return buffer; +} + +const QRgba64 *QT_FASTCALL fetchRGBA8888ToRGBA64PM_avx2(QRgba64 *buffer, const uchar *src, int index, int count, + const QVector<QRgb> *, QDitherInfo *) +{ + convertARGBToRGBA64PM_avx2<true>(buffer, reinterpret_cast<const uint *>(src) + index, count); + return buffer; +} + QT_END_NAMESPACE #endif |