diff options
author | Allan Sandfeld Jensen <allan.jensen@theqtcompany.com> | 2016-09-06 11:12:30 +0200 |
---|---|---|
committer | Allan Sandfeld Jensen <allan.jensen@qt.io> | 2016-09-18 19:36:30 +0000 |
commit | 8b2f91e32860243f8d635a3e94e53cc6afe6def3 (patch) | |
tree | a4109504885a8c24e5dac960c9af55a8a69aba91 /src/gui | |
parent | 2d2d90781abdcdbd2a98201877563c83d926dd1a (diff) |
Add AVX2 versions of the fast blending functions
This patch adds AVX2 versions of the fast blending functions that we
already have SSE2 versions of.
Change-Id: Ifd1a22f7891b6208cb74929ad26095d12c5a1efb
Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
Diffstat (limited to 'src/gui')
-rw-r--r-- | src/gui/painting/qdrawhelper.cpp | 42 | ||||
-rw-r--r-- | src/gui/painting/qdrawhelper_avx2.cpp | 305 |
2 files changed, 337 insertions, 10 deletions
diff --git a/src/gui/painting/qdrawhelper.cpp b/src/gui/painting/qdrawhelper.cpp index c697aceaf3..ea0c5bbb88 100644 --- a/src/gui/painting/qdrawhelper.cpp +++ b/src/gui/painting/qdrawhelper.cpp @@ -6548,6 +6548,15 @@ static void qInitDrawhelperFunctions() qt_fetch_radial_gradient = qt_fetch_radial_gradient_sse2; + extern void QT_FASTCALL comp_func_SourceOver_sse2(uint *destPixels, const uint *srcPixels, int length, uint const_alpha); + extern void QT_FASTCALL comp_func_solid_SourceOver_sse2(uint *destPixels, int length, uint color, uint const_alpha); + extern void QT_FASTCALL comp_func_Source_sse2(uint *destPixels, const uint *srcPixels, int length, uint const_alpha); + extern void QT_FASTCALL comp_func_Plus_sse2(uint *destPixels, const uint *srcPixels, int length, uint const_alpha); + qt_functionForMode_C[QPainter::CompositionMode_SourceOver] = comp_func_SourceOver_sse2; + qt_functionForModeSolid_C[QPainter::CompositionMode_SourceOver] = comp_func_solid_SourceOver_sse2; + qt_functionForMode_C[QPainter::CompositionMode_Source] = comp_func_Source_sse2; + qt_functionForMode_C[QPainter::CompositionMode_Plus] = comp_func_Plus_sse2; + #ifdef QT_COMPILER_SUPPORTS_SSSE3 if (qCpuHasFeature(SSSE3)) { extern void qt_blend_argb32_on_argb32_ssse3(uchar *destPixels, int dbpl, @@ -6592,24 +6601,39 @@ static void qInitDrawhelperFunctions() } #endif -#if defined(QT_COMPILER_SUPPORTS_AVX2) && !defined(__AVX2__) +#if defined(QT_COMPILER_SUPPORTS_AVX2) if (qCpuHasFeature(AVX2)) { +#if !defined(__AVX2__) extern const uint *QT_FASTCALL convertARGB32ToARGB32PM_avx2(uint *buffer, const uint *src, int count, const QVector<QRgb> *, QDitherInfo *); extern const uint *QT_FASTCALL convertRGBA8888ToARGB32PM_avx2(uint *buffer, const uint *src, int count, const QVector<QRgb> *, QDitherInfo *); qPixelLayouts[QImage::Format_ARGB32].convertToARGB32PM = convertARGB32ToARGB32PM_avx2; qPixelLayouts[QImage::Format_RGBA8888].convertToARGB32PM = convertRGBA8888ToARGB32PM_avx2; +#endif + extern void qt_blend_rgb32_on_rgb32_avx2(uchar *destPixels, int dbpl, + const uchar *srcPixels, int sbpl, + int w, int h, int const_alpha); + extern void qt_blend_argb32_on_argb32_avx2(uchar *destPixels, int dbpl, + const uchar *srcPixels, int sbpl, + int w, int h, int const_alpha); + qBlendFunctions[QImage::Format_RGB32][QImage::Format_RGB32] = qt_blend_rgb32_on_rgb32_avx2; + qBlendFunctions[QImage::Format_ARGB32_Premultiplied][QImage::Format_RGB32] = qt_blend_rgb32_on_rgb32_avx2; + qBlendFunctions[QImage::Format_RGB32][QImage::Format_ARGB32_Premultiplied] = qt_blend_argb32_on_argb32_avx2; + qBlendFunctions[QImage::Format_ARGB32_Premultiplied][QImage::Format_ARGB32_Premultiplied] = qt_blend_argb32_on_argb32_avx2; + qBlendFunctions[QImage::Format_RGBX8888][QImage::Format_RGBX8888] = qt_blend_rgb32_on_rgb32_avx2; + qBlendFunctions[QImage::Format_RGBA8888_Premultiplied][QImage::Format_RGBX8888] = qt_blend_rgb32_on_rgb32_avx2; + qBlendFunctions[QImage::Format_RGBX8888][QImage::Format_RGBA8888_Premultiplied] = qt_blend_argb32_on_argb32_avx2; + qBlendFunctions[QImage::Format_RGBA8888_Premultiplied][QImage::Format_RGBA8888_Premultiplied] = qt_blend_argb32_on_argb32_avx2; + + extern void QT_FASTCALL comp_func_SourceOver_avx2(uint *destPixels, const uint *srcPixels, int length, uint const_alpha); + extern void QT_FASTCALL comp_func_solid_SourceOver_avx2(uint *destPixels, int length, uint color, uint const_alpha); + extern void QT_FASTCALL comp_func_Source_avx2(uint *destPixels, const uint *srcPixels, int length, uint const_alpha); + qt_functionForMode_C[QPainter::CompositionMode_SourceOver] = comp_func_SourceOver_avx2; + qt_functionForModeSolid_C[QPainter::CompositionMode_SourceOver] = comp_func_solid_SourceOver_avx2; + qt_functionForMode_C[QPainter::CompositionMode_Source] = comp_func_Source_avx2; } #endif - extern void QT_FASTCALL comp_func_SourceOver_sse2(uint *destPixels, const uint *srcPixels, int length, uint const_alpha); - extern void QT_FASTCALL comp_func_solid_SourceOver_sse2(uint *destPixels, int length, uint color, uint const_alpha); - extern void QT_FASTCALL comp_func_Source_sse2(uint *destPixels, const uint *srcPixels, int length, uint const_alpha); - extern void QT_FASTCALL comp_func_Plus_sse2(uint *destPixels, const uint *srcPixels, int length, uint const_alpha); - qt_functionForMode_C[QPainter::CompositionMode_SourceOver] = comp_func_SourceOver_sse2; - qt_functionForModeSolid_C[QPainter::CompositionMode_SourceOver] = comp_func_solid_SourceOver_sse2; - qt_functionForMode_C[QPainter::CompositionMode_Source] = comp_func_Source_sse2; - qt_functionForMode_C[QPainter::CompositionMode_Plus] = comp_func_Plus_sse2; #endif // SSE2 diff --git a/src/gui/painting/qdrawhelper_avx2.cpp b/src/gui/painting/qdrawhelper_avx2.cpp index 35a975c972..01ffd54918 100644 --- a/src/gui/painting/qdrawhelper_avx2.cpp +++ b/src/gui/painting/qdrawhelper_avx2.cpp @@ -37,12 +37,14 @@ ** ****************************************************************************/ -#include <private/qdrawhelper_p.h> +#include "qdrawhelper_p.h" +#include "qdrawingprimitive_sse2_p.h" #if defined(QT_COMPILER_SUPPORTS_AVX2) QT_BEGIN_NAMESPACE +// Autovectorized premultiply functions: const uint *QT_FASTCALL convertARGB32ToARGB32PM_avx2(uint *buffer, const uint *src, int count, const QVector<QRgb> *, QDitherInfo *) { @@ -55,6 +57,307 @@ const uint *QT_FASTCALL convertRGBA8888ToARGB32PM_avx2(uint *buffer, const uint return qt_convertRGBA8888ToARGB32PM(buffer, src, count); } +// Vectorized blend functions: + +// See BYTE_MUL_SSE2 for details. +inline static void BYTE_MUL_AVX2(__m256i &pixelVector, const __m256i &alphaChannel, const __m256i &colorMask, const __m256i &half) +{ + __m256i pixelVectorAG = _mm256_srli_epi16(pixelVector, 8); + __m256i pixelVectorRB = _mm256_and_si256(pixelVector, colorMask); + + pixelVectorAG = _mm256_mullo_epi16(pixelVectorAG, alphaChannel); + pixelVectorRB = _mm256_mullo_epi16(pixelVectorRB, alphaChannel); + + pixelVectorRB = _mm256_add_epi16(pixelVectorRB, _mm256_srli_epi16(pixelVectorRB, 8)); + pixelVectorAG = _mm256_add_epi16(pixelVectorAG, _mm256_srli_epi16(pixelVectorAG, 8)); + pixelVectorRB = _mm256_add_epi16(pixelVectorRB, half); + pixelVectorAG = _mm256_add_epi16(pixelVectorAG, half); + + pixelVectorRB = _mm256_srli_epi16(pixelVectorRB, 8); + pixelVectorAG = _mm256_andnot_si256(colorMask, pixelVectorAG); + + pixelVector = _mm256_or_si256(pixelVectorAG, pixelVectorRB); +} + +// See INTERPOLATE_PIXEL_255_SSE2 for details. +inline static void INTERPOLATE_PIXEL_255_AVX2(const __m256i &srcVector, __m256i &dstVector, const __m256i &alphaChannel, const __m256i &oneMinusAlphaChannel, const __m256i &colorMask, const __m256i &half) +{ + const __m256i srcVectorAG = _mm256_srli_epi16(srcVector, 8); + const __m256i dstVectorAG = _mm256_srli_epi16(dstVector, 8); + const __m256i srcVectorRB = _mm256_and_si256(srcVector, colorMask); + const __m256i dstVectorRB = _mm256_and_si256(dstVector, colorMask); + const __m256i srcVectorAGalpha = _mm256_mullo_epi16(srcVectorAG, alphaChannel); + const __m256i srcVectorRBalpha = _mm256_mullo_epi16(srcVectorRB, alphaChannel); + const __m256i dstVectorAGoneMinusAlpha = _mm256_mullo_epi16(dstVectorAG, oneMinusAlphaChannel); + const __m256i dstVectorRBoneMinusAlpha = _mm256_mullo_epi16(dstVectorRB, oneMinusAlphaChannel); + __m256i finalAG = _mm256_add_epi16(srcVectorAGalpha, dstVectorAGoneMinusAlpha); + __m256i finalRB = _mm256_add_epi16(srcVectorRBalpha, dstVectorRBoneMinusAlpha); + finalAG = _mm256_add_epi16(finalAG, _mm256_srli_epi16(finalAG, 8)); + finalRB = _mm256_add_epi16(finalRB, _mm256_srli_epi16(finalRB, 8)); + finalAG = _mm256_add_epi16(finalAG, half); + finalRB = _mm256_add_epi16(finalRB, half); + finalAG = _mm256_andnot_si256(colorMask, finalAG); + finalRB = _mm256_srli_epi16(finalRB, 8); + + dstVector = _mm256_or_si256(finalAG, finalRB); +} + +// See BLEND_SOURCE_OVER_ARGB32_SSE2 for details. +inline static void BLEND_SOURCE_OVER_ARGB32_AVX2(quint32 *dst, const quint32 *src, const int length) +{ + const __m256i half = _mm256_set1_epi16(0x80); + const __m256i one = _mm256_set1_epi16(0xff); + const __m256i colorMask = _mm256_set1_epi32(0x00ff00ff); + const __m256i alphaMask = _mm256_set1_epi32(0xff000000); + const __m256i offsetMask = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7); + const __m256i alphaShuffleMask = _mm256_set_epi8(char(0xff),15,char(0xff),15,char(0xff),11,char(0xff),11,char(0xff),7,char(0xff),7,char(0xff),3,char(0xff),3, + char(0xff),15,char(0xff),15,char(0xff),11,char(0xff),11,char(0xff),7,char(0xff),7,char(0xff),3,char(0xff),3); + + const int minusOffsetToAlignDstOn32Bytes = (reinterpret_cast<quintptr>(dst) >> 2) & 0x7; + + int x = 0; + // Prologue to handle all pixels until dst is 32-byte aligned in one step. + if (minusOffsetToAlignDstOn32Bytes != 0 && x < (length - 7)) { + const __m256i prologueMask = _mm256_sub_epi32(_mm256_set1_epi32(minusOffsetToAlignDstOn32Bytes - 1), offsetMask); + const __m256i srcVector = _mm256_maskload_epi32((const int *)&src[x - minusOffsetToAlignDstOn32Bytes], prologueMask); + const __m256i prologueAlphaMask = _mm256_blendv_epi8(_mm256_setzero_si256(), alphaMask, prologueMask); + if (!_mm256_testz_si256(srcVector, prologueAlphaMask)) { + if (_mm256_testc_si256(srcVector, prologueAlphaMask)) { + _mm256_maskstore_epi32((int *)&dst[x - minusOffsetToAlignDstOn32Bytes], prologueMask, srcVector); + } else { + __m256i alphaChannel = _mm256_shuffle_epi8(srcVector, alphaShuffleMask); + alphaChannel = _mm256_sub_epi16(one, alphaChannel); + __m256i dstVector = _mm256_maskload_epi32((int *)&dst[x - minusOffsetToAlignDstOn32Bytes], prologueMask); + BYTE_MUL_AVX2(dstVector, alphaChannel, colorMask, half); + dstVector = _mm256_add_epi8(dstVector, srcVector); + _mm256_maskstore_epi32((int *)&dst[x - minusOffsetToAlignDstOn32Bytes], prologueMask, dstVector); + } + } + x += (8 - minusOffsetToAlignDstOn32Bytes); + } + + for (; x < (length - 7); x += 8) { + const __m256i srcVector = _mm256_lddqu_si256((const __m256i *)&src[x]); + if (!_mm256_testz_si256(srcVector, alphaMask)) { + if (_mm256_testc_si256(srcVector, alphaMask)) { + _mm256_store_si256((__m256i *)&dst[x], srcVector); + } else { + __m256i alphaChannel = _mm256_shuffle_epi8(srcVector, alphaShuffleMask); + alphaChannel = _mm256_sub_epi16(one, alphaChannel); + __m256i dstVector = _mm256_load_si256((__m256i *)&dst[x]); + BYTE_MUL_AVX2(dstVector, alphaChannel, colorMask, half); + dstVector = _mm256_add_epi8(dstVector, srcVector); + _mm256_store_si256((__m256i *)&dst[x], dstVector); + } + } + } + + // Epilogue to handle all remaining pixels in one step. + if (x < length) { + const __m256i epilogueMask = _mm256_add_epi32(offsetMask, _mm256_set1_epi32(x - length)); + const __m256i srcVector = _mm256_maskload_epi32((const int *)&src[x], epilogueMask); + const __m256i epilogueAlphaMask = _mm256_blendv_epi8(_mm256_setzero_si256(), alphaMask, epilogueMask); + if (!_mm256_testz_si256(srcVector, epilogueAlphaMask)) { + if (_mm256_testc_si256(srcVector, epilogueAlphaMask)) { + _mm256_maskstore_epi32((int *)&dst[x], epilogueMask, srcVector); + } else { + __m256i alphaChannel = _mm256_shuffle_epi8(srcVector, alphaShuffleMask); + alphaChannel = _mm256_sub_epi16(one, alphaChannel); + __m256i dstVector = _mm256_maskload_epi32((int *)&dst[x], epilogueMask); + BYTE_MUL_AVX2(dstVector, alphaChannel, colorMask, half); + dstVector = _mm256_add_epi8(dstVector, srcVector); + _mm256_maskstore_epi32((int *)&dst[x], epilogueMask, dstVector); + } + } + } +} + + +// See BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_SSE2 for details. +inline static void BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_AVX2(quint32 *dst, const quint32 *src, const int length, const int const_alpha) +{ + int x = 0; + + ALIGNMENT_PROLOGUE_32BYTES(dst, x, length) + blend_pixel(dst[x], src[x], const_alpha); + + const __m256i half = _mm256_set1_epi16(0x80); + const __m256i one = _mm256_set1_epi16(0xff); + const __m256i colorMask = _mm256_set1_epi32(0x00ff00ff); + const __m256i alphaMask = _mm256_set1_epi32(0xff000000); + const __m256i alphaShuffleMask = _mm256_set_epi8(char(0xff),15,char(0xff),15,char(0xff),11,char(0xff),11,char(0xff),7,char(0xff),7,char(0xff),3,char(0xff),3, + char(0xff),15,char(0xff),15,char(0xff),11,char(0xff),11,char(0xff),7,char(0xff),7,char(0xff),3,char(0xff),3); + const __m256i constAlphaVector = _mm256_set1_epi16(const_alpha); + for (; x < (length - 7); x += 8) { + __m256i srcVector = _mm256_lddqu_si256((const __m256i *)&src[x]); + if (!_mm256_testz_si256(srcVector, alphaMask)) { + BYTE_MUL_AVX2(srcVector, constAlphaVector, colorMask, half); + + __m256i alphaChannel = _mm256_shuffle_epi8(srcVector, alphaShuffleMask); + alphaChannel = _mm256_sub_epi16(one, alphaChannel); + __m256i dstVector = _mm256_load_si256((__m256i *)&dst[x]); + BYTE_MUL_AVX2(dstVector, alphaChannel, colorMask, half); + dstVector = _mm256_add_epi8(dstVector, srcVector); + _mm256_store_si256((__m256i *)&dst[x], dstVector); + } + } + for (; x < length; ++x) + blend_pixel(dst[x], src[x], const_alpha); +} + +void qt_blend_argb32_on_argb32_avx2(uchar *destPixels, int dbpl, + const uchar *srcPixels, int sbpl, + int w, int h, + int const_alpha) +{ + if (const_alpha == 256) { + for (int y = 0; y < h; ++y) { + const quint32 *src = reinterpret_cast<const quint32 *>(srcPixels); + quint32 *dst = reinterpret_cast<quint32 *>(destPixels); + BLEND_SOURCE_OVER_ARGB32_AVX2(dst, src, w); + destPixels += dbpl; + srcPixels += sbpl; + } + } else if (const_alpha != 0) { + const_alpha = (const_alpha * 255) >> 8; + for (int y = 0; y < h; ++y) { + const quint32 *src = reinterpret_cast<const quint32 *>(srcPixels); + quint32 *dst = reinterpret_cast<quint32 *>(destPixels); + BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_AVX2(dst, src, w, const_alpha); + destPixels += dbpl; + srcPixels += sbpl; + } + } +} + +void qt_blend_rgb32_on_rgb32_avx2(uchar *destPixels, int dbpl, + const uchar *srcPixels, int sbpl, + int w, int h, + int const_alpha) +{ + if (const_alpha == 256) { + for (int y = 0; y < h; ++y) { + const quint32 *src = reinterpret_cast<const quint32 *>(srcPixels); + quint32 *dst = reinterpret_cast<quint32 *>(destPixels); + ::memcpy(dst, src, w * sizeof(uint)); + srcPixels += sbpl; + destPixels += dbpl; + } + return; + } + if (const_alpha == 0) + return; + + const __m256i half = _mm256_set1_epi16(0x80); + const __m256i colorMask = _mm256_set1_epi32(0x00ff00ff); + + const_alpha = (const_alpha * 255) >> 8; + int one_minus_const_alpha = 255 - const_alpha; + const __m256i constAlphaVector = _mm256_set1_epi16(const_alpha); + const __m256i oneMinusConstAlpha = _mm256_set1_epi16(one_minus_const_alpha); + for (int y = 0; y < h; ++y) { + const quint32 *src = reinterpret_cast<const quint32 *>(srcPixels); + quint32 *dst = reinterpret_cast<quint32 *>(destPixels); + int x = 0; + + // First, align dest to 32 bytes: + ALIGNMENT_PROLOGUE_32BYTES(dst, x, w) + dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], one_minus_const_alpha); + + // 2) interpolate pixels with AVX2 + for (; x < (w - 7); x += 8) { + const __m256i srcVector = _mm256_lddqu_si256((const __m256i *)&src[x]); + if (!_mm256_testc_si256(srcVector, _mm256_setzero_si256())) { + __m256i dstVector = _mm256_load_si256((__m256i *)&dst[x]); + INTERPOLATE_PIXEL_255_AVX2(srcVector, dstVector, constAlphaVector, oneMinusConstAlpha, colorMask, half); + _mm256_store_si256((__m256i *)&dst[x], dstVector); + } + } + + // 3) Epilogue + for (; x < w; ++x) + dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], one_minus_const_alpha); + + srcPixels += sbpl; + destPixels += dbpl; + } +} + +void QT_FASTCALL comp_func_SourceOver_avx2(uint *destPixels, const uint *srcPixels, int length, uint const_alpha) +{ + Q_ASSERT(const_alpha < 256); + + const quint32 *src = (const quint32 *) srcPixels; + quint32 *dst = (quint32 *) destPixels; + + if (const_alpha == 255) + BLEND_SOURCE_OVER_ARGB32_AVX2(dst, src, length); + else + BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_AVX2(dst, src, length, const_alpha); +} + +void QT_FASTCALL comp_func_Source_avx2(uint *dst, const uint *src, int length, uint const_alpha) +{ + if (const_alpha == 255) { + ::memcpy(dst, src, length * sizeof(uint)); + } else { + const int ialpha = 255 - const_alpha; + + int x = 0; + + // 1) prologue, align on 32 bytes + ALIGNMENT_PROLOGUE_32BYTES(dst, x, length) + dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], ialpha); + + // 2) interpolate pixels with AVX2 + const __m256i half = _mm256_set1_epi16(0x80); + const __m256i colorMask = _mm256_set1_epi32(0x00ff00ff); + const __m256i constAlphaVector = _mm256_set1_epi16(const_alpha); + const __m256i oneMinusConstAlpha = _mm256_set1_epi16(ialpha); + for (; x < length - 7; x += 8) { + const __m256i srcVector = _mm256_lddqu_si256((const __m256i *)&src[x]); + __m256i dstVector = _mm256_load_si256((__m256i *)&dst[x]); + INTERPOLATE_PIXEL_255_AVX2(srcVector, dstVector, constAlphaVector, oneMinusConstAlpha, colorMask, half); + _mm256_store_si256((__m256i *)&dst[x], dstVector); + } + + // 3) Epilogue + for (; x < length; ++x) + dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], ialpha); + } +} + +void QT_FASTCALL comp_func_solid_SourceOver_avx2(uint *destPixels, int length, uint color, uint const_alpha) +{ + if ((const_alpha & qAlpha(color)) == 255) { + qt_memfill32(destPixels, color, length); + } else { + if (const_alpha != 255) + color = BYTE_MUL(color, const_alpha); + + const quint32 minusAlphaOfColor = qAlpha(~color); + int x = 0; + + quint32 *dst = (quint32 *) destPixels; + const __m256i colorVector = _mm256_set1_epi32(color); + const __m256i colorMask = _mm256_set1_epi32(0x00ff00ff); + const __m256i half = _mm256_set1_epi16(0x80); + const __m256i minusAlphaOfColorVector = _mm256_set1_epi16(minusAlphaOfColor); + + ALIGNMENT_PROLOGUE_32BYTES(dst, x, length) + destPixels[x] = color + BYTE_MUL(destPixels[x], minusAlphaOfColor); + + for (; x < length - 7; x += 8) { + __m256i dstVector = _mm256_load_si256((__m256i *)&dst[x]); + BYTE_MUL_AVX2(dstVector, minusAlphaOfColorVector, colorMask, half); + dstVector = _mm256_add_epi8(colorVector, dstVector); + _mm256_store_si256((__m256i *)&dst[x], dstVector); + } + for (; x < length; ++x) + destPixels[x] = color + BYTE_MUL(destPixels[x], minusAlphaOfColor); + } +} + QT_END_NAMESPACE #endif |