author    Allan Sandfeld Jensen <allan.jensen@theqtcompany.com>  2016-09-06 11:12:30 +0200
committer Allan Sandfeld Jensen <allan.jensen@qt.io>             2016-09-18 19:36:30 +0000
commit    8b2f91e32860243f8d635a3e94e53cc6afe6def3 (patch)
tree      a4109504885a8c24e5dac960c9af55a8a69aba91
parent    2d2d90781abdcdbd2a98201877563c83d926dd1a (diff)
Add AVX2 versions of the fast blending functions
This patch adds AVX2 versions of the fast blending functions that we
already have SSE2 versions of.

Change-Id: Ifd1a22f7891b6208cb74929ad26095d12c5a1efb
Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
-rw-r--r--  src/corelib/tools/qsimd_p.h            |   3
-rw-r--r--  src/gui/painting/qdrawhelper.cpp       |  42
-rw-r--r--  src/gui/painting/qdrawhelper_avx2.cpp  | 305
3 files changed, 340 insertions(+), 10 deletions(-)
diff --git a/src/corelib/tools/qsimd_p.h b/src/corelib/tools/qsimd_p.h
index 9d8695c0f6..2fd4be00a5 100644
--- a/src/corelib/tools/qsimd_p.h
+++ b/src/corelib/tools/qsimd_p.h
@@ -468,6 +468,9 @@ static inline quint64 qCpuFeatures()
#define ALIGNMENT_PROLOGUE_16BYTES(ptr, i, length) \
for (; i < static_cast<int>(qMin(static_cast<quintptr>(length), ((4 - ((reinterpret_cast<quintptr>(ptr) >> 2) & 0x3)) & 0x3))); ++i)
+#define ALIGNMENT_PROLOGUE_32BYTES(ptr, i, length) \
+ for (; i < static_cast<int>(qMin(static_cast<quintptr>(length), ((8 - ((reinterpret_cast<quintptr>(ptr) >> 2) & 0x7)) & 0x7))); ++i)
+
QT_END_NAMESPACE
#endif // QSIMD_P_H
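The new 32-byte prologue mirrors the existing 16-byte one: it loops over at most seven leading pixels until the destination pointer reaches a 32-byte boundary. A minimal sketch of the pixel count it computes (illustrative only, not part of the patch; the helper name is hypothetical):

    // How many 4-byte pixels must be handled scalar before ptr is 32-byte
    // aligned: ptr >> 2 counts whole pixels, its low three bits give the
    // misalignment in pixels, and the final "& 0x7" maps "already aligned"
    // (8 - 0 == 8) back to 0.
    static inline int pixelsUntil32ByteBoundary(const void *ptr)
    {
        return (8 - ((reinterpret_cast<quintptr>(ptr) >> 2) & 0x7)) & 0x7;
    }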
diff --git a/src/gui/painting/qdrawhelper.cpp b/src/gui/painting/qdrawhelper.cpp
index c697aceaf3..ea0c5bbb88 100644
--- a/src/gui/painting/qdrawhelper.cpp
+++ b/src/gui/painting/qdrawhelper.cpp
@@ -6548,6 +6548,15 @@ static void qInitDrawhelperFunctions()
qt_fetch_radial_gradient = qt_fetch_radial_gradient_sse2;
+ extern void QT_FASTCALL comp_func_SourceOver_sse2(uint *destPixels, const uint *srcPixels, int length, uint const_alpha);
+ extern void QT_FASTCALL comp_func_solid_SourceOver_sse2(uint *destPixels, int length, uint color, uint const_alpha);
+ extern void QT_FASTCALL comp_func_Source_sse2(uint *destPixels, const uint *srcPixels, int length, uint const_alpha);
+ extern void QT_FASTCALL comp_func_Plus_sse2(uint *destPixels, const uint *srcPixels, int length, uint const_alpha);
+ qt_functionForMode_C[QPainter::CompositionMode_SourceOver] = comp_func_SourceOver_sse2;
+ qt_functionForModeSolid_C[QPainter::CompositionMode_SourceOver] = comp_func_solid_SourceOver_sse2;
+ qt_functionForMode_C[QPainter::CompositionMode_Source] = comp_func_Source_sse2;
+ qt_functionForMode_C[QPainter::CompositionMode_Plus] = comp_func_Plus_sse2;
+
#ifdef QT_COMPILER_SUPPORTS_SSSE3
if (qCpuHasFeature(SSSE3)) {
extern void qt_blend_argb32_on_argb32_ssse3(uchar *destPixels, int dbpl,
@@ -6592,24 +6601,39 @@ static void qInitDrawhelperFunctions()
}
#endif
-#if defined(QT_COMPILER_SUPPORTS_AVX2) && !defined(__AVX2__)
+#if defined(QT_COMPILER_SUPPORTS_AVX2)
if (qCpuHasFeature(AVX2)) {
+#if !defined(__AVX2__)
extern const uint *QT_FASTCALL convertARGB32ToARGB32PM_avx2(uint *buffer, const uint *src, int count,
const QVector<QRgb> *, QDitherInfo *);
extern const uint *QT_FASTCALL convertRGBA8888ToARGB32PM_avx2(uint *buffer, const uint *src, int count,
const QVector<QRgb> *, QDitherInfo *);
qPixelLayouts[QImage::Format_ARGB32].convertToARGB32PM = convertARGB32ToARGB32PM_avx2;
qPixelLayouts[QImage::Format_RGBA8888].convertToARGB32PM = convertRGBA8888ToARGB32PM_avx2;
+#endif
+ extern void qt_blend_rgb32_on_rgb32_avx2(uchar *destPixels, int dbpl,
+ const uchar *srcPixels, int sbpl,
+ int w, int h, int const_alpha);
+ extern void qt_blend_argb32_on_argb32_avx2(uchar *destPixels, int dbpl,
+ const uchar *srcPixels, int sbpl,
+ int w, int h, int const_alpha);
+ qBlendFunctions[QImage::Format_RGB32][QImage::Format_RGB32] = qt_blend_rgb32_on_rgb32_avx2;
+ qBlendFunctions[QImage::Format_ARGB32_Premultiplied][QImage::Format_RGB32] = qt_blend_rgb32_on_rgb32_avx2;
+ qBlendFunctions[QImage::Format_RGB32][QImage::Format_ARGB32_Premultiplied] = qt_blend_argb32_on_argb32_avx2;
+ qBlendFunctions[QImage::Format_ARGB32_Premultiplied][QImage::Format_ARGB32_Premultiplied] = qt_blend_argb32_on_argb32_avx2;
+ qBlendFunctions[QImage::Format_RGBX8888][QImage::Format_RGBX8888] = qt_blend_rgb32_on_rgb32_avx2;
+ qBlendFunctions[QImage::Format_RGBA8888_Premultiplied][QImage::Format_RGBX8888] = qt_blend_rgb32_on_rgb32_avx2;
+ qBlendFunctions[QImage::Format_RGBX8888][QImage::Format_RGBA8888_Premultiplied] = qt_blend_argb32_on_argb32_avx2;
+ qBlendFunctions[QImage::Format_RGBA8888_Premultiplied][QImage::Format_RGBA8888_Premultiplied] = qt_blend_argb32_on_argb32_avx2;
+
+ extern void QT_FASTCALL comp_func_SourceOver_avx2(uint *destPixels, const uint *srcPixels, int length, uint const_alpha);
+ extern void QT_FASTCALL comp_func_solid_SourceOver_avx2(uint *destPixels, int length, uint color, uint const_alpha);
+ extern void QT_FASTCALL comp_func_Source_avx2(uint *destPixels, const uint *srcPixels, int length, uint const_alpha);
+ qt_functionForMode_C[QPainter::CompositionMode_SourceOver] = comp_func_SourceOver_avx2;
+ qt_functionForModeSolid_C[QPainter::CompositionMode_SourceOver] = comp_func_solid_SourceOver_avx2;
+ qt_functionForMode_C[QPainter::CompositionMode_Source] = comp_func_Source_avx2;
}
#endif
- extern void QT_FASTCALL comp_func_SourceOver_sse2(uint *destPixels, const uint *srcPixels, int length, uint const_alpha);
- extern void QT_FASTCALL comp_func_solid_SourceOver_sse2(uint *destPixels, int length, uint color, uint const_alpha);
- extern void QT_FASTCALL comp_func_Source_sse2(uint *destPixels, const uint *srcPixels, int length, uint const_alpha);
- extern void QT_FASTCALL comp_func_Plus_sse2(uint *destPixels, const uint *srcPixels, int length, uint const_alpha);
- qt_functionForMode_C[QPainter::CompositionMode_SourceOver] = comp_func_SourceOver_sse2;
- qt_functionForModeSolid_C[QPainter::CompositionMode_SourceOver] = comp_func_solid_SourceOver_sse2;
- qt_functionForMode_C[QPainter::CompositionMode_Source] = comp_func_Source_sse2;
- qt_functionForMode_C[QPainter::CompositionMode_Plus] = comp_func_Plus_sse2;
#endif // SSE2
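Note the ordering above: the SSE2 comp_func assignments are hoisted ahead of the AVX2 block, so the SSE2 pointers act as the baseline and the AVX2 block can simply overwrite SourceOver and Source when the CPU supports it (Plus keeps its SSE2 implementation). A minimal sketch of the dispatch pattern, assuming the function-pointer tables from qdrawhelper_p.h (the wrapper function name is hypothetical):

    // Install the widest implementation the CPU supports, narrowest first.
    static void installCompositionFunctions()
    {
        // Baseline: SSE2 (this whole code path already requires SSE2).
        qt_functionForMode_C[QPainter::CompositionMode_SourceOver] = comp_func_SourceOver_sse2;
    #if defined(QT_COMPILER_SUPPORTS_AVX2)
        if (qCpuHasFeature(AVX2)) // runtime check; compile-time support alone is not enough
            qt_functionForMode_C[QPainter::CompositionMode_SourceOver] = comp_func_SourceOver_avx2;
    #endif
    }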
diff --git a/src/gui/painting/qdrawhelper_avx2.cpp b/src/gui/painting/qdrawhelper_avx2.cpp
index 35a975c972..01ffd54918 100644
--- a/src/gui/painting/qdrawhelper_avx2.cpp
+++ b/src/gui/painting/qdrawhelper_avx2.cpp
@@ -37,12 +37,14 @@
**
****************************************************************************/
-#include <private/qdrawhelper_p.h>
+#include "qdrawhelper_p.h"
+#include "qdrawingprimitive_sse2_p.h"
#if defined(QT_COMPILER_SUPPORTS_AVX2)
QT_BEGIN_NAMESPACE
+// Autovectorized premultiply functions:
const uint *QT_FASTCALL convertARGB32ToARGB32PM_avx2(uint *buffer, const uint *src, int count,
const QVector<QRgb> *, QDitherInfo *)
{
@@ -55,6 +57,307 @@ const uint *QT_FASTCALL convertRGBA8888ToARGB32PM_avx2(uint *buffer, const uint
return qt_convertRGBA8888ToARGB32PM(buffer, src, count);
}
+// Vectorized blend functions:
+
+// See BYTE_MUL_SSE2 for details.
+inline static void BYTE_MUL_AVX2(__m256i &pixelVector, const __m256i &alphaChannel, const __m256i &colorMask, const __m256i &half)
+{
+ __m256i pixelVectorAG = _mm256_srli_epi16(pixelVector, 8);
+ __m256i pixelVectorRB = _mm256_and_si256(pixelVector, colorMask);
+
+ pixelVectorAG = _mm256_mullo_epi16(pixelVectorAG, alphaChannel);
+ pixelVectorRB = _mm256_mullo_epi16(pixelVectorRB, alphaChannel);
+
+ pixelVectorRB = _mm256_add_epi16(pixelVectorRB, _mm256_srli_epi16(pixelVectorRB, 8));
+ pixelVectorAG = _mm256_add_epi16(pixelVectorAG, _mm256_srli_epi16(pixelVectorAG, 8));
+ pixelVectorRB = _mm256_add_epi16(pixelVectorRB, half);
+ pixelVectorAG = _mm256_add_epi16(pixelVectorAG, half);
+
+ pixelVectorRB = _mm256_srli_epi16(pixelVectorRB, 8);
+ pixelVectorAG = _mm256_andnot_si256(colorMask, pixelVectorAG);
+
+ pixelVector = _mm256_or_si256(pixelVectorAG, pixelVectorRB);
+}
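BYTE_MUL_AVX2 multiplies every byte of eight pixels by an alpha value and divides by 255 using the classic (t + (t >> 8) + 0x80) >> 8 approximation; splitting each pixel into its AG and RB byte pairs gives each multiply 16 bits of headroom. A scalar sketch of the same per-pixel operation (illustrative; compare the scalar BYTE_MUL in qdrawhelper_p.h):

    // Multiply each of the four bytes of x by a (0..255) and divide by 255
    // via (t + (t >> 8) + 0x80) >> 8; the AG and RB byte pairs are processed
    // in parallel inside one 32-bit word, like the two vector halves above.
    static inline uint byteMulScalar(uint x, uint a)
    {
        uint rb = (x & 0x00ff00ff) * a;
        rb = ((rb + ((rb >> 8) & 0x00ff00ff) + 0x00800080) >> 8) & 0x00ff00ff;
        uint ag = ((x >> 8) & 0x00ff00ff) * a;
        ag = (ag + ((ag >> 8) & 0x00ff00ff) + 0x00800080) & 0xff00ff00;
        return ag | rb;
    }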
+
+// See INTERPOLATE_PIXEL_255_SSE2 for details.
+inline static void INTERPOLATE_PIXEL_255_AVX2(const __m256i &srcVector, __m256i &dstVector, const __m256i &alphaChannel, const __m256i &oneMinusAlphaChannel, const __m256i &colorMask, const __m256i &half)
+{
+ const __m256i srcVectorAG = _mm256_srli_epi16(srcVector, 8);
+ const __m256i dstVectorAG = _mm256_srli_epi16(dstVector, 8);
+ const __m256i srcVectorRB = _mm256_and_si256(srcVector, colorMask);
+ const __m256i dstVectorRB = _mm256_and_si256(dstVector, colorMask);
+ const __m256i srcVectorAGalpha = _mm256_mullo_epi16(srcVectorAG, alphaChannel);
+ const __m256i srcVectorRBalpha = _mm256_mullo_epi16(srcVectorRB, alphaChannel);
+ const __m256i dstVectorAGoneMinusAlpha = _mm256_mullo_epi16(dstVectorAG, oneMinusAlphaChannel);
+ const __m256i dstVectorRBoneMinusAlpha = _mm256_mullo_epi16(dstVectorRB, oneMinusAlphaChannel);
+ __m256i finalAG = _mm256_add_epi16(srcVectorAGalpha, dstVectorAGoneMinusAlpha);
+ __m256i finalRB = _mm256_add_epi16(srcVectorRBalpha, dstVectorRBoneMinusAlpha);
+ finalAG = _mm256_add_epi16(finalAG, _mm256_srli_epi16(finalAG, 8));
+ finalRB = _mm256_add_epi16(finalRB, _mm256_srli_epi16(finalRB, 8));
+ finalAG = _mm256_add_epi16(finalAG, half);
+ finalRB = _mm256_add_epi16(finalRB, half);
+ finalAG = _mm256_andnot_si256(colorMask, finalAG);
+ finalRB = _mm256_srli_epi16(finalRB, 8);
+
+ dstVector = _mm256_or_si256(finalAG, finalRB);
+}
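INTERPOLATE_PIXEL_255_AVX2 computes dst = (src * alpha + dst * (255 - alpha)) / 255 per channel, with the same byte-pair split and rounding trick as BYTE_MUL_AVX2. A scalar sketch (illustrative; compare the scalar INTERPOLATE_PIXEL_255 in qdrawhelper_p.h):

    // Blend x and y with weights a and b, where a + b == 255, so every
    // 16-bit field stays below 2^16 and fields cannot bleed into each other.
    static inline uint interpolatePixel255Scalar(uint x, uint a, uint y, uint b)
    {
        uint rb = (x & 0x00ff00ff) * a + (y & 0x00ff00ff) * b;
        rb = ((rb + ((rb >> 8) & 0x00ff00ff) + 0x00800080) >> 8) & 0x00ff00ff;
        uint ag = ((x >> 8) & 0x00ff00ff) * a + ((y >> 8) & 0x00ff00ff) * b;
        ag = (ag + ((ag >> 8) & 0x00ff00ff) + 0x00800080) & 0xff00ff00;
        return ag | rb;
    }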
+
+// See BLEND_SOURCE_OVER_ARGB32_SSE2 for details.
+inline static void BLEND_SOURCE_OVER_ARGB32_AVX2(quint32 *dst, const quint32 *src, const int length)
+{
+ const __m256i half = _mm256_set1_epi16(0x80);
+ const __m256i one = _mm256_set1_epi16(0xff);
+ const __m256i colorMask = _mm256_set1_epi32(0x00ff00ff);
+ const __m256i alphaMask = _mm256_set1_epi32(0xff000000);
+ const __m256i offsetMask = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+ const __m256i alphaShuffleMask = _mm256_set_epi8(char(0xff),15,char(0xff),15,char(0xff),11,char(0xff),11,char(0xff),7,char(0xff),7,char(0xff),3,char(0xff),3,
+ char(0xff),15,char(0xff),15,char(0xff),11,char(0xff),11,char(0xff),7,char(0xff),7,char(0xff),3,char(0xff),3);
+
+ const int minusOffsetToAlignDstOn32Bytes = (reinterpret_cast<quintptr>(dst) >> 2) & 0x7;
+
+ int x = 0;
+ // Prologue to handle all pixels until dst is 32-byte aligned in one step.
+ if (minusOffsetToAlignDstOn32Bytes != 0 && x < (length - 7)) {
+ const __m256i prologueMask = _mm256_sub_epi32(_mm256_set1_epi32(minusOffsetToAlignDstOn32Bytes - 1), offsetMask);
+ const __m256i srcVector = _mm256_maskload_epi32((const int *)&src[x - minusOffsetToAlignDstOn32Bytes], prologueMask);
+ const __m256i prologueAlphaMask = _mm256_blendv_epi8(_mm256_setzero_si256(), alphaMask, prologueMask);
+ if (!_mm256_testz_si256(srcVector, prologueAlphaMask)) {
+ if (_mm256_testc_si256(srcVector, prologueAlphaMask)) {
+ _mm256_maskstore_epi32((int *)&dst[x - minusOffsetToAlignDstOn32Bytes], prologueMask, srcVector);
+ } else {
+ __m256i alphaChannel = _mm256_shuffle_epi8(srcVector, alphaShuffleMask);
+ alphaChannel = _mm256_sub_epi16(one, alphaChannel);
+ __m256i dstVector = _mm256_maskload_epi32((int *)&dst[x - minusOffsetToAlignDstOn32Bytes], prologueMask);
+ BYTE_MUL_AVX2(dstVector, alphaChannel, colorMask, half);
+ dstVector = _mm256_add_epi8(dstVector, srcVector);
+ _mm256_maskstore_epi32((int *)&dst[x - minusOffsetToAlignDstOn32Bytes], prologueMask, dstVector);
+ }
+ }
+ x += (8 - minusOffsetToAlignDstOn32Bytes);
+ }
+
+ for (; x < (length - 7); x += 8) {
+ const __m256i srcVector = _mm256_lddqu_si256((const __m256i *)&src[x]);
+ if (!_mm256_testz_si256(srcVector, alphaMask)) {
+ if (_mm256_testc_si256(srcVector, alphaMask)) {
+ _mm256_store_si256((__m256i *)&dst[x], srcVector);
+ } else {
+ __m256i alphaChannel = _mm256_shuffle_epi8(srcVector, alphaShuffleMask);
+ alphaChannel = _mm256_sub_epi16(one, alphaChannel);
+ __m256i dstVector = _mm256_load_si256((__m256i *)&dst[x]);
+ BYTE_MUL_AVX2(dstVector, alphaChannel, colorMask, half);
+ dstVector = _mm256_add_epi8(dstVector, srcVector);
+ _mm256_store_si256((__m256i *)&dst[x], dstVector);
+ }
+ }
+ }
+
+ // Epilogue to handle all remaining pixels in one step.
+ if (x < length) {
+ const __m256i epilogueMask = _mm256_add_epi32(offsetMask, _mm256_set1_epi32(x - length));
+ const __m256i srcVector = _mm256_maskload_epi32((const int *)&src[x], epilogueMask);
+ const __m256i epilogueAlphaMask = _mm256_blendv_epi8(_mm256_setzero_si256(), alphaMask, epilogueMask);
+ if (!_mm256_testz_si256(srcVector, epilogueAlphaMask)) {
+ if (_mm256_testc_si256(srcVector, epilogueAlphaMask)) {
+ _mm256_maskstore_epi32((int *)&dst[x], epilogueMask, srcVector);
+ } else {
+ __m256i alphaChannel = _mm256_shuffle_epi8(srcVector, alphaShuffleMask);
+ alphaChannel = _mm256_sub_epi16(one, alphaChannel);
+ __m256i dstVector = _mm256_maskload_epi32((int *)&dst[x], epilogueMask);
+ BYTE_MUL_AVX2(dstVector, alphaChannel, colorMask, half);
+ dstVector = _mm256_add_epi8(dstVector, srcVector);
+ _mm256_maskstore_epi32((int *)&dst[x], epilogueMask, dstVector);
+ }
+ }
+ }
+}
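Each 8-pixel group takes one of three paths: _mm256_testz_si256 against the alpha mask detects a fully transparent group (nothing to do), _mm256_testc_si256 detects a fully opaque one (plain store), and only the mixed case pays for the full premultiplied source-over blend dst = src + dst * (255 - srcAlpha) / 255. The prologue and epilogue use masked loads and stores so the partial groups at either end follow the same three paths. A scalar sketch of what one lane computes (illustrative; byteMulScalar is the sketch above):

    // Premultiplied ARGB32 source-over for one pixel, with the same two
    // shortcuts the vector code takes per 8-pixel group.
    static inline void blendPixelScalar(quint32 &dst, quint32 src)
    {
        if (src >= 0xff000000)                            // srcAlpha == 255
            dst = src;
        else if (src != 0)                                // 0 < srcAlpha < 255
            dst = src + byteMulScalar(dst, qAlpha(~src)); // ~src gives 255 - alpha
    }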
+
+
+// See BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_SSE2 for details.
+inline static void BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_AVX2(quint32 *dst, const quint32 *src, const int length, const int const_alpha)
+{
+ int x = 0;
+
+ ALIGNMENT_PROLOGUE_32BYTES(dst, x, length)
+ blend_pixel(dst[x], src[x], const_alpha);
+
+ const __m256i half = _mm256_set1_epi16(0x80);
+ const __m256i one = _mm256_set1_epi16(0xff);
+ const __m256i colorMask = _mm256_set1_epi32(0x00ff00ff);
+ const __m256i alphaMask = _mm256_set1_epi32(0xff000000);
+ const __m256i alphaShuffleMask = _mm256_set_epi8(char(0xff),15,char(0xff),15,char(0xff),11,char(0xff),11,char(0xff),7,char(0xff),7,char(0xff),3,char(0xff),3,
+ char(0xff),15,char(0xff),15,char(0xff),11,char(0xff),11,char(0xff),7,char(0xff),7,char(0xff),3,char(0xff),3);
+ const __m256i constAlphaVector = _mm256_set1_epi16(const_alpha);
+ for (; x < (length - 7); x += 8) {
+ __m256i srcVector = _mm256_lddqu_si256((const __m256i *)&src[x]);
+ if (!_mm256_testz_si256(srcVector, alphaMask)) {
+ BYTE_MUL_AVX2(srcVector, constAlphaVector, colorMask, half);
+
+ __m256i alphaChannel = _mm256_shuffle_epi8(srcVector, alphaShuffleMask);
+ alphaChannel = _mm256_sub_epi16(one, alphaChannel);
+ __m256i dstVector = _mm256_load_si256((__m256i *)&dst[x]);
+ BYTE_MUL_AVX2(dstVector, alphaChannel, colorMask, half);
+ dstVector = _mm256_add_epi8(dstVector, srcVector);
+ _mm256_store_si256((__m256i *)&dst[x], dstVector);
+ }
+ }
+ for (; x < length; ++x)
+ blend_pixel(dst[x], src[x], const_alpha);
+}
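The const-alpha variant differs from plain source-over in a single step: the source is first scaled by const_alpha, then the same blend runs with the scaled source's alpha. That is what the scalar blend_pixel(dst[x], src[x], const_alpha) calls in the prologue and epilogue compute; a sketch (illustrative, reusing byteMulScalar from above):

    static inline void blendPixelScalar(quint32 &dst, quint32 src, int constAlpha)
    {
        if (src != 0) {
            src = byteMulScalar(src, constAlpha);         // pre-scale the source
            dst = src + byteMulScalar(dst, qAlpha(~src)); // then regular source-over
        }
    }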
+
+void qt_blend_argb32_on_argb32_avx2(uchar *destPixels, int dbpl,
+ const uchar *srcPixels, int sbpl,
+ int w, int h,
+ int const_alpha)
+{
+ if (const_alpha == 256) {
+ for (int y = 0; y < h; ++y) {
+ const quint32 *src = reinterpret_cast<const quint32 *>(srcPixels);
+ quint32 *dst = reinterpret_cast<quint32 *>(destPixels);
+ BLEND_SOURCE_OVER_ARGB32_AVX2(dst, src, w);
+ destPixels += dbpl;
+ srcPixels += sbpl;
+ }
+ } else if (const_alpha != 0) {
+ const_alpha = (const_alpha * 255) >> 8;
+ for (int y = 0; y < h; ++y) {
+ const quint32 *src = reinterpret_cast<const quint32 *>(srcPixels);
+ quint32 *dst = reinterpret_cast<quint32 *>(destPixels);
+ BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_AVX2(dst, src, w, const_alpha);
+ destPixels += dbpl;
+ srcPixels += sbpl;
+ }
+ }
+}
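Qt hands composition code const_alpha in the range 0..256, while the byte arithmetic above wants 0..255, so (const_alpha * 255) >> 8 remaps the endpoint: 256 becomes 255, 0 stays 0, and intermediate values drop by at most one (128 maps to 127, for example). A one-line sketch (hypothetical helper name):

    static inline int remapConstAlpha(int a) { return (a * 255) >> 8; } // 0..256 -> 0..255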
+
+void qt_blend_rgb32_on_rgb32_avx2(uchar *destPixels, int dbpl,
+ const uchar *srcPixels, int sbpl,
+ int w, int h,
+ int const_alpha)
+{
+ if (const_alpha == 256) {
+ for (int y = 0; y < h; ++y) {
+ const quint32 *src = reinterpret_cast<const quint32 *>(srcPixels);
+ quint32 *dst = reinterpret_cast<quint32 *>(destPixels);
+ ::memcpy(dst, src, w * sizeof(uint));
+ srcPixels += sbpl;
+ destPixels += dbpl;
+ }
+ return;
+ }
+ if (const_alpha == 0)
+ return;
+
+ const __m256i half = _mm256_set1_epi16(0x80);
+ const __m256i colorMask = _mm256_set1_epi32(0x00ff00ff);
+
+ const_alpha = (const_alpha * 255) >> 8;
+ int one_minus_const_alpha = 255 - const_alpha;
+ const __m256i constAlphaVector = _mm256_set1_epi16(const_alpha);
+ const __m256i oneMinusConstAlpha = _mm256_set1_epi16(one_minus_const_alpha);
+ for (int y = 0; y < h; ++y) {
+ const quint32 *src = reinterpret_cast<const quint32 *>(srcPixels);
+ quint32 *dst = reinterpret_cast<quint32 *>(destPixels);
+ int x = 0;
+
+ // First, align dest to 32 bytes:
+ ALIGNMENT_PROLOGUE_32BYTES(dst, x, w)
+ dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], one_minus_const_alpha);
+
+ // 2) interpolate pixels with AVX2
+ // No per-group skip here: _mm256_testc_si256 against an all-zero vector is
+ // always true, so a guard on it would never run; with constant alpha every
+ // pixel must be interpolated anyway.
+ for (; x < (w - 7); x += 8) {
+ const __m256i srcVector = _mm256_lddqu_si256((const __m256i *)&src[x]);
+ __m256i dstVector = _mm256_load_si256((__m256i *)&dst[x]);
+ INTERPOLATE_PIXEL_255_AVX2(srcVector, dstVector, constAlphaVector, oneMinusConstAlpha, colorMask, half);
+ _mm256_store_si256((__m256i *)&dst[x], dstVector);
+ }
+
+ // 3) Epilogue
+ for (; x < w; ++x)
+ dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], one_minus_const_alpha);
+
+ srcPixels += sbpl;
+ destPixels += dbpl;
+ }
+}
+
+void QT_FASTCALL comp_func_SourceOver_avx2(uint *destPixels, const uint *srcPixels, int length, uint const_alpha)
+{
+ Q_ASSERT(const_alpha < 256);
+
+ const quint32 *src = (const quint32 *) srcPixels;
+ quint32 *dst = (quint32 *) destPixels;
+
+ if (const_alpha == 255)
+ BLEND_SOURCE_OVER_ARGB32_AVX2(dst, src, length);
+ else
+ BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_AVX2(dst, src, length, const_alpha);
+}
+
+void QT_FASTCALL comp_func_Source_avx2(uint *dst, const uint *src, int length, uint const_alpha)
+{
+ if (const_alpha == 255) {
+ ::memcpy(dst, src, length * sizeof(uint));
+ } else {
+ const int ialpha = 255 - const_alpha;
+
+ int x = 0;
+
+ // 1) prologue, align on 32 bytes
+ ALIGNMENT_PROLOGUE_32BYTES(dst, x, length)
+ dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], ialpha);
+
+ // 2) interpolate pixels with AVX2
+ const __m256i half = _mm256_set1_epi16(0x80);
+ const __m256i colorMask = _mm256_set1_epi32(0x00ff00ff);
+ const __m256i constAlphaVector = _mm256_set1_epi16(const_alpha);
+ const __m256i oneMinusConstAlpha = _mm256_set1_epi16(ialpha);
+ for (; x < length - 7; x += 8) {
+ const __m256i srcVector = _mm256_lddqu_si256((const __m256i *)&src[x]);
+ __m256i dstVector = _mm256_load_si256((__m256i *)&dst[x]);
+ INTERPOLATE_PIXEL_255_AVX2(srcVector, dstVector, constAlphaVector, oneMinusConstAlpha, colorMask, half);
+ _mm256_store_si256((__m256i *)&dst[x], dstVector);
+ }
+
+ // 3) Epilogue
+ for (; x < length; ++x)
+ dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], ialpha);
+ }
+}
+
+void QT_FASTCALL comp_func_solid_SourceOver_avx2(uint *destPixels, int length, uint color, uint const_alpha)
+{
+ if ((const_alpha & qAlpha(color)) == 255) {
+ qt_memfill32(destPixels, color, length);
+ } else {
+ if (const_alpha != 255)
+ color = BYTE_MUL(color, const_alpha);
+
+ const quint32 minusAlphaOfColor = qAlpha(~color);
+ int x = 0;
+
+ quint32 *dst = (quint32 *) destPixels;
+ const __m256i colorVector = _mm256_set1_epi32(color);
+ const __m256i colorMask = _mm256_set1_epi32(0x00ff00ff);
+ const __m256i half = _mm256_set1_epi16(0x80);
+ const __m256i minusAlphaOfColorVector = _mm256_set1_epi16(minusAlphaOfColor);
+
+ ALIGNMENT_PROLOGUE_32BYTES(dst, x, length)
+ destPixels[x] = color + BYTE_MUL(destPixels[x], minusAlphaOfColor);
+
+ for (; x < length - 7; x += 8) {
+ __m256i dstVector = _mm256_load_si256((__m256i *)&dst[x]);
+ BYTE_MUL_AVX2(dstVector, minusAlphaOfColorVector, colorMask, half);
+ dstVector = _mm256_add_epi8(colorVector, dstVector);
+ _mm256_store_si256((__m256i *)&dst[x], dstVector);
+ }
+ for (; x < length; ++x)
+ destPixels[x] = color + BYTE_MUL(destPixels[x], minusAlphaOfColor);
+ }
+}
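For a solid premultiplied color, source-over reduces to dst = color + dst * (255 - qAlpha(color)) / 255, with the inverse alpha hoisted out of the loop. The opening test works because const_alpha and qAlpha(color) are both bytes, and a bitwise AND of two bytes equals 255 only when every bit is set in both, i.e. both are 255. A scalar sketch of the whole function (illustrative, reusing byteMulScalar from above):

    static void compFuncSolidSourceOverScalar(uint *dest, int length, uint color, uint constAlpha)
    {
        if ((constAlpha & qAlpha(color)) == 255) {        // fully opaque: plain fill
            for (int i = 0; i < length; ++i)
                dest[i] = color;
        } else {
            if (constAlpha != 255)
                color = byteMulScalar(color, constAlpha); // scale the color once
            const uint minusAlpha = qAlpha(~color);       // 255 - source alpha
            for (int i = 0; i < length; ++i)
                dest[i] = color + byteMulScalar(dest[i], minusAlpha);
        }
    }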
+
QT_END_NAMESPACE
#endif