diff options
author | Thiago Macieira <thiago.macieira@intel.com> | 2018-11-02 19:38:29 -0700 |
---|---|---|
committer | Thiago Macieira <thiago.macieira@intel.com> | 2018-12-11 19:04:59 +0000 |
commit | 40894d1a60d357bc46364ae038ede0159f32261b (patch) | |
tree | 9aaf83d5aae8e0207916a35bfdc80590b3e9d929 /src | |
parent | a440aada72f2ee78c5e27d70ecc79c0071673446 (diff) |
Add AVX2 versions of qt_memfill32 and qt_memfill64
The implementation is almost the same 4-way-unrolled loop, but because
of the wider registers, we fill 128 bytes per loop. Unlike the SSE2
implementation, the AVX2 version uses unaligned stores and won't try to
align in the prologue, matching glibc's __memset_avx2 (also unaligned).
Change-Id: Iba4b5c183776497d8ee1fffd15637ccb2a7b83bc
Reviewed-by: Allan Sandfeld Jensen <allan.jensen@qt.io>
Diffstat (limited to 'src')
-rw-r--r-- | src/gui/painting/qdrawhelper.cpp | 10 | ||||
-rw-r--r-- | src/gui/painting/qdrawhelper_avx2.cpp | 53 | ||||
-rw-r--r-- | src/gui/painting/qdrawhelper_p.h | 5 | ||||
-rw-r--r-- | src/gui/painting/qdrawhelper_sse2.cpp | 6 | ||||
-rw-r--r-- | src/gui/painting/qdrawhelper_x86_p.h | 7 |
5 files changed, 76 insertions, 5 deletions
diff --git a/src/gui/painting/qdrawhelper.cpp b/src/gui/painting/qdrawhelper.cpp index d97ace7480..59b46b84ef 100644 --- a/src/gui/painting/qdrawhelper.cpp +++ b/src/gui/painting/qdrawhelper.cpp @@ -6288,6 +6288,10 @@ void qt_memfill32(quint32 *dest, quint32 color, qsizetype count) qt_memfill_template<quint32>(dest, color, count); } #endif +#ifdef __SSE2__ +decltype(qt_memfill32_sse2) *qt_memfill32 = nullptr; +decltype(qt_memfill64_sse2) *qt_memfill64 = nullptr; +#endif #ifdef QT_COMPILER_SUPPORTS_SSE4_1 template<QtPixelOrder> void QT_FASTCALL storeA2RGB30PMFromARGB32PM_sse4(uchar *dest, const uint *src, int index, int count, const QVector<QRgb> *, QDitherInfo *); @@ -6301,6 +6305,10 @@ static void qInitDrawhelperFunctions() qInitBlendFunctions(); #ifdef __SSE2__ +# ifndef __AVX2__ + qt_memfill32 = qt_memfill32_sse2; + qt_memfill64 = qt_memfill64_sse2; +# endif qDrawHelper[QImage::Format_RGB32].bitmapBlit = qt_bitmapblit32_sse2; qDrawHelper[QImage::Format_ARGB32].bitmapBlit = qt_bitmapblit32_sse2; qDrawHelper[QImage::Format_ARGB32_Premultiplied].bitmapBlit = qt_bitmapblit32_sse2; @@ -6407,6 +6415,8 @@ static void qInitDrawhelperFunctions() #if defined(QT_COMPILER_SUPPORTS_AVX2) if (qCpuHasFeature(ArchHaswell)) { + qt_memfill32 = qt_memfill32_avx2; + qt_memfill64 = qt_memfill64_avx2; extern void qt_blend_rgb32_on_rgb32_avx2(uchar *destPixels, int dbpl, const uchar *srcPixels, int sbpl, int w, int h, int const_alpha); diff --git a/src/gui/painting/qdrawhelper_avx2.cpp b/src/gui/painting/qdrawhelper_avx2.cpp index ec6643deed..adf11999bc 100644 --- a/src/gui/painting/qdrawhelper_avx2.cpp +++ b/src/gui/painting/qdrawhelper_avx2.cpp @@ -1,6 +1,7 @@ /**************************************************************************** ** -** Copyright (C) 2016 The Qt Company Ltd. +** Copyright (C) 2018 The Qt Company Ltd. +** Copyright (C) 2018 Intel Corporation. ** Contact: https://www.qt.io/licensing/ ** ** This file is part of the QtGui module of the Qt Toolkit. @@ -38,6 +39,7 @@ ****************************************************************************/ #include "qdrawhelper_p.h" +#include "qdrawhelper_x86_p.h" #include "qdrawingprimitive_sse2_p.h" #include "qrgba64_p.h" @@ -316,6 +318,55 @@ void qt_blend_rgb32_on_rgb32_avx2(uchar *destPixels, int dbpl, } } +static Q_NEVER_INLINE +void Q_DECL_VECTORCALL qt_memfillXX_avx2(uchar *dest, __m256i value256, qsizetype bytes) +{ + __m128i value128 = _mm256_castsi256_si128(value256); + + // main body + __m256i *dst256 = reinterpret_cast<__m256i *>(dest); + uchar *end = dest + bytes; + while (reinterpret_cast<uchar *>(dst256 + 4) <= end) { + _mm256_storeu_si256(dst256 + 0, value256); + _mm256_storeu_si256(dst256 + 1, value256); + _mm256_storeu_si256(dst256 + 2, value256); + _mm256_storeu_si256(dst256 + 3, value256); + dst256 += 4; + } + + // first epilogue: fewer than 128 bytes / 32 entries + bytes = end - reinterpret_cast<uchar *>(dst256); + switch (bytes / sizeof(value256)) { + case 3: _mm256_storeu_si256(dst256++, value256); Q_FALLTHROUGH(); + case 2: _mm256_storeu_si256(dst256++, value256); Q_FALLTHROUGH(); + case 1: _mm256_storeu_si256(dst256++, value256); + } + + // second epilogue: fewer than 32 bytes + __m128i *dst128 = reinterpret_cast<__m128i *>(dst256); + if (bytes & sizeof(value128)) + _mm_storeu_si128(dst128++, value128); + + // third epilogue: fewer than 16 bytes + if (bytes & 8) + _mm_storel_epi64(reinterpret_cast<__m128i *>(end - 8), value128); +} + +void qt_memfill64_avx2(quint64 *dest, quint64 value, qsizetype count) +{ + qt_memfillXX_avx2(reinterpret_cast<uchar *>(dest), _mm256_set1_epi64x(value), count * sizeof(quint64)); +} + +void qt_memfill32_avx2(quint32 *dest, quint32 value, qsizetype count) +{ + if (count % 2) { + // odd number of pixels, round to even + *dest++ = value; + --count; + } + qt_memfillXX_avx2(reinterpret_cast<uchar *>(dest), _mm256_set1_epi32(value), count * sizeof(quint32)); +} + void QT_FASTCALL comp_func_SourceOver_avx2(uint *destPixels, const uint *srcPixels, int length, uint const_alpha) { Q_ASSERT(const_alpha < 256); diff --git a/src/gui/painting/qdrawhelper_p.h b/src/gui/painting/qdrawhelper_p.h index 8a26d884a5..219734c430 100644 --- a/src/gui/painting/qdrawhelper_p.h +++ b/src/gui/painting/qdrawhelper_p.h @@ -167,8 +167,13 @@ extern DrawHelper qDrawHelper[QImage::NImageFormats]; void qBlendGradient(int count, const QSpan *spans, void *userData); void qBlendTexture(int count, const QSpan *spans, void *userData); +#ifdef __SSE2__ +extern void (*qt_memfill64)(quint64 *dest, quint64 value, qsizetype count); +extern void (*qt_memfill32)(quint32 *dest, quint32 value, qsizetype count); +#else extern void qt_memfill64(quint64 *dest, quint64 value, qsizetype count); extern void qt_memfill32(quint32 *dest, quint32 value, qsizetype count); +#endif extern void qt_memfill16(quint16 *dest, quint16 value, qsizetype count); typedef void (QT_FASTCALL *CompositionFunction)(uint *Q_DECL_RESTRICT dest, const uint *Q_DECL_RESTRICT src, int length, uint const_alpha); diff --git a/src/gui/painting/qdrawhelper_sse2.cpp b/src/gui/painting/qdrawhelper_sse2.cpp index bf2e90f6af..0ac9508264 100644 --- a/src/gui/painting/qdrawhelper_sse2.cpp +++ b/src/gui/painting/qdrawhelper_sse2.cpp @@ -233,6 +233,7 @@ void QT_FASTCALL comp_func_Source_sse2(uint *dst, const uint *src, int length, u } } +#ifndef __AVX2__ static Q_NEVER_INLINE void Q_DECL_VECTORCALL qt_memfillXX_aligned(void *dest, __m128i value128, quintptr bytecount) { @@ -255,7 +256,7 @@ void Q_DECL_VECTORCALL qt_memfillXX_aligned(void *dest, __m128i value128, quintp } } -void qt_memfill64(quint64 *dest, quint64 value, qsizetype count) +void qt_memfill64_sse2(quint64 *dest, quint64 value, qsizetype count) { quintptr misaligned = quintptr(dest) % sizeof(__m128i); if (misaligned && count) { @@ -285,7 +286,7 @@ void qt_memfill64(quint64 *dest, quint64 value, qsizetype count) qt_memfillXX_aligned(dest, _mm_set1_epi64x(value), count * sizeof(quint64)); } -void qt_memfill32(quint32 *dest, quint32 value, qsizetype count) +void qt_memfill32_sse2(quint32 *dest, quint32 value, qsizetype count) { if (count < 4) { // this simplifies the code below: the first switch can fall through @@ -316,6 +317,7 @@ void qt_memfill32(quint32 *dest, quint32 value, qsizetype count) qt_memfillXX_aligned(dest, _mm_set1_epi32(value), count * sizeof(quint32)); } +#endif // !__AVX2__ void QT_FASTCALL comp_func_solid_SourceOver_sse2(uint *destPixels, int length, uint color, uint const_alpha) { diff --git a/src/gui/painting/qdrawhelper_x86_p.h b/src/gui/painting/qdrawhelper_x86_p.h index 6f97242199..5749d8c9fb 100644 --- a/src/gui/painting/qdrawhelper_x86_p.h +++ b/src/gui/painting/qdrawhelper_x86_p.h @@ -57,8 +57,8 @@ QT_BEGIN_NAMESPACE #ifdef __SSE2__ -void qt_memfill64(quint64 *dest, quint64 value, qsizetype count); -void qt_memfill32(quint32 *dest, quint32 value, qsizetype count); +void qt_memfill64_sse2(quint64 *dest, quint64 value, qsizetype count); +void qt_memfill32_sse2(quint32 *dest, quint32 value, qsizetype count); void qt_bitmapblit32_sse2(QRasterBuffer *rasterBuffer, int x, int y, const QRgba64 &color, const uchar *src, int width, int height, int stride); @@ -79,6 +79,9 @@ void qt_blend_rgb32_on_rgb32_sse2(uchar *destPixels, int dbpl, extern CompositionFunction qt_functionForMode_SSE2[]; extern CompositionFunctionSolid qt_functionForModeSolid_SSE2[]; + +void qt_memfill64_avx2(quint64 *dest, quint64 value, qsizetype count); +void qt_memfill32_avx2(quint32 *dest, quint32 value, qsizetype count); #endif // __SSE2__ static const int numCompositionFunctions = 38; |