diff options
author | Thiago Macieira <thiago.macieira@intel.com> | 2018-11-02 15:35:43 -0700 |
---|---|---|
committer | Thiago Macieira <thiago.macieira@intel.com> | 2018-12-11 19:04:58 +0000 |
commit | a440aada72f2ee78c5e27d70ecc79c0071673446 (patch) | |
tree | 47f8d256356718bed6e44e6a0b15311254bb4432 /src/gui/painting/qdrawhelper_sse2.cpp | |
parent | 12e843581adf156d30dd09579b53fa36a361c4c9 (diff) |
Add SSE2 qt_memfill64
Implemented by merging with the qt_memfill32 implementation in a
non-inlining function.
Change-Id: I343f2beed55440a7ac0bfffd15636f8ba995a2bd
Reviewed-by: Allan Sandfeld Jensen <allan.jensen@qt.io>
Diffstat (limited to 'src/gui/painting/qdrawhelper_sse2.cpp')
-rw-r--r-- | src/gui/painting/qdrawhelper_sse2.cpp | 80 |
1 files changed, 57 insertions, 23 deletions
diff --git a/src/gui/painting/qdrawhelper_sse2.cpp b/src/gui/painting/qdrawhelper_sse2.cpp
index 34bdf7909a..bf2e90f6af 100644
--- a/src/gui/painting/qdrawhelper_sse2.cpp
+++ b/src/gui/painting/qdrawhelper_sse2.cpp
@@ -233,19 +233,70 @@ void QT_FASTCALL comp_func_Source_sse2(uint *dst, const uint *src, int length, u
     }
 }
 
+static Q_NEVER_INLINE
+void Q_DECL_VECTORCALL qt_memfillXX_aligned(void *dest, __m128i value128, quintptr bytecount)
+{
+    __m128i *dst128 = reinterpret_cast<__m128i *>(dest);
+    __m128i *end128 = reinterpret_cast<__m128i *>(static_cast<uchar *>(dest) + bytecount);
+
+    while (dst128 + 4 <= end128) {
+        _mm_store_si128(dst128 + 0, value128);
+        _mm_store_si128(dst128 + 1, value128);
+        _mm_store_si128(dst128 + 2, value128);
+        _mm_store_si128(dst128 + 3, value128);
+        dst128 += 4;
+    }
+
+    bytecount %= 4 * sizeof(__m128i);
+    switch (bytecount / sizeof(__m128i)) {
+    case 3: _mm_store_si128(dst128++, value128); Q_FALLTHROUGH();
+    case 2: _mm_store_si128(dst128++, value128); Q_FALLTHROUGH();
+    case 1: _mm_store_si128(dst128++, value128);
+    }
+}
+
+void qt_memfill64(quint64 *dest, quint64 value, qsizetype count)
+{
+    quintptr misaligned = quintptr(dest) % sizeof(__m128i);
+    if (misaligned && count) {
+#if defined(Q_PROCESSOR_X86_32)
+        // Before SSE came out, the alignment of the stack used to be only 4
+        // bytes and some OS/ABIs (notably, code generated by MSVC) still only
+        // align to that. In any case, we cannot count on the alignment of
+        // quint64 to be 8 -- see QtPrivate::AlignOf_WorkaroundForI386Abi in
+        // qglobal.h.
+        //
+        // If the pointer is not aligned to at least 8 bytes, then we'll never
+        // in turn hit a multiple of 16 for the qt_memfillXX_aligned call
+        // below.
+        if (Q_UNLIKELY(misaligned % sizeof(quint64)))
+            return qt_memfill_template(dest, value, count);
+#endif
+
+        *dest++ = value;
+        --count;
+    }
+
+    if (count % 2) {
+        dest[count - 1] = value;
+        --count;
+    }
+
+    qt_memfillXX_aligned(dest, _mm_set1_epi64x(value), count * sizeof(quint64));
+}
+
 void qt_memfill32(quint32 *dest, quint32 value, qsizetype count)
 {
-    if (count < 7) {
+    if (count < 4) {
+        // this simplifies the code below: the first switch can fall through
+        // without checking the value of count
         switch (count) {
-        case 6: *dest++ = value; Q_FALLTHROUGH();
-        case 5: *dest++ = value; Q_FALLTHROUGH();
-        case 4: *dest++ = value; Q_FALLTHROUGH();
         case 3: *dest++ = value; Q_FALLTHROUGH();
         case 2: *dest++ = value; Q_FALLTHROUGH();
         case 1: *dest = value;
         }
         return;
-    };
+    }
 
     const int align = (quintptr)(dest) & 0xf;
     switch (align) {
@@ -263,24 +314,7 @@ void qt_memfill32(quint32 *dest, quint32 value, qsizetype count)
         }
     }
 
-    int count128 = count / 4;
-    __m128i *dst128 = reinterpret_cast<__m128i*>(dest);
-    __m128i *end128 = dst128 + count128;
-    const __m128i value128 = _mm_set1_epi32(value);
-
-    while (dst128 + 3 < end128) {
-        _mm_store_si128(dst128 + 0, value128);
-        _mm_store_si128(dst128 + 1, value128);
-        _mm_store_si128(dst128 + 2, value128);
-        _mm_store_si128(dst128 + 3, value128);
-        dst128 += 4;
-    }
-
-    switch (count128 & 0x3) {
-    case 3: _mm_store_si128(dst128++, value128); Q_FALLTHROUGH();
-    case 2: _mm_store_si128(dst128++, value128); Q_FALLTHROUGH();
-    case 1: _mm_store_si128(dst128++, value128);
-    }
+    qt_memfillXX_aligned(dest, _mm_set1_epi32(value), count * sizeof(quint32));
 }
 
 void QT_FASTCALL comp_func_solid_SourceOver_sse2(uint *destPixels, int length, uint color, uint const_alpha)