summaryrefslogtreecommitdiffstats
path: root/src/gui/painting/qdrawhelper_sse2.cpp
diff options
context:
space:
mode:
authorThiago Macieira <thiago.macieira@intel.com>2018-11-02 15:35:43 -0700
committerThiago Macieira <thiago.macieira@intel.com>2018-12-11 19:04:58 +0000
commita440aada72f2ee78c5e27d70ecc79c0071673446 (patch)
tree47f8d256356718bed6e44e6a0b15311254bb4432 /src/gui/painting/qdrawhelper_sse2.cpp
parent12e843581adf156d30dd09579b53fa36a361c4c9 (diff)
Add SSE2 qt_memfill64
Implemented by merging with the qt_memfill32 implementation in a non-inlining function. Change-Id: I343f2beed55440a7ac0bfffd15636f8ba995a2bd Reviewed-by: Allan Sandfeld Jensen <allan.jensen@qt.io>
Diffstat (limited to 'src/gui/painting/qdrawhelper_sse2.cpp')
-rw-r--r--src/gui/painting/qdrawhelper_sse2.cpp80
1 files changed, 57 insertions, 23 deletions
diff --git a/src/gui/painting/qdrawhelper_sse2.cpp b/src/gui/painting/qdrawhelper_sse2.cpp
index 34bdf7909a..bf2e90f6af 100644
--- a/src/gui/painting/qdrawhelper_sse2.cpp
+++ b/src/gui/painting/qdrawhelper_sse2.cpp
@@ -233,19 +233,70 @@ void QT_FASTCALL comp_func_Source_sse2(uint *dst, const uint *src, int length, u
}
}
+static Q_NEVER_INLINE
+void Q_DECL_VECTORCALL qt_memfillXX_aligned(void *dest, __m128i value128, quintptr bytecount)
+{
+ __m128i *dst128 = reinterpret_cast<__m128i *>(dest);
+ __m128i *end128 = reinterpret_cast<__m128i *>(static_cast<uchar *>(dest) + bytecount);
+
+ while (dst128 + 4 <= end128) {
+ _mm_store_si128(dst128 + 0, value128);
+ _mm_store_si128(dst128 + 1, value128);
+ _mm_store_si128(dst128 + 2, value128);
+ _mm_store_si128(dst128 + 3, value128);
+ dst128 += 4;
+ }
+
+ bytecount %= 4 * sizeof(__m128i);
+ switch (bytecount / sizeof(__m128i)) {
+ case 3: _mm_store_si128(dst128++, value128); Q_FALLTHROUGH();
+ case 2: _mm_store_si128(dst128++, value128); Q_FALLTHROUGH();
+ case 1: _mm_store_si128(dst128++, value128);
+ }
+}
+
+void qt_memfill64(quint64 *dest, quint64 value, qsizetype count)
+{
+ quintptr misaligned = quintptr(dest) % sizeof(__m128i);
+ if (misaligned && count) {
+#if defined(Q_PROCESSOR_X86_32)
+ // Before SSE came out, the alignment of the stack used to be only 4
+ // bytes and some OS/ABIs (notably, code generated by MSVC) still only
+ // align to that. In any case, we cannot count on the alignment of
+ // quint64 to be 8 -- see QtPrivate::AlignOf_WorkaroundForI386Abi in
+ // qglobal.h.
+ //
+ // If the pointer is not aligned to at least 8 bytes, then we'll never
+ // in turn hit a multiple of 16 for the qt_memfillXX_aligned call
+ // below.
+ if (Q_UNLIKELY(misaligned % sizeof(quint64)))
+ return qt_memfill_template(dest, value, count);
+#endif
+
+ *dest++ = value;
+ --count;
+ }
+
+ if (count % 2) {
+ dest[count - 1] = value;
+ --count;
+ }
+
+ qt_memfillXX_aligned(dest, _mm_set1_epi64x(value), count * sizeof(quint64));
+}
+
void qt_memfill32(quint32 *dest, quint32 value, qsizetype count)
{
- if (count < 7) {
+ if (count < 4) {
+ // this simplifies the code below: the first switch can fall through
+ // without checking the value of count
switch (count) {
- case 6: *dest++ = value; Q_FALLTHROUGH();
- case 5: *dest++ = value; Q_FALLTHROUGH();
- case 4: *dest++ = value; Q_FALLTHROUGH();
case 3: *dest++ = value; Q_FALLTHROUGH();
case 2: *dest++ = value; Q_FALLTHROUGH();
case 1: *dest = value;
}
return;
- };
+ }
const int align = (quintptr)(dest) & 0xf;
switch (align) {
@@ -263,24 +314,7 @@ void qt_memfill32(quint32 *dest, quint32 value, qsizetype count)
}
}
- int count128 = count / 4;
- __m128i *dst128 = reinterpret_cast<__m128i*>(dest);
- __m128i *end128 = dst128 + count128;
- const __m128i value128 = _mm_set1_epi32(value);
-
- while (dst128 + 3 < end128) {
- _mm_store_si128(dst128 + 0, value128);
- _mm_store_si128(dst128 + 1, value128);
- _mm_store_si128(dst128 + 2, value128);
- _mm_store_si128(dst128 + 3, value128);
- dst128 += 4;
- }
-
- switch (count128 & 0x3) {
- case 3: _mm_store_si128(dst128++, value128); Q_FALLTHROUGH();
- case 2: _mm_store_si128(dst128++, value128); Q_FALLTHROUGH();
- case 1: _mm_store_si128(dst128++, value128);
- }
+ qt_memfillXX_aligned(dest, _mm_set1_epi32(value), count * sizeof(quint32));
}
void QT_FASTCALL comp_func_solid_SourceOver_sse2(uint *destPixels, int length, uint color, uint const_alpha)