Diffstat (limited to 'src/gui/painting/qdrawhelper_sse2.cpp')
-rw-r--r--  src/gui/painting/qdrawhelper_sse2.cpp  138
1 file changed, 76 insertions(+), 62 deletions(-)
diff --git a/src/gui/painting/qdrawhelper_sse2.cpp b/src/gui/painting/qdrawhelper_sse2.cpp
index 3212ffdd2d..c82f41ec88 100644
--- a/src/gui/painting/qdrawhelper_sse2.cpp
+++ b/src/gui/painting/qdrawhelper_sse2.cpp
@@ -233,19 +233,71 @@ void QT_FASTCALL comp_func_Source_sse2(uint *dst, const uint *src, int length, u
}
}
-void qt_memfill32(quint32 *dest, quint32 value, int count)
+#ifndef __AVX2__
+static Q_NEVER_INLINE
+void Q_DECL_VECTORCALL qt_memfillXX_aligned(void *dest, __m128i value128, quintptr bytecount)
{
- if (count < 7) {
+ __m128i *dst128 = reinterpret_cast<__m128i *>(dest);
+ __m128i *end128 = reinterpret_cast<__m128i *>(static_cast<uchar *>(dest) + bytecount);
+
+ while (dst128 + 4 <= end128) {
+ _mm_store_si128(dst128 + 0, value128);
+ _mm_store_si128(dst128 + 1, value128);
+ _mm_store_si128(dst128 + 2, value128);
+ _mm_store_si128(dst128 + 3, value128);
+ dst128 += 4;
+ }
+
+ bytecount %= 4 * sizeof(__m128i);
+ switch (bytecount / sizeof(__m128i)) {
+ case 3: _mm_store_si128(dst128++, value128); Q_FALLTHROUGH();
+ case 2: _mm_store_si128(dst128++, value128); Q_FALLTHROUGH();
+ case 1: _mm_store_si128(dst128++, value128);
+ }
+}
+
+void qt_memfill64_sse2(quint64 *dest, quint64 value, qsizetype count)
+{
+ quintptr misaligned = quintptr(dest) % sizeof(__m128i);
+ if (misaligned && count) {
+#if defined(Q_PROCESSOR_X86_32)
+ // Before SSE came out, the alignment of the stack used to be only 4
+ // bytes and some OS/ABIs (notably, code generated by MSVC) still only
+ // align to that. In any case, we cannot count on the alignment of
+ // quint64 to be 8 -- see QtPrivate::AlignOf_WorkaroundForI386Abi in
+ // qglobal.h.
+ //
+ // If the pointer is not aligned to at least 8 bytes, then we'll never
+ // in turn hit a multiple of 16 for the qt_memfillXX_aligned call
+ // below.
+ if (Q_UNLIKELY(misaligned % sizeof(quint64)))
+ return qt_memfill_template(dest, value, count);
+#endif
+
+ *dest++ = value;
+ --count;
+ }
+
+ if (count % 2) {
+ dest[count - 1] = value;
+ --count;
+ }
+
+ qt_memfillXX_aligned(dest, _mm_set1_epi64x(value), count * sizeof(quint64));
+}
+
+void qt_memfill32_sse2(quint32 *dest, quint32 value, qsizetype count)
+{
+ if (count < 4) {
+ // this simplifies the code below: the first switch can fall through
+ // without checking the value of count
switch (count) {
- case 6: *dest++ = value; Q_FALLTHROUGH();
- case 5: *dest++ = value; Q_FALLTHROUGH();
- case 4: *dest++ = value; Q_FALLTHROUGH();
case 3: *dest++ = value; Q_FALLTHROUGH();
case 2: *dest++ = value; Q_FALLTHROUGH();
case 1: *dest = value;
}
return;
- };
+ }
const int align = (quintptr)(dest) & 0xf;
switch (align) {
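
A note on qt_memfillXX_aligned above: the only hard requirement is that dest be 16-byte aligned. The main loop retires four vector stores per iteration, the trailing switch finishes the zero to three whole vectors left over, and a bytecount that is not a multiple of sizeof(__m128i) is floored, so any sub-vector tail stays the caller's responsibility (qt_memfill64_sse2 evens its count beforehand for exactly this reason). Below is a minimal standalone restatement of the pattern with a scalar self-check; it compiles outside Qt, with [[fallthrough]] in place of Q_FALLTHROUGH() and illustrative names:

#include <emmintrin.h>
#include <cassert>
#include <cstdint>
#include <cstring>

// dest must be 16-byte aligned; bytecount is floored to whole vectors.
static void fill_aligned_sketch(void *dest, __m128i value128, size_t bytecount)
{
    __m128i *dst128 = static_cast<__m128i *>(dest);
    __m128i *end128 = reinterpret_cast<__m128i *>(static_cast<unsigned char *>(dest) + bytecount);

    // Main loop: four 16-byte stores per iteration.
    while (dst128 + 4 <= end128) {
        _mm_store_si128(dst128 + 0, value128);
        _mm_store_si128(dst128 + 1, value128);
        _mm_store_si128(dst128 + 2, value128);
        _mm_store_si128(dst128 + 3, value128);
        dst128 += 4;
    }

    // 0..3 whole vectors remain; sub-vector bytes are deliberately ignored.
    bytecount %= 4 * sizeof(__m128i);
    switch (bytecount / sizeof(__m128i)) {
    case 3: _mm_store_si128(dst128++, value128); [[fallthrough]];
    case 2: _mm_store_si128(dst128++, value128); [[fallthrough]];
    case 1: _mm_store_si128(dst128++, value128);
    }
}

int main()
{
    alignas(16) uint32_t buf[64];
    for (size_t vectors = 0; vectors <= 8; ++vectors) {
        memset(buf, 0, sizeof(buf));
        fill_aligned_sketch(buf, _mm_set1_epi32(0x01020304), vectors * sizeof(__m128i));
        for (size_t i = 0; i < 64; ++i)
            assert(buf[i] == (i < vectors * 4 ? 0x01020304u : 0u));
    }
    return 0;
}

Note also that the new helper uses regular _mm_store_si128 where the old loop used _mm_stream_si128; non-temporal stores bypass the cache, which is a loss when the filled pixels are consumed right away, so the plain cached store is presumably the safer default here.
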
@@ -263,25 +315,9 @@ void qt_memfill32(quint32 *dest, quint32 value, int count)
}
}
- int count128 = count / 4;
- __m128i *dst128 = reinterpret_cast<__m128i*>(dest);
- __m128i *end128 = dst128 + count128;
- const __m128i value128 = _mm_set_epi32(value, value, value, value);
-
- while (dst128 + 3 < end128) {
- _mm_stream_si128(dst128 + 0, value128);
- _mm_stream_si128(dst128 + 1, value128);
- _mm_stream_si128(dst128 + 2, value128);
- _mm_stream_si128(dst128 + 3, value128);
- dst128 += 4;
- }
-
- switch (count128 & 0x3) {
- case 3: _mm_stream_si128(dst128++, value128); Q_FALLTHROUGH();
- case 2: _mm_stream_si128(dst128++, value128); Q_FALLTHROUGH();
- case 1: _mm_stream_si128(dst128++, value128);
- }
+ qt_memfillXX_aligned(dest, _mm_set1_epi32(value), count * sizeof(quint32));
}
+#endif // !__AVX2__
void QT_FASTCALL comp_func_solid_SourceOver_sse2(uint *destPixels, int length, uint color, uint const_alpha)
{
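
Worth spelling out from qt_memfill64_sse2 above: a quint64 pointer is either 16-byte aligned already or exactly 8 bytes past a boundary (given natural 8-byte alignment, which the x86-32 branch guards against), so at most one scalar head store fixes alignment; and writing the final element early whenever count is odd leaves an even count, so the middle region spans an exact number of vectors and the helper's flooring discards nothing. A sketch of the same peeling, reusing fill_aligned_sketch from the previous example (x86-32 fallback omitted; needs <cstddef> for ptrdiff_t in addition to the includes above):

static void memfill64_sketch(uint64_t *dest, uint64_t value, ptrdiff_t count)
{
    // Head: a naturally aligned uint64_t pointer is 16-byte aligned or
    // exactly 8 bytes off, so one scalar store is enough to align it.
    if (reinterpret_cast<uintptr_t>(dest) % sizeof(__m128i) && count) {
        *dest++ = value;
        --count;
    }

    // Tail: write the last element now if count is odd, so the middle
    // spans whole 16-byte vectors and nothing is floored away.
    if (count % 2) {
        dest[count - 1] = value;
        --count;
    }

    fill_aligned_sketch(dest, _mm_set1_epi64x(value), count * sizeof(uint64_t));
}

Worked example: for count = 7 with dest 8 bytes off a boundary, the head store leaves count = 6, which is already even, and 6 * 8 = 48 bytes is exactly three vectors for the trailing switch.
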
@@ -314,28 +350,6 @@ void QT_FASTCALL comp_func_solid_SourceOver_sse2(uint *destPixels, int length, u
}
}
-void qt_memfill16(quint16 *dest, quint16 value, int count)
-{
- if (count < 3) {
- switch (count) {
- case 2: *dest++ = value; Q_FALLTHROUGH();
- case 1: *dest = value;
- }
- return;
- }
-
- const int align = (quintptr)(dest) & 0x3;
- switch (align) {
- case 2: *dest++ = value; --count;
- }
-
- const quint32 value32 = (value << 16) | value;
- qt_memfill32(reinterpret_cast<quint32*>(dest), value32, count / 2);
-
- if (count & 0x1)
- dest[count - 1] = value;
-}
-
void qt_bitmapblit32_sse2_base(QRasterBuffer *rasterBuffer, int x, int y,
quint32 color,
const uchar *src, int width, int height, int stride)
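
The qt_memfill16 removed above is worth restating, since its widening trick is generic: align the 16-bit destination to a 4-byte boundary with at most one scalar store, splat the value into a 32-bit word, let a 32-bit fill do the bulk, and finish an odd trailing element by hand. Where the 16-bit fill lives after this change is outside this diff; the sketch below merely restates the deleted logic with the 32-bit fill passed in as a parameter (same includes as the earlier examples):

static void memfill16_sketch(uint16_t *dest, uint16_t value, ptrdiff_t count,
                             void (*fill32)(uint32_t *, uint32_t, ptrdiff_t))
{
    if (count < 3) {
        switch (count) {
        case 2: *dest++ = value; [[fallthrough]];
        case 1: *dest = value;
        }
        return;
    }

    // A uint16_t pointer is 2-byte aligned, so (address & 0x3) is 0 or 2;
    // one scalar store fixes the odd slot.
    if (reinterpret_cast<uintptr_t>(dest) % 4) {
        *dest++ = value;
        --count;
    }

    // Two 16-bit halves per 32-bit word; count / 2 floors, so an odd
    // trailing element is written separately.
    const uint32_t value32 = (uint32_t(value) << 16) | value;
    fill32(reinterpret_cast<uint32_t *>(dest), value32, count / 2);
    if (count % 2)
        dest[count - 1] = value;
}
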
@@ -440,30 +454,30 @@ public:
union Vect_buffer_i { Int32x4 v; int i[4]; };
union Vect_buffer_f { Float32x4 v; float f[4]; };
- static inline Float32x4 v_dup(float x) { return _mm_set1_ps(x); }
- static inline Float32x4 v_dup(double x) { return _mm_set1_ps(x); }
- static inline Int32x4 v_dup(int x) { return _mm_set1_epi32(x); }
- static inline Int32x4 v_dup(uint x) { return _mm_set1_epi32(x); }
+ static inline Float32x4 Q_DECL_VECTORCALL v_dup(float x) { return _mm_set1_ps(x); }
+ static inline Float32x4 Q_DECL_VECTORCALL v_dup(double x) { return _mm_set1_ps(x); }
+ static inline Int32x4 Q_DECL_VECTORCALL v_dup(int x) { return _mm_set1_epi32(x); }
+ static inline Int32x4 Q_DECL_VECTORCALL v_dup(uint x) { return _mm_set1_epi32(x); }
- static inline Float32x4 v_add(Float32x4 a, Float32x4 b) { return _mm_add_ps(a, b); }
- static inline Int32x4 v_add(Int32x4 a, Int32x4 b) { return _mm_add_epi32(a, b); }
+ static inline Float32x4 Q_DECL_VECTORCALL v_add(Float32x4 a, Float32x4 b) { return _mm_add_ps(a, b); }
+ static inline Int32x4 Q_DECL_VECTORCALL v_add(Int32x4 a, Int32x4 b) { return _mm_add_epi32(a, b); }
- static inline Float32x4 v_max(Float32x4 a, Float32x4 b) { return _mm_max_ps(a, b); }
- static inline Float32x4 v_min(Float32x4 a, Float32x4 b) { return _mm_min_ps(a, b); }
- static inline Int32x4 v_min_16(Int32x4 a, Int32x4 b) { return _mm_min_epi16(a, b); }
+ static inline Float32x4 Q_DECL_VECTORCALL v_max(Float32x4 a, Float32x4 b) { return _mm_max_ps(a, b); }
+ static inline Float32x4 Q_DECL_VECTORCALL v_min(Float32x4 a, Float32x4 b) { return _mm_min_ps(a, b); }
+ static inline Int32x4 Q_DECL_VECTORCALL v_min_16(Int32x4 a, Int32x4 b) { return _mm_min_epi16(a, b); }
- static inline Int32x4 v_and(Int32x4 a, Int32x4 b) { return _mm_and_si128(a, b); }
+ static inline Int32x4 Q_DECL_VECTORCALL v_and(Int32x4 a, Int32x4 b) { return _mm_and_si128(a, b); }
- static inline Float32x4 v_sub(Float32x4 a, Float32x4 b) { return _mm_sub_ps(a, b); }
- static inline Int32x4 v_sub(Int32x4 a, Int32x4 b) { return _mm_sub_epi32(a, b); }
+ static inline Float32x4 Q_DECL_VECTORCALL v_sub(Float32x4 a, Float32x4 b) { return _mm_sub_ps(a, b); }
+ static inline Int32x4 Q_DECL_VECTORCALL v_sub(Int32x4 a, Int32x4 b) { return _mm_sub_epi32(a, b); }
- static inline Float32x4 v_mul(Float32x4 a, Float32x4 b) { return _mm_mul_ps(a, b); }
+ static inline Float32x4 Q_DECL_VECTORCALL v_mul(Float32x4 a, Float32x4 b) { return _mm_mul_ps(a, b); }
- static inline Float32x4 v_sqrt(Float32x4 x) { return _mm_sqrt_ps(x); }
+ static inline Float32x4 Q_DECL_VECTORCALL v_sqrt(Float32x4 x) { return _mm_sqrt_ps(x); }
- static inline Int32x4 v_toInt(Float32x4 x) { return _mm_cvttps_epi32(x); }
+ static inline Int32x4 Q_DECL_VECTORCALL v_toInt(Float32x4 x) { return _mm_cvttps_epi32(x); }
- static inline Int32x4 v_greaterOrEqual(Float32x4 a, Float32x4 b) { return _mm_castps_si128(_mm_cmpgt_ps(a, b)); }
+ static inline Int32x4 Q_DECL_VECTORCALL v_greaterOrEqual(Float32x4 a, Float32x4 b) { return _mm_castps_si128(_mm_cmpgt_ps(a, b)); }
};
const uint * QT_FASTCALL qt_fetch_radial_gradient_sse2(uint *buffer, const Operator *op, const QSpanData *data,
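
Finally, the Q_DECL_VECTORCALL annotations in the last hunk: on Windows, neither __cdecl nor __fastcall passes __m128/__m128i arguments in registers, so each of these tiny wrappers would otherwise spill its operands to memory; __vectorcall passes vector arguments in XMM registers instead. My understanding (the real definition lives in Qt's headers and may differ) is that the macro amounts to:

#if defined(_MSC_VER)
#  define Q_DECL_VECTORCALL __vectorcall   // vector args travel in XMM/YMM registers
#else
#  define Q_DECL_VECTORCALL                // SysV x86-64 ABI already does this
#endif

On qt_memfillXX_aligned it pairs with Q_NEVER_INLINE: the helper stays out of line so both fill entry points share one body, yet its value128 argument still arrives in a register.
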