summaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
author: Allan Sandfeld Jensen <allan.jensen@theqtcompany.com> 2016-10-07 17:03:48 +0200
committer: Allan Sandfeld Jensen <allan.jensen@qt.io> 2016-10-11 07:38:29 +0000
commit: a6dc28486910bde21d6854c1a64caadb0f663e1c (patch)
tree: df4e2b450a9280842c8c39103cef62d5d76b54f1 /src
parent: 17ac3b2c146d1f48b88dbdc09927ddc3dd3aef81 (diff)
Avoid auto-vectorization of epilogues of manual vectorization
Defines a structure that tells the compiler in no uncertain terms the maximum number of times a loop can be run. This reduces the size of qdrawhelper_avx2.o from 22 kbytes to 11 kbytes. Change-Id: Ie3d6281b04b4be3332497c15f3dfe9f185e20507 Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
Diffstat (limited to 'src')
-rw-r--r-- src/corelib/tools/qsimd_p.h 3
-rw-r--r-- src/gui/image/qimage_ssse3.cpp 22
-rw-r--r-- src/gui/painting/qdrawhelper.cpp 6
-rw-r--r-- src/gui/painting/qdrawhelper_avx2.cpp 8
-rw-r--r-- src/gui/painting/qdrawhelper_neon.cpp 4
-rw-r--r-- src/gui/painting/qdrawhelper_sse2.cpp 13
-rw-r--r-- src/gui/painting/qdrawhelper_ssse3.cpp 2
-rw-r--r-- src/gui/painting/qdrawingprimitive_sse2_p.h 4
8 files changed, 29 insertions, 33 deletions
diff --git a/src/corelib/tools/qsimd_p.h b/src/corelib/tools/qsimd_p.h
index 2fd4be00a5..3b6c85ca8a 100644
--- a/src/corelib/tools/qsimd_p.h
+++ b/src/corelib/tools/qsimd_p.h
@@ -471,6 +471,9 @@ static inline quint64 qCpuFeatures()
#define ALIGNMENT_PROLOGUE_32BYTES(ptr, i, length) \
for (; i < static_cast<int>(qMin(static_cast<quintptr>(length), ((8 - ((reinterpret_cast<quintptr>(ptr) >> 2) & 0x7)) & 0x7))); ++i)
+#define SIMD_EPILOGUE(i, length, max) \
+ for (int _i = 0; _i < max && i < length; ++i, ++_i)
+
QT_END_NAMESPACE
#endif // QSIMD_P_H
diff --git a/src/gui/image/qimage_ssse3.cpp b/src/gui/image/qimage_ssse3.cpp
index 0fa0eecd80..9cdfba20e3 100644
--- a/src/gui/image/qimage_ssse3.cpp
+++ b/src/gui/image/qimage_ssse3.cpp
@@ -50,15 +50,11 @@ QT_BEGIN_NAMESPACE
// dst must be at least len * 4 bytes
Q_GUI_EXPORT void QT_FASTCALL qt_convert_rgb888_to_rgb32_ssse3(quint32 *dst, const uchar *src, int len)
{
- quint32 *const end = dst + len;
+ int i = 0;
- // Prologue, align dst to 16 bytes. The alignment is done on dst because it has 4 store()
- // for each 3 load() of src.
- const int offsetToAlignOn16Bytes = (4 - ((reinterpret_cast<quintptr>(dst) >> 2) & 0x3)) & 0x3;
- const int prologLength = qMin(len, offsetToAlignOn16Bytes);
-
- for (int i = 0; i < prologLength; ++i) {
- *dst++ = qRgb(src[0], src[1], src[2]);
+ // Prologue, align dst to 16 bytes.
+ ALIGNMENT_PROLOGUE_16BYTES(dst, i, len) {
+ dst[i] = qRgb(src[0], src[1], src[2]);
src += 3;
}
@@ -72,10 +68,9 @@ Q_GUI_EXPORT void QT_FASTCALL qt_convert_rgb888_to_rgb32_ssse3(quint32 *dst, con
const __m128i alphaMask = _mm_set1_epi32(0xff000000);
const __m128i *inVectorPtr = (const __m128i *)src;
- __m128i *dstVectorPtr = (__m128i *)dst;
+ __m128i *dstVectorPtr = (__m128i *)(dst + i);
- const int simdRoundCount = (len - prologLength) / 16; // one iteration in the loop converts 16 pixels
- for (int i = 0; i < simdRoundCount; ++i) {
+ for (; i < (len - 15); i += 16) { // one iteration in the loop converts 16 pixels
/*
RGB888 has 5 pixels per vector, + 1 byte from the next pixel. The idea here is
to load vectors of RGB888 and use palignr to select a vector out of two vectors.
@@ -117,10 +112,9 @@ Q_GUI_EXPORT void QT_FASTCALL qt_convert_rgb888_to_rgb32_ssse3(quint32 *dst, con
++dstVectorPtr;
}
src = (const uchar *)inVectorPtr;
- dst = (quint32 *)dstVectorPtr;
- while (dst != end) {
- *dst++ = qRgb(src[0], src[1], src[2]);
+ SIMD_EPILOGUE(i, len, 15) {
+ dst[i] = qRgb(src[0], src[1], src[2]);
src += 3;
}
}
diff --git a/src/gui/painting/qdrawhelper.cpp b/src/gui/painting/qdrawhelper.cpp
index 3e01d34cb2..928a56fd2f 100644
--- a/src/gui/painting/qdrawhelper.cpp
+++ b/src/gui/painting/qdrawhelper.cpp
@@ -603,7 +603,7 @@ static inline void qConvertARGB32PMToARGB64PM_sse2(QRgba64 *buffer, const uint *
buffer += 2;
}
- for (; i < count; ++i) {
+ SIMD_EPILOGUE(i, count, 3) {
uint s = *src++;
if (RGBA)
s = RGBA2ARGB(s);
@@ -766,7 +766,7 @@ static inline void qConvertA2RGB30PMToARGB64PM_sse2(QRgba64 *buffer, const uint
buffer += 2;
}
- for (; i < count; ++i)
+ SIMD_EPILOGUE(i, count, 3)
*buffer++ = qConvertA2rgb30ToRgb64<PixelOrder>(*src++);
}
#endif
@@ -1397,7 +1397,7 @@ static inline void qConvertARGB64PMToA2RGB30PM_sse2(uint *dest, const QRgba64 *b
}
}
- for (; i < count; ++i)
+ SIMD_EPILOGUE(i, count, 15)
*dest++ = qConvertRgb64ToRgb30<PixelOrder>(*buffer++);
}
#endif
diff --git a/src/gui/painting/qdrawhelper_avx2.cpp b/src/gui/painting/qdrawhelper_avx2.cpp
index b3fa380dc0..acc9bc7ba1 100644
--- a/src/gui/painting/qdrawhelper_avx2.cpp
+++ b/src/gui/painting/qdrawhelper_avx2.cpp
@@ -201,7 +201,7 @@ inline static void BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_AVX2(quint32 *dst,
_mm256_store_si256((__m256i *)&dst[x], dstVector);
}
}
- for (; x < length; ++x)
+ SIMD_EPILOGUE(x, length, 7)
blend_pixel(dst[x], src[x], const_alpha);
}
@@ -275,7 +275,7 @@ void qt_blend_rgb32_on_rgb32_avx2(uchar *destPixels, int dbpl,
}
// 3) Epilogue
- for (; x < w; ++x)
+ SIMD_EPILOGUE(x, w, 7)
dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], one_minus_const_alpha);
srcPixels += sbpl;
@@ -322,7 +322,7 @@ void QT_FASTCALL comp_func_Source_avx2(uint *dst, const uint *src, int length, u
}
// 3) Epilogue
- for (; x < length; ++x)
+ SIMD_EPILOGUE(x, length, 7)
dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], ialpha);
}
}
@@ -353,7 +353,7 @@ void QT_FASTCALL comp_func_solid_SourceOver_avx2(uint *destPixels, int length, u
dstVector = _mm256_add_epi8(colorVector, dstVector);
_mm256_store_si256((__m256i *)&dst[x], dstVector);
}
- for (; x < length; ++x)
+ SIMD_EPILOGUE(x, length, 7)
destPixels[x] = color + BYTE_MUL(destPixels[x], minusAlphaOfColor);
}
}
diff --git a/src/gui/painting/qdrawhelper_neon.cpp b/src/gui/painting/qdrawhelper_neon.cpp
index f5b794ace6..d51b43961c 100644
--- a/src/gui/painting/qdrawhelper_neon.cpp
+++ b/src/gui/painting/qdrawhelper_neon.cpp
@@ -817,7 +817,7 @@ void QT_FASTCALL comp_func_solid_SourceOver_neon(uint *destPixels, int length, u
vst1q_u32(&dst[x], colorPlusBlendedPixels);
}
- for (;x < length; ++x)
+ SIMD_EPILOGUE(x, length, 3)
destPixels[x] = color + BYTE_MUL(destPixels[x], minusAlphaOfColor);
}
}
@@ -869,7 +869,7 @@ void QT_FASTCALL comp_func_Plus_neon(uint *dst, const uint *src, int length, uin
vst1q_u32((uint32_t *)&dst[x], vcombine_u32(result32_low, result32_high));
}
- for (; x < length; ++x)
+ SIMD_EPILOGUE(x, length, 3)
dst[x] = comp_func_Plus_one_pixel_const_alpha(dst[x], src[x], const_alpha, one_minus_const_alpha);
}
}
diff --git a/src/gui/painting/qdrawhelper_sse2.cpp b/src/gui/painting/qdrawhelper_sse2.cpp
index 03abeed440..5ff08e8153 100644
--- a/src/gui/painting/qdrawhelper_sse2.cpp
+++ b/src/gui/painting/qdrawhelper_sse2.cpp
@@ -126,9 +126,8 @@ void qt_blend_rgb32_on_rgb32_sse2(uchar *destPixels, int dbpl,
_mm_store_si128((__m128i *)&dst[x], result);
}
}
- for (; x<w; ++x) {
+ SIMD_EPILOGUE(x, w, 3)
dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], one_minus_const_alpha);
- }
dst = (quint32 *)(((uchar *) dst) + dbpl);
src = (const quint32 *)(((const uchar *) src) + sbpl);
}
@@ -177,7 +176,7 @@ void QT_FASTCALL comp_func_Plus_sse2(uint *dst, const uint *src, int length, uin
}
// 3) Epilogue:
- for (; x < length; ++x)
+ SIMD_EPILOGUE(x, length, 3)
dst[x] = comp_func_Plus_one_pixel(dst[x], src[x]);
} else {
const int one_minus_const_alpha = 255 - const_alpha;
@@ -201,7 +200,7 @@ void QT_FASTCALL comp_func_Plus_sse2(uint *dst, const uint *src, int length, uin
}
// 3) Epilogue:
- for (; x < length; ++x)
+ SIMD_EPILOGUE(x, length, 3)
dst[x] = comp_func_Plus_one_pixel_const_alpha(dst[x], src[x], const_alpha, one_minus_const_alpha);
}
}
@@ -232,7 +231,7 @@ void QT_FASTCALL comp_func_Source_sse2(uint *dst, const uint *src, int length, u
}
// 3) Epilogue
- for (; x < length; ++x)
+ SIMD_EPILOGUE(x, length, 3)
dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], ialpha);
}
}
@@ -313,7 +312,7 @@ void QT_FASTCALL comp_func_solid_SourceOver_sse2(uint *destPixels, int length, u
dstVector = _mm_add_epi8(colorVector, dstVector);
_mm_store_si128((__m128i *)&dst[x], dstVector);
}
- for (;x < length; ++x)
+ SIMD_EPILOGUE(x, length, 3)
destPixels[x] = color + BYTE_MUL(destPixels[x], minusAlphaOfColor);
}
}
@@ -592,7 +591,7 @@ void qt_scale_image_argb32_on_argb32_sse2(uchar *destPixels, int dbpl,
BLEND_SOURCE_OVER_ARGB32_SSE2_helper(dst, srcVector, nullVector, half, one, colorMask, alphaMask);
}
- for (; x<w; x++) {
+ SIMD_EPILOGUE(x, w, 3) {
uint s = src[(basex + x*ix) >> 16];
dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s));
}
diff --git a/src/gui/painting/qdrawhelper_ssse3.cpp b/src/gui/painting/qdrawhelper_ssse3.cpp
index 2026a4e656..45ecc8b422 100644
--- a/src/gui/painting/qdrawhelper_ssse3.cpp
+++ b/src/gui/painting/qdrawhelper_ssse3.cpp
@@ -215,7 +215,7 @@ static inline void store_uint24_ssse3(uchar *dst, const uint *src, int len)
dst24 = reinterpret_cast<quint24*>(dstVectorPtr);
src = reinterpret_cast<const uint*>(inVectorPtr);
- for (; i < len; ++i)
+ SIMD_EPILOGUE(i, len, 15)
*dst24++ = quint24(*src++);
}
diff --git a/src/gui/painting/qdrawingprimitive_sse2_p.h b/src/gui/painting/qdrawingprimitive_sse2_p.h
index 7affc63b32..93e4b9f572 100644
--- a/src/gui/painting/qdrawingprimitive_sse2_p.h
+++ b/src/gui/painting/qdrawingprimitive_sse2_p.h
@@ -178,7 +178,7 @@ QT_BEGIN_NAMESPACE
const __m128i srcVector = _mm_loadu_si128((const __m128i *)&src[x]); \
BLEND_SOURCE_OVER_ARGB32_SSE2_helper(dst, srcVector, nullVector, half, one, colorMask, alphaMask) \
} \
- for (; x < length; ++x) { \
+ SIMD_EPILOGUE(x, length, 3) { \
blend_pixel(dst[x], src[x]); \
} \
}
@@ -219,7 +219,7 @@ QT_BEGIN_NAMESPACE
_mm_store_si128((__m128i *)&dst[x], result); \
} \
} \
- for (; x < length; ++x) { \
+ SIMD_EPILOGUE(x, length, 3) { \
blend_pixel(dst[x], src[x], const_alpha); \
} \
}