From a6dc28486910bde21d6854c1a64caadb0f663e1c Mon Sep 17 00:00:00 2001 From: Allan Sandfeld Jensen Date: Fri, 7 Oct 2016 17:03:48 +0200 Subject: Avoid auto-vectorization of epilogues of manual vectorization Defines a structure that tells the compiler in no uncertain terms the maximum number of times a loop can be run. This reduces the size of qdrawhelper_avx2.o from 22kbytes to 11kbytes. Change-Id: Ie3d6281b04b4be3332497c15f3dfe9f185e20507 Reviewed-by: Thiago Macieira --- src/gui/image/qimage_ssse3.cpp | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) (limited to 'src/gui/image/qimage_ssse3.cpp') diff --git a/src/gui/image/qimage_ssse3.cpp b/src/gui/image/qimage_ssse3.cpp index 0fa0eecd80..9cdfba20e3 100644 --- a/src/gui/image/qimage_ssse3.cpp +++ b/src/gui/image/qimage_ssse3.cpp @@ -50,15 +50,11 @@ QT_BEGIN_NAMESPACE // dst must be at least len * 4 bytes Q_GUI_EXPORT void QT_FASTCALL qt_convert_rgb888_to_rgb32_ssse3(quint32 *dst, const uchar *src, int len) { - quint32 *const end = dst + len; + int i = 0; - // Prologue, align dst to 16 bytes. The alignment is done on dst because it has 4 store() - // for each 3 load() of src. - const int offsetToAlignOn16Bytes = (4 - ((reinterpret_cast<quintptr>(dst) >> 2) & 0x3)) & 0x3; - const int prologLength = qMin(len, offsetToAlignOn16Bytes); - - for (int i = 0; i < prologLength; ++i) { - *dst++ = qRgb(src[0], src[1], src[2]); + // Prologue, align dst to 16 bytes. 
+ ALIGNMENT_PROLOGUE_16BYTES(dst, i, len) { + dst[i] = qRgb(src[0], src[1], src[2]); src += 3; } @@ -72,10 +68,9 @@ Q_GUI_EXPORT void QT_FASTCALL qt_convert_rgb888_to_rgb32_ssse3(quint32 *dst, con const __m128i alphaMask = _mm_set1_epi32(0xff000000); const __m128i *inVectorPtr = (const __m128i *)src; - __m128i *dstVectorPtr = (__m128i *)dst; + __m128i *dstVectorPtr = (__m128i *)(dst + i); - const int simdRoundCount = (len - prologLength) / 16; // one iteration in the loop converts 16 pixels - for (int i = 0; i < simdRoundCount; ++i) { + for (; i < (len - 15); i += 16) { // one iteration in the loop converts 16 pixels /* RGB888 has 5 pixels per vector, + 1 byte from the next pixel. The idea here is to load vectors of RGB888 and use palignr to select a vector out of two vectors. @@ -117,10 +112,9 @@ Q_GUI_EXPORT void QT_FASTCALL qt_convert_rgb888_to_rgb32_ssse3(quint32 *dst, con ++dstVectorPtr; } src = (const uchar *)inVectorPtr; - dst = (quint32 *)dstVectorPtr; - while (dst != end) { - *dst++ = qRgb(src[0], src[1], src[2]); + SIMD_EPILOGUE(i, len, 15) { + dst[i] = qRgb(src[0], src[1], src[2]); src += 3; } } -- cgit v1.2.3