From 1e95a07a5ced774b20adb66b34c31bdfaf566bdc Mon Sep 17 00:00:00 2001 From: Thiago Macieira Date: Sun, 13 May 2018 09:14:09 -0700 Subject: QString: insert a number of 8-character SIMD loops We don't have _mm_cvtsi64_si128() (the REX.W expansion of MOVD [0F 6E]), but we do have _mm_loadl_epi64(), the SSE2 expansion of the MMX MOVQ at opcode 0F 7E. Ditto for _mm_cvtsi128_si64() and _mm_storel_epi64(). And those work even in 32-bit mode. By doing this, we can reduce the tail unrolled loops by half, reducing code size. I'm not adding these new SIMD sections to -Os builds. Change-Id: Ib48364abee9f464c96c6fffd152e405310ef67be Reviewed-by: Allan Sandfeld Jensen --- src/corelib/codecs/qutfcodec.cpp | 39 +++++++++++++ src/corelib/tools/qstring.cpp | 123 +++++++++++++++++++++++++++++++++------ 2 files changed, 144 insertions(+), 18 deletions(-) diff --git a/src/corelib/codecs/qutfcodec.cpp b/src/corelib/codecs/qutfcodec.cpp index 08cc99f4dc..96be45ff4e 100644 --- a/src/corelib/codecs/qutfcodec.cpp +++ b/src/corelib/codecs/qutfcodec.cpp @@ -102,6 +102,26 @@ static inline bool simdEncodeAscii(uchar *&dst, const ushort *&nextAscii, const return false; } } + + if (end - src >= 8) { + // do eight characters at a time + __m128i data = _mm_loadu_si128(reinterpret_cast(src)); + __m128i packed = _mm_packus_epi16(data, data); + __m128i nonAscii = _mm_cmpgt_epi8(packed, _mm_setzero_si128()); + + // store even non-ASCII + _mm_storel_epi64(reinterpret_cast<__m128i *>(dst), packed); + + uchar n = ~_mm_movemask_epi8(nonAscii); + if (n) { + nextAscii = src + qBitScanReverse(n) + 1; + n = qCountTrailingZeroBits(n); + dst += n; + src += n; + return false; + } + } + return src == end; } @@ -150,6 +170,25 @@ static inline bool simdDecodeAscii(ushort *&dst, const uchar *&nextAscii, const return false; } + + if (end - src >= 8) { + __m128i data = _mm_loadl_epi64(reinterpret_cast(src)); + uint n = _mm_movemask_epi8(data) & 0xff; + if (!n) { + // unpack and store + _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), _mm_unpacklo_epi8(data, _mm_setzero_si128())); + } else { + while (!(n & 1)) { + *dst++ = *src++; + n >>= 1; + } + + n = qBitScanReverse(n); + nextAscii = src + n + 1; + return false; + } + } + return src == end; } diff --git a/src/corelib/tools/qstring.cpp b/src/corelib/tools/qstring.cpp index 719c685f69..81e5e1e884 100644 --- a/src/corelib/tools/qstring.cpp +++ b/src/corelib/tools/qstring.cpp @@ -315,10 +315,19 @@ static bool simdTestMask(const char *&ptr, const char *end, quint32 maskval) return updatePtrSimd(data1); ptr += 16; } + + // and final 8-byte comparison + if (ptr + 8 <= end) { + __m128i data1 = _mm_loadl_epi64(reinterpret_cast(ptr)); + if (!_mm_testz_si128(mask, data1)) + return updatePtrSimd(data1); + ptr += 8; + } + # else // SSE2 implementation: test 16 bytes at a time. const __m128i mask = _mm_set1_epi32(maskval); - while (ptr + 16 < end) { + while (ptr + 16 <= end) { __m128i data = _mm_loadu_si128(reinterpret_cast(ptr)); __m128i masked = _mm_and_si128(mask, data); __m128i comparison = _mm_cmpeq_epi16(masked, _mm_setzero_si128()); @@ -327,6 +336,17 @@ static bool simdTestMask(const char *&ptr, const char *end, quint32 maskval) return updatePtr(result); ptr += 16; } + + // and one 8-byte comparison + if (ptr + 8 <= end) { + __m128i data = _mm_loadl_epi64(reinterpret_cast(ptr)); + __m128i masked = _mm_and_si128(mask, data); + __m128i comparison = _mm_cmpeq_epi16(masked, _mm_setzero_si128()); + quint8 result = _mm_movemask_epi8(comparison); + if (result != 0xff) + return updatePtr(result); + ptr += 8; + } # endif return true; @@ -342,7 +362,7 @@ bool qt_is_ascii(const char *&ptr, const char *end) Q_DECL_NOTHROW return false; #elif defined(__SSE2__) // Testing for the high bit can be done efficiently with just PMOVMSKB - while (ptr + 16 < end) { + while (ptr + 16 <= end) { __m128i data = _mm_loadu_si128(reinterpret_cast(ptr)); quint32 mask = _mm_movemask_epi8(data); if (mask) { @@ -352,6 +372,16 @@ bool qt_is_ascii(const char *&ptr, const char *end) Q_DECL_NOTHROW } ptr += 16; } + if (ptr + 8 <= end) { + __m128i data = _mm_loadl_epi64(reinterpret_cast(ptr)); + quint8 mask = _mm_movemask_epi8(data); + if (mask) { + uint idx = qCountTrailingZeroBits(mask); + ptr += idx; + return false; + } + ptr += 8; + } #endif while (ptr + 4 <= end) { @@ -480,11 +510,19 @@ void qt_from_latin1(ushort *dst, const char *str, size_t size) Q_DECL_NOTHROW #endif } - size = size % 16; + // we're going to read str[offset..offset+7] (8 bytes) + if (str + offset + 7 < e) { + const __m128i chunk = _mm_loadl_epi64(reinterpret_cast(str + offset)); + const __m128i unpacked = _mm_unpacklo_epi8(chunk, _mm_setzero_si128()); + _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + offset), unpacked); + offset += 8; + } + + size = size % 8; dst += offset; str += offset; # if defined(Q_COMPILER_LAMBDA) && !defined(__OPTIMIZE_SIZE__) - return UnrollTailLoop<15>::exec(int(size), [=](int i) { dst[i] = (uchar)str[i]; }); + return UnrollTailLoop<7>::exec(int(size), [=](int i) { dst[i] = (uchar)str[i]; }); # endif #endif #if defined(__mips_dsp) @@ -572,12 +610,34 @@ static void qt_to_latin1(uchar *dst, const ushort *src, int length) _mm_storeu_si128((__m128i*)(dst + offset), result); // store } - length = length % 16; +# if !defined(__OPTIMIZE_SIZE__) + // we're going to write to dst[offset..offset+7] (8 bytes) + if (dst + offset + 7 < e) { + __m128i chunk = _mm_loadu_si128(reinterpret_cast(src + offset)); + chunk = mergeQuestionMarks(chunk); + + // pack, where the upper half is ignored + const __m128i result = _mm_packus_epi16(chunk, chunk); + _mm_storel_epi64(reinterpret_cast<__m128i *>(dst + offset), result); + offset += 8; + } + + // we're going to write to dst[offset..offset+3] (4 bytes) + if (dst + offset + 3 < e) { + __m128i chunk = _mm_loadl_epi64(reinterpret_cast(src + offset)); + chunk = mergeQuestionMarks(chunk); + + // pack, we'll the upper three quarters + const __m128i result = _mm_packus_epi16(chunk, chunk); + qToUnaligned(_mm_cvtsi128_si32(result), dst + offset); + offset += 4; + } + + length = length % 4; dst += offset; src += offset; -# if defined(Q_COMPILER_LAMBDA) && !defined(__OPTIMIZE_SIZE__) - return UnrollTailLoop<15>::exec(length, [=](int i) { dst[i] = (src[i]>0xff) ? '?' : (uchar) src[i]; }); + return UnrollTailLoop<3>::exec(length, [=](int i) { dst[i] = (src[i]>0xff) ? '?' : (uchar) src[i]; }); # endif #elif defined(__ARM_NEON__) // Refer to the documentation of the SSE2 implementation @@ -837,12 +897,11 @@ static int ucstrncmp(const QChar *a, const uchar *c, size_t l) } } -# ifdef Q_PROCESSOR_X86_64 - enum { MaxTailLength = 7 }; +# if !defined(__OPTIMIZE_SIZE__) // we'll read uc[offset..offset+7] (16 bytes) and c[offset..offset+7] (8 bytes) if (uc + offset + 7 < e) { // same, but we're using an 8-byte load - __m128i chunk = _mm_cvtsi64_si128(qFromUnaligned(c + offset)); + __m128i chunk = _mm_loadl_epi64((const __m128i*)(c + offset)); __m128i secondHalf = _mm_unpacklo_epi8(chunk, nullmask); __m128i ucdata = _mm_loadu_si128((const __m128i*)(uc + offset)); @@ -857,17 +916,30 @@ static int ucstrncmp(const QChar *a, const uchar *c, size_t l) // still matched offset += 8; } -# else - // 32-bit, we can't do MOVQ to load 8 bytes - Q_UNUSED(nullmask); - enum { MaxTailLength = 15 }; -# endif + + enum { MaxTailLength = 3 }; + // we'll read uc[offset..offset+3] (8 bytes) and c[offset..offset+3] (4 bytes) + if (uc + offset + 3 < e) { + __m128i chunk = _mm_cvtsi32_si128(qFromUnaligned(c + offset)); + __m128i secondHalf = _mm_unpacklo_epi8(chunk, nullmask); + + __m128i ucdata = _mm_loadl_epi64(reinterpret_cast(uc + offset)); + __m128i result = _mm_cmpeq_epi8(secondHalf, ucdata); + uint mask = ~_mm_movemask_epi8(result); + if (uchar(mask)) { + // found a different character + uint idx = qCountTrailingZeroBits(mask); + return uc[offset + idx / 2] - c[offset + idx / 2]; + } + + // still matched + offset += 4; + } // reset uc and c uc += offset; c += offset; -# if !defined(__OPTIMIZE_SIZE__) const auto lambda = [=](size_t i) { return uc[i] - ushort(c[i]); }; return UnrollTailLoop::exec(e - uc, 0, lambda, lambda); # endif @@ -1056,8 +1128,23 @@ static int findChar(const QChar *str, int len, QChar ch, int from, } } -# if defined(Q_COMPILER_LAMBDA) && !defined(__OPTIMIZE_SIZE__) - return UnrollTailLoop<7>::exec(e - n, -1, +# if !defined(__OPTIMIZE_SIZE__) + // we're going to read n[0..3] (8 bytes) + if (e - n > 3) { + __m128i data = _mm_loadl_epi64(reinterpret_cast(n)); + __m128i result = _mm_cmpeq_epi16(data, mch); + uint mask = _mm_movemask_epi8(result); + if (uchar(mask)) { + // found a match + // same as: return n - s + _bit_scan_forward(mask) / 2 + return (reinterpret_cast(n) - reinterpret_cast(s) + + qCountTrailingZeroBits(mask)) >> 1; + } + + n += 4; + } + + return UnrollTailLoop<3>::exec(e - n, -1, [=](int i) { return n[i] == c; }, [=](int i) { return n - s + i; }); # endif -- cgit v1.2.3