From 482da2e4d2376767172a9a014321822e90fa6096 Mon Sep 17 00:00:00 2001 From: Thiago Macieira Date: Thu, 13 Sep 2018 14:24:24 -0700 Subject: Add an AVX2 code path to qustrchr The new loop does 32 bytes (16 code units) at a time. Change-Id: I8f261579aad648fdb4f0fffd155412a4d77428e9 Reviewed-by: Allan Sandfeld Jensen --- src/corelib/tools/qstring.cpp | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'src') diff --git a/src/corelib/tools/qstring.cpp b/src/corelib/tools/qstring.cpp index 380ea408d3..d50a28abc5 100644 --- a/src/corelib/tools/qstring.cpp +++ b/src/corelib/tools/qstring.cpp @@ -265,9 +265,27 @@ const ushort *QtPrivate::qustrchr(QStringView str, ushort c) noexcept const ushort *e = reinterpret_cast<const ushort *>(str.end()); #ifdef __SSE2__ + bool loops = true; // Using the PMOVMSKB instruction, we get two bits for each character // we compare. +# if defined(__AVX2__) && !defined(__OPTIMIZE_SIZE__) + // we're going to read n[0..15] (32 bytes) + __m256i mch256 = _mm256_set1_epi32(c | (c << 16)); + for (const ushort *next = n + 16; next <= e; n = next, next += 16) { + __m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(n)); + __m256i result = _mm256_cmpeq_epi16(data, mch256); + uint mask = uint(_mm256_movemask_epi8(result)); + if (mask) { + uint idx = qCountTrailingZeroBits(mask); + return n + idx / 2; + } + } + loops = false; + __m128i mch = _mm256_castsi256_si128(mch256); +# else __m128i mch = _mm_set1_epi32(c | (c << 16)); +# endif + auto hasMatch = [mch, &n](__m128i data, ushort validityMask) { __m128i result = _mm_cmpeq_epi16(data, mch); uint mask = uint(_mm_movemask_epi8(result)); @@ -283,6 +301,11 @@ const ushort *QtPrivate::qustrchr(QStringView str, ushort c) noexcept __m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i *>(n)); if (hasMatch(data, 0xffff)) return n; + + if (!loops) { + n += 8; + break; + } } # if !defined(__OPTIMIZE_SIZE__) -- cgit v1.2.3