diff options
author | Thiago Macieira <thiago.macieira@intel.com> | 2018-09-13 14:24:24 -0700 |
---|---|---|
committer | Thiago Macieira <thiago.macieira@intel.com> | 2018-12-18 20:33:26 +0000 |
commit | 482da2e4d2376767172a9a014321822e90fa6096 (patch) | |
tree | 86efe36d72ab4ac65f072243e5c6b7404db3001b /src/corelib | |
parent | 1fbd8caca6423622047512fa881817ae7cf55522 (diff) |
Add an AVX2 code path to qustrchr
The new loop processes 32 bytes (16 code units) at a time.
Change-Id: I8f261579aad648fdb4f0fffd155412a4d77428e9
Reviewed-by: Allan Sandfeld Jensen <allan.jensen@qt.io>
Diffstat (limited to 'src/corelib')
-rw-r--r-- | src/corelib/tools/qstring.cpp | 23 |
1 file changed, 23 insertions, 0 deletions
diff --git a/src/corelib/tools/qstring.cpp b/src/corelib/tools/qstring.cpp index 380ea408d3..d50a28abc5 100644 --- a/src/corelib/tools/qstring.cpp +++ b/src/corelib/tools/qstring.cpp @@ -265,9 +265,27 @@ const ushort *QtPrivate::qustrchr(QStringView str, ushort c) noexcept const ushort *e = reinterpret_cast<const ushort *>(str.end()); #ifdef __SSE2__ + bool loops = true; // Using the PMOVMSKB instruction, we get two bits for each character // we compare. +# if defined(__AVX2__) && !defined(__OPTIMIZE_SIZE__) + // we're going to read n[0..15] (32 bytes) + __m256i mch256 = _mm256_set1_epi32(c | (c << 16)); + for (const ushort *next = n + 16; next <= e; n = next, next += 16) { + __m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(n)); + __m256i result = _mm256_cmpeq_epi16(data, mch256); + uint mask = uint(_mm256_movemask_epi8(result)); + if (mask) { + uint idx = qCountTrailingZeroBits(mask); + return n + idx / 2; + } + } + loops = false; + __m128i mch = _mm256_castsi256_si128(mch256); +# else __m128i mch = _mm_set1_epi32(c | (c << 16)); +# endif + auto hasMatch = [mch, &n](__m128i data, ushort validityMask) { __m128i result = _mm_cmpeq_epi16(data, mch); uint mask = uint(_mm_movemask_epi8(result)); @@ -283,6 +301,11 @@ const ushort *QtPrivate::qustrchr(QStringView str, ushort c) noexcept __m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i *>(n)); if (hasMatch(data, 0xffff)) return n; + + if (!loops) { + n += 8; + break; + } } # if !defined(__OPTIMIZE_SIZE__) |