summary refs log tree commit diff stats
path: root/src/corelib
diff options
context:
space:
mode:
author Thiago Macieira <thiago.macieira@intel.com> 2018-09-13 14:24:24 -0700
committer Thiago Macieira <thiago.macieira@intel.com> 2018-12-18 20:33:26 +0000
commit 482da2e4d2376767172a9a014321822e90fa6096 (patch)
tree 86efe36d72ab4ac65f072243e5c6b7404db3001b /src/corelib
parent 1fbd8caca6423622047512fa881817ae7cf55522 (diff)
Add an AVX2 code path to qustrchr
The new loop does 32 bytes (16 code units) at a time.

Change-Id: I8f261579aad648fdb4f0fffd155412a4d77428e9
Reviewed-by: Allan Sandfeld Jensen <allan.jensen@qt.io>
Diffstat (limited to 'src/corelib')
-rw-r--r-- src/corelib/tools/qstring.cpp 23
1 files changed, 23 insertions, 0 deletions
diff --git a/src/corelib/tools/qstring.cpp b/src/corelib/tools/qstring.cpp
index 380ea408d3..d50a28abc5 100644
--- a/src/corelib/tools/qstring.cpp
+++ b/src/corelib/tools/qstring.cpp
@@ -265,9 +265,27 @@ const ushort *QtPrivate::qustrchr(QStringView str, ushort c) noexcept
const ushort *e = reinterpret_cast<const ushort *>(str.end());
#ifdef __SSE2__
+ bool loops = true;
// Using the PMOVMSKB instruction, we get two bits for each character
// we compare.
+# if defined(__AVX2__) && !defined(__OPTIMIZE_SIZE__)
+ // we're going to read n[0..15] (32 bytes)
+ __m256i mch256 = _mm256_set1_epi32(c | (c << 16));
+ for (const ushort *next = n + 16; next <= e; n = next, next += 16) {
+ __m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(n));
+ __m256i result = _mm256_cmpeq_epi16(data, mch256);
+ uint mask = uint(_mm256_movemask_epi8(result));
+ if (mask) {
+ uint idx = qCountTrailingZeroBits(mask);
+ return n + idx / 2;
+ }
+ }
+ loops = false;
+ __m128i mch = _mm256_castsi256_si128(mch256);
+# else
__m128i mch = _mm_set1_epi32(c | (c << 16));
+# endif
+
auto hasMatch = [mch, &n](__m128i data, ushort validityMask) {
__m128i result = _mm_cmpeq_epi16(data, mch);
uint mask = uint(_mm_movemask_epi8(result));
@@ -283,6 +301,11 @@ const ushort *QtPrivate::qustrchr(QStringView str, ushort c) noexcept
__m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i *>(n));
if (hasMatch(data, 0xffff))
return n;
+
+ if (!loops) {
+ n += 8;
+ break;
+ }
}
# if !defined(__OPTIMIZE_SIZE__)