diff options
Diffstat (limited to 'src/corelib/tools/qstring.cpp')
-rw-r--r-- | src/corelib/tools/qstring.cpp | 89 |
1 files changed, 69 insertions, 20 deletions
diff --git a/src/corelib/tools/qstring.cpp b/src/corelib/tools/qstring.cpp index f5accd0fec..a04b2f0d02 100644 --- a/src/corelib/tools/qstring.cpp +++ b/src/corelib/tools/qstring.cpp @@ -357,11 +357,20 @@ static bool simdTestMask(const char *&ptr, const char *end, quint32 maskval) // character. Usually harmless. bool qt_is_ascii(const char *&ptr, const char *end) Q_DECL_NOTHROW { -#if defined(__AVX2__) - if (!simdTestMask(ptr, end, 0x80808080)) - return false; -#elif defined(__SSE2__) +#if defined(__SSE2__) // Testing for the high bit can be done efficiently with just PMOVMSKB +# if defined(__AVX2__) + while (ptr + 32 <= end) { + __m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr)); + quint32 mask = _mm256_movemask_epi8(data); + if (mask) { + uint idx = qCountTrailingZeroBits(mask); + ptr += idx; + return false; + } + ptr += 32; + } +# endif while (ptr + 16 <= end) { __m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr)); quint32 mask = _mm_movemask_epi8(data); @@ -778,30 +787,70 @@ static int ucstrncmp(const QChar *a, const QChar *b, size_t l) } #endif // __mips_dsp #ifdef __SSE2__ - const char *ptr = reinterpret_cast<const char*>(a); - qptrdiff distance = reinterpret_cast<const char*>(b) - ptr; - a += l & ~7; - b += l & ~7; - l &= 7; - - // we're going to read ptr[0..15] (16 bytes) - for ( ; ptr + 15 < reinterpret_cast<const char *>(a); ptr += 16) { - __m128i a_data = _mm_loadu_si128((const __m128i*)ptr); - __m128i b_data = _mm_loadu_si128((const __m128i*)(ptr + distance)); + const QChar *end = a + l; + qptrdiff offset = 0; + + // we're going to read a[0..15] and b[0..15] (32 bytes) + for ( ; a + offset + 16 <= end; offset += 16) { +#ifdef __AVX2__ + __m256i a_data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(a + offset)); + __m256i b_data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(b + offset)); + __m256i result = _mm256_cmpeq_epi16(a_data, b_data); + uint mask = _mm256_movemask_epi8(result); +#else + __m128i a_data1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(a + offset)); + __m128i a_data2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(a + offset + 8)); + __m128i b_data1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(b + offset)); + __m128i b_data2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(b + offset + 8)); + __m128i result1 = _mm_cmpeq_epi16(a_data1, b_data1); + __m128i result2 = _mm_cmpeq_epi16(a_data2, b_data2); + uint mask = _mm_movemask_epi8(result1) | (_mm_movemask_epi8(result2) << 16); +#endif + mask = ~mask; + if (mask) { + // found a different character + uint idx = qCountTrailingZeroBits(mask); + return a[offset + idx / 2].unicode() - b[offset + idx / 2].unicode(); + } + } + + // we're going to read a[0..7] and b[0..7] (16 bytes) + if (a + offset + 8 <= end) { + __m128i a_data = _mm_loadu_si128(reinterpret_cast<const __m128i *>(a + offset)); + __m128i b_data = _mm_loadu_si128(reinterpret_cast<const __m128i *>(b + offset)); __m128i result = _mm_cmpeq_epi16(a_data, b_data); uint mask = ~_mm_movemask_epi8(result); if (ushort(mask)) { - // found a different byte + // found a different character uint idx = qCountTrailingZeroBits(mask); - return reinterpret_cast<const QChar *>(ptr + idx)->unicode() - - reinterpret_cast<const QChar *>(ptr + distance + idx)->unicode(); + return a[offset + idx / 2].unicode() - b[offset + idx / 2].unicode(); } + + offset += 8; } + + // we're going to read a[0..3] and b[0..3] (8 bytes) + if (a + offset + 4 <= end) { + __m128i a_data = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(a + offset)); + __m128i b_data = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(b + offset)); + __m128i result = _mm_cmpeq_epi16(a_data, b_data); + uint mask = ~_mm_movemask_epi8(result); + if (uchar(mask)) { + // found a different character + uint idx = qCountTrailingZeroBits(mask); + return a[offset + idx / 2].unicode() - b[offset + idx / 2].unicode(); + } + + offset += 4; + } + + // reset l + l &= 3; + const auto lambda = [=](size_t i) -> int { - return reinterpret_cast<const QChar *>(ptr)[i].unicode() - - reinterpret_cast<const QChar *>(ptr + distance)[i].unicode(); + return a[offset + i].unicode() - b[offset + i].unicode(); }; - return UnrollTailLoop<7>::exec(l, 0, lambda, lambda); + return UnrollTailLoop<3>::exec(l, 0, lambda, lambda); #endif #if defined(__ARM_NEON__) && defined(Q_PROCESSOR_ARM_64) // vaddv is only available on Aarch64 if (l >= 8) { |