diff options
Diffstat (limited to 'src/corelib/text/qstring.cpp')
-rw-r--r-- | src/corelib/text/qstring.cpp | 353 |
1 files changed, 187 insertions, 166 deletions
diff --git a/src/corelib/text/qstring.cpp b/src/corelib/text/qstring.cpp index 819086b014..6525e16f2a 100644 --- a/src/corelib/text/qstring.cpp +++ b/src/corelib/text/qstring.cpp @@ -349,7 +349,12 @@ extern "C" void qt_fromlatin1_mips_asm_unroll8 (char16_t*, const char*, uint); extern "C" void qt_toLatin1_mips_dsp_asm(uchar *dst, const char16_t *src, int length); #endif -#if defined(__SSE2__) && defined(Q_CC_GNU) +#ifdef __SSE2__ +static constexpr bool UseAvx2 = + (qCompilerCpuFeatures & CpuFeatureArchHaswell) == CpuFeatureArchHaswell; +#endif + +#ifdef Q_CC_GNU # if defined(__SANITIZE_ADDRESS__) && Q_CC_GNU < 800 && !defined(Q_CC_CLANG) # warning "The __attribute__ on below will likely cause a build failure with your GCC version. Your choices are:" # warning "1) disable ASan;" @@ -467,23 +472,24 @@ const char16_t *QtPrivate::qustrchr(QStringView str, char16_t c) noexcept bool loops = true; // Using the PMOVMSKB instruction, we get two bits for each character // we compare. -# if defined(__AVX2__) && !defined(__OPTIMIZE_SIZE__) - // we're going to read n[0..15] (32 bytes) - __m256i mch256 = _mm256_set1_epi32(c | (c << 16)); - for (const char16_t *next = n + 16; next <= e; n = next, next += 16) { - __m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(n)); - __m256i result = _mm256_cmpeq_epi16(data, mch256); - uint mask = uint(_mm256_movemask_epi8(result)); - if (mask) { - uint idx = qCountTrailingZeroBits(mask); - return n + idx / 2; + __m128i mch; + if constexpr (UseAvx2) { + // we're going to read n[0..15] (32 bytes) + __m256i mch256 = _mm256_set1_epi32(c | (c << 16)); + for (const char16_t *next = n + 16; next <= e; n = next, next += 16) { + __m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(n)); + __m256i result = _mm256_cmpeq_epi16(data, mch256); + uint mask = uint(_mm256_movemask_epi8(result)); + if (mask) { + uint idx = qCountTrailingZeroBits(mask); + return n + idx / 2; + } } + loops = false; + mch = _mm256_castsi256_si128(mch256); + } else { + mch = _mm_set1_epi32(c | (c << 16)); } - loops = false; - __m128i mch = _mm256_castsi256_si128(mch256); -# else - __m128i mch = _mm_set1_epi32(c | (c << 16)); -# endif auto hasMatch = [mch, &n](__m128i data, ushort validityMask) { __m128i result = _mm_cmpeq_epi16(data, mch); @@ -551,64 +557,66 @@ static bool simdTestMask(const char *&ptr, const char *end, quint32 maskval) return false; }; -# if defined(__SSE4_1__) - __m128i mask; - auto updatePtrSimd = [&](__m128i data) { - __m128i masked = _mm_and_si128(mask, data); - __m128i comparison = _mm_cmpeq_epi16(masked, _mm_setzero_si128()); - uint result = _mm_movemask_epi8(comparison); - return updatePtr(result); - }; + if constexpr (qCompilerCpuFeatures & CpuFeatureSSE4_1) { + __m128i mask; + auto updatePtrSimd = [&](__m128i data) { + __m128i masked = _mm_and_si128(mask, data); + __m128i comparison = _mm_cmpeq_epi16(masked, _mm_setzero_si128()); + uint result = _mm_movemask_epi8(comparison); + return updatePtr(result); + }; -# if defined(__AVX2__) - // AVX2 implementation: test 32 bytes at a time - const __m256i mask256 = _mm256_broadcastd_epi32(_mm_cvtsi32_si128(maskval)); - while (ptr + 32 <= end) { - __m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr)); - if (!_mm256_testz_si256(mask256, data)) { - // found a character matching the mask - __m256i masked256 = _mm256_and_si256(mask256, data); - __m256i comparison256 = _mm256_cmpeq_epi16(masked256, _mm256_setzero_si256()); - return updatePtr(_mm256_movemask_epi8(comparison256)); - } - ptr += 32; - } + if constexpr (UseAvx2) { + // AVX2 implementation: test 32 bytes at a time + const __m256i mask256 = _mm256_broadcastd_epi32(_mm_cvtsi32_si128(maskval)); + while (ptr + 32 <= end) { + __m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr)); + if (!_mm256_testz_si256(mask256, data)) { + // found a character matching the mask + __m256i masked256 = _mm256_and_si256(mask256, data); + __m256i comparison256 = _mm256_cmpeq_epi16(masked256, _mm256_setzero_si256()); + return updatePtr(_mm256_movemask_epi8(comparison256)); + } + ptr += 32; + } - mask = _mm256_castsi256_si128(mask256); -# else - // SSE 4.1 implementation: test 32 bytes at a time (two 16-byte - // comparisons, unrolled) - mask = _mm_set1_epi32(maskval); - while (ptr + 32 <= end) { - __m128i data1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr)); - __m128i data2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 16)); - if (!_mm_testz_si128(mask, data1)) - return updatePtrSimd(data1); + mask = _mm256_castsi256_si128(mask256); + } else { + // SSE 4.1 implementation: test 32 bytes at a time (two 16-byte + // comparisons, unrolled) + mask = _mm_set1_epi32(maskval); + while (ptr + 32 <= end) { + __m128i data1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr)); + __m128i data2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 16)); + if (!_mm_testz_si128(mask, data1)) + return updatePtrSimd(data1); + + ptr += 16; + if (!_mm_testz_si128(mask, data2)) + return updatePtrSimd(data2); + ptr += 16; + } + } - ptr += 16; - if (!_mm_testz_si128(mask, data2)) - return updatePtrSimd(data2); - ptr += 16; - } -# endif + // AVX2 and SSE4.1: final 16-byte comparison + if (ptr + 16 <= end) { + __m128i data1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr)); + if (!_mm_testz_si128(mask, data1)) + return updatePtrSimd(data1); + ptr += 16; + } - // AVX2 and SSE4.1: final 16-byte comparison - if (ptr + 16 <= end) { - __m128i data1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr)); - if (!_mm_testz_si128(mask, data1)) - return updatePtrSimd(data1); - ptr += 16; - } + // and final 8-byte comparison + if (ptr + 8 <= end) { + __m128i data1 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(ptr)); + if (!_mm_testz_si128(mask, data1)) + return updatePtrSimd(data1); + ptr += 8; + } - // and final 8-byte comparison - if (ptr + 8 <= end) { - __m128i data1 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(ptr)); - if (!_mm_testz_si128(mask, data1)) - return updatePtrSimd(data1); - ptr += 8; + return true; } -# else // SSE2 implementation: test 16 bytes at a time. const __m128i mask = _mm_set1_epi32(maskval); while (ptr + 16 <= end) { @@ -631,7 +639,6 @@ static bool simdTestMask(const char *&ptr, const char *end, quint32 maskval) return updatePtr(result); ptr += 8; } -# endif return true; } @@ -639,16 +646,16 @@ static bool simdTestMask(const char *&ptr, const char *end, quint32 maskval) static Q_ALWAYS_INLINE __m128i mm_load8_zero_extend(const void *ptr) { const __m128i *dataptr = static_cast<const __m128i *>(ptr); -#if defined(__SSE4_1__) - // use a MOVQ followed by PMOVZXBW - // if AVX2 is present, these should combine into a single VPMOVZXBW instruction - __m128i data = _mm_loadl_epi64(dataptr); - return _mm_cvtepu8_epi16(data); -# else + if constexpr (qCompilerCpuFeatures & CpuFeatureSSE4_1) { + // use a MOVQ followed by PMOVZXBW + // if AVX2 is present, these should combine into a single VPMOVZXBW instruction + __m128i data = _mm_loadl_epi64(dataptr); + return _mm_cvtepu8_epi16(data); + } + // use MOVQ followed by PUNPCKLBW __m128i data = _mm_loadl_epi64(dataptr); return _mm_unpacklo_epi8(data, _mm_setzero_si128()); -# endif } #endif @@ -659,19 +666,19 @@ bool qt_is_ascii(const char *&ptr, const char *end) noexcept #if defined(__SSE2__) // Testing for the high bit can be done efficiently with just PMOVMSKB bool loops = true; -# if defined(__AVX2__) - while (ptr + 32 <= end) { - __m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr)); - quint32 mask = _mm256_movemask_epi8(data); - if (mask) { - uint idx = qCountTrailingZeroBits(mask); - ptr += idx; - return false; + if constexpr (UseAvx2) { + while (ptr + 32 <= end) { + __m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr)); + quint32 mask = _mm256_movemask_epi8(data); + if (mask) { + uint idx = qCountTrailingZeroBits(mask); + ptr += idx; + return false; + } + ptr += 32; } - ptr += 32; + loops = false; } - loops = false; -# endif while (ptr + 16 <= end) { __m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr)); @@ -802,23 +809,23 @@ Q_CORE_EXPORT void qt_from_latin1(char16_t *dst, const char *str, size_t size) n // we're going to read str[offset..offset+15] (16 bytes) for ( ; str + offset + 15 < e; offset += 16) { const __m128i chunk = _mm_loadu_si128((const __m128i*)(str + offset)); // load -#ifdef __AVX2__ - // zero extend to an YMM register - const __m256i extended = _mm256_cvtepu8_epi16(chunk); + if constexpr (UseAvx2) { + // zero extend to an YMM register + const __m256i extended = _mm256_cvtepu8_epi16(chunk); - // store - _mm256_storeu_si256((__m256i*)(dst + offset), extended); -#else - const __m128i nullMask = _mm_set1_epi32(0); + // store + _mm256_storeu_si256((__m256i*)(dst + offset), extended); + } else { + const __m128i nullMask = _mm_set1_epi32(0); - // unpack the first 8 bytes, padding with zeros - const __m128i firstHalf = _mm_unpacklo_epi8(chunk, nullMask); - _mm_storeu_si128((__m128i*)(dst + offset), firstHalf); // store + // unpack the first 8 bytes, padding with zeros + const __m128i firstHalf = _mm_unpacklo_epi8(chunk, nullMask); + _mm_storeu_si128((__m128i*)(dst + offset), firstHalf); // store - // unpack the last 8 bytes, padding with zeros - const __m128i secondHalf = _mm_unpackhi_epi8 (chunk, nullMask); - _mm_storeu_si128((__m128i*)(dst + offset + 8), secondHalf); // store -#endif + // unpack the last 8 bytes, padding with zeros + const __m128i secondHalf = _mm_unpackhi_epi8 (chunk, nullMask); + _mm_storeu_si128((__m128i*)(dst + offset + 8), secondHalf); // store + } } // we're going to read str[offset..offset+7] (8 bytes) @@ -855,24 +862,36 @@ static void qt_to_latin1_internal(uchar *dst, const char16_t *src, qsizetype len uchar *e = dst + length; qptrdiff offset = 0; -# ifdef __AVX2__ - const __m256i questionMark256 = _mm256_broadcastw_epi16(_mm_cvtsi32_si128('?')); - const __m256i outOfRange256 = _mm256_broadcastw_epi16(_mm_cvtsi32_si128(0x100)); - const __m128i questionMark = _mm256_castsi256_si128(questionMark256); - const __m128i outOfRange = _mm256_castsi256_si128(outOfRange256); -# else - const __m128i questionMark = _mm_set1_epi16('?'); - const __m128i outOfRange = _mm_set1_epi16(0x100); -# endif + auto questionMark256 = []() { + if constexpr (UseAvx2) + return _mm256_broadcastw_epi16(_mm_cvtsi32_si128('?')); + else + return 0; + }(); + auto outOfRange256 = []() { + if constexpr (UseAvx2) + return _mm256_broadcastw_epi16(_mm_cvtsi32_si128(0x100)); + else + return 0; + }(); + __m128i questionMark, outOfRange; + if constexpr (UseAvx2) { + questionMark = _mm256_castsi256_si128(questionMark256); + outOfRange = _mm256_castsi256_si128(outOfRange256); + } else { + questionMark = _mm_set1_epi16('?'); + outOfRange = _mm_set1_epi16(0x100); + } auto mergeQuestionMarks = [=](__m128i chunk) { // SSE has no compare instruction for unsigned comparison. -# ifdef __SSE4_1__ - // We use an unsigned uc = qMin(uc, 0x100) and then compare for equality. - chunk = _mm_min_epu16(chunk, outOfRange); - const __m128i offLimitMask = _mm_cmpeq_epi16(chunk, outOfRange); - chunk = _mm_blendv_epi8(chunk, questionMark, offLimitMask); -# else + if constexpr (qCompilerCpuFeatures & CpuFeatureSSE4_1) { + // We use an unsigned uc = qMin(uc, 0x100) and then compare for equality. + chunk = _mm_min_epu16(chunk, outOfRange); + const __m128i offLimitMask = _mm_cmpeq_epi16(chunk, outOfRange); + chunk = _mm_blendv_epi8(chunk, questionMark, offLimitMask); + return chunk; + } // The variables must be shiffted + 0x8000 to be compared const __m128i signedBitOffset = _mm_set1_epi16(short(0x8000)); const __m128i thresholdMask = _mm_set1_epi16(short(0xff + 0x8000)); @@ -892,32 +911,32 @@ static void qt_to_latin1_internal(uchar *dst, const char16_t *src, qsizetype len chunk = _mm_or_si128(correctBytes, offLimitQuestionMark); Q_UNUSED(outOfRange); -# endif return chunk; }; // we're going to write to dst[offset..offset+15] (16 bytes) for ( ; dst + offset + 15 < e; offset += 16) { -# if defined(__AVX2__) - __m256i chunk = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + offset)); - if (Checked) { - // See mergeQuestionMarks lambda above for details - chunk = _mm256_min_epu16(chunk, outOfRange256); - const __m256i offLimitMask = _mm256_cmpeq_epi16(chunk, outOfRange256); - chunk = _mm256_blendv_epi8(chunk, questionMark256, offLimitMask); - } + __m128i chunk1, chunk2; + if constexpr (UseAvx2) { + __m256i chunk = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + offset)); + if (Checked) { + // See mergeQuestionMarks lambda above for details + chunk = _mm256_min_epu16(chunk, outOfRange256); + const __m256i offLimitMask = _mm256_cmpeq_epi16(chunk, outOfRange256); + chunk = _mm256_blendv_epi8(chunk, questionMark256, offLimitMask); + } - const __m128i chunk2 = _mm256_extracti128_si256(chunk, 1); - const __m128i chunk1 = _mm256_castsi256_si128(chunk); -# else - __m128i chunk1 = _mm_loadu_si128((const __m128i*)(src + offset)); // load - if (Checked) - chunk1 = mergeQuestionMarks(chunk1); + chunk2 = _mm256_extracti128_si256(chunk, 1); + chunk1 = _mm256_castsi256_si128(chunk); + } else { + chunk1 = _mm_loadu_si128((const __m128i*)(src + offset)); // load + if (Checked) + chunk1 = mergeQuestionMarks(chunk1); - __m128i chunk2 = _mm_loadu_si128((const __m128i*)(src + offset + 8)); // load - if (Checked) - chunk2 = mergeQuestionMarks(chunk2); -# endif + chunk2 = _mm_loadu_si128((const __m128i*)(src + offset + 8)); // load + if (Checked) + chunk2 = mergeQuestionMarks(chunk2); + } // pack the two vector to 16 x 8bits elements const __m128i result = _mm_packus_epi16(chunk1, chunk2); @@ -1131,20 +1150,21 @@ static int ucstrncmp(const char16_t *a, const char16_t *b, size_t l) // we're going to read a[0..15] and b[0..15] (32 bytes) for ( ; end - a >= offset + 16; offset += 16) { -#ifdef __AVX2__ - __m256i a_data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(a + offset)); - __m256i b_data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(b + offset)); - __m256i result = _mm256_cmpeq_epi16(a_data, b_data); - uint mask = _mm256_movemask_epi8(result); -#else - __m128i a_data1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(a + offset)); - __m128i a_data2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(a + offset + 8)); - __m128i b_data1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(b + offset)); - __m128i b_data2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(b + offset + 8)); - __m128i result1 = _mm_cmpeq_epi16(a_data1, b_data1); - __m128i result2 = _mm_cmpeq_epi16(a_data2, b_data2); - uint mask = _mm_movemask_epi8(result1) | (_mm_movemask_epi8(result2) << 16); -#endif + uint mask; + if constexpr (UseAvx2) { + __m256i a_data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(a + offset)); + __m256i b_data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(b + offset)); + __m256i result = _mm256_cmpeq_epi16(a_data, b_data); + mask = _mm256_movemask_epi8(result); + } else { + __m128i a_data1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(a + offset)); + __m128i a_data2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(a + offset + 8)); + __m128i b_data1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(b + offset)); + __m128i b_data2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(b + offset + 8)); + __m128i result1 = _mm_cmpeq_epi16(a_data1, b_data1); + __m128i result2 = _mm_cmpeq_epi16(a_data2, b_data2); + mask = _mm_movemask_epi8(result1) | (_mm_movemask_epi8(result2) << 16); + } mask = ~mask; if (mask) { // found a different character @@ -1252,30 +1272,31 @@ static int ucstrncmp(const char16_t *a, const char *b, size_t l) for ( ; uc + offset + 15 < e; offset += 16) { // similar to fromLatin1_helper: // load 16 bytes of Latin 1 data + uint mask; __m128i chunk = _mm_loadu_si128((const __m128i*)(c + offset)); -# ifdef __AVX2__ - // expand Latin 1 data via zero extension - __m256i ldata = _mm256_cvtepu8_epi16(chunk); + if constexpr (UseAvx2) { + // expand Latin 1 data via zero extension + __m256i ldata = _mm256_cvtepu8_epi16(chunk); - // load UTF-16 data and compare - __m256i ucdata = _mm256_loadu_si256((const __m256i*)(uc + offset)); - __m256i result = _mm256_cmpeq_epi16(ldata, ucdata); + // load UTF-16 data and compare + __m256i ucdata = _mm256_loadu_si256((const __m256i*)(uc + offset)); + __m256i result = _mm256_cmpeq_epi16(ldata, ucdata); - uint mask = ~_mm256_movemask_epi8(result); -# else - // expand via unpacking - __m128i firstHalf = _mm_unpacklo_epi8(chunk, nullmask); - __m128i secondHalf = _mm_unpackhi_epi8(chunk, nullmask); + mask = ~_mm256_movemask_epi8(result); + } else { + // expand via unpacking + __m128i firstHalf = _mm_unpacklo_epi8(chunk, nullmask); + __m128i secondHalf = _mm_unpackhi_epi8(chunk, nullmask); - // load UTF-16 data and compare - __m128i ucdata1 = _mm_loadu_si128((const __m128i*)(uc + offset)); - __m128i ucdata2 = _mm_loadu_si128((const __m128i*)(uc + offset + 8)); - __m128i result1 = _mm_cmpeq_epi16(firstHalf, ucdata1); - __m128i result2 = _mm_cmpeq_epi16(secondHalf, ucdata2); + // load UTF-16 data and compare + __m128i ucdata1 = _mm_loadu_si128((const __m128i*)(uc + offset)); + __m128i ucdata2 = _mm_loadu_si128((const __m128i*)(uc + offset + 8)); + __m128i result1 = _mm_cmpeq_epi16(firstHalf, ucdata1); + __m128i result2 = _mm_cmpeq_epi16(secondHalf, ucdata2); - uint mask = ~(_mm_movemask_epi8(result1) | _mm_movemask_epi8(result2) << 16); -# endif + mask = ~(_mm_movemask_epi8(result1) | _mm_movemask_epi8(result2) << 16); + } if (mask) { // found a different character if (Mode == CompareStringsForEquality) |