1 files changed, 187 insertions, 166 deletions
diff --git a/src/corelib/text/qstring.cpp b/src/corelib/text/qstring.cpp
index 819086b014..6525e16f2a 100644
--- a/src/corelib/text/qstring.cpp
+++ b/src/corelib/text/qstring.cpp
@@ -349,7 +349,12 @@ extern "C" void qt_fromlatin1_mips_asm_unroll8 (char16_t*, const char*, uint);
 extern "C" void qt_toLatin1_mips_dsp_asm(uchar *dst, const char16_t *src, int length);
 #endif
 
-#if defined(__SSE2__) && defined(Q_CC_GNU)
+#ifdef __SSE2__
+static constexpr bool UseAvx2 =
+        (qCompilerCpuFeatures & CpuFeatureArchHaswell) == CpuFeatureArchHaswell;
+#endif
+
+#ifdef Q_CC_GNU
 #  if defined(__SANITIZE_ADDRESS__) && Q_CC_GNU < 800 && !defined(Q_CC_CLANG)
 #     warning "The __attribute__ on below will likely cause a build failure with your GCC version. Your choices are:"
 #     warning "1) disable ASan;"
@@ -467,23 +472,24 @@ const char16_t *QtPrivate::qustrchr(QStringView str, char16_t c) noexcept
     bool loops = true;
     // Using the PMOVMSKB instruction, we get two bits for each character
     // we compare.
-#  if defined(__AVX2__) && !defined(__OPTIMIZE_SIZE__)
-    // we're going to read n[0..15] (32 bytes)
-    __m256i mch256 = _mm256_set1_epi32(c | (c << 16));
-    for (const char16_t *next = n + 16; next <= e; n = next, next += 16) {
-        __m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(n));
-        __m256i result = _mm256_cmpeq_epi16(data, mch256);
-        uint mask = uint(_mm256_movemask_epi8(result));
-        if (mask) {
-            uint idx = qCountTrailingZeroBits(mask);
-            return n + idx / 2;
+    __m128i mch;
+    if constexpr (UseAvx2) {
+        // we're going to read n[0..15] (32 bytes)
+        __m256i mch256 = _mm256_set1_epi32(c | (c << 16));
+        for (const char16_t *next = n + 16; next <= e; n = next, next += 16) {
+            __m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(n));
+            __m256i result = _mm256_cmpeq_epi16(data, mch256);
+            uint mask = uint(_mm256_movemask_epi8(result));
+            if (mask) {
+                uint idx = qCountTrailingZeroBits(mask);
+                return n + idx / 2;
+            }
         }
+        loops = false;
+        mch = _mm256_castsi256_si128(mch256);
+    } else {
+        mch = _mm_set1_epi32(c | (c << 16));
     }
-    loops = false;
-    __m128i mch = _mm256_castsi256_si128(mch256);
-#  else
-    __m128i mch = _mm_set1_epi32(c | (c << 16));
-#  endif
 
     auto hasMatch = [mch, &n](__m128i data, ushort validityMask) {
         __m128i result = _mm_cmpeq_epi16(data, mch);
@@ -551,64 +557,66 @@ static bool simdTestMask(const char *&ptr, const char *end, quint32 maskval)
         return false;
     };
 
-#  if defined(__SSE4_1__)
-    __m128i mask;
-    auto updatePtrSimd = [&](__m128i data) {
-        __m128i masked = _mm_and_si128(mask, data);
-        __m128i comparison = _mm_cmpeq_epi16(masked, _mm_setzero_si128());
-        uint result = _mm_movemask_epi8(comparison);
-        return updatePtr(result);
-    };
+    if constexpr (qCompilerCpuFeatures & CpuFeatureSSE4_1) {
+        __m128i mask;
+        auto updatePtrSimd = [&](__m128i data) {
+            __m128i masked = _mm_and_si128(mask, data);
+            __m128i comparison = _mm_cmpeq_epi16(masked, _mm_setzero_si128());
+            uint result = _mm_movemask_epi8(comparison);
+            return updatePtr(result);
+        };
 
-#    if defined(__AVX2__)
-    // AVX2 implementation: test 32 bytes at a time
-    const __m256i mask256 = _mm256_broadcastd_epi32(_mm_cvtsi32_si128(maskval));
-    while (ptr + 32 <= end) {
-        __m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr));
-        if (!_mm256_testz_si256(mask256, data)) {
-            // found a character matching the mask
-            __m256i masked256 = _mm256_and_si256(mask256, data);
-            __m256i comparison256 = _mm256_cmpeq_epi16(masked256, _mm256_setzero_si256());
-            return updatePtr(_mm256_movemask_epi8(comparison256));
-        }
-        ptr += 32;
-    }
+        if constexpr (UseAvx2) {
+            // AVX2 implementation: test 32 bytes at a time
+            const __m256i mask256 = _mm256_broadcastd_epi32(_mm_cvtsi32_si128(maskval));
+            while (ptr + 32 <= end) {
+                __m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr));
+                if (!_mm256_testz_si256(mask256, data)) {
+                    // found a character matching the mask
+                    __m256i masked256 = _mm256_and_si256(mask256, data);
+                    __m256i comparison256 = _mm256_cmpeq_epi16(masked256, _mm256_setzero_si256());
+                    return updatePtr(_mm256_movemask_epi8(comparison256));
+                }
+                ptr += 32;
+            }
 
-    mask = _mm256_castsi256_si128(mask256);
-#    else
-    // SSE 4.1 implementation: test 32 bytes at a time (two 16-byte
-    // comparisons, unrolled)
-    mask = _mm_set1_epi32(maskval);
-    while (ptr + 32 <= end) {
-        __m128i data1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr));
-        __m128i data2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 16));
-        if (!_mm_testz_si128(mask, data1))
-            return updatePtrSimd(data1);
+            mask = _mm256_castsi256_si128(mask256);
+        } else {
+            // SSE 4.1 implementation: test 32 bytes at a time (two 16-byte
+            // comparisons, unrolled)
+            mask = _mm_set1_epi32(maskval);
+            while (ptr + 32 <= end) {
+                __m128i data1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr));
+                __m128i data2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr + 16));
+                if (!_mm_testz_si128(mask, data1))
+                    return updatePtrSimd(data1);
+
+                ptr += 16;
+                if (!_mm_testz_si128(mask, data2))
+                    return updatePtrSimd(data2);
+                ptr += 16;
+            }
+        }
 
-        ptr += 16;
-        if (!_mm_testz_si128(mask, data2))
-            return updatePtrSimd(data2);
-        ptr += 16;
-    }
-#    endif
+        // AVX2 and SSE4.1: final 16-byte comparison
+        if (ptr + 16 <= end) {
+            __m128i data1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr));
+            if (!_mm_testz_si128(mask, data1))
+                return updatePtrSimd(data1);
+            ptr += 16;
+        }
 
-    // AVX2 and SSE4.1: final 16-byte comparison
-    if (ptr + 16 <= end) {
-        __m128i data1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr));
-        if (!_mm_testz_si128(mask, data1))
-            return updatePtrSimd(data1);
-        ptr += 16;
-    }
+        // and final 8-byte comparison
+        if (ptr + 8 <= end) {
+            __m128i data1 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(ptr));
+            if (!_mm_testz_si128(mask, data1))
+                return updatePtrSimd(data1);
+            ptr += 8;
+        }
 
-    // and final 8-byte comparison
-    if (ptr + 8 <= end) {
-        __m128i data1 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(ptr));
-        if (!_mm_testz_si128(mask, data1))
-            return updatePtrSimd(data1);
-        ptr += 8;
+        return true;
     }
 
-#  else
     // SSE2 implementation: test 16 bytes at a time.
     const __m128i mask = _mm_set1_epi32(maskval);
     while (ptr + 16 <= end) {
@@ -631,7 +639,6 @@ static bool simdTestMask(const char *&ptr, const char *end, quint32 maskval)
             return updatePtr(result);
         ptr += 8;
     }
-#  endif
 
     return true;
 }
@@ -639,16 +646,16 @@ static bool simdTestMask(const char *&ptr, const char *end, quint32 maskval)
 static Q_ALWAYS_INLINE __m128i mm_load8_zero_extend(const void *ptr)
 {
     const __m128i *dataptr = static_cast<const __m128i *>(ptr);
-#if defined(__SSE4_1__)
-    // use a MOVQ followed by PMOVZXBW
-    // if AVX2 is present, these should combine into a single VPMOVZXBW instruction
-    __m128i data = _mm_loadl_epi64(dataptr);
-    return _mm_cvtepu8_epi16(data);
-#  else
+    if constexpr (qCompilerCpuFeatures & CpuFeatureSSE4_1) {
+        // use a MOVQ followed by PMOVZXBW
+        // if AVX2 is present, these should combine into a single VPMOVZXBW instruction
+        __m128i data = _mm_loadl_epi64(dataptr);
+        return _mm_cvtepu8_epi16(data);
+    }
+
     // use MOVQ followed by PUNPCKLBW
     __m128i data = _mm_loadl_epi64(dataptr);
     return _mm_unpacklo_epi8(data, _mm_setzero_si128());
-#  endif
 }
 #endif
 
@@ -659,19 +666,19 @@ bool qt_is_ascii(const char *&ptr, const char *end) noexcept
 #if defined(__SSE2__)
     // Testing for the high bit can be done efficiently with just PMOVMSKB
     bool loops = true;
-#  if defined(__AVX2__)
-    while (ptr + 32 <= end) {
-        __m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr));
-        quint32 mask = _mm256_movemask_epi8(data);
-        if (mask) {
-            uint idx = qCountTrailingZeroBits(mask);
-            ptr += idx;
-            return false;
+    if constexpr (UseAvx2) {
+        while (ptr + 32 <= end) {
+            __m256i data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(ptr));
+            quint32 mask = _mm256_movemask_epi8(data);
+            if (mask) {
+                uint idx = qCountTrailingZeroBits(mask);
+                ptr += idx;
+                return false;
+            }
+            ptr += 32;
         }
-        ptr += 32;
+        loops = false;
     }
-    loops = false;
-#  endif
 
     while (ptr + 16 <= end) {
         __m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr));
@@ -802,23 +809,23 @@ Q_CORE_EXPORT void qt_from_latin1(char16_t *dst, const char *str, size_t size) n
     // we're going to read str[offset..offset+15] (16 bytes)
     for ( ; str + offset + 15 < e; offset += 16) {
         const __m128i chunk = _mm_loadu_si128((const __m128i*)(str + offset)); // load
-#ifdef __AVX2__
-        // zero extend to an YMM register
-        const __m256i extended = _mm256_cvtepu8_epi16(chunk);
+        if constexpr (UseAvx2) {
+            // zero extend to an YMM register
+            const __m256i extended = _mm256_cvtepu8_epi16(chunk);
 
-        // store
-        _mm256_storeu_si256((__m256i*)(dst + offset), extended);
-#else
-        const __m128i nullMask = _mm_set1_epi32(0);
+            // store
+            _mm256_storeu_si256((__m256i*)(dst + offset), extended);
+        } else {
+            const __m128i nullMask = _mm_set1_epi32(0);
 
-        // unpack the first 8 bytes, padding with zeros
-        const __m128i firstHalf = _mm_unpacklo_epi8(chunk, nullMask);
-        _mm_storeu_si128((__m128i*)(dst + offset), firstHalf); // store
+            // unpack the first 8 bytes, padding with zeros
+            const __m128i firstHalf = _mm_unpacklo_epi8(chunk, nullMask);
+            _mm_storeu_si128((__m128i*)(dst + offset), firstHalf); // store
 
-        // unpack the last 8 bytes, padding with zeros
-        const __m128i secondHalf = _mm_unpackhi_epi8 (chunk, nullMask);
-        _mm_storeu_si128((__m128i*)(dst + offset + 8), secondHalf); // store
-#endif
+            // unpack the last 8 bytes, padding with zeros
+            const __m128i secondHalf = _mm_unpackhi_epi8 (chunk, nullMask);
+            _mm_storeu_si128((__m128i*)(dst + offset + 8), secondHalf); // store
+        }
     }
 
     // we're going to read str[offset..offset+7] (8 bytes)
@@ -855,24 +862,36 @@ static void qt_to_latin1_internal(uchar *dst, const char16_t *src, qsizetype len
     uchar *e = dst + length;
     qptrdiff offset = 0;
 
-#  ifdef __AVX2__
-    const __m256i questionMark256 = _mm256_broadcastw_epi16(_mm_cvtsi32_si128('?'));
-    const __m256i outOfRange256 = _mm256_broadcastw_epi16(_mm_cvtsi32_si128(0x100));
-    const __m128i questionMark = _mm256_castsi256_si128(questionMark256);
-    const __m128i outOfRange = _mm256_castsi256_si128(outOfRange256);
-#  else
-    const __m128i questionMark = _mm_set1_epi16('?');
-    const __m128i outOfRange = _mm_set1_epi16(0x100);
-#  endif
+    auto questionMark256 = []() {
+        if constexpr (UseAvx2)
+            return _mm256_broadcastw_epi16(_mm_cvtsi32_si128('?'));
+        else
+            return 0;
+    }();
+    auto outOfRange256 = []() {
+        if constexpr (UseAvx2)
+            return _mm256_broadcastw_epi16(_mm_cvtsi32_si128(0x100));
+        else
+            return 0;
+    }();
+    __m128i questionMark, outOfRange;
+    if constexpr (UseAvx2) {
+        questionMark = _mm256_castsi256_si128(questionMark256);
+        outOfRange = _mm256_castsi256_si128(outOfRange256);
+    } else {
+        questionMark = _mm_set1_epi16('?');
+        outOfRange = _mm_set1_epi16(0x100);
+    }
 
     auto mergeQuestionMarks = [=](__m128i chunk) {
         // SSE has no compare instruction for unsigned comparison.
-# ifdef __SSE4_1__
-        // We use an unsigned uc = qMin(uc, 0x100) and then compare for equality.
-        chunk = _mm_min_epu16(chunk, outOfRange);
-        const __m128i offLimitMask = _mm_cmpeq_epi16(chunk, outOfRange);
-        chunk = _mm_blendv_epi8(chunk, questionMark, offLimitMask);
-# else
+        if constexpr (qCompilerCpuFeatures & CpuFeatureSSE4_1) {
+            // We use an unsigned uc = qMin(uc, 0x100) and then compare for equality.
+            chunk = _mm_min_epu16(chunk, outOfRange);
+            const __m128i offLimitMask = _mm_cmpeq_epi16(chunk, outOfRange);
+            chunk = _mm_blendv_epi8(chunk, questionMark, offLimitMask);
+            return chunk;
+        }
         // The variables must be shiffted + 0x8000 to be compared
         const __m128i signedBitOffset = _mm_set1_epi16(short(0x8000));
         const __m128i thresholdMask = _mm_set1_epi16(short(0xff + 0x8000));
@@ -892,32 +911,32 @@ static void qt_to_latin1_internal(uchar *dst, const char16_t *src, qsizetype len
         chunk = _mm_or_si128(correctBytes, offLimitQuestionMark);
 
         Q_UNUSED(outOfRange);
-# endif
         return chunk;
     };
 
     // we're going to write to dst[offset..offset+15] (16 bytes)
     for ( ; dst + offset + 15 < e; offset += 16) {
-#  if defined(__AVX2__)
-        __m256i chunk = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + offset));
-        if (Checked) {
-            // See mergeQuestionMarks lambda above for details
-            chunk = _mm256_min_epu16(chunk, outOfRange256);
-            const __m256i offLimitMask = _mm256_cmpeq_epi16(chunk, outOfRange256);
-            chunk = _mm256_blendv_epi8(chunk, questionMark256, offLimitMask);
-        }
+        __m128i chunk1, chunk2;
+        if constexpr (UseAvx2) {
+            __m256i chunk = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + offset));
+            if (Checked) {
+                // See mergeQuestionMarks lambda above for details
+                chunk = _mm256_min_epu16(chunk, outOfRange256);
+                const __m256i offLimitMask = _mm256_cmpeq_epi16(chunk, outOfRange256);
+                chunk = _mm256_blendv_epi8(chunk, questionMark256, offLimitMask);
+            }
 
-        const __m128i chunk2 = _mm256_extracti128_si256(chunk, 1);
-        const __m128i chunk1 = _mm256_castsi256_si128(chunk);
-#  else
-        __m128i chunk1 = _mm_loadu_si128((const __m128i*)(src + offset)); // load
-        if (Checked)
-            chunk1 = mergeQuestionMarks(chunk1);
+            chunk2 = _mm256_extracti128_si256(chunk, 1);
+            chunk1 = _mm256_castsi256_si128(chunk);
+        } else {
+            chunk1 = _mm_loadu_si128((const __m128i*)(src + offset)); // load
+            if (Checked)
+                chunk1 = mergeQuestionMarks(chunk1);
 
-        __m128i chunk2 = _mm_loadu_si128((const __m128i*)(src + offset + 8)); // load
-        if (Checked)
-            chunk2 = mergeQuestionMarks(chunk2);
-#  endif
+            chunk2 = _mm_loadu_si128((const __m128i*)(src + offset + 8)); // load
+            if (Checked)
+                chunk2 = mergeQuestionMarks(chunk2);
+        }
 
         // pack the two vector to 16 x 8bits elements
         const __m128i result = _mm_packus_epi16(chunk1, chunk2);
@@ -1131,20 +1150,21 @@ static int ucstrncmp(const char16_t *a, const char16_t *b, size_t l)
 
     // we're going to read a[0..15] and b[0..15] (32 bytes)
     for ( ; end - a >= offset + 16; offset += 16) {
-#ifdef __AVX2__
-        __m256i a_data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(a + offset));
-        __m256i b_data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(b + offset));
-        __m256i result = _mm256_cmpeq_epi16(a_data, b_data);
-        uint mask = _mm256_movemask_epi8(result);
-#else
-        __m128i a_data1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(a + offset));
-        __m128i a_data2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(a + offset + 8));
-        __m128i b_data1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(b + offset));
-        __m128i b_data2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(b + offset + 8));
-        __m128i result1 = _mm_cmpeq_epi16(a_data1, b_data1);
-        __m128i result2 = _mm_cmpeq_epi16(a_data2, b_data2);
-        uint mask = _mm_movemask_epi8(result1) | (_mm_movemask_epi8(result2) << 16);
-#endif
+        uint mask;
+        if constexpr (UseAvx2) {
+            __m256i a_data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(a + offset));
+            __m256i b_data = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(b + offset));
+            __m256i result = _mm256_cmpeq_epi16(a_data, b_data);
+            mask = _mm256_movemask_epi8(result);
+        } else {
+            __m128i a_data1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(a + offset));
+            __m128i a_data2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(a + offset + 8));
+            __m128i b_data1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(b + offset));
+            __m128i b_data2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(b + offset + 8));
+            __m128i result1 = _mm_cmpeq_epi16(a_data1, b_data1);
+            __m128i result2 = _mm_cmpeq_epi16(a_data2, b_data2);
+            mask = _mm_movemask_epi8(result1) | (_mm_movemask_epi8(result2) << 16);
+        }
         mask = ~mask;
         if (mask) {
             // found a different character
@@ -1252,30 +1272,31 @@ static int ucstrncmp(const char16_t *a, const char *b, size_t l)
     for ( ; uc + offset + 15 < e; offset += 16) {
         // similar to fromLatin1_helper:
         // load 16 bytes of Latin 1 data
+        uint mask;
         __m128i chunk = _mm_loadu_si128((const __m128i*)(c + offset));
 
-#  ifdef __AVX2__
-        // expand Latin 1 data via zero extension
-        __m256i ldata = _mm256_cvtepu8_epi16(chunk);
+        if constexpr (UseAvx2) {
+            // expand Latin 1 data via zero extension
+            __m256i ldata = _mm256_cvtepu8_epi16(chunk);
 
-        // load UTF-16 data and compare
-        __m256i ucdata = _mm256_loadu_si256((const __m256i*)(uc + offset));
-        __m256i result = _mm256_cmpeq_epi16(ldata, ucdata);
+            // load UTF-16 data and compare
+            __m256i ucdata = _mm256_loadu_si256((const __m256i*)(uc + offset));
+            __m256i result = _mm256_cmpeq_epi16(ldata, ucdata);
 
-        uint mask = ~_mm256_movemask_epi8(result);
-#  else
-        // expand via unpacking
-        __m128i firstHalf = _mm_unpacklo_epi8(chunk, nullmask);
-        __m128i secondHalf = _mm_unpackhi_epi8(chunk, nullmask);
+            mask = ~_mm256_movemask_epi8(result);
+        } else {
+            // expand via unpacking
+            __m128i firstHalf = _mm_unpacklo_epi8(chunk, nullmask);
+            __m128i secondHalf = _mm_unpackhi_epi8(chunk, nullmask);
 
-        // load UTF-16 data and compare
-        __m128i ucdata1 = _mm_loadu_si128((const __m128i*)(uc + offset));
-        __m128i ucdata2 = _mm_loadu_si128((const __m128i*)(uc + offset + 8));
-        __m128i result1 = _mm_cmpeq_epi16(firstHalf, ucdata1);
-        __m128i result2 = _mm_cmpeq_epi16(secondHalf, ucdata2);
+            // load UTF-16 data and compare
+            __m128i ucdata1 = _mm_loadu_si128((const __m128i*)(uc + offset));
+            __m128i ucdata2 = _mm_loadu_si128((const __m128i*)(uc + offset + 8));
+            __m128i result1 = _mm_cmpeq_epi16(firstHalf, ucdata1);
+            __m128i result2 = _mm_cmpeq_epi16(secondHalf, ucdata2);
 
-        uint mask = ~(_mm_movemask_epi8(result1) | _mm_movemask_epi8(result2) << 16);
-#  endif
+            mask = ~(_mm_movemask_epi8(result1) | _mm_movemask_epi8(result2) << 16);
+        }
         if (mask) {
             // found a different character
             if (Mode == CompareStringsForEquality)