1 files changed, 58 insertions, 29 deletions
diff --git a/src/corelib/tools/qstring.cpp b/src/corelib/tools/qstring.cpp
index c56c050f76..3a2e35c322 100644
--- a/src/corelib/tools/qstring.cpp
+++ b/src/corelib/tools/qstring.cpp
@@ -3535,6 +3535,61 @@ bool QString::endsWith(const QChar &c, Qt::CaseSensitivity cs) const
 }
 
 
+#if defined(QT_ALWAYS_HAVE_SSE2)
+// Returns chunk, read as eight 16-bit units, with every unit above 0x00FF
+// (i.e. outside Latin 1) replaced by '?'; all other units are left unchanged.
+static inline __m128i mergeQuestionMarks(__m128i chunk)
+{
+    const __m128i questionMark = _mm_set1_epi16('?');
+
+# ifdef __SSE4_2__
+    // compare the unsigned shorts for the range 0x0100-0xFFFF
+    // note on the use of _mm_cmpestrm:
+    //  The MSDN documentation online (http://technet.microsoft.com/en-us/library/bb514080.aspx)
+    //  says for range search the following:
+    //    For each character c in a, determine whether b0 <= c <= b1 or b2 <= c <= b3
+    //  However, all examples on the Internet, including from Intel
+    //  (see http://software.intel.com/en-us/articles/xml-parsing-accelerator-with-intel-streaming-simd-extensions-4-intel-sse4/)
+    //  put the range to be searched first
+    //  Disassembly and instruction-level debugging with GCC and ICC show
+    //  that they are doing the right thing. Inverting the arguments in the
+    //  instruction does cause a bunch of test failures.
+
+    const int mode = _SIDD_UWORD_OPS | _SIDD_CMP_RANGES | _SIDD_UNIT_MASK;
+    const __m128i rangeMatch = _mm_cvtsi32_si128(0xffff0100);
+    const __m128i offLimitMask = _mm_cmpestrm(rangeMatch, 2, chunk, 8, mode);
+
+    // replace the non-Latin 1 characters in the chunk with question marks
+    chunk = _mm_blendv_epi8(chunk, questionMark, offLimitMask);
+# else
+    // SSE has no compare instruction for unsigned comparison: both sides are
+    // offset by 0x8000 first. short() keeps the constant narrowing explicit.
+    const __m128i signedBitOffset = _mm_set1_epi16(short(0x8000));
+    const __m128i thresholdMask = _mm_set1_epi16(short(0xff + 0x8000));
+
+    const __m128i signedChunk = _mm_add_epi16(chunk, signedBitOffset);
+    const __m128i offLimitMask = _mm_cmpgt_epi16(signedChunk, thresholdMask);
+
+#  ifdef __SSE4_1__
+    // replace the non-Latin 1 characters in the chunk with question marks
+    chunk = _mm_blendv_epi8(chunk, questionMark, offLimitMask);
+#  else
+    // offLimitQuestionMark contains '?' for each 16 bits that was off-limit
+    // the 16 bits that were correct contains zeros
+    const __m128i offLimitQuestionMark = _mm_and_si128(offLimitMask, questionMark);
+
+    // correctBytes contains the bytes that were in limit
+    // the 16 bits that were off limits contains zeros
+    const __m128i correctBytes = _mm_andnot_si128(offLimitMask, chunk);
+
+    // merge offLimitQuestionMark and correctBytes to have the result
+    chunk = _mm_or_si128(correctBytes, offLimitQuestionMark);
+#  endif
+# endif
+    return chunk;
+}
+#endif
+
 static QByteArray toLatin1_helper(const QChar *data, int length)
 {
     QByteArray ba;
@@ -3545,41 +3600,15 @@ static QByteArray toLatin1_helper(const QChar *data, int length)
 #if defined(QT_ALWAYS_HAVE_SSE2)
         if (length >= 16) {
             const int chunkCount = length >> 4; // divided by 16
-            const __m128i questionMark = _mm_set1_epi16('?');
-            // SSE has no compare instruction for unsigned comparison.
-            // The variables must be shiffted + 0x8000 to be compared
-            const __m128i signedBitOffset = _mm_set1_epi16(short(0x8000));
-            const __m128i thresholdMask = _mm_set1_epi16(short(0xff + 0x8000));
+
             for (int i = 0; i < chunkCount; ++i) {
                 __m128i chunk1 = _mm_loadu_si128((__m128i*)src); // load
+                chunk1 = mergeQuestionMarks(chunk1);
                 src += 8;
-                {
-                    // each 16 bit is equal to 0xFF if the source is outside latin 1 (>0xff)
-                    const __m128i signedChunk = _mm_add_epi16(chunk1, signedBitOffset);
-                    const __m128i offLimitMask = _mm_cmpgt_epi16(signedChunk, thresholdMask);
-
-                    // offLimitQuestionMark contains '?' for each 16 bits that was off-limit
-                    // the 16 bits that were correct contains zeros
-                    const __m128i offLimitQuestionMark = _mm_and_si128(offLimitMask, questionMark);
-
-                    // correctBytes contains the bytes that were in limit
-                    // the 16 bits that were off limits contains zeros
-                    const __m128i correctBytes = _mm_andnot_si128(offLimitMask, chunk1);
-
-                    // merge offLimitQuestionMark and correctBytes to have the result
-                    chunk1 = _mm_or_si128(correctBytes, offLimitQuestionMark);
-                }
 
                 __m128i chunk2 = _mm_loadu_si128((__m128i*)src); // load
+                chunk2 = mergeQuestionMarks(chunk2);
                 src += 8;
-                {
-                    // exactly the same operations as for the previous chunk of data
-                    const __m128i signedChunk = _mm_add_epi16(chunk2, signedBitOffset);
-                    const __m128i offLimitMask = _mm_cmpgt_epi16(signedChunk, thresholdMask);
-                    const __m128i offLimitQuestionMark = _mm_and_si128(offLimitMask, questionMark);
-                    const __m128i correctBytes = _mm_andnot_si128(offLimitMask, chunk2);
-                    chunk2 = _mm_or_si128(correctBytes, offLimitQuestionMark);
-                }
 
                 // pack the two vector to 16 x 8bits elements
                 const __m128i result = _mm_packus_epi16(chunk1, chunk2);