summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorThiago Macieira <thiago.macieira@intel.com>2018-05-14 13:57:30 -0700
committerThiago Macieira <thiago.macieira@intel.com>2018-05-16 06:29:01 +0000
commita9074779cf1425b76c010b891403a0521e2cb4e4 (patch)
tree6eb5dc85d24a68bb9b6db38620fd537fc4b42273
parentbd0905db95cd1b2f225050390bd50809452ff53f (diff)
QString::toLatin1: stop using PCMPESTRM
This instruction is somewhat slow and requires a lot of inputs to be correctly set. Instead, use the PMIN trick, which does have unsigned comparison support. This commit moves the helper function to a lambda inside qt_to_latin1, to make it easier to reuse the constants in the next commit and to avoid warnings of unused static functions. Change-Id: Ib48364abee9f464c96c6fffd152e9e84f4ad3ae8 Reviewed-by: Allan Sandfeld Jensen <allan.jensen@qt.io>
-rw-r--r--src/corelib/tools/qstring.cpp85
1 files changed, 32 insertions, 53 deletions
diff --git a/src/corelib/tools/qstring.cpp b/src/corelib/tools/qstring.cpp
index d045913b87..639f6f8968 100644
--- a/src/corelib/tools/qstring.cpp
+++ b/src/corelib/tools/qstring.cpp
@@ -536,67 +536,46 @@ void qt_from_latin1(ushort *dst, const char *str, size_t size) Q_DECL_NOTHROW
#endif
}
-#if defined(__SSE2__)
-static inline __m128i mergeQuestionMarks(__m128i chunk)
+template <bool Checked>
+static void qt_to_latin1_internal(uchar *dst, const ushort *src, qsizetype length)
{
- const __m128i questionMark = _mm_set1_epi16('?');
+#if defined(__SSE2__)
+ uchar *e = dst + length;
+ qptrdiff offset = 0;
-# ifdef __SSE4_2__
- // compare the unsigned shorts for the range 0x0100-0xFFFF
- // note on the use of _mm_cmpestrm:
- // The MSDN documentation online (http://technet.microsoft.com/en-us/library/bb514080.aspx)
- // says for range search the following:
- // For each character c in a, determine whether b0 <= c <= b1 or b2 <= c <= b3
- //
- // However, all examples on the Internet, including from Intel
- // (see http://software.intel.com/en-us/articles/xml-parsing-accelerator-with-intel-streaming-simd-extensions-4-intel-sse4/)
- // put the range to be searched first
- //
- // Disassembly and instruction-level debugging with GCC and ICC show
- // that they are doing the right thing. Inverting the arguments in the
- // instruction does cause a bunch of test failures.
-
- const __m128i rangeMatch = _mm_cvtsi32_si128(0xffff0100);
- const __m128i offLimitMask = _mm_cmpestrm(rangeMatch, 2, chunk, 8,
- _SIDD_UWORD_OPS | _SIDD_CMP_RANGES | _SIDD_UNIT_MASK);
-
- // replace the non-Latin 1 characters in the chunk with question marks
- chunk = _mm_blendv_epi8(chunk, questionMark, offLimitMask);
+ const __m128i questionMark = _mm_set1_epi16('?');
+ const __m128i outOfRange = _mm_set1_epi16(0x100);
+
+ auto mergeQuestionMarks = [=](__m128i chunk) {
+ // SSE has no compare instruction for unsigned comparison.
+# ifdef __SSE4_1__
+ // We use an unsigned uc = qMin(uc, 0x100) and then compare for equality.
+ chunk = _mm_min_epu16(chunk, outOfRange);
+ const __m128i offLimitMask = _mm_cmpeq_epi16(chunk, outOfRange);
+ chunk = _mm_blendv_epi8(chunk, questionMark, offLimitMask);
# else
- // SSE has no compare instruction for unsigned comparison.
- // The variables must be shiffted + 0x8000 to be compared
- const __m128i signedBitOffset = _mm_set1_epi16(short(0x8000));
- const __m128i thresholdMask = _mm_set1_epi16(short(0xff + 0x8000));
+ // The variables must be shiffted + 0x8000 to be compared
+ const __m128i signedBitOffset = _mm_set1_epi16(short(0x8000));
+ const __m128i thresholdMask = _mm_set1_epi16(short(0xff + 0x8000));
- const __m128i signedChunk = _mm_add_epi16(chunk, signedBitOffset);
- const __m128i offLimitMask = _mm_cmpgt_epi16(signedChunk, thresholdMask);
+ const __m128i signedChunk = _mm_add_epi16(chunk, signedBitOffset);
+ const __m128i offLimitMask = _mm_cmpgt_epi16(signedChunk, thresholdMask);
-# ifdef __SSE4_1__
- // replace the non-Latin 1 characters in the chunk with question marks
- chunk = _mm_blendv_epi8(chunk, questionMark, offLimitMask);
-# else
- // offLimitQuestionMark contains '?' for each 16 bits that was off-limit
- // the 16 bits that were correct contains zeros
- const __m128i offLimitQuestionMark = _mm_and_si128(offLimitMask, questionMark);
+ // offLimitQuestionMark contains '?' for each 16 bits that was off-limit
+ // the 16 bits that were correct contains zeros
+ const __m128i offLimitQuestionMark = _mm_and_si128(offLimitMask, questionMark);
- // correctBytes contains the bytes that were in limit
- // the 16 bits that were off limits contains zeros
- const __m128i correctBytes = _mm_andnot_si128(offLimitMask, chunk);
+ // correctBytes contains the bytes that were in limit
+ // the 16 bits that were off limits contains zeros
+ const __m128i correctBytes = _mm_andnot_si128(offLimitMask, chunk);
- // merge offLimitQuestionMark and correctBytes to have the result
- chunk = _mm_or_si128(correctBytes, offLimitQuestionMark);
-# endif
-# endif
- return chunk;
-}
-#endif
+ // merge offLimitQuestionMark and correctBytes to have the result
+ chunk = _mm_or_si128(correctBytes, offLimitQuestionMark);
-template <bool Checked>
-static void qt_to_latin1_internal(uchar *dst, const ushort *src, qsizetype length)
-{
-#if defined(__SSE2__)
- uchar *e = dst + length;
- qptrdiff offset = 0;
+ Q_UNUSED(outOfRange);
+# endif
+ return chunk;
+ };
// we're going to write to dst[offset..offset+15] (16 bytes)
for ( ; dst + offset + 15 < e; offset += 16) {