From 85f963b2f16147ffce243e9acdfbeb46f19b2fed Mon Sep 17 00:00:00 2001 From: Thiago Macieira Date: Wed, 22 Dec 2010 14:42:33 +0100 Subject: Improve toLatin1 x86 SIMD by using a new SSE4.1 instruction MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The new instruction is PBLENDVB, which creates a result by selecting bytes from one of two registers, depending on whether the mask contains a 1 (0xff) or a zero. The SSE2 code requires three instructions (and, andnot, or). The equivalent Neon instruction is VBSL (bit select). Reviewed-by: Samuel Rødal (cherry picked from commit bdad106358ae177d1345f5ff85c0e38cfeb5ca90) Change-Id: I5b0d055a4be532f81c6f11181d710525cd6c3f25 Reviewed-on: http://codereview.qt-project.org/4466 Reviewed-by: Qt Sanity Bot Reviewed-by: Oswald Buddenhagen Reviewed-by: Samuel Rødal --- src/corelib/tools/qstring.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'src/corelib/tools') diff --git a/src/corelib/tools/qstring.cpp b/src/corelib/tools/qstring.cpp index c56c050f76..241762636b 100644 --- a/src/corelib/tools/qstring.cpp +++ b/src/corelib/tools/qstring.cpp @@ -3558,6 +3558,10 @@ static QByteArray toLatin1_helper(const QChar *data, int length) const __m128i signedChunk = _mm_add_epi16(chunk1, signedBitOffset); const __m128i offLimitMask = _mm_cmpgt_epi16(signedChunk, thresholdMask); +#ifdef __SSE4_1__ + chunk1 = _mm_blendv_epi8(chunk1, questionMark, offLimitMask); +#else + // offLimitQuestionMark contains '?' for each 16 bits that was off-limit // the 16 bits that were correct contains zeros const __m128i offLimitQuestionMark = _mm_and_si128(offLimitMask, questionMark); @@ -3568,6 +3572,7 @@ static QByteArray toLatin1_helper(const QChar *data, int length) // merge offLimitQuestionMark and correctBytes to have the result chunk1 = _mm_or_si128(correctBytes, offLimitQuestionMark); +#endif } __m128i chunk2 = _mm_loadu_si128((__m128i*)src); // load @@ -3576,9 +3581,13 @@ static QByteArray toLatin1_helper(const QChar *data, int length) // exactly the same operations as for the previous chunk of data const __m128i signedChunk = _mm_add_epi16(chunk2, signedBitOffset); const __m128i offLimitMask = _mm_cmpgt_epi16(signedChunk, thresholdMask); +#ifdef __SSE4_1__ + chunk2 = _mm_blendv_epi8(chunk2, questionMark, offLimitMask); +#else const __m128i offLimitQuestionMark = _mm_and_si128(offLimitMask, questionMark); const __m128i correctBytes = _mm_andnot_si128(offLimitMask, chunk2); chunk2 = _mm_or_si128(correctBytes, offLimitQuestionMark); +#endif } // pack the two vector to 16 x 8bits elements -- cgit v1.2.3 From 526c851902c9b2762179bb338a21e025e7f471ac Mon Sep 17 00:00:00 2001 From: Thiago Macieira Date: Wed, 22 Dec 2010 19:52:18 +0100 Subject: Create a function that merges the SSE common code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reviewed-by: Samuel Rødal (cherry picked from commit bb3bd601560132df769c32808ae0b36c56d1caab) Change-Id: Icd7f661785a793effcd4d8cd08ffa8bb5a592cd9 Reviewed-on: http://codereview.qt-project.org/4467 Reviewed-by: Qt Sanity Bot Reviewed-by: Oswald Buddenhagen Reviewed-by: Samuel Rødal --- src/corelib/tools/qstring.cpp | 73 +++++++++++++++++++++---------------------- 1 file changed, 35 insertions(+), 38 deletions(-) (limited to 'src/corelib/tools') diff --git a/src/corelib/tools/qstring.cpp b/src/corelib/tools/qstring.cpp index 241762636b..b34d638504 100644 --- a/src/corelib/tools/qstring.cpp +++ b/src/corelib/tools/qstring.cpp @@ -3535,6 +3535,38 @@ bool QString::endsWith(const QChar &c, Qt::CaseSensitivity cs) const } +#if defined(QT_ALWAYS_HAVE_SSE2) +static inline __m128i mergeQuestionMarks(__m128i chunk) +{ + const __m128i questionMark = _mm_set1_epi16('?'); + + // SSE has no compare instruction for unsigned comparison. + // The variables must be shiffted + 0x8000 to be compared + const __m128i signedBitOffset = _mm_set1_epi16(0x8000); + const __m128i thresholdMask = _mm_set1_epi16(0xff + 0x8000); + + const __m128i signedChunk = _mm_add_epi16(chunk, signedBitOffset); + const __m128i offLimitMask = _mm_cmpgt_epi16(signedChunk, thresholdMask); + +# ifdef __SSE4_1__ + // replace the non-Latin 1 characters in the chunk with question marks + chunk = _mm_blendv_epi8(chunk, questionMark, offLimitMask); +# else + // offLimitQuestionMark contains '?' for each 16 bits that was off-limit + // the 16 bits that were correct contains zeros + const __m128i offLimitQuestionMark = _mm_and_si128(offLimitMask, questionMark); + + // correctBytes contains the bytes that were in limit + // the 16 bits that were off limits contains zeros + const __m128i correctBytes = _mm_andnot_si128(offLimitMask, chunk); + + // merge offLimitQuestionMark and correctBytes to have the result + chunk = _mm_or_si128(correctBytes, offLimitQuestionMark); +# endif + return chunk; +} +#endif + static QByteArray toLatin1_helper(const QChar *data, int length) { QByteArray ba; @@ -3545,50 +3577,15 @@ static QByteArray toLatin1_helper(const QChar *data, int length) #if defined(QT_ALWAYS_HAVE_SSE2) if (length >= 16) { const int chunkCount = length >> 4; // divided by 16 - const __m128i questionMark = _mm_set1_epi16('?'); - // SSE has no compare instruction for unsigned comparison. - // The variables must be shiffted + 0x8000 to be compared - const __m128i signedBitOffset = _mm_set1_epi16(short(0x8000)); - const __m128i thresholdMask = _mm_set1_epi16(short(0xff + 0x8000)); + for (int i = 0; i < chunkCount; ++i) { __m128i chunk1 = _mm_loadu_si128((__m128i*)src); // load + chunk1 = mergeQuestionMarks(chunk1); src += 8; - { - // each 16 bit is equal to 0xFF if the source is outside latin 1 (>0xff) - const __m128i signedChunk = _mm_add_epi16(chunk1, signedBitOffset); - const __m128i offLimitMask = _mm_cmpgt_epi16(signedChunk, thresholdMask); - -#ifdef __SSE4_1__ - chunk1 = _mm_blendv_epi8(chunk1, questionMark, offLimitMask); -#else - - // offLimitQuestionMark contains '?' for each 16 bits that was off-limit - // the 16 bits that were correct contains zeros - const __m128i offLimitQuestionMark = _mm_and_si128(offLimitMask, questionMark); - - // correctBytes contains the bytes that were in limit - // the 16 bits that were off limits contains zeros - const __m128i correctBytes = _mm_andnot_si128(offLimitMask, chunk1); - - // merge offLimitQuestionMark and correctBytes to have the result - chunk1 = _mm_or_si128(correctBytes, offLimitQuestionMark); -#endif - } __m128i chunk2 = _mm_loadu_si128((__m128i*)src); // load + chunk2 = mergeQuestionMarks(chunk2); src += 8; - { - // exactly the same operations as for the previous chunk of data - const __m128i signedChunk = _mm_add_epi16(chunk2, signedBitOffset); - const __m128i offLimitMask = _mm_cmpgt_epi16(signedChunk, thresholdMask); -#ifdef __SSE4_1__ - chunk2 = _mm_blendv_epi8(chunk2, questionMark, offLimitMask); -#else - const __m128i offLimitQuestionMark = _mm_and_si128(offLimitMask, questionMark); - const __m128i correctBytes = _mm_andnot_si128(offLimitMask, chunk2); - chunk2 = _mm_or_si128(correctBytes, offLimitQuestionMark); -#endif - } // pack the two vector to 16 x 8bits elements const __m128i result = _mm_packus_epi16(chunk1, chunk2); -- cgit v1.2.3 From 83ba0d56f878409d3f758549c72e5d099cc71a07 Mon Sep 17 00:00:00 2001 From: Thiago Macieira Date: Wed, 22 Dec 2010 19:52:48 +0100 Subject: Add an SSE4.2 even simpler version of toLatin1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Use the new PCMPESTRM instruction (Parallel CoMPare Explicit-length STRings with result in a Mask) which is added in SSE4.2 for facilitating string operations. The "compare ranges" mode allows us to search for characters outside the Latin 1 range and then use the SSE4.1 PBLENDVB instruction to replace those with question marks. Unlike previous SSE compare instructions, the PCMPxSTRx family allows us to operate on unsigned 16-bit values. This saves us another parallel add. Reviewed-By: Samuel Rødal (cherry picked from commit 45d2d36c9dbcbce403c78838ea52acd1ab111b68) Change-Id: I0f9d864f9d19c0f0da43ccb6918dc28295074496 Reviewed-on: http://codereview.qt-project.org/4468 Reviewed-by: Qt Sanity Bot Reviewed-by: Oswald Buddenhagen Reviewed-by: Samuel Rødal --- src/corelib/tools/qstring.cpp | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) (limited to 'src/corelib/tools') diff --git a/src/corelib/tools/qstring.cpp b/src/corelib/tools/qstring.cpp index b34d638504..3a2e35c322 100644 --- a/src/corelib/tools/qstring.cpp +++ b/src/corelib/tools/qstring.cpp @@ -3540,6 +3540,28 @@ static inline __m128i mergeQuestionMarks(__m128i chunk) { const __m128i questionMark = _mm_set1_epi16('?'); +# ifdef __SSE4_2__ + // compare the unsigned shorts for the range 0x0100-0xFFFF + // note on the use of _mm_cmpestrm: + // The MSDN documentation online (http://technet.microsoft.com/en-us/library/bb514080.aspx) + // says for range search the following: + // For each character c in a, determine whether b0 <= c <= b1 or b2 <= c <= b3 + // + // However, all examples on the Internet, including from Intel + // (see http://software.intel.com/en-us/articles/xml-parsing-accelerator-with-intel-streaming-simd-extensions-4-intel-sse4/) + // put the range to be searched first + // + // Disassembly and instruction-level debugging with GCC and ICC show + // that they are doing the right thing. Inverting the arguments in the + // instruction does cause a bunch of test failures. + + const int mode = _SIDD_UWORD_OPS | _SIDD_CMP_RANGES | _SIDD_UNIT_MASK; + const __m128i rangeMatch = _mm_cvtsi32_si128(0xffff0100); + const __m128i offLimitMask = _mm_cmpestrm(rangeMatch, 2, chunk, 8, mode); + + // replace the non-Latin 1 characters in the chunk with question marks + chunk = _mm_blendv_epi8(chunk, questionMark, offLimitMask); +# else // SSE has no compare instruction for unsigned comparison. // The variables must be shiffted + 0x8000 to be compared const __m128i signedBitOffset = _mm_set1_epi16(0x8000); @@ -3563,6 +3585,7 @@ static inline __m128i mergeQuestionMarks(__m128i chunk) // merge offLimitQuestionMark and correctBytes to have the result chunk = _mm_or_si128(correctBytes, offLimitQuestionMark); # endif +# endif return chunk; } #endif -- cgit v1.2.3