Improve a few string operations with AVX2

AVX2 brings the new PMOVZXBW instruction that extends from one 128-bit SSE register to a 256-bit AVX register. With that, the main decoding code is just two instructions (the loop requires a couple more to maintain the offset counter and do the end-of-loop check). This buys us another 4% performance improvement in the fromLatin1 code, calculated on top of the VEX-encoded SSE2 code (which is already a little better than plain SSE2). Change-Id: I675fa24de4fa97683b662f19d146047251f77359 Reviewed-by: Allan Sandfeld Jensen <allan.jensen@digia.com>
author: Thiago Macieira <thiago.macieira@intel.com> 2014-01-26 23:45:27 -0800
committer: The Qt Project <gerrit-noreply@qt-project.org> 2014-05-21 17:10:56 +0200
commit: 309d3557ca832c42b8fbd372b957af51510b159e (patch)
tree: b2e5f2ede70647dcc1747e5537c46ba0d1ad4a3a /src/corelib/codecs
parent: 4dba08eebf7db7e801ccbf83dec6f17369123f10 (diff)
1 files changed, 32 insertions, 15 deletions
diff --git a/src/corelib/codecs/qutfcodec.cpp b/src/corelib/codecs/qutfcodec.cpp
index 072cda63aa..c0f26ad803 100644
--- a/src/corelib/codecs/qutfcodec.cpp
+++ b/src/corelib/codecs/qutfcodec.cpp
@@ -103,27 +103,44 @@ static inline bool simdDecodeAscii(ushort *&dst, const uchar *&nextAscii, const
     for ( ; end - src >= 16; src += 16, dst += 16) {
         __m128i data = _mm_loadu_si128((__m128i*)src);
 
+#ifdef __AVX2__
+        const int BitSpacing = 2;
+        // load and zero extend to an YMM register
+        const __m256i extended = _mm256_cvtepu8_epi16(data);
+
+        uint n = _mm256_movemask_epi8(extended);
+        if (!n) {
+            // store
+            _mm256_storeu_si256((__m256i*)dst, extended);
+            continue;
+        }
+#else
+        const int BitSpacing = 1;
+
         // check if everything is ASCII
         // movemask extracts the high bit of every byte, so n is non-zero if something isn't ASCII
         uint n = _mm_movemask_epi8(data);
-        if (n) {
-            // copy the front part that is still ASCII
-            while (!(n & 1)) {
-                *dst++ = *src++;
-                n >>= 1;
-            }
+        if (!n) {
+            // unpack
+            _mm_storeu_si128((__m128i*)dst, _mm_unpacklo_epi8(data, _mm_setzero_si128()));
+            _mm_storeu_si128(1+(__m128i*)dst, _mm_unpackhi_epi8(data, _mm_setzero_si128()));
+            continue;
+        }
+#endif
 
-            // find the next probable ASCII character
-            // we don't want to load 16 bytes again in this loop if we know there are non-ASCII
-            // characters still coming
-            n = _bit_scan_reverse(n);
-            nextAscii = src + n + 1;
-            return false;
+        // copy the front part that is still ASCII
+        while (!(n & 1)) {
+            *dst++ = *src++;
+            n >>= BitSpacing;
         }
 
-        // unpack
-        _mm_storeu_si128((__m128i*)dst, _mm_unpacklo_epi8(data, _mm_setzero_si128()));
-        _mm_storeu_si128(1+(__m128i*)dst, _mm_unpackhi_epi8(data, _mm_setzero_si128()));
+        // find the next probable ASCII character
+        // we don't want to load 16 bytes again in this loop if we know there are non-ASCII
+        // characters still coming
+        n = _bit_scan_reverse(n);
+        nextAscii = src + (n / BitSpacing) + 1;
+        return false;
+
     }
     return src == end;
 }
author	Thiago Macieira <thiago.macieira@intel.com>	2014-01-26 23:45:27 -0800
committer	The Qt Project <gerrit-noreply@qt-project.org>	2014-05-21 17:10:56 +0200
commit	309d3557ca832c42b8fbd372b957af51510b159e (patch)
tree	b2e5f2ede70647dcc1747e5537c46ba0d1ad4a3a /src/corelib/codecs
parent	4dba08eebf7db7e801ccbf83dec6f17369123f10 (diff)