From 309d3557ca832c42b8fbd372b957af51510b159e Mon Sep 17 00:00:00 2001
From: Thiago Macieira <thiago.macieira@intel.com>
Date: Sun, 26 Jan 2014 23:45:27 -0800
Subject: Improve a few string operations with AVX2

AVX2 brings the new PMOVZXBW instruction that extends from one 128-bit
SSE register to an 256-bit AVX register. With that, the main decoding
code is just two instructions (the loop requires a couple more to
maintain the offset counter and do the end-of-loop check).

This buys us another 4% performance improvement in the fromLatin1 code,
calculated on top of the VEX-encoded SSE2 code (which is already a little
better than plain SSE2).

Change-Id: I675fa24de4fa97683b662f19d146047251f77359
Reviewed-by: Allan Sandfeld Jensen <allan.jensen@digia.com>
---
 src/corelib/codecs/qutfcodec.cpp | 47 +++++++++++++++++++++++++++-------------
 src/corelib/tools/qstring.cpp    | 25 +++++++++++++++++++--
 2 files changed, 55 insertions(+), 17 deletions(-)

(limited to 'src/corelib')

diff --git a/src/corelib/codecs/qutfcodec.cpp b/src/corelib/codecs/qutfcodec.cpp
index 072cda63aa..c0f26ad803 100644
--- a/src/corelib/codecs/qutfcodec.cpp
+++ b/src/corelib/codecs/qutfcodec.cpp
@@ -103,27 +103,44 @@ static inline bool simdDecodeAscii(ushort *&dst, const uchar *&nextAscii, const
     for ( ; end - src >= 16; src += 16, dst += 16) {
         __m128i data = _mm_loadu_si128((__m128i*)src);
 
+#ifdef __AVX2__
+        const int BitSpacing = 2;
+        // load and zero extend to an YMM register
+        const __m256i extended = _mm256_cvtepu8_epi16(data);
+
+        uint n = _mm256_movemask_epi8(extended);
+        if (!n) {
+            // store
+            _mm256_storeu_si256((__m256i*)dst, extended);
+            continue;
+        }
+#else
+        const int BitSpacing = 1;
+
         // check if everything is ASCII
         // movemask extracts the high bit of every byte, so n is non-zero if something isn't ASCII
         uint n = _mm_movemask_epi8(data);
-        if (n) {
-            // copy the front part that is still ASCII
-            while (!(n & 1)) {
-                *dst++ = *src++;
-                n >>= 1;
-            }
+        if (!n) {
+            // unpack
+            _mm_storeu_si128((__m128i*)dst, _mm_unpacklo_epi8(data, _mm_setzero_si128()));
+            _mm_storeu_si128(1+(__m128i*)dst, _mm_unpackhi_epi8(data, _mm_setzero_si128()));
+            continue;
+        }
+#endif
 
-            // find the next probable ASCII character
-            // we don't want to load 16 bytes again in this loop if we know there are non-ASCII
-            // characters still coming
-            n = _bit_scan_reverse(n);
-            nextAscii = src + n + 1;
-            return false;
+        // copy the front part that is still ASCII
+        while (!(n & 1)) {
+            *dst++ = *src++;
+            n >>= BitSpacing;
         }
 
-        // unpack
-        _mm_storeu_si128((__m128i*)dst, _mm_unpacklo_epi8(data, _mm_setzero_si128()));
-        _mm_storeu_si128(1+(__m128i*)dst, _mm_unpackhi_epi8(data, _mm_setzero_si128()));
+        // find the next probable ASCII character
+        // we don't want to load 16 bytes again in this loop if we know there are non-ASCII
+        // characters still coming
+        n = _bit_scan_reverse(n);
+        nextAscii = src + (n / BitSpacing) + 1;
+        return false;
+
     }
     return src == end;
 }
diff --git a/src/corelib/tools/qstring.cpp b/src/corelib/tools/qstring.cpp
index 541a853487..a7d516a726 100644
--- a/src/corelib/tools/qstring.cpp
+++ b/src/corelib/tools/qstring.cpp
@@ -223,8 +223,15 @@ void qt_from_latin1(ushort *dst, const char *str, size_t size)
 
     // we're going to read str[offset..offset+15] (16 bytes)
     for ( ; str + offset + 15 < e; offset += 16) {
-        const __m128i nullMask = _mm_set1_epi32(0);
         const __m128i chunk = _mm_loadu_si128((__m128i*)(str + offset)); // load
+#ifdef __AVX2__
+        // zero extend to an YMM register
+        const __m256i extended = _mm256_cvtepu8_epi16(chunk);
+
+        // store
+        _mm256_storeu_si256((__m256i*)(dst + offset), extended);
+#else
+        const __m128i nullMask = _mm_set1_epi32(0);
 
         // unpack the first 8 bytes, padding with zeros
         const __m128i firstHalf = _mm_unpacklo_epi8(chunk, nullMask);
@@ -233,6 +240,7 @@ void qt_from_latin1(ushort *dst, const char *str, size_t size)
         // unpack the last 8 bytes, padding with zeros
         const __m128i secondHalf = _mm_unpackhi_epi8 (chunk, nullMask);
         _mm_storeu_si128((__m128i*)(dst + offset + 8), secondHalf); // store
+#endif
     }
 
     size = size % 16;
@@ -540,8 +548,20 @@ static int ucstrncmp(const QChar *a, const uchar *c, int l)
     // and c[offset..offset+15] (16 bytes)
     for ( ; uc + offset + 15 < e; offset += 16) {
         // similar to fromLatin1_helper:
-        // load Latin 1 data and expand to UTF-16
+        // load 16 bytes of Latin 1 data
         __m128i chunk = _mm_loadu_si128((__m128i*)(c + offset));
+
+#  ifdef __AVX2__
+        // expand Latin 1 data via zero extension
+        __m256i ldata = _mm256_cvtepu8_epi16(chunk);
+
+        // load UTF-16 data and compare
+        __m256i ucdata = _mm256_loadu_si256((__m256i*)(uc + offset));
+        __m256i result = _mm256_cmpeq_epi16(ldata, ucdata);
+
+        uint mask = ~_mm256_movemask_epi8(result);
+#  else
+        // expand via unpacking
         __m128i firstHalf = _mm_unpacklo_epi8(chunk, nullmask);
         __m128i secondHalf = _mm_unpackhi_epi8(chunk, nullmask);
 
@@ -552,6 +572,7 @@ static int ucstrncmp(const QChar *a, const uchar *c, int l)
         __m128i result2 = _mm_cmpeq_epi16(secondHalf, ucdata2);
 
         uint mask = ~(_mm_movemask_epi8(result1) | _mm_movemask_epi8(result2) << 16);
+#  endif
         if (mask) {
             // found a different character
             uint idx = uint(_bit_scan_forward(mask));
-- 
cgit v1.2.3