summary | refs | log | tree | commit | diff | stats
path: root/src/corelib/codecs
diff options
context:
space:
mode:
author: Thiago Macieira <thiago.macieira@intel.com> 2014-01-26 23:45:27 -0800
committer: The Qt Project <gerrit-noreply@qt-project.org> 2014-05-21 17:10:56 +0200
commit: 309d3557ca832c42b8fbd372b957af51510b159e (patch)
tree: b2e5f2ede70647dcc1747e5537c46ba0d1ad4a3a /src/corelib/codecs
parent: 4dba08eebf7db7e801ccbf83dec6f17369123f10 (diff)
Improve a few string operations with AVX2
AVX2 brings the new PMOVZXBW instruction that extends from one 128-bit SSE register to a 256-bit AVX register. With that, the main decoding code is just two instructions (the loop requires a couple more to maintain the offset counter and do the end-of-loop check). This buys us another 4% performance improvement in the fromLatin1 code, calculated on top of the VEX-encoded SSE2 code (which is already a little better than plain SSE2).

Change-Id: I675fa24de4fa97683b662f19d146047251f77359
Reviewed-by: Allan Sandfeld Jensen <allan.jensen@digia.com>
Diffstat (limited to 'src/corelib/codecs')
-rw-r--r-- src/corelib/codecs/qutfcodec.cpp 47
1 files changed, 32 insertions, 15 deletions
diff --git a/src/corelib/codecs/qutfcodec.cpp b/src/corelib/codecs/qutfcodec.cpp
index 072cda63aa..c0f26ad803 100644
--- a/src/corelib/codecs/qutfcodec.cpp
+++ b/src/corelib/codecs/qutfcodec.cpp
@@ -103,27 +103,44 @@ static inline bool simdDecodeAscii(ushort *&dst, const uchar *&nextAscii, const
for ( ; end - src >= 16; src += 16, dst += 16) {
__m128i data = _mm_loadu_si128((__m128i*)src);
+#ifdef __AVX2__
+ const int BitSpacing = 2;
+ // load and zero extend to an YMM register
+ const __m256i extended = _mm256_cvtepu8_epi16(data);
+
+ uint n = _mm256_movemask_epi8(extended);
+ if (!n) {
+ // store
+ _mm256_storeu_si256((__m256i*)dst, extended);
+ continue;
+ }
+#else
+ const int BitSpacing = 1;
+
// check if everything is ASCII
// movemask extracts the high bit of every byte, so n is non-zero if something isn't ASCII
uint n = _mm_movemask_epi8(data);
- if (n) {
- // copy the front part that is still ASCII
- while (!(n & 1)) {
- *dst++ = *src++;
- n >>= 1;
- }
+ if (!n) {
+ // unpack
+ _mm_storeu_si128((__m128i*)dst, _mm_unpacklo_epi8(data, _mm_setzero_si128()));
+ _mm_storeu_si128(1+(__m128i*)dst, _mm_unpackhi_epi8(data, _mm_setzero_si128()));
+ continue;
+ }
+#endif
- // find the next probable ASCII character
- // we don't want to load 16 bytes again in this loop if we know there are non-ASCII
- // characters still coming
- n = _bit_scan_reverse(n);
- nextAscii = src + n + 1;
- return false;
+ // copy the front part that is still ASCII
+ while (!(n & 1)) {
+ *dst++ = *src++;
+ n >>= BitSpacing;
}
- // unpack
- _mm_storeu_si128((__m128i*)dst, _mm_unpacklo_epi8(data, _mm_setzero_si128()));
- _mm_storeu_si128(1+(__m128i*)dst, _mm_unpackhi_epi8(data, _mm_setzero_si128()));
+ // find the next probable ASCII character
+ // we don't want to load 16 bytes again in this loop if we know there are non-ASCII
+ // characters still coming
+ n = _bit_scan_reverse(n);
+ nextAscii = src + (n / BitSpacing) + 1;
+ return false;
+
}
return src == end;
}