summaryrefslogtreecommitdiffstats
path: root/src/corelib/tools
diff options
context:
space:
mode:
authorThiago Macieira <thiago.macieira@intel.com>2014-01-26 23:45:27 -0800
committerThe Qt Project <gerrit-noreply@qt-project.org>2014-05-21 17:10:56 +0200
commit309d3557ca832c42b8fbd372b957af51510b159e (patch)
treeb2e5f2ede70647dcc1747e5537c46ba0d1ad4a3a /src/corelib/tools
parent4dba08eebf7db7e801ccbf83dec6f17369123f10 (diff)
Improve a few string operations with AVX2
AVX2 brings the new PMOVZXBW instruction that extends from one 128-bit SSE register to an 256-bit AVX register. With that, the main decoding code is just two instructions (the loop requires a couple more to maintain the offset counter and do the end-of-loop check). This buys us another 4% performance improvement in the fromLatin1 code, calculated on top of the VEX-encoded SSE2 code (which is already a little better than plain SSE2). Change-Id: I675fa24de4fa97683b662f19d146047251f77359 Reviewed-by: Allan Sandfeld Jensen <allan.jensen@digia.com>
Diffstat (limited to 'src/corelib/tools')
-rw-r--r--src/corelib/tools/qstring.cpp25
1 files changed, 23 insertions, 2 deletions
diff --git a/src/corelib/tools/qstring.cpp b/src/corelib/tools/qstring.cpp
index 541a853487..a7d516a726 100644
--- a/src/corelib/tools/qstring.cpp
+++ b/src/corelib/tools/qstring.cpp
@@ -223,8 +223,15 @@ void qt_from_latin1(ushort *dst, const char *str, size_t size)
// we're going to read str[offset..offset+15] (16 bytes)
for ( ; str + offset + 15 < e; offset += 16) {
- const __m128i nullMask = _mm_set1_epi32(0);
const __m128i chunk = _mm_loadu_si128((__m128i*)(str + offset)); // load
+#ifdef __AVX2__
+ // zero extend to an YMM register
+ const __m256i extended = _mm256_cvtepu8_epi16(chunk);
+
+ // store
+ _mm256_storeu_si256((__m256i*)(dst + offset), extended);
+#else
+ const __m128i nullMask = _mm_set1_epi32(0);
// unpack the first 8 bytes, padding with zeros
const __m128i firstHalf = _mm_unpacklo_epi8(chunk, nullMask);
@@ -233,6 +240,7 @@ void qt_from_latin1(ushort *dst, const char *str, size_t size)
// unpack the last 8 bytes, padding with zeros
const __m128i secondHalf = _mm_unpackhi_epi8 (chunk, nullMask);
_mm_storeu_si128((__m128i*)(dst + offset + 8), secondHalf); // store
+#endif
}
size = size % 16;
@@ -540,8 +548,20 @@ static int ucstrncmp(const QChar *a, const uchar *c, int l)
// and c[offset..offset+15] (16 bytes)
for ( ; uc + offset + 15 < e; offset += 16) {
// similar to fromLatin1_helper:
- // load Latin 1 data and expand to UTF-16
+ // load 16 bytes of Latin 1 data
__m128i chunk = _mm_loadu_si128((__m128i*)(c + offset));
+
+# ifdef __AVX2__
+ // expand Latin 1 data via zero extension
+ __m256i ldata = _mm256_cvtepu8_epi16(chunk);
+
+ // load UTF-16 data and compare
+ __m256i ucdata = _mm256_loadu_si256((__m256i*)(uc + offset));
+ __m256i result = _mm256_cmpeq_epi16(ldata, ucdata);
+
+ uint mask = ~_mm256_movemask_epi8(result);
+# else
+ // expand via unpacking
__m128i firstHalf = _mm_unpacklo_epi8(chunk, nullmask);
__m128i secondHalf = _mm_unpackhi_epi8(chunk, nullmask);
@@ -552,6 +572,7 @@ static int ucstrncmp(const QChar *a, const uchar *c, int l)
__m128i result2 = _mm_cmpeq_epi16(secondHalf, ucdata2);
uint mask = ~(_mm_movemask_epi8(result1) | _mm_movemask_epi8(result2) << 16);
+# endif
if (mask) {
// found a different character
uint idx = uint(_bit_scan_forward(mask));