diff options
author | Thiago Macieira <thiago.macieira@intel.com> | 2018-09-12 23:06:40 -0700 |
---|---|---|
committer | Thiago Macieira <thiago.macieira@intel.com> | 2018-12-08 20:28:56 +0000 |
commit | d36a4fc19709e6047fe846b36731b59909218b6d (patch) | |
tree | 1ca9b89ab023fdf97ba9f829fc9b70ce1b16603d /src/corelib/tools/qstring.cpp | |
parent | 2e715c31ed3a37fc196e97d4c58d0e277b1b9215 (diff) |
Further optimize the loading of 8 Latin-1 characters
This is important when AVX is enabled, because the compiler can then
combine the VMOVQ load and the VPMOVZXBW instruction into a single
VPMOVZXBW that takes its source directly from memory. That form is
guaranteed to read only 8 bytes, so it is safe even close to the end of
a page. Clang and ICC already combine the instructions as we want, and I
have filed a request for GCC to do so too [1].
AVX was first introduced in 2011, so plenty of computers today would
benefit from this.
[1] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=87317
Change-Id: I8f261579aad648fdb4f0fffd1553e08e90df3171
Reviewed-by: Allan Sandfeld Jensen <allan.jensen@qt.io>
Diffstat (limited to 'src/corelib/tools/qstring.cpp')
-rw-r--r-- | src/corelib/tools/qstring.cpp | 21 |
1 files changed, 17 insertions, 4 deletions
```diff
diff --git a/src/corelib/tools/qstring.cpp b/src/corelib/tools/qstring.cpp
index edb9983c33..d20c46774d 100644
--- a/src/corelib/tools/qstring.cpp
+++ b/src/corelib/tools/qstring.cpp
@@ -415,6 +415,21 @@ static bool simdTestMask(const char *&ptr, const char *end, quint32 maskval)
     return true;
 }
+
+static Q_ALWAYS_INLINE __m128i mm_load8_zero_extend(const void *ptr)
+{
+    const __m128i *dataptr = static_cast<const __m128i *>(ptr);
+#if defined(__SSE4_1__)
+    // use a MOVQ followed by PMOVZXBW
+    // if AVX2 is present, these should combine into a single VPMOVZXBW instruction
+    __m128i data = _mm_loadl_epi64(dataptr);
+    return _mm_cvtepu8_epi16(data);
+# else
+    // use MOVQ followed by PUNPCKLBW
+    __m128i data = _mm_loadl_epi64(dataptr);
+    return _mm_unpacklo_epi8(data, _mm_setzero_si128());
+# endif
+}
 #endif
 // Note: ptr on output may be off by one and point to a preceding US-ASCII
@@ -585,8 +600,7 @@ void qt_from_latin1(ushort *dst, const char *str, size_t size) Q_DECL_NOTHROW
     // we're going to read str[offset..offset+7] (8 bytes)
     if (str + offset + 7 < e) {
-        const __m128i chunk = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(str + offset));
-        const __m128i unpacked = _mm_unpacklo_epi8(chunk, _mm_setzero_si128());
+        const __m128i unpacked = mm_load8_zero_extend(str + offset);
         _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + offset), unpacked);
         offset += 8;
     }
@@ -1044,8 +1058,7 @@ static int ucstrncmp(const QChar *a, const uchar *c, size_t l)
     // we'll read uc[offset..offset+7] (16 bytes) and c[offset..offset+7] (8 bytes)
     if (uc + offset + 7 < e) {
         // same, but we're using an 8-byte load
-        __m128i chunk = _mm_loadl_epi64((const __m128i*)(c + offset));
-        __m128i secondHalf = _mm_unpacklo_epi8(chunk, nullmask);
+        __m128i secondHalf = mm_load8_zero_extend(c + offset);
         __m128i ucdata = _mm_loadu_si128((const __m128i*)(uc + offset));
         __m128i result = _mm_cmpeq_epi16(secondHalf, ucdata);
```