summaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
author Thiago Macieira <thiago.macieira@intel.com> 2018-05-13 09:14:09 -0700
committer Thiago Macieira <thiago.macieira@intel.com> 2018-05-15 23:07:11 +0000
commit 1e95a07a5ced774b20adb66b34c31bdfaf566bdc (patch)
tree 777cc63d356b9347c7a40b10173fff16b98f2a0a /src
parent 40ccf9818829d4b7df0e4e93a84a25cd75a3f678 (diff)
QString: insert a number of 8-character SIMD loops
We don't have _mm_cvtsi64_si128() (the REX.W expansion of MOVD [0F 6E]), but we do have _mm_loadl_epi64(), the SSE2 expansion of the MMX MOVQ at opcode 0F 7E. Ditto for _mm_cvtsi128_si64() and _mm_storel_epi64(). And those work even in 32-bit mode.

By doing this, we can reduce the tail unrolled loops by half, reducing code size.

I'm not adding these new SIMD sections to -Os builds.

Change-Id: Ib48364abee9f464c96c6fffd152e405310ef67be
Reviewed-by: Allan Sandfeld Jensen <allan.jensen@qt.io>
Diffstat (limited to 'src')
-rw-r--r-- src/corelib/codecs/qutfcodec.cpp 39
-rw-r--r-- src/corelib/tools/qstring.cpp 123
2 files changed, 144 insertions, 18 deletions
diff --git a/src/corelib/codecs/qutfcodec.cpp b/src/corelib/codecs/qutfcodec.cpp
index 08cc99f4dc..96be45ff4e 100644
--- a/src/corelib/codecs/qutfcodec.cpp
+++ b/src/corelib/codecs/qutfcodec.cpp
@@ -102,6 +102,26 @@ static inline bool simdEncodeAscii(uchar *&dst, const ushort *&nextAscii, const
return false;
}
}
+
+ if (end - src >= 8) {
+ // do eight characters at a time
+ __m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src));
+ __m128i packed = _mm_packus_epi16(data, data);
+ __m128i nonAscii = _mm_cmpgt_epi8(packed, _mm_setzero_si128());
+
+ // store even non-ASCII
+ _mm_storel_epi64(reinterpret_cast<__m128i *>(dst), packed);
+
+ uchar n = ~_mm_movemask_epi8(nonAscii);
+ if (n) {
+ nextAscii = src + qBitScanReverse(n) + 1;
+ n = qCountTrailingZeroBits(n);
+ dst += n;
+ src += n;
+ return false;
+ }
+ }
+
return src == end;
}
@@ -150,6 +170,25 @@ static inline bool simdDecodeAscii(ushort *&dst, const uchar *&nextAscii, const
return false;
}
+
+ if (end - src >= 8) {
+ __m128i data = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(src));
+ uint n = _mm_movemask_epi8(data) & 0xff;
+ if (!n) {
+ // unpack and store
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), _mm_unpacklo_epi8(data, _mm_setzero_si128()));
+ } else {
+ while (!(n & 1)) {
+ *dst++ = *src++;
+ n >>= 1;
+ }
+
+ n = qBitScanReverse(n);
+ nextAscii = src + n + 1;
+ return false;
+ }
+ }
+
return src == end;
}
diff --git a/src/corelib/tools/qstring.cpp b/src/corelib/tools/qstring.cpp
index 719c685f69..81e5e1e884 100644
--- a/src/corelib/tools/qstring.cpp
+++ b/src/corelib/tools/qstring.cpp
@@ -315,10 +315,19 @@ static bool simdTestMask(const char *&ptr, const char *end, quint32 maskval)
return updatePtrSimd(data1);
ptr += 16;
}
+
+ // and final 8-byte comparison
+ if (ptr + 8 <= end) {
+ __m128i data1 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(ptr));
+ if (!_mm_testz_si128(mask, data1))
+ return updatePtrSimd(data1);
+ ptr += 8;
+ }
+
# else
// SSE2 implementation: test 16 bytes at a time.
const __m128i mask = _mm_set1_epi32(maskval);
- while (ptr + 16 < end) {
+ while (ptr + 16 <= end) {
__m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr));
__m128i masked = _mm_and_si128(mask, data);
__m128i comparison = _mm_cmpeq_epi16(masked, _mm_setzero_si128());
@@ -327,6 +336,17 @@ static bool simdTestMask(const char *&ptr, const char *end, quint32 maskval)
return updatePtr(result);
ptr += 16;
}
+
+ // and one 8-byte comparison
+ if (ptr + 8 <= end) {
+ __m128i data = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(ptr));
+ __m128i masked = _mm_and_si128(mask, data);
+ __m128i comparison = _mm_cmpeq_epi16(masked, _mm_setzero_si128());
+ quint8 result = _mm_movemask_epi8(comparison);
+ if (result != 0xff)
+ return updatePtr(result);
+ ptr += 8;
+ }
# endif
return true;
@@ -342,7 +362,7 @@ bool qt_is_ascii(const char *&ptr, const char *end) Q_DECL_NOTHROW
return false;
#elif defined(__SSE2__)
// Testing for the high bit can be done efficiently with just PMOVMSKB
- while (ptr + 16 < end) {
+ while (ptr + 16 <= end) {
__m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr));
quint32 mask = _mm_movemask_epi8(data);
if (mask) {
@@ -352,6 +372,16 @@ bool qt_is_ascii(const char *&ptr, const char *end) Q_DECL_NOTHROW
}
ptr += 16;
}
+ if (ptr + 8 <= end) {
+ __m128i data = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(ptr));
+ quint8 mask = _mm_movemask_epi8(data);
+ if (mask) {
+ uint idx = qCountTrailingZeroBits(mask);
+ ptr += idx;
+ return false;
+ }
+ ptr += 8;
+ }
#endif
while (ptr + 4 <= end) {
@@ -480,11 +510,19 @@ void qt_from_latin1(ushort *dst, const char *str, size_t size) Q_DECL_NOTHROW
#endif
}
- size = size % 16;
+ // we're going to read str[offset..offset+7] (8 bytes)
+ if (str + offset + 7 < e) {
+ const __m128i chunk = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(str + offset));
+ const __m128i unpacked = _mm_unpacklo_epi8(chunk, _mm_setzero_si128());
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + offset), unpacked);
+ offset += 8;
+ }
+
+ size = size % 8;
dst += offset;
str += offset;
# if defined(Q_COMPILER_LAMBDA) && !defined(__OPTIMIZE_SIZE__)
- return UnrollTailLoop<15>::exec(int(size), [=](int i) { dst[i] = (uchar)str[i]; });
+ return UnrollTailLoop<7>::exec(int(size), [=](int i) { dst[i] = (uchar)str[i]; });
# endif
#endif
#if defined(__mips_dsp)
@@ -572,12 +610,34 @@ static void qt_to_latin1(uchar *dst, const ushort *src, int length)
_mm_storeu_si128((__m128i*)(dst + offset), result); // store
}
- length = length % 16;
+# if !defined(__OPTIMIZE_SIZE__)
+ // we're going to write to dst[offset..offset+7] (8 bytes)
+ if (dst + offset + 7 < e) {
+ __m128i chunk = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + offset));
+ chunk = mergeQuestionMarks(chunk);
+
+ // pack, where the upper half is ignored
+ const __m128i result = _mm_packus_epi16(chunk, chunk);
+ _mm_storel_epi64(reinterpret_cast<__m128i *>(dst + offset), result);
+ offset += 8;
+ }
+
+ // we're going to write to dst[offset..offset+3] (4 bytes)
+ if (dst + offset + 3 < e) {
+ __m128i chunk = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(src + offset));
+ chunk = mergeQuestionMarks(chunk);
+
+ // pack, we'll ignore the upper three quarters
+ const __m128i result = _mm_packus_epi16(chunk, chunk);
+ qToUnaligned(_mm_cvtsi128_si32(result), dst + offset);
+ offset += 4;
+ }
+
+ length = length % 4;
dst += offset;
src += offset;
-# if defined(Q_COMPILER_LAMBDA) && !defined(__OPTIMIZE_SIZE__)
- return UnrollTailLoop<15>::exec(length, [=](int i) { dst[i] = (src[i]>0xff) ? '?' : (uchar) src[i]; });
+ return UnrollTailLoop<3>::exec(length, [=](int i) { dst[i] = (src[i]>0xff) ? '?' : (uchar) src[i]; });
# endif
#elif defined(__ARM_NEON__)
// Refer to the documentation of the SSE2 implementation
@@ -837,12 +897,11 @@ static int ucstrncmp(const QChar *a, const uchar *c, size_t l)
}
}
-# ifdef Q_PROCESSOR_X86_64
- enum { MaxTailLength = 7 };
+# if !defined(__OPTIMIZE_SIZE__)
// we'll read uc[offset..offset+7] (16 bytes) and c[offset..offset+7] (8 bytes)
if (uc + offset + 7 < e) {
// same, but we're using an 8-byte load
- __m128i chunk = _mm_cvtsi64_si128(qFromUnaligned<long long>(c + offset));
+ __m128i chunk = _mm_loadl_epi64((const __m128i*)(c + offset));
__m128i secondHalf = _mm_unpacklo_epi8(chunk, nullmask);
__m128i ucdata = _mm_loadu_si128((const __m128i*)(uc + offset));
@@ -857,17 +916,30 @@ static int ucstrncmp(const QChar *a, const uchar *c, size_t l)
// still matched
offset += 8;
}
-# else
- // 32-bit, we can't do MOVQ to load 8 bytes
- Q_UNUSED(nullmask);
- enum { MaxTailLength = 15 };
-# endif
+
+ enum { MaxTailLength = 3 };
+ // we'll read uc[offset..offset+3] (8 bytes) and c[offset..offset+3] (4 bytes)
+ if (uc + offset + 3 < e) {
+ __m128i chunk = _mm_cvtsi32_si128(qFromUnaligned<int>(c + offset));
+ __m128i secondHalf = _mm_unpacklo_epi8(chunk, nullmask);
+
+ __m128i ucdata = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(uc + offset));
+ __m128i result = _mm_cmpeq_epi8(secondHalf, ucdata);
+ uint mask = ~_mm_movemask_epi8(result);
+ if (uchar(mask)) {
+ // found a different character
+ uint idx = qCountTrailingZeroBits(mask);
+ return uc[offset + idx / 2] - c[offset + idx / 2];
+ }
+
+ // still matched
+ offset += 4;
+ }
// reset uc and c
uc += offset;
c += offset;
-# if !defined(__OPTIMIZE_SIZE__)
const auto lambda = [=](size_t i) { return uc[i] - ushort(c[i]); };
return UnrollTailLoop<MaxTailLength>::exec(e - uc, 0, lambda, lambda);
# endif
@@ -1056,8 +1128,23 @@ static int findChar(const QChar *str, int len, QChar ch, int from,
}
}
-# if defined(Q_COMPILER_LAMBDA) && !defined(__OPTIMIZE_SIZE__)
- return UnrollTailLoop<7>::exec(e - n, -1,
+# if !defined(__OPTIMIZE_SIZE__)
+ // we're going to read n[0..3] (8 bytes)
+ if (e - n > 3) {
+ __m128i data = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(n));
+ __m128i result = _mm_cmpeq_epi16(data, mch);
+ uint mask = _mm_movemask_epi8(result);
+ if (uchar(mask)) {
+ // found a match
+ // same as: return n - s + _bit_scan_forward(mask) / 2
+ return (reinterpret_cast<const char *>(n) - reinterpret_cast<const char *>(s)
+ + qCountTrailingZeroBits(mask)) >> 1;
+ }
+
+ n += 4;
+ }
+
+ return UnrollTailLoop<3>::exec(e - n, -1,
[=](int i) { return n[i] == c; },
[=](int i) { return n - s + i; });
# endif