diff options
author | Thiago Macieira <thiago.macieira@intel.com> | 2013-12-12 22:41:04 -0800 |
---|---|---|
committer | The Qt Project <gerrit-noreply@qt-project.org> | 2014-01-31 21:51:11 +0100 |
commit | f32a5b158f3929a8f391240b4f21dde1db294637 (patch) | |
tree | 21b41aa36fc01c50ced58f1a34ba6064162b46cc /src/corelib/tools/qstring.cpp | |
parent | 34821e226a94858480e57bb25ac7655bfd19f1e6 (diff) |
Improve ucstrncmp with SSE2
The benchmarks showed that the basic SSE2-based building block
improves performance by about 50% with data extracted from a Qt
Creator run. None of the other alternatives provides clearly better
results -- the best was 3.8%, and with only one compiler.
Change-Id: I77314785afecfacaf21c41fd79c97cadf357f895
Reviewed-by: Lars Knoll <lars.knoll@digia.com>
Diffstat (limited to 'src/corelib/tools/qstring.cpp')
-rw-r--r-- | src/corelib/tools/qstring.cpp | 52 |
1 files changed, 52 insertions, 0 deletions
diff --git a/src/corelib/tools/qstring.cpp b/src/corelib/tools/qstring.cpp index 555f8a8c97..1dd2832ad9 100644 --- a/src/corelib/tools/qstring.cpp +++ b/src/corelib/tools/qstring.cpp @@ -102,6 +102,36 @@ QT_BEGIN_NAMESPACE +/* + * Note on the use of SIMD in qstring.cpp: + * + * Several operations with strings are improved with the use of SIMD code, + * since they are repetitive. For MIPS, we have hand-written assembly code + * outside of qstring.cpp targeting MIPS DSP and MIPS DSPr2. For ARM and for + * x86, we can only use intrinsics and therefore everything is contained in + * qstring.cpp. We need to use intrinsics only for those platforms due to the + * different compilers and toolchains used, which have different syntax for + * assembly sources. + * + * ** SSE notes: ** + * + * Whenever multiple alternatives are equivalent or near so, we prefer the one + * using instructions from SSE2, since SSE2 is guaranteed to be enabled for all + * 64-bit builds and we enable it for 32-bit builds by default. Use of higher + * SSE versions should be done when there's a clear performance benefit and + * requires fallback code to SSE2, if it exists. + * + * Performance measurement in the past shows that most strings are short in + * size and, therefore, do not benefit from alignment prologues. That is, + * trying to find a 16-byte-aligned boundary to operate on is often more + * expensive than executing the unaligned operation directly. In addition, note + * that the QString private data is designed so that the data is stored on + * 16-byte boundaries if the system malloc() returns 16-byte aligned pointers + * on its own (64-bit glibc on Linux does; 32-bit glibc on Linux returns them + * 50% of the time), so skipping the alignment prologue is actually optimizing + * for the common case. 
+ */ + // internal int qFindString(const QChar *haystack, int haystackLen, int from, const QChar *needle, int needleLen, Qt::CaseSensitivity cs); @@ -206,6 +236,28 @@ static int ucstrncmp(const QChar *a, const QChar *b, int l) l); } #endif // __mips_dsp +#ifdef __SSE2__ + const char *ptr = reinterpret_cast<const char*>(a); + qptrdiff distance = reinterpret_cast<const char*>(b) - ptr; + a += l & ~7; + b += l & ~7; + l &= 7; + + // we're going to read ptr[0..15] (16 bytes) + for ( ; ptr + 15 < reinterpret_cast<const char *>(a); ptr += 16) { + __m128i a_data = _mm_loadu_si128((__m128i*)ptr); + __m128i b_data = _mm_loadu_si128((__m128i*)(ptr + distance)); + __m128i result = _mm_cmpeq_epi16(a_data, b_data); + uint mask = ~_mm_movemask_epi8(result); + if (ushort(mask)) { + // found a different byte + uint idx = uint(_bit_scan_forward(mask)); + return reinterpret_cast<const QChar *>(ptr + idx)->unicode() + - reinterpret_cast<const QChar *>(ptr + distance + idx)->unicode(); + } + } +#endif + while (l-- && *a == *b) a++,b++; if (l==-1) |