diff options
author | Thiago Macieira <thiago.macieira@intel.com> | 2013-10-26 15:53:40 -0400 |
---|---|---|
committer | The Qt Project <gerrit-noreply@qt-project.org> | 2014-01-31 21:51:08 +0100 |
commit | 34821e226a94858480e57bb25ac7655bfd19f1e6 (patch) | |
tree | ac70c9067992ec4f71ac6bec022c9379187c67ae /src/corelib/tools/qsimd_p.h | |
parent | bc91dc48952680c0652a80e15ab3f0b8f87b6374 (diff) |
Add support for UTF-8 encoding/decoding with SIMD
Decoding from UTF-8 is easy: if the high bit is set, we fall back to
the byte-by-byte decoding. Encoding to UTF-8 requires a little bit
more work: to detect anything between 0x0080 and 0xffff, we have
several options but none as easy as above. Multiple alternatives are
in the benchmark code.
In both loops, we do two things once we run into a non-ASCII
character: first, we continue the loop for the remainder of ASCII
characters in the buffer (which we can tell by checking the bits set
in the mask), then we find the last non-ASCII character in that
16-character group, so we don't reenter the SSE code too soon.
For the UTF-8 encoding, I have chosen the alternative that results in
the best performance. It is nearly tied with the alternative that uses
the PMIN instruction, but that one requires SSE 4.1, so it is not worth
the complexity. And quite counter-intuitively, the dedicated string
instruction from SSE 4.2 performs the most poorly of all solutions. This
calls for revisiting the performance of the toLatin1 encoder.
The best of 10 benchmark runs of this code was measured on my
SandyBridge CPU @ 2.66 GHz (turbo @ 3.3 GHz), both as CPU cycles and
as CPU ticks:
Compared to:

| Data set | ICU fromUtf8 | ICU toUtf8 | Qt 4.7 fromUtf8 | Qt 4.7 toUtf8 | non-SSE Qt 5.3 fromUtf8 | non-SSE Qt 5.3 toUtf8 |
|---|---|---|---|---|---|---|
| ASCII only | 7.50x | 6.22x | 6.94x | 7.60x | 4.45x | 4.90x |
| 2-char UTF-8 | 1.17x | 1.33x | 1.64x | 1.56x | 1.01x | 1.02x |
| 3-char UTF-8 | 1.08x | 1.18x | 1.48x | 1.33x | 0.97x | 0.92x |
| 4-char UTF-8 | 1.05x | 1.19x | 1.20x | 1.21x | 0.97x | 0.97x |
| Creator data | 3.62x | 2.16x | 2.60x | 1.25x | 1.78x | 1.23x |
As shown by the numbers, the SSE-based code is slightly worse than the
non-SSE code for dense non-ASCII strings. However, as evident in the
Qt Creator data, most strings manipulated by applications are either
pure ASCII or mostly so, so there's a net gain.
Done-with: H. Peter Anvin <hpa@linux.intel.com>
Change-Id: Ia74fbdfdcd7b088f6cba5048c03a153c01f5dbc1
Reviewed-by: Lars Knoll <lars.knoll@digia.com>
Diffstat (limited to 'src/corelib/tools/qsimd_p.h')
-rw-r--r-- | src/corelib/tools/qsimd_p.h | 39 |
1 files changed, 34 insertions, 5 deletions
diff --git a/src/corelib/tools/qsimd_p.h b/src/corelib/tools/qsimd_p.h index b01c47d4ce..1e428b6aeb 100644 --- a/src/corelib/tools/qsimd_p.h +++ b/src/corelib/tools/qsimd_p.h @@ -72,7 +72,7 @@ * I = intrinsics; C = code generation */ -#ifdef __MINGW64_VERSION_MAJOR +#if defined(__MINGW64_VERSION_MAJOR) || (defined(Q_CC_MSVC) && !defined(Q_OS_WINCE)) #include <intrin.h> #endif @@ -139,10 +139,15 @@ #endif // other x86 intrinsics -#if defined(QT_COMPILER_SUPPORTS_AVX) && defined(Q_CC_GNU) && \ - (!defined(Q_CC_INTEL)|| __INTEL_COMPILER >= 1310 || (__GNUC__ * 100 + __GNUC_MINOR__ < 407)) -#define QT_COMPILER_SUPPORTS_X86INTRIN -#include <x86intrin.h> +#if defined(Q_PROCESSOR_X86) && ((defined(Q_CC_GNU) && (__GNUC__ * 100 + __GNUC_MINOR__ >= 404)) \ + || (defined(Q_CC_CLANG) && (__clang_major__ * 100 + __clang_minor__ >= 208)) \ + || defined(Q_CC_INTEL)) +# define QT_COMPILER_SUPPORTS_X86INTRIN +# ifndef Q_CC_INTEL +// The Intel compiler has no <x86intrin.h> -- all intrinsics are in <immintrin.h>; +// GCC 4.4 and Clang 2.8 added a few more intrinsics there +# include <x86intrin.h> +# endif #endif // NEON intrinsics @@ -241,6 +246,30 @@ static inline uint qCpuFeatures() #define qCpuHasFeature(feature) ((qCompilerCpuFeatures & (feature)) || (qCpuFeatures() & (feature))) +#ifdef Q_PROCESSOR_X86 +// Bit scan functions for x86 +# ifdef Q_CC_MSVC +// MSVC calls it _BitScanReverse and returns the carry flag, which we don't need +static __forceinline unsigned long _bit_scan_reverse(uint val) +{ + unsigned long result; + _BitScanReverse(&result, val); + return result; +} +# elif (defined(Q_CC_CLANG) || (defined(Q_CC_GNU) && __GNUC__ * 100 + __GNUC_MINOR__ < 405)) \ + && !defined(Q_CC_INTEL) +// Clang is missing the intrinsic for _bit_scan_reverse +// GCC only added it in version 4.5 +static inline __attribute__((always_inline)) +unsigned _bit_scan_reverse(unsigned val) +{ + unsigned result; + asm("bsr %1, %0" : "=r" (result) : "r" (val)); + return result; +} +# endif +#endif // 
Q_PROCESSOR_X86 + #define ALIGNMENT_PROLOGUE_16BYTES(ptr, i, length) \ for (; i < static_cast<int>(qMin(static_cast<quintptr>(length), ((4 - ((reinterpret_cast<quintptr>(ptr) >> 2) & 0x3)) & 0x3))); ++i) |