2 files changed, 163 insertions, 20 deletions
diff --git a/src/corelib/codecs/qutfcodec.cpp b/src/corelib/codecs/qutfcodec.cpp
index b0e0b3f010..20bacb1584 100644
--- a/src/corelib/codecs/qutfcodec.cpp
+++ b/src/corelib/codecs/qutfcodec.cpp
@@ -45,10 +45,97 @@
 #include "qendian.h"
 #include "qchar.h"
 
+#include "private/qsimd_p.h"
+
 QT_BEGIN_NAMESPACE
 
 enum { Endian = 0, Data = 1 };
 
+#if defined(__SSE2__) && defined(QT_COMPILER_SUPPORTS_SSE2)
+static inline bool simdEncodeAscii(uchar *&dst, const ushort *&nextAscii, const ushort *&src, const ushort *end)
+{
+    // do sixteen characters at a time
+    for ( ; end - src >= 16; src += 16, dst += 16) {
+        __m128i data1 = _mm_loadu_si128((__m128i*)src);
+        __m128i data2 = _mm_loadu_si128(1+(__m128i*)src);
+
+
+        // check if everything is ASCII
+        // the highest ASCII value is U+007F
+        // Do the packing directly:
+        // The PACKUSWB instruction has packs a signed 16-bit integer to an unsigned 8-bit
+        // with saturation. That is, anything from 0x0100 to 0x7fff is saturated to 0xff,
+        // while all negatives (0x8000 to 0xffff) get saturated to 0x00. To detect non-ASCII,
+        // we simply do a signed greater-than comparison to 0x00. That means we detect NULs as
+        // "non-ASCII", but it's an acceptable compromise.
+        __m128i packed = _mm_packus_epi16(data1, data2);
+        __m128i nonAscii = _mm_cmpgt_epi8(packed, _mm_setzero_si128());
+
+        // n will contain 1 bit set per character in [data1, data2] that is non-ASCII (or NUL)
+        ushort n = ~_mm_movemask_epi8(nonAscii);
+        if (n) {
+            // copy the front part that is still ASCII
+            while (!(n & 1)) {
+                *dst++ = *src++;
+                n >>= 1;
+            }
+
+            // find the next probable ASCII character
+            // we don't want to load 32 bytes again in this loop if we know there are non-ASCII
+            // characters still coming
+            n = _bit_scan_reverse(n);
+            nextAscii = src + n;
+            return false;
+        }
+
+        // pack
+        _mm_storeu_si128((__m128i*)dst, packed);
+    }
+    return src == end;
+}
+
+static inline bool simdDecodeAscii(ushort *&dst, const uchar *&nextAscii, const uchar *&src, const uchar *end)
+{
+    // do sixteen characters at a time
+    for ( ; end - src >= 16; src += 16, dst += 16) {
+        __m128i data = _mm_loadu_si128((__m128i*)src);
+
+        // check if everything is ASCII
+        // movemask extracts the high bit of every byte, so n is non-zero if something isn't ASCII
+        uint n = _mm_movemask_epi8(data);
+        if (n) {
+            // copy the front part that is still ASCII
+            while (!(n & 1)) {
+                *dst++ = *src++;
+                n >>= 1;
+            }
+
+            // find the next probable ASCII character
+            // we don't want to load 16 bytes again in this loop if we know there are non-ASCII
+            // characters still coming
+            n = _bit_scan_reverse(n);
+            nextAscii = src + n;
+            return false;
+        }
+
+        // unpack
+        _mm_storeu_si128((__m128i*)dst, _mm_unpacklo_epi8(data, _mm_setzero_si128()));
+        _mm_storeu_si128(1+(__m128i*)dst, _mm_unpackhi_epi8(data, _mm_setzero_si128()));
+    }
+    return src == end;
+}
+#else
+static inline bool simdEncodeAscii(uchar *, const ushort *, const ushort *, const ushort *)
+{
+    return false;
+}
+
+static inline bool simdDecodeAscii(ushort *, const uchar *, const uchar *, const uchar *)
+{
+    return false;
+}
+#endif
+
 QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len)
 {
     // create a QByteArray with the worst case scenario size
@@ -58,12 +145,18 @@ QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len)
     const ushort *const end = src + len;
 
     while (src != end) {
-        ushort uc = *src++;
-        int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(uc, dst, src, end);
-        if (res < 0) {
-            // encoding error - append '?'
-            *dst++ = '?';
-        }
+        const ushort *nextAscii = end;
+        if (simdEncodeAscii(dst, nextAscii, src, end))
+            break;
+
+        do {
+            ushort uc = *src++;
+            int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(uc, dst, src, end);
+            if (res < 0) {
+                // encoding error - append '?'
+                *dst++ = '?';
+            }
+        } while (src < nextAscii);
     }
 
     result.truncate(dst - reinterpret_cast<uchar *>(const_cast<char *>(result.constData())));
@@ -98,10 +191,21 @@ QByteArray QUtf8::convertFromUnicode(const QChar *uc, int len, QTextCodec::Conve
         *cursor++ = 0xbf;
     }
 
+    const ushort *nextAscii = src;
     while (src != end) {
-        ushort uc = surrogate_high == -1 ? *src++ : surrogate_high;
-        surrogate_high = -1;
-        int res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(uc, cursor, src, end);
+        int res;
+        ushort uc;
+        if (surrogate_high != -1) {
+            uc = surrogate_high;
+            surrogate_high = -1;
+            res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(uc, cursor, src, end);
+        } else {
+            if (src >= nextAscii && simdEncodeAscii(cursor, nextAscii, src, end))
+                break;
+
+            uc = *src++;
+            res = QUtf8Functions::toUtf8<QUtf8BaseTraits>(uc, cursor, src, end);
+        }
         if (Q_LIKELY(res >= 0))
             continue;
 
@@ -136,12 +240,18 @@ QString QUtf8::convertToUnicode(const char *chars, int len)
     const uchar *end = src + len;
 
     while (src < end) {
-        uchar b = *src++;
-        int res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, dst, src, end);
-        if (res < 0) {
-            // decoding error
-            *dst++ = QChar::ReplacementCharacter;
-        }
+        const uchar *nextAscii = end;
+        if (simdDecodeAscii(dst, nextAscii, src, end))
+            break;
+
+        do {
+            uchar b = *src++;
+            int res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(b, dst, src, end);
+            if (res < 0) {
+                // decoding error
+                *dst++ = QChar::ReplacementCharacter;
+            }
+        } while (src < nextAscii);
     }
 
     result.truncate(dst - reinterpret_cast<const ushort *>(result.constData()));
@@ -204,7 +314,11 @@ QString QUtf8::convertToUnicode(const char *chars, int len, QTextCodec::Converte
 
     // main body, stateless decoding
     res = 0;
+    const uchar *nextAscii = src;
     while (res >= 0 && src < end) {
+        if (src >= nextAscii && simdDecodeAscii(dst, nextAscii, src, end))
+            break;
+
         ch = *src++;
         res = QUtf8Functions::fromUtf8<QUtf8BaseTraits>(ch, dst, src, end);
         if (!headerdone && res >= 0) {
diff --git a/src/corelib/tools/qsimd_p.h b/src/corelib/tools/qsimd_p.h
index b01c47d4ce..1e428b6aeb 100644
--- a/src/corelib/tools/qsimd_p.h
+++ b/src/corelib/tools/qsimd_p.h
@@ -72,7 +72,7 @@
  * I = intrinsics; C = code generation
  */
 
-#ifdef __MINGW64_VERSION_MAJOR
+#if defined(__MINGW64_VERSION_MAJOR) || (defined(Q_CC_MSVC) && !defined(Q_OS_WINCE))
 #include <intrin.h>
 #endif
 
@@ -139,10 +139,15 @@
 #endif
 
 // other x86 intrinsics
-#if defined(QT_COMPILER_SUPPORTS_AVX) && defined(Q_CC_GNU) && \
-    (!defined(Q_CC_INTEL)|| __INTEL_COMPILER >= 1310 || (__GNUC__ * 100 + __GNUC_MINOR__ < 407))
-#define QT_COMPILER_SUPPORTS_X86INTRIN
-#include <x86intrin.h>
+#if defined(Q_PROCESSOR_X86) && ((defined(Q_CC_GNU) && (__GNUC__ * 100 + __GNUC_MINOR__ >= 404)) \
+    || (defined(Q_CC_CLANG) && (__clang_major__ * 100 + __clang_minor__ >= 208)) \
+    || defined(Q_CC_INTEL))
+#  define QT_COMPILER_SUPPORTS_X86INTRIN
+#  ifndef Q_CC_INTEL
+// The Intel compiler has no <x86intrin.h> -- all intrinsics are in <immintrin.h>;
+// GCC 4.4 and Clang 2.8 added a few more intrinsics there
+#    include <x86intrin.h>
+#  endif
 #endif
 
 // NEON intrinsics
@@ -241,6 +246,30 @@ static inline uint qCpuFeatures()
 
 #define qCpuHasFeature(feature)  ((qCompilerCpuFeatures & (feature)) || (qCpuFeatures() & (feature)))
 
+#ifdef Q_PROCESSOR_X86
+// Bit scan functions for x86
+#  ifdef Q_CC_MSVC
+// MSVC calls it _BitScanReverse and returns the carry flag, which we don't need
+static __forceinline unsigned long _bit_scan_reverse(uint val)
+{
+    unsigned long result;
+    _BitScanReverse(&result, val);
+    return result;
+}
+#  elif (defined(Q_CC_CLANG) || (defined(Q_CC_GNU) && __GNUC__ * 100 + __GNUC_MINOR__ < 405)) \
+    && !defined(Q_CC_INTEL)
+// Clang is missing the intrinsic for _bit_scan_reverse
+// GCC only added it in version 4.5
+static inline __attribute__((always_inline))
+unsigned _bit_scan_reverse(unsigned val)
+{
+    unsigned result;
+    asm("bsr %1, %0" : "=r" (result) : "r" (val));
+    return result;
+}
+#  endif
+#endif // Q_PROCESSOR_X86
+
 #define ALIGNMENT_PROLOGUE_16BYTES(ptr, i, length) \
     for (; i < static_cast<int>(qMin(static_cast<quintptr>(length), ((4 - ((reinterpret_cast<quintptr>(ptr) >> 2) & 0x3)) & 0x3))); ++i)