Optimize the Latin1 conversion code in the JSON parser with SSE2

This also reduces the number of variables modified in each loop from two (l and i) to one (just i) and avoids calling str.length() all the time. Those should be no-op changes, but why not help the optimizer?

Change-Id: I8895c35f84d545dba45bffff13bc4147ae53eaab
Reviewed-by: Lars Knoll <lars.knoll@digia.com>
author: Thiago Macieira <thiago.macieira@intel.com> 2015-01-23 23:07:34 -0800
committer: Thiago Macieira <thiago.macieira@intel.com> 2015-02-15 06:35:38 +0000
commit: 8d9d2a7b8568c4c067252e6ca9eea45de7a743a5 (patch)
tree: 35921ca48095680b360b535be55747d53bed1195 /src/corelib/json
parent: dbd21d087954bef315f8ec60535aa81b94680c22 (diff)
1 files changed, 28 insertions, 5 deletions
diff --git a/src/corelib/json/qjson_p.h b/src/corelib/json/qjson_p.h
index c2863f41a1..b8430cabf5 100644
--- a/src/corelib/json/qjson_p.h
+++ b/src/corelib/json/qjson_p.h
@@ -54,6 +54,8 @@
 #include <qendian.h>
 #include <qnumeric.h>
 
+#include "private/qsimd_p.h"
+
 #include <limits.h>
 #include <limits>
 
@@ -374,13 +376,34 @@ public:
 
     inline Latin1String &operator=(const QString &str)
     {
-        d->length = str.length();
+        int len = d->length = str.length();
         uchar *l = (uchar *)d->latin1;
         const ushort *uc = (const ushort *)str.unicode();
-        for (int i = 0; i < str.length(); ++i)
-            *l++ = uc[i];
-        while ((quintptr)l & 0x3)
-            *l++ = 0;
+        int i = 0;
+#ifdef __SSE2__
+        for ( ; i + 16 < len; i += 16) {
+            __m128i chunk1 = _mm_loadu_si128((__m128i*)&uc[i]); // load
+            __m128i chunk2 = _mm_loadu_si128((__m128i*)&uc[i + 8]); // load
+            // pack the two vector to 16 x 8bits elements
+            const __m128i result = _mm_packus_epi16(chunk1, chunk2);
+            _mm_storeu_si128((__m128i*)&l[i], result); // store
+        }
+#  ifdef Q_PROCESSOR_X86_64
+        // we can do one more round, of 8 characters
+        if (i + 8 < len) {
+            __m128i chunk = _mm_loadu_si128((__m128i*)&uc[i]); // load
+            // pack with itself, we'll discard the high part anyway
+            chunk = _mm_packus_epi16(chunk, chunk);
+            // unaligned 64-bit store
+            *(quint64*)&l[i] = _mm_cvtsi128_si64(chunk);
+            i += 8;
+        }
+#  endif
+#endif
+        for ( ; i < len; ++i)
+            l[i] = uc[i];
+        for ( ; (quintptr)(l+i) & 0x3; ++i)
+            l[i] = 0;
         return *this;
     }
author	Thiago Macieira <thiago.macieira@intel.com>	2015-01-23 23:07:34 -0800
committer	Thiago Macieira <thiago.macieira@intel.com>	2015-02-15 06:35:38 +0000
commit	8d9d2a7b8568c4c067252e6ca9eea45de7a743a5 (patch)
tree	35921ca48095680b360b535be55747d53bed1195 /src/corelib/json
parent	dbd21d087954bef315f8ec60535aa81b94680c22 (diff)