QHash: rewrite the x86 aeshash function for len >= 16

The loop for 32 bytes is left unchanged, but the tail operation for 16 to 31 bytes is replaced with an overlapped load-and-scramble. This should make the operation even faster. Also updated the key creation back to something similar to what Go does. This massively improves performance as well as the bit spread. Histogram for the bits in the hash value for the testcase from QTBUG-91739: || Bit || Before || After || | 0 | 35.0300% | 50.4800% | | 1 | 42.5250% | 50.2400% | | 2 | 46.0100% | 50.0000% | | 3 | 67.5150% | 49.9400% | | 4 | 56.5150% | 50.0000% | | 5 | 51.9950% | 50.0000% | | 6 | 58.9800% | 50.1400% | | 7 | 55.9550% | 50.0000% | | 8 | 41.9850% | 49.9200% | | 9 | 69.9700% | 49.6400% | | 10 | 68.4950% | 50.0000% | | 11 | 37.4950% | 50.3000% | | 12 | 61.9950% | 49.8200% | | 13 | 53.4900% | 50.0000% | | 14 | 63.0200% | 49.9800% | | 15 | 54.9700% | 50.1000% | Task-number: QTBUG-91739 Pick-to: 6.2.3 6.2 6.3 Change-Id: Icad7c1bad46a449c8e8afffd16cb7fe7ffd3584f Reviewed-by: Qt CI Bot <qt_ci_bot@qt-project.org> Reviewed-by: Allan Sandfeld Jensen <allan.jensen@qt.io>
author: Thiago Macieira <thiago.macieira@intel.com> 2022-01-18 11:39:28 -0800
committer: Thiago Macieira <thiago.macieira@intel.com> 2022-01-21 14:08:01 -0800
commit: 2549f189951d47d7487c58c4e799bdeb487fa068 (patch)
tree: 8ff53916397c7b12da1af943ed0ac2194e8f7906 /src/corelib/tools
parent: b5d480d01d9f8a3cd97c614f579008ac137c9174 (diff)
1 files changed, 54 insertions, 52 deletions
diff --git a/src/corelib/tools/qhash.cpp b/src/corelib/tools/qhash.cpp
index c6373ee11d..c8cf1a496e 100644
--- a/src/corelib/tools/qhash.cpp
+++ b/src/corelib/tools/qhash.cpp
@@ -516,21 +516,9 @@ static uint siphash(const uint8_t *in, uint inlen, uint seed, uint seed2)
 QT_FUNCTION_TARGET(AES)
 static size_t aeshash(const uchar *p, size_t len, size_t seed, size_t seed2) noexcept
 {
-    __m128i key;
-    if (sizeof(size_t) == 8) {
-#ifdef Q_PROCESSOR_X86_64
-        __m128i mseed = _mm_cvtsi64_si128(seed);
-        key = _mm_insert_epi64(mseed, seed2, 1);
-#endif
-    } else {
-        __m128i mseed = _mm_cvtsi32_si128(int(seed));
-        key = _mm_insert_epi32(mseed, int(seed2), 1);
-        key = _mm_unpacklo_epi64(key, key);
-    }
-
     // This is inspired by the algorithm in the Go language. See:
-    // https://github.com/golang/go/blob/894abb5f680c040777f17f9f8ee5a5ab3a03cb94/src/runtime/asm_386.s#L902
-    // https://github.com/golang/go/blob/894abb5f680c040777f17f9f8ee5a5ab3a03cb94/src/runtime/asm_amd64.s#L903
+    // https://github.com/golang/go/blob/01b6cf09fc9f272d9db3d30b4c93982f4911d120/src/runtime/asm_amd64.s#L1105
+    // https://github.com/golang/go/blob/01b6cf09fc9f272d9db3d30b4c93982f4911d120/src/runtime/asm_386.s#L908
     //
     // Even though we're using the AESENC instruction from the CPU, this code
     // is not encryption and this routine makes no claim to be
@@ -548,50 +536,64 @@ static size_t aeshash(const uchar *p, size_t len, size_t seed, size_t seed2) noe
         state0 = _mm_aesenc_si128(state0, state0);
     };
 
-    __m128i state0 = key;
-    auto src = reinterpret_cast<const __m128i *>(p);
+    // hash twice 16 bytes, running 2 scramble rounds of AES on itself
+    const auto hash2x16bytes = [](__m128i &state0, __m128i &state1, const __m128i *src0,
+            const __m128i *src1) QT_FUNCTION_TARGET(AES) {
+        __m128i data0 = _mm_loadu_si128(src0);
+        __m128i data1 = _mm_loadu_si128(src1);
+        state0 = _mm_xor_si128(data0, state0);
+        state1 = _mm_xor_si128(data1, state1);
+        state0 = _mm_aesenc_si128(state0, state0);
+        state1 = _mm_aesenc_si128(state1, state1);
+        state0 = _mm_aesenc_si128(state0, state0);
+        state1 = _mm_aesenc_si128(state1, state1);
+    };
 
-    if (len < 16)
-        goto lt16;
-    if (len < 32)
-        goto lt32;
+    __m128i mseed, mseed2;
+    if (sizeof(size_t) == 8) {
+#ifdef Q_PROCESSOR_X86_64
+        mseed = _mm_cvtsi64_si128(seed);
+        mseed2 = _mm_set1_epi64x(seed2);
+#endif
+    } else {
+        mseed = _mm_cvtsi32_si128(int(seed));
+        mseed2 = _mm_set1_epi32(int(seed2));
+    }
 
-    // rounds of 32 bytes
-    {
-        // Make state1 = ~state0:
-        __m128i one = _mm_cmpeq_epi64(key, key);
-        __m128i state1 = _mm_xor_si128(state0, one);
+    // mseed (epi16) = [ seed, seed >> 16, seed >> 32, seed >> 48, len, 0, 0, 0 ]
+    mseed = _mm_insert_epi16(mseed, short(seed), 4);
+    // mseed (epi16) = [ seed, seed >> 16, seed >> 32, seed >> 48, len, len, len, len ]
+    mseed = _mm_shufflehi_epi16(mseed, 0);
+
+    // merge with the process-global seed
+    __m128i key = _mm_xor_si128(mseed, mseed2);
+
+    // scramble the key
+    __m128i state0 = _mm_aesenc_si128(key, key);
+
+    auto src = reinterpret_cast<const __m128i *>(p);
+    if (len >= sizeof(__m128i)) {
+        // unlike the Go code, we don't have more per-process seed
+        __m128i state1 = _mm_aesenc_si128(state0, mseed2);
 
-        // do simplified rounds of 32 bytes: unlike the Go code, we only
-        // scramble twice and we keep 256 bits of state
         const auto srcend = reinterpret_cast<const __m128i *>(p + len);
-        while (src + 2 <= srcend) {
-            __m128i data0 = _mm_loadu_si128(src);
-            __m128i data1 = _mm_loadu_si128(src + 1);
-            state0 = _mm_xor_si128(data0, state0);
-            state1 = _mm_xor_si128(data1, state1);
-            state0 = _mm_aesenc_si128(state0, state0);
-            state1 = _mm_aesenc_si128(state1, state1);
-            state0 = _mm_aesenc_si128(state0, state0);
-            state1 = _mm_aesenc_si128(state1, state1);
-            src += 2;
-        }
-        state0 = _mm_xor_si128(state0, state1);
-    }
-    len &= 0x1f;
 
-    // do we still have 16 or more bytes?
-    if (len & 0x10) {
-lt32:
-        __m128i data = _mm_loadu_si128(src);
-        hash16bytes(state0, data);
-        ++src;
-    }
-    len &= 0xf;
+        // main loop: scramble two 16-byte blocks
+        for ( ; src + 2 < srcend; src += 2)
+            hash2x16bytes(state0, state1, src, src + 1);
+
+        if (src + 1 < srcend) {
+            // epilogue: between 16 and 31 bytes
+            hash2x16bytes(state0, state1, src, srcend - 1);
+        } else if (src != srcend) {
+            // epilogue: between 1 and 16 bytes, overlap with the end
+            __m128i data = _mm_loadu_si128(srcend - 1);
+            hash16bytes(state0, data);
+        }
 
-lt16:
-    if (len) {
-        // load the last chunk of data
+        // combine results:
+        state0 = _mm_xor_si128(state0, state1);
+    } else if (len) {
         // We're going to load 16 bytes and mask zero the part we don't care
         // (the hash of a short string is different from the hash of a longer
         // including NULLs at the end because the length is in the key)
author	Thiago Macieira <thiago.macieira@intel.com>	2022-01-18 11:39:28 -0800
committer	Thiago Macieira <thiago.macieira@intel.com>	2022-01-21 14:08:01 -0800
commit	2549f189951d47d7487c58c4e799bdeb487fa068 (patch)
tree	8ff53916397c7b12da1af943ed0ac2194e8f7906 /src/corelib/tools
parent	b5d480d01d9f8a3cd97c614f579008ac137c9174 (diff)