QHash: split the x86 AES hash into three separate functions

Instead of performing decisions inside the single aeshash() function, we have three implementations instead. Those decisions are permanent for each CPU, so the branch predictor should be pretty good, but hashing is somewhat performance-sensitive. We're only adding three of the four possible combinations of AVX512VL and VAES. Excluded from the implementation are the CPUs that support AVX512 but not VAES, which are the Skylake-based ones. Those are mostly found in server CPUs (Intel Xeon Scalable line) as well as top-end workstations (Intel Core i9), but never made into general desktop and laptop parts. For those, the performance will remain what it was in Qt 6.3. VAES is supported in Intel architectures codenamed Sunny Cove and Gracemont and their successors. That means it's supported in both the E and P cores of the Intel Alder Lake (12th Generation Core), as well as future Atom lines. But neither Atoms nor hybrid CPUS have AVX512 (at least when the E cores are active). AVX512+VAES is supported for Ice Lake (10th Generation Core), Tiger Lake (11th) as well as later generation with AVX512 support enabled. Like in qstring.cpp, we restricted ourselves to 256-bit operations, which don't cause performance impact and because the 512-bit VAESENC operates on the fused Ports 0 and 1, so it has the exact same throughput as two 256-bit VAESENC. Change-Id: I6fcda969a9e9427198bffffd16cece9c37dbdbd3 Reviewed-by: Allan Sandfeld Jensen <allan.jensen@qt.io>
author: Thiago Macieira <thiago.macieira@intel.com> 2022-01-29 09:41:24 -0800
committer: Thiago Macieira <thiago.macieira@intel.com> 2022-02-20 16:53:31 -0800
commit: 4be85491e069081cbc9dc29202a25d0771b61f06 (patch)
tree: e81362c7c329964e1859104f5d399657138233ef /src/corelib/tools
parent: b313a5ec323b4a54423bca8b03bae5f8ce372793 (diff)
1 files changed, 45 insertions, 27 deletions
diff --git a/src/corelib/tools/qhash.cpp b/src/corelib/tools/qhash.cpp
index 62fbbca6f0..19ec0872de 100644
--- a/src/corelib/tools/qhash.cpp
+++ b/src/corelib/tools/qhash.cpp
@@ -510,6 +510,7 @@ static uint siphash(const uint8_t *in, uint inlen, uint seed, uint seed2)
 #if QT_COMPILER_SUPPORTS_HERE(AES) && QT_COMPILER_SUPPORTS_HERE(SSE4_2) && \
     !defined(QHASH_AES_SANITIZER_BUILD)
 #  define AESHASH
+#  define QT_FUNCTION_TARGET_STRING_AES_AVX2    "avx2,aes"
 #  define QT_FUNCTION_TARGET_STRING_AES_AVX512          \
     QT_FUNCTION_TARGET_STRING_ARCH_SKYLAKE_AVX512 ","   \
     QT_FUNCTION_TARGET_STRING_AES
@@ -572,6 +573,8 @@ namespace {
         __m128i mseed2;
         AESHashSeed(size_t seed, size_t seed2) QT_FUNCTION_TARGET(AES);
         __m128i state1() const QT_FUNCTION_TARGET(AES);
+        __m256i state0_256() const QT_FUNCTION_TARGET(AES_AVX2)
+        { return _mm256_set_m128i(state1(), state0); }
     };
 } // unnamed namespace
 
@@ -673,11 +676,11 @@ aeshash128_ge32(__m128i state0, __m128i state1, const __m128i *src, const __m128
 
 #  if QT_COMPILER_SUPPORTS_HERE(VAES)
 static size_t QT_FUNCTION_TARGET(ARCH_ICL) QT_VECTORCALL
-aeshash256_lt32_avx256(__m128i state0_128, __m128i state1_128, const uchar *p, size_t len)
+aeshash256_lt32_avx256(__m256i state0, const uchar *p, size_t len)
 {
+    __m128i state0_128 = _mm256_castsi256_si128(state0);
     if (len) {
         __mmask32 mask = _bzhi_u32(-1, len);
-        __m256i state0 = _mm256_set_m128i(state1_128, state0_128);
         __m256i data = _mm256_maskz_loadu_epi8(mask, p);
         __m128i data0 = _mm256_castsi256_si128(data);
         if (len >= sizeof(__m128i)) {
@@ -699,7 +702,7 @@ aeshash256_lt32_avx256(__m128i state0_128, __m128i state1_128, const uchar *p, s
 }
 
 static size_t QT_FUNCTION_TARGET(VAES) QT_VECTORCALL
-aeshash256_ge32(__m128i state0_128, __m128i state1_128, const uchar *p, size_t len)
+aeshash256_ge32(__m256i state0, const uchar *p, size_t len)
 {
     static const auto hash32bytes = [](__m256i &state0, __m256i data) QT_FUNCTION_TARGET(VAES) {
         state0 = _mm256_xor_si256(state0, data);
@@ -724,7 +727,6 @@ aeshash256_ge32(__m128i state0_128, __m128i state1_128, const uchar *p, size_t l
     auto src = reinterpret_cast<const __m256i *>(p);
     const auto srcend = reinterpret_cast<const __m256i *>(p + len);
 
-    __m256i state0 = _mm256_set_m128i(state1_128, state0_128);
     __m256i state1 = _mm256_aesenc_epi128(state0, mm256_set1_epz(len));
 
     // main loop: scramble two 32-byte blocks
@@ -748,45 +750,61 @@ aeshash256_ge32(__m128i state0_128, __m128i state1_128, const uchar *p, size_t l
     __m128i high = _mm256_extracti128_si256(state0, 1);
     return mm_cvtsi128_sz(_mm_xor_si128(low, high));
 }
-#  else
-static size_t QT_VECTORCALL aeshash256_lt32_avx256(__m256i state0, const uchar *p, size_t len)
+
+static size_t QT_FUNCTION_TARGET(VAES)
+aeshash256(const uchar *p, size_t len, size_t seed, size_t seed2) noexcept
 {
-    Q_UNREACHABLE();
-    return 0;
+    AESHashSeed state(seed, seed2);
+    auto src = reinterpret_cast<const __m128i *>(p);
+    const auto srcend = reinterpret_cast<const __m128i *>(p + len);
+
+    if (len < sizeof(__m128i))
+        return aeshash128_lt16(state.state0, p, len);
+
+    if (len <= sizeof(__m256i))
+        return aeshash128_16to32(state.state0, state.state1(), src, srcend);
+
+    return aeshash256_ge32(state.state0_256(), p, len);
 }
 
-static size_t QT_VECTORCALL aeshash256_ge32(__m128i, __m128i, const uchar *, size_t)
+static size_t QT_FUNCTION_TARGET(VAES_AVX512)
+aeshash256_avx256(const uchar *p, size_t len, size_t seed, size_t seed2) noexcept
 {
-    Q_UNREACHABLE();
-    return 0;
+    AESHashSeed state(seed, seed2);
+    if (len <= sizeof(__m256i))
+        return aeshash256_lt32_avx256(state.state0_256(), p, len);
+
+    return aeshash256_ge32(state.state0_256(), p, len);
 }
 #  endif // VAES
 
-QT_FUNCTION_TARGET(AES)
-static size_t aeshash(const uchar *p, size_t len, size_t seed, size_t seed2) noexcept
+static size_t QT_FUNCTION_TARGET(VAES)
+aeshash128(const uchar *p, size_t len, size_t seed, size_t seed2) noexcept
 {
     AESHashSeed state(seed, seed2);
-    bool useOpMaskLoad = qCpuHasFeature(AVX512VL);
-    bool useVaes = false;
-#  if QT_COMPILER_SUPPORTS_HERE(VAES)
-    useVaes = qCpuHasFeature(VAES);
-#  endif
-
     auto src = reinterpret_cast<const __m128i *>(p);
     const auto srcend = reinterpret_cast<const __m128i *>(p + len);
 
-    if (len <= sizeof(__m256i)) {
-        if (useOpMaskLoad && useVaes)
-            return aeshash256_lt32_avx256(state.state0, state.state1(), p, len);
-        if (len >= sizeof(__m128i))
-            return aeshash128_16to32(state.state0, state.state1(), src, srcend);
+    if (len < sizeof(__m128i))
         return aeshash128_lt16(state.state0, p, len);
-    }
 
-    if (useVaes)
-        return aeshash256_ge32(state.state0, state.state1(), p, len);
+    if (len <= sizeof(__m256i))
+        return aeshash128_16to32(state.state0, state.state1(), src, srcend);
+
     return aeshash128_ge32(state.state0, state.state1(), src, srcend);
 }
+
+static size_t aeshash(const uchar *p, size_t len, size_t seed, size_t seed2) noexcept
+{
+#  if QT_COMPILER_SUPPORTS_HERE(VAES)
+    if (qCpuHasFeature(VAES)) {
+        if (qCpuHasFeature(AVX512VL))
+            return aeshash256_avx256(p, len, seed, seed2);
+        return aeshash256(p, len, seed, seed2);
+    }
+#  endif
+    return aeshash128(p, len, seed, seed2);
+}
 #endif // x86 AESNI
 
 #if defined(Q_PROCESSOR_ARM) && QT_COMPILER_SUPPORTS_HERE(AES) && !defined(QHASH_AES_SANITIZER_BUILD) && !defined(QT_BOOTSTRAPPED)
author	Thiago Macieira <thiago.macieira@intel.com>	2022-01-29 09:41:24 -0800
committer	Thiago Macieira <thiago.macieira@intel.com>	2022-02-20 16:53:31 -0800
commit	4be85491e069081cbc9dc29202a25d0771b61f06 (patch)
tree	e81362c7c329964e1859104f5d399657138233ef /src/corelib/tools
parent	b313a5ec323b4a54423bca8b03bae5f8ce372793 (diff)