From da1720485eee265c40b3832dc92c19100891d86f Mon Sep 17 00:00:00 2001 From: Thiago Macieira Date: Sat, 29 Jan 2022 08:20:25 -0800 Subject: QHash: add support for VAES and AVX512VL The strategy is explained in the aeshash() function. I've chosen to exclude the case of simultaneous VAES and AVX512VL support for len <= 32 case. Instead, the aeshash128_lt32_avx256() does not attempt to use VAES, because we wouldn't be getting sufficient benefit at the cost of code expansion (AESENC can dispatch 2 per cycle). See simulation at https://analysis.godbolt.org/z/8Y54PMWGj. The code is slightly convoluted with unexpected indentation so most of the important lines in the algorithm aren't changed by this commit. Change-Id: I6fcda969a9e9427198bffffd16ceca30f5e924b5 Reviewed-by: Thiago Macieira --- src/corelib/tools/qhash.cpp | 221 ++++++++++++++++++++++++++++++++++++-------- 1 file changed, 185 insertions(+), 36 deletions(-) (limited to 'src/corelib/tools') diff --git a/src/corelib/tools/qhash.cpp b/src/corelib/tools/qhash.cpp index aca39c1be6..2a50c7abf6 100644 --- a/src/corelib/tools/qhash.cpp +++ b/src/corelib/tools/qhash.cpp @@ -510,12 +510,26 @@ static uint siphash(const uint8_t *in, uint inlen, uint seed, uint seed2) #if QT_COMPILER_SUPPORTS_HERE(AES) && QT_COMPILER_SUPPORTS_HERE(SSE4_2) && \ !defined(QHASH_AES_SANITIZER_BUILD) # define AESHASH +# define QT_FUNCTION_TARGET_STRING_AES_AVX512 \ + QT_FUNCTION_TARGET_STRING_ARCH_SKYLAKE_AVX512 "," \ + QT_FUNCTION_TARGET_STRING_AES +# define QT_FUNCTION_TARGET_STRING_VAES_AVX512 \ + QT_FUNCTION_TARGET_STRING_ARCH_SKYLAKE_AVX512 "," \ + QT_FUNCTION_TARGET_STRING_VAES +# undef QHASH_AES_SANITIZER_BUILD +# if QT_POINTER_SIZE == 8 +# define mm_set1_epz _mm_set1_epi64x +# define mm_cvtsz_si128 _mm_cvtsi64_si128 +# define mm_cvtsi128_sz _mm_cvtsi128_si64 +# define mm256_set1_epz _mm256_set1_epi64x +# else +# define mm_set1_epz _mm_set1_epi32 +# define mm_cvtepz_si128 _mm_cvtsi32_si128 +# define mm_cvtsi128_sz _mm_cvtsi128_si32 +# define mm256_set1_epz _mm256_set1_epi32 +# endif -#undef QHASH_AES_SANITIZER_BUILD - -QT_FUNCTION_TARGET(AES) -static size_t aeshash(const uchar *p, size_t len, size_t seed, size_t seed2) noexcept -{ +namespace { // This is inspired by the algorithm in the Go language. See: // https://github.com/golang/go/blob/01b6cf09fc9f272d9db3d30b4c93982f4911d120/src/runtime/asm_amd64.s#L1105 // https://github.com/golang/go/blob/01b6cf09fc9f272d9db3d30b4c93982f4911d120/src/runtime/asm_386.s#L908 @@ -529,16 +543,19 @@ static size_t aeshash(const uchar *p, size_t len, size_t seed, size_t seed2) noe // [1] https://en.wikipedia.org/wiki/Advanced_Encryption_Standard#High-level_description_of_the_algorithm // hash 16 bytes, running 3 scramble rounds of AES on itself (like label "final1") - const auto hash16bytes = [](__m128i &state0, __m128i data) QT_FUNCTION_TARGET(AES) { + static void QT_FUNCTION_TARGET(AES) QT_VECTORCALL + hash16bytes(__m128i &state0, __m128i data) + { state0 = _mm_xor_si128(state0, data); state0 = _mm_aesenc_si128(state0, state0); state0 = _mm_aesenc_si128(state0, state0); state0 = _mm_aesenc_si128(state0, state0); - }; + } // hash twice 16 bytes, running 2 scramble rounds of AES on itself - const auto hash2x16bytes = [](__m128i &state0, __m128i &state1, const __m128i *src0, - const __m128i *src1) QT_FUNCTION_TARGET(AES) { + static void QT_FUNCTION_TARGET(AES) QT_VECTORCALL + hash2x16bytes(__m128i &state0, __m128i &state1, const __m128i *src0, const __m128i *src1) + { __m128i data0 = _mm_loadu_si128(src0); __m128i data1 = _mm_loadu_si128(src1); state0 = _mm_xor_si128(data0, state0); @@ -547,18 +564,21 @@ static size_t aeshash(const uchar *p, size_t len, size_t seed, size_t seed2) noe state1 = _mm_aesenc_si128(state1, state1); state0 = _mm_aesenc_si128(state0, state0); state1 = _mm_aesenc_si128(state1, state1); + } + + struct AESHashSeed + { + __m128i state0; + __m128i mseed2; + AESHashSeed(size_t seed, size_t seed2) QT_FUNCTION_TARGET(AES); + __m128i state1() const QT_FUNCTION_TARGET(AES); }; +} // unnamed namespace - __m128i mseed, mseed2; - if (sizeof(size_t) == 8) { -#ifdef Q_PROCESSOR_X86_64 - mseed = _mm_cvtsi64_si128(seed); - mseed2 = _mm_set1_epi64x(seed2); -#endif - } else { - mseed = _mm_cvtsi32_si128(int(seed)); - mseed2 = _mm_set1_epi32(int(seed2)); - } +Q_ALWAYS_INLINE AESHashSeed::AESHashSeed(size_t seed, size_t seed2) +{ + __m128i mseed = mm_cvtsz_si128(seed); + mseed2 = mm_set1_epz(seed2); // mseed (epi16) = [ seed, seed >> 16, seed >> 32, seed >> 48, len, 0, 0, 0 ] mseed = _mm_insert_epi16(mseed, short(seed), 4); @@ -570,18 +590,22 @@ static size_t aeshash(const uchar *p, size_t len, size_t seed, size_t seed2) noe // scramble the key __m128i state0 = _mm_aesenc_si128(key, key); + this->state0 = state0; +} - auto src = reinterpret_cast(p); - if (len >= sizeof(__m128i)) { +Q_ALWAYS_INLINE __m128i AESHashSeed::state1() const +{ + { // unlike the Go code, we don't have more per-process seed __m128i state1 = _mm_aesenc_si128(state0, mseed2); + return state1; + } +} - const auto srcend = reinterpret_cast(p + len); - - // main loop: scramble two 16-byte blocks - for ( ; src + 2 < srcend; src += 2) - hash2x16bytes(state0, state1, src, src + 1); - +static size_t QT_FUNCTION_TARGET(AES) QT_VECTORCALL +aeshash128_16to32(__m128i state0, __m128i state1, const __m128i *src, const __m128i *srcend) +{ + { if (src + 1 < srcend) { // epilogue: between 16 and 31 bytes hash2x16bytes(state0, state1, src, srcend - 1); @@ -593,7 +617,15 @@ static size_t aeshash(const uchar *p, size_t len, size_t seed, size_t seed2) noe // combine results: state0 = _mm_xor_si128(state0, state1); - } else if (len) { + } + + return mm_cvtsi128_sz(state0); +} + +static size_t QT_FUNCTION_TARGET(AES) QT_VECTORCALL +aeshash128_lt16(__m128i state0, const uchar *p, size_t len) +{ + if (len) { // We're going to load 16 bytes and mask zero the part we don't care // (the hash of a short string is different from the hash of a longer // including NULLs at the end because the length is in the key) @@ -602,15 +634,15 @@ static size_t aeshash(const uchar *p, size_t len, size_t seed, size_t seed2) noe constexpr quintptr PageSize = 4096; __m128i data; - if ((quintptr(src) & (PageSize / 2)) == 0) { + if ((quintptr(p) & (PageSize / 2)) == 0) { // lower half of the page: // load all 16 bytes and mask off the bytes past the end of the source static const qint8 maskarray[] = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - }; + }; __m128i mask = _mm_loadu_si128(reinterpret_cast(maskarray + 15 - len)); - data = _mm_loadu_si128(src); + data = _mm_loadu_si128(reinterpret_cast(p)); data = _mm_and_si128(data, mask); } else { // upper half of the page: @@ -626,15 +658,132 @@ static size_t aeshash(const uchar *p, size_t len, size_t seed, size_t seed2) noe hash16bytes(state0, data); } + return mm_cvtsi128_sz(state0); +} - // extract state0 -# if QT_POINTER_SIZE == 8 - return _mm_cvtsi128_si64(state0); +static size_t QT_FUNCTION_TARGET(AES_AVX512) QT_VECTORCALL +aeshash128_lt32_avx256(__m128i state0, const uchar *p, size_t len, size_t seed2) +{ + if (len) { + __mmask32 mask = _bzhi_u32(-1, len); + __m256i data = _mm256_maskz_loadu_epi8(mask, p); + __m128i data0 = _mm256_castsi256_si128(data); + if (len > sizeof(__m128i)) { + __m128i data1 = _mm256_extractf128_si256(data, 1); + __m128i state1 = _mm_aesenc_si128(state0, mm_set1_epz(seed2)); + + // like hash2x16bytes, but without the load: + state0 = _mm_xor_si128(data0, state0); + state1 = _mm_xor_si128(data1, state1); + state0 = _mm_aesenc_si128(state0, state0); + state1 = _mm_aesenc_si128(state1, state1); + state0 = _mm_aesenc_si128(state0, state0); + state1 = _mm_aesenc_si128(state1, state1); + + // combine results: + state0 = _mm_xor_si128(state0, state1); + } else { + hash16bytes(state0, data0); + } + } + return mm_cvtsi128_sz(state0); +} + +static size_t QT_FUNCTION_TARGET(AES) QT_VECTORCALL +aeshash128_ge32(__m128i state0, __m128i state1, const __m128i *src, const __m128i *srcend) +{ + // main loop: scramble two 16-byte blocks + for ( ; src + 2 < srcend; src += 2) + hash2x16bytes(state0, state1, src, src + 1); + + return aeshash128_16to32(state0, state1, src, srcend); +} + +# if QT_COMPILER_SUPPORTS_HERE(VAES) +static size_t QT_FUNCTION_TARGET(VAES) QT_VECTORCALL +aeshash256_ge32(__m128i state0_128, __m128i state1_128, const uchar *p, size_t len) +{ + static const auto hash32bytes = [](__m256i &state0, __m256i data) QT_FUNCTION_TARGET(VAES) { + state0 = _mm256_xor_si256(state0, data); + state0 = _mm256_aesenc_epi128(state0, state0); + state0 = _mm256_aesenc_epi128(state0, state0); + state0 = _mm256_aesenc_epi128(state0, state0); + }; + + // hash twice 32 bytes, running 2 scramble rounds of AES on itself + const auto hash2x32bytes = [](__m256i &state0, __m256i &state1, const __m256i *src0, + const __m256i *src1) QT_FUNCTION_TARGET(VAES) { + __m256i data0 = _mm256_loadu_si256(src0); + __m256i data1 = _mm256_loadu_si256(src1); + state0 = _mm256_xor_si256(data0, state0); + state1 = _mm256_xor_si256(data1, state1); + state0 = _mm256_aesenc_epi128(state0, state0); + state1 = _mm256_aesenc_epi128(state1, state1); + state0 = _mm256_aesenc_epi128(state0, state0); + state1 = _mm256_aesenc_epi128(state1, state1); + }; + + auto src = reinterpret_cast(p); + const auto srcend = reinterpret_cast(p + len); + + __m256i state0 = _mm256_set_m128i(state1_128, state0_128); + __m256i state1 = _mm256_aesenc_epi128(state0, mm256_set1_epz(len)); + + // main loop: scramble two 32-byte blocks + for ( ; src + 2 < srcend; src += 2) + hash2x32bytes(state0, state1, src, src + 1); + + if (src + 1 < srcend) { + // epilogue: between 32 and 31 bytes + hash2x32bytes(state0, state1, src, srcend - 1); + } else if (src != srcend) { + // epilogue: between 1 and 32 bytes, overlap with the end + __m256i data = _mm256_loadu_si256(srcend - 1); + hash32bytes(state0, data); + } + + // combine results: + state0 = _mm256_xor_si256(state0, state1); + + // XOR the two halves and extract + __m128i low = _mm256_extracti128_si256(state0, 0); + __m128i high = _mm256_extracti128_si256(state0, 1); + return mm_cvtsi128_sz(_mm_xor_si128(low, high)); +} # else - return _mm_cvtsi128_si32(state0); +static size_t QT_VECTORCALL aeshash256_ge32(__m128i, __m128i, const uchar *, size_t) +{ + Q_UNREACHABLE(); + return 0; +} +# endif // VAES + +QT_FUNCTION_TARGET(AES) +static size_t aeshash(const uchar *p, size_t len, size_t seed, size_t seed2) noexcept +{ + AESHashSeed state(seed, seed2); + bool useOpMaskLoad = qCpuHasFeature(AVX512VL); + bool useVaes = false; +# if QT_COMPILER_SUPPORTS_HERE(VAES) + useVaes = qCpuHasFeature(VAES); # endif + + auto src = reinterpret_cast(p); + const auto srcend = reinterpret_cast(p + len); + + if (len <= sizeof(__m256i)) { + if (useOpMaskLoad) + return aeshash128_lt32_avx256(state.state0, p, len, seed2); + if (len >= sizeof(__m128i)) + return aeshash128_16to32(state.state0, state.state1(), src, srcend); + return aeshash128_lt16(state.state0, p, len); + } + + if (useVaes) + return aeshash256_ge32(state.state0, state.state1(), p, len); + return aeshash128_ge32(state.state0, state.state1(), src, srcend); } -#endif +#endif // x86 AESNI #if defined(Q_PROCESSOR_ARM) && QT_COMPILER_SUPPORTS_HERE(AES) && !defined(QHASH_AES_SANITIZER_BUILD) && !defined(QT_BOOTSTRAPPED) QT_FUNCTION_TARGET(AES) -- cgit v1.2.3