diff options
author | Allan Sandfeld Jensen <allan.jensen@qt.io> | 2020-08-13 14:17:34 +0200 |
---|---|---|
committer | Allan Sandfeld Jensen <allan.jensen@qt.io> | 2021-03-05 18:31:27 +0200 |
commit | ecf84e0989bf85145c5aa7be0e7e31cafa1138d2 (patch) | |
tree | b55d08a9c3885b36a1dbcc9f1dd74f066be71d89 | |
parent | 89e9164abbd730e4f3db7e56ece3477199fbeaa9 (diff) |
Add ARM version of the "AES" qhash algorithm
Change-Id: Ia2c20e970a0149efb7665a5690538f83965e7be7
Reviewed-by: Erik Verbruggen <erik.verbruggen@me.com>
-rw-r--r-- | src/corelib/tools/qhash.cpp | 135 |
1 files changed, 135 insertions, 0 deletions
diff --git a/src/corelib/tools/qhash.cpp b/src/corelib/tools/qhash.cpp index fa9688340d..933dff84ab 100644 --- a/src/corelib/tools/qhash.cpp +++ b/src/corelib/tools/qhash.cpp @@ -527,6 +527,138 @@ lt16: } #endif +#if defined(__ARM_FEATURE_CRYPTO) +static size_t aeshash(const uchar *p, size_t len, size_t seed) noexcept +{ + uint8x16_t key; +# if QT_POINTER_SIZE == 8 + quint64 seededlen = seed ^ len; + uint64x2_t vseed = vcombine_u64(vcreate_u64(seed), vcreate_u64(seededlen)); + key = vreinterpretq_u8_u64(vseed); +# else + quint32 replicated_len = quint16(len) | (quint32(quint16(len)) << 16); + uint32x2_t vseed = vmov_n_u32(seed); + vseed = vset_lane_u32(replicated_len, vseed, 1); + key = vreinterpretq_u8_u32(vcombine_u32(vseed, vseed)); +# endif + + // Compared to x86 AES, ARM splits each round into two instructions + // and includes the pre-xor instead of the post-xor. + const auto hash16bytes = [](uint8x16_t &state0, uint8x16_t data) { + auto state1 = state0; + state0 = vaeseq_u8(state0, data); + state0 = vaesmcq_u8(state0); + auto state2 = state0; + state0 = vaeseq_u8(state0, state1); + state0 = vaesmcq_u8(state0); + auto state3 = state0; + state0 = vaeseq_u8(state0, state2); + state0 = vaesmcq_u8(state0); + state0 = veorq_u8(state0, state3); + }; + + uint8x16_t state0 = key; + + if (len < 8) + goto lt8; + if (len < 16) + goto lt16; + if (len < 32) + goto lt32; + + // rounds of 32 bytes + { + // Make state1 = ~state0: + uint8x16_t state1 = veorq_u8(state0, vdupq_n_u8(255)); + + // do simplified rounds of 32 bytes: unlike the Go code, we only + // scramble twice and we keep 256 bits of state + const auto *e = p + len - 31; + while (p < e) { + uint8x16_t data0 = vld1q_u8(p); + uint8x16_t data1 = vld1q_u8(p + 16); + auto oldstate0 = state0; + auto oldstate1 = state1; + state0 = vaeseq_u8(state0, data0); + state1 = vaeseq_u8(state1, data1); + state0 = vaesmcq_u8(state0); + state1 = vaesmcq_u8(state1); + auto laststate0 = state0; + auto laststate1 = state1; + state0 = vaeseq_u8(state0, oldstate0); + state1 = vaeseq_u8(state1, oldstate1); + state0 = vaesmcq_u8(state0); + state1 = vaesmcq_u8(state1); + state0 = veorq_u8(state0, laststate0); + state1 = veorq_u8(state1, laststate1); + p += 32; + } + state0 = veorq_u8(state0, state1); + } + len &= 0x1f; + + // do we still have 16 or more bytes? + if (len & 0x10) { +lt32: + uint8x16_t data = vld1q_u8(p); + hash16bytes(state0, data); + p += 16; + } + len &= 0xf; + + if (len & 0x08) { +lt16: + uint8x8_t data8 = vld1_u8(p); + uint8x16_t data = vcombine_u8(data8, vdup_n_u8(0)); + hash16bytes(state0, data); + p += 8; + } + len &= 0x7; + +lt8: + if (len) { + // load the last chunk of data + // We're going to load 8 bytes and mask zero the part we don't care + // (the hash of a short string is different from the hash of a longer + // including NULLs at the end because the length is in the key) + // WARNING: this may produce valgrind warnings, but it's safe + + uint8x8_t data8; + + if (Q_LIKELY(quintptr(p + 8) & 0xff8)) { + // same page, we definitely can't fault: + // load all 8 bytes and mask off the bytes past the end of the source + static const qint8 maskarray[] = { + -1, -1, -1, -1, -1, -1, -1, + 0, 0, 0, 0, 0, 0, 0, + }; + uint8x8_t mask = vld1_u8(reinterpret_cast<const quint8 *>(maskarray) + 7 - len); + data8 = vld1_u8(p); + data8 = vand_u8(data8, mask); + } else { + // too close to the end of the page, it could fault: + // load 8 bytes ending at the data end, then shuffle them to the beginning + static const qint8 shufflecontrol[] = { + 1, 2, 3, 4, 5, 6, 7, + -1, -1, -1, -1, -1, -1, -1, + }; + uint8x8_t control = vld1_u8(reinterpret_cast<const quint8 *>(shufflecontrol) + 7 - len); + data8 = vld1_u8(p - 8 + len); + data8 = vtbl1_u8(data8, control); + } + uint8x16_t data = vcombine_u8(data8, vdup_n_u8(0)); + hash16bytes(state0, data); + } + + // extract state0 +# if QT_POINTER_SIZE == 8 + return vgetq_lane_u64(vreinterpretq_u64_u8(state0), 0); +# else + return vgetq_lane_u32(vreinterpretq_u32_u8(state0), 0); +# endif +} +#endif + size_t qHashBits(const void *p, size_t size, size_t seed) noexcept { #ifdef QT_BOOTSTRAPPED @@ -537,6 +669,9 @@ size_t qHashBits(const void *p, size_t size, size_t seed) noexcept #ifdef AESHASH if (seed && qCpuHasFeature(AES) && qCpuHasFeature(SSE4_2)) return aeshash(reinterpret_cast<const uchar *>(p), size, seed); +#elif defined(__ARM_FEATURE_CRYPTO) + if (seed) + return aeshash(reinterpret_cast<const uchar *>(p), size, seed); #endif if (size <= QT_POINTER_SIZE) return murmurhash(p, size, seed); |