diff options
author | Christian Kandeler <christian.kandeler@qt.io> | 2018-01-03 17:56:52 +0100 |
---|---|---|
committer | Christian Kandeler <christian.kandeler@qt.io> | 2018-08-10 09:23:42 +0000 |
commit | a44fe2e4f03fc18ce9c3d050f71fe369916259b8 (patch) | |
tree | e5f3211ced0bc7fc1628ff5e7a051007dfa18207 /src/libs/3rdparty/botan/src/lib/block/aes | |
parent | 78c4cf9884770149fb9d69f923aa2169baa3f42a (diff) |
SSH: Use Botan2
Botan 1.10 will be completely unsupported by the end of this year, so we
now target API version 2 instead.
Also upgrade our bundled Botan to the latest version 2.7. We no longer
check in pre-processed files, but use the upstream sources directly
(with unneeded parts removed), employing Botan's own configure
script for building. This will make future upgrades much simpler. A
script to automate this process is also provided.
Task-number: QTCREATORBUG-18802
Task-number: QTCREATORBUG-8107
Change-Id: I5a5ea62cfd30d720b556217142e8b7e06bf49f7e
Reviewed-by: hjk <hjk@qt.io>
Reviewed-by: Eike Ziller <eike.ziller@qt.io>
Diffstat (limited to 'src/libs/3rdparty/botan/src/lib/block/aes')
11 files changed, 3205 insertions, 0 deletions
diff --git a/src/libs/3rdparty/botan/src/lib/block/aes/aes.cpp b/src/libs/3rdparty/botan/src/lib/block/aes/aes.cpp new file mode 100644 index 0000000000..403945cc91 --- /dev/null +++ b/src/libs/3rdparty/botan/src/lib/block/aes/aes.cpp @@ -0,0 +1,750 @@ +/* +* AES +* (C) 1999-2010,2015,2017 Jack Lloyd +* +* Based on the public domain reference implementation by Paulo Baretto +* +* Botan is released under the Simplified BSD License (see license.txt) +*/ + +#include <botan/aes.h> +#include <botan/loadstor.h> +#include <botan/cpuid.h> +#include <type_traits> + +/* +* This implementation is based on table lookups which are known to be +* vulnerable to timing and cache based side channel attacks. Some +* countermeasures are used which may be helpful in some situations: +* +* - Only a single 256-word T-table is used, with rotations applied. +* Most implementations use 4 T-tables which leaks much more +* information via cache usage. +* +* - The TE and TD tables are computed at runtime to avoid flush+reload +* attacks using clflush. As different processes will not share the +* same underlying table data, an attacker can't manipulate another +* processes cache lines via their shared reference to the library +* read only segment. +* +* - Each cache line of the lookup tables is accessed at the beginning +* of each call to encrypt or decrypt. (See the Z variable below) +* +* If available SSSE3 or AES-NI are used instead of this version, as both +* are faster and immune to side channel attacks. +* +* Some AES cache timing papers for reference: +* +* "Software mitigations to hedge AES against cache-based software side +* channel vulnerabilities" https://eprint.iacr.org/2006/052.pdf +* +* "Cache Games - Bringing Access-Based Cache Attacks on AES to Practice" +* http://www.ieee-security.org/TC/SP2011/PAPERS/2011/paper031.pdf +* +* "Cache-Collision Timing Attacks Against AES" Bonneau, Mironov +* http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.88.4753 +*/ + +namespace Botan { + +namespace { + +BOTAN_ALIGNAS(64) +const uint8_t SE[256] = { + 0x63, 0x7C, 0x77, 0x7B, 0xF2, 0x6B, 0x6F, 0xC5, 0x30, 0x01, 0x67, 0x2B, + 0xFE, 0xD7, 0xAB, 0x76, 0xCA, 0x82, 0xC9, 0x7D, 0xFA, 0x59, 0x47, 0xF0, + 0xAD, 0xD4, 0xA2, 0xAF, 0x9C, 0xA4, 0x72, 0xC0, 0xB7, 0xFD, 0x93, 0x26, + 0x36, 0x3F, 0xF7, 0xCC, 0x34, 0xA5, 0xE5, 0xF1, 0x71, 0xD8, 0x31, 0x15, + 0x04, 0xC7, 0x23, 0xC3, 0x18, 0x96, 0x05, 0x9A, 0x07, 0x12, 0x80, 0xE2, + 0xEB, 0x27, 0xB2, 0x75, 0x09, 0x83, 0x2C, 0x1A, 0x1B, 0x6E, 0x5A, 0xA0, + 0x52, 0x3B, 0xD6, 0xB3, 0x29, 0xE3, 0x2F, 0x84, 0x53, 0xD1, 0x00, 0xED, + 0x20, 0xFC, 0xB1, 0x5B, 0x6A, 0xCB, 0xBE, 0x39, 0x4A, 0x4C, 0x58, 0xCF, + 0xD0, 0xEF, 0xAA, 0xFB, 0x43, 0x4D, 0x33, 0x85, 0x45, 0xF9, 0x02, 0x7F, + 0x50, 0x3C, 0x9F, 0xA8, 0x51, 0xA3, 0x40, 0x8F, 0x92, 0x9D, 0x38, 0xF5, + 0xBC, 0xB6, 0xDA, 0x21, 0x10, 0xFF, 0xF3, 0xD2, 0xCD, 0x0C, 0x13, 0xEC, + 0x5F, 0x97, 0x44, 0x17, 0xC4, 0xA7, 0x7E, 0x3D, 0x64, 0x5D, 0x19, 0x73, + 0x60, 0x81, 0x4F, 0xDC, 0x22, 0x2A, 0x90, 0x88, 0x46, 0xEE, 0xB8, 0x14, + 0xDE, 0x5E, 0x0B, 0xDB, 0xE0, 0x32, 0x3A, 0x0A, 0x49, 0x06, 0x24, 0x5C, + 0xC2, 0xD3, 0xAC, 0x62, 0x91, 0x95, 0xE4, 0x79, 0xE7, 0xC8, 0x37, 0x6D, + 0x8D, 0xD5, 0x4E, 0xA9, 0x6C, 0x56, 0xF4, 0xEA, 0x65, 0x7A, 0xAE, 0x08, + 0xBA, 0x78, 0x25, 0x2E, 0x1C, 0xA6, 0xB4, 0xC6, 0xE8, 0xDD, 0x74, 0x1F, + 0x4B, 0xBD, 0x8B, 0x8A, 0x70, 0x3E, 0xB5, 0x66, 0x48, 0x03, 0xF6, 0x0E, + 0x61, 0x35, 0x57, 0xB9, 0x86, 0xC1, 0x1D, 0x9E, 0xE1, 0xF8, 0x98, 0x11, + 0x69, 0xD9, 0x8E, 0x94, 0x9B, 0x1E, 0x87, 0xE9, 0xCE, 0x55, 0x28, 0xDF, + 0x8C, 0xA1, 0x89, 0x0D, 0xBF, 0xE6, 0x42, 0x68, 0x41, 0x99, 0x2D, 0x0F, + 0xB0, 0x54, 0xBB, 0x16 }; + +BOTAN_ALIGNAS(64) +const uint8_t SD[256] = { + 0x52, 0x09, 0x6A, 0xD5, 0x30, 0x36, 0xA5, 0x38, 0xBF, 0x40, 0xA3, 0x9E, + 0x81, 0xF3, 0xD7, 0xFB, 0x7C, 0xE3, 0x39, 0x82, 0x9B, 0x2F, 0xFF, 0x87, + 0x34, 0x8E, 0x43, 0x44, 0xC4, 0xDE, 0xE9, 0xCB, 0x54, 0x7B, 0x94, 0x32, + 0xA6, 0xC2, 0x23, 0x3D, 0xEE, 0x4C, 0x95, 0x0B, 0x42, 0xFA, 0xC3, 0x4E, + 0x08, 0x2E, 0xA1, 0x66, 0x28, 0xD9, 0x24, 0xB2, 0x76, 0x5B, 0xA2, 0x49, + 0x6D, 0x8B, 0xD1, 0x25, 0x72, 0xF8, 0xF6, 0x64, 0x86, 0x68, 0x98, 0x16, + 0xD4, 0xA4, 0x5C, 0xCC, 0x5D, 0x65, 0xB6, 0x92, 0x6C, 0x70, 0x48, 0x50, + 0xFD, 0xED, 0xB9, 0xDA, 0x5E, 0x15, 0x46, 0x57, 0xA7, 0x8D, 0x9D, 0x84, + 0x90, 0xD8, 0xAB, 0x00, 0x8C, 0xBC, 0xD3, 0x0A, 0xF7, 0xE4, 0x58, 0x05, + 0xB8, 0xB3, 0x45, 0x06, 0xD0, 0x2C, 0x1E, 0x8F, 0xCA, 0x3F, 0x0F, 0x02, + 0xC1, 0xAF, 0xBD, 0x03, 0x01, 0x13, 0x8A, 0x6B, 0x3A, 0x91, 0x11, 0x41, + 0x4F, 0x67, 0xDC, 0xEA, 0x97, 0xF2, 0xCF, 0xCE, 0xF0, 0xB4, 0xE6, 0x73, + 0x96, 0xAC, 0x74, 0x22, 0xE7, 0xAD, 0x35, 0x85, 0xE2, 0xF9, 0x37, 0xE8, + 0x1C, 0x75, 0xDF, 0x6E, 0x47, 0xF1, 0x1A, 0x71, 0x1D, 0x29, 0xC5, 0x89, + 0x6F, 0xB7, 0x62, 0x0E, 0xAA, 0x18, 0xBE, 0x1B, 0xFC, 0x56, 0x3E, 0x4B, + 0xC6, 0xD2, 0x79, 0x20, 0x9A, 0xDB, 0xC0, 0xFE, 0x78, 0xCD, 0x5A, 0xF4, + 0x1F, 0xDD, 0xA8, 0x33, 0x88, 0x07, 0xC7, 0x31, 0xB1, 0x12, 0x10, 0x59, + 0x27, 0x80, 0xEC, 0x5F, 0x60, 0x51, 0x7F, 0xA9, 0x19, 0xB5, 0x4A, 0x0D, + 0x2D, 0xE5, 0x7A, 0x9F, 0x93, 0xC9, 0x9C, 0xEF, 0xA0, 0xE0, 0x3B, 0x4D, + 0xAE, 0x2A, 0xF5, 0xB0, 0xC8, 0xEB, 0xBB, 0x3C, 0x83, 0x53, 0x99, 0x61, + 0x17, 0x2B, 0x04, 0x7E, 0xBA, 0x77, 0xD6, 0x26, 0xE1, 0x69, 0x14, 0x63, + 0x55, 0x21, 0x0C, 0x7D }; + +inline uint8_t xtime(uint8_t s) { return static_cast<uint8_t>(s << 1) ^ ((s >> 7) * 0x1B); } +inline uint8_t xtime4(uint8_t s) { return xtime(xtime(s)); } +inline uint8_t xtime8(uint8_t s) { return xtime(xtime(xtime(s))); } + +inline uint8_t xtime3(uint8_t s) { return xtime(s) ^ s; } +inline uint8_t xtime9(uint8_t s) { return xtime8(s) ^ s; } +inline uint8_t xtime11(uint8_t s) { return xtime8(s) ^ xtime(s) ^ s; } +inline uint8_t xtime13(uint8_t s) { return xtime8(s) ^ xtime4(s) ^ s; } +inline uint8_t xtime14(uint8_t s) { return xtime8(s) ^ xtime4(s) ^ xtime(s); } + +inline uint32_t SE_word(uint32_t x) + { + return make_uint32(SE[get_byte(0, x)], + SE[get_byte(1, x)], + SE[get_byte(2, x)], + SE[get_byte(3, x)]); + } + +const uint32_t* AES_TE() + { + class TE_Table final + { + public: + TE_Table() + { + uint32_t* p = reinterpret_cast<uint32_t*>(&data); + for(size_t i = 0; i != 256; ++i) + { + const uint8_t s = SE[i]; + p[i] = make_uint32(xtime(s), s, s, xtime3(s)); + } + } + + const uint32_t* ptr() const + { + return reinterpret_cast<const uint32_t*>(&data); + } + private: + std::aligned_storage<256*sizeof(uint32_t), 64>::type data; + }; + + static TE_Table table; + return table.ptr(); + } + +const uint32_t* AES_TD() + { + class TD_Table final + { + public: + TD_Table() + { + uint32_t* p = reinterpret_cast<uint32_t*>(&data); + for(size_t i = 0; i != 256; ++i) + { + const uint8_t s = SD[i]; + p[i] = make_uint32(xtime14(s), xtime9(s), xtime13(s), xtime11(s)); + } + } + + const uint32_t* ptr() const + { + return reinterpret_cast<const uint32_t*>(&data); + } + private: + std::aligned_storage<256*sizeof(uint32_t), 64>::type data; + }; + + static TD_Table table; + return table.ptr(); + } + +#define AES_T(T, K, V0, V1, V2, V3) \ + (K ^ T[get_byte(0, V0)] ^ \ + rotr< 8>(T[get_byte(1, V1)]) ^ \ + rotr<16>(T[get_byte(2, V2)]) ^ \ + rotr<24>(T[get_byte(3, V3)])) + +/* +* AES Encryption +*/ +void aes_encrypt_n(const uint8_t in[], uint8_t out[], + size_t blocks, + const secure_vector<uint32_t>& EK, + const secure_vector<uint8_t>& ME) + { + BOTAN_ASSERT(EK.size() && ME.size() == 16, "Key was set"); + + const size_t cache_line_size = CPUID::cache_line_size(); + + const uint32_t* TE = AES_TE(); + + // Hit every cache line of TE + volatile uint32_t Z = 0; + for(size_t i = 0; i < 256; i += cache_line_size / sizeof(uint32_t)) + { + Z |= TE[i]; + } + Z &= TE[82]; // this is zero, which hopefully the compiler cannot deduce + + for(size_t i = 0; i < blocks; ++i) + { + uint32_t T0, T1, T2, T3; + load_be(in + 16*i, T0, T1, T2, T3); + + T0 ^= EK[0]; + T1 ^= EK[1]; + T2 ^= EK[2]; + T3 ^= EK[3]; + + T0 ^= Z; + + uint32_t B0 = AES_T(TE, EK[4], T0, T1, T2, T3); + uint32_t B1 = AES_T(TE, EK[5], T1, T2, T3, T0); + uint32_t B2 = AES_T(TE, EK[6], T2, T3, T0, T1); + uint32_t B3 = AES_T(TE, EK[7], T3, T0, T1, T2); + + for(size_t r = 2*4; r < EK.size(); r += 2*4) + { + T0 = AES_T(TE, EK[r ], B0, B1, B2, B3); + T1 = AES_T(TE, EK[r+1], B1, B2, B3, B0); + T2 = AES_T(TE, EK[r+2], B2, B3, B0, B1); + T3 = AES_T(TE, EK[r+3], B3, B0, B1, B2); + + B0 = AES_T(TE, EK[r+4], T0, T1, T2, T3); + B1 = AES_T(TE, EK[r+5], T1, T2, T3, T0); + B2 = AES_T(TE, EK[r+6], T2, T3, T0, T1); + B3 = AES_T(TE, EK[r+7], T3, T0, T1, T2); + } + + /* + * Use TE[x] >> 8 instead of SE[] so encryption only references a single + * lookup table. + */ + out[16*i+ 0] = static_cast<uint8_t>(TE[get_byte(0, B0)] >> 8) ^ ME[0]; + out[16*i+ 1] = static_cast<uint8_t>(TE[get_byte(1, B1)] >> 8) ^ ME[1]; + out[16*i+ 2] = static_cast<uint8_t>(TE[get_byte(2, B2)] >> 8) ^ ME[2]; + out[16*i+ 3] = static_cast<uint8_t>(TE[get_byte(3, B3)] >> 8) ^ ME[3]; + out[16*i+ 4] = static_cast<uint8_t>(TE[get_byte(0, B1)] >> 8) ^ ME[4]; + out[16*i+ 5] = static_cast<uint8_t>(TE[get_byte(1, B2)] >> 8) ^ ME[5]; + out[16*i+ 6] = static_cast<uint8_t>(TE[get_byte(2, B3)] >> 8) ^ ME[6]; + out[16*i+ 7] = static_cast<uint8_t>(TE[get_byte(3, B0)] >> 8) ^ ME[7]; + out[16*i+ 8] = static_cast<uint8_t>(TE[get_byte(0, B2)] >> 8) ^ ME[8]; + out[16*i+ 9] = static_cast<uint8_t>(TE[get_byte(1, B3)] >> 8) ^ ME[9]; + out[16*i+10] = static_cast<uint8_t>(TE[get_byte(2, B0)] >> 8) ^ ME[10]; + out[16*i+11] = static_cast<uint8_t>(TE[get_byte(3, B1)] >> 8) ^ ME[11]; + out[16*i+12] = static_cast<uint8_t>(TE[get_byte(0, B3)] >> 8) ^ ME[12]; + out[16*i+13] = static_cast<uint8_t>(TE[get_byte(1, B0)] >> 8) ^ ME[13]; + out[16*i+14] = static_cast<uint8_t>(TE[get_byte(2, B1)] >> 8) ^ ME[14]; + out[16*i+15] = static_cast<uint8_t>(TE[get_byte(3, B2)] >> 8) ^ ME[15]; + } + } + +/* +* AES Decryption +*/ +void aes_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks, + const secure_vector<uint32_t>& DK, + const secure_vector<uint8_t>& MD) + { + BOTAN_ASSERT(DK.size() && MD.size() == 16, "Key was set"); + + const size_t cache_line_size = CPUID::cache_line_size(); + const uint32_t* TD = AES_TD(); + + volatile uint32_t Z = 0; + for(size_t i = 0; i < 256; i += cache_line_size / sizeof(uint32_t)) + { + Z |= TD[i]; + } + Z &= TD[99]; // this is zero, which hopefully the compiler cannot deduce + + for(size_t i = 0; i != blocks; ++i) + { + uint32_t T0 = load_be<uint32_t>(in, 0) ^ DK[0]; + uint32_t T1 = load_be<uint32_t>(in, 1) ^ DK[1]; + uint32_t T2 = load_be<uint32_t>(in, 2) ^ DK[2]; + uint32_t T3 = load_be<uint32_t>(in, 3) ^ DK[3]; + + T0 ^= Z; + + uint32_t B0 = AES_T(TD, DK[4], T0, T3, T2, T1); + uint32_t B1 = AES_T(TD, DK[5], T1, T0, T3, T2); + uint32_t B2 = AES_T(TD, DK[6], T2, T1, T0, T3); + uint32_t B3 = AES_T(TD, DK[7], T3, T2, T1, T0); + + for(size_t r = 2*4; r < DK.size(); r += 2*4) + { + T0 = AES_T(TD, DK[r ], B0, B3, B2, B1); + T1 = AES_T(TD, DK[r+1], B1, B0, B3, B2); + T2 = AES_T(TD, DK[r+2], B2, B1, B0, B3); + T3 = AES_T(TD, DK[r+3], B3, B2, B1, B0); + + B0 = AES_T(TD, DK[r+4], T0, T3, T2, T1); + B1 = AES_T(TD, DK[r+5], T1, T0, T3, T2); + B2 = AES_T(TD, DK[r+6], T2, T1, T0, T3); + B3 = AES_T(TD, DK[r+7], T3, T2, T1, T0); + } + + out[ 0] = SD[get_byte(0, B0)] ^ MD[0]; + out[ 1] = SD[get_byte(1, B3)] ^ MD[1]; + out[ 2] = SD[get_byte(2, B2)] ^ MD[2]; + out[ 3] = SD[get_byte(3, B1)] ^ MD[3]; + out[ 4] = SD[get_byte(0, B1)] ^ MD[4]; + out[ 5] = SD[get_byte(1, B0)] ^ MD[5]; + out[ 6] = SD[get_byte(2, B3)] ^ MD[6]; + out[ 7] = SD[get_byte(3, B2)] ^ MD[7]; + out[ 8] = SD[get_byte(0, B2)] ^ MD[8]; + out[ 9] = SD[get_byte(1, B1)] ^ MD[9]; + out[10] = SD[get_byte(2, B0)] ^ MD[10]; + out[11] = SD[get_byte(3, B3)] ^ MD[11]; + out[12] = SD[get_byte(0, B3)] ^ MD[12]; + out[13] = SD[get_byte(1, B2)] ^ MD[13]; + out[14] = SD[get_byte(2, B1)] ^ MD[14]; + out[15] = SD[get_byte(3, B0)] ^ MD[15]; + + in += 16; + out += 16; + } + } + +void aes_key_schedule(const uint8_t key[], size_t length, + secure_vector<uint32_t>& EK, + secure_vector<uint32_t>& DK, + secure_vector<uint8_t>& ME, + secure_vector<uint8_t>& MD) + { + static const uint32_t RC[10] = { + 0x01000000, 0x02000000, 0x04000000, 0x08000000, 0x10000000, + 0x20000000, 0x40000000, 0x80000000, 0x1B000000, 0x36000000 }; + + const size_t rounds = (length / 4) + 6; + + secure_vector<uint32_t> XEK(length + 32), XDK(length + 32); + + const size_t X = length / 4; + + // Can't happen, but make static analyzers happy + BOTAN_ARG_CHECK(X == 4 || X == 6 || X == 8, "Invalid AES key size"); + + for(size_t i = 0; i != X; ++i) + XEK[i] = load_be<uint32_t>(key, i); + + for(size_t i = X; i < 4*(rounds+1); i += X) + { + XEK[i] = XEK[i-X] ^ RC[(i-X)/X] ^ SE_word(rotl<8>(XEK[i-1])); + + for(size_t j = 1; j != X; ++j) + { + XEK[i+j] = XEK[i+j-X]; + + if(X == 8 && j == 4) + XEK[i+j] ^= SE_word(XEK[i+j-1]); + else + XEK[i+j] ^= XEK[i+j-1]; + } + } + + for(size_t i = 0; i != 4*(rounds+1); i += 4) + { + XDK[i ] = XEK[4*rounds-i ]; + XDK[i+1] = XEK[4*rounds-i+1]; + XDK[i+2] = XEK[4*rounds-i+2]; + XDK[i+3] = XEK[4*rounds-i+3]; + } + + for(size_t i = 4; i != length + 24; ++i) + { + XDK[i] = SE_word(XDK[i]); + XDK[i] = AES_T(AES_TD(), 0, XDK[i], XDK[i], XDK[i], XDK[i]); + } + + ME.resize(16); + MD.resize(16); + + for(size_t i = 0; i != 4; ++i) + { + store_be(XEK[i+4*rounds], &ME[4*i]); + store_be(XEK[i], &MD[4*i]); + } + + EK.resize(length + 24); + DK.resize(length + 24); + copy_mem(EK.data(), XEK.data(), EK.size()); + copy_mem(DK.data(), XDK.data(), DK.size()); + +#if defined(BOTAN_HAS_AES_ARMV8) + if(CPUID::has_arm_aes()) + { + // ARM needs the subkeys to be byte reversed + + for(size_t i = 0; i != EK.size(); ++i) + EK[i] = reverse_bytes(EK[i]); + for(size_t i = 0; i != DK.size(); ++i) + DK[i] = reverse_bytes(DK[i]); + } +#endif + + } + +#undef AES_T + +size_t aes_parallelism() + { +#if defined(BOTAN_HAS_AES_NI) + if(CPUID::has_aes_ni()) + { + return 4; + } +#endif + + return 1; + } + +const char* aes_provider() + { +#if defined(BOTAN_HAS_AES_NI) + if(CPUID::has_aes_ni()) + { + return "aesni"; + } +#endif + +#if defined(BOTAN_HAS_AES_SSSE3) + if(CPUID::has_ssse3()) + { + return "ssse3"; + } +#endif + +#if defined(BOTAN_HAS_AES_POWER8) + if(CPUID::has_ppc_crypto()) + { + return "power8"; + } +#endif + +#if defined(BOTAN_HAS_AES_ARMV8) + if(CPUID::has_arm_aes()) + { + return "armv8"; + } +#endif + + return "base"; + } + +} + +std::string AES_128::provider() const { return aes_provider(); } +std::string AES_192::provider() const { return aes_provider(); } +std::string AES_256::provider() const { return aes_provider(); } + +size_t AES_128::parallelism() const { return aes_parallelism(); } +size_t AES_192::parallelism() const { return aes_parallelism(); } +size_t AES_256::parallelism() const { return aes_parallelism(); } + +void AES_128::encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const + { + verify_key_set(m_EK.empty() == false); + +#if defined(BOTAN_HAS_AES_NI) + if(CPUID::has_aes_ni()) + { + return aesni_encrypt_n(in, out, blocks); + } +#endif + +#if defined(BOTAN_HAS_AES_SSSE3) + if(CPUID::has_ssse3()) + { + return ssse3_encrypt_n(in, out, blocks); + } +#endif + +#if defined(BOTAN_HAS_AES_ARMV8) + if(CPUID::has_arm_aes()) + { + return armv8_encrypt_n(in, out, blocks); + } +#endif + +#if defined(BOTAN_HAS_AES_POWER8) + if(CPUID::has_ppc_crypto()) + { + return power8_encrypt_n(in, out, blocks); + } +#endif + + aes_encrypt_n(in, out, blocks, m_EK, m_ME); + } + +void AES_128::decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const + { + verify_key_set(m_DK.empty() == false); + +#if defined(BOTAN_HAS_AES_NI) + if(CPUID::has_aes_ni()) + { + return aesni_decrypt_n(in, out, blocks); + } +#endif + +#if defined(BOTAN_HAS_AES_SSSE3) + if(CPUID::has_ssse3()) + { + return ssse3_decrypt_n(in, out, blocks); + } +#endif + +#if defined(BOTAN_HAS_AES_ARMV8) + if(CPUID::has_arm_aes()) + { + return armv8_decrypt_n(in, out, blocks); + } +#endif + +#if defined(BOTAN_HAS_AES_POWER8) + if(CPUID::has_ppc_crypto()) + { + return power8_decrypt_n(in, out, blocks); + } +#endif + + aes_decrypt_n(in, out, blocks, m_DK, m_MD); + } + +void AES_128::key_schedule(const uint8_t key[], size_t length) + { +#if defined(BOTAN_HAS_AES_NI) + if(CPUID::has_aes_ni()) + { + return aesni_key_schedule(key, length); + } +#endif + +#if defined(BOTAN_HAS_AES_SSSE3) + if(CPUID::has_ssse3()) + { + return ssse3_key_schedule(key, length); + } +#endif + + aes_key_schedule(key, length, m_EK, m_DK, m_ME, m_MD); + } + +void AES_128::clear() + { + zap(m_EK); + zap(m_DK); + zap(m_ME); + zap(m_MD); + } + +void AES_192::encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const + { + verify_key_set(m_EK.empty() == false); + +#if defined(BOTAN_HAS_AES_NI) + if(CPUID::has_aes_ni()) + { + return aesni_encrypt_n(in, out, blocks); + } +#endif + +#if defined(BOTAN_HAS_AES_SSSE3) + if(CPUID::has_ssse3()) + { + return ssse3_encrypt_n(in, out, blocks); + } +#endif + +#if defined(BOTAN_HAS_AES_ARMV8) + if(CPUID::has_arm_aes()) + { + return armv8_encrypt_n(in, out, blocks); + } +#endif + +#if defined(BOTAN_HAS_AES_POWER8) + if(CPUID::has_ppc_crypto()) + { + return power8_encrypt_n(in, out, blocks); + } +#endif + + aes_encrypt_n(in, out, blocks, m_EK, m_ME); + } + +void AES_192::decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const + { + verify_key_set(m_DK.empty() == false); + +#if defined(BOTAN_HAS_AES_NI) + if(CPUID::has_aes_ni()) + { + return aesni_decrypt_n(in, out, blocks); + } +#endif + +#if defined(BOTAN_HAS_AES_SSSE3) + if(CPUID::has_ssse3()) + { + return ssse3_decrypt_n(in, out, blocks); + } +#endif + +#if defined(BOTAN_HAS_AES_ARMV8) + if(CPUID::has_arm_aes()) + { + return armv8_decrypt_n(in, out, blocks); + } +#endif + +#if defined(BOTAN_HAS_AES_POWER8) + if(CPUID::has_ppc_crypto()) + { + return power8_decrypt_n(in, out, blocks); + } +#endif + + aes_decrypt_n(in, out, blocks, m_DK, m_MD); + } + +void AES_192::key_schedule(const uint8_t key[], size_t length) + { +#if defined(BOTAN_HAS_AES_NI) + if(CPUID::has_aes_ni()) + { + return aesni_key_schedule(key, length); + } +#endif + +#if defined(BOTAN_HAS_AES_SSSE3) + if(CPUID::has_ssse3()) + { + return ssse3_key_schedule(key, length); + } +#endif + + aes_key_schedule(key, length, m_EK, m_DK, m_ME, m_MD); + } + +void AES_192::clear() + { + zap(m_EK); + zap(m_DK); + zap(m_ME); + zap(m_MD); + } + +void AES_256::encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const + { + verify_key_set(m_EK.empty() == false); + +#if defined(BOTAN_HAS_AES_NI) + if(CPUID::has_aes_ni()) + { + return aesni_encrypt_n(in, out, blocks); + } +#endif + +#if defined(BOTAN_HAS_AES_SSSE3) + if(CPUID::has_ssse3()) + { + return ssse3_encrypt_n(in, out, blocks); + } +#endif + +#if defined(BOTAN_HAS_AES_ARMV8) + if(CPUID::has_arm_aes()) + { + return armv8_encrypt_n(in, out, blocks); + } +#endif + +#if defined(BOTAN_HAS_AES_POWER8) + if(CPUID::has_ppc_crypto()) + { + return power8_encrypt_n(in, out, blocks); + } +#endif + + aes_encrypt_n(in, out, blocks, m_EK, m_ME); + } + +void AES_256::decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const + { + verify_key_set(m_DK.empty() == false); + +#if defined(BOTAN_HAS_AES_NI) + if(CPUID::has_aes_ni()) + { + return aesni_decrypt_n(in, out, blocks); + } +#endif + +#if defined(BOTAN_HAS_AES_SSSE3) + if(CPUID::has_ssse3()) + { + return ssse3_decrypt_n(in, out, blocks); + } +#endif + +#if defined(BOTAN_HAS_AES_ARMV8) + if(CPUID::has_arm_aes()) + { + return armv8_decrypt_n(in, out, blocks); + } +#endif + +#if defined(BOTAN_HAS_AES_POWER8) + if(CPUID::has_ppc_crypto()) + { + return power8_decrypt_n(in, out, blocks); + } +#endif + + aes_decrypt_n(in, out, blocks, m_DK, m_MD); + } + +void AES_256::key_schedule(const uint8_t key[], size_t length) + { +#if defined(BOTAN_HAS_AES_NI) + if(CPUID::has_aes_ni()) + { + return aesni_key_schedule(key, length); + } +#endif + +#if defined(BOTAN_HAS_AES_SSSE3) + if(CPUID::has_ssse3()) + { + return ssse3_key_schedule(key, length); + } +#endif + + aes_key_schedule(key, length, m_EK, m_DK, m_ME, m_MD); + } + +void AES_256::clear() + { + zap(m_EK); + zap(m_DK); + zap(m_ME); + zap(m_MD); + } + +} diff --git a/src/libs/3rdparty/botan/src/lib/block/aes/aes.h b/src/libs/3rdparty/botan/src/lib/block/aes/aes.h new file mode 100644 index 0000000000..294cdcad37 --- /dev/null +++ b/src/libs/3rdparty/botan/src/lib/block/aes/aes.h @@ -0,0 +1,153 @@ +/* +* AES +* (C) 1999-2010 Jack Lloyd +* +* Botan is released under the Simplified BSD License (see license.txt) +*/ + +#ifndef BOTAN_AES_H_ +#define BOTAN_AES_H_ + +#include <botan/block_cipher.h> + +namespace Botan { + +/** +* AES-128 +*/ +class BOTAN_PUBLIC_API(2,0) AES_128 final : public Block_Cipher_Fixed_Params<16, 16> + { + public: + void encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const override; + void decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const override; + + void clear() override; + + std::string provider() const override; + std::string name() const override { return "AES-128"; } + BlockCipher* clone() const override { return new AES_128; } + size_t parallelism() const override; + + private: + void key_schedule(const uint8_t key[], size_t length) override; + +#if defined(BOTAN_HAS_AES_SSSE3) + void ssse3_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; + void ssse3_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; + void ssse3_key_schedule(const uint8_t key[], size_t length); +#endif + +#if defined(BOTAN_HAS_AES_NI) + void aesni_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; + void aesni_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; + void aesni_key_schedule(const uint8_t key[], size_t length); +#endif + +#if defined(BOTAN_HAS_AES_ARMV8) + void armv8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; + void armv8_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; +#endif + +#if defined(BOTAN_HAS_AES_POWER8) + void power8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; + void power8_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; +#endif + + secure_vector<uint32_t> m_EK, m_DK; + secure_vector<uint8_t> m_ME, m_MD; + }; + +/** +* AES-192 +*/ +class BOTAN_PUBLIC_API(2,0) AES_192 final : public Block_Cipher_Fixed_Params<16, 24> + { + public: + void encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const override; + void decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const override; + + void clear() override; + + std::string provider() const override; + std::string name() const override { return "AES-192"; } + BlockCipher* clone() const override { return new AES_192; } + size_t parallelism() const override; + + private: +#if defined(BOTAN_HAS_AES_SSSE3) + void ssse3_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; + void ssse3_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; + void ssse3_key_schedule(const uint8_t key[], size_t length); +#endif + +#if defined(BOTAN_HAS_AES_NI) + void aesni_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; + void aesni_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; + void aesni_key_schedule(const uint8_t key[], size_t length); +#endif + +#if defined(BOTAN_HAS_AES_ARMV8) + void armv8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; + void armv8_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; +#endif + +#if defined(BOTAN_HAS_AES_POWER8) + void power8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; + void power8_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; +#endif + + void key_schedule(const uint8_t key[], size_t length) override; + + secure_vector<uint32_t> m_EK, m_DK; + secure_vector<uint8_t> m_ME, m_MD; + }; + +/** +* AES-256 +*/ +class BOTAN_PUBLIC_API(2,0) AES_256 final : public Block_Cipher_Fixed_Params<16, 32> + { + public: + void encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const override; + void decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const override; + + void clear() override; + + std::string provider() const override; + + std::string name() const override { return "AES-256"; } + BlockCipher* clone() const override { return new AES_256; } + size_t parallelism() const override; + + private: +#if defined(BOTAN_HAS_AES_SSSE3) + void ssse3_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; + void ssse3_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; + void ssse3_key_schedule(const uint8_t key[], size_t length); +#endif + +#if defined(BOTAN_HAS_AES_NI) + void aesni_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; + void aesni_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; + void aesni_key_schedule(const uint8_t key[], size_t length); +#endif + +#if defined(BOTAN_HAS_AES_ARMV8) + void armv8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; + void armv8_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; +#endif + +#if defined(BOTAN_HAS_AES_POWER8) + void power8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; + void power8_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const; +#endif + + void key_schedule(const uint8_t key[], size_t length) override; + + secure_vector<uint32_t> m_EK, m_DK; + secure_vector<uint8_t> m_ME, m_MD; + }; + +} + +#endif diff --git a/src/libs/3rdparty/botan/src/lib/block/aes/aes_armv8/aes_armv8.cpp b/src/libs/3rdparty/botan/src/lib/block/aes/aes_armv8/aes_armv8.cpp new file mode 100644 index 0000000000..8a332ceafd --- /dev/null +++ b/src/libs/3rdparty/botan/src/lib/block/aes/aes_armv8/aes_armv8.cpp @@ -0,0 +1,501 @@ +/* +* AES using ARMv8 +* Contributed by Jeffrey Walton +* +* Further changes +* (C) 2017,2018 Jack Lloyd +* +* Botan is released under the Simplified BSD License (see license.txt) +*/ + +#include <botan/aes.h> +#include <botan/loadstor.h> +#include <arm_neon.h> + +namespace Botan { + +#define AES_ENC_4_ROUNDS(K) \ + do \ + { \ + B0 = vaesmcq_u8(vaeseq_u8(B0, K)); \ + B1 = vaesmcq_u8(vaeseq_u8(B1, K)); \ + B2 = vaesmcq_u8(vaeseq_u8(B2, K)); \ + B3 = vaesmcq_u8(vaeseq_u8(B3, K)); \ + } while(0) + +#define AES_ENC_4_LAST_ROUNDS(K, K2) \ + do \ + { \ + B0 = veorq_u8(vaeseq_u8(B0, K), K2); \ + B1 = veorq_u8(vaeseq_u8(B1, K), K2); \ + B2 = veorq_u8(vaeseq_u8(B2, K), K2); \ + B3 = veorq_u8(vaeseq_u8(B3, K), K2); \ + } while(0) + +#define AES_DEC_4_ROUNDS(K) \ + do \ + { \ + B0 = vaesimcq_u8(vaesdq_u8(B0, K)); \ + B1 = vaesimcq_u8(vaesdq_u8(B1, K)); \ + B2 = vaesimcq_u8(vaesdq_u8(B2, K)); \ + B3 = vaesimcq_u8(vaesdq_u8(B3, K)); \ + } while(0) + +#define AES_DEC_4_LAST_ROUNDS(K, K2) \ + do \ + { \ + B0 = veorq_u8(vaesdq_u8(B0, K), K2); \ + B1 = veorq_u8(vaesdq_u8(B1, K), K2); \ + B2 = veorq_u8(vaesdq_u8(B2, K), K2); \ + B3 = veorq_u8(vaesdq_u8(B3, K), K2); \ + } while(0) + +/* +* AES-128 Encryption +*/ +BOTAN_FUNC_ISA("+crypto") +void AES_128::armv8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const + { + BOTAN_ASSERT(m_EK.empty() == false, "Key was set"); + + const uint8_t *skey = reinterpret_cast<const uint8_t*>(m_EK.data()); + const uint8_t *mkey = reinterpret_cast<const uint8_t*>(m_ME.data()); + + const uint8x16_t K0 = vld1q_u8(skey + 0); + const uint8x16_t K1 = vld1q_u8(skey + 16); + const uint8x16_t K2 = vld1q_u8(skey + 32); + const uint8x16_t K3 = vld1q_u8(skey + 48); + const uint8x16_t K4 = vld1q_u8(skey + 64); + const uint8x16_t K5 = vld1q_u8(skey + 80); + const uint8x16_t K6 = vld1q_u8(skey + 96); + const uint8x16_t K7 = vld1q_u8(skey + 112); + const uint8x16_t K8 = vld1q_u8(skey + 128); + const uint8x16_t K9 = vld1q_u8(skey + 144); + const uint8x16_t K10 = vld1q_u8(mkey); + + while(blocks >= 4) + { + uint8x16_t B0 = vld1q_u8(in); + uint8x16_t B1 = vld1q_u8(in+16); + uint8x16_t B2 = vld1q_u8(in+32); + uint8x16_t B3 = vld1q_u8(in+48); + + AES_ENC_4_ROUNDS(K0); + AES_ENC_4_ROUNDS(K1); + AES_ENC_4_ROUNDS(K2); + AES_ENC_4_ROUNDS(K3); + AES_ENC_4_ROUNDS(K4); + AES_ENC_4_ROUNDS(K5); + AES_ENC_4_ROUNDS(K6); + AES_ENC_4_ROUNDS(K7); + AES_ENC_4_ROUNDS(K8); + AES_ENC_4_LAST_ROUNDS(K9, K10); + + vst1q_u8(out, B0); + vst1q_u8(out+16, B1); + vst1q_u8(out+32, B2); + vst1q_u8(out+48, B3); + + in += 16*4; + out += 16*4; + blocks -= 4; + } + + for(size_t i = 0; i != blocks; ++i) + { + uint8x16_t B = vld1q_u8(in+16*i); + B = vaesmcq_u8(vaeseq_u8(B, K0)); + B = vaesmcq_u8(vaeseq_u8(B, K1)); + B = vaesmcq_u8(vaeseq_u8(B, K2)); + B = vaesmcq_u8(vaeseq_u8(B, K3)); + B = vaesmcq_u8(vaeseq_u8(B, K4)); + B = vaesmcq_u8(vaeseq_u8(B, K5)); + B = vaesmcq_u8(vaeseq_u8(B, K6)); + B = vaesmcq_u8(vaeseq_u8(B, K7)); + B = vaesmcq_u8(vaeseq_u8(B, K8)); + B = veorq_u8(vaeseq_u8(B, K9), K10); + vst1q_u8(out+16*i, B); + } + } + +/* +* AES-128 Decryption +*/ +BOTAN_FUNC_ISA("+crypto") +void AES_128::armv8_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const + { + BOTAN_ASSERT(m_DK.empty() == false, "Key was set"); + + const uint8_t *skey = reinterpret_cast<const uint8_t*>(m_DK.data()); + const uint8_t *mkey = reinterpret_cast<const uint8_t*>(m_MD.data()); + + const uint8x16_t K0 = vld1q_u8(skey + 0); + const uint8x16_t K1 = vld1q_u8(skey + 16); + const uint8x16_t K2 = vld1q_u8(skey + 32); + const uint8x16_t K3 = vld1q_u8(skey + 48); + const uint8x16_t K4 = vld1q_u8(skey + 64); + const uint8x16_t K5 = vld1q_u8(skey + 80); + const uint8x16_t K6 = vld1q_u8(skey + 96); + const uint8x16_t K7 = vld1q_u8(skey + 112); + const uint8x16_t K8 = vld1q_u8(skey + 128); + const uint8x16_t K9 = vld1q_u8(skey + 144); + const uint8x16_t K10 = vld1q_u8(mkey); + + while(blocks >= 4) + { + uint8x16_t B0 = vld1q_u8(in); + uint8x16_t B1 = vld1q_u8(in+16); + uint8x16_t B2 = vld1q_u8(in+32); + uint8x16_t B3 = vld1q_u8(in+48); + + AES_DEC_4_ROUNDS(K0); + AES_DEC_4_ROUNDS(K1); + AES_DEC_4_ROUNDS(K2); + AES_DEC_4_ROUNDS(K3); + AES_DEC_4_ROUNDS(K4); + AES_DEC_4_ROUNDS(K5); + AES_DEC_4_ROUNDS(K6); + AES_DEC_4_ROUNDS(K7); + AES_DEC_4_ROUNDS(K8); + AES_DEC_4_LAST_ROUNDS(K9, K10); + + vst1q_u8(out, B0); + vst1q_u8(out+16, B1); + vst1q_u8(out+32, B2); + vst1q_u8(out+48, B3); + + in += 16*4; + out += 16*4; + blocks -= 4; + } + + for(size_t i = 0; i != blocks; ++i) + { + uint8x16_t B = vld1q_u8(in+16*i); + B = vaesimcq_u8(vaesdq_u8(B, K0)); + B = vaesimcq_u8(vaesdq_u8(B, K1)); + B = vaesimcq_u8(vaesdq_u8(B, K2)); + B = vaesimcq_u8(vaesdq_u8(B, K3)); + B = vaesimcq_u8(vaesdq_u8(B, K4)); + B = vaesimcq_u8(vaesdq_u8(B, K5)); + B = vaesimcq_u8(vaesdq_u8(B, K6)); + B = vaesimcq_u8(vaesdq_u8(B, K7)); + B = vaesimcq_u8(vaesdq_u8(B, K8)); + B = veorq_u8(vaesdq_u8(B, K9), K10); + vst1q_u8(out+16*i, B); + } + } + +/* +* AES-192 Encryption +*/ +BOTAN_FUNC_ISA("+crypto") +void AES_192::armv8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const + { + BOTAN_ASSERT(m_EK.empty() == false, "Key was set"); + + const uint8_t *skey = reinterpret_cast<const uint8_t*>(m_EK.data()); + const uint8_t *mkey = reinterpret_cast<const uint8_t*>(m_ME.data()); + + const uint8x16_t K0 = vld1q_u8(skey + 0); + const uint8x16_t K1 = vld1q_u8(skey + 16); + const uint8x16_t K2 = vld1q_u8(skey + 32); + const uint8x16_t K3 = vld1q_u8(skey + 48); + const uint8x16_t K4 = vld1q_u8(skey + 64); + const uint8x16_t K5 = vld1q_u8(skey + 80); + const uint8x16_t K6 = vld1q_u8(skey + 96); + const uint8x16_t K7 = vld1q_u8(skey + 112); + const uint8x16_t K8 = vld1q_u8(skey + 128); + const uint8x16_t K9 = vld1q_u8(skey + 144); + const uint8x16_t K10 = vld1q_u8(skey + 160); + const uint8x16_t K11 = vld1q_u8(skey + 176); + const uint8x16_t K12 = vld1q_u8(mkey); + + while(blocks >= 4) + { + uint8x16_t B0 = vld1q_u8(in); + uint8x16_t B1 = vld1q_u8(in+16); + uint8x16_t B2 = vld1q_u8(in+32); + uint8x16_t B3 = vld1q_u8(in+48); + + AES_ENC_4_ROUNDS(K0); + AES_ENC_4_ROUNDS(K1); + AES_ENC_4_ROUNDS(K2); + AES_ENC_4_ROUNDS(K3); + AES_ENC_4_ROUNDS(K4); + AES_ENC_4_ROUNDS(K5); + AES_ENC_4_ROUNDS(K6); + AES_ENC_4_ROUNDS(K7); + AES_ENC_4_ROUNDS(K8); + AES_ENC_4_ROUNDS(K9); + AES_ENC_4_ROUNDS(K10); + AES_ENC_4_LAST_ROUNDS(K11, K12); + + vst1q_u8(out, B0); + vst1q_u8(out+16, B1); + vst1q_u8(out+32, B2); + vst1q_u8(out+48, B3); + + in += 16*4; + out += 16*4; + blocks -= 4; + } + + for(size_t i = 0; i != blocks; ++i) + { + uint8x16_t B = vld1q_u8(in+16*i); + B = vaesmcq_u8(vaeseq_u8(B, K0)); + B = vaesmcq_u8(vaeseq_u8(B, K1)); + B = vaesmcq_u8(vaeseq_u8(B, K2)); + B = vaesmcq_u8(vaeseq_u8(B, K3)); + B = vaesmcq_u8(vaeseq_u8(B, K4)); + B = vaesmcq_u8(vaeseq_u8(B, K5)); + B = vaesmcq_u8(vaeseq_u8(B, K6)); + B = vaesmcq_u8(vaeseq_u8(B, K7)); + B = vaesmcq_u8(vaeseq_u8(B, K8)); + B = vaesmcq_u8(vaeseq_u8(B, K9)); + B = vaesmcq_u8(vaeseq_u8(B, K10)); + B = veorq_u8(vaeseq_u8(B, K11), K12); + vst1q_u8(out+16*i, B); + } + } + +/* +* AES-192 Decryption +*/ +BOTAN_FUNC_ISA("+crypto") +void AES_192::armv8_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const + { + BOTAN_ASSERT(m_DK.empty() == false, "Key was set"); + const uint8_t *skey = reinterpret_cast<const uint8_t*>(m_DK.data()); + const uint8_t *mkey = reinterpret_cast<const uint8_t*>(m_MD.data()); + + const uint8x16_t K0 = vld1q_u8(skey + 0); + const uint8x16_t K1 = vld1q_u8(skey + 16); + const uint8x16_t K2 = vld1q_u8(skey + 32); + const uint8x16_t K3 = vld1q_u8(skey + 48); + const uint8x16_t K4 = vld1q_u8(skey + 64); + const uint8x16_t K5 = vld1q_u8(skey + 80); + const uint8x16_t K6 = vld1q_u8(skey + 96); + const uint8x16_t K7 = vld1q_u8(skey + 112); + const uint8x16_t K8 = vld1q_u8(skey + 128); + const uint8x16_t K9 = vld1q_u8(skey + 144); + const uint8x16_t K10 = vld1q_u8(skey + 160); + const uint8x16_t K11 = vld1q_u8(skey + 176); + const uint8x16_t K12 = vld1q_u8(mkey); + + while(blocks >= 4) + { + uint8x16_t B0 = vld1q_u8(in); + uint8x16_t B1 = vld1q_u8(in+16); + uint8x16_t B2 = vld1q_u8(in+32); + uint8x16_t B3 = vld1q_u8(in+48); + + AES_DEC_4_ROUNDS(K0); + AES_DEC_4_ROUNDS(K1); + AES_DEC_4_ROUNDS(K2); + AES_DEC_4_ROUNDS(K3); + AES_DEC_4_ROUNDS(K4); + AES_DEC_4_ROUNDS(K5); + AES_DEC_4_ROUNDS(K6); + AES_DEC_4_ROUNDS(K7); + AES_DEC_4_ROUNDS(K8); + AES_DEC_4_ROUNDS(K9); + AES_DEC_4_ROUNDS(K10); + AES_DEC_4_LAST_ROUNDS(K11, K12); + + vst1q_u8(out, B0); + vst1q_u8(out+16, B1); + vst1q_u8(out+32, B2); + vst1q_u8(out+48, B3); + + in += 16*4; + out += 16*4; + blocks -= 4; + } + + for(size_t i = 0; i != blocks; ++i) + { + uint8x16_t B = vld1q_u8(in+16*i); + B = vaesimcq_u8(vaesdq_u8(B, K0)); + B = vaesimcq_u8(vaesdq_u8(B, K1)); + B = vaesimcq_u8(vaesdq_u8(B, K2)); + B = vaesimcq_u8(vaesdq_u8(B, K3)); + B = vaesimcq_u8(vaesdq_u8(B, K4)); + B = vaesimcq_u8(vaesdq_u8(B, K5)); + B = vaesimcq_u8(vaesdq_u8(B, K6)); + B = vaesimcq_u8(vaesdq_u8(B, K7)); + B = vaesimcq_u8(vaesdq_u8(B, K8)); + B = vaesimcq_u8(vaesdq_u8(B, K9)); + B = vaesimcq_u8(vaesdq_u8(B, K10)); + B = veorq_u8(vaesdq_u8(B, K11), K12); + vst1q_u8(out+16*i, B); + } + } + +/* +* AES-256 Encryption +*/ +BOTAN_FUNC_ISA("+crypto") +void AES_256::armv8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const + { + BOTAN_ASSERT(m_EK.empty() == false, "Key was set"); + + const uint8_t *skey = reinterpret_cast<const uint8_t*>(m_EK.data()); + const uint8_t *mkey = reinterpret_cast<const uint8_t*>(m_ME.data()); + + const uint8x16_t K0 = vld1q_u8(skey + 0); + const uint8x16_t K1 = vld1q_u8(skey + 16); + const uint8x16_t K2 = vld1q_u8(skey + 32); + const uint8x16_t K3 = vld1q_u8(skey + 48); + const uint8x16_t K4 = vld1q_u8(skey + 64); + const uint8x16_t K5 = vld1q_u8(skey + 80); + const uint8x16_t K6 = vld1q_u8(skey + 96); + const uint8x16_t K7 = vld1q_u8(skey + 112); + const uint8x16_t K8 = vld1q_u8(skey + 128); + const uint8x16_t K9 = vld1q_u8(skey + 144); + const uint8x16_t K10 = vld1q_u8(skey + 160); + const uint8x16_t K11 = vld1q_u8(skey + 176); + const uint8x16_t K12 = vld1q_u8(skey + 192); + const uint8x16_t K13 = vld1q_u8(skey + 208); + const uint8x16_t K14 = vld1q_u8(mkey); + + while(blocks >= 4) + { + uint8x16_t B0 = vld1q_u8(in); + uint8x16_t B1 = vld1q_u8(in+16); + uint8x16_t B2 = vld1q_u8(in+32); + uint8x16_t B3 = vld1q_u8(in+48); + + AES_ENC_4_ROUNDS(K0); + AES_ENC_4_ROUNDS(K1); + AES_ENC_4_ROUNDS(K2); + AES_ENC_4_ROUNDS(K3); + AES_ENC_4_ROUNDS(K4); + AES_ENC_4_ROUNDS(K5); + AES_ENC_4_ROUNDS(K6); + AES_ENC_4_ROUNDS(K7); + AES_ENC_4_ROUNDS(K8); + AES_ENC_4_ROUNDS(K9); + AES_ENC_4_ROUNDS(K10); + AES_ENC_4_ROUNDS(K11); + AES_ENC_4_ROUNDS(K12); + AES_ENC_4_LAST_ROUNDS(K13, K14); + + vst1q_u8(out, B0); + vst1q_u8(out+16, B1); + vst1q_u8(out+32, B2); + vst1q_u8(out+48, B3); + + in += 16*4; + out += 16*4; + blocks -= 4; + } + + for(size_t i = 0; i != blocks; ++i) + { + uint8x16_t B = vld1q_u8(in+16*i); + B = vaesmcq_u8(vaeseq_u8(B, K0)); + B = vaesmcq_u8(vaeseq_u8(B, K1)); + B = vaesmcq_u8(vaeseq_u8(B, K2)); + B = vaesmcq_u8(vaeseq_u8(B, K3)); + B = vaesmcq_u8(vaeseq_u8(B, K4)); + B = vaesmcq_u8(vaeseq_u8(B, K5)); + B = vaesmcq_u8(vaeseq_u8(B, K6)); + B = vaesmcq_u8(vaeseq_u8(B, K7)); + B = vaesmcq_u8(vaeseq_u8(B, K8)); + B = vaesmcq_u8(vaeseq_u8(B, K9)); + B = vaesmcq_u8(vaeseq_u8(B, K10)); + B = vaesmcq_u8(vaeseq_u8(B, K11)); + B = vaesmcq_u8(vaeseq_u8(B, K12)); + B = veorq_u8(vaeseq_u8(B, K13), K14); + vst1q_u8(out+16*i, B); + } + } + +/* +* AES-256 Decryption +*/ +BOTAN_FUNC_ISA("+crypto") +void AES_256::armv8_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const + { + BOTAN_ASSERT(m_DK.empty() == false, "Key was set"); + + const uint8_t *skey = reinterpret_cast<const uint8_t*>(m_DK.data()); + const uint8_t *mkey = reinterpret_cast<const uint8_t*>(m_MD.data()); + + const uint8x16_t K0 = vld1q_u8(skey + 0); + const uint8x16_t K1 = vld1q_u8(skey + 16); + const uint8x16_t K2 = vld1q_u8(skey + 32); + const uint8x16_t K3 = vld1q_u8(skey + 48); + const uint8x16_t K4 = vld1q_u8(skey + 64); + const uint8x16_t K5 = vld1q_u8(skey + 80); + const uint8x16_t K6 = vld1q_u8(skey + 96); + const uint8x16_t K7 = vld1q_u8(skey + 112); + const uint8x16_t K8 = vld1q_u8(skey + 128); + const uint8x16_t K9 = vld1q_u8(skey + 144); + const uint8x16_t K10 = vld1q_u8(skey + 160); + const uint8x16_t K11 = vld1q_u8(skey + 176); + const uint8x16_t K12 = vld1q_u8(skey + 192); + const uint8x16_t K13 = vld1q_u8(skey + 208); + const uint8x16_t K14 = vld1q_u8(mkey); + + while(blocks >= 4) + { + uint8x16_t B0 = vld1q_u8(in); + uint8x16_t B1 = vld1q_u8(in+16); + uint8x16_t B2 = vld1q_u8(in+32); + uint8x16_t B3 = vld1q_u8(in+48); + + AES_DEC_4_ROUNDS(K0); + AES_DEC_4_ROUNDS(K1); + AES_DEC_4_ROUNDS(K2); + AES_DEC_4_ROUNDS(K3); + AES_DEC_4_ROUNDS(K4); + AES_DEC_4_ROUNDS(K5); + AES_DEC_4_ROUNDS(K6); + AES_DEC_4_ROUNDS(K7); + AES_DEC_4_ROUNDS(K8); + AES_DEC_4_ROUNDS(K9); + AES_DEC_4_ROUNDS(K10); + AES_DEC_4_ROUNDS(K11); + AES_DEC_4_ROUNDS(K12); + AES_DEC_4_LAST_ROUNDS(K13, K14); + + vst1q_u8(out, B0); + vst1q_u8(out+16, B1); + vst1q_u8(out+32, B2); + vst1q_u8(out+48, B3); + + in += 16*4; + out += 16*4; + blocks -= 4; + } + + for(size_t i = 0; i != blocks; ++i) + { + uint8x16_t B = vld1q_u8(in+16*i); + B = vaesimcq_u8(vaesdq_u8(B, K0)); + B = vaesimcq_u8(vaesdq_u8(B, K1)); + B = vaesimcq_u8(vaesdq_u8(B, K2)); + B = vaesimcq_u8(vaesdq_u8(B, K3)); + B = vaesimcq_u8(vaesdq_u8(B, K4)); + B = vaesimcq_u8(vaesdq_u8(B, K5)); + B = vaesimcq_u8(vaesdq_u8(B, K6)); + B = vaesimcq_u8(vaesdq_u8(B, K7)); + B = vaesimcq_u8(vaesdq_u8(B, K8)); + B = vaesimcq_u8(vaesdq_u8(B, K9)); + B = vaesimcq_u8(vaesdq_u8(B, K10)); + B = vaesimcq_u8(vaesdq_u8(B, K11)); + B = vaesimcq_u8(vaesdq_u8(B, K12)); + B = veorq_u8(vaesdq_u8(B, K13), K14); + vst1q_u8(out+16*i, B); + } + } + +#undef AES_ENC_4_ROUNDS +#undef AES_ENC_4_LAST_ROUNDS +#undef AES_DEC_4_ROUNDS +#undef AES_DEC_4_LAST_ROUNDS + +} diff --git a/src/libs/3rdparty/botan/src/lib/block/aes/aes_armv8/info.txt b/src/libs/3rdparty/botan/src/lib/block/aes/aes_armv8/info.txt new file mode 100644 index 0000000000..08d51a1b2d --- /dev/null +++ b/src/libs/3rdparty/botan/src/lib/block/aes/aes_armv8/info.txt @@ -0,0 +1,10 @@ +<defines> +AES_ARMV8 -> 20170903 +</defines> + +need_isa armv8crypto + +<cc> +gcc:5 +clang:3.8 +</cc> diff --git a/src/libs/3rdparty/botan/src/lib/block/aes/aes_ni/aes_ni.cpp b/src/libs/3rdparty/botan/src/lib/block/aes/aes_ni/aes_ni.cpp new file mode 100644 index 0000000000..9f1ba8fcc2 --- /dev/null +++ b/src/libs/3rdparty/botan/src/lib/block/aes/aes_ni/aes_ni.cpp @@ -0,0 +1,792 @@ +/* +* AES using AES-NI instructions +* (C) 2009,2012 Jack Lloyd +* +* Botan is released under the Simplified BSD License (see license.txt) +*/ + +#include <botan/aes.h> +#include <botan/loadstor.h> +#include <wmmintrin.h> + +namespace Botan { + +namespace { + +BOTAN_FUNC_ISA("ssse3") +__m128i aes_128_key_expansion(__m128i key, __m128i key_with_rcon) + { + key_with_rcon = _mm_shuffle_epi32(key_with_rcon, _MM_SHUFFLE(3,3,3,3)); + key = _mm_xor_si128(key, _mm_slli_si128(key, 4)); + key = _mm_xor_si128(key, _mm_slli_si128(key, 4)); + key = _mm_xor_si128(key, _mm_slli_si128(key, 4)); + return _mm_xor_si128(key, key_with_rcon); + } + +BOTAN_FUNC_ISA("ssse3") +void aes_192_key_expansion(__m128i* K1, __m128i* K2, __m128i key2_with_rcon, + uint32_t out[], bool last) + { + __m128i key1 = *K1; + __m128i key2 = *K2; + + key2_with_rcon = _mm_shuffle_epi32(key2_with_rcon, _MM_SHUFFLE(1,1,1,1)); + key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4)); + key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4)); + key1 = _mm_xor_si128(key1, _mm_slli_si128(key1, 4)); + key1 = _mm_xor_si128(key1, key2_with_rcon); + + *K1 = key1; + _mm_storeu_si128(reinterpret_cast<__m128i*>(out), key1); + + if(last) + return; + + key2 = _mm_xor_si128(key2, _mm_slli_si128(key2, 4)); + key2 = _mm_xor_si128(key2, _mm_shuffle_epi32(key1, _MM_SHUFFLE(3,3,3,3))); + + *K2 = key2; + out[4] = _mm_cvtsi128_si32(key2); + out[5] = _mm_cvtsi128_si32(_mm_srli_si128(key2, 4)); + } + +/* +* The second half of the AES-256 key expansion (other half same as AES-128) +*/ +BOTAN_FUNC_ISA("ssse3,aes") +__m128i aes_256_key_expansion(__m128i key, __m128i key2) + { + __m128i key_with_rcon = _mm_aeskeygenassist_si128(key2, 0x00); + key_with_rcon = _mm_shuffle_epi32(key_with_rcon, _MM_SHUFFLE(2,2,2,2)); + + key = _mm_xor_si128(key, _mm_slli_si128(key, 4)); + key = _mm_xor_si128(key, _mm_slli_si128(key, 4)); + key = _mm_xor_si128(key, _mm_slli_si128(key, 4)); + return _mm_xor_si128(key, key_with_rcon); + } + +} + +#define AES_ENC_4_ROUNDS(K) \ + do \ + { \ + B0 = _mm_aesenc_si128(B0, K); \ + B1 = _mm_aesenc_si128(B1, K); \ + B2 = _mm_aesenc_si128(B2, K); \ + B3 = _mm_aesenc_si128(B3, K); \ + } while(0) + +#define AES_ENC_4_LAST_ROUNDS(K) \ + do \ + { \ + B0 = _mm_aesenclast_si128(B0, K); \ + B1 = _mm_aesenclast_si128(B1, K); \ + B2 = _mm_aesenclast_si128(B2, K); \ + B3 = _mm_aesenclast_si128(B3, K); \ + } while(0) + +#define AES_DEC_4_ROUNDS(K) \ + do \ + { \ + B0 = _mm_aesdec_si128(B0, K); \ + B1 = _mm_aesdec_si128(B1, K); \ + B2 = _mm_aesdec_si128(B2, K); \ + B3 = _mm_aesdec_si128(B3, K); \ + } while(0) + +#define AES_DEC_4_LAST_ROUNDS(K) \ + do \ + { \ + B0 = _mm_aesdeclast_si128(B0, K); \ + B1 = _mm_aesdeclast_si128(B1, K); \ + B2 = _mm_aesdeclast_si128(B2, K); \ + B3 = _mm_aesdeclast_si128(B3, K); \ + } while(0) + +/* +* AES-128 Encryption +*/ +BOTAN_FUNC_ISA("ssse3,aes") +void AES_128::aesni_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const + { + BOTAN_ASSERT(m_EK.empty() == false, "Key was set"); + + const __m128i* in_mm = reinterpret_cast<const __m128i*>(in); + __m128i* out_mm = reinterpret_cast<__m128i*>(out); + + const __m128i* key_mm = reinterpret_cast<const __m128i*>(m_EK.data()); + + const __m128i K0 = _mm_loadu_si128(key_mm); + const __m128i K1 = _mm_loadu_si128(key_mm + 1); + const __m128i K2 = _mm_loadu_si128(key_mm + 2); + const __m128i K3 = _mm_loadu_si128(key_mm + 3); + const __m128i K4 = _mm_loadu_si128(key_mm + 4); + const __m128i K5 = _mm_loadu_si128(key_mm + 5); + const __m128i K6 = _mm_loadu_si128(key_mm + 6); + const __m128i K7 = _mm_loadu_si128(key_mm + 7); + const __m128i K8 = _mm_loadu_si128(key_mm + 8); + const __m128i K9 = _mm_loadu_si128(key_mm + 9); + const __m128i K10 = _mm_loadu_si128(key_mm + 10); + + while(blocks >= 4) + { + __m128i B0 = _mm_loadu_si128(in_mm + 0); + __m128i B1 = _mm_loadu_si128(in_mm + 1); + __m128i B2 = _mm_loadu_si128(in_mm + 2); + __m128i B3 = _mm_loadu_si128(in_mm + 3); + + B0 = _mm_xor_si128(B0, K0); + B1 = _mm_xor_si128(B1, K0); + B2 = _mm_xor_si128(B2, K0); + B3 = _mm_xor_si128(B3, K0); + + AES_ENC_4_ROUNDS(K1); + AES_ENC_4_ROUNDS(K2); + AES_ENC_4_ROUNDS(K3); + AES_ENC_4_ROUNDS(K4); + AES_ENC_4_ROUNDS(K5); + AES_ENC_4_ROUNDS(K6); + AES_ENC_4_ROUNDS(K7); + AES_ENC_4_ROUNDS(K8); + AES_ENC_4_ROUNDS(K9); + AES_ENC_4_LAST_ROUNDS(K10); + + _mm_storeu_si128(out_mm + 0, B0); + _mm_storeu_si128(out_mm + 1, B1); + _mm_storeu_si128(out_mm + 2, B2); + _mm_storeu_si128(out_mm + 3, B3); + + blocks -= 4; + in_mm += 4; + out_mm += 4; + } + + for(size_t i = 0; i != blocks; ++i) + { + __m128i B = _mm_loadu_si128(in_mm + i); + + B = _mm_xor_si128(B, K0); + + B = _mm_aesenc_si128(B, K1); + B = _mm_aesenc_si128(B, K2); + B = _mm_aesenc_si128(B, K3); + B = _mm_aesenc_si128(B, K4); + B = _mm_aesenc_si128(B, K5); + B = _mm_aesenc_si128(B, K6); + B = _mm_aesenc_si128(B, K7); + B = _mm_aesenc_si128(B, K8); + B = _mm_aesenc_si128(B, K9); + B = _mm_aesenclast_si128(B, K10); + + _mm_storeu_si128(out_mm + i, B); + } + } + +/* +* AES-128 Decryption +*/ +BOTAN_FUNC_ISA("ssse3,aes") +void AES_128::aesni_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const + { + BOTAN_ASSERT(m_DK.empty() == false, "Key was set"); + + const __m128i* in_mm = reinterpret_cast<const __m128i*>(in); + __m128i* out_mm = reinterpret_cast<__m128i*>(out); + + const __m128i* key_mm = reinterpret_cast<const __m128i*>(m_DK.data()); + + const __m128i K0 = _mm_loadu_si128(key_mm); + const __m128i K1 = _mm_loadu_si128(key_mm + 1); + const __m128i K2 = _mm_loadu_si128(key_mm + 2); + const __m128i K3 = _mm_loadu_si128(key_mm + 3); + const __m128i K4 = _mm_loadu_si128(key_mm + 4); + const __m128i K5 = _mm_loadu_si128(key_mm + 5); + const __m128i K6 = _mm_loadu_si128(key_mm + 6); + const __m128i K7 = _mm_loadu_si128(key_mm + 7); + const __m128i K8 = _mm_loadu_si128(key_mm + 8); + const __m128i K9 = _mm_loadu_si128(key_mm + 9); + const __m128i K10 = _mm_loadu_si128(key_mm + 10); + + while(blocks >= 4) + { + __m128i B0 = _mm_loadu_si128(in_mm + 0); + __m128i B1 = _mm_loadu_si128(in_mm + 1); + __m128i B2 = _mm_loadu_si128(in_mm + 2); + __m128i B3 = _mm_loadu_si128(in_mm + 3); + + B0 = _mm_xor_si128(B0, K0); + B1 = _mm_xor_si128(B1, K0); + B2 = _mm_xor_si128(B2, K0); + B3 = _mm_xor_si128(B3, K0); + + AES_DEC_4_ROUNDS(K1); + AES_DEC_4_ROUNDS(K2); + AES_DEC_4_ROUNDS(K3); + AES_DEC_4_ROUNDS(K4); + AES_DEC_4_ROUNDS(K5); + AES_DEC_4_ROUNDS(K6); + AES_DEC_4_ROUNDS(K7); + AES_DEC_4_ROUNDS(K8); + AES_DEC_4_ROUNDS(K9); + AES_DEC_4_LAST_ROUNDS(K10); + + _mm_storeu_si128(out_mm + 0, B0); + _mm_storeu_si128(out_mm + 1, B1); + _mm_storeu_si128(out_mm + 2, B2); + _mm_storeu_si128(out_mm + 3, B3); + + blocks -= 4; + in_mm += 4; + out_mm += 4; + } + + for(size_t i = 0; i != blocks; ++i) + { + __m128i B = _mm_loadu_si128(in_mm + i); + + B = _mm_xor_si128(B, K0); + + B = _mm_aesdec_si128(B, K1); + B = _mm_aesdec_si128(B, K2); + B = _mm_aesdec_si128(B, K3); + B = _mm_aesdec_si128(B, K4); + B = _mm_aesdec_si128(B, K5); + B = _mm_aesdec_si128(B, K6); + B = _mm_aesdec_si128(B, K7); + B = _mm_aesdec_si128(B, K8); + B = _mm_aesdec_si128(B, K9); + B = _mm_aesdeclast_si128(B, K10); + + _mm_storeu_si128(out_mm + i, B); + } + } + +/* +* AES-128 Key Schedule +*/ +BOTAN_FUNC_ISA("ssse3,aes") +void AES_128::aesni_key_schedule(const uint8_t key[], size_t) + { + m_EK.resize(44); + m_DK.resize(44); + + #define AES_128_key_exp(K, RCON) \ + aes_128_key_expansion(K, _mm_aeskeygenassist_si128(K, RCON)) + + const __m128i K0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key)); + const __m128i K1 = AES_128_key_exp(K0, 0x01); + const __m128i K2 = AES_128_key_exp(K1, 0x02); + const __m128i K3 = AES_128_key_exp(K2, 0x04); + const __m128i K4 = AES_128_key_exp(K3, 0x08); + const __m128i K5 = AES_128_key_exp(K4, 0x10); + const __m128i K6 = AES_128_key_exp(K5, 0x20); + const __m128i K7 = AES_128_key_exp(K6, 0x40); + const __m128i K8 = AES_128_key_exp(K7, 0x80); + const __m128i K9 = AES_128_key_exp(K8, 0x1B); + const __m128i K10 = AES_128_key_exp(K9, 0x36); + + __m128i* EK_mm = reinterpret_cast<__m128i*>(m_EK.data()); + _mm_storeu_si128(EK_mm , K0); + _mm_storeu_si128(EK_mm + 1, K1); + _mm_storeu_si128(EK_mm + 2, K2); + _mm_storeu_si128(EK_mm + 3, K3); + _mm_storeu_si128(EK_mm + 4, K4); + _mm_storeu_si128(EK_mm + 5, K5); + _mm_storeu_si128(EK_mm + 6, K6); + _mm_storeu_si128(EK_mm + 7, K7); + _mm_storeu_si128(EK_mm + 8, K8); + _mm_storeu_si128(EK_mm + 9, K9); + _mm_storeu_si128(EK_mm + 10, K10); + + // Now generate decryption keys + + __m128i* DK_mm = reinterpret_cast<__m128i*>(m_DK.data()); + _mm_storeu_si128(DK_mm , K10); + _mm_storeu_si128(DK_mm + 1, _mm_aesimc_si128(K9)); + _mm_storeu_si128(DK_mm + 2, _mm_aesimc_si128(K8)); + _mm_storeu_si128(DK_mm + 3, _mm_aesimc_si128(K7)); + _mm_storeu_si128(DK_mm + 4, _mm_aesimc_si128(K6)); + _mm_storeu_si128(DK_mm + 5, _mm_aesimc_si128(K5)); + _mm_storeu_si128(DK_mm + 6, _mm_aesimc_si128(K4)); + _mm_storeu_si128(DK_mm + 7, _mm_aesimc_si128(K3)); + _mm_storeu_si128(DK_mm + 8, _mm_aesimc_si128(K2)); + _mm_storeu_si128(DK_mm + 9, _mm_aesimc_si128(K1)); + _mm_storeu_si128(DK_mm + 10, K0); + } + +/* +* AES-192 Encryption +*/ +BOTAN_FUNC_ISA("ssse3,aes") +void AES_192::aesni_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const + { + BOTAN_ASSERT(m_EK.empty() == false, "Key was set"); + + const __m128i* in_mm = reinterpret_cast<const __m128i*>(in); + __m128i* out_mm = reinterpret_cast<__m128i*>(out); + + const __m128i* key_mm = reinterpret_cast<const __m128i*>(m_EK.data()); + + const __m128i K0 = _mm_loadu_si128(key_mm); + const __m128i K1 = _mm_loadu_si128(key_mm + 1); + const __m128i K2 = _mm_loadu_si128(key_mm + 2); + const __m128i K3 = _mm_loadu_si128(key_mm + 3); + const __m128i K4 = _mm_loadu_si128(key_mm + 4); + const __m128i K5 = _mm_loadu_si128(key_mm + 5); + const __m128i K6 = _mm_loadu_si128(key_mm + 6); + const __m128i K7 = _mm_loadu_si128(key_mm + 7); + const __m128i K8 = _mm_loadu_si128(key_mm + 8); + const __m128i K9 = _mm_loadu_si128(key_mm + 9); + const __m128i K10 = _mm_loadu_si128(key_mm + 10); + const __m128i K11 = _mm_loadu_si128(key_mm + 11); + const __m128i K12 = _mm_loadu_si128(key_mm + 12); + + while(blocks >= 4) + { + __m128i B0 = _mm_loadu_si128(in_mm + 0); + __m128i B1 = _mm_loadu_si128(in_mm + 1); + __m128i B2 = _mm_loadu_si128(in_mm + 2); + __m128i B3 = _mm_loadu_si128(in_mm + 3); + + B0 = _mm_xor_si128(B0, K0); + B1 = _mm_xor_si128(B1, K0); + B2 = _mm_xor_si128(B2, K0); + B3 = _mm_xor_si128(B3, K0); + + AES_ENC_4_ROUNDS(K1); + AES_ENC_4_ROUNDS(K2); + AES_ENC_4_ROUNDS(K3); + AES_ENC_4_ROUNDS(K4); + AES_ENC_4_ROUNDS(K5); + AES_ENC_4_ROUNDS(K6); + AES_ENC_4_ROUNDS(K7); + AES_ENC_4_ROUNDS(K8); + AES_ENC_4_ROUNDS(K9); + AES_ENC_4_ROUNDS(K10); + AES_ENC_4_ROUNDS(K11); + AES_ENC_4_LAST_ROUNDS(K12); + + _mm_storeu_si128(out_mm + 0, B0); + _mm_storeu_si128(out_mm + 1, B1); + _mm_storeu_si128(out_mm + 2, B2); + _mm_storeu_si128(out_mm + 3, B3); + + blocks -= 4; + in_mm += 4; + out_mm += 4; + } + + for(size_t i = 0; i != blocks; ++i) + { + __m128i B = _mm_loadu_si128(in_mm + i); + + B = _mm_xor_si128(B, K0); + + B = _mm_aesenc_si128(B, K1); + B = _mm_aesenc_si128(B, K2); + B = _mm_aesenc_si128(B, K3); + B = _mm_aesenc_si128(B, K4); + B = _mm_aesenc_si128(B, K5); + B = _mm_aesenc_si128(B, K6); + B = _mm_aesenc_si128(B, K7); + B = _mm_aesenc_si128(B, K8); + B = _mm_aesenc_si128(B, K9); + B = _mm_aesenc_si128(B, K10); + B = _mm_aesenc_si128(B, K11); + B = _mm_aesenclast_si128(B, K12); + + _mm_storeu_si128(out_mm + i, B); + } + } + +/* +* AES-192 Decryption +*/ +BOTAN_FUNC_ISA("ssse3,aes") +void AES_192::aesni_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const + { + BOTAN_ASSERT(m_DK.empty() == false, "Key was set"); + + const __m128i* in_mm = reinterpret_cast<const __m128i*>(in); + __m128i* out_mm = reinterpret_cast<__m128i*>(out); + + const __m128i* key_mm = reinterpret_cast<const __m128i*>(m_DK.data()); + + const __m128i K0 = _mm_loadu_si128(key_mm); + const __m128i K1 = _mm_loadu_si128(key_mm + 1); + const __m128i K2 = _mm_loadu_si128(key_mm + 2); + const __m128i K3 = _mm_loadu_si128(key_mm + 3); + const __m128i K4 = _mm_loadu_si128(key_mm + 4); + const __m128i K5 = _mm_loadu_si128(key_mm + 5); + const __m128i K6 = _mm_loadu_si128(key_mm + 6); + const __m128i K7 = _mm_loadu_si128(key_mm + 7); + const __m128i K8 = _mm_loadu_si128(key_mm + 8); + const __m128i K9 = _mm_loadu_si128(key_mm + 9); + const __m128i K10 = _mm_loadu_si128(key_mm + 10); + const __m128i K11 = _mm_loadu_si128(key_mm + 11); + const __m128i K12 = _mm_loadu_si128(key_mm + 12); + + while(blocks >= 4) + { + __m128i B0 = _mm_loadu_si128(in_mm + 0); + __m128i B1 = _mm_loadu_si128(in_mm + 1); + __m128i B2 = _mm_loadu_si128(in_mm + 2); + __m128i B3 = _mm_loadu_si128(in_mm + 3); + + B0 = _mm_xor_si128(B0, K0); + B1 = _mm_xor_si128(B1, K0); + B2 = _mm_xor_si128(B2, K0); + B3 = _mm_xor_si128(B3, K0); + + AES_DEC_4_ROUNDS(K1); + AES_DEC_4_ROUNDS(K2); + AES_DEC_4_ROUNDS(K3); + AES_DEC_4_ROUNDS(K4); + AES_DEC_4_ROUNDS(K5); + AES_DEC_4_ROUNDS(K6); + AES_DEC_4_ROUNDS(K7); + AES_DEC_4_ROUNDS(K8); + AES_DEC_4_ROUNDS(K9); + AES_DEC_4_ROUNDS(K10); + AES_DEC_4_ROUNDS(K11); + AES_DEC_4_LAST_ROUNDS(K12); + + _mm_storeu_si128(out_mm + 0, B0); + _mm_storeu_si128(out_mm + 1, B1); + _mm_storeu_si128(out_mm + 2, B2); + _mm_storeu_si128(out_mm + 3, B3); + + blocks -= 4; + in_mm += 4; + out_mm += 4; + } + + for(size_t i = 0; i != blocks; ++i) + { + __m128i B = _mm_loadu_si128(in_mm + i); + + B = _mm_xor_si128(B, K0); + + B = _mm_aesdec_si128(B, K1); + B = _mm_aesdec_si128(B, K2); + B = _mm_aesdec_si128(B, K3); + B = _mm_aesdec_si128(B, K4); + B = _mm_aesdec_si128(B, K5); + B = _mm_aesdec_si128(B, K6); + B = _mm_aesdec_si128(B, K7); + B = _mm_aesdec_si128(B, K8); + B = _mm_aesdec_si128(B, K9); + B = _mm_aesdec_si128(B, K10); + B = _mm_aesdec_si128(B, K11); + B = _mm_aesdeclast_si128(B, K12); + + _mm_storeu_si128(out_mm + i, B); + } + } + +/* +* AES-192 Key Schedule +*/ +BOTAN_FUNC_ISA("ssse3,aes") +void AES_192::aesni_key_schedule(const uint8_t key[], size_t) + { + m_EK.resize(52); + m_DK.resize(52); + + __m128i K0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key)); + __m128i K1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key + 8)); + K1 = _mm_srli_si128(K1, 8); + + load_le(m_EK.data(), key, 6); + + #define AES_192_key_exp(RCON, EK_OFF) \ + aes_192_key_expansion(&K0, &K1, \ + _mm_aeskeygenassist_si128(K1, RCON), \ + &m_EK[EK_OFF], EK_OFF == 48) + + AES_192_key_exp(0x01, 6); + AES_192_key_exp(0x02, 12); + AES_192_key_exp(0x04, 18); + AES_192_key_exp(0x08, 24); + AES_192_key_exp(0x10, 30); + AES_192_key_exp(0x20, 36); + AES_192_key_exp(0x40, 42); + AES_192_key_exp(0x80, 48); + + #undef AES_192_key_exp + + // Now generate decryption keys + const __m128i* EK_mm = reinterpret_cast<const __m128i*>(m_EK.data()); + + __m128i* DK_mm = reinterpret_cast<__m128i*>(m_DK.data()); + _mm_storeu_si128(DK_mm , _mm_loadu_si128(EK_mm + 12)); + _mm_storeu_si128(DK_mm + 1, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 11))); + _mm_storeu_si128(DK_mm + 2, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 10))); + _mm_storeu_si128(DK_mm + 3, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 9))); + _mm_storeu_si128(DK_mm + 4, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 8))); + _mm_storeu_si128(DK_mm + 5, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 7))); + _mm_storeu_si128(DK_mm + 6, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 6))); + _mm_storeu_si128(DK_mm + 7, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 5))); + _mm_storeu_si128(DK_mm + 8, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 4))); + _mm_storeu_si128(DK_mm + 9, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 3))); + _mm_storeu_si128(DK_mm + 10, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 2))); + _mm_storeu_si128(DK_mm + 11, _mm_aesimc_si128(_mm_loadu_si128(EK_mm + 1))); + _mm_storeu_si128(DK_mm + 12, _mm_loadu_si128(EK_mm + 0)); + } + +/* +* AES-256 Encryption +*/ +BOTAN_FUNC_ISA("ssse3,aes") +void AES_256::aesni_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const + { + BOTAN_ASSERT(m_EK.empty() == false, "Key was set"); + + const __m128i* in_mm = reinterpret_cast<const __m128i*>(in); + __m128i* out_mm = reinterpret_cast<__m128i*>(out); + + const __m128i* key_mm = reinterpret_cast<const __m128i*>(m_EK.data()); + + const __m128i K0 = _mm_loadu_si128(key_mm); + const __m128i K1 = _mm_loadu_si128(key_mm + 1); + const __m128i K2 = _mm_loadu_si128(key_mm + 2); + const __m128i K3 = _mm_loadu_si128(key_mm + 3); + const __m128i K4 = _mm_loadu_si128(key_mm + 4); + const __m128i K5 = _mm_loadu_si128(key_mm + 5); + const __m128i K6 = _mm_loadu_si128(key_mm + 6); + const __m128i K7 = _mm_loadu_si128(key_mm + 7); + const __m128i K8 = _mm_loadu_si128(key_mm + 8); + const __m128i K9 = _mm_loadu_si128(key_mm + 9); + const __m128i K10 = _mm_loadu_si128(key_mm + 10); + const __m128i K11 = _mm_loadu_si128(key_mm + 11); + const __m128i K12 = _mm_loadu_si128(key_mm + 12); + const __m128i K13 = _mm_loadu_si128(key_mm + 13); + const __m128i K14 = _mm_loadu_si128(key_mm + 14); + + while(blocks >= 4) + { + __m128i B0 = _mm_loadu_si128(in_mm + 0); + __m128i B1 = _mm_loadu_si128(in_mm + 1); + __m128i B2 = _mm_loadu_si128(in_mm + 2); + __m128i B3 = _mm_loadu_si128(in_mm + 3); + + B0 = _mm_xor_si128(B0, K0); + B1 = _mm_xor_si128(B1, K0); + B2 = _mm_xor_si128(B2, K0); + B3 = _mm_xor_si128(B3, K0); + + AES_ENC_4_ROUNDS(K1); + AES_ENC_4_ROUNDS(K2); + AES_ENC_4_ROUNDS(K3); + AES_ENC_4_ROUNDS(K4); + AES_ENC_4_ROUNDS(K5); + AES_ENC_4_ROUNDS(K6); + AES_ENC_4_ROUNDS(K7); + AES_ENC_4_ROUNDS(K8); + AES_ENC_4_ROUNDS(K9); + AES_ENC_4_ROUNDS(K10); + AES_ENC_4_ROUNDS(K11); + AES_ENC_4_ROUNDS(K12); + AES_ENC_4_ROUNDS(K13); + AES_ENC_4_LAST_ROUNDS(K14); + + _mm_storeu_si128(out_mm + 0, B0); + _mm_storeu_si128(out_mm + 1, B1); + _mm_storeu_si128(out_mm + 2, B2); + _mm_storeu_si128(out_mm + 3, B3); + + blocks -= 4; + in_mm += 4; + out_mm += 4; + } + + for(size_t i = 0; i != blocks; ++i) + { + __m128i B = _mm_loadu_si128(in_mm + i); + + B = _mm_xor_si128(B, K0); + + B = _mm_aesenc_si128(B, K1); + B = _mm_aesenc_si128(B, K2); + B = _mm_aesenc_si128(B, K3); + B = _mm_aesenc_si128(B, K4); + B = _mm_aesenc_si128(B, K5); + B = _mm_aesenc_si128(B, K6); + B = _mm_aesenc_si128(B, K7); + B = _mm_aesenc_si128(B, K8); + B = _mm_aesenc_si128(B, K9); + B = _mm_aesenc_si128(B, K10); + B = _mm_aesenc_si128(B, K11); + B = _mm_aesenc_si128(B, K12); + B = _mm_aesenc_si128(B, K13); + B = _mm_aesenclast_si128(B, K14); + + _mm_storeu_si128(out_mm + i, B); + } + } + +/* +* AES-256 Decryption +*/ +BOTAN_FUNC_ISA("ssse3,aes") +void AES_256::aesni_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const + { + BOTAN_ASSERT(m_DK.empty() == false, "Key was set"); + + const __m128i* in_mm = reinterpret_cast<const __m128i*>(in); + __m128i* out_mm = reinterpret_cast<__m128i*>(out); + + const __m128i* key_mm = reinterpret_cast<const __m128i*>(m_DK.data()); + + const __m128i K0 = _mm_loadu_si128(key_mm); + const __m128i K1 = _mm_loadu_si128(key_mm + 1); + const __m128i K2 = _mm_loadu_si128(key_mm + 2); + const __m128i K3 = _mm_loadu_si128(key_mm + 3); + const __m128i K4 = _mm_loadu_si128(key_mm + 4); + const __m128i K5 = _mm_loadu_si128(key_mm + 5); + const __m128i K6 = _mm_loadu_si128(key_mm + 6); + const __m128i K7 = _mm_loadu_si128(key_mm + 7); + const __m128i K8 = _mm_loadu_si128(key_mm + 8); + const __m128i K9 = _mm_loadu_si128(key_mm + 9); + const __m128i K10 = _mm_loadu_si128(key_mm + 10); + const __m128i K11 = _mm_loadu_si128(key_mm + 11); + const __m128i K12 = _mm_loadu_si128(key_mm + 12); + const __m128i K13 = _mm_loadu_si128(key_mm + 13); + const __m128i K14 = _mm_loadu_si128(key_mm + 14); + + while(blocks >= 4) + { + __m128i B0 = _mm_loadu_si128(in_mm + 0); + __m128i B1 = _mm_loadu_si128(in_mm + 1); + __m128i B2 = _mm_loadu_si128(in_mm + 2); + __m128i B3 = _mm_loadu_si128(in_mm + 3); + + B0 = _mm_xor_si128(B0, K0); + B1 = _mm_xor_si128(B1, K0); + B2 = _mm_xor_si128(B2, K0); + B3 = _mm_xor_si128(B3, K0); + + AES_DEC_4_ROUNDS(K1); + AES_DEC_4_ROUNDS(K2); + AES_DEC_4_ROUNDS(K3); + AES_DEC_4_ROUNDS(K4); + AES_DEC_4_ROUNDS(K5); + AES_DEC_4_ROUNDS(K6); + AES_DEC_4_ROUNDS(K7); + AES_DEC_4_ROUNDS(K8); + AES_DEC_4_ROUNDS(K9); + AES_DEC_4_ROUNDS(K10); + AES_DEC_4_ROUNDS(K11); + AES_DEC_4_ROUNDS(K12); + AES_DEC_4_ROUNDS(K13); + AES_DEC_4_LAST_ROUNDS(K14); + + _mm_storeu_si128(out_mm + 0, B0); + _mm_storeu_si128(out_mm + 1, B1); + _mm_storeu_si128(out_mm + 2, B2); + _mm_storeu_si128(out_mm + 3, B3); + + blocks -= 4; + in_mm += 4; + out_mm += 4; + } + + for(size_t i = 0; i != blocks; ++i) + { + __m128i B = _mm_loadu_si128(in_mm + i); + + B = _mm_xor_si128(B, K0); + + B = _mm_aesdec_si128(B, K1); + B = _mm_aesdec_si128(B, K2); + B = _mm_aesdec_si128(B, K3); + B = _mm_aesdec_si128(B, K4); + B = _mm_aesdec_si128(B, K5); + B = _mm_aesdec_si128(B, K6); + B = _mm_aesdec_si128(B, K7); + B = _mm_aesdec_si128(B, K8); + B = _mm_aesdec_si128(B, K9); + B = _mm_aesdec_si128(B, K10); + B = _mm_aesdec_si128(B, K11); + B = _mm_aesdec_si128(B, K12); + B = _mm_aesdec_si128(B, K13); + B = _mm_aesdeclast_si128(B, K14); + + _mm_storeu_si128(out_mm + i, B); + } + } + +/* +* AES-256 Key Schedule +*/ +BOTAN_FUNC_ISA("ssse3,aes") +void AES_256::aesni_key_schedule(const uint8_t key[], size_t) + { + m_EK.resize(60); + m_DK.resize(60); + + const __m128i K0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key)); + const __m128i K1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(key + 16)); + + const __m128i K2 = aes_128_key_expansion(K0, _mm_aeskeygenassist_si128(K1, 0x01)); + const __m128i K3 = aes_256_key_expansion(K1, K2); + + const __m128i K4 = aes_128_key_expansion(K2, _mm_aeskeygenassist_si128(K3, 0x02)); + const __m128i K5 = aes_256_key_expansion(K3, K4); + + const __m128i K6 = aes_128_key_expansion(K4, _mm_aeskeygenassist_si128(K5, 0x04)); + const __m128i K7 = aes_256_key_expansion(K5, K6); + + const __m128i K8 = aes_128_key_expansion(K6, _mm_aeskeygenassist_si128(K7, 0x08)); + const __m128i K9 = aes_256_key_expansion(K7, K8); + + const __m128i K10 = aes_128_key_expansion(K8, _mm_aeskeygenassist_si128(K9, 0x10)); + const __m128i K11 = aes_256_key_expansion(K9, K10); + + const __m128i K12 = aes_128_key_expansion(K10, _mm_aeskeygenassist_si128(K11, 0x20)); + const __m128i K13 = aes_256_key_expansion(K11, K12); + + const __m128i K14 = aes_128_key_expansion(K12, _mm_aeskeygenassist_si128(K13, 0x40)); + + __m128i* EK_mm = reinterpret_cast<__m128i*>(m_EK.data()); + _mm_storeu_si128(EK_mm , K0); + _mm_storeu_si128(EK_mm + 1, K1); + _mm_storeu_si128(EK_mm + 2, K2); + _mm_storeu_si128(EK_mm + 3, K3); + _mm_storeu_si128(EK_mm + 4, K4); + _mm_storeu_si128(EK_mm + 5, K5); + _mm_storeu_si128(EK_mm + 6, K6); + _mm_storeu_si128(EK_mm + 7, K7); + _mm_storeu_si128(EK_mm + 8, K8); + _mm_storeu_si128(EK_mm + 9, K9); + _mm_storeu_si128(EK_mm + 10, K10); + _mm_storeu_si128(EK_mm + 11, K11); + _mm_storeu_si128(EK_mm + 12, K12); + _mm_storeu_si128(EK_mm + 13, K13); + _mm_storeu_si128(EK_mm + 14, K14); + + // Now generate decryption keys + __m128i* DK_mm = reinterpret_cast<__m128i*>(m_DK.data()); + _mm_storeu_si128(DK_mm , K14); + _mm_storeu_si128(DK_mm + 1, _mm_aesimc_si128(K13)); + _mm_storeu_si128(DK_mm + 2, _mm_aesimc_si128(K12)); + _mm_storeu_si128(DK_mm + 3, _mm_aesimc_si128(K11)); + _mm_storeu_si128(DK_mm + 4, _mm_aesimc_si128(K10)); + _mm_storeu_si128(DK_mm + 5, _mm_aesimc_si128(K9)); + _mm_storeu_si128(DK_mm + 6, _mm_aesimc_si128(K8)); + _mm_storeu_si128(DK_mm + 7, _mm_aesimc_si128(K7)); + _mm_storeu_si128(DK_mm + 8, _mm_aesimc_si128(K6)); + _mm_storeu_si128(DK_mm + 9, _mm_aesimc_si128(K5)); + _mm_storeu_si128(DK_mm + 10, _mm_aesimc_si128(K4)); + _mm_storeu_si128(DK_mm + 11, _mm_aesimc_si128(K3)); + _mm_storeu_si128(DK_mm + 12, _mm_aesimc_si128(K2)); + _mm_storeu_si128(DK_mm + 13, _mm_aesimc_si128(K1)); + _mm_storeu_si128(DK_mm + 14, K0); + } + +#undef AES_ENC_4_ROUNDS +#undef AES_ENC_4_LAST_ROUNDS +#undef AES_DEC_4_ROUNDS +#undef AES_DEC_4_LAST_ROUNDS + +} diff --git a/src/libs/3rdparty/botan/src/lib/block/aes/aes_ni/info.txt b/src/libs/3rdparty/botan/src/lib/block/aes/aes_ni/info.txt new file mode 100644 index 0000000000..d5d3593489 --- /dev/null +++ b/src/libs/3rdparty/botan/src/lib/block/aes/aes_ni/info.txt @@ -0,0 +1,7 @@ +<defines> +AES_NI -> 20131128 +</defines> + +load_on auto + +need_isa aesni diff --git a/src/libs/3rdparty/botan/src/lib/block/aes/aes_power8/aes_power8.cpp b/src/libs/3rdparty/botan/src/lib/block/aes/aes_power8/aes_power8.cpp new file mode 100644 index 0000000000..98520a13cf --- /dev/null +++ b/src/libs/3rdparty/botan/src/lib/block/aes/aes_power8/aes_power8.cpp @@ -0,0 +1,328 @@ +/* +* AES using POWER8 crypto extensions +* +* Contributed by Jeffrey Walton +* +* Further changes +* (C) 2018 Jack Lloyd +* +* Botan is released under the Simplified BSD License (see license.txt) +*/ + +#include <botan/aes.h> +#include <botan/cpuid.h> + +#include <altivec.h> +#undef vector +#undef bool + +namespace Botan { + +namespace { + +__vector unsigned long long LoadKey(const uint32_t* src) + { + __vector unsigned int vec = vec_vsx_ld(0, src); + + if(CPUID::is_little_endian()) + { + const __vector unsigned char mask = {12,13,14,15, 8,9,10,11, 4,5,6,7, 0,1,2,3}; + const __vector unsigned char zero = {0}; + return (__vector unsigned long long)vec_perm((__vector unsigned char)vec, zero, mask); + } + else + { + return (__vector unsigned long long)vec; + } + } + +__vector unsigned char Reverse8x16(const __vector unsigned char src) + { + if(CPUID::is_little_endian()) + { + const __vector unsigned char mask = {15,14,13,12, 11,10,9,8, 7,6,5,4, 3,2,1,0}; + const __vector unsigned char zero = {0}; + return vec_perm(src, zero, mask); + } + else + { + return src; + } + } + +__vector unsigned long long LoadBlock(const uint8_t* src) + { + return (__vector unsigned long long)Reverse8x16(vec_vsx_ld(0, src)); + } + +void StoreBlock(const __vector unsigned long long src, uint8_t* dest) + { + vec_vsx_st(Reverse8x16((__vector unsigned char)src), 0, dest); + } + +} + +BOTAN_FUNC_ISA("crypto") +void AES_128::power8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const + { + BOTAN_ASSERT(m_EK.empty() == false, "Key was set"); + + const __vector unsigned long long K0 = LoadKey(&m_EK[0]); + const __vector unsigned long long K1 = LoadKey(&m_EK[4]); + const __vector unsigned long long K2 = LoadKey(&m_EK[8]); + const __vector unsigned long long K3 = LoadKey(&m_EK[12]); + const __vector unsigned long long K4 = LoadKey(&m_EK[16]); + const __vector unsigned long long K5 = LoadKey(&m_EK[20]); + const __vector unsigned long long K6 = LoadKey(&m_EK[24]); + const __vector unsigned long long K7 = LoadKey(&m_EK[28]); + const __vector unsigned long long K8 = LoadKey(&m_EK[32]); + const __vector unsigned long long K9 = LoadKey(&m_EK[36]); + const __vector unsigned long long K10 = LoadBlock(m_ME.data()); + + for(size_t i = 0; i != blocks; ++i) + { + __vector unsigned long long B = LoadBlock(in); + + B = vec_xor(B, K0); + B = __builtin_crypto_vcipher(B, K1); + B = __builtin_crypto_vcipher(B, K2); + B = __builtin_crypto_vcipher(B, K3); + B = __builtin_crypto_vcipher(B, K4); + B = __builtin_crypto_vcipher(B, K5); + B = __builtin_crypto_vcipher(B, K6); + B = __builtin_crypto_vcipher(B, K7); + B = __builtin_crypto_vcipher(B, K8); + B = __builtin_crypto_vcipher(B, K9); + B = __builtin_crypto_vcipherlast(B, K10); + + StoreBlock(B, out); + + out += 16; + in += 16; + } + } + +BOTAN_FUNC_ISA("crypto") +void AES_128::power8_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const + { + BOTAN_ASSERT(m_EK.empty() == false, "Key was set"); + + const __vector unsigned long long K0 = LoadBlock(m_ME.data()); + const __vector unsigned long long K1 = LoadKey(&m_EK[36]); + const __vector unsigned long long K2 = LoadKey(&m_EK[32]); + const __vector unsigned long long K3 = LoadKey(&m_EK[28]); + const __vector unsigned long long K4 = LoadKey(&m_EK[24]); + const __vector unsigned long long K5 = LoadKey(&m_EK[20]); + const __vector unsigned long long K6 = LoadKey(&m_EK[16]); + const __vector unsigned long long K7 = LoadKey(&m_EK[12]); + const __vector unsigned long long K8 = LoadKey(&m_EK[8]); + const __vector unsigned long long K9 = LoadKey(&m_EK[4]); + const __vector unsigned long long K10 = LoadKey(&m_EK[0]); + + for(size_t i = 0; i != blocks; ++i) + { + __vector unsigned long long B = LoadBlock(in); + + B = vec_xor(B, K0); + B = __builtin_crypto_vncipher(B, K1); + B = __builtin_crypto_vncipher(B, K2); + B = __builtin_crypto_vncipher(B, K3); + B = __builtin_crypto_vncipher(B, K4); + B = __builtin_crypto_vncipher(B, K5); + B = __builtin_crypto_vncipher(B, K6); + B = __builtin_crypto_vncipher(B, K7); + B = __builtin_crypto_vncipher(B, K8); + B = __builtin_crypto_vncipher(B, K9); + B = __builtin_crypto_vncipherlast(B, K10); + + StoreBlock(B, out); + + out += 16; + in += 16; + } + } + +BOTAN_FUNC_ISA("crypto") +void AES_192::power8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const + { + BOTAN_ASSERT(m_EK.empty() == false, "Key was set"); + + const __vector unsigned long long K0 = LoadKey(&m_EK[0]); + const __vector unsigned long long K1 = LoadKey(&m_EK[4]); + const __vector unsigned long long K2 = LoadKey(&m_EK[8]); + const __vector unsigned long long K3 = LoadKey(&m_EK[12]); + const __vector unsigned long long K4 = LoadKey(&m_EK[16]); + const __vector unsigned long long K5 = LoadKey(&m_EK[20]); + const __vector unsigned long long K6 = LoadKey(&m_EK[24]); + const __vector unsigned long long K7 = LoadKey(&m_EK[28]); + const __vector unsigned long long K8 = LoadKey(&m_EK[32]); + const __vector unsigned long long K9 = LoadKey(&m_EK[36]); + const __vector unsigned long long K10 = LoadKey(&m_EK[40]); + const __vector unsigned long long K11 = LoadKey(&m_EK[44]); + const __vector unsigned long long K12 = LoadBlock(m_ME.data()); + + for(size_t i = 0; i != blocks; ++i) + { + __vector unsigned long long B = LoadBlock(in); + + B = vec_xor(B, K0); + B = __builtin_crypto_vcipher(B, K1); + B = __builtin_crypto_vcipher(B, K2); + B = __builtin_crypto_vcipher(B, K3); + B = __builtin_crypto_vcipher(B, K4); + B = __builtin_crypto_vcipher(B, K5); + B = __builtin_crypto_vcipher(B, K6); + B = __builtin_crypto_vcipher(B, K7); + B = __builtin_crypto_vcipher(B, K8); + B = __builtin_crypto_vcipher(B, K9); + B = __builtin_crypto_vcipher(B, K10); + B = __builtin_crypto_vcipher(B, K11); + B = __builtin_crypto_vcipherlast(B, K12); + + StoreBlock(B, out); + + out += 16; + in += 16; + } + } + +BOTAN_FUNC_ISA("crypto") +void AES_192::power8_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const + { + BOTAN_ASSERT(m_EK.empty() == false, "Key was set"); + + const __vector unsigned long long K0 = LoadBlock(m_ME.data()); + const __vector unsigned long long K1 = LoadKey(&m_EK[44]); + const __vector unsigned long long K2 = LoadKey(&m_EK[40]); + const __vector unsigned long long K3 = LoadKey(&m_EK[36]); + const __vector unsigned long long K4 = LoadKey(&m_EK[32]); + const __vector unsigned long long K5 = LoadKey(&m_EK[28]); + const __vector unsigned long long K6 = LoadKey(&m_EK[24]); + const __vector unsigned long long K7 = LoadKey(&m_EK[20]); + const __vector unsigned long long K8 = LoadKey(&m_EK[16]); + const __vector unsigned long long K9 = LoadKey(&m_EK[12]); + const __vector unsigned long long K10 = LoadKey(&m_EK[8]); + const __vector unsigned long long K11 = LoadKey(&m_EK[4]); + const __vector unsigned long long K12 = LoadKey(&m_EK[0]); + + for(size_t i = 0; i != blocks; ++i) + { + __vector unsigned long long B = LoadBlock(in); + + B = vec_xor(B, K0); + B = __builtin_crypto_vncipher(B, K1); + B = __builtin_crypto_vncipher(B, K2); + B = __builtin_crypto_vncipher(B, K3); + B = __builtin_crypto_vncipher(B, K4); + B = __builtin_crypto_vncipher(B, K5); + B = __builtin_crypto_vncipher(B, K6); + B = __builtin_crypto_vncipher(B, K7); + B = __builtin_crypto_vncipher(B, K8); + B = __builtin_crypto_vncipher(B, K9); + B = __builtin_crypto_vncipher(B, K10); + B = __builtin_crypto_vncipher(B, K11); + B = __builtin_crypto_vncipherlast(B, K12); + + StoreBlock(B, out); + + out += 16; + in += 16; + } + } + +BOTAN_FUNC_ISA("crypto") +void AES_256::power8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const + { + BOTAN_ASSERT(m_EK.empty() == false, "Key was set"); + const __vector unsigned long long K0 = LoadKey(&m_EK[0]); + const __vector unsigned long long K1 = LoadKey(&m_EK[4]); + const __vector unsigned long long K2 = LoadKey(&m_EK[8]); + const __vector unsigned long long K3 = LoadKey(&m_EK[12]); + const __vector unsigned long long K4 = LoadKey(&m_EK[16]); + const __vector unsigned long long K5 = LoadKey(&m_EK[20]); + const __vector unsigned long long K6 = LoadKey(&m_EK[24]); + const __vector unsigned long long K7 = LoadKey(&m_EK[28]); + const __vector unsigned long long K8 = LoadKey(&m_EK[32]); + const __vector unsigned long long K9 = LoadKey(&m_EK[36]); + const __vector unsigned long long K10 = LoadKey(&m_EK[40]); + const __vector unsigned long long K11 = LoadKey(&m_EK[44]); + const __vector unsigned long long K12 = LoadKey(&m_EK[48]); + const __vector unsigned long long K13 = LoadKey(&m_EK[52]); + const __vector unsigned long long K14 = LoadBlock(m_ME.data()); + + for(size_t i = 0; i != blocks; ++i) + { + __vector unsigned long long B = LoadBlock(in); + + B = vec_xor(B, K0); + B = __builtin_crypto_vcipher(B, K1); + B = __builtin_crypto_vcipher(B, K2); + B = __builtin_crypto_vcipher(B, K3); + B = __builtin_crypto_vcipher(B, K4); + B = __builtin_crypto_vcipher(B, K5); + B = __builtin_crypto_vcipher(B, K6); + B = __builtin_crypto_vcipher(B, K7); + B = __builtin_crypto_vcipher(B, K8); + B = __builtin_crypto_vcipher(B, K9); + B = __builtin_crypto_vcipher(B, K10); + B = __builtin_crypto_vcipher(B, K11); + B = __builtin_crypto_vcipher(B, K12); + B = __builtin_crypto_vcipher(B, K13); + B = __builtin_crypto_vcipherlast(B, K14); + + StoreBlock(B, out); + + out += 16; + in += 16; + } + } + +BOTAN_FUNC_ISA("crypto") +void AES_256::power8_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const + { + BOTAN_ASSERT(m_EK.empty() == false, "Key was set"); + + const __vector unsigned long long K0 = LoadBlock(m_ME.data()); + const __vector unsigned long long K1 = LoadKey(&m_EK[52]); + const __vector unsigned long long K2 = LoadKey(&m_EK[48]); + const __vector unsigned long long K3 = LoadKey(&m_EK[44]); + const __vector unsigned long long K4 = LoadKey(&m_EK[40]); + const __vector unsigned long long K5 = LoadKey(&m_EK[36]); + const __vector unsigned long long K6 = LoadKey(&m_EK[32]); + const __vector unsigned long long K7 = LoadKey(&m_EK[28]); + const __vector unsigned long long K8 = LoadKey(&m_EK[24]); + const __vector unsigned long long K9 = LoadKey(&m_EK[20]); + const __vector unsigned long long K10 = LoadKey(&m_EK[16]); + const __vector unsigned long long K11 = LoadKey(&m_EK[12]); + const __vector unsigned long long K12 = LoadKey(&m_EK[8]); + const __vector unsigned long long K13 = LoadKey(&m_EK[4]); + const __vector unsigned long long K14 = LoadKey(&m_EK[0]); + + for(size_t i = 0; i != blocks; ++i) + { + __vector unsigned long long B = LoadBlock(in); + + B = vec_xor(B, K0); + B = __builtin_crypto_vncipher(B, K1); + B = __builtin_crypto_vncipher(B, K2); + B = __builtin_crypto_vncipher(B, K3); + B = __builtin_crypto_vncipher(B, K4); + B = __builtin_crypto_vncipher(B, K5); + B = __builtin_crypto_vncipher(B, K6); + B = __builtin_crypto_vncipher(B, K7); + B = __builtin_crypto_vncipher(B, K8); + B = __builtin_crypto_vncipher(B, K9); + B = __builtin_crypto_vncipher(B, K10); + B = __builtin_crypto_vncipher(B, K11); + B = __builtin_crypto_vncipher(B, K12); + B = __builtin_crypto_vncipher(B, K13); + B = __builtin_crypto_vncipherlast(B, K14); + + StoreBlock(B, out); + + out += 16; + in += 16; + } + } + +} diff --git a/src/libs/3rdparty/botan/src/lib/block/aes/aes_power8/info.txt b/src/libs/3rdparty/botan/src/lib/block/aes/aes_power8/info.txt new file mode 100644 index 0000000000..6aa52d25a0 --- /dev/null +++ b/src/libs/3rdparty/botan/src/lib/block/aes/aes_power8/info.txt @@ -0,0 +1,9 @@ +<defines> +AES_POWER8 -> 20180223 +</defines> + +<arch> +ppc64 +</arch> + +need_isa ppccrypto diff --git a/src/libs/3rdparty/botan/src/lib/block/aes/aes_ssse3/aes_ssse3.cpp b/src/libs/3rdparty/botan/src/lib/block/aes/aes_ssse3/aes_ssse3.cpp new file mode 100644 index 0000000000..47d70d0b8b --- /dev/null +++ b/src/libs/3rdparty/botan/src/lib/block/aes/aes_ssse3/aes_ssse3.cpp @@ -0,0 +1,637 @@ +/* +* AES using SSSE3 +* (C) 2010,2016 Jack Lloyd +* +* This is more or less a direct translation of public domain x86-64 +* assembly written by Mike Hamburg, described in "Accelerating AES +* with Vector Permute Instructions" (CHES 2009). His original code is +* available at https://crypto.stanford.edu/vpaes/ +* +* Botan is released under the Simplified BSD License (see license.txt) +*/ + +#include <botan/aes.h> +#include <botan/internal/ct_utils.h> +#include <tmmintrin.h> + +namespace Botan { + +namespace { + +const __m128i low_nibs = _mm_set1_epi8(0x0F); + +const __m128i k_ipt1 = _mm_set_epi32( + 0xCABAE090, 0x52227808, 0xC2B2E898, 0x5A2A7000); +const __m128i k_ipt2 = _mm_set_epi32( + 0xCD80B1FC, 0xB0FDCC81, 0x4C01307D, 0x317C4D00); + +const __m128i k_inv1 = _mm_set_epi32( + 0x04070309, 0x0A0B0C02, 0x0E05060F, 0x0D080180); +const __m128i k_inv2 = _mm_set_epi32( + 0x030D0E0C, 0x02050809, 0x01040A06, 0x0F0B0780); + +const __m128i sb1u = _mm_set_epi32( + 0xA5DF7A6E, 0x142AF544, 0xB19BE18F, 0xCB503E00); +const __m128i sb1t = _mm_set_epi32( + 0x3BF7CCC1, 0x0D2ED9EF, 0x3618D415, 0xFAE22300); + +const __m128i mc_forward[4] = { + _mm_set_epi32(0x0C0F0E0D, 0x080B0A09, 0x04070605, 0x00030201), + _mm_set_epi32(0x00030201, 0x0C0F0E0D, 0x080B0A09, 0x04070605), + _mm_set_epi32(0x04070605, 0x00030201, 0x0C0F0E0D, 0x080B0A09), + _mm_set_epi32(0x080B0A09, 0x04070605, 0x00030201, 0x0C0F0E0D) +}; + +const __m128i sr[4] = { + _mm_set_epi32(0x0F0E0D0C, 0x0B0A0908, 0x07060504, 0x03020100), + _mm_set_epi32(0x0B06010C, 0x07020D08, 0x030E0904, 0x0F0A0500), + _mm_set_epi32(0x070E050C, 0x030A0108, 0x0F060D04, 0x0B020900), + _mm_set_epi32(0x0306090C, 0x0F020508, 0x0B0E0104, 0x070A0D00), +}; + +#define mm_xor3(x, y, z) _mm_xor_si128(x, _mm_xor_si128(y, z)) + +BOTAN_FUNC_ISA("ssse3") +__m128i aes_schedule_transform(__m128i input, + __m128i table_1, + __m128i table_2) + { + __m128i i_1 = _mm_and_si128(low_nibs, input); + __m128i i_2 = _mm_srli_epi32(_mm_andnot_si128(low_nibs, input), 4); + + return _mm_xor_si128( + _mm_shuffle_epi8(table_1, i_1), + _mm_shuffle_epi8(table_2, i_2)); + } + +BOTAN_FUNC_ISA("ssse3") +__m128i aes_schedule_mangle(__m128i k, uint8_t round_no) + { + __m128i t = _mm_shuffle_epi8(_mm_xor_si128(k, _mm_set1_epi8(0x5B)), + mc_forward[0]); + + __m128i t2 = t; + + t = _mm_shuffle_epi8(t, mc_forward[0]); + + t2 = mm_xor3(t2, t, _mm_shuffle_epi8(t, mc_forward[0])); + + return _mm_shuffle_epi8(t2, sr[round_no % 4]); + } + +BOTAN_FUNC_ISA("ssse3") +__m128i aes_schedule_192_smear(__m128i x, __m128i y) + { + return mm_xor3(y, + _mm_shuffle_epi32(x, 0xFE), + _mm_shuffle_epi32(y, 0x80)); + } + +BOTAN_FUNC_ISA("ssse3") +__m128i aes_schedule_mangle_dec(__m128i k, uint8_t round_no) + { + const __m128i dsk[8] = { + _mm_set_epi32(0x4AED9334, 0x82255BFC, 0xB6116FC8, 0x7ED9A700), + _mm_set_epi32(0x8BB89FAC, 0xE9DAFDCE, 0x45765162, 0x27143300), + _mm_set_epi32(0x4622EE8A, 0xADC90561, 0x27438FEB, 0xCCA86400), + _mm_set_epi32(0x73AEE13C, 0xBD602FF2, 0x815C13CE, 0x4F92DD00), + _mm_set_epi32(0xF83F3EF9, 0xFA3D3CFB, 0x03C4C502, 0x01C6C700), + _mm_set_epi32(0xA5526A9D, 0x7384BC4B, 0xEE1921D6, 0x38CFF700), + _mm_set_epi32(0xA080D3F3, 0x10306343, 0xE3C390B0, 0x53732000), + _mm_set_epi32(0x2F45AEC4, 0x8CE60D67, 0xA0CA214B, 0x036982E8) + }; + + __m128i t = aes_schedule_transform(k, dsk[0], dsk[1]); + __m128i output = _mm_shuffle_epi8(t, mc_forward[0]); + + t = aes_schedule_transform(t, dsk[2], dsk[3]); + output = _mm_shuffle_epi8(_mm_xor_si128(t, output), mc_forward[0]); + + t = aes_schedule_transform(t, dsk[4], dsk[5]); + output = _mm_shuffle_epi8(_mm_xor_si128(t, output), mc_forward[0]); + + t = aes_schedule_transform(t, dsk[6], dsk[7]); + output = _mm_shuffle_epi8(_mm_xor_si128(t, output), mc_forward[0]); + + return _mm_shuffle_epi8(output, sr[round_no % 4]); + } + +BOTAN_FUNC_ISA("ssse3") +__m128i aes_schedule_mangle_last(__m128i k, uint8_t round_no) + { + const __m128i out_tr1 = _mm_set_epi32( + 0xF7974121, 0xDEBE6808, 0xFF9F4929, 0xD6B66000); + const __m128i out_tr2 = _mm_set_epi32( + 0xE10D5DB1, 0xB05C0CE0, 0x01EDBD51, 0x50BCEC00); + + k = _mm_shuffle_epi8(k, sr[round_no % 4]); + k = _mm_xor_si128(k, _mm_set1_epi8(0x5B)); + return aes_schedule_transform(k, out_tr1, out_tr2); + } + +BOTAN_FUNC_ISA("ssse3") +__m128i aes_schedule_mangle_last_dec(__m128i k) + { + const __m128i deskew1 = _mm_set_epi32( + 0x1DFEB95A, 0x5DBEF91A, 0x07E4A340, 0x47A4E300); + const __m128i deskew2 = _mm_set_epi32( + 0x2841C2AB, 0xF49D1E77, 0x5F36B5DC, 0x83EA6900); + + k = _mm_xor_si128(k, _mm_set1_epi8(0x5B)); + return aes_schedule_transform(k, deskew1, deskew2); + } + +BOTAN_FUNC_ISA("ssse3") +__m128i aes_schedule_round(__m128i* rcon, __m128i input1, __m128i input2) + { + if(rcon) + { + input2 = _mm_xor_si128(_mm_alignr_epi8(_mm_setzero_si128(), *rcon, 15), + input2); + + *rcon = _mm_alignr_epi8(*rcon, *rcon, 15); // next rcon + + input1 = _mm_shuffle_epi32(input1, 0xFF); // rotate + input1 = _mm_alignr_epi8(input1, input1, 1); + } + + __m128i smeared = _mm_xor_si128(input2, _mm_slli_si128(input2, 4)); + smeared = mm_xor3(smeared, _mm_slli_si128(smeared, 8), _mm_set1_epi8(0x5B)); + + __m128i t = _mm_srli_epi32(_mm_andnot_si128(low_nibs, input1), 4); + + input1 = _mm_and_si128(low_nibs, input1); + + __m128i t2 = _mm_shuffle_epi8(k_inv2, input1); + + input1 = _mm_xor_si128(input1, t); + + __m128i t3 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, t)); + __m128i t4 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, input1)); + + __m128i t5 = _mm_xor_si128(input1, _mm_shuffle_epi8(k_inv1, t3)); + __m128i t6 = _mm_xor_si128(t, _mm_shuffle_epi8(k_inv1, t4)); + + return mm_xor3(_mm_shuffle_epi8(sb1u, t5), + _mm_shuffle_epi8(sb1t, t6), + smeared); + } + +BOTAN_FUNC_ISA("ssse3") +__m128i aes_ssse3_encrypt(__m128i B, const __m128i* keys, size_t rounds) + { + const __m128i sb2u = _mm_set_epi32( + 0x5EB7E955, 0xBC982FCD, 0xE27A93C6, 0x0B712400); + const __m128i sb2t = _mm_set_epi32( + 0xC2A163C8, 0xAB82234A, 0x69EB8840, 0x0AE12900); + + const __m128i sbou = _mm_set_epi32( + 0x15AABF7A, 0xC502A878, 0xD0D26D17, 0x6FBDC700); + const __m128i sbot = _mm_set_epi32( + 0x8E1E90D1, 0x412B35FA, 0xCFE474A5, 0x5FBB6A00); + + const __m128i mc_backward[4] = { + _mm_set_epi32(0x0E0D0C0F, 0x0A09080B, 0x06050407, 0x02010003), + _mm_set_epi32(0x0A09080B, 0x06050407, 0x02010003, 0x0E0D0C0F), + _mm_set_epi32(0x06050407, 0x02010003, 0x0E0D0C0F, 0x0A09080B), + _mm_set_epi32(0x02010003, 0x0E0D0C0F, 0x0A09080B, 0x06050407), + }; + + B = mm_xor3(_mm_shuffle_epi8(k_ipt1, _mm_and_si128(low_nibs, B)), + _mm_shuffle_epi8(k_ipt2, + _mm_srli_epi32( + _mm_andnot_si128(low_nibs, B), + 4)), + _mm_loadu_si128(keys)); + + for(size_t r = 1; ; ++r) + { + const __m128i K = _mm_loadu_si128(keys + r); + + __m128i t = _mm_srli_epi32(_mm_andnot_si128(low_nibs, B), 4); + + B = _mm_and_si128(low_nibs, B); + + __m128i t2 = _mm_shuffle_epi8(k_inv2, B); + + B = _mm_xor_si128(B, t); + + __m128i t3 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, t)); + __m128i t4 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, B)); + + __m128i t5 = _mm_xor_si128(B, _mm_shuffle_epi8(k_inv1, t3)); + __m128i t6 = _mm_xor_si128(t, _mm_shuffle_epi8(k_inv1, t4)); + + if(r == rounds) + { + B = _mm_shuffle_epi8( + mm_xor3(_mm_shuffle_epi8(sbou, t5), + _mm_shuffle_epi8(sbot, t6), + K), + sr[r % 4]); + + return B; + } + + __m128i t7 = mm_xor3(_mm_shuffle_epi8(sb1t, t6), + _mm_shuffle_epi8(sb1u, t5), + K); + + __m128i t8 = mm_xor3(_mm_shuffle_epi8(sb2t, t6), + _mm_shuffle_epi8(sb2u, t5), + _mm_shuffle_epi8(t7, mc_forward[r % 4])); + + B = mm_xor3(_mm_shuffle_epi8(t8, mc_forward[r % 4]), + _mm_shuffle_epi8(t7, mc_backward[r % 4]), + t8); + } + } + +BOTAN_FUNC_ISA("ssse3") +__m128i aes_ssse3_decrypt(__m128i B, const __m128i* keys, size_t rounds) + { + const __m128i k_dipt1 = _mm_set_epi32( + 0x154A411E, 0x114E451A, 0x0F505B04, 0x0B545F00); + const __m128i k_dipt2 = _mm_set_epi32( + 0x12771772, 0xF491F194, 0x86E383E6, 0x60056500); + + const __m128i sb9u = _mm_set_epi32( + 0xCAD51F50, 0x4F994CC9, 0x851C0353, 0x9A86D600); + const __m128i sb9t = _mm_set_epi32( + 0x725E2C9E, 0xB2FBA565, 0xC03B1789, 0xECD74900); + + const __m128i sbeu = _mm_set_epi32( + 0x22426004, 0x64B4F6B0, 0x46F29296, 0x26D4D000); + const __m128i sbet = _mm_set_epi32( + 0x9467F36B, 0x98593E32, 0x0C55A6CD, 0xFFAAC100); + + const __m128i sbdu = _mm_set_epi32( + 0xF56E9B13, 0x882A4439, 0x7D57CCDF, 0xE6B1A200); + const __m128i sbdt = _mm_set_epi32( + 0x2931180D, 0x15DEEFD3, 0x3CE2FAF7, 0x24C6CB00); + + const __m128i sbbu = _mm_set_epi32( + 0x602646F6, 0xB0F2D404, 0xD0226492, 0x96B44200); + const __m128i sbbt = _mm_set_epi32( + 0xF3FF0C3E, 0x3255AA6B, 0xC19498A6, 0xCD596700); + + __m128i mc = mc_forward[3]; + + __m128i t = + _mm_shuffle_epi8(k_dipt2, + _mm_srli_epi32( + _mm_andnot_si128(low_nibs, B), + 4)); + + B = mm_xor3(t, _mm_loadu_si128(keys), + _mm_shuffle_epi8(k_dipt1, _mm_and_si128(B, low_nibs))); + + for(size_t r = 1; ; ++r) + { + const __m128i K = _mm_loadu_si128(keys + r); + + t = _mm_srli_epi32(_mm_andnot_si128(low_nibs, B), 4); + + B = _mm_and_si128(low_nibs, B); + + __m128i t2 = _mm_shuffle_epi8(k_inv2, B); + + B = _mm_xor_si128(B, t); + + __m128i t3 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, t)); + __m128i t4 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, B)); + __m128i t5 = _mm_xor_si128(B, _mm_shuffle_epi8(k_inv1, t3)); + __m128i t6 = _mm_xor_si128(t, _mm_shuffle_epi8(k_inv1, t4)); + + if(r == rounds) + { + const __m128i sbou = _mm_set_epi32( + 0xC7AA6DB9, 0xD4943E2D, 0x1387EA53, 0x7EF94000); + const __m128i sbot = _mm_set_epi32( + 0xCA4B8159, 0xD8C58E9C, 0x12D7560F, 0x93441D00); + + __m128i x = _mm_shuffle_epi8(sbou, t5); + __m128i y = _mm_shuffle_epi8(sbot, t6); + x = _mm_xor_si128(x, K); + x = _mm_xor_si128(x, y); + + const uint32_t which_sr = ((((rounds - 1) << 4) ^ 48) & 48) / 16; + return _mm_shuffle_epi8(x, sr[which_sr]); + } + + __m128i t8 = _mm_xor_si128(_mm_shuffle_epi8(sb9t, t6), + _mm_xor_si128(_mm_shuffle_epi8(sb9u, t5), K)); + + __m128i t9 = mm_xor3(_mm_shuffle_epi8(t8, mc), + _mm_shuffle_epi8(sbdu, t5), + _mm_shuffle_epi8(sbdt, t6)); + + __m128i t12 = _mm_xor_si128( + _mm_xor_si128( + _mm_shuffle_epi8(t9, mc), + _mm_shuffle_epi8(sbbu, t5)), + _mm_shuffle_epi8(sbbt, t6)); + + B = _mm_xor_si128(_mm_xor_si128(_mm_shuffle_epi8(t12, mc), + _mm_shuffle_epi8(sbeu, t5)), + _mm_shuffle_epi8(sbet, t6)); + + mc = _mm_alignr_epi8(mc, mc, 12); + } + } + +} + +/* +* AES-128 Encryption +*/ +BOTAN_FUNC_ISA("ssse3") +void AES_128::ssse3_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const + { + const __m128i* in_mm = reinterpret_cast<const __m128i*>(in); + __m128i* out_mm = reinterpret_cast<__m128i*>(out); + + const __m128i* keys = reinterpret_cast<const __m128i*>(m_EK.data()); + + CT::poison(in, blocks * block_size()); + + BOTAN_PARALLEL_FOR(size_t i = 0; i < blocks; ++i) + { + __m128i B = _mm_loadu_si128(in_mm + i); + _mm_storeu_si128(out_mm + i, aes_ssse3_encrypt(B, keys, 10)); + } + + CT::unpoison(in, blocks * block_size()); + CT::unpoison(out, blocks * block_size()); + } + +/* +* AES-128 Decryption +*/ +BOTAN_FUNC_ISA("ssse3") +void AES_128::ssse3_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const + { + const __m128i* in_mm = reinterpret_cast<const __m128i*>(in); + __m128i* out_mm = reinterpret_cast<__m128i*>(out); + + const __m128i* keys = reinterpret_cast<const __m128i*>(m_DK.data()); + + CT::poison(in, blocks * block_size()); + + BOTAN_PARALLEL_FOR(size_t i = 0; i < blocks; ++i) + { + __m128i B = _mm_loadu_si128(in_mm + i); + _mm_storeu_si128(out_mm + i, aes_ssse3_decrypt(B, keys, 10)); + } + + CT::unpoison(in, blocks * block_size()); + CT::unpoison(out, blocks * block_size()); + } + +/* +* AES-128 Key Schedule +*/ +BOTAN_FUNC_ISA("ssse3") +void AES_128::ssse3_key_schedule(const uint8_t keyb[], size_t) + { + __m128i rcon = _mm_set_epi32(0x702A9808, 0x4D7C7D81, + 0x1F8391B9, 0xAF9DEEB6); + + __m128i key = _mm_loadu_si128(reinterpret_cast<const __m128i*>(keyb)); + + m_EK.resize(11*4); + m_DK.resize(11*4); + + __m128i* EK_mm = reinterpret_cast<__m128i*>(m_EK.data()); + __m128i* DK_mm = reinterpret_cast<__m128i*>(m_DK.data()); + + _mm_storeu_si128(DK_mm + 10, _mm_shuffle_epi8(key, sr[2])); + + key = aes_schedule_transform(key, k_ipt1, k_ipt2); + + _mm_storeu_si128(EK_mm, key); + + for(size_t i = 1; i != 10; ++i) + { + key = aes_schedule_round(&rcon, key, key); + + _mm_storeu_si128(EK_mm + i, + aes_schedule_mangle(key, (12-i) % 4)); + + _mm_storeu_si128(DK_mm + (10-i), + aes_schedule_mangle_dec(key, (10-i) % 4)); + } + + key = aes_schedule_round(&rcon, key, key); + _mm_storeu_si128(EK_mm + 10, aes_schedule_mangle_last(key, 2)); + _mm_storeu_si128(DK_mm, aes_schedule_mangle_last_dec(key)); + } + +/* +* AES-192 Encryption +*/ +BOTAN_FUNC_ISA("ssse3") +void AES_192::ssse3_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const + { + const __m128i* in_mm = reinterpret_cast<const __m128i*>(in); + __m128i* out_mm = reinterpret_cast<__m128i*>(out); + + const __m128i* keys = reinterpret_cast<const __m128i*>(m_EK.data()); + + CT::poison(in, blocks * block_size()); + + for(size_t i = 0; i != blocks; ++i) + { + __m128i B = _mm_loadu_si128(in_mm + i); + _mm_storeu_si128(out_mm + i, aes_ssse3_encrypt(B, keys, 12)); + } + + CT::unpoison(in, blocks * block_size()); + CT::unpoison(out, blocks * block_size()); + } + +/* +* AES-192 Decryption +*/ +BOTAN_FUNC_ISA("ssse3") +void AES_192::ssse3_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const + { + const __m128i* in_mm = reinterpret_cast<const __m128i*>(in); + __m128i* out_mm = reinterpret_cast<__m128i*>(out); + + const __m128i* keys = reinterpret_cast<const __m128i*>(m_DK.data()); + + CT::poison(in, blocks * block_size()); + + for(size_t i = 0; i != blocks; ++i) + { + __m128i B = _mm_loadu_si128(in_mm + i); + _mm_storeu_si128(out_mm + i, aes_ssse3_decrypt(B, keys, 12)); + } + + CT::unpoison(in, blocks * block_size()); + CT::unpoison(out, blocks * block_size()); + } + +/* +* AES-192 Key Schedule +*/ +BOTAN_FUNC_ISA("ssse3") +void AES_192::ssse3_key_schedule(const uint8_t keyb[], size_t) + { + __m128i rcon = _mm_set_epi32(0x702A9808, 0x4D7C7D81, + 0x1F8391B9, 0xAF9DEEB6); + + m_EK.resize(13*4); + m_DK.resize(13*4); + + __m128i* EK_mm = reinterpret_cast<__m128i*>(m_EK.data()); + __m128i* DK_mm = reinterpret_cast<__m128i*>(m_DK.data()); + + __m128i key1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(keyb)); + __m128i key2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>((keyb + 8))); + + _mm_storeu_si128(DK_mm + 12, _mm_shuffle_epi8(key1, sr[0])); + + key1 = aes_schedule_transform(key1, k_ipt1, k_ipt2); + key2 = aes_schedule_transform(key2, k_ipt1, k_ipt2); + + _mm_storeu_si128(EK_mm + 0, key1); + + // key2 with 8 high bytes masked off + __m128i t = _mm_slli_si128(_mm_srli_si128(key2, 8), 8); + + for(size_t i = 0; i != 4; ++i) + { + key2 = aes_schedule_round(&rcon, key2, key1); + + _mm_storeu_si128(EK_mm + 3*i+1, + aes_schedule_mangle(_mm_alignr_epi8(key2, t, 8), (i+3)%4)); + _mm_storeu_si128(DK_mm + 11-3*i, + aes_schedule_mangle_dec(_mm_alignr_epi8(key2, t, 8), (i+3)%4)); + + t = aes_schedule_192_smear(key2, t); + + _mm_storeu_si128(EK_mm + 3*i+2, + aes_schedule_mangle(t, (i+2)%4)); + _mm_storeu_si128(DK_mm + 10-3*i, + aes_schedule_mangle_dec(t, (i+2)%4)); + + key2 = aes_schedule_round(&rcon, t, key2); + + if(i == 3) + { + _mm_storeu_si128(EK_mm + 3*i+3, + aes_schedule_mangle_last(key2, (i+1)%4)); + _mm_storeu_si128(DK_mm + 9-3*i, + aes_schedule_mangle_last_dec(key2)); + } + else + { + _mm_storeu_si128(EK_mm + 3*i+3, + aes_schedule_mangle(key2, (i+1)%4)); + _mm_storeu_si128(DK_mm + 9-3*i, + aes_schedule_mangle_dec(key2, (i+1)%4)); + } + + key1 = key2; + key2 = aes_schedule_192_smear(key2, + _mm_slli_si128(_mm_srli_si128(t, 8), 8)); + t = _mm_slli_si128(_mm_srli_si128(key2, 8), 8); + } + } + +/* +* AES-256 Encryption +*/ +BOTAN_FUNC_ISA("ssse3") +void AES_256::ssse3_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const + { + const __m128i* in_mm = reinterpret_cast<const __m128i*>(in); + __m128i* out_mm = reinterpret_cast<__m128i*>(out); + + const __m128i* keys = reinterpret_cast<const __m128i*>(m_EK.data()); + + CT::poison(in, blocks * block_size()); + + for(size_t i = 0; i != blocks; ++i) + { + __m128i B = _mm_loadu_si128(in_mm + i); + _mm_storeu_si128(out_mm + i, aes_ssse3_encrypt(B, keys, 14)); + } + + CT::unpoison(in, blocks * block_size()); + CT::unpoison(out, blocks * block_size()); + } + +/* +* AES-256 Decryption +*/ +BOTAN_FUNC_ISA("ssse3") +void AES_256::ssse3_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const + { + const __m128i* in_mm = reinterpret_cast<const __m128i*>(in); + __m128i* out_mm = reinterpret_cast<__m128i*>(out); + + const __m128i* keys = reinterpret_cast<const __m128i*>(m_DK.data()); + + CT::poison(in, blocks * block_size()); + + for(size_t i = 0; i != blocks; ++i) + { + __m128i B = _mm_loadu_si128(in_mm + i); + _mm_storeu_si128(out_mm + i, aes_ssse3_decrypt(B, keys, 14)); + } + + CT::unpoison(in, blocks * block_size()); + CT::unpoison(out, blocks * block_size()); + } + +/* +* AES-256 Key Schedule +*/ +BOTAN_FUNC_ISA("ssse3") +void AES_256::ssse3_key_schedule(const uint8_t keyb[], size_t) + { + __m128i rcon = _mm_set_epi32(0x702A9808, 0x4D7C7D81, + 0x1F8391B9, 0xAF9DEEB6); + + m_EK.resize(15*4); + m_DK.resize(15*4); + + __m128i* EK_mm = reinterpret_cast<__m128i*>(m_EK.data()); + __m128i* DK_mm = reinterpret_cast<__m128i*>(m_DK.data()); + + __m128i key1 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(keyb)); + __m128i key2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>((keyb + 16))); + + _mm_storeu_si128(DK_mm + 14, _mm_shuffle_epi8(key1, sr[2])); + + key1 = aes_schedule_transform(key1, k_ipt1, k_ipt2); + key2 = aes_schedule_transform(key2, k_ipt1, k_ipt2); + + _mm_storeu_si128(EK_mm + 0, key1); + _mm_storeu_si128(EK_mm + 1, aes_schedule_mangle(key2, 3)); + + _mm_storeu_si128(DK_mm + 13, aes_schedule_mangle_dec(key2, 1)); + + for(size_t i = 2; i != 14; i += 2) + { + __m128i k_t = key2; + key1 = key2 = aes_schedule_round(&rcon, key2, key1); + + _mm_storeu_si128(EK_mm + i, aes_schedule_mangle(key2, i % 4)); + _mm_storeu_si128(DK_mm + (14-i), aes_schedule_mangle_dec(key2, (i+2) % 4)); + + key2 = aes_schedule_round(nullptr, _mm_shuffle_epi32(key2, 0xFF), k_t); + _mm_storeu_si128(EK_mm + i + 1, aes_schedule_mangle(key2, (i - 1) % 4)); + _mm_storeu_si128(DK_mm + (13-i), aes_schedule_mangle_dec(key2, (i+1) % 4)); + } + + key2 = aes_schedule_round(&rcon, key2, key1); + + _mm_storeu_si128(EK_mm + 14, aes_schedule_mangle_last(key2, 2)); + _mm_storeu_si128(DK_mm + 0, aes_schedule_mangle_last_dec(key2)); + } + +} diff --git a/src/libs/3rdparty/botan/src/lib/block/aes/aes_ssse3/info.txt b/src/libs/3rdparty/botan/src/lib/block/aes/aes_ssse3/info.txt new file mode 100644 index 0000000000..8457eed569 --- /dev/null +++ b/src/libs/3rdparty/botan/src/lib/block/aes/aes_ssse3/info.txt @@ -0,0 +1,15 @@ +<defines> +AES_SSSE3 -> 20131128 +</defines> + +load_on auto + +need_isa ssse3 + +# Intel C++ can't deal with syntax for defining constants :( +<cc> +gcc +clang +msvc +sunstudio +</cc> diff --git a/src/libs/3rdparty/botan/src/lib/block/aes/info.txt b/src/libs/3rdparty/botan/src/lib/block/aes/info.txt new file mode 100644 index 0000000000..62455cf2c3 --- /dev/null +++ b/src/libs/3rdparty/botan/src/lib/block/aes/info.txt @@ -0,0 +1,3 @@ +<defines> +AES -> 20131128 +</defines> |