From cf63b0e1dfc0bf3d11a92c5bf82840ddb6bb22ac Mon Sep 17 00:00:00 2001 From: Thiago Macieira Date: Sun, 17 Sep 2017 12:39:35 -0700 Subject: qsimd: add support for new x86 CPU features MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This adds detection for: VAES, GFNI, AVX512VBMI2, AVX512VNNI, AVX512BITALG, AVX512VPOPCNTDQ, AVX512_4NNIW, AVX512_4FMAPS. These features were found in the "Intel® Architecture Instruction Set Extensions and Future Features" manual, revision 30. This commit also adds support for RDPID (already in the main manual) and the Control-flow Enforcement Technology, which appears in a separate Intel paper. This new support was done by adding a new generator script so we don't have to maintain two tables in sync, one in qsimd.cpp with the feature names, and the other in qsimd_p.h. Since we now need a lot more bits, it's no longer worth keeping the two halves of the qt_cpu_features variable mostly similar to the main two CPUID results. This commit goes back to keeping things in order, like we used to prior to commit 6a8251a89b6a61258498f4af1ba7b3d5b7f7096c (Qt 5.6). At the time of this commit, GCC 8 has macros for AVX512VPOPCNTDQ, AVX512_4NNIW, AVX512_4FMAPS, AVX512VBMI2 and GFNI. 
Change-Id: I938b024e38bf4aac9154fffd14f7afae50faaa96 Reviewed-by: Edward Welbourne Reviewed-by: Lars Knoll --- src/corelib/tools/qsimd.cpp | 252 +++++++++++----------------------------- src/corelib/tools/qsimd_p.h | 144 +---------------------- src/corelib/tools/qsimd_x86.cpp | 98 ++++++++++++++++ src/corelib/tools/qsimd_x86_p.h | 227 ++++++++++++++++++++++++++++++++++++ 4 files changed, 393 insertions(+), 328 deletions(-) create mode 100644 src/corelib/tools/qsimd_x86.cpp create mode 100644 src/corelib/tools/qsimd_x86_p.h (limited to 'src') diff --git a/src/corelib/tools/qsimd.cpp b/src/corelib/tools/qsimd.cpp index c4d7117449..fd9c6a7079 100644 --- a/src/corelib/tools/qsimd.cpp +++ b/src/corelib/tools/qsimd.cpp @@ -1,7 +1,7 @@ /**************************************************************************** ** ** Copyright (C) 2016 The Qt Company Ltd. -** Copyright (C) 2016 Intel Corporation. +** Copyright (C) 2018 Intel Corporation. ** Contact: https://www.qt.io/licensing/ ** ** This file is part of the QtCore module of the Qt Toolkit. @@ -80,6 +80,43 @@ QT_BEGIN_NAMESPACE +/* + * Use kdesdk/scripts/generate_string_table.pl to update the table below. Note + * we remove the terminating -1 that the script adds. 
+ */ + +// begin generated +#if defined(Q_PROCESSOR_ARM) +/* Data: + neon + crc32 + */ +static const char features_string[] = + " neon\0" + " crc32\0" + "\0"; +static const int features_indices[] = { 0, 6 }; +#elif defined(Q_PROCESSOR_MIPS) +/* Data: + dsp + dspr2 +*/ +static const char features_string[] = + " dsp\0" + " dspr2\0" + "\0"; + +static const int features_indices[] = { + 0, 5 +}; +#elif defined(Q_PROCESSOR_X86) +# include "qsimd_x86.cpp" // generated by util/x86simdgen +#else +static const char features_string[] = ""; +static const int features_indices[] = { }; +#endif +// end generated + #if defined (Q_OS_NACL) static inline uint detectProcessorFeatures() { @@ -222,29 +259,32 @@ static void cpuidFeatures01(uint &ecx, uint &edx) inline void __cpuidex(int info[4], int, __int64) { memset(info, 0, 4*sizeof(int));} #endif -static void cpuidFeatures07_00(uint &ebx, uint &ecx) +static void cpuidFeatures07_00(uint &ebx, uint &ecx, uint &edx) { #if defined(Q_CC_GNU) qregisteruint rbx; // in case it's 64-bit qregisteruint rcx = 0; + qregisteruint rdx = 0; asm ("xchg " PICreg", %0\n" "cpuid\n" "xchg " PICreg", %0\n" - : "=&r" (rbx), "+&c" (rcx) - : "a" (7) - : "%edx"); + : "=&r" (rbx), "+&c" (rcx), "+&d" (rdx) + : "a" (7)); ebx = rbx; ecx = rcx; + edx = rdx; #elif defined(Q_OS_WIN) int info[4]; __cpuidex(info, 7, 0); ebx = info[1]; ecx = info[2]; + edx = info[3]; #elif defined(Q_CC_GHS) unsigned int info[4]; __CPUIDEX(7, 0, info); ebx = info[1]; ecx = info[2]; + edx = info[3]; #endif } @@ -285,8 +325,11 @@ static quint64 detectProcessorFeatures() static const quint64 AllAVX512 = (Q_UINT64_C(1) << CpuFeatureAVX512F) | (Q_UINT64_C(1) << CpuFeatureAVX512CD) | (Q_UINT64_C(1) << CpuFeatureAVX512ER) | (Q_UINT64_C(1) << CpuFeatureAVX512PF) | (Q_UINT64_C(1) << CpuFeatureAVX512BW) | (Q_UINT64_C(1) << CpuFeatureAVX512DQ) | - (Q_UINT64_C(1) << CpuFeatureAVX512VL) | - (Q_UINT64_C(1) << CpuFeatureAVX512IFMA) | (Q_UINT64_C(1) << CpuFeatureAVX512VBMI); + (Q_UINT64_C(1) << 
CpuFeatureAVX512VL) | (Q_UINT64_C(1) << CpuFeatureAVX512IFMA) | + (Q_UINT64_C(1) << CpuFeatureAVX512VBMI) | (Q_UINT64_C(1) << CpuFeatureAVX512VBMI2) | + (Q_UINT64_C(1) << CpuFeatureAVX512VNNI) | (Q_UINT64_C(1) << CpuFeatureAVX512BITALG) | + (Q_UINT64_C(1) << CpuFeatureAVX512VPOPCNTDQ) | + (Q_UINT64_C(1) << CpuFeatureAVX5124NNIW) | (Q_UINT64_C(1) << CpuFeatureAVX5124FMAPS); static const quint64 AllAVX2 = (Q_UINT64_C(1) << CpuFeatureAVX2) | AllAVX512; static const quint64 AllAVX = (Q_UINT64_C(1) << CpuFeatureAVX) | AllAVX2; @@ -299,52 +342,33 @@ static quint64 detectProcessorFeatures() Q_ASSERT(cpuidLevel >= 1); #endif - uint cpuid01ECX = 0, cpuid01EDX = 0; - cpuidFeatures01(cpuid01ECX, cpuid01EDX); - - // the low 32-bits of features is cpuid01ECX - // note: we need to check OS support for saving the AVX register state - features = cpuid01ECX; - -#if defined(Q_PROCESSOR_X86_32) - // x86 might not have SSE2 support - if (cpuid01EDX & (1u << 26)) - features |= Q_UINT64_C(1) << CpuFeatureSSE2; - else - features &= ~(Q_UINT64_C(1) << CpuFeatureSSE2); - // we should verify that the OS enabled saving of the SSE state... 
-#else - // x86-64 or x32 - features |= Q_UINT64_C(1) << CpuFeatureSSE2; -#endif + uint results[X86CpuidMaxLeaf] = {}; + cpuidFeatures01(results[Leaf1ECX], results[Leaf1EDX]); + if (cpuidLevel >= 7) + cpuidFeatures07_00(results[Leaf7_0EBX], results[Leaf7_0ECX], results[Leaf7_0EDX]); + + // populate our feature list + for (uint i = 0; i < sizeof(x86_locators) / sizeof(x86_locators[0]); ++i) { + uint word = x86_locators[i] / 32; + uint bit = 1U << (x86_locators[i] % 32); + quint64 feature = Q_UINT64_C(1) << (i + 1); + if (results[word] & bit) + features |= feature; + } + // now check the AVX state uint xgetbvA = 0, xgetbvD = 0; - if (cpuid01ECX & (1u << 27)) { + if (results[Leaf1ECX] & (1u << 27)) { // XGETBV enabled xgetbv(0, xgetbvA, xgetbvD); } - uint cpuid0700EBX = 0; - uint cpuid0700ECX = 0; - if (cpuidLevel >= 7) { - cpuidFeatures07_00(cpuid0700EBX, cpuid0700ECX); - - // the high 32-bits of features is cpuid0700EBX - features |= quint64(cpuid0700EBX) << 32; - } - if ((xgetbvA & AVXState) != AVXState) { // support for YMM registers is disabled, disable all AVX features &= ~AllAVX; } else if ((xgetbvA & AVX512State) != AVX512State) { // support for ZMM registers or mask registers is disabled, disable all AVX512 features &= ~AllAVX512; - } else { - // this feature is out of order - if (cpuid0700ECX & (1u << 1)) - features |= Q_UINT64_C(1) << CpuFeatureAVX512VBMI; - else - features &= ~(Q_UINT64_C(1) << CpuFeatureAVX512VBMI); } return features; @@ -493,152 +517,6 @@ static inline uint detectProcessorFeatures() } #endif -/* - * Use kdesdk/scripts/generate_string_table.pl to update the table below. Note - * that the x86 version has a lot of blanks that must be kept and that the - * offset table's type is changed to make the table smaller. We also remove the - * terminating -1 that the script adds. 
- */ - -// begin generated -#if defined(Q_PROCESSOR_ARM) -/* Data: - neon - crc32 - */ -static const char features_string[] = - " neon\0" - " crc32\0" - "\0"; -static const int features_indices[] = { 0, 6 }; -#elif defined(Q_PROCESSOR_MIPS) -/* Data: - dsp - dspr2 -*/ -static const char features_string[] = - " dsp\0" - " dspr2\0" - "\0"; - -static const int features_indices[] = { - 0, 5 -}; -#elif defined(Q_PROCESSOR_X86) -/* Data: - sse3 - sse2 - avx512vbmi - - - - - - - ssse3 - - - fma - cmpxchg16b - - - - - - sse4.1 - sse4.2 - - movbe - popcnt - - aes - - - avx - f16c - rdrand - - - - - bmi - hle - avx2 - - - bmi2 - - - rtm - - - - - avx512f - avx512dq - rdseed - - - avx512ifma - - - - - avx512pf - avx512er - avx512cd - sha - avx512bw - avx512vl - */ -static const char features_string[] = - " sse3\0" - " sse2\0" - " avx512vbmi\0" - " ssse3\0" - " fma\0" - " cmpxchg16b\0" - " sse4.1\0" - " sse4.2\0" - " movbe\0" - " popcnt\0" - " aes\0" - " avx\0" - " f16c\0" - " rdrand\0" - " bmi\0" - " hle\0" - " avx2\0" - " bmi2\0" - " rtm\0" - " avx512f\0" - " avx512dq\0" - " rdseed\0" - " avx512ifma\0" - " avx512pf\0" - " avx512er\0" - " avx512cd\0" - " sha\0" - " avx512bw\0" - " avx512vl\0" - "\0"; - -static const quint8 features_indices[] = { - 0, 6, 12, 5, 5, 5, 5, 5, - 5, 24, 5, 5, 31, 36, 5, 5, - 5, 5, 5, 48, 56, 5, 64, 71, - 5, 79, 5, 5, 84, 89, 95, 5, - 5, 5, 5, 103, 108, 113, 5, 5, - 119, 5, 5, 125, 5, 5, 5, 5, - 130, 139, 149, 5, 5, 157, 5, 5, - 5, 5, 169, 179, 189, 199, 204, 214 -}; -#else -static const char features_string[] = ""; -static const int features_indices[] = { }; -#endif -// end generated - static const int features_count = (sizeof features_indices) / (sizeof features_indices[0]); // record what CPU features were enabled by default in this Qt build diff --git a/src/corelib/tools/qsimd_p.h b/src/corelib/tools/qsimd_p.h index 3161ee7412..1b7ed57fa8 100644 --- a/src/corelib/tools/qsimd_p.h +++ b/src/corelib/tools/qsimd_p.h @@ -179,6 +179,7 @@ #ifdef 
Q_PROCESSOR_X86 /* -- x86 intrinsic support -- */ +# include "qsimd_x86_p.h" # if defined(Q_CC_MSVC) && (defined(_M_X64) || _M_IX86_FP >= 2) // MSVC doesn't define __SSE2__, so do it ourselves @@ -232,33 +233,6 @@ # define __RDRND__ 1 # endif -#define QT_FUNCTION_TARGET_STRING_SSE2 "sse2" -#define QT_FUNCTION_TARGET_STRING_SSE3 "sse3" -#define QT_FUNCTION_TARGET_STRING_SSSE3 "ssse3" -#define QT_FUNCTION_TARGET_STRING_SSE4_1 "sse4.1" -#define QT_FUNCTION_TARGET_STRING_SSE4_2 "sse4.2" -#define QT_FUNCTION_TARGET_STRING_AVX "avx" -#define QT_FUNCTION_TARGET_STRING_AVX2 "avx2" -#define QT_FUNCTION_TARGET_STRING_AVX512F "avx512f" -#define QT_FUNCTION_TARGET_STRING_AVX512CD "avx512cd" -#define QT_FUNCTION_TARGET_STRING_AVX512ER "avx512er" -#define QT_FUNCTION_TARGET_STRING_AVX512PF "avx512pf" -#define QT_FUNCTION_TARGET_STRING_AVX512BW "avx512bw" -#define QT_FUNCTION_TARGET_STRING_AVX512DQ "avx512dq" -#define QT_FUNCTION_TARGET_STRING_AVX512VL "avx512vl" -#define QT_FUNCTION_TARGET_STRING_AVX512IFMA "avx512ifma" -#define QT_FUNCTION_TARGET_STRING_AVX512VBMI "avx512vbmi" - -#define QT_FUNCTION_TARGET_STRING_AES "aes,sse4.2" -#define QT_FUNCTION_TARGET_STRING_PCLMUL "pclmul,sse4.2" -#define QT_FUNCTION_TARGET_STRING_POPCNT "popcnt" -#define QT_FUNCTION_TARGET_STRING_F16C "f16c,avx" -#define QT_FUNCTION_TARGET_STRING_RDRND "rdrnd" -#define QT_FUNCTION_TARGET_STRING_BMI "bmi" -#define QT_FUNCTION_TARGET_STRING_BMI2 "bmi2" -#define QT_FUNCTION_TARGET_STRING_RDSEED "rdseed" -#define QT_FUNCTION_TARGET_STRING_SHA "sha" - #endif /* Q_PROCESSOR_X86 */ // Clang compiler fix, see http://lists.llvm.org/pipermail/cfe-commits/Week-of-Mon-20160222/151168.html @@ -292,6 +266,7 @@ QT_BEGIN_NAMESPACE +#ifndef Q_PROCESSOR_X86 enum CPUFeatures { #if defined(Q_PROCESSOR_ARM) CpuFeatureNEON = 0, @@ -300,42 +275,6 @@ enum CPUFeatures { #elif defined(Q_PROCESSOR_MIPS) CpuFeatureDSP = 0, CpuFeatureDSPR2 = 1, -#elif defined(Q_PROCESSOR_X86) - // The order of the flags is jumbled so it matches 
most closely the bits in CPUID - // Out of order: - CpuFeatureSSE2 = 1, // uses the bit for PCLMULQDQ - // in level 1, ECX - CpuFeatureSSE3 = (0 + 0), - CpuFeatureSSSE3 = (0 + 9), - CpuFeatureSSE4_1 = (0 + 19), - CpuFeatureSSE4_2 = (0 + 20), - CpuFeatureMOVBE = (0 + 22), - CpuFeaturePOPCNT = (0 + 23), - CpuFeatureAES = (0 + 25), - CpuFeatureAVX = (0 + 28), - CpuFeatureF16C = (0 + 29), - CpuFeatureRDRND = (0 + 30), - // 31 is always zero and we've used it for the QSimdInitialized - - // in level 7, leaf 0, EBX - CpuFeatureBMI = (32 + 3), - CpuFeatureHLE = (32 + 4), - CpuFeatureAVX2 = (32 + 5), - CpuFeatureBMI2 = (32 + 8), - CpuFeatureRTM = (32 + 11), - CpuFeatureAVX512F = (32 + 16), - CpuFeatureAVX512DQ = (32 + 17), - CpuFeatureRDSEED = (32 + 18), - CpuFeatureAVX512IFMA = (32 + 21), - CpuFeatureAVX512PF = (32 + 26), - CpuFeatureAVX512ER = (32 + 27), - CpuFeatureAVX512CD = (32 + 28), - CpuFeatureSHA = (32 + 29), - CpuFeatureAVX512BW = (32 + 30), - CpuFeatureAVX512VL = (32 + 31), - - // in level 7, leaf 0, ECX (out of order, for now) - CpuFeatureAVX512VBMI = 2, // uses the bit for DTES64 #endif // used only to indicate that the CPU detection was initialised @@ -343,84 +282,6 @@ enum CPUFeatures { }; static const quint64 qCompilerCpuFeatures = 0 -#if defined __SHA__ - | (Q_UINT64_C(1) << CpuFeatureSHA) -#endif -#if defined __AES__ - | (Q_UINT64_C(1) << CpuFeatureAES) -#endif -#if defined __RTM__ - | (Q_UINT64_C(1) << CpuFeatureRTM) -#endif -#ifdef __RDRND__ - | (Q_UINT64_C(1) << CpuFeatureRDRND) -#endif -#ifdef __RDSEED__ - | (Q_UINT64_C(1) << CpuFeatureRDSEED) -#endif -#if defined __BMI__ - | (Q_UINT64_C(1) << CpuFeatureBMI) -#endif -#if defined __BMI2__ - | (Q_UINT64_C(1) << CpuFeatureBMI2) -#endif -#if defined __F16C__ - | (Q_UINT64_C(1) << CpuFeatureF16C) -#endif -#if defined __POPCNT__ - | (Q_UINT64_C(1) << CpuFeaturePOPCNT) -#endif -#if defined __MOVBE__ // GCC and Clang don't seem to define this - | (Q_UINT64_C(1) << CpuFeatureMOVBE) -#endif -#if defined 
__AVX512F__ - | (Q_UINT64_C(1) << CpuFeatureAVX512F) -#endif -#if defined __AVX512CD__ - | (Q_UINT64_C(1) << CpuFeatureAVX512CD) -#endif -#if defined __AVX512ER__ - | (Q_UINT64_C(1) << CpuFeatureAVX512ER) -#endif -#if defined __AVX512PF__ - | (Q_UINT64_C(1) << CpuFeatureAVX512PF) -#endif -#if defined __AVX512BW__ - | (Q_UINT64_C(1) << CpuFeatureAVX512BW) -#endif -#if defined __AVX512DQ__ - | (Q_UINT64_C(1) << CpuFeatureAVX512DQ) -#endif -#if defined __AVX512VL__ - | (Q_UINT64_C(1) << CpuFeatureAVX512VL) -#endif -#if defined __AVX512IFMA__ - | (Q_UINT64_C(1) << CpuFeatureAVX512IFMA) -#endif -#if defined __AVX512VBMI__ - | (Q_UINT64_C(1) << CpuFeatureAVX512VBMI) -#endif -#if defined __AVX2__ - | (Q_UINT64_C(1) << CpuFeatureAVX2) -#endif -#if defined __AVX__ - | (Q_UINT64_C(1) << CpuFeatureAVX) -#endif -#if defined __SSE4_2__ - | (Q_UINT64_C(1) << CpuFeatureSSE4_2) -#endif -#if defined __SSE4_1__ - | (Q_UINT64_C(1) << CpuFeatureSSE4_1) -#endif -#if defined __SSSE3__ - | (Q_UINT64_C(1) << CpuFeatureSSSE3) -#endif -#if defined __SSE3__ - | (Q_UINT64_C(1) << CpuFeatureSSE3) -#endif -#if defined __SSE2__ - | (Q_UINT64_C(1) << CpuFeatureSSE2) -#endif #if defined __ARM_NEON__ | (Q_UINT64_C(1) << CpuFeatureNEON) #endif @@ -434,6 +295,7 @@ static const quint64 qCompilerCpuFeatures = 0 | (Q_UINT64_C(1) << CpuFeatureDSPR2) #endif ; +#endif #ifdef QT_BOOTSTRAPPED static inline quint64 qCpuFeatures() diff --git a/src/corelib/tools/qsimd_x86.cpp b/src/corelib/tools/qsimd_x86.cpp new file mode 100644 index 0000000000..8275f964d8 --- /dev/null +++ b/src/corelib/tools/qsimd_x86.cpp @@ -0,0 +1,98 @@ +// This is a generated file. DO NOT EDIT. 
+// Please see util/x86simdgen/generate.pl +#include <qglobal.h> + +static const char features_string[] = + " sse2\0" + " sse3\0" + " ssse3\0" + " fma\0" + " sse4.1\0" + " sse4.2\0" + " movbe\0" + " popcnt\0" + " aes\0" + " avx\0" + " f16c\0" + " rdrnd\0" + " bmi\0" + " hle\0" + " avx2\0" + " bmi2\0" + " rtm\0" + " avx512f\0" + " avx512dq\0" + " rdseed\0" + " avx512ifma\0" + " avx512pf\0" + " avx512er\0" + " avx512cd\0" + " sha\0" + " avx512bw\0" + " avx512vl\0" + " avx512vbmi\0" + " avx512vbmi2\0" + " gfni\0" + " vaes\0" + " avx512vnni\0" + " avx512bitalg\0" + " avx512vpopcntdq\0" + " avx5124nniw\0" + " avx5124fmaps\0" + "\0"; + +static const quint16 features_indices[] = { + 306, 0, 6, 12, 19, 24, 32, 40, + 47, 55, 60, 65, 71, 78, 83, 88, + 94, 100, 105, 114, 124, 132, 144, 154, + 164, 174, 179, 189, 199, 211, 224, 230, + 236, 248, 262, 279, 292 +}; + +enum X86CpuidLeaves { + Leaf1ECX, + Leaf1EDX, + Leaf7_0EBX, + Leaf7_0ECX, + Leaf7_0EDX, + X86CpuidMaxLeaf +}; + +static const quint8 x86_locators[] = { + Leaf1EDX*32 + 26, // sse2 + Leaf1ECX*32 + 0, // sse3 + Leaf1ECX*32 + 9, // ssse3 + Leaf1ECX*32 + 12, // fma + Leaf1ECX*32 + 19, // sse4.1 + Leaf1ECX*32 + 20, // sse4.2 + Leaf1ECX*32 + 22, // movbe + Leaf1ECX*32 + 23, // popcnt + Leaf1ECX*32 + 25, // aes + Leaf1ECX*32 + 28, // avx + Leaf1ECX*32 + 29, // f16c + Leaf1ECX*32 + 30, // rdrnd + Leaf7_0EBX*32 + 3, // bmi + Leaf7_0EBX*32 + 4, // hle + Leaf7_0EBX*32 + 5, // avx2 + Leaf7_0EBX*32 + 8, // bmi2 + Leaf7_0EBX*32 + 11, // rtm + Leaf7_0EBX*32 + 16, // avx512f + Leaf7_0EBX*32 + 17, // avx512dq + Leaf7_0EBX*32 + 18, // rdseed + Leaf7_0EBX*32 + 21, // avx512ifma + Leaf7_0EBX*32 + 26, // avx512pf + Leaf7_0EBX*32 + 27, // avx512er + Leaf7_0EBX*32 + 28, // avx512cd + Leaf7_0EBX*32 + 29, // sha + Leaf7_0EBX*32 + 30, // avx512bw + Leaf7_0EBX*32 + 31, // avx512vl + Leaf7_0ECX*32 + 1, // avx512vbmi + Leaf7_0ECX*32 + 6, // avx512vbmi2 + Leaf7_0ECX*32 + 8, // gfni + Leaf7_0ECX*32 + 9, // vaes + Leaf7_0ECX*32 + 11, // avx512vnni + 
Leaf7_0ECX*32 + 12, // avx512bitalg + Leaf7_0ECX*32 + 14, // avx512vpopcntdq + Leaf7_0EDX*32 + 2, // avx5124nniw + Leaf7_0EDX*32 + 3 // avx5124fmaps +}; diff --git a/src/corelib/tools/qsimd_x86_p.h b/src/corelib/tools/qsimd_x86_p.h new file mode 100644 index 0000000000..45d5f2895f --- /dev/null +++ b/src/corelib/tools/qsimd_x86_p.h @@ -0,0 +1,227 @@ +// This is a generated file. DO NOT EDIT. +// Please see util/x86simdgen/generate.pl +#ifndef QSIMD_P_H +# error "Please include <private/qsimd_p.h> instead" +#endif +#ifndef QSIMD_X86_P_H +#define QSIMD_X86_P_H + +#include "qsimd_p.h" + +// +// W A R N I N G +// ------------- +// +// This file is not part of the Qt API. It exists purely as an +// implementation detail. This header file may change from version to +// version without notice, or even be removed. +// +// We mean it. +// + +QT_BEGIN_NAMESPACE + +// Macros for QT_FUNCTION_TARGET (for Clang and GCC) +#define QT_FUNCTION_TARGET_STRING_SSE2 "sse2" +#define QT_FUNCTION_TARGET_STRING_SSE3 "sse3" +#define QT_FUNCTION_TARGET_STRING_SSSE3 "ssse3" +#define QT_FUNCTION_TARGET_STRING_FMA "fma" +#define QT_FUNCTION_TARGET_STRING_SSE4_1 "sse4.1" +#define QT_FUNCTION_TARGET_STRING_SSE4_2 "sse4.2" +#define QT_FUNCTION_TARGET_STRING_MOVBE "movbe" +#define QT_FUNCTION_TARGET_STRING_POPCNT "popcnt" +#define QT_FUNCTION_TARGET_STRING_AES "aes,sse4.2" +#define QT_FUNCTION_TARGET_STRING_AVX "avx" +#define QT_FUNCTION_TARGET_STRING_F16C "f16c" +#define QT_FUNCTION_TARGET_STRING_RDRND "rdrnd" +#define QT_FUNCTION_TARGET_STRING_BMI "bmi" +#define QT_FUNCTION_TARGET_STRING_HLE "hle" +#define QT_FUNCTION_TARGET_STRING_AVX2 "avx2" +#define QT_FUNCTION_TARGET_STRING_BMI2 "bmi2" +#define QT_FUNCTION_TARGET_STRING_RTM "rtm" +#define QT_FUNCTION_TARGET_STRING_AVX512F "avx512f" +#define QT_FUNCTION_TARGET_STRING_AVX512DQ "avx512dq" +#define QT_FUNCTION_TARGET_STRING_RDSEED "rdseed" +#define QT_FUNCTION_TARGET_STRING_AVX512IFMA "avx512ifma" +#define QT_FUNCTION_TARGET_STRING_AVX512PF "avx512pf" +#define 
QT_FUNCTION_TARGET_STRING_AVX512ER "avx512er" +#define QT_FUNCTION_TARGET_STRING_AVX512CD "avx512cd" +#define QT_FUNCTION_TARGET_STRING_SHA "sha" +#define QT_FUNCTION_TARGET_STRING_AVX512BW "avx512bw" +#define QT_FUNCTION_TARGET_STRING_AVX512VL "avx512vl" +#define QT_FUNCTION_TARGET_STRING_AVX512VBMI "avx512vbmi" +#define QT_FUNCTION_TARGET_STRING_AVX512VBMI2 "avx512vbmi2" +#define QT_FUNCTION_TARGET_STRING_GFNI "gfni" +#define QT_FUNCTION_TARGET_STRING_VAES "vaes" +#define QT_FUNCTION_TARGET_STRING_AVX512VNNI "avx512vnni" +#define QT_FUNCTION_TARGET_STRING_AVX512BITALG "avx512bitalg" +#define QT_FUNCTION_TARGET_STRING_AVX512VPOPCNTDQ "avx512vpopcntdq" +#define QT_FUNCTION_TARGET_STRING_AVX5124NNIW "avx5124nniw" +#define QT_FUNCTION_TARGET_STRING_AVX5124FMAPS "avx5124fmaps" + +enum CPUFeatures { + // in CPUID Leaf 1, EDX: + CpuFeatureSSE2 = 1, + + // in CPUID Leaf 1, ECX: + CpuFeatureSSE3 = 2, + CpuFeatureSSSE3 = 3, + CpuFeatureFMA = 4, + CpuFeatureSSE4_1 = 5, + CpuFeatureSSE4_2 = 6, + CpuFeatureMOVBE = 7, + CpuFeaturePOPCNT = 8, + CpuFeatureAES = 9, + CpuFeatureAVX = 10, + CpuFeatureF16C = 11, + CpuFeatureRDRND = 12, + + // in CPUID Leaf 7, Sub-leaf 0, EBX: + CpuFeatureBMI = 13, + CpuFeatureHLE = 14, + CpuFeatureAVX2 = 15, + CpuFeatureBMI2 = 16, + CpuFeatureRTM = 17, + CpuFeatureAVX512F = 18, + CpuFeatureAVX512DQ = 19, + CpuFeatureRDSEED = 20, + CpuFeatureAVX512IFMA = 21, + CpuFeatureAVX512PF = 22, + CpuFeatureAVX512ER = 23, + CpuFeatureAVX512CD = 24, + CpuFeatureSHA = 25, + CpuFeatureAVX512BW = 26, + CpuFeatureAVX512VL = 27, + + // in CPUID Leaf 7, Sub-leaf 0, ECX: + CpuFeatureAVX512VBMI = 28, + CpuFeatureAVX512VBMI2 = 29, + CpuFeatureGFNI = 30, + CpuFeatureVAES = 31, + CpuFeatureAVX512VNNI = 32, + CpuFeatureAVX512BITALG = 33, + CpuFeatureAVX512VPOPCNTDQ = 34, + + // in CPUID Leaf 7, Sub-leaf 0, EDX: + CpuFeatureAVX5124NNIW = 35, + CpuFeatureAVX5124FMAPS = 36, + + // used only to indicate that the CPU detection was initialized + QSimdInitialized = 1 +}; + +static 
const quint64 qCompilerCpuFeatures = 0 +#ifdef __SSE2__ + | (Q_UINT64_C(1) << CpuFeatureSSE2) +#endif +#ifdef __SSE3__ + | (Q_UINT64_C(1) << CpuFeatureSSE3) +#endif +#ifdef __SSSE3__ + | (Q_UINT64_C(1) << CpuFeatureSSSE3) +#endif +#ifdef __FMA__ + | (Q_UINT64_C(1) << CpuFeatureFMA) +#endif +#ifdef __SSE4_1__ + | (Q_UINT64_C(1) << CpuFeatureSSE4_1) +#endif +#ifdef __SSE4_2__ + | (Q_UINT64_C(1) << CpuFeatureSSE4_2) +#endif +#ifdef __MOVBE__ + | (Q_UINT64_C(1) << CpuFeatureMOVBE) +#endif +#ifdef __POPCNT__ + | (Q_UINT64_C(1) << CpuFeaturePOPCNT) +#endif +#ifdef __AES__ + | (Q_UINT64_C(1) << CpuFeatureAES) +#endif +#ifdef __AVX__ + | (Q_UINT64_C(1) << CpuFeatureAVX) +#endif +#ifdef __F16C__ + | (Q_UINT64_C(1) << CpuFeatureF16C) +#endif +#ifdef __RDRND__ + | (Q_UINT64_C(1) << CpuFeatureRDRND) +#endif +#ifdef __BMI__ + | (Q_UINT64_C(1) << CpuFeatureBMI) +#endif +#ifdef __HLE__ + | (Q_UINT64_C(1) << CpuFeatureHLE) +#endif +#ifdef __AVX2__ + | (Q_UINT64_C(1) << CpuFeatureAVX2) +#endif +#ifdef __BMI2__ + | (Q_UINT64_C(1) << CpuFeatureBMI2) +#endif +#ifdef __RTM__ + | (Q_UINT64_C(1) << CpuFeatureRTM) +#endif +#ifdef __AVX512F__ + | (Q_UINT64_C(1) << CpuFeatureAVX512F) +#endif +#ifdef __AVX512DQ__ + | (Q_UINT64_C(1) << CpuFeatureAVX512DQ) +#endif +#ifdef __RDSEED__ + | (Q_UINT64_C(1) << CpuFeatureRDSEED) +#endif +#ifdef __AVX512IFMA__ + | (Q_UINT64_C(1) << CpuFeatureAVX512IFMA) +#endif +#ifdef __AVX512PF__ + | (Q_UINT64_C(1) << CpuFeatureAVX512PF) +#endif +#ifdef __AVX512ER__ + | (Q_UINT64_C(1) << CpuFeatureAVX512ER) +#endif +#ifdef __AVX512CD__ + | (Q_UINT64_C(1) << CpuFeatureAVX512CD) +#endif +#ifdef __SHA__ + | (Q_UINT64_C(1) << CpuFeatureSHA) +#endif +#ifdef __AVX512BW__ + | (Q_UINT64_C(1) << CpuFeatureAVX512BW) +#endif +#ifdef __AVX512VL__ + | (Q_UINT64_C(1) << CpuFeatureAVX512VL) +#endif +#ifdef __AVX512VBMI__ + | (Q_UINT64_C(1) << CpuFeatureAVX512VBMI) +#endif +#ifdef __AVX512VBMI2__ + | (Q_UINT64_C(1) << CpuFeatureAVX512VBMI2) +#endif +#ifdef __GFNI__ + | 
(Q_UINT64_C(1) << CpuFeatureGFNI) +#endif +#ifdef __VAES__ + | (Q_UINT64_C(1) << CpuFeatureVAES) +#endif +#ifdef __AVX512VNNI__ + | (Q_UINT64_C(1) << CpuFeatureAVX512VNNI) +#endif +#ifdef __AVX512BITALG__ + | (Q_UINT64_C(1) << CpuFeatureAVX512BITALG) +#endif +#ifdef __AVX512VPOPCNTDQ__ + | (Q_UINT64_C(1) << CpuFeatureAVX512VPOPCNTDQ) +#endif +#ifdef __AVX5124NNIW__ + | (Q_UINT64_C(1) << CpuFeatureAVX5124NNIW) +#endif +#ifdef __AVX5124FMAPS__ + | (Q_UINT64_C(1) << CpuFeatureAVX5124FMAPS) +#endif + ; + +QT_END_NAMESPACE + +#endif // QSIMD_X86_P_H + -- cgit v1.2.3