diff options
Diffstat (limited to 'src/corelib/global/qsimd_p.h')
-rw-r--r-- | src/corelib/global/qsimd_p.h | 156 |
1 files changed, 71 insertions, 85 deletions
diff --git a/src/corelib/global/qsimd_p.h b/src/corelib/global/qsimd_p.h index 5df2faf4e4..012eb6cf4f 100644 --- a/src/corelib/global/qsimd_p.h +++ b/src/corelib/global/qsimd_p.h @@ -34,33 +34,6 @@ QT_WARNING_DISABLE_INTEL(103) for (int _i = 0; _i < max && i < length; ++i, ++_i) /* - * qt_module_config.prf defines the QT_COMPILER_SUPPORTS_XXX macros. - * They mean the compiler supports the necessary flags and the headers - * for the x86 and ARM intrinsics: - * - GCC: the -mXXX or march=YYY flag is necessary before #include - * up to 4.8; GCC >= 4.9 can include unconditionally - * - Intel CC: #include can happen unconditionally - * - MSVC: #include can happen unconditionally - * - RVCT: ??? - * - * We will try to include all headers possible under this configuration. - * - * MSVC does not define __SSE2__ & family, so we will define them. MSVC 2013 & - * up do define __AVX__ if the -arch:AVX option is passed on the command-line. - * - * Supported XXX are: - * Flag | Arch | GCC | Intel CC | MSVC | - * ARM_NEON | ARM | I & C | None | ? | - * SSE2 | x86 | I & C | I & C | I & C | - * SSE3 | x86 | I & C | I & C | I only | - * SSSE3 | x86 | I & C | I & C | I only | - * SSE4_1 | x86 | I & C | I & C | I only | - * SSE4_2 | x86 | I & C | I & C | I only | - * AVX | x86 | I & C | I & C | I & C | - * AVX2 | x86 | I & C | I & C | I only | - * AVX512xx | x86 | I & C | I & C | I only | - * I = intrinsics; C = code generation - * * Code can use the following constructs to determine compiler support & status: * - #ifdef __XXX__ (e.g: #ifdef __AVX__ or #ifdef __ARM_NEON__) * If this test passes, then the compiler is already generating code for that @@ -160,9 +133,50 @@ QT_WARNING_DISABLE_INTEL(103) # define QT_FUNCTION_TARGET(x) #endif +#if defined(__SSE2__) && !defined(QT_COMPILER_SUPPORTS_SSE2) && !defined(QT_BOOTSTRAPPED) +// Intrinsic support appears to be missing, so pretend these features don't exist +# undef __SSE__ +# undef __SSE2__ +# undef __SSE3__ +# undef __SSSE3__ +# undef __SSE4_1__ +# undef __SSE4_2__ +# undef __AES__ +# undef __POPCNT__ +# undef __AVX__ +# undef __F16C__ +# undef __RDRND__ +# undef __AVX2__ +# undef __BMI__ +# undef __BMI2__ +# undef __FMA__ +# undef __MOVBE__ +# undef __RDSEED__ +# undef __AVX512F__ +# undef __AVX512ER__ +# undef __AVX512CD__ +# undef __AVX512PF__ +# undef __AVX512DQ__ +# undef __AVX512BW__ +# undef __AVX512VL__ +# undef __AVX512IFMA__ +# undef __AVX512VBMI__ +# undef __SHA__ +# undef __AVX512VBMI2__ +# undef __AVX512BITALG__ +# undef __AVX512VNNI__ +# undef __AVX512VPOPCNTDQ__ +# undef __GFNI__ +# undef __VAES__ +#endif + #ifdef Q_PROCESSOR_X86 /* -- x86 intrinsic support -- */ +# if defined(QT_COMPILER_SUPPORTS_RDSEED) && defined(Q_OS_QNX) +// The compiler for QNX is missing the intrinsic +# undef QT_COMPILER_SUPPORTS_RDSEED +# endif # if defined(Q_CC_MSVC) && (defined(_M_X64) || _M_IX86_FP >= 2) // MSVC doesn't define __SSE2__, so do it ourselves # define __SSE__ 1 @@ -199,32 +213,37 @@ asm( # include <immintrin.h> # endif -# include "qsimd_x86_p.h" +# include <QtCore/private/qsimd_x86_p.h> // x86-64 sub-architecture version 3 // // The Intel Core 4th generation was codenamed "Haswell" and introduced AVX2, -// BMI1, BMI2, FMA, LZCNT, MOVBE, which makes it a good divider for a -// sub-target for us. The first AMD processor with AVX2 support (Zen) has the -// same features. This feature set was chosen as the version 3 of the x86-64 -// ISA (x86-64-v3) and is supported by GCC and Clang. -// -// macOS's fat binaries support the "x86_64h" sub-architecture and the GNU libc -// ELF loader also supports a "haswell/" subdir (e.g., /usr/lib/haswell). -# define ARCH_HASWELL_MACROS (__AVX2__ + __BMI__ + __BMI2__ + __F16C__ + __FMA__ + __LZCNT__) -# if ARCH_HASWELL_MACROS != 0 -# if ARCH_HASWELL_MACROS != 6 +// BMI1, BMI2, FMA, LZCNT, MOVBE. This feature set was chosen as the version 3 +// of the x86-64 ISA (x86-64-v3) and is supported by GCC and Clang. On systems +// with the GNU libc, libraries with this feature can be installed on a +// "glibc-hwcaps/x86-64-v3" subdir. macOS's fat binaries support the "x86_64h" +// sub-architecture too. + +# if defined(__AVX2__) +// List of features present with -march=x86-64-v3 and not architecturally +// implied by __AVX2__ +# define ARCH_HASWELL_MACROS \ + (__AVX2__ + __BMI__ + __BMI2__ + __F16C__ + __FMA__ + __LZCNT__ + __POPCNT__) +# if ARCH_HASWELL_MACROS != 7 # error "Please enable all x86-64-v3 extensions; you probably want to use -march=haswell or -march=x86-64-v3 instead of -mavx2" # endif static_assert(ARCH_HASWELL_MACROS, "Undeclared identifiers indicate which features are missing."); # define __haswell__ 1 +# undef ARCH_HASWELL_MACROS # endif -# undef ARCH_HASWELL_MACROS // x86-64 sub-architecture version 4 // -// Similar to the above, x86-64-v4 marches the AVX512 variant of the Intel Core -// 6th generation (codename "Skylake"). +// Similar to the above, x86-64-v4 matches the AVX512 variant of the Intel Core +// 6th generation (codename "Skylake"). AMD Zen4 is the their first processor +// with AVX512 support and it includes all of these too. The GNU libc subdir for +// this is "glibc-hwcaps/x86-64-v4". +// # define ARCH_SKX_MACROS (__AVX512F__ + __AVX512BW__ + __AVX512CD__ + __AVX512DQ__ + __AVX512VL__) # if ARCH_SKX_MACROS != 0 # if ARCH_SKX_MACROS != 5 @@ -306,12 +325,19 @@ static const uint64_t qCompilerCpuFeatures = 0 #if defined __ARM_NEON__ | CpuFeatureNEON #endif +#if !(defined(Q_OS_LINUX) && defined(Q_PROCESSOR_ARM_64)) + // Yocto Project recipes enable Crypto extension for all ARMv8 configs, + // even for targets without the Crypto extension. That's wrong, but as + // the compiler never generates the code for them on their own, most + // code never notices the problem. But we would. By not setting the + // bits here, we force a runtime detection. #if defined __ARM_FEATURE_CRC32 | CpuFeatureCRC32 #endif #if defined __ARM_FEATURE_CRYPTO | CpuFeatureAES #endif +#endif // Q_OS_LINUX && Q_PROCESSOR_ARM64 #if defined __mips_dsp | CpuFeatureDSP #endif @@ -346,61 +372,21 @@ Q_CORE_EXPORT uint64_t QT_MANGLE_NAMESPACE(qDetectCpuFeatures)(); static inline uint64_t qCpuFeatures() { +#ifdef QT_BOOTSTRAPPED + return qCompilerCpuFeatures; // no detection +#else quint64 features = atomic_load_explicit(QT_MANGLE_NAMESPACE(qt_cpu_features), memory_order_relaxed); if (!QT_SUPPORTS_INIT_PRIORITY) { if (Q_UNLIKELY(features == 0)) features = QT_MANGLE_NAMESPACE(qDetectCpuFeatures)(); } return features; +#endif } #define qCpuHasFeature(feature) (((qCompilerCpuFeatures & CpuFeature ## feature) == CpuFeature ## feature) \ || ((qCpuFeatures() & CpuFeature ## feature) == CpuFeature ## feature)) -/* - Small wrapper around x86's PAUSE and ARM's YIELD instructions. - - This is completely different from QThread::yieldCurrentThread(), which is - an OS-level operation that takes the whole thread off the CPU. - - This is just preventing one SMT thread from filling a core's pipeline with - speculated further loop iterations (which need to be expensively flushed on - final success) when it could just give those pipeline slots to a second SMT - thread that can do something useful with the core, such as unblocking this - SMT thread :) - - So, instead of - - while (!condition) - ; - - it's better to use - - while (!condition) - qYieldCpu(); -*/ -static inline void qYieldCpu() -{ -#if defined(Q_PROCESSOR_X86) - _mm_pause(); -#elif defined(Q_PROCESSOR_ARM) -# if __has_builtin(__builtin_arm_yield) /* e.g. Clang */ - __builtin_arm_yield(); -# elif defined(Q_OS_INTEGRITY) || \ - (defined(Q_CC_GNU) && !defined(Q_CC_CLANG)) - /* - - Integrity is missing the arm_acle.h header - - GCC doesn't have __yield() in arm_acle.h - https://stackoverflow.com/a/70076751/134841 - https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105416 - */ - asm volatile("yield"); /* this works everywhere */ -# else - __yield(); /* this is what should work everywhere */ -# endif -#endif -} - #ifdef __cplusplus } // extern "C" |