summaryrefslogtreecommitdiffstats
path: root/src/corelib/global/qsimd_p.h
diff options
context:
space:
mode:
Diffstat (limited to 'src/corelib/global/qsimd_p.h')
-rw-r--r--src/corelib/global/qsimd_p.h156
1 files changed, 71 insertions, 85 deletions
diff --git a/src/corelib/global/qsimd_p.h b/src/corelib/global/qsimd_p.h
index 5df2faf4e4..012eb6cf4f 100644
--- a/src/corelib/global/qsimd_p.h
+++ b/src/corelib/global/qsimd_p.h
@@ -34,33 +34,6 @@ QT_WARNING_DISABLE_INTEL(103)
for (int _i = 0; _i < max && i < length; ++i, ++_i)
/*
- * qt_module_config.prf defines the QT_COMPILER_SUPPORTS_XXX macros.
- * They mean the compiler supports the necessary flags and the headers
- * for the x86 and ARM intrinsics:
- * - GCC: the -mXXX or march=YYY flag is necessary before #include
- * up to 4.8; GCC >= 4.9 can include unconditionally
- * - Intel CC: #include can happen unconditionally
- * - MSVC: #include can happen unconditionally
- * - RVCT: ???
- *
- * We will try to include all headers possible under this configuration.
- *
- * MSVC does not define __SSE2__ & family, so we will define them. MSVC 2013 &
- * up do define __AVX__ if the -arch:AVX option is passed on the command-line.
- *
- * Supported XXX are:
- * Flag | Arch | GCC | Intel CC | MSVC |
- * ARM_NEON | ARM | I & C | None | ? |
- * SSE2 | x86 | I & C | I & C | I & C |
- * SSE3 | x86 | I & C | I & C | I only |
- * SSSE3 | x86 | I & C | I & C | I only |
- * SSE4_1 | x86 | I & C | I & C | I only |
- * SSE4_2 | x86 | I & C | I & C | I only |
- * AVX | x86 | I & C | I & C | I & C |
- * AVX2 | x86 | I & C | I & C | I only |
- * AVX512xx | x86 | I & C | I & C | I only |
- * I = intrinsics; C = code generation
- *
* Code can use the following constructs to determine compiler support & status:
* - #ifdef __XXX__ (e.g: #ifdef __AVX__ or #ifdef __ARM_NEON__)
* If this test passes, then the compiler is already generating code for that
@@ -160,9 +133,50 @@ QT_WARNING_DISABLE_INTEL(103)
# define QT_FUNCTION_TARGET(x)
#endif
+#if defined(__SSE2__) && !defined(QT_COMPILER_SUPPORTS_SSE2) && !defined(QT_BOOTSTRAPPED)
+// Intrinsic support appears to be missing, so pretend these features don't exist
+# undef __SSE__
+# undef __SSE2__
+# undef __SSE3__
+# undef __SSSE3__
+# undef __SSE4_1__
+# undef __SSE4_2__
+# undef __AES__
+# undef __POPCNT__
+# undef __AVX__
+# undef __F16C__
+# undef __RDRND__
+# undef __AVX2__
+# undef __BMI__
+# undef __BMI2__
+# undef __FMA__
+# undef __MOVBE__
+# undef __RDSEED__
+# undef __AVX512F__
+# undef __AVX512ER__
+# undef __AVX512CD__
+# undef __AVX512PF__
+# undef __AVX512DQ__
+# undef __AVX512BW__
+# undef __AVX512VL__
+# undef __AVX512IFMA__
+# undef __AVX512VBMI__
+# undef __SHA__
+# undef __AVX512VBMI2__
+# undef __AVX512BITALG__
+# undef __AVX512VNNI__
+# undef __AVX512VPOPCNTDQ__
+# undef __GFNI__
+# undef __VAES__
+#endif
+
#ifdef Q_PROCESSOR_X86
/* -- x86 intrinsic support -- */
+# if defined(QT_COMPILER_SUPPORTS_RDSEED) && defined(Q_OS_QNX)
+// The compiler for QNX is missing the intrinsic
+# undef QT_COMPILER_SUPPORTS_RDSEED
+# endif
# if defined(Q_CC_MSVC) && (defined(_M_X64) || _M_IX86_FP >= 2)
// MSVC doesn't define __SSE2__, so do it ourselves
# define __SSE__ 1
@@ -199,32 +213,37 @@ asm(
# include <immintrin.h>
# endif
-# include "qsimd_x86_p.h"
+# include <QtCore/private/qsimd_x86_p.h>
// x86-64 sub-architecture version 3
//
// The Intel Core 4th generation was codenamed "Haswell" and introduced AVX2,
-// BMI1, BMI2, FMA, LZCNT, MOVBE, which makes it a good divider for a
-// sub-target for us. The first AMD processor with AVX2 support (Zen) has the
-// same features. This feature set was chosen as the version 3 of the x86-64
-// ISA (x86-64-v3) and is supported by GCC and Clang.
-//
-// macOS's fat binaries support the "x86_64h" sub-architecture and the GNU libc
-// ELF loader also supports a "haswell/" subdir (e.g., /usr/lib/haswell).
-# define ARCH_HASWELL_MACROS (__AVX2__ + __BMI__ + __BMI2__ + __F16C__ + __FMA__ + __LZCNT__)
-# if ARCH_HASWELL_MACROS != 0
-# if ARCH_HASWELL_MACROS != 6
+// BMI1, BMI2, FMA, LZCNT, MOVBE. This feature set was chosen as the version 3
+// of the x86-64 ISA (x86-64-v3) and is supported by GCC and Clang. On systems
+// with the GNU libc, libraries with this feature can be installed on a
+// "glibc-hwcaps/x86-64-v3" subdir. macOS's fat binaries support the "x86_64h"
+// sub-architecture too.
+
+# if defined(__AVX2__)
+// List of features present with -march=x86-64-v3 and not architecturally
+// implied by __AVX2__
+# define ARCH_HASWELL_MACROS \
+ (__AVX2__ + __BMI__ + __BMI2__ + __F16C__ + __FMA__ + __LZCNT__ + __POPCNT__)
+# if ARCH_HASWELL_MACROS != 7
# error "Please enable all x86-64-v3 extensions; you probably want to use -march=haswell or -march=x86-64-v3 instead of -mavx2"
# endif
static_assert(ARCH_HASWELL_MACROS, "Undeclared identifiers indicate which features are missing.");
# define __haswell__ 1
+# undef ARCH_HASWELL_MACROS
# endif
-# undef ARCH_HASWELL_MACROS
// x86-64 sub-architecture version 4
//
-// Similar to the above, x86-64-v4 marches the AVX512 variant of the Intel Core
-// 6th generation (codename "Skylake").
+// Similar to the above, x86-64-v4 matches the AVX512 variant of the Intel Core
+// 6th generation (codename "Skylake"). AMD Zen4 is the their first processor
+// with AVX512 support and it includes all of these too. The GNU libc subdir for
+// this is "glibc-hwcaps/x86-64-v4".
+//
# define ARCH_SKX_MACROS (__AVX512F__ + __AVX512BW__ + __AVX512CD__ + __AVX512DQ__ + __AVX512VL__)
# if ARCH_SKX_MACROS != 0
# if ARCH_SKX_MACROS != 5
@@ -306,12 +325,19 @@ static const uint64_t qCompilerCpuFeatures = 0
#if defined __ARM_NEON__
| CpuFeatureNEON
#endif
+#if !(defined(Q_OS_LINUX) && defined(Q_PROCESSOR_ARM_64))
+ // Yocto Project recipes enable Crypto extension for all ARMv8 configs,
+ // even for targets without the Crypto extension. That's wrong, but as
+ // the compiler never generates the code for them on their own, most
+ // code never notices the problem. But we would. By not setting the
+ // bits here, we force a runtime detection.
#if defined __ARM_FEATURE_CRC32
| CpuFeatureCRC32
#endif
#if defined __ARM_FEATURE_CRYPTO
| CpuFeatureAES
#endif
+#endif // Q_OS_LINUX && Q_PROCESSOR_ARM64
#if defined __mips_dsp
| CpuFeatureDSP
#endif
@@ -346,61 +372,21 @@ Q_CORE_EXPORT uint64_t QT_MANGLE_NAMESPACE(qDetectCpuFeatures)();
static inline uint64_t qCpuFeatures()
{
+#ifdef QT_BOOTSTRAPPED
+ return qCompilerCpuFeatures; // no detection
+#else
quint64 features = atomic_load_explicit(QT_MANGLE_NAMESPACE(qt_cpu_features), memory_order_relaxed);
if (!QT_SUPPORTS_INIT_PRIORITY) {
if (Q_UNLIKELY(features == 0))
features = QT_MANGLE_NAMESPACE(qDetectCpuFeatures)();
}
return features;
+#endif
}
#define qCpuHasFeature(feature) (((qCompilerCpuFeatures & CpuFeature ## feature) == CpuFeature ## feature) \
|| ((qCpuFeatures() & CpuFeature ## feature) == CpuFeature ## feature))
-/*
- Small wrapper around x86's PAUSE and ARM's YIELD instructions.
-
- This is completely different from QThread::yieldCurrentThread(), which is
- an OS-level operation that takes the whole thread off the CPU.
-
- This is just preventing one SMT thread from filling a core's pipeline with
- speculated further loop iterations (which need to be expensively flushed on
- final success) when it could just give those pipeline slots to a second SMT
- thread that can do something useful with the core, such as unblocking this
- SMT thread :)
-
- So, instead of
-
- while (!condition)
- ;
-
- it's better to use
-
- while (!condition)
- qYieldCpu();
-*/
-static inline void qYieldCpu()
-{
-#if defined(Q_PROCESSOR_X86)
- _mm_pause();
-#elif defined(Q_PROCESSOR_ARM)
-# if __has_builtin(__builtin_arm_yield) /* e.g. Clang */
- __builtin_arm_yield();
-# elif defined(Q_OS_INTEGRITY) || \
- (defined(Q_CC_GNU) && !defined(Q_CC_CLANG))
- /*
- - Integrity is missing the arm_acle.h header
- - GCC doesn't have __yield() in arm_acle.h
- https://stackoverflow.com/a/70076751/134841
- https://gcc.gnu.org/bugzilla/show_bug.cgi?id=105416
- */
- asm volatile("yield"); /* this works everywhere */
-# else
- __yield(); /* this is what should work everywhere */
-# endif
-#endif
-}
-
#ifdef __cplusplus
} // extern "C"