Diffstat (limited to 'src/corelib/global/qsimd_p.h')
-rw-r--r--  src/corelib/global/qsimd_p.h  417
1 file changed, 417 insertions, 0 deletions
diff --git a/src/corelib/global/qsimd_p.h b/src/corelib/global/qsimd_p.h
new file mode 100644
index 0000000000..012eb6cf4f
--- /dev/null
+++ b/src/corelib/global/qsimd_p.h
@@ -0,0 +1,417 @@
+// Copyright (C) 2021 The Qt Company Ltd.
+// Copyright (C) 2022 Intel Corporation.
+// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
+
+#ifndef QSIMD_P_H
+#define QSIMD_P_H
+
+//
+// W A R N I N G
+// -------------
+//
+// This file is not part of the Qt API. It exists purely as an
+// implementation detail. This header file may change from version to
+// version without notice, or even be removed.
+//
+// We mean it.
+//
+
+#include <QtCore/private/qglobal_p.h>
+#include <QtCore/qsimd.h>
+
+QT_WARNING_PUSH
+QT_WARNING_DISABLE_CLANG("-Wundef")
+QT_WARNING_DISABLE_GCC("-Wundef")
+QT_WARNING_DISABLE_INTEL(103)
+
+#define ALIGNMENT_PROLOGUE_16BYTES(ptr, i, length) \
+ for (; i < static_cast<int>(qMin(static_cast<quintptr>(length), ((4 - ((reinterpret_cast<quintptr>(ptr) >> 2) & 0x3)) & 0x3))); ++i)
+
+#define ALIGNMENT_PROLOGUE_32BYTES(ptr, i, length) \
+ for (; i < static_cast<int>(qMin(static_cast<quintptr>(length), ((8 - ((reinterpret_cast<quintptr>(ptr) >> 2) & 0x7)) & 0x7))); ++i)
+
+#define SIMD_EPILOGUE(i, length, max) \
+ for (int _i = 0; _i < max && i < length; ++i, ++_i)
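+
+// Illustrative sketch only (not part of this header): a hypothetical function
+// over an int array could combine the helpers above like this: scalar
+// iterations until the pointer is 16-byte aligned, a vectorized main loop,
+// then at most vector-width-minus-one scalar tail iterations.
+//
+// void scale(int *data, int length)
+// {
+//     int i = 0;
+//     ALIGNMENT_PROLOGUE_16BYTES(data, i, length)
+//         data[i] *= 2;
+// #ifdef __SSE2__
+//     for (; i + 4 <= length; i += 4) {
+//         __m128i v = _mm_load_si128(reinterpret_cast<__m128i *>(data + i));
+//         _mm_store_si128(reinterpret_cast<__m128i *>(data + i), _mm_slli_epi32(v, 1));
+//     }
+// #endif
+//     SIMD_EPILOGUE(i, length, 3)
+//         data[i] *= 2;
+// }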
+
+/*
+ * Code can use the following constructs to determine compiler support & status:
+ * - #ifdef __XXX__ (e.g: #ifdef __AVX__ or #ifdef __ARM_NEON__)
+ * If this test passes, then the compiler is already generating code for that
+ * given sub-architecture. The intrinsics for that sub-architecture are
+ * #included and can be used without restriction or runtime check.
+ *
+ * - #if QT_COMPILER_SUPPORTS(XXX)
+ * If this test passes, then the compiler is able to generate code for that
+ * given sub-architecture in another translation unit, given the right set of
+ * flags. Use of the intrinsics is not guaranteed. This is useful with
+ * runtime detection (see below).
+ *
+ * - #if QT_COMPILER_SUPPORTS_HERE(XXX)
+ * If this test passes, then the compiler is able to generate code for that
+ * given sub-architecture in this translation unit, even if it is not doing
+ * that now (it might be). Individual functions may be tagged with
+ * QT_FUNCTION_TARGET(XXX) to cause the compiler to generate code for that
+ * sub-arch. Only inside such functions is the use of the intrinsics
+ * guaranteed to work. This is useful with runtime detection (see below).
+ *
+ * The distinction between QT_COMPILER_SUPPORTS and QT_COMPILER_SUPPORTS_HERE is
+ * historical: GCC 4.8 needed the distinction.
+ *
+ * Runtime detection of a CPU sub-architecture can be done with the
+ * qCpuHasFeature(XXX) function. There are two strategies for generating
+ * optimized code like that:
+ *
+ * 1) place the optimized code in a different translation unit (C or assembly
+ * sources) and pass the correct flags to the compiler to enable support. Those
+ * sources must not include qglobal.h, which means they cannot include this
+ * file either. The dispatcher function would look like this:
+ *
+ * void foo()
+ * {
+ * #if QT_COMPILER_SUPPORTS(XXX)
+ * if (qCpuHasFeature(XXX)) {
+ * foo_optimized_xxx();
+ * return;
+ * }
+ * #endif
+ * foo_plain();
+ * }
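+ *
+ * The separate translation unit for strategy 1 (an illustrative sketch only,
+ * with hypothetical file and function names) is compiled with the
+ * sub-architecture flags and must not include qglobal.h or this header:
+ *
+ * // foo_xxx.c, compiled with e.g. -mavx2
+ * #include <immintrin.h>
+ * void foo_optimized_xxx(void)
+ * {
+ *     // the AVX2 intrinsics can be used freely here
+ * }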
+ *
+ * 2) place the optimized code in a function tagged with QT_FUNCTION_TARGET and
+ * surrounded by #if QT_COMPILER_SUPPORTS_HERE(XXX). That code can freely use
+ * other Qt code. The dispatcher function would look like this:
+ *
+ * void foo()
+ * {
+ * #if QT_COMPILER_SUPPORTS_HERE(XXX)
+ * if (qCpuHasFeature(XXX)) {
+ * foo_optimized_xxx();
+ * return;
+ * }
+ * #endif
+ * foo_plain();
+ * }
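+ *
+ * The optimized function for strategy 2 (again an illustrative sketch, with a
+ * hypothetical feature name) is tagged so that the compiler generates code
+ * for the sub-architecture only inside that function:
+ *
+ * #if QT_COMPILER_SUPPORTS_HERE(XXX)
+ * QT_FUNCTION_TARGET(XXX)
+ * static void foo_optimized_xxx()
+ * {
+ *     // the intrinsics for XXX are guaranteed to work only inside this function
+ * }
+ * #endif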
+ */
+
+#if defined(__MINGW64_VERSION_MAJOR) || defined(Q_CC_MSVC)
+#include <intrin.h>
+#endif
+
+#define QT_COMPILER_SUPPORTS(x) (QT_COMPILER_SUPPORTS_ ## x - 0)
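+// (the trailing "- 0" makes the test evaluate to 0 in preprocessor
+// conditionals when QT_COMPILER_SUPPORTS_x is undefined or defined to nothing)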
+
+#if defined(Q_PROCESSOR_ARM)
+# define QT_COMPILER_SUPPORTS_HERE(x) ((__ARM_FEATURE_ ## x) || (__ ## x ## __) || QT_COMPILER_SUPPORTS(x))
+# if defined(Q_CC_GNU)
+ /* GCC requires attributes for a function */
+# define QT_FUNCTION_TARGET(x) __attribute__((__target__(QT_FUNCTION_TARGET_STRING_ ## x)))
+# else
+# define QT_FUNCTION_TARGET(x)
+# endif
+#elif defined(Q_PROCESSOR_MIPS)
+# define QT_COMPILER_SUPPORTS_HERE(x) (__ ## x ## __)
+# define QT_FUNCTION_TARGET(x)
+# if !defined(__MIPS_DSP__) && defined(__mips_dsp) && defined(Q_PROCESSOR_MIPS_32)
+# define __MIPS_DSP__
+# endif
+# if !defined(__MIPS_DSPR2__) && defined(__mips_dspr2) && defined(Q_PROCESSOR_MIPS_32)
+# define __MIPS_DSPR2__
+# endif
+#elif defined(Q_PROCESSOR_X86)
+# if defined(Q_CC_CLANG) && defined(Q_CC_MSVC)
+# define QT_COMPILER_SUPPORTS_HERE(x) (__ ## x ## __)
+# else
+# define QT_COMPILER_SUPPORTS_HERE(x) ((__ ## x ## __) || QT_COMPILER_SUPPORTS(x))
+# endif
+# if defined(Q_CC_GNU)
+ /* GCC requires attributes for a function */
+# define QT_FUNCTION_TARGET(x) __attribute__((__target__(QT_FUNCTION_TARGET_STRING_ ## x)))
+# else
+# define QT_FUNCTION_TARGET(x)
+# endif
+#else
+# define QT_COMPILER_SUPPORTS_HERE(x) (__ ## x ## __)
+# define QT_FUNCTION_TARGET(x)
+#endif
+
+#if defined(__SSE2__) && !defined(QT_COMPILER_SUPPORTS_SSE2) && !defined(QT_BOOTSTRAPPED)
+// Intrinsic support appears to be missing, so pretend these features don't exist
+# undef __SSE__
+# undef __SSE2__
+# undef __SSE3__
+# undef __SSSE3__
+# undef __SSE4_1__
+# undef __SSE4_2__
+# undef __AES__
+# undef __POPCNT__
+# undef __AVX__
+# undef __F16C__
+# undef __RDRND__
+# undef __AVX2__
+# undef __BMI__
+# undef __BMI2__
+# undef __FMA__
+# undef __MOVBE__
+# undef __RDSEED__
+# undef __AVX512F__
+# undef __AVX512ER__
+# undef __AVX512CD__
+# undef __AVX512PF__
+# undef __AVX512DQ__
+# undef __AVX512BW__
+# undef __AVX512VL__
+# undef __AVX512IFMA__
+# undef __AVX512VBMI__
+# undef __SHA__
+# undef __AVX512VBMI2__
+# undef __AVX512BITALG__
+# undef __AVX512VNNI__
+# undef __AVX512VPOPCNTDQ__
+# undef __GFNI__
+# undef __VAES__
+#endif
+
+#ifdef Q_PROCESSOR_X86
+/* -- x86 intrinsic support -- */
+
+# if defined(QT_COMPILER_SUPPORTS_RDSEED) && defined(Q_OS_QNX)
+// The compiler for QNX is missing the intrinsic
+# undef QT_COMPILER_SUPPORTS_RDSEED
+# endif
+# if defined(Q_CC_MSVC) && (defined(_M_X64) || _M_IX86_FP >= 2)
+// MSVC doesn't define __SSE2__, so do it ourselves
+# define __SSE__ 1
+# define __SSE2__ 1
+# endif
+
+# if defined(Q_OS_WIN) && defined(Q_CC_GNU) && !defined(Q_CC_CLANG)
+// 64-bit GCC on Windows does not support AVX, so we hack around it by forcing
+// it to emit unaligned loads & stores
+// See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=49001
+asm(
+ ".macro vmovapd args:vararg\n"
+ " vmovupd \\args\n"
+ ".endm\n"
+ ".macro vmovaps args:vararg\n"
+ " vmovups \\args\n"
+ ".endm\n"
+ ".macro vmovdqa args:vararg\n"
+ " vmovdqu \\args\n"
+ ".endm\n"
+ ".macro vmovdqa32 args:vararg\n"
+ " vmovdqu32 \\args\n"
+ ".endm\n"
+ ".macro vmovdqa64 args:vararg\n"
+ " vmovdqu64 \\args\n"
+ ".endm\n"
+);
+# endif
+
+# if defined(Q_CC_GNU) && !defined(Q_OS_WASM)
+// GCC 4.4 and Clang 2.8 added a few more intrinsics there
+# include <x86intrin.h>
+# endif
+# ifdef Q_OS_WASM
+# include <immintrin.h>
+# endif
+
+# include <QtCore/private/qsimd_x86_p.h>
+
+// x86-64 sub-architecture version 3
+//
+// The Intel Core 4th generation was codenamed "Haswell" and introduced AVX2,
+// BMI1, BMI2, FMA, LZCNT, MOVBE. This feature set was chosen as the version 3
+// of the x86-64 ISA (x86-64-v3) and is supported by GCC and Clang. On systems
+// with the GNU libc, libraries with this feature can be installed in a
+// "glibc-hwcaps/x86-64-v3" subdir. macOS's fat binaries support the "x86_64h"
+// sub-architecture too.
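+//
+// For example (illustrative only), a translation unit targeting this level
+// would be compiled with -march=x86-64-v3 or -march=haswell rather than with
+// -mavx2 alone, so that all of the macros checked below are defined.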
+
+# if defined(__AVX2__)
+// List of features present with -march=x86-64-v3 and not architecturally
+// implied by __AVX2__
+# define ARCH_HASWELL_MACROS \
+ (__AVX2__ + __BMI__ + __BMI2__ + __F16C__ + __FMA__ + __LZCNT__ + __POPCNT__)
+# if ARCH_HASWELL_MACROS != 7
+# error "Please enable all x86-64-v3 extensions; you probably want to use -march=haswell or -march=x86-64-v3 instead of -mavx2"
+# endif
+static_assert(ARCH_HASWELL_MACROS, "Undeclared identifiers indicate which features are missing.");
+# define __haswell__ 1
+# undef ARCH_HASWELL_MACROS
+# endif
+
+// x86-64 sub-architecture version 4
+//
+// Similar to the above, x86-64-v4 matches the AVX512 variant of the Intel Core
+// 6th generation (codename "Skylake"). AMD Zen 4 is their first processor
+// with AVX512 support and it includes all of these too. The GNU libc subdir for
+// this is "glibc-hwcaps/x86-64-v4".
+//
+# define ARCH_SKX_MACROS (__AVX512F__ + __AVX512BW__ + __AVX512CD__ + __AVX512DQ__ + __AVX512VL__)
+# if ARCH_SKX_MACROS != 0
+# if ARCH_SKX_MACROS != 5
+# error "Please enable all x86-64-v4 extensions; you probably want to use -march=skylake-avx512 or -march=x86-64-v4 instead of -mavx512f"
+# endif
+static_assert(ARCH_SKX_MACROS, "Undeclared identifiers indicate which features are missing.");
+# define __skylake_avx512__ 1
+# endif
+# undef ARCH_SKX_MACROS
+#endif /* Q_PROCESSOR_X86 */
+
+// NEON intrinsics
+// note: as of GCC 4.9, does not support function targets for ARM
+#if defined(__ARM_NEON) || defined(__ARM_NEON__)
+#if defined(Q_CC_CLANG)
+#define QT_FUNCTION_TARGET_STRING_NEON "neon"
+#else
+#define QT_FUNCTION_TARGET_STRING_NEON "+neon" // unused: gcc doesn't support function targets on non-aarch64, and on AArch64 NEON is always available.
+#endif
+#ifndef __ARM_NEON__
+// __ARM_NEON__ is not defined on AArch64, but we need it in our NEON detection.
+#define __ARM_NEON__
+#endif
+
+#ifndef Q_PROCESSOR_ARM_64 // vaddv is only available on AArch64
+inline uint16_t vaddvq_u16(uint16x8_t v8)
+{
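+ // Horizontal add via pairwise widening adds: 8 x u16 -> 4 x u32 -> 2 x u64,
+ // then fold the two 64-bit halves and read the result from the low 16 bits.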
+ const uint64x2_t v2 = vpaddlq_u32(vpaddlq_u16(v8));
+ const uint64x1_t v1 = vadd_u64(vget_low_u64(v2), vget_high_u64(v2));
+ return vget_lane_u16(vreinterpret_u16_u64(v1), 0);
+}
+
+inline uint8_t vaddv_u8(uint8x8_t v8)
+{
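+ // Same approach within a 64-bit register: 8 x u8 -> 4 x u16 -> 2 x u32 -> 1 x u64,
+ // then read the result from the low 8 bits.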
+ const uint64x1_t v1 = vpaddl_u32(vpaddl_u16(vpaddl_u8(v8)));
+ return vget_lane_u8(vreinterpret_u8_u64(v1), 0);
+}
+#endif
+
+#endif
+
+#if defined(Q_PROCESSOR_ARM) && defined(__ARM_FEATURE_CRC32)
+# include <arm_acle.h>
+#endif
+
+#if defined(Q_PROCESSOR_ARM_64)
+#if defined(Q_CC_CLANG)
+#define QT_FUNCTION_TARGET_STRING_AES "crypto"
+#define QT_FUNCTION_TARGET_STRING_CRC32 "crc"
+#elif defined(Q_CC_GNU)
+#define QT_FUNCTION_TARGET_STRING_AES "+crypto"
+#define QT_FUNCTION_TARGET_STRING_CRC32 "+crc"
+#endif
+#elif defined(Q_PROCESSOR_ARM_32)
+#if defined(Q_CC_CLANG)
+#define QT_FUNCTION_TARGET_STRING_AES "armv8-a,crypto"
+#define QT_FUNCTION_TARGET_STRING_CRC32 "armv8-a,crc"
+#elif defined(Q_CC_GNU)
+#define QT_FUNCTION_TARGET_STRING_AES "arch=armv8-a+crypto"
+#define QT_FUNCTION_TARGET_STRING_CRC32 "arch=armv8-a+crc"
+#endif
+#endif
+
+#ifndef Q_PROCESSOR_X86
+enum CPUFeatures {
+#if defined(Q_PROCESSOR_ARM)
+ CpuFeatureNEON = 2,
+ CpuFeatureARM_NEON = CpuFeatureNEON,
+ CpuFeatureCRC32 = 4,
+ CpuFeatureAES = 8,
+ CpuFeatureARM_CRYPTO = CpuFeatureAES,
+#elif defined(Q_PROCESSOR_MIPS)
+ CpuFeatureDSP = 2,
+ CpuFeatureDSPR2 = 4,
+#endif
+};
+
+static const uint64_t qCompilerCpuFeatures = 0
+#if defined __ARM_NEON__
+ | CpuFeatureNEON
+#endif
+#if !(defined(Q_OS_LINUX) && defined(Q_PROCESSOR_ARM_64))
+ // Yocto Project recipes enable the Crypto extension for all ARMv8
+ // configs, even for targets without it. That's wrong, but because the
+ // compiler never emits Crypto instructions on its own, most code never
+ // notices the problem. We would, so by not setting the bits here we
+ // force runtime detection instead.
+#if defined __ARM_FEATURE_CRC32
+ | CpuFeatureCRC32
+#endif
+#if defined __ARM_FEATURE_CRYPTO
+ | CpuFeatureAES
+#endif
+#endif // !(Q_OS_LINUX && Q_PROCESSOR_ARM_64)
+#if defined __mips_dsp
+ | CpuFeatureDSP
+#endif
+#if defined __mips_dspr2
+ | CpuFeatureDSPR2
+#endif
+ ;
+#endif
+
+#ifdef __cplusplus
+# include <atomic>
+# define Q_ATOMIC(T) std::atomic<T>
+QT_BEGIN_NAMESPACE
+using std::atomic_load_explicit;
+static constexpr auto memory_order_relaxed = std::memory_order_relaxed;
+extern "C" {
+#else
+# include <stdatomic.h>
+# define Q_ATOMIC(T) _Atomic(T)
+#endif
+
+#ifdef Q_PROCESSOR_X86
+typedef uint64_t QCpuFeatureType;
+static const QCpuFeatureType qCompilerCpuFeatures = _compilerCpuFeatures;
+static const QCpuFeatureType CpuFeatureArchHaswell = cpu_haswell;
+static const QCpuFeatureType CpuFeatureArchSkylakeAvx512 = cpu_skylake_avx512;
+#else
+typedef unsigned QCpuFeatureType;
+#endif
+extern Q_CORE_EXPORT Q_ATOMIC(QCpuFeatureType) QT_MANGLE_NAMESPACE(qt_cpu_features)[1];
+Q_CORE_EXPORT uint64_t QT_MANGLE_NAMESPACE(qDetectCpuFeatures)();
+
+static inline uint64_t qCpuFeatures()
+{
+#ifdef QT_BOOTSTRAPPED
+ return qCompilerCpuFeatures; // no detection
+#else
+ quint64 features = atomic_load_explicit(QT_MANGLE_NAMESPACE(qt_cpu_features), memory_order_relaxed);
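+ // When the library supports init priorities, qDetectCpuFeatures() is expected
+ // to have run already during library initialization; otherwise fall back to
+ // lazy detection on first use.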
+ if (!QT_SUPPORTS_INIT_PRIORITY) {
+ if (Q_UNLIKELY(features == 0))
+ features = QT_MANGLE_NAMESPACE(qDetectCpuFeatures)();
+ }
+ return features;
+#endif
+}
+
+#define qCpuHasFeature(feature) (((qCompilerCpuFeatures & CpuFeature ## feature) == CpuFeature ## feature) \
+ || ((qCpuFeatures() & CpuFeature ## feature) == CpuFeature ## feature))
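+
+// Illustrative use only (not part of this header): dispatch on an individual
+// feature or on a whole sub-architecture level, for example
+// if (qCpuHasFeature(AES)) { /* AES-accelerated code path */ }
+// if (qCpuHasFeature(ArchHaswell)) { /* x86-64-v3 code path */ }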
+
+#ifdef __cplusplus
+} // extern "C"
+
+# if defined(Q_PROCESSOR_X86) && QT_COMPILER_SUPPORTS_HERE(RDRND) && !defined(QT_BOOTSTRAPPED)
+Q_CORE_EXPORT qsizetype qRandomCpu(void *, qsizetype) noexcept;
+
+static inline bool qHasHwrng()
+{
+ return qCpuHasFeature(RDRND);
+}
+# else
+static inline qsizetype qRandomCpu(void *, qsizetype) noexcept
+{
+ return 0;
+}
+static inline bool qHasHwrng()
+{
+ return false;
+}
+# endif
+
+QT_END_NAMESPACE
+
+#endif // __cplusplus
+
+QT_WARNING_POP
+
+#endif // QSIMD_P_H