summaryrefslogtreecommitdiffstats
path: root/src/corelib/global/qsimd_p.h
diff options
context:
space:
mode:
Diffstat (limited to 'src/corelib/global/qsimd_p.h')
-rw-r--r--src/corelib/global/qsimd_p.h299
1 files changed, 151 insertions, 148 deletions
diff --git a/src/corelib/global/qsimd_p.h b/src/corelib/global/qsimd_p.h
index 57ef30d567..012eb6cf4f 100644
--- a/src/corelib/global/qsimd_p.h
+++ b/src/corelib/global/qsimd_p.h
@@ -1,42 +1,6 @@
-/****************************************************************************
-**
-** Copyright (C) 2016 The Qt Company Ltd.
-** Copyright (C) 2018 Intel Corporation.
-** Contact: https://www.qt.io/licensing/
-**
-** This file is part of the QtCore module of the Qt Toolkit.
-**
-** $QT_BEGIN_LICENSE:LGPL$
-** Commercial License Usage
-** Licensees holding valid commercial Qt licenses may use this file in
-** accordance with the commercial license agreement provided with the
-** Software or, alternatively, in accordance with the terms contained in
-** a written agreement between you and The Qt Company. For licensing terms
-** and conditions see https://www.qt.io/terms-conditions. For further
-** information use the contact form at https://www.qt.io/contact-us.
-**
-** GNU Lesser General Public License Usage
-** Alternatively, this file may be used under the terms of the GNU Lesser
-** General Public License version 3 as published by the Free Software
-** Foundation and appearing in the file LICENSE.LGPL3 included in the
-** packaging of this file. Please review the following information to
-** ensure the GNU Lesser General Public License version 3 requirements
-** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
-**
-** GNU General Public License Usage
-** Alternatively, this file may be used under the terms of the GNU
-** General Public License version 2.0 or (at your option) the GNU General
-** Public license version 3 or any later version approved by the KDE Free
-** Qt Foundation. The licenses are as published by the Free Software
-** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
-** included in the packaging of this file. Please review the following
-** information to ensure the GNU General Public License requirements will
-** be met: https://www.gnu.org/licenses/gpl-2.0.html and
-** https://www.gnu.org/licenses/gpl-3.0.html.
-**
-** $QT_END_LICENSE$
-**
-****************************************************************************/
+// Copyright (C) 2021 The Qt Company Ltd.
+// Copyright (C) 2022 Intel Corporation.
+// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
#ifndef QSIMD_P_H
#define QSIMD_P_H
@@ -55,34 +19,21 @@
#include <QtCore/private/qglobal_p.h>
#include <QtCore/qsimd.h>
+QT_WARNING_PUSH
+QT_WARNING_DISABLE_CLANG("-Wundef")
+QT_WARNING_DISABLE_GCC("-Wundef")
+QT_WARNING_DISABLE_INTEL(103)
+
+#define ALIGNMENT_PROLOGUE_16BYTES(ptr, i, length) \
+ for (; i < static_cast<int>(qMin(static_cast<quintptr>(length), ((4 - ((reinterpret_cast<quintptr>(ptr) >> 2) & 0x3)) & 0x3))); ++i)
+
+#define ALIGNMENT_PROLOGUE_32BYTES(ptr, i, length) \
+ for (; i < static_cast<int>(qMin(static_cast<quintptr>(length), ((8 - ((reinterpret_cast<quintptr>(ptr) >> 2) & 0x7)) & 0x7))); ++i)
+
+#define SIMD_EPILOGUE(i, length, max) \
+ for (int _i = 0; _i < max && i < length; ++i, ++_i)
+
/*
- * qt_module_config.prf defines the QT_COMPILER_SUPPORTS_XXX macros.
- * They mean the compiler supports the necessary flags and the headers
- * for the x86 and ARM intrinsics:
- * - GCC: the -mXXX or march=YYY flag is necessary before #include
- * up to 4.8; GCC >= 4.9 can include unconditionally
- * - Intel CC: #include can happen unconditionally
- * - MSVC: #include can happen unconditionally
- * - RVCT: ???
- *
- * We will try to include all headers possible under this configuration.
- *
- * MSVC does not define __SSE2__ & family, so we will define them. MSVC 2013 &
- * up do define __AVX__ if the -arch:AVX option is passed on the command-line.
- *
- * Supported XXX are:
- * Flag | Arch | GCC | Intel CC | MSVC |
- * ARM_NEON | ARM | I & C | None | ? |
- * SSE2 | x86 | I & C | I & C | I & C |
- * SSE3 | x86 | I & C | I & C | I only |
- * SSSE3 | x86 | I & C | I & C | I only |
- * SSE4_1 | x86 | I & C | I & C | I only |
- * SSE4_2 | x86 | I & C | I & C | I only |
- * AVX | x86 | I & C | I & C | I & C |
- * AVX2 | x86 | I & C | I & C | I only |
- * AVX512xx | x86 | I & C | I & C | I only |
- * I = intrinsics; C = code generation
- *
* Code can use the following constructs to determine compiler support & status:
* - #ifdef __XXX__ (e.g: #ifdef __AVX__ or #ifdef __ARM_NEON__)
* If this test passes, then the compiler is already generating code for that
@@ -103,6 +54,9 @@
* sub-arch. Only inside such functions is the use of the intrisics
* guaranteed to work. This is useful with runtime detection (see below).
*
+ * The distinction between QT_COMPILER_SUPPORTS and QT_COMPILER_SUPPORTS_HERE is
+ * historical: GCC 4.8 needed the distinction.
+ *
* Runtime detection of a CPU sub-architecture can be done with the
* qCpuHasFeature(XXX) function. There are two strategies for generating
* optimized code like that:
@@ -145,7 +99,7 @@
#define QT_COMPILER_SUPPORTS(x) (QT_COMPILER_SUPPORTS_ ## x - 0)
-#if defined(Q_PROCESSOR_ARM) && defined(QT_COMPILER_SUPPORTS_SIMD_ALWAYS)
+#if defined(Q_PROCESSOR_ARM)
# define QT_COMPILER_SUPPORTS_HERE(x) ((__ARM_FEATURE_ ## x) || (__ ## x ## __) || QT_COMPILER_SUPPORTS(x))
# if defined(Q_CC_GNU)
/* GCC requires attributes for a function */
@@ -162,35 +116,73 @@
# if !defined(__MIPS_DSPR2__) && defined(__mips_dspr2) && defined(Q_PROCESSOR_MIPS_32)
# define __MIPS_DSPR2__
# endif
-#elif defined(Q_PROCESSOR_X86) && defined(QT_COMPILER_SUPPORTS_SIMD_ALWAYS)
+#elif defined(Q_PROCESSOR_X86)
# if defined(Q_CC_CLANG) && defined(Q_CC_MSVC)
# define QT_COMPILER_SUPPORTS_HERE(x) (__ ## x ## __)
# else
# define QT_COMPILER_SUPPORTS_HERE(x) ((__ ## x ## __) || QT_COMPILER_SUPPORTS(x))
# endif
-# if defined(Q_CC_GNU) && !defined(Q_CC_INTEL)
+# if defined(Q_CC_GNU)
/* GCC requires attributes for a function */
# define QT_FUNCTION_TARGET(x) __attribute__((__target__(QT_FUNCTION_TARGET_STRING_ ## x)))
# else
# define QT_FUNCTION_TARGET(x)
# endif
-#elif defined(Q_PROCESSOR_ARM)
-# define QT_COMPILER_SUPPORTS_HERE(x) ((__ARM_FEATURE_ ## x) || (__ ## x ## __))
-# define QT_FUNCTION_TARGET(x)
#else
# define QT_COMPILER_SUPPORTS_HERE(x) (__ ## x ## __)
# define QT_FUNCTION_TARGET(x)
#endif
+#if defined(__SSE2__) && !defined(QT_COMPILER_SUPPORTS_SSE2) && !defined(QT_BOOTSTRAPPED)
+// Intrinsic support appears to be missing, so pretend these features don't exist
+# undef __SSE__
+# undef __SSE2__
+# undef __SSE3__
+# undef __SSSE3__
+# undef __SSE4_1__
+# undef __SSE4_2__
+# undef __AES__
+# undef __POPCNT__
+# undef __AVX__
+# undef __F16C__
+# undef __RDRND__
+# undef __AVX2__
+# undef __BMI__
+# undef __BMI2__
+# undef __FMA__
+# undef __MOVBE__
+# undef __RDSEED__
+# undef __AVX512F__
+# undef __AVX512ER__
+# undef __AVX512CD__
+# undef __AVX512PF__
+# undef __AVX512DQ__
+# undef __AVX512BW__
+# undef __AVX512VL__
+# undef __AVX512IFMA__
+# undef __AVX512VBMI__
+# undef __SHA__
+# undef __AVX512VBMI2__
+# undef __AVX512BITALG__
+# undef __AVX512VNNI__
+# undef __AVX512VPOPCNTDQ__
+# undef __GFNI__
+# undef __VAES__
+#endif
+
#ifdef Q_PROCESSOR_X86
/* -- x86 intrinsic support -- */
+# if defined(QT_COMPILER_SUPPORTS_RDSEED) && defined(Q_OS_QNX)
+// The compiler for QNX is missing the intrinsic
+# undef QT_COMPILER_SUPPORTS_RDSEED
+# endif
# if defined(Q_CC_MSVC) && (defined(_M_X64) || _M_IX86_FP >= 2)
// MSVC doesn't define __SSE2__, so do it ourselves
# define __SSE__ 1
# endif
-# if defined(Q_OS_WIN) && defined(Q_CC_GNU) && !defined(Q_CC_INTEL) && !defined(Q_CC_CLANG)
+# if defined(Q_OS_WIN) && defined(Q_CC_GNU) && !defined(Q_CC_CLANG)
// 64-bit GCC on Windows does not support AVX, so we hack around it by forcing
// it to emit unaligned loads & stores
// See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=49001
@@ -213,7 +205,7 @@ asm(
);
# endif
-# if defined(Q_CC_GNU) && !defined(Q_CC_INTEL) && !defined(Q_OS_WASM)
+# if defined(Q_CC_GNU) && !defined(Q_OS_WASM)
// GCC 4.4 and Clang 2.8 added a few more intrinsics there
# include <x86intrin.h>
# endif
@@ -221,46 +213,46 @@ asm(
# include <immintrin.h>
# endif
-# if defined(Q_CC_GNU) && defined(__AVX2__) && (!defined(__BMI__) || !defined(__FMA__))
-# error "Please enable the BMI and FMA extensions; you probably want to use -march=haswell or -march=x86-64-v3 instead of -mavx2"
-# endif
-
-# include "qsimd_x86_p.h"
+# include <QtCore/private/qsimd_x86_p.h>
-// Haswell sub-architecture
+// x86-64 sub-architecture version 3
//
// The Intel Core 4th generation was codenamed "Haswell" and introduced AVX2,
-// BMI1, BMI2, FMA, LZCNT, MOVBE, which makes it a good divider for a
-// sub-target for us. The first AMD processor with AVX2 support (Zen) has the
-// same features.
-//
-// macOS's fat binaries support the "x86_64h" sub-architecture and the GNU libc
-// ELF loader also supports a "haswell/" subdir (e.g., /usr/lib/haswell).
-# define QT_FUNCTION_TARGET_STRING_ARCH_HASWELL "arch=haswell"
-# if defined(__AVX2__) && defined(__BMI__) && defined(__BMI2__) && defined(__F16C__) && \
- defined(__FMA__) && defined(__LZCNT__) && defined(__RDRND__)
+// BMI1, BMI2, FMA, LZCNT, MOVBE. This feature set was chosen as the version 3
+// of the x86-64 ISA (x86-64-v3) and is supported by GCC and Clang. On systems
+// with the GNU libc, libraries with this feature can be installed on a
+// "glibc-hwcaps/x86-64-v3" subdir. macOS's fat binaries support the "x86_64h"
+// sub-architecture too.
+
+# if defined(__AVX2__)
+// List of features present with -march=x86-64-v3 and not architecturally
+// implied by __AVX2__
+# define ARCH_HASWELL_MACROS \
+ (__AVX2__ + __BMI__ + __BMI2__ + __F16C__ + __FMA__ + __LZCNT__ + __POPCNT__)
+# if ARCH_HASWELL_MACROS != 7
+# error "Please enable all x86-64-v3 extensions; you probably want to use -march=haswell or -march=x86-64-v3 instead of -mavx2"
+# endif
+static_assert(ARCH_HASWELL_MACROS, "Undeclared identifiers indicate which features are missing.");
# define __haswell__ 1
+# undef ARCH_HASWELL_MACROS
# endif
-// This constant does not include all CPU features found in a Haswell, only
-// those that we'd have optimized code for.
-// Note: must use Q_CONSTEXPR here, as this file may be compiled in C mode.
-QT_BEGIN_NAMESPACE
-static const quint64 CpuFeatureArchHaswell = 0
- | CpuFeatureSSE2
- | CpuFeatureSSE3
- | CpuFeatureSSSE3
- | CpuFeatureSSE4_1
- | CpuFeatureSSE4_2
- | CpuFeatureFMA
- | CpuFeaturePOPCNT
- | CpuFeatureAVX
- | CpuFeatureF16C
- | CpuFeatureAVX2
- | CpuFeatureBMI
- | CpuFeatureBMI2;
-QT_END_NAMESPACE
-
+// x86-64 sub-architecture version 4
+//
+// Similar to the above, x86-64-v4 matches the AVX512 variant of the Intel Core
+// 6th generation (codename "Skylake"). AMD Zen4 is the their first processor
+// with AVX512 support and it includes all of these too. The GNU libc subdir for
+// this is "glibc-hwcaps/x86-64-v4".
+//
+# define ARCH_SKX_MACROS (__AVX512F__ + __AVX512BW__ + __AVX512CD__ + __AVX512DQ__ + __AVX512VL__)
+# if ARCH_SKX_MACROS != 0
+# if ARCH_SKX_MACROS != 5
+# error "Please enable all x86-64-v4 extensions; you probably want to use -march=skylake-avx512 or -march=x86-64-v4 instead of -mavx512f"
+# endif
+static_assert(ARCH_SKX_MACROS, "Undeclared identifiers indicate which features are missing.");
+# define __skylake_avx512__ 1
+# endif
+# undef ARCH_SKX_MACROS
#endif /* Q_PROCESSOR_X86 */
// NEON intrinsics
@@ -315,12 +307,6 @@ inline uint8_t vaddv_u8(uint8x8_t v8)
#endif
#endif
-
-#ifdef __cplusplus
-#include <qatomic.h>
-
-QT_BEGIN_NAMESPACE
-
#ifndef Q_PROCESSOR_X86
enum CPUFeatures {
#if defined(Q_PROCESSOR_ARM)
@@ -333,21 +319,25 @@ enum CPUFeatures {
CpuFeatureDSP = 2,
CpuFeatureDSPR2 = 4,
#endif
-
- // used only to indicate that the CPU detection was initialised
- QSimdInitialized = 1
};
-static const quint64 qCompilerCpuFeatures = 0
+static const uint64_t qCompilerCpuFeatures = 0
#if defined __ARM_NEON__
| CpuFeatureNEON
#endif
+#if !(defined(Q_OS_LINUX) && defined(Q_PROCESSOR_ARM_64))
+ // Yocto Project recipes enable Crypto extension for all ARMv8 configs,
+ // even for targets without the Crypto extension. That's wrong, but as
+ // the compiler never generates the code for them on their own, most
+ // code never notices the problem. But we would. By not setting the
+ // bits here, we force a runtime detection.
#if defined __ARM_FEATURE_CRC32
| CpuFeatureCRC32
#endif
#if defined __ARM_FEATURE_CRYPTO
| CpuFeatureAES
#endif
+#endif // Q_OS_LINUX && Q_PROCESSOR_ARM64
#if defined __mips_dsp
| CpuFeatureDSP
#endif
@@ -357,58 +347,71 @@ static const quint64 qCompilerCpuFeatures = 0
;
#endif
-#ifdef Q_ATOMIC_INT64_IS_SUPPORTED
-extern Q_CORE_EXPORT QBasicAtomicInteger<quint64> qt_cpu_features[1];
+#ifdef __cplusplus
+# include <atomic>
+# define Q_ATOMIC(T) std::atomic<T>
+QT_BEGIN_NAMESPACE
+using std::atomic_load_explicit;
+static constexpr auto memory_order_relaxed = std::memory_order_relaxed;
+extern "C" {
#else
-extern Q_CORE_EXPORT QBasicAtomicInteger<unsigned> qt_cpu_features[2];
+# include <stdatomic.h>
+# define Q_ATOMIC(T) _Atomic(T)
#endif
-Q_CORE_EXPORT quint64 qDetectCpuFeatures();
-#if defined(Q_PROCESSOR_X86) && QT_COMPILER_SUPPORTS_HERE(RDRND) && !defined(QT_BOOTSTRAPPED)
-Q_CORE_EXPORT qsizetype qRandomCpu(void *, qsizetype) noexcept;
+#ifdef Q_PROCESSOR_X86
+typedef uint64_t QCpuFeatureType;
+static const QCpuFeatureType qCompilerCpuFeatures = _compilerCpuFeatures;
+static const QCpuFeatureType CpuFeatureArchHaswell = cpu_haswell;
+static const QCpuFeatureType CpuFeatureArchSkylakeAvx512 = cpu_skylake_avx512;
#else
-static inline qsizetype qRandomCpu(void *, qsizetype) noexcept
-{
- return 0;
-}
+typedef unsigned QCpuFeatureType;
#endif
+extern Q_CORE_EXPORT Q_ATOMIC(QCpuFeatureType) QT_MANGLE_NAMESPACE(qt_cpu_features)[1];
+Q_CORE_EXPORT uint64_t QT_MANGLE_NAMESPACE(qDetectCpuFeatures)();
-static inline quint64 qCpuFeatures()
+static inline uint64_t qCpuFeatures()
{
- quint64 features = qt_cpu_features[0].loadRelaxed();
-#ifndef Q_ATOMIC_INT64_IS_SUPPORTED
- features |= quint64(qt_cpu_features[1].loadRelaxed()) << 32;
-#endif
- if (Q_UNLIKELY(features == 0)) {
- features = qDetectCpuFeatures();
- Q_ASSUME(features != 0);
+#ifdef QT_BOOTSTRAPPED
+ return qCompilerCpuFeatures; // no detection
+#else
+ quint64 features = atomic_load_explicit(QT_MANGLE_NAMESPACE(qt_cpu_features), memory_order_relaxed);
+ if (!QT_SUPPORTS_INIT_PRIORITY) {
+ if (Q_UNLIKELY(features == 0))
+ features = QT_MANGLE_NAMESPACE(qDetectCpuFeatures)();
}
return features;
+#endif
}
#define qCpuHasFeature(feature) (((qCompilerCpuFeatures & CpuFeature ## feature) == CpuFeature ## feature) \
|| ((qCpuFeatures() & CpuFeature ## feature) == CpuFeature ## feature))
-inline bool qHasHwrng()
+#ifdef __cplusplus
+} // extern "C"
+
+# if defined(Q_PROCESSOR_X86) && QT_COMPILER_SUPPORTS_HERE(RDRND) && !defined(QT_BOOTSTRAPPED)
+Q_CORE_EXPORT qsizetype qRandomCpu(void *, qsizetype) noexcept;
+
+static inline bool qHasHwrng()
{
-#if defined(Q_PROCESSOR_X86) && QT_COMPILER_SUPPORTS_HERE(RDRND)
return qCpuHasFeature(RDRND);
-#else
+}
+# else
+static inline qsizetype qRandomCpu(void *, qsizetype) noexcept
+{
+ return 0;
+}
+static inline bool qHasHwrng()
+{
return false;
-#endif
}
-
-#define ALIGNMENT_PROLOGUE_16BYTES(ptr, i, length) \
- for (; i < static_cast<int>(qMin(static_cast<quintptr>(length), ((4 - ((reinterpret_cast<quintptr>(ptr) >> 2) & 0x3)) & 0x3))); ++i)
-
-#define ALIGNMENT_PROLOGUE_32BYTES(ptr, i, length) \
- for (; i < static_cast<int>(qMin(static_cast<quintptr>(length), ((8 - ((reinterpret_cast<quintptr>(ptr) >> 2) & 0x7)) & 0x7))); ++i)
+# endif
QT_END_NAMESPACE
#endif // __cplusplus
-#define SIMD_EPILOGUE(i, length, max) \
- for (int _i = 0; _i < max && i < length; ++i, ++_i)
+QT_WARNING_POP
#endif // QSIMD_P_H