diff options
Diffstat (limited to 'src/corelib/global/qsimd_p.h')
-rw-r--r-- | src/corelib/global/qsimd_p.h | 377 |
1 files changed, 377 insertions, 0 deletions
diff --git a/src/corelib/global/qsimd_p.h b/src/corelib/global/qsimd_p.h new file mode 100644 index 0000000000..2f2d49348f --- /dev/null +++ b/src/corelib/global/qsimd_p.h @@ -0,0 +1,377 @@ +/**************************************************************************** +** +** Copyright (C) 2016 The Qt Company Ltd. +** Copyright (C) 2018 Intel Corporation. +** Contact: https://www.qt.io/licensing/ +** +** This file is part of the QtCore module of the Qt Toolkit. +** +** $QT_BEGIN_LICENSE:LGPL$ +** Commercial License Usage +** Licensees holding valid commercial Qt licenses may use this file in +** accordance with the commercial license agreement provided with the +** Software or, alternatively, in accordance with the terms contained in +** a written agreement between you and The Qt Company. For licensing terms +** and conditions see https://www.qt.io/terms-conditions. For further +** information use the contact form at https://www.qt.io/contact-us. +** +** GNU Lesser General Public License Usage +** Alternatively, this file may be used under the terms of the GNU Lesser +** General Public License version 3 as published by the Free Software +** Foundation and appearing in the file LICENSE.LGPL3 included in the +** packaging of this file. Please review the following information to +** ensure the GNU Lesser General Public License version 3 requirements +** will be met: https://www.gnu.org/licenses/lgpl-3.0.html. +** +** GNU General Public License Usage +** Alternatively, this file may be used under the terms of the GNU +** General Public License version 2.0 or (at your option) the GNU General +** Public license version 3 or any later version approved by the KDE Free +** Qt Foundation. The licenses are as published by the Free Software +** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3 +** included in the packaging of this file. Please review the following +** information to ensure the GNU General Public License requirements will +** be met: https://www.gnu.org/licenses/gpl-2.0.html and +** https://www.gnu.org/licenses/gpl-3.0.html. +** +** $QT_END_LICENSE$ +** +****************************************************************************/ + +#ifndef QSIMD_P_H +#define QSIMD_P_H + +// +// W A R N I N G +// ------------- +// +// This file is not part of the Qt API. It exists purely as an +// implementation detail. This header file may change from version to +// version without notice, or even be removed. +// +// We mean it. +// + +#include <QtCore/private/qglobal_p.h> +#include <QtCore/qsimd.h> + +/* + * qt_module_config.prf defines the QT_COMPILER_SUPPORTS_XXX macros. + * They mean the compiler supports the necessary flags and the headers + * for the x86 and ARM intrinsics: + * - GCC: the -mXXX or march=YYY flag is necessary before #include + * up to 4.8; GCC >= 4.9 can include unconditionally + * - Intel CC: #include can happen unconditionally + * - MSVC: #include can happen unconditionally + * - RVCT: ??? + * + * We will try to include all headers possible under this configuration. + * + * MSVC does not define __SSE2__ & family, so we will define them. MSVC 2013 & + * up do define __AVX__ if the -arch:AVX option is passed on the command-line. + * + * Supported XXX are: + * Flag | Arch | GCC | Intel CC | MSVC | + * ARM_NEON | ARM | I & C | None | ? | + * SSE2 | x86 | I & C | I & C | I & C | + * SSE3 | x86 | I & C | I & C | I only | + * SSSE3 | x86 | I & C | I & C | I only | + * SSE4_1 | x86 | I & C | I & C | I only | + * SSE4_2 | x86 | I & C | I & C | I only | + * AVX | x86 | I & C | I & C | I & C | + * AVX2 | x86 | I & C | I & C | I only | + * AVX512xx | x86 | I & C | I & C | I only | + * I = intrinsics; C = code generation + * + * Code can use the following constructs to determine compiler support & status: + * - #ifdef __XXX__ (e.g: #ifdef __AVX__ or #ifdef __ARM_NEON__) + * If this test passes, then the compiler is already generating code for that + * given sub-architecture. The intrinsics for that sub-architecture are + * #included and can be used without restriction or runtime check. + * + * - #if QT_COMPILER_SUPPORTS(XXX) + * If this test passes, then the compiler is able to generate code for that + * given sub-architecture in another translation unit, given the right set of + * flags. Use of the intrinsics is not guaranteed. This is useful with + * runtime detection (see below). + * + * - #if QT_COMPILER_SUPPORTS_HERE(XXX) + * If this test passes, then the compiler is able to generate code for that + * given sub-architecture in this translation unit, even if it is not doing + * that now (it might be). Individual functions may be tagged with + * QT_FUNCTION_TARGET(XXX) to cause the compiler to generate code for that + * sub-arch. Only inside such functions is the use of the intrisics + * guaranteed to work. This is useful with runtime detection (see below). + * + * Runtime detection of a CPU sub-architecture can be done with the + * qCpuHasFeature(XXX) function. There are two strategies for generating + * optimized code like that: + * + * 1) place the optimized code in a different translation unit (C or assembly + * sources) and pass the correct flags to the compiler to enable support. Those + * sources must not include qglobal.h, which means they cannot include this + * file either. The dispatcher function would look like this: + * + * void foo() + * { + * #if QT_COMPILER_SUPPORTS(XXX) + * if (qCpuHasFeature(XXX)) { + * foo_optimized_xxx(); + * return; + * } + * #endif + * foo_plain(); + * } + * + * 2) place the optimized code in a function tagged with QT_FUNCTION_TARGET and + * surrounded by #if QT_COMPILER_SUPPORTS_HERE(XXX). That code can freely use + * other Qt code. The dispatcher function would look like this: + * + * void foo() + * { + * #if QT_COMPILER_SUPPORTS_HERE(XXX) + * if (qCpuHasFeature(XXX)) { + * foo_optimized_xxx(); + * return; + * } + * #endif + * foo_plain(); + * } + */ + +#if defined(__MINGW64_VERSION_MAJOR) || defined(Q_CC_MSVC) +#include <intrin.h> +#endif + +#define QT_COMPILER_SUPPORTS(x) (QT_COMPILER_SUPPORTS_ ## x - 0) + +#if defined(Q_PROCESSOR_ARM) +# define QT_COMPILER_SUPPORTS_HERE(x) (__ARM_FEATURE_ ## x) +# if defined(Q_CC_GNU) && !defined(Q_CC_INTEL) && Q_CC_GNU >= 600 + /* GCC requires attributes for a function */ +# define QT_FUNCTION_TARGET(x) __attribute__((__target__(QT_FUNCTION_TARGET_STRING_ ## x))) +# else +# define QT_FUNCTION_TARGET(x) +# endif +# if !defined(__ARM_FEATURE_NEON) && defined(__ARM_NEON__) +# define __ARM_FEATURE_NEON // also support QT_COMPILER_SUPPORTS_HERE(NEON) +# endif +#elif defined(Q_PROCESSOR_MIPS) +# define QT_COMPILER_SUPPORTS_HERE(x) (__ ## x ## __) +# define QT_FUNCTION_TARGET(x) +# if !defined(__MIPS_DSP__) && defined(__mips_dsp) && defined(Q_PROCESSOR_MIPS_32) +# define __MIPS_DSP__ +# endif +# if !defined(__MIPS_DSPR2__) && defined(__mips_dspr2) && defined(Q_PROCESSOR_MIPS_32) +# define __MIPS_DSPR2__ +# endif +#elif defined(Q_PROCESSOR_X86) && defined(QT_COMPILER_SUPPORTS_SIMD_ALWAYS) +# define QT_COMPILER_SUPPORTS_HERE(x) ((__ ## x ## __) || QT_COMPILER_SUPPORTS(x)) +# if defined(Q_CC_GNU) && !defined(Q_CC_INTEL) + /* GCC requires attributes for a function */ +# define QT_FUNCTION_TARGET(x) __attribute__((__target__(QT_FUNCTION_TARGET_STRING_ ## x))) +# else +# define QT_FUNCTION_TARGET(x) +# endif +#else +# define QT_COMPILER_SUPPORTS_HERE(x) (__ ## x ## __) +# define QT_FUNCTION_TARGET(x) +#endif + +#ifdef Q_PROCESSOR_X86 +/* -- x86 intrinsic support -- */ + +# if defined(Q_CC_MSVC) && (defined(_M_X64) || _M_IX86_FP >= 2) +// MSVC doesn't define __SSE2__, so do it ourselves +# define __SSE__ 1 +# endif + +# if defined(Q_CC_GNU) && !defined(Q_CC_INTEL) +// GCC 4.4 and Clang 2.8 added a few more intrinsics there +# include <x86intrin.h> +# endif + +# if defined(__SSE4_2__) && defined(QT_COMPILER_SUPPORTS_SIMD_ALWAYS) && (defined(Q_CC_INTEL) || defined(Q_CC_MSVC)) +// POPCNT instructions: +// All processors that support SSE4.2 support POPCNT +// (but neither MSVC nor the Intel compiler define this macro) +# define __POPCNT__ 1 +# endif + +// AVX intrinsics +# if defined(__AVX__) && defined(QT_COMPILER_SUPPORTS_SIMD_ALWAYS) && (defined(Q_CC_INTEL) || defined(Q_CC_MSVC)) +// AES, PCLMULQDQ instructions: +// All processors that support AVX support PCLMULQDQ +// (but neither MSVC nor the Intel compiler define this macro) +# define __PCLMUL__ 1 +# endif + +# if defined(__AVX2__) && defined(QT_COMPILER_SUPPORTS_SIMD_ALWAYS) && (defined(Q_CC_INTEL) || defined(Q_CC_MSVC)) +// F16C & RDRAND instructions: +// All processors that support AVX2 support F16C & RDRAND: +// (but neither MSVC nor the Intel compiler define these macros) +# define __F16C__ 1 +# define __RDRND__ 1 +# endif + +# if defined(__BMI__) && !defined(__BMI2__) && defined(Q_CC_INTEL) +// BMI2 instructions: +// All processors that support BMI support BMI2 (and AVX2) +// (but neither MSVC nor the Intel compiler define this macro) +# define __BMI2__ 1 +# endif + +# include "qsimd_x86_p.h" + +// Haswell sub-architecture +// +// The Intel Core 4th generation was codenamed "Haswell" and introduced AVX2, +// BMI1, BMI2, FMA, LZCNT, MOVBE, which makes it a good divider for a +// sub-target for us. The first AMD processor with AVX2 support (Zen) has the +// same features. +// +// macOS's fat binaries support the "x86_64h" sub-architecture and the GNU libc +// ELF loader also supports a "haswell/" subdir (e.g., /usr/lib/haswell). +# define QT_FUNCTION_TARGET_STRING_ARCH_HASWELL "arch=haswell" +# if defined(__AVX2__) && defined(__BMI__) && defined(__BMI2__) && defined(__F16C__) && \ + defined(__FMA__) && defined(__LZCNT__) && defined(__RDRND__) +# define __haswell__ 1 +# endif + +// This constant does not include all CPU features found in a Haswell, only +// those that we'd have optimized code for. +// Note: must use Q_CONSTEXPR here, as this file may be compiled in C mode. +QT_BEGIN_NAMESPACE +static const quint64 CpuFeatureArchHaswell = 0 + | CpuFeatureSSE2 + | CpuFeatureSSE3 + | CpuFeatureSSSE3 + | CpuFeatureSSE4_1 + | CpuFeatureSSE4_2 + | CpuFeatureFMA + | CpuFeaturePOPCNT + | CpuFeatureAVX + | CpuFeatureF16C + | CpuFeatureAVX2 + | CpuFeatureBMI + | CpuFeatureBMI2; +QT_END_NAMESPACE + +#endif /* Q_PROCESSOR_X86 */ + +// Clang compiler fix, see http://lists.llvm.org/pipermail/cfe-commits/Week-of-Mon-20160222/151168.html +// This should be tweaked with an "upper version" of clang once we know which release fixes the +// issue. At that point we can rely on __ARM_FEATURE_CRC32 again. +#if defined(Q_CC_CLANG) && defined(Q_OS_DARWIN) && defined (__ARM_FEATURE_CRC32) +# undef __ARM_FEATURE_CRC32 +#endif + +// NEON intrinsics +// note: as of GCC 4.9, does not support function targets for ARM +#if defined(__ARM_NEON) || defined(__ARM_NEON__) +#define QT_FUNCTION_TARGET_STRING_NEON "+neon" // unused: gcc doesn't support function targets on non-aarch64, and on Aarch64 NEON is always available. +#ifndef __ARM_NEON__ +// __ARM_NEON__ is not defined on AArch64, but we need it in our NEON detection. +#define __ARM_NEON__ +#endif +#endif +// AArch64/ARM64 +#if defined(Q_PROCESSOR_ARM_V8) && defined(__ARM_FEATURE_CRC32) +#if defined(Q_PROCESSOR_ARM_64) +// only available on aarch64 +#define QT_FUNCTION_TARGET_STRING_CRC32 "+crc" +#endif +# include <arm_acle.h> +#endif + +#ifdef __cplusplus +#include <qatomic.h> + +QT_BEGIN_NAMESPACE + +#ifndef Q_PROCESSOR_X86 +enum CPUFeatures { +#if defined(Q_PROCESSOR_ARM) + CpuFeatureNEON = 2, + CpuFeatureARM_NEON = CpuFeatureNEON, + CpuFeatureCRC32 = 4, +#elif defined(Q_PROCESSOR_MIPS) + CpuFeatureDSP = 2, + CpuFeatureDSPR2 = 4, +#endif + + // used only to indicate that the CPU detection was initialised + QSimdInitialized = 1 +}; + +static const quint64 qCompilerCpuFeatures = 0 +#if defined __ARM_NEON__ + | CpuFeatureNEON +#endif +#if defined __ARM_FEATURE_CRC32 + | CpuFeatureCRC32 +#endif +#if defined __mips_dsp + | CpuFeatureDSP +#endif +#if defined __mips_dspr2 + | CpuFeatureDSPR2 +#endif + ; +#endif + +#ifdef Q_ATOMIC_INT64_IS_SUPPORTED +extern Q_CORE_EXPORT QBasicAtomicInteger<quint64> qt_cpu_features[1]; +#else +extern Q_CORE_EXPORT QBasicAtomicInteger<unsigned> qt_cpu_features[2]; +#endif +Q_CORE_EXPORT quint64 qDetectCpuFeatures(); + +#if defined(Q_PROCESSOR_X86) && QT_COMPILER_SUPPORTS_HERE(RDRND) && !defined(QT_BOOTSTRAPPED) +Q_CORE_EXPORT qsizetype qRandomCpu(void *, qsizetype) noexcept; +#else +static inline qsizetype qRandomCpu(void *, qsizetype) noexcept +{ + return 0; +} +#endif + +static inline quint64 qCpuFeatures() +{ + quint64 features = qt_cpu_features[0].loadRelaxed(); +#ifndef Q_ATOMIC_INT64_IS_SUPPORTED + features |= quint64(qt_cpu_features[1].loadRelaxed()) << 32; +#endif + if (Q_UNLIKELY(features == 0)) { + features = qDetectCpuFeatures(); + Q_ASSUME(features != 0); + } + return features; +} + +#define qCpuHasFeature(feature) (((qCompilerCpuFeatures & CpuFeature ## feature) == CpuFeature ## feature) \ + || ((qCpuFeatures() & CpuFeature ## feature) == CpuFeature ## feature)) + +inline bool qHasHwrng() +{ +#if defined(Q_PROCESSOR_X86) && QT_COMPILER_SUPPORTS_HERE(RDRND) + return qCpuHasFeature(RDRND); +#else + return false; +#endif +} + +#define ALIGNMENT_PROLOGUE_16BYTES(ptr, i, length) \ + for (; i < static_cast<int>(qMin(static_cast<quintptr>(length), ((4 - ((reinterpret_cast<quintptr>(ptr) >> 2) & 0x3)) & 0x3))); ++i) + +#define ALIGNMENT_PROLOGUE_32BYTES(ptr, i, length) \ + for (; i < static_cast<int>(qMin(static_cast<quintptr>(length), ((8 - ((reinterpret_cast<quintptr>(ptr) >> 2) & 0x7)) & 0x7))); ++i) + +QT_END_NAMESPACE + +#endif // __cplusplus + +#define SIMD_EPILOGUE(i, length, max) \ + for (int _i = 0; _i < max && i < length; ++i, ++_i) + +#endif // QSIMD_P_H |