diff options
Diffstat (limited to 'src/corelib/tools/qsimd_p.h')
-rw-r--r-- | src/corelib/tools/qsimd_p.h | 208 |
1 files changed, 56 insertions, 152 deletions
diff --git a/src/corelib/tools/qsimd_p.h b/src/corelib/tools/qsimd_p.h index 18684caefb..9f1321df94 100644 --- a/src/corelib/tools/qsimd_p.h +++ b/src/corelib/tools/qsimd_p.h @@ -1,7 +1,7 @@ /**************************************************************************** ** ** Copyright (C) 2016 The Qt Company Ltd. -** Copyright (C) 2016 Intel Corporation. +** Copyright (C) 2018 Intel Corporation. ** Contact: https://www.qt.io/licensing/ ** ** This file is part of the QtCore module of the Qt Toolkit. @@ -232,32 +232,48 @@ # define __RDRND__ 1 # endif -#define QT_FUNCTION_TARGET_STRING_SSE2 "sse2" -#define QT_FUNCTION_TARGET_STRING_SSE3 "sse3" -#define QT_FUNCTION_TARGET_STRING_SSSE3 "ssse3" -#define QT_FUNCTION_TARGET_STRING_SSE4_1 "sse4.1" -#define QT_FUNCTION_TARGET_STRING_SSE4_2 "sse4.2" -#define QT_FUNCTION_TARGET_STRING_AVX "avx" -#define QT_FUNCTION_TARGET_STRING_AVX2 "avx2" -#define QT_FUNCTION_TARGET_STRING_AVX512F "avx512f" -#define QT_FUNCTION_TARGET_STRING_AVX512CD "avx512cd" -#define QT_FUNCTION_TARGET_STRING_AVX512ER "avx512er" -#define QT_FUNCTION_TARGET_STRING_AVX512PF "avx512pf" -#define QT_FUNCTION_TARGET_STRING_AVX512BW "avx512bw" -#define QT_FUNCTION_TARGET_STRING_AVX512DQ "avx512dq" -#define QT_FUNCTION_TARGET_STRING_AVX512VL "avx512vl" -#define QT_FUNCTION_TARGET_STRING_AVX512IFMA "avx512ifma" -#define QT_FUNCTION_TARGET_STRING_AVX512VBMI "avx512vbmi" - -#define QT_FUNCTION_TARGET_STRING_AES "aes,sse4.2" -#define QT_FUNCTION_TARGET_STRING_PCLMUL "pclmul,sse4.2" -#define QT_FUNCTION_TARGET_STRING_POPCNT "popcnt" -#define QT_FUNCTION_TARGET_STRING_F16C "f16c,avx" -#define QT_FUNCTION_TARGET_STRING_RDRND "rdrnd" -#define QT_FUNCTION_TARGET_STRING_BMI "bmi" -#define QT_FUNCTION_TARGET_STRING_BMI2 "bmi2" -#define QT_FUNCTION_TARGET_STRING_RDSEED "rdseed" -#define QT_FUNCTION_TARGET_STRING_SHA "sha" +# if defined(__BMI__) && !defined(__BMI2__) && defined(Q_CC_INTEL) +// BMI2 instructions: +// All processors that support BMI support BMI2 (and AVX2) +// 
(but neither MSVC nor the Intel compiler define this macro) +# define __BMI2__ 1 +# endif + +# include "qsimd_x86_p.h" + +// Haswell sub-architecture +// +// The Intel Core 4th generation was codenamed "Haswell" and introduced AVX2, +// BMI1, BMI2, FMA, LZCNT, MOVBE, which makes it a good divider for a +// sub-target for us. The first AMD processor with AVX2 support (Zen) has the +// same features. +// +// macOS's fat binaries support the "x86_64h" sub-architecture and the GNU libc +// ELF loader also supports a "haswell/" subdir (e.g., /usr/lib/haswell). +# define QT_FUNCTION_TARGET_STRING_ARCH_HASWELL "arch=haswell" +# if defined(__AVX2__) && defined(__BMI__) && defined(__BMI2__) && defined(__F16C__) && \ + defined(__FMA__) && defined(__LZCNT__) && defined(__RDRND__) +# define __haswell__ 1 +# endif + +// This constant does not include all CPU features found in a Haswell, only +// those that we'd have optimized code for. +// Note: must not use Q_CONSTEXPR here (hence the plain "static const"), as this file may be compiled in C mode. 
+QT_BEGIN_NAMESPACE +static const quint64 CpuFeatureArchHaswell = 0 + | CpuFeatureSSE2 + | CpuFeatureSSE3 + | CpuFeatureSSSE3 + | CpuFeatureSSE4_1 + | CpuFeatureSSE4_2 + | CpuFeatureFMA + | CpuFeaturePOPCNT + | CpuFeatureAVX + | CpuFeatureF16C + | CpuFeatureAVX2 + | CpuFeatureBMI + | CpuFeatureBMI2; +QT_END_NAMESPACE #endif /* Q_PROCESSOR_X86 */ @@ -292,148 +308,36 @@ QT_BEGIN_NAMESPACE +#ifndef Q_PROCESSOR_X86 enum CPUFeatures { #if defined(Q_PROCESSOR_ARM) - CpuFeatureNEON = 0, + CpuFeatureNEON = 2, CpuFeatureARM_NEON = CpuFeatureNEON, - CpuFeatureCRC32 = 1, + CpuFeatureCRC32 = 4, #elif defined(Q_PROCESSOR_MIPS) - CpuFeatureDSP = 0, - CpuFeatureDSPR2 = 1, -#elif defined(Q_PROCESSOR_X86) - // The order of the flags is jumbled so it matches most closely the bits in CPUID - // Out of order: - CpuFeatureSSE2 = 1, // uses the bit for PCLMULQDQ - // in level 1, ECX - CpuFeatureSSE3 = (0 + 0), - CpuFeatureSSSE3 = (0 + 9), - CpuFeatureSSE4_1 = (0 + 19), - CpuFeatureSSE4_2 = (0 + 20), - CpuFeatureMOVBE = (0 + 22), - CpuFeaturePOPCNT = (0 + 23), - CpuFeatureAES = (0 + 25), - CpuFeatureAVX = (0 + 28), - CpuFeatureF16C = (0 + 29), - CpuFeatureRDRND = (0 + 30), - // 31 is always zero and we've used it for the QSimdInitialized - - // in level 7, leaf 0, EBX - CpuFeatureBMI = (32 + 3), - CpuFeatureHLE = (32 + 4), - CpuFeatureAVX2 = (32 + 5), - CpuFeatureBMI2 = (32 + 8), - CpuFeatureRTM = (32 + 11), - CpuFeatureAVX512F = (32 + 16), - CpuFeatureAVX512DQ = (32 + 17), - CpuFeatureRDSEED = (32 + 18), - CpuFeatureAVX512IFMA = (32 + 21), - CpuFeatureAVX512PF = (32 + 26), - CpuFeatureAVX512ER = (32 + 27), - CpuFeatureAVX512CD = (32 + 28), - CpuFeatureSHA = (32 + 29), - CpuFeatureAVX512BW = (32 + 30), - CpuFeatureAVX512VL = (32 + 31), - - // in level 7, leaf 0, ECX (out of order, for now) - CpuFeatureAVX512VBMI = 2, // uses the bit for DTES64 + CpuFeatureDSP = 2, + CpuFeatureDSPR2 = 4, #endif // used only to indicate that the CPU detection was initialised - QSimdInitialized = 0x80000000 
+ QSimdInitialized = 1 }; static const quint64 qCompilerCpuFeatures = 0 -#if defined __SHA__ - | (Q_UINT64_C(1) << CpuFeatureSHA) -#endif -#if defined __AES__ - | (Q_UINT64_C(1) << CpuFeatureAES) -#endif -#if defined __RTM__ - | (Q_UINT64_C(1) << CpuFeatureRTM) -#endif -#ifdef __RDRND__ - | (Q_UINT64_C(1) << CpuFeatureRDRND) -#endif -#ifdef __RDSEED__ - | (Q_UINT64_C(1) << CpuFeatureRDSEED) -#endif -#if defined __BMI__ - | (Q_UINT64_C(1) << CpuFeatureBMI) -#endif -#if defined __BMI2__ - | (Q_UINT64_C(1) << CpuFeatureBMI2) -#endif -#if defined __F16C__ - | (Q_UINT64_C(1) << CpuFeatureF16C) -#endif -#if defined __POPCNT__ - | (Q_UINT64_C(1) << CpuFeaturePOPCNT) -#endif -#if defined __MOVBE__ // GCC and Clang don't seem to define this - | (Q_UINT64_C(1) << CpuFeatureMOVBE) -#endif -#if defined __AVX512F__ - | (Q_UINT64_C(1) << CpuFeatureAVX512F) -#endif -#if defined __AVX512CD__ - | (Q_UINT64_C(1) << CpuFeatureAVX512CD) -#endif -#if defined __AVX512ER__ - | (Q_UINT64_C(1) << CpuFeatureAVX512ER) -#endif -#if defined __AVX512PF__ - | (Q_UINT64_C(1) << CpuFeatureAVX512PF) -#endif -#if defined __AVX512BW__ - | (Q_UINT64_C(1) << CpuFeatureAVX512BW) -#endif -#if defined __AVX512DQ__ - | (Q_UINT64_C(1) << CpuFeatureAVX512DQ) -#endif -#if defined __AVX512VL__ - | (Q_UINT64_C(1) << CpuFeatureAVX512VL) -#endif -#if defined __AVX512IFMA__ - | (Q_UINT64_C(1) << CpuFeatureAVX512IFMA) -#endif -#if defined __AVX512VBMI__ - | (Q_UINT64_C(1) << CpuFeatureAVX512VBMI) -#endif -#if defined __AVX2__ - | (Q_UINT64_C(1) << CpuFeatureAVX2) -#endif -#if defined __AVX__ - | (Q_UINT64_C(1) << CpuFeatureAVX) -#endif -#if defined __SSE4_2__ - | (Q_UINT64_C(1) << CpuFeatureSSE4_2) -#endif -#if defined __SSE4_1__ - | (Q_UINT64_C(1) << CpuFeatureSSE4_1) -#endif -#if defined __SSSE3__ - | (Q_UINT64_C(1) << CpuFeatureSSSE3) -#endif -#if defined __SSE3__ - | (Q_UINT64_C(1) << CpuFeatureSSE3) -#endif -#if defined __SSE2__ - | (Q_UINT64_C(1) << CpuFeatureSSE2) -#endif #if defined __ARM_NEON__ - | 
(Q_UINT64_C(1) << CpuFeatureNEON) + | CpuFeatureNEON #endif #if defined __ARM_FEATURE_CRC32 - | (Q_UINT64_C(1) << CpuFeatureCRC32) + | CpuFeatureCRC32 #endif #if defined __mips_dsp - | (Q_UINT64_C(1) << CpuFeatureDSP) + | CpuFeatureDSP #endif #if defined __mips_dspr2 - | (Q_UINT64_C(1) << CpuFeatureDSPR2) + | CpuFeatureDSPR2 #endif ; +#endif #ifdef Q_ATOMIC_INT64_IS_SUPPORTED extern Q_CORE_EXPORT QBasicAtomicInteger<quint64> qt_cpu_features[1]; @@ -459,8 +363,8 @@ static inline quint64 qCpuFeatures() return features; } -#define qCpuHasFeature(feature) ((qCompilerCpuFeatures & (Q_UINT64_C(1) << CpuFeature ## feature)) \ - || (qCpuFeatures() & (Q_UINT64_C(1) << CpuFeature ## feature))) +#define qCpuHasFeature(feature) (((qCompilerCpuFeatures & CpuFeature ## feature) == CpuFeature ## feature) \ + || ((qCpuFeatures() & CpuFeature ## feature) == CpuFeature ## feature)) #define ALIGNMENT_PROLOGUE_16BYTES(ptr, i, length) \ for (; i < static_cast<int>(qMin(static_cast<quintptr>(length), ((4 - ((reinterpret_cast<quintptr>(ptr) >> 2) & 0x3)) & 0x3))); ++i) |