/**************************************************************************** ** ** Copyright (C) 2016 The Qt Company Ltd. ** Copyright (C) 2016 Intel Corporation. ** Contact: https://www.qt.io/licensing/ ** ** This file is part of the QtCore module of the Qt Toolkit. ** ** $QT_BEGIN_LICENSE:LGPL$ ** Commercial License Usage ** Licensees holding valid commercial Qt licenses may use this file in ** accordance with the commercial license agreement provided with the ** Software or, alternatively, in accordance with the terms contained in ** a written agreement between you and The Qt Company. For licensing terms ** and conditions see https://www.qt.io/terms-conditions. For further ** information use the contact form at https://www.qt.io/contact-us. ** ** GNU Lesser General Public License Usage ** Alternatively, this file may be used under the terms of the GNU Lesser ** General Public License version 3 as published by the Free Software ** Foundation and appearing in the file LICENSE.LGPL3 included in the ** packaging of this file. Please review the following information to ** ensure the GNU Lesser General Public License version 3 requirements ** will be met: https://www.gnu.org/licenses/lgpl-3.0.html. ** ** GNU General Public License Usage ** Alternatively, this file may be used under the terms of the GNU ** General Public License version 2.0 or (at your option) the GNU General ** Public license version 3 or any later version approved by the KDE Free ** Qt Foundation. The licenses are as published by the Free Software ** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3 ** included in the packaging of this file. Please review the following ** information to ensure the GNU General Public License requirements will ** be met: https://www.gnu.org/licenses/gpl-2.0.html and ** https://www.gnu.org/licenses/gpl-3.0.html. ** ** $QT_END_LICENSE$ ** ****************************************************************************/ #ifndef QSIMD_P_H #define QSIMD_P_H // // W A R N I N G // ------------- // // This file is not part of the Qt API. It exists purely as an // implementation detail. This header file may change from version to // version without notice, or even be removed. // // We mean it. // #include #include /* * qt_module_config.prf defines the QT_COMPILER_SUPPORTS_XXX macros. * They mean the compiler supports the necessary flags and the headers * for the x86 and ARM intrinsics: * - GCC: the -mXXX or march=YYY flag is necessary before #include * up to 4.8; GCC >= 4.9 can include unconditionally * - Intel CC: #include can happen unconditionally * - MSVC: #include can happen unconditionally * - RVCT: ??? * * We will try to include all headers possible under this configuration. * * MSVC does not define __SSE2__ & family, so we will define them. MSVC 2013 & * up do define __AVX__ if the -arch:AVX option is passed on the command-line. * * Supported XXX are: * Flag | Arch | GCC | Intel CC | MSVC | * ARM_NEON | ARM | I & C | None | ? | * SSE2 | x86 | I & C | I & C | I & C | * SSE3 | x86 | I & C | I & C | I only | * SSSE3 | x86 | I & C | I & C | I only | * SSE4_1 | x86 | I & C | I & C | I only | * SSE4_2 | x86 | I & C | I & C | I only | * AVX | x86 | I & C | I & C | I & C | * AVX2 | x86 | I & C | I & C | I only | * AVX512xx | x86 | I & C | I & C | I only | * I = intrinsics; C = code generation * * Code can use the following constructs to determine compiler support & status: * - #ifdef __XXX__ (e.g: #ifdef __AVX__ or #ifdef __ARM_NEON__) * If this test passes, then the compiler is already generating code for that * given sub-architecture. The intrinsics for that sub-architecture are * #included and can be used without restriction or runtime check. * * - #if QT_COMPILER_SUPPORTS(XXX) * If this test passes, then the compiler is able to generate code for that * given sub-architecture in another translation unit, given the right set of * flags. Use of the intrinsics is not guaranteed. This is useful with * runtime detection (see below). * * - #if QT_COMPILER_SUPPORTS_HERE(XXX) * If this test passes, then the compiler is able to generate code for that * given sub-architecture in this translation unit, even if it is not doing * that now (it might be). Individual functions may be tagged with * QT_FUNCTION_TARGET(XXX) to cause the compiler to generate code for that * sub-arch. Only inside such functions is the use of the intrisics * guaranteed to work. This is useful with runtime detection (see below). * * Runtime detection of a CPU sub-architecture can be done with the * qCpuHasFeature(XXX) function. There are two strategies for generating * optimized code like that: * * 1) place the optimized code in a different translation unit (C or assembly * sources) and pass the correct flags to the compiler to enable support. Those * sources must not include qglobal.h, which means they cannot include this * file either. The dispatcher function would look like this: * * void foo() * { * #if QT_COMPILER_SUPPORTS(XXX) * if (qCpuHasFeature(XXX)) { * foo_optimized_xxx(); * return; * } * #endif * foo_plain(); * } * * 2) place the optimized code in a function tagged with QT_FUNCTION_TARGET and * surrounded by #if QT_COMPILER_SUPPORTS_HERE(XXX). That code can freely use * other Qt code. The dispatcher function would look like this: * * void foo() * { * #if QT_COMPILER_SUPPORTS_HERE(XXX) * if (qCpuHasFeature(XXX)) { * foo_optimized_xxx(); * return; * } * #endif * foo_plain(); * } */ #if defined(__MINGW64_VERSION_MAJOR) || (defined(Q_CC_MSVC) && !defined(Q_OS_WINCE)) #include #endif #define QT_COMPILER_SUPPORTS(x) (QT_COMPILER_SUPPORTS_ ## x - 0) #if defined(Q_PROCESSOR_ARM) # define QT_COMPILER_SUPPORTS_HERE(x) (__ARM_FEATURE_ ## x) # if defined(Q_CC_GNU) && !defined(Q_CC_INTEL) && Q_CC_GNU >= 600 /* GCC requires attributes for a function */ # define QT_FUNCTION_TARGET(x) __attribute__((__target__(QT_FUNCTION_TARGET_STRING_ ## x))) # else # define QT_FUNCTION_TARGET(x) # endif # if !defined(__ARM_FEATURE_NEON) && defined(__ARM_NEON__) # define __ARM_FEATURE_NEON // also support QT_COMPILER_SUPPORTS_HERE(NEON) # endif #elif (defined(Q_CC_INTEL) || defined(Q_CC_MSVC) \ || (defined(Q_CC_GNU) && !defined(Q_CC_CLANG) && (__GNUC__-0) * 100 + (__GNUC_MINOR__-0) >= 409)) \ && !defined(QT_BOOTSTRAPPED) # define QT_COMPILER_SUPPORTS_SIMD_ALWAYS # define QT_COMPILER_SUPPORTS_HERE(x) QT_COMPILER_SUPPORTS(x) # if defined(Q_CC_GNU) && !defined(Q_CC_INTEL) /* GCC requires attributes for a function */ # define QT_FUNCTION_TARGET(x) __attribute__((__target__(QT_FUNCTION_TARGET_STRING_ ## x))) # else # define QT_FUNCTION_TARGET(x) # endif #else # define QT_COMPILER_SUPPORTS_HERE(x) (__ ## x ## __) # define QT_FUNCTION_TARGET(x) #endif // SSE intrinsics #define QT_FUNCTION_TARGET_STRING_SSE2 "sse2" #if defined(__SSE2__) || (defined(QT_COMPILER_SUPPORTS_SSE2) && defined(QT_COMPILER_SUPPORTS_SIMD_ALWAYS)) #if defined(QT_LINUXBASE) /// this is an evil hack - the posix_memalign declaration in LSB /// is wrong - see http://bugs.linuxbase.org/show_bug.cgi?id=2431 # define posix_memalign _lsb_hack_posix_memalign # include # undef posix_memalign #else # include #endif #if defined(Q_CC_MSVC) && (defined(_M_X64) || _M_IX86_FP >= 2) # define __SSE__ 1 # define __SSE2__ 1 #endif #endif // SSE3 intrinsics #define QT_FUNCTION_TARGET_STRING_SSE3 "sse3" #if defined(__SSE3__) || (defined(QT_COMPILER_SUPPORTS_SSE3) && defined(QT_COMPILER_SUPPORTS_SIMD_ALWAYS)) #include #endif // SSSE3 intrinsics #define QT_FUNCTION_TARGET_STRING_SSSE3 "ssse3" #if defined(__SSSE3__) || (defined(QT_COMPILER_SUPPORTS_SSSE3) && defined(QT_COMPILER_SUPPORTS_SIMD_ALWAYS)) #include #endif // SSE4.1 intrinsics #define QT_FUNCTION_TARGET_STRING_SSE4_1 "sse4.1" #if defined(__SSE4_1__) || (defined(QT_COMPILER_SUPPORTS_SSE4_1) && defined(QT_COMPILER_SUPPORTS_SIMD_ALWAYS)) #include #endif // SSE4.2 intrinsics #define QT_FUNCTION_TARGET_STRING_SSE4_2 "sse4.2" #if defined(__SSE4_2__) || (defined(QT_COMPILER_SUPPORTS_SSE4_2) && defined(QT_COMPILER_SUPPORTS_SIMD_ALWAYS)) #include #endif // AVX intrinsics #define QT_FUNCTION_TARGET_STRING_AVX "avx" #define QT_FUNCTION_TARGET_STRING_AVX2 "avx2" #if defined(__AVX__) || (defined(QT_COMPILER_SUPPORTS_AVX) && defined(QT_COMPILER_SUPPORTS_SIMD_ALWAYS)) // immintrin.h is the ultimate header, we don't need anything else after this #include # if defined(Q_CC_MSVC) && (defined(_M_AVX) || defined(__AVX__)) // MS Visual Studio 2010 has no macro pre-defined to identify the use of /arch:AVX // MS Visual Studio 2013 adds it: __AVX__ // See: http://connect.microsoft.com/VisualStudio/feedback/details/605858/arch-avx-should-define-a-predefined-macro-in-x64-and-set-a-unique-value-for-m-ix86-fp-in-win32 # define __SSE3__ 1 # define __SSSE3__ 1 // no Intel CPU supports SSE4a, so don't define it # define __SSE4_1__ 1 # define __SSE4_2__ 1 # ifndef __AVX__ # define __AVX__ 1 # endif # endif #endif #define QT_FUNCTION_TARGET_STRING_AVX512F "avx512f" #define QT_FUNCTION_TARGET_STRING_AVX512CD "avx512cd" #define QT_FUNCTION_TARGET_STRING_AVX512ER "avx512er" #define QT_FUNCTION_TARGET_STRING_AVX512PF "avx512pf" #define QT_FUNCTION_TARGET_STRING_AVX512BW "avx512bw" #define QT_FUNCTION_TARGET_STRING_AVX512DQ "avx512dq" #define QT_FUNCTION_TARGET_STRING_AVX512VL "avx512vl" #define QT_FUNCTION_TARGET_STRING_AVX512IFMA "avx512ifma" #define QT_FUNCTION_TARGET_STRING_AVX512VBMI "avx512vbmi" #define QT_FUNCTION_TARGET_STRING_F16C "f16c" #define QT_FUNCTION_TARGET_STRING_RDRAND "rdrnd" #define QT_FUNCTION_TARGET_STRING_BMI "bmi" #define QT_FUNCTION_TARGET_STRING_BMI2 "bmi2" #define QT_FUNCTION_TARGET_STRING_RDSEED "rdseed" #define QT_FUNCTION_TARGET_STRING_SHA "sha" // other x86 intrinsics #if defined(Q_PROCESSOR_X86) && ((defined(Q_CC_GNU) && (Q_CC_GNU >= 404)) \ || (defined(Q_CC_CLANG) && (Q_CC_CLANG >= 208)) \ || defined(Q_CC_INTEL)) # define QT_COMPILER_SUPPORTS_X86INTRIN # ifdef Q_CC_INTEL // The Intel compiler has no -- all intrinsics are in ; # include # else // GCC 4.4 and Clang 2.8 added a few more intrinsics there # include # endif #endif // Clang compiler fix, see http://lists.llvm.org/pipermail/cfe-commits/Week-of-Mon-20160222/151168.html // This should be tweaked with an "upper version" of clang once we know which release fixes the // issue. At that point we can rely on __ARM_FEATURE_CRC32 again. #if defined(Q_CC_CLANG) && defined(Q_OS_DARWIN) && defined (__ARM_FEATURE_CRC32) # undef __ARM_FEATURE_CRC32 #endif // NEON intrinsics // note: as of GCC 4.9, does not support function targets for ARM #if defined(__ARM_NEON) || defined(__ARM_NEON__) #include #define QT_FUNCTION_TARGET_STRING_NEON "+neon" // unused: gcc doesn't support function targets on non-aarch64, and on Aarch64 NEON is always available. #ifndef __ARM_NEON__ // __ARM_NEON__ is not defined on AArch64, but we need it in our NEON detection. #define __ARM_NEON__ #endif #endif // AArch64/ARM64 #if defined(Q_PROCESSOR_ARM_V8) && defined(__ARM_FEATURE_CRC32) #define QT_FUNCTION_TARGET_STRING_CRC32 "+crc" # include #endif #undef QT_COMPILER_SUPPORTS_SIMD_ALWAYS QT_BEGIN_NAMESPACE enum CPUFeatures { #if defined(Q_PROCESSOR_ARM) CpuFeatureNEON = 0, CpuFeatureARM_NEON = CpuFeatureNEON, CpuFeatureCRC32 = 1, #elif defined(Q_PROCESSOR_MIPS) CpuFeatureDSP = 0, CpuFeatureDSPR2 = 1, #elif defined(Q_PROCESSOR_X86) // The order of the flags is jumbled so it matches most closely the bits in CPUID // Out of order: CpuFeatureSSE2 = 1, // uses the bit for PCLMULQDQ // in level 1, ECX CpuFeatureSSE3 = (0 + 0), CpuFeatureSSSE3 = (0 + 9), CpuFeatureSSE4_1 = (0 + 19), CpuFeatureSSE4_2 = (0 + 20), CpuFeatureMOVBE = (0 + 22), CpuFeaturePOPCNT = (0 + 23), CpuFeatureAES = (0 + 25), CpuFeatureAVX = (0 + 28), CpuFeatureF16C = (0 + 29), CpuFeatureRDRAND = (0 + 30), // 31 is always zero and we've used it for the QSimdInitialized // in level 7, leaf 0, EBX CpuFeatureBMI = (32 + 3), CpuFeatureHLE = (32 + 4), CpuFeatureAVX2 = (32 + 5), CpuFeatureBMI2 = (32 + 8), CpuFeatureRTM = (32 + 11), CpuFeatureAVX512F = (32 + 16), CpuFeatureAVX512DQ = (32 + 17), CpuFeatureRDSEED = (32 + 18), CpuFeatureAVX512IFMA = (32 + 21), CpuFeatureAVX512PF = (32 + 26), CpuFeatureAVX512ER = (32 + 27), CpuFeatureAVX512CD = (32 + 28), CpuFeatureSHA = (32 + 29), CpuFeatureAVX512BW = (32 + 30), CpuFeatureAVX512VL = (32 + 31), // in level 7, leaf 0, ECX (out of order, for now) CpuFeatureAVX512VBMI = 2, // uses the bit for DTES64 #endif // used only to indicate that the CPU detection was initialised QSimdInitialized = 0x80000000 }; static const quint64 qCompilerCpuFeatures = 0 #if defined __SHA__ | (Q_UINT64_C(1) << CpuFeatureSHA) #endif #if defined __AES__ | (Q_UINT64_C(1) << CpuFeatureAES) #endif #if defined __RTM__ | (Q_UINT64_C(1) << CpuFeatureRTM) #endif #ifdef __RDRND__ | (Q_UINT64_C(1) << CpuFeatureRDRAND) #endif #ifdef __RDSEED__ | (Q_UINT64_C(1) << CpuFeatureRDSEED) #endif #if defined __BMI__ | (Q_UINT64_C(1) << CpuFeatureBMI) #endif #if defined __BMI2__ | (Q_UINT64_C(1) << CpuFeatureBMI2) #endif #if defined __F16C__ | (Q_UINT64_C(1) << CpuFeatureF16C) #endif #if defined __POPCNT__ | (Q_UINT64_C(1) << CpuFeaturePOPCNT) #endif #if defined __MOVBE__ // GCC and Clang don't seem to define this | (Q_UINT64_C(1) << CpuFeatureMOVBE) #endif #if defined __AVX512F__ | (Q_UINT64_C(1) << CpuFeatureAVX512F) #endif #if defined __AVX512CD__ | (Q_UINT64_C(1) << CpuFeatureAVX512CD) #endif #if defined __AVX512ER__ | (Q_UINT64_C(1) << CpuFeatureAVX512ER) #endif #if defined __AVX512PF__ | (Q_UINT64_C(1) << CpuFeatureAVX512PF) #endif #if defined __AVX512BW__ | (Q_UINT64_C(1) << CpuFeatureAVX512BW) #endif #if defined __AVX512DQ__ | (Q_UINT64_C(1) << CpuFeatureAVX512DQ) #endif #if defined __AVX512VL__ | (Q_UINT64_C(1) << CpuFeatureAVX512VL) #endif #if defined __AVX512IFMA__ | (Q_UINT64_C(1) << CpuFeatureAVX512IFMA) #endif #if defined __AVX512VBMI__ | (Q_UINT64_C(1) << CpuFeatureAVX512VBMI) #endif #if defined __AVX2__ | (Q_UINT64_C(1) << CpuFeatureAVX2) #endif #if defined __AVX__ | (Q_UINT64_C(1) << CpuFeatureAVX) #endif #if defined __SSE4_2__ | (Q_UINT64_C(1) << CpuFeatureSSE4_2) #endif #if defined __SSE4_1__ | (Q_UINT64_C(1) << CpuFeatureSSE4_1) #endif #if defined __SSSE3__ | (Q_UINT64_C(1) << CpuFeatureSSSE3) #endif #if defined __SSE3__ | (Q_UINT64_C(1) << CpuFeatureSSE3) #endif #if defined __SSE2__ | (Q_UINT64_C(1) << CpuFeatureSSE2) #endif #if defined __ARM_NEON__ | (Q_UINT64_C(1) << CpuFeatureNEON) #endif #if defined __ARM_FEATURE_CRC32 | (Q_UINT64_C(1) << CpuFeatureCRC32) #endif #if defined __mips_dsp | (Q_UINT64_C(1) << CpuFeatureDSP) #endif #if defined __mips_dspr2 | (Q_UINT64_C(1) << CpuFeatureDSPR2) #endif ; #ifdef Q_ATOMIC_INT64_IS_SUPPORTED extern Q_CORE_EXPORT QBasicAtomicInteger qt_cpu_features[1]; #else extern Q_CORE_EXPORT QBasicAtomicInteger qt_cpu_features[2]; #endif Q_CORE_EXPORT void qDetectCpuFeatures(); static inline quint64 qCpuFeatures() { quint64 features = qt_cpu_features[0].load(); #ifndef Q_ATOMIC_INT64_IS_SUPPORTED features |= quint64(qt_cpu_features[1].load()) << 32; #endif if (Q_UNLIKELY(features == 0)) { qDetectCpuFeatures(); features = qt_cpu_features[0].load(); #ifndef Q_ATOMIC_INT64_IS_SUPPORTED features |= quint64(qt_cpu_features[1].load()) << 32; #endif Q_ASSUME(features != 0); } return features; } #define qCpuHasFeature(feature) ((qCompilerCpuFeatures & (Q_UINT64_C(1) << CpuFeature ## feature)) \ || (qCpuFeatures() & (Q_UINT64_C(1) << CpuFeature ## feature))) #if QT_HAS_BUILTIN(__builtin_clz) && QT_HAS_BUILTIN(__builtin_ctz) && defined(Q_CC_CLANG) && !defined(Q_CC_INTEL) static Q_ALWAYS_INLINE unsigned _bit_scan_reverse(unsigned val) { Q_ASSERT(val != 0); // if val==0, the result is undefined. unsigned result = static_cast(__builtin_clz(val)); // Count Leading Zeros // Now Invert the result: clz will count *down* from the msb to the lsb, so the msb index is 31 // and the lsb inde is 0. The result for _bit_scan_reverse is expected to be the index when // counting up: msb index is 0 (because it starts there), and the lsb index is 31. result ^= sizeof(unsigned) * 8 - 1; return result; } static Q_ALWAYS_INLINE unsigned _bit_scan_forward(unsigned val) { Q_ASSERT(val != 0); // if val==0, the result is undefined. return static_cast(__builtin_ctz(val)); // Count Trailing Zeros } #elif defined(Q_PROCESSOR_X86) // Bit scan functions for x86 # if defined(Q_CC_MSVC) # if defined _WIN32_WCE && _WIN32_WCE < 0x800 extern "C" unsigned char _BitScanForward(unsigned long* Index, unsigned long Mask); extern "C" unsigned char _BitScanReverse(unsigned long* Index, unsigned long Mask); # pragma intrinsic(_BitScanForward) # pragma intrinsic(_BitScanReverse) # endif // MSVC calls it _BitScanReverse and returns the carry flag, which we don't need static __forceinline unsigned long _bit_scan_reverse(uint val) { unsigned long result; _BitScanReverse(&result, val); return result; } static __forceinline unsigned long _bit_scan_forward(uint val) { unsigned long result; _BitScanForward(&result, val); return result; } # elif (defined(Q_CC_CLANG) || (defined(Q_CC_GNU) && Q_CC_GNU < 405)) \ && !defined(Q_CC_INTEL) // Clang is missing the intrinsic for _bit_scan_reverse // GCC only added it in version 4.5 static inline __attribute__((always_inline)) unsigned _bit_scan_reverse(unsigned val) { unsigned result; asm("bsr %1, %0" : "=r" (result) : "r" (val)); return result; } static inline __attribute__((always_inline)) unsigned _bit_scan_forward(unsigned val) { unsigned result; asm("bsf %1, %0" : "=r" (result) : "r" (val)); return result; } # endif #endif // Q_PROCESSOR_X86 #define ALIGNMENT_PROLOGUE_16BYTES(ptr, i, length) \ for (; i < static_cast(qMin(static_cast(length), ((4 - ((reinterpret_cast(ptr) >> 2) & 0x3)) & 0x3))); ++i) QT_END_NAMESPACE #endif // QSIMD_P_H