From 689e8055f5c4f3bbdbb9a90ac0405b5295f81374 Mon Sep 17 00:00:00 2001 From: Thiago Macieira Date: Tue, 6 Aug 2013 19:32:37 -0700 Subject: Add support for single-file multi-target intrinsics in Qt GCC 4.9 now allows us to #include any and all intrinsics headers, not just the one for which we're compiling code, a behavior that ICC and MSVC have had for some time. With that, we're able to have the functions for different targets in the same source file. See the GCC manual: http://gcc.gnu.org/onlinedocs/gcc/Function-Multiversioning.html This functionality is notified by the QT_COMPILER_SUPPORTS_HERE(XXX) macro, which indicates that all the intrinsics from QT_COMPILER_SUPPORTS_xxx are available and enabled. To complement, a QT_COMPILER_SUPPORTS(XXX) macro is also added. Unlike ICC and MSVC, GCC requires a special function attribute, which will also cause code optimization. That's the QT_FUNCTION_TARGET macro. Note: because of the absence of the target attribute, ICC and MSVC will not generate instructions with the VEX prefix unless they only exist with the VEX prefix or if -mavx / -arch:AVX are enabled. Change-Id: I0c1880c20324bd8e0fc68a863e36d1fa7755dff0 Reviewed-by: Allan Sandfeld Jensen --- configure | 2 +- src/corelib/tools/qhash.cpp | 5 +- src/corelib/tools/qsimd_p.h | 119 +++++++++++++++++++++++++++++++++------ tools/configure/configureapp.cpp | 16 +++--- 4 files changed, 114 insertions(+), 28 deletions(-) diff --git a/configure b/configure index bb52464e52..9bb09bc36d 100755 --- a/configure +++ b/configure @@ -6035,7 +6035,7 @@ for SUBARCH in SSE2 SSE3 SSSE3 SSE4_1 SSE4_2 AVX AVX2 \ eval "VAL=\$CFG_$SUBARCH" case "$VAL" in yes) - echo "#define QT_COMPILER_SUPPORTS_$SUBARCH" \ + echo "#define QT_COMPILER_SUPPORTS_$SUBARCH 1" \ >>"$outpath/src/corelib/global/qconfig.h.new" ;; esac diff --git a/src/corelib/tools/qhash.cpp b/src/corelib/tools/qhash.cpp index e227a483c4..8e4f0f6407 100644 --- a/src/corelib/tools/qhash.cpp +++ b/src/corelib/tools/qhash.cpp @@ -94,13 +94,14 @@ QT_BEGIN_NAMESPACE (for instance, gcc 4.4 does that even at -O0). */ -#ifdef __SSE4_2__ +#if QT_COMPILER_SUPPORTS_HERE(SSE4_2) static inline bool hasFastCrc32() { - return true; + return qCpuHasFeature(SSE4_2); } template +QT_FUNCTION_TARGET(SSE4_2) static uint crc32(const Char *ptr, size_t len, uint h) { // The CRC32 instructions from Nehalem calculate a 32-bit CRC32 checksum diff --git a/src/corelib/tools/qsimd_p.h b/src/corelib/tools/qsimd_p.h index f4ca971567..9ace4d7409 100644 --- a/src/corelib/tools/qsimd_p.h +++ b/src/corelib/tools/qsimd_p.h @@ -50,6 +50,7 @@ * They mean the compiler supports the necessary flags and the headers * for the x86 and ARM intrinsics: * - GCC: the -mXXX or march=YYY flag is necessary before #include + * up to 4.8; GCC >= 4.9 can include unconditionally * - Intel CC: #include can happen unconditionally * - MSVC: #include can happen unconditionally * - RVCT: ??? @@ -60,25 +61,99 @@ * up do define __AVX__ if the -arch:AVX option is passed on the command-line. * * Supported XXX are: - * Flag | Arch | GCC | Intel CC | MSVC | - * NEON | ARM | I & C | None | ? | - * IWMMXT | ARM | I & C | None | I & C | - * SSE2 | x86 | I & C | I & C | I & C | - * SSE3 | x86 | I & C | I & C | I only | - * SSSE3 | x86 | I & C | I & C | I only | - * SSE4_1 | x86 | I & C | I & C | I only | - * SSE4_2 | x86 | I & C | I & C | I only | - * AVX | x86 | I & C | I & C | I & C | - * AVX2 | x86 | I & C | I & C | I only | + * Flag | Arch | GCC | Intel CC | MSVC | + * ARM_NEON | ARM | I & C | None | ? | + * IWMMXT | ARM | I & C | None | I & C | + * SSE2 | x86 | I & C | I & C | I & C | + * SSE3 | x86 | I & C | I & C | I only | + * SSSE3 | x86 | I & C | I & C | I only | + * SSE4_1 | x86 | I & C | I & C | I only | + * SSE4_2 | x86 | I & C | I & C | I only | + * AVX | x86 | I & C | I & C | I & C | + * AVX2 | x86 | I & C | I & C | I only | * I = intrinsics; C = code generation + * + * Code can use the following constructs to determine compiler support & status: + * - #ifdef __XXX__ (e.g: #ifdef __AVX__ or #ifdef __ARM_NEON__) + * If this test passes, then the compiler is already generating code for that + * given sub-architecture. The intrinsics for that sub-architecture are + * #included and can be used without restriction or runtime check. + * + * - #if QT_COMPILER_SUPPORTS(XXX) + * If this test passes, then the compiler is able to generate code for that + * given sub-architecture in another translation unit, given the right set of + * flags. Use of the intrinsics is not guaranteed. This is useful with + * runtime detection (see below). + * + * - #if QT_COMPILER_SUPPORTS_HERE(XXX) + * If this test passes, then the compiler is able to generate code for that + * given sub-architecture in this translation unit, even if it is not doing + * that now (it might be). Individual functions may be tagged with + * QT_FUNCTION_TARGET(XXX) to cause the compiler to generate code for that + * sub-arch. Only inside such functions is the use of the intrisics + * guaranteed to work. This is useful with runtime detection (see below). + * + * Runtime detection of a CPU sub-architecture can be done with the + * qCpuHasFeature(XXX) function. There are two strategies for generating + * optimized code like that: + * + * 1) place the optimized code in a different translation unit (C or assembly + * sources) and pass the correct flags to the compiler to enable support. Those + * sources must not include qglobal.h, which means they cannot include this + * file either. The dispatcher function would look like this: + * + * void foo() + * { + * #if QT_COMPILER_SUPPORTS(XXX) + * if (qCpuHasFeature(XXX)) { + * foo_optimized_xxx(); + * return; + * } + * #endif + * foo_plain(); + * } + * + * 2) place the optimized code in a function tagged with QT_FUNCTION_TARGET and + * surrounded by #if QT_COMPILER_SUPPORTS_HERE(XXX). That code can freely use + * other Qt code. The dispatcher function would look like this: + * + * void foo() + * { + * #if QT_COMPILER_SUPPORTS_HERE(XXX) + * if (qCpuHasFeature(XXX)) { + * foo_optimized_xxx(); + * return; + * } + * #endif + * foo_plain(); + * } */ #if defined(__MINGW64_VERSION_MAJOR) || (defined(Q_CC_MSVC) && !defined(Q_OS_WINCE)) #include #endif +#define QT_COMPILER_SUPPORTS(x) (QT_COMPILER_SUPPORTS_ ## x - 0) + +#if (defined(Q_CC_INTEL) || defined(Q_CC_MSVC) \ + || (defined(Q_CC_GNU) && !defined(Q_CC_CLANG) && (__GNUC__-0) * 100 + (__GNUC_MINOR__-0) >= 409)) \ + && !defined(QT_BOOTSTRAPPED) +# define QT_COMPILER_SUPPORTS_SIMD_ALWAYS +# define QT_COMPILER_SUPPORTS_HERE(x) QT_COMPILER_SUPPORTS(x) +# if defined(Q_CC_GNU) && !defined(Q_CC_INTEL) + /* GCC requires attributes for a function */ +# define QT_FUNCTION_TARGET(x) __attribute__((__target__(QT_FUNCTION_TARGET_STRING_ ## x))) +# else +# define QT_FUNCTION_TARGET(x) +# endif +#else +# define QT_COMPILER_SUPPORTS_HERE(x) defined(__ ## x ## __) +# define QT_FUNCTION_TARGET(x) +#endif + // SSE intrinsics -#if defined(__SSE2__) || (defined(QT_COMPILER_SUPPORTS_SSE2) && defined(Q_CC_MSVC)) +#define QT_FUNCTION_TARGET_STRING_SSE2 "sse2" +#if defined(__SSE2__) || (defined(QT_COMPILER_SUPPORTS_SSE2) && defined(QT_COMPILER_SUPPORTS_SIMD_ALWAYS)) #if defined(QT_LINUXBASE) || defined(Q_OS_ANDROID_NO_SDK) /// this is an evil hack - the posix_memalign declaration in LSB /// is wrong - see http://bugs.linuxbase.org/show_bug.cgi?id=2431 @@ -95,27 +170,33 @@ #endif // SSE3 intrinsics -#if defined(__SSE3__) || (defined(QT_COMPILER_SUPPORTS_SSE3) && defined(Q_CC_MSVC)) +#define QT_FUNCTION_TARGET_STRING_SSE3 "sse3" +#if defined(__SSE3__) || (defined(QT_COMPILER_SUPPORTS_SSE3) && defined(QT_COMPILER_SUPPORTS_SIMD_ALWAYS)) #include #endif // SSSE3 intrinsics -#if defined(__SSSE3__) || (defined(QT_COMPILER_SUPPORTS_SSSE3) && defined(Q_CC_MSVC)) +#define QT_FUNCTION_TARGET_STRING_SSSE3 "ssse3" +#if defined(__SSSE3__) || (defined(QT_COMPILER_SUPPORTS_SSSE3) && defined(QT_COMPILER_SUPPORTS_SIMD_ALWAYS)) #include #endif // SSE4.1 intrinsics -#if defined(__SSE4_1__) || (defined(QT_COMPILER_SUPPORTS_SSE4_1) && defined(Q_CC_MSVC)) +#define QT_FUNCTION_TARGET_STRING_SSE4_1 "sse4.1" +#if defined(__SSE4_1__) || (defined(QT_COMPILER_SUPPORTS_SSE4_1) && defined(QT_COMPILER_SUPPORTS_SIMD_ALWAYS)) #include #endif // SSE4.2 intrinsics -#if defined(__SSE4_2__) || (defined(QT_COMPILER_SUPPORTS_SSE4_2) && defined(Q_CC_MSVC)) +#define QT_FUNCTION_TARGET_STRING_SSE4_2 "sse4.2" +#if defined(__SSE4_2__) || (defined(QT_COMPILER_SUPPORTS_SSE4_2) && defined(QT_COMPILER_SUPPORTS_SIMD_ALWAYS)) #include #endif // AVX intrinsics -#if defined(__AVX__) || (defined(QT_COMPILER_SUPPORTS_AVX) && defined(Q_CC_MSVC)) +#define QT_FUNCTION_TARGET_STRING_AVX "avx" +#define QT_FUNCTION_TARGET_STRING_AVX2 "avx2" +#if defined(__AVX__) || (defined(QT_COMPILER_SUPPORTS_AVX) && defined(QT_COMPILER_SUPPORTS_SIMD_ALWAYS)) // immintrin.h is the ultimate header, we don't need anything else after this #include @@ -147,8 +228,10 @@ #endif // NEON intrinsics +// note: as of GCC 4.9, does not support function targets for ARM #if defined __ARM_NEON__ #include +#define QT_FUNCTION_TARGET_STRING_ARM_NEON "neon" #endif @@ -169,12 +252,14 @@ #endif #endif +#undef QT_COMPILER_SUPPORTS_SIMD_ALWAYS + QT_BEGIN_NAMESPACE enum CPUFeatures { IWMMXT = 0x1, - NEON = 0x2, + NEON = 0x2, ARM_NEON = NEON, SSE2 = 0x4, SSE3 = 0x8, SSSE3 = 0x10, diff --git a/tools/configure/configureapp.cpp b/tools/configure/configureapp.cpp index 37bf576d37..06083f2592 100644 --- a/tools/configure/configureapp.cpp +++ b/tools/configure/configureapp.cpp @@ -3476,21 +3476,21 @@ void Configure::generateConfigfiles() tmpStream << endl << "// Compiler sub-arch support" << endl; if (dictionary[ "SSE2" ] == "yes") - tmpStream << "#define QT_COMPILER_SUPPORTS_SSE2" << endl; + tmpStream << "#define QT_COMPILER_SUPPORTS_SSE2 1" << endl; if (dictionary[ "SSE3" ] == "yes") - tmpStream << "#define QT_COMPILER_SUPPORTS_SSE3" << endl; + tmpStream << "#define QT_COMPILER_SUPPORTS_SSE3 1" << endl; if (dictionary[ "SSSE3" ] == "yes") - tmpStream << "#define QT_COMPILER_SUPPORTS_SSSE3" << endl; + tmpStream << "#define QT_COMPILER_SUPPORTS_SSSE3 1" << endl; if (dictionary[ "SSE4_1" ] == "yes") - tmpStream << "#define QT_COMPILER_SUPPORTS_SSE4_1" << endl; + tmpStream << "#define QT_COMPILER_SUPPORTS_SSE4_1 1" << endl; if (dictionary[ "SSE4_2" ] == "yes") - tmpStream << "#define QT_COMPILER_SUPPORTS_SSE4_2" << endl; + tmpStream << "#define QT_COMPILER_SUPPORTS_SSE4_2 1" << endl; if (dictionary[ "AVX" ] == "yes") - tmpStream << "#define QT_COMPILER_SUPPORTS_AVX" << endl; + tmpStream << "#define QT_COMPILER_SUPPORTS_AVX 1" << endl; if (dictionary[ "AVX2" ] == "yes") - tmpStream << "#define QT_COMPILER_SUPPORTS_AVX2" << endl; + tmpStream << "#define QT_COMPILER_SUPPORTS_AVX2 1" << endl; if (dictionary[ "IWMMXT" ] == "yes") - tmpStream << "#define QT_COMPILER_SUPPORTS_IWMMXT" << endl; + tmpStream << "#define QT_COMPILER_SUPPORTS_IWMMXT 1" << endl; if (dictionary["QREAL"] != "double") tmpStream << "#define QT_COORD_TYPE " << dictionary["QREAL"] << endl; -- cgit v1.2.3