diff options
-rw-r--r-- | src/corelib/tools/qsimd.cpp | 182 | ||||
-rw-r--r-- | src/corelib/tools/qsimd_p.h | 60 | ||||
-rw-r--r-- | src/gui/painting/qdrawhelper.cpp | 6 |
3 files changed, 159 insertions, 89 deletions
diff --git a/src/corelib/tools/qsimd.cpp b/src/corelib/tools/qsimd.cpp index 91700c053d..825028037f 100644 --- a/src/corelib/tools/qsimd.cpp +++ b/src/corelib/tools/qsimd.cpp @@ -78,28 +78,28 @@ static inline uint detectProcessorFeatures() return 0; } #elif defined (Q_OS_WINCE) -static inline uint detectProcessorFeatures() +static inline quint64 detectProcessorFeatures() { - uint features = 0; + quint64 features = 0; #if defined (ARM) # ifdef PF_ARM_NEON if (IsProcessorFeaturePresent(PF_ARM_NEON)) - features |= ARM_NEON; + features |= Q_UINT64_C(1) << CpuFeatureNEON; # endif #elif defined(_X86_) if (IsProcessorFeaturePresent(PF_XMMI64_INSTRUCTIONS_AVAILABLE)) - features |= SSE2; + features |= Q_UINT64_C(1) << CpuFeatureSSE2; if (IsProcessorFeaturePresent(PF_SSE3_INSTRUCTIONS_AVAILABLE)) - features |= SSE3; + features |= Q_UINT64_C(1) << CpuFeatureSSE3; #endif return features; } #elif defined(Q_PROCESSOR_ARM) -static inline uint detectProcessorFeatures() +static inline quint64 detectProcessorFeatures() { - uint features = 0; + quint64 features = 0; #if defined(Q_OS_LINUX) int auxv = qt_safe_open("/proc/self/auxv", O_RDONLY); @@ -117,7 +117,7 @@ static inline uint detectProcessorFeatures() for (int i = 0; i < max; i += 2) if (vector[i] == AT_HWCAP) { if (vector[i+1] & HWCAP_NEON) - features |= NEON; + features |= Q_UINT64_C(1) << CpuFeatureNEON; break; } } @@ -129,7 +129,7 @@ static inline uint detectProcessorFeatures() #endif #if defined(__ARM_NEON__) - features = NEON; + features = Q_UINT64_C(1) << CpuFeatureNEON; #endif return features; @@ -257,6 +257,8 @@ static quint64 detectProcessorFeatures() AVXState = XMM0_15 | YMM0_15Hi128, AVX512State = AVXState | OpMask | ZMM0_15Hi256 | ZMM16_31 }; + static const quint64 AllAVX2 = (Q_UINT64_C(1) << CpuFeatureAVX2); + static const quint64 AllAVX = (Q_UINT64_C(1) << CpuFeatureAVX) | AllAVX2; quint64 features = 0; int cpuidLevel = maxBasicCpuidSupported(); @@ -269,28 +271,23 @@ static quint64 detectProcessorFeatures() uint cpuid01ECX = 0, cpuid01EDX = 0; cpuidFeatures01(cpuid01ECX, cpuid01EDX); + + // the low 32-bits of features is cpuid01ECX + // note: we need to check OS support for saving the AVX register state + features = cpuid01ECX; + #if defined(Q_PROCESSOR_X86_32) // x86 might not have SSE2 support if (cpuid01EDX & (1u << 26)) - features |= SSE2; + features |= Q_UINT64_C(1) << CpuFeatureSSE2; + else + features &= ~(Q_UINT64_C(1) << CpuFeatureSSE2); // we should verify that the OS enabled saving of the SSE state... #else // x86-64 or x32 - features = SSE2; + features |= Q_UINT64_C(1) << CpuFeatureSSE2; #endif - // common part between 32- and 64-bit - if (cpuid01ECX & (1u)) - features |= SSE3; - if (cpuid01ECX & (1u << 9)) - features |= SSSE3; - if (cpuid01ECX & (1u << 19)) - features |= SSE4_1; - if (cpuid01ECX & (1u << 20)) - features |= SSE4_2; - if (cpuid01ECX & (1u << 25)) - features |= 0; // AES, enable if needed - uint xgetbvA = 0, xgetbvD = 0; if (cpuid01ECX & (1u << 27)) { // XGETBV enabled @@ -298,22 +295,17 @@ static quint64 detectProcessorFeatures() } uint cpuid0700EBX = 0; - if (cpuidLevel >= 7) + if (cpuidLevel >= 7) { cpuidFeatures07_00(cpuid0700EBX); - if ((xgetbvA & AVXState) == AVXState) { - // support for YMM and XMM registers is enabled - if (cpuid01ECX & (1u << 28)) - features |= AVX; - - if (cpuid0700EBX & (1u << 5)) - features |= AVX2; + // the high 32-bits of features is cpuid0700EBX + features |= quint64(cpuid0700EBX) << 32; } - if (cpuid0700EBX & (1u << 4)) - features |= HLE; // Hardware Lock Ellision - if (cpuid0700EBX & (1u << 11)) - features |= RTM; // Restricted Transactional Memory + if ((xgetbvA & AVXState) != AVXState) { + // support for YMM registers is disabled, disable all AVX + features &= ~AllAVX; + } return features; } @@ -430,24 +422,24 @@ static bool procCpuinfoContains(const char *prefix, const char *string) } #endif -static inline uint detectProcessorFeatures() +static inline quint64 detectProcessorFeatures() { // NOTE: MIPS 74K cores are the only ones supporting DSPr2. - uint flags = 0; + quint64 flags = 0; #if defined __mips_dsp - flags |= DSP; + flags |= Q_UINT64_C(1) << CpuFeatureDSP; # if defined __mips_dsp_rev && __mips_dsp_rev >= 2 - flags |= DSPR2; + flags |= Q_UINT64_C(1) << CpuFeatureDSPR2; # elif defined(Q_OS_LINUX) if (procCpuinfoContains("cpu model", "MIPS 74Kc") || procCpuinfoContains("cpu model", "MIPS 74Kf")) - flags |= DSPR2; + flags |= Q_UINT64_C(1) << CpuFeatureDSPR2; # endif #elif defined(Q_OS_LINUX) if (procCpuinfoContains("ASEs implemented", "dsp")) { - flags |= DSP; + flags |= Q_UINT64_C(1) << CpuFeatureDSP; if (procCpuinfoContains("cpu model", "MIPS 74Kc") || procCpuinfoContains("cpu model", "MIPS 74Kf")) - flags |= DSPR2; + flags |= Q_UINT64_C(1) << CpuFeatureDSPR2; } #endif @@ -462,47 +454,115 @@ static inline uint detectProcessorFeatures() #endif /* - * Use kdesdk/scripts/generate_string_table.pl to update the table below. - * Here's the data (don't forget the ONE leading space): + * Use kdesdk/scripts/generate_string_table.pl to update the table below. Note + * that the x86 version has a lot of blanks that must be kept and that the + * offset table's type is changed to make the table smaller. We also remove the + * terminating -1 that the script adds. + */ +// begin generated +#if defined(Q_PROCESSOR_ARM) +/* Data: neon - sse2 + */ +static const char features_string[] = " neon\0"; +static const int features_indices[] = { 0 }; +#elif defined(Q_PROCESSOR_MIPS) +/* Data: + dsp + dspr2 +*/ +static const char features_string[] = + " dsp\0" + " dspr2\0" + "\0"; + +static const int features_indices[] = { + 0, 5 +}; +#elif defined(Q_PROCESSOR_X86) +/* Data: sse3 + sse2 + + + + + + + ssse3 + + + fma + cmpxchg16b + + + + + sse4.1 sse4.2 + + movbe + popcnt + + aes + + avx - avx2 + f16c + rdrand + + + + + bmi hle - rtm - dsp - dspr2 - */ + avx2 -// begin generated + + bmi2 + + + rtm + */ static const char features_string[] = - "\0" - " neon\0" - " sse2\0" " sse3\0" + " sse2\0" " ssse3\0" + " fma\0" + " cmpxchg16b\0" " sse4.1\0" " sse4.2\0" + " movbe\0" + " popcnt\0" + " aes\0" " avx\0" - " avx2\0" + " f16c\0" + " rdrand\0" + " bmi\0" " hle\0" + " avx2\0" + " bmi2\0" " rtm\0" - " dsp\0" - " dspr2\0" "\0"; -static const int features_indices[] = { - 0, 1, 7, 13, 19, 26, 34, 42, - 47, 53, 58, 63, 68, -1 +static const quint8 features_indices[] = { + 0, 6, 5, 5, 5, 5, 5, 5, + 5, 12, 5, 5, 19, 24, 5, 5, + 5, 5, 5, 36, 44, 5, 52, 59, + 5, 67, 5, 5, 72, 77, 83, 5, + 5, 5, 5, 91, 96, 101, 5, 5, + 107, 5, 5, 113 }; +#else +static const char features_string[] = ""; +static const int features_indices[] = { }; +#endif // end generated -static const int features_count = (sizeof features_indices - 1) / (sizeof features_indices[0]); +static const int features_count = (sizeof features_indices) / (sizeof features_indices[0]); // record what CPU features were enabled by default in this Qt build static const quint64 minFeature = qCompilerCpuFeatures; diff --git a/src/corelib/tools/qsimd_p.h b/src/corelib/tools/qsimd_p.h index 6ca3836ca9..b815e976a7 100644 --- a/src/corelib/tools/qsimd_p.h +++ b/src/corelib/tools/qsimd_p.h @@ -245,18 +245,29 @@ QT_BEGIN_NAMESPACE enum CPUFeatures { - NEON = 0x2, ARM_NEON = NEON, - SSE2 = 0x4, - SSE3 = 0x8, - SSSE3 = 0x10, - SSE4_1 = 0x20, - SSE4_2 = 0x40, - AVX = 0x80, - AVX2 = 0x100, - HLE = 0x200, - RTM = 0x400, - DSP = 0x800, - DSPR2 = 0x1000, +#if defined(Q_PROCESSOR_ARM) + CpuFeatureNEON = 0, + CpuFeatureARM_NEON = CpuFeatureNEON, +#elif defined(Q_PROCESSOR_MIPS) + CpuFeatureDSP = 0, + CpuFeatureDSPR2 = 1, +#elif defined(Q_PROCESSOR_X86) + // The order of the flags is jumbled so it matches most closely the bits in CPUID + // Out of order: + CpuFeatureSSE2 = 1, // uses the bit for PCLMULQDQ + // in level 1, ECX + CpuFeatureSSE3 = (0 + 0), + CpuFeatureSSSE3 = (0 + 9), + CpuFeatureSSE4_1 = (0 + 19), + CpuFeatureSSE4_2 = (0 + 20), + CpuFeatureAES = (0 + 25), + CpuFeatureAVX = (0 + 28), + + // in level 7, leaf 0, EBX + CpuFeatureHLE = (32 + 4), + CpuFeatureAVX2 = (32 + 5), + CpuFeatureRTM = (32 + 11), +#endif // used only to indicate that the CPU detection was initialised QSimdInitialized = 0x80000000 @@ -264,37 +275,37 @@ enum CPUFeatures { static const uint qCompilerCpuFeatures = 0 #if defined __RTM__ - | RTM + | (Q_UINT64_C(1) << CpuFeatureRTM) #endif #if defined __AVX2__ - | AVX2 + | (Q_UINT64_C(1) << CpuFeatureAVX2) #endif #if defined __AVX__ - | AVX + | (Q_UINT64_C(1) << CpuFeatureAVX) #endif #if defined __SSE4_2__ - | SSE4_2 + | (Q_UINT64_C(1) << CpuFeatureSSE4_2) #endif #if defined __SSE4_1__ - | SSE4_1 + | (Q_UINT64_C(1) << CpuFeatureSSE4_1) #endif #if defined __SSSE3__ - | SSSE3 + | (Q_UINT64_C(1) << CpuFeatureSSSE3) #endif #if defined __SSE3__ - | SSE3 + | (Q_UINT64_C(1) << CpuFeatureSSE3) #endif #if defined __SSE2__ - | SSE2 + | (Q_UINT64_C(1) << CpuFeatureSSE2) #endif #if defined __ARM_NEON__ - | NEON + | (Q_UINT64_C(1) << CpuFeatureNEON) #endif #if defined __mips_dsp - | DSP + | (Q_UINT64_C(1) << CpuFeatureDSP) #endif #if defined __mips_dspr2 - | DSPR2 + | (Q_UINT64_C(1) << CpuFeatureDSPR2) #endif ; @@ -322,7 +333,8 @@ static inline quint64 qCpuFeatures() return features; } -#define qCpuHasFeature(feature) ((qCompilerCpuFeatures & (feature)) || (qCpuFeatures() & (feature))) +#define qCpuHasFeature(feature) ((qCompilerCpuFeatures & (Q_UINT64_C(1) << CpuFeature ## feature)) \ + || (qCpuFeatures() & (Q_UINT64_C(1) << CpuFeature ## feature))) #ifdef Q_PROCESSOR_X86 // Bit scan functions for x86 diff --git a/src/gui/painting/qdrawhelper.cpp b/src/gui/painting/qdrawhelper.cpp index 0cf7e20605..64a363868a 100644 --- a/src/gui/painting/qdrawhelper.cpp +++ b/src/gui/painting/qdrawhelper.cpp @@ -6326,8 +6326,6 @@ template<QtPixelOrder> const uint *QT_FASTCALL convertA2RGB30PMFromARGB32PM_sse4 void qInitDrawhelperAsm() { - const uint features = qCpuFeatures(); - Q_UNUSED(features); #ifdef __SSE2__ qDrawHelper[QImage::Format_RGB32].bitmapBlit = qt_bitmapblit32_sse2; qDrawHelper[QImage::Format_ARGB32].bitmapBlit = qt_bitmapblit32_sse2; @@ -6372,7 +6370,7 @@ void qInitDrawhelperAsm() qt_fetch_radial_gradient = qt_fetch_radial_gradient_sse2; #ifdef QT_COMPILER_SUPPORTS_SSSE3 - if (features & SSSE3) { + if (qCpuHasFeature(SSSE3)) { extern void qt_blend_argb32_on_argb32_ssse3(uchar *destPixels, int dbpl, const uchar *srcPixels, int sbpl, int w, int h, @@ -6466,7 +6464,7 @@ void qInitDrawhelperAsm() #endif // Q_PROCESSOR_MIPS_32 #if defined(QT_COMPILER_SUPPORTS_MIPS_DSP) || defined(QT_COMPILER_SUPPORTS_MIPS_DSPR2) - if (features & (DSP | DSPR2)) { + if (qCpuHasFeature(DSP) && qCpuHasFeature(DSPR2)) { // Composition functions are all DSP r1 qt_functionForMode_C[QPainter::CompositionMode_SourceOver] = comp_func_SourceOver_asm_mips_dsp; qt_functionForMode_C[QPainter::CompositionMode_Source] = comp_func_Source_mips_dsp; |