From cf63b0e1dfc0bf3d11a92c5bf82840ddb6bb22ac Mon Sep 17 00:00:00 2001
From: Thiago Macieira <thiago.macieira@intel.com>
Date: Sun, 17 Sep 2017 12:39:35 -0700
Subject: qsimd: add support for new x86 CPU features
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This adds detection for: VAES, GFNI, AVX512VBMI2, AVX512VNNI,
AVX512BITALG, AVX512VPOPCNTDQ, AVX512_4NNIW, AVX512_4FMAPS. These
features were found in the "Intel® Architecture Instruction Set
Extensions and Future Features" manual, revision 30. This commit also
adds support for RDPID (already in the main manual) and the Control-flow
Enforcement Technology, which appears in a separate Intel paper.

This new support was implemented by adding a generator script so we don't
have to keep two tables in sync: one in qsimd.cpp with the feature names
and the other in qsimd_p.h.

Since we now need a lot more bits, it's no longer worth keeping the two
halves of the qt_cpu_features variable mostly similar to the main two
CPUID results. This commit goes back to keeping things in order, like we
used to prior to commit 6a8251a89b6a61258498f4af1ba7b3d5b7f7096c (Qt 5.6).

At the time of this commit, GCC 8 has macros for AVX512VPOPCNTDQ,
AVX512_4NNIW, AVX512_4FMAPS, AVX512VBMI2 and GFNI.

Change-Id: I938b024e38bf4aac9154fffd14f7afae50faaa96
Reviewed-by: Edward Welbourne <edward.welbourne@qt.io>
Reviewed-by: Lars Knoll <lars.knoll@qt.io>
---
 src/corelib/tools/qsimd.cpp | 252 ++++++++++++--------------------------------
 1 file changed, 65 insertions(+), 187 deletions(-)

(limited to 'src/corelib/tools/qsimd.cpp')

diff --git a/src/corelib/tools/qsimd.cpp b/src/corelib/tools/qsimd.cpp
index c4d7117449..fd9c6a7079 100644
--- a/src/corelib/tools/qsimd.cpp
+++ b/src/corelib/tools/qsimd.cpp
@@ -1,7 +1,7 @@
 /****************************************************************************
 **
 ** Copyright (C) 2016 The Qt Company Ltd.
-** Copyright (C) 2016 Intel Corporation.
+** Copyright (C) 2018 Intel Corporation.
 ** Contact: https://www.qt.io/licensing/
 **
 ** This file is part of the QtCore module of the Qt Toolkit.
@@ -80,6 +80,43 @@
 
 QT_BEGIN_NAMESPACE
 
+/*
+ * Use kdesdk/scripts/generate_string_table.pl to update the table below. Note
+ * we remove the terminating -1 that the script adds.
+ */
+
+// begin generated
+#if defined(Q_PROCESSOR_ARM)
+/* Data:
+ neon
+ crc32
+ */
+static const char features_string[] =
+        " neon\0"
+        " crc32\0"
+        "\0";
+static const int features_indices[] = { 0, 6 };
+#elif defined(Q_PROCESSOR_MIPS)
+/* Data:
+ dsp
+ dspr2
+*/
+static const char features_string[] =
+    " dsp\0"
+    " dspr2\0"
+    "\0";
+
+static const int features_indices[] = {
+       0,    5
+};
+#elif defined(Q_PROCESSOR_X86)
+#  include "qsimd_x86.cpp"                  // generated by util/x86simdgen
+#else
+static const char features_string[] = "";
+static const int features_indices[] = { };
+#endif
+// end generated
+
 #if defined (Q_OS_NACL)
 static inline uint detectProcessorFeatures()
 {
@@ -222,29 +259,32 @@ static void cpuidFeatures01(uint &ecx, uint &edx)
 inline void __cpuidex(int info[4], int, __int64) { memset(info, 0, 4*sizeof(int));}
 #endif
 
-static void cpuidFeatures07_00(uint &ebx, uint &ecx)
+static void cpuidFeatures07_00(uint &ebx, uint &ecx, uint &edx)
 {
 #if defined(Q_CC_GNU)
     qregisteruint rbx; // in case it's 64-bit
     qregisteruint rcx = 0;
+    qregisteruint rdx = 0;
     asm ("xchg " PICreg", %0\n"
          "cpuid\n"
          "xchg " PICreg", %0\n"
-        : "=&r" (rbx), "+&c" (rcx)
-        : "a" (7)
-        : "%edx");
+        : "=&r" (rbx), "+&c" (rcx), "+&d" (rdx)
+        : "a" (7));
     ebx = rbx;
     ecx = rcx;
+    edx = rdx;
 #elif defined(Q_OS_WIN)
     int info[4];
     __cpuidex(info, 7, 0);
     ebx = info[1];
     ecx = info[2];
+    edx = info[3];
 #elif defined(Q_CC_GHS)
     unsigned int info[4];
     __CPUIDEX(7, 0, info);
     ebx = info[1];
     ecx = info[2];
+    edx = info[3];
 #endif
 }
 
@@ -285,8 +325,11 @@ static quint64 detectProcessorFeatures()
     static const quint64 AllAVX512 = (Q_UINT64_C(1) << CpuFeatureAVX512F) | (Q_UINT64_C(1) << CpuFeatureAVX512CD) |
             (Q_UINT64_C(1) << CpuFeatureAVX512ER) | (Q_UINT64_C(1) << CpuFeatureAVX512PF) |
             (Q_UINT64_C(1) << CpuFeatureAVX512BW) | (Q_UINT64_C(1) << CpuFeatureAVX512DQ) |
-            (Q_UINT64_C(1) << CpuFeatureAVX512VL) |
-            (Q_UINT64_C(1) << CpuFeatureAVX512IFMA) | (Q_UINT64_C(1) << CpuFeatureAVX512VBMI);
+            (Q_UINT64_C(1) << CpuFeatureAVX512VL) | (Q_UINT64_C(1) << CpuFeatureAVX512IFMA) |
+            (Q_UINT64_C(1) << CpuFeatureAVX512VBMI) | (Q_UINT64_C(1) << CpuFeatureAVX512VBMI2) |
+            (Q_UINT64_C(1) << CpuFeatureAVX512VNNI) | (Q_UINT64_C(1) << CpuFeatureAVX512BITALG) |
+            (Q_UINT64_C(1) << CpuFeatureAVX512VPOPCNTDQ) |
+            (Q_UINT64_C(1) << CpuFeatureAVX5124NNIW) | (Q_UINT64_C(1) << CpuFeatureAVX5124FMAPS);
     static const quint64 AllAVX2 = (Q_UINT64_C(1) << CpuFeatureAVX2) | AllAVX512;
     static const quint64 AllAVX = (Q_UINT64_C(1) << CpuFeatureAVX) | AllAVX2;
 
@@ -299,52 +342,33 @@ static quint64 detectProcessorFeatures()
     Q_ASSERT(cpuidLevel >= 1);
 #endif
 
-    uint cpuid01ECX = 0, cpuid01EDX = 0;
-    cpuidFeatures01(cpuid01ECX, cpuid01EDX);
-
-    // the low 32-bits of features is cpuid01ECX
-    // note: we need to check OS support for saving the AVX register state
-    features = cpuid01ECX;
-
-#if defined(Q_PROCESSOR_X86_32)
-    // x86 might not have SSE2 support
-    if (cpuid01EDX & (1u << 26))
-        features |= Q_UINT64_C(1) << CpuFeatureSSE2;
-    else
-        features &= ~(Q_UINT64_C(1) << CpuFeatureSSE2);
-    // we should verify that the OS enabled saving of the SSE state...
-#else
-    // x86-64 or x32
-    features |= Q_UINT64_C(1) << CpuFeatureSSE2;
-#endif
+    uint results[X86CpuidMaxLeaf] = {};
+    cpuidFeatures01(results[Leaf1ECX], results[Leaf1EDX]);
+    if (cpuidLevel >= 7)
+        cpuidFeatures07_00(results[Leaf7_0EBX], results[Leaf7_0ECX], results[Leaf7_0EDX]);
+
+    // populate our feature list
+    for (uint i = 0; i < sizeof(x86_locators) / sizeof(x86_locators[0]); ++i) {
+        uint word = x86_locators[i] / 32;
+        uint bit = 1U << (x86_locators[i] % 32);
+        quint64 feature = Q_UINT64_C(1) << (i + 1);
+        if (results[word] & bit)
+            features |= feature;
+    }
 
+    // now check the AVX state
     uint xgetbvA = 0, xgetbvD = 0;
-    if (cpuid01ECX & (1u << 27)) {
+    if (results[Leaf1ECX] & (1u << 27)) {
         // XGETBV enabled
         xgetbv(0, xgetbvA, xgetbvD);
     }
 
-    uint cpuid0700EBX = 0;
-    uint cpuid0700ECX = 0;
-    if (cpuidLevel >= 7) {
-        cpuidFeatures07_00(cpuid0700EBX, cpuid0700ECX);
-
-        // the high 32-bits of features is cpuid0700EBX
-        features |= quint64(cpuid0700EBX) << 32;
-    }
-
     if ((xgetbvA & AVXState) != AVXState) {
         // support for YMM registers is disabled, disable all AVX
         features &= ~AllAVX;
     } else if ((xgetbvA & AVX512State) != AVX512State) {
         // support for ZMM registers or mask registers is disabled, disable all AVX512
         features &= ~AllAVX512;
-    } else {
-        // this feature is out of order
-        if (cpuid0700ECX & (1u << 1))
-            features |= Q_UINT64_C(1) << CpuFeatureAVX512VBMI;
-        else
-            features &= ~(Q_UINT64_C(1) << CpuFeatureAVX512VBMI);
     }
 
     return features;
@@ -493,152 +517,6 @@ static inline uint detectProcessorFeatures()
 }
 #endif
 
-/*
- * Use kdesdk/scripts/generate_string_table.pl to update the table below. Note
- * that the x86 version has a lot of blanks that must be kept and that the
- * offset table's type is changed to make the table smaller. We also remove the
- * terminating -1 that the script adds.
- */
-
-// begin generated
-#if defined(Q_PROCESSOR_ARM)
-/* Data:
- neon
- crc32
- */
-static const char features_string[] =
-        " neon\0"
-        " crc32\0"
-        "\0";
-static const int features_indices[] = { 0, 6 };
-#elif defined(Q_PROCESSOR_MIPS)
-/* Data:
- dsp
- dspr2
-*/
-static const char features_string[] =
-    " dsp\0"
-    " dspr2\0"
-    "\0";
-
-static const int features_indices[] = {
-       0,    5
-};
-#elif defined(Q_PROCESSOR_X86)
-/* Data:
- sse3
- sse2
- avx512vbmi
-
-
-
-
-
-
- ssse3
-
-
- fma
- cmpxchg16b
-
-
-
-
-
- sse4.1
- sse4.2
-
- movbe
- popcnt
-
- aes
-
-
- avx
- f16c
- rdrand
-
-
-
-
- bmi
- hle
- avx2
-
-
- bmi2
-
-
- rtm
-
-
-
-
- avx512f
- avx512dq
- rdseed
-
-
- avx512ifma
-
-
-
-
- avx512pf
- avx512er
- avx512cd
- sha
- avx512bw
- avx512vl
- */
-static const char features_string[] =
-    " sse3\0"
-    " sse2\0"
-    " avx512vbmi\0"
-    " ssse3\0"
-    " fma\0"
-    " cmpxchg16b\0"
-    " sse4.1\0"
-    " sse4.2\0"
-    " movbe\0"
-    " popcnt\0"
-    " aes\0"
-    " avx\0"
-    " f16c\0"
-    " rdrand\0"
-    " bmi\0"
-    " hle\0"
-    " avx2\0"
-    " bmi2\0"
-    " rtm\0"
-    " avx512f\0"
-    " avx512dq\0"
-    " rdseed\0"
-    " avx512ifma\0"
-    " avx512pf\0"
-    " avx512er\0"
-    " avx512cd\0"
-    " sha\0"
-    " avx512bw\0"
-    " avx512vl\0"
-    "\0";
-
-static const quint8 features_indices[] = {
-    0,    6,   12,    5,    5,    5,    5,    5,
-    5,   24,    5,    5,   31,   36,    5,    5,
-    5,    5,    5,   48,   56,    5,   64,   71,
-    5,   79,    5,    5,   84,   89,   95,    5,
-    5,    5,    5,  103,  108,  113,    5,    5,
-  119,    5,    5,  125,    5,    5,    5,    5,
-  130,  139,  149,    5,    5,  157,    5,    5,
-    5,    5,  169,  179,  189,  199,  204,  214
-};
-#else
-static const char features_string[] = "";
-static const int features_indices[] = { };
-#endif
-// end generated
-
 static const int features_count = (sizeof features_indices) / (sizeof features_indices[0]);
 
 // record what CPU features were enabled by default in this Qt build
-- 
cgit v1.2.3