convertDoubleTo<FP>: add support for x86 intrinsics

The x86 SSE instructions do what we want and set a flag in the MXCSR to indicate whether we should return false, so we can improve the codegen.

For QStringView::toFloat(), GCC generates:

        call    _ZNK11QStringView8toDoubleEPb
        movl    $8064, 12(%rsp)
        movl    $1, %eax
    #APP
    # 330 "/home/tjmaciei/src/qt/qt6/qtbase/src/corelib/global/qnumeric_p.h" 1
        vldmxcsr  12(%rsp)
        vcvtsd2ss %xmm0, %xmm0, %xmm0
        vstmxcsr  12(%rsp)
    # 0 "" 2
    #NO_APP
        movl    12(%rsp), %edx
        testb   $24, %dl
        je      .L2120
        ... handling of the under/overflow ...

The MXCSR instructions do need to read and write from memory, but the stack is usually already in L1 and CPUs have special optimizations for it.

There are two alternatives to the chosen implementation. First, we could confirm there was no underflow or overflow using an expression like:

    (v == 0) == (*value == 0) && qt_is_finite(v) == qt_is_finite(*value);

But that is still very costly, with 4 UCOMISx instructions and several memory loads.

Second, we could use the VFPCLASSSD and VFPCLASSSS (yes, 4 "S") instructions to confirm whether a finite input became zero or non-finite, but a) that's only available with AVX512, so of little practical use today and b) it has a 3-cycle latency. Like the comparisons above, we'd need 4 of them.

Change-Id: I01ec3c774d9943adb903fffd17b8b9cb2ce805ce
Reviewed-by: Edward Welbourne <edward.welbourne@qt.io>
author: Thiago Macieira <thiago.macieira@intel.com> 2024-03-01 11:12:20 -0800
committer: Thiago Macieira <thiago.macieira@intel.com> 2024-03-12 18:23:20 -0700
commit: c86e1758dda16852f81ec542713ae09ec0779fc1 (patch)
tree: 5b236e1efec5e1d346e857b3fd568d820aff57ca
parent: 45fd36f1480a6229879a4e59236ffa1d1d22dfbf (diff)
1 files changed, 48 insertions, 1 deletions
diff --git a/src/corelib/global/qnumeric_p.h b/src/corelib/global/qnumeric_p.h
index bdace21000..64fb69c8f3 100644
--- a/src/corelib/global/qnumeric_p.h
+++ b/src/corelib/global/qnumeric_p.h
@@ -23,6 +23,10 @@
 #include <limits>
 #include <type_traits>
 
+#ifndef __has_extension
+#  define __has_extension(X)    0
+#endif
+
 #if !defined(Q_CC_MSVC) && defined(Q_OS_QNX)
 #  include <math.h>
 #  ifdef isnan
@@ -281,7 +285,7 @@ QT_WARNING_DISABLE_FLOAT_COMPARE
 QT_WARNING_POP
 }
 
-template <typename T> static inline
+template <typename T> static
 std::enable_if_t<std::is_floating_point_v<T> || std::is_same_v<T, qfloat16>, bool>
 convertDoubleTo(double v, T *value, bool allow_precision_upgrade = true)
 {
@@ -294,6 +298,49 @@ convertDoubleTo(double v, T *value, bool allow_precision_upgrade = true)
         *value = T(v);
         return true;
     }
+
+#if defined(__SSE2__) && (defined(Q_CC_GNU) || __has_extension(gnu_asm))
+    // The x86 CVTSD2SS instruction from SSE2 does what we want:
+    // - converts out-of-range doubles to ±infinity and sets #O
+    // - converts underflows to zero and sets #U
+    // We need to clear any previously-stored exceptions from it before the
+    // operation (3-cycle cost) and obtain the new state afterwards (1 cycle).
+
+    unsigned csr = _MM_MASK_MASK;         // clear stored exception indicators
+    auto sse_check_result = [&](auto result) {
+        if ((csr & (_MM_EXCEPT_UNDERFLOW | _MM_EXCEPT_OVERFLOW)) == 0)
+            return true;
+        if (csr & _MM_EXCEPT_OVERFLOW)
+            return false;
+
+        // According to IEEE 754[1], #U is also set when the result is tiny and
+        // inexact, but still non-zero, so detect that (this won't generate
+        // good code for types without hardware support).
+        // [1] https://en.wikipedia.org/wiki/Floating-point_arithmetic#Exception_handling
+        return result != 0;
+    };
+
+    // Written directly in assembly because both Clang and GCC have been
+    // observed to reorder the STMXCSR instruction above the conversion
+    // operation. MSVC generates horrid code when using the intrinsics anyway,
+    // so it's not a loss.
+    // See https://github.com/llvm/llvm-project/issues/83661.
+    if constexpr (std::is_same_v<T, float>) {
+#  ifdef __AVX__
+        asm ("vldmxcsr  %[csr]\n\t"
+             "vcvtsd2ss %[in], %[in], %[out]\n\t"
+             "vstmxcsr  %[csr]"
+            : [csr] "+m" (csr), [out] "=v" (*value) : [in] "v" (v));
+#  else
+        asm ("ldmxcsr  %[csr]\n\t"
+             "cvtsd2ss %[in], %[out]\n\t"
+             "stmxcsr  %[csr]"
+            : [csr] "+m" (csr), [out] "=v" (*value) : [in] "v" (v));
+#  endif
+        return sse_check_result(*value);
+    }
+#endif // __SSE2__ && inline assembly
+
     if (!qt_is_finite(v) && std::numeric_limits<T>::has_infinity) {
         // infinity (or NaN)
         *value = T(v);
author	Thiago Macieira <thiago.macieira@intel.com>	2024-03-01 11:12:20 -0800
committer	Thiago Macieira <thiago.macieira@intel.com>	2024-03-12 18:23:20 -0700
commit	c86e1758dda16852f81ec542713ae09ec0779fc1 (patch)
tree	5b236e1efec5e1d346e857b3fd568d820aff57ca
parent	45fd36f1480a6229879a4e59236ffa1d1d22dfbf (diff)