convertDoubleTo: add an x86-64 intrinsics version

The UB that the C and C++ standards talk about does not apply if we use intrinsics. We can rely on the processors' architectural behavior instead. There are two ways to detect a conversion that cannot be represented in the result. One would be to check if the #IE bit got set in the MXCSR, but in order to do that we'd need to issue an STMXCSR+LDMXCSR pair to clear the bit first and then another STMXCSR at the end to see if it got set. Those instructions are 4 uops long and necessarily target memory, so that's a bit slow. This commit implements the second way, which is to check if the result of the conversion is the "undefined" value. Unfortunately, that value is a valid, precise value that double can hold for all data types except unsigned 64-bit, so we need to recheck if that was the actual value stored in the original double. This implementation targets 64-bit exclusively because that avoids having to deal with the 64-bit intrinsics not even being defined in 32-bit code (converting a double to 64-bit integer in 32-bit is messy). The unsigned implementation is only implemented with AVX512F because of the unsigned conversion instructions that were introduced then. Change-Id: I89446ea06b5742efb194fffd16bb9f04b2014bab Reviewed-by: Allan Sandfeld Jensen <allan.jensen@qt.io>
author: Thiago Macieira <thiago.macieira@intel.com> 2021-11-27 21:35:31 -0800
committer: Thiago Macieira <thiago.macieira@intel.com> 2022-01-19 20:57:51 -0800
commit: df8456061ef0d57ea6be37746951c50f38a65101 (patch)
tree: 4cdc004c06266e58a88a1419db7bf247662dbcde /src/corelib
parent: 69731bec5796beb53b5ab00388c7c21c6a01d822 (diff)
1 files changed, 79 insertions, 5 deletions
diff --git a/src/corelib/global/qnumeric_p.h b/src/corelib/global/qnumeric_p.h
index 4fa817077e..b7af847673 100644
--- a/src/corelib/global/qnumeric_p.h
+++ b/src/corelib/global/qnumeric_p.h
@@ -1,7 +1,7 @@
 /****************************************************************************
 **
 ** Copyright (C) 2020 The Qt Company Ltd.
-** Copyright (C) 2020 Intel Corporation.
+** Copyright (C) 2021 Intel Corporation.
 ** Contact: https://www.qt.io/licensing/
 **
 ** This file is part of the QtCore module of the Qt Toolkit.
@@ -54,6 +54,7 @@
 
 #include "QtCore/private/qglobal_p.h"
 #include "QtCore/qnumeric.h"
+#include "QtCore/qsimd.h"
 #include <cmath>
 #include <limits>
 #include <type_traits>
@@ -202,6 +203,8 @@ static inline bool convertDoubleTo(double v, T *value, bool allow_precision_upgr
             return false;
     }
 
+    constexpr T Tmin = std::numeric_limits<T>::min();
+    constexpr T Tmax = std::numeric_limits<T>::max();
 
     // The [conv.fpint] (7.10 Floating-integral conversions) section of the C++
     // standard says only exact conversions are guaranteed. Converting
@@ -213,11 +216,82 @@ static inline bool convertDoubleTo(double v, T *value, bool allow_precision_upgr
     // correct, but Clang, ICC and MSVC don't realize that it's a constant and
     // the math call stays in the compiled code.
 
+#ifdef Q_PROCESSOR_X86_64
+    // Of course, UB doesn't apply if we use intrinsics, in which case we are
+    // allowed to depend on exactly the processor's behavior. This
+    // implementation uses the truncating conversions from Scalar Double to
+    // integral types (CVTTSD2SI and VCVTTSD2USI), which is documented to
+    // return the "indefinite integer value" if the range of the target type is
+    // exceeded. (only implemented for x86-64 to avoid having to deal with the
+    // non-existence of the 64-bit intrinsics on i386)
+
+    if (std::numeric_limits<T>::is_signed) {
+        __m128d mv = _mm_set_sd(v);
+#  ifdef __AVX512F__
+        // use explicit round control and suppress exceptions
+        if (sizeof(T) > 4)
+            *value = T(_mm_cvtt_roundsd_i64(mv, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
+        else
+            *value = _mm_cvtt_roundsd_i32(mv, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+#  else
+        *value = sizeof(T) > 4 ? T(_mm_cvttsd_si64(mv)) : _mm_cvttsd_si32(mv);
+#  endif
+
+        // if *value is the "indefinite integer value", check if the original
+        // variable \a v is the same value (Tmin is an exact representation)
+        if (*value == Tmin && !_mm_ucomieq_sd(mv, _mm_set_sd(Tmin))) {
+            // v != Tmin, so it was out of range
+            if (v > 0)
+                *value = Tmax;
+            return false;
+        }
+
+        // convert the integer back to double and compare for equality with v,
+        // to determine if we've lost any precision
+        __m128d mi = _mm_setzero_pd();
+        mi = sizeof(T) > 4 ? _mm_cvtsi64_sd(mv, *value) : _mm_cvtsi32_sd(mv, *value);
+        return _mm_ucomieq_sd(mv, mi);
+    }
+
+#  ifdef __AVX512F__
+    if (!std::numeric_limits<T>::is_signed) {
+        // Same thing as above, but this function operates on absolute values
+        // and the "indefinite integer value" for the 64-bit unsigned
+        // conversion (Tmax) is not representable in double, so it can never be
+        // the result of an in-range conversion. This is implemented for AVX512
+        // and later because of the unsigned conversion instruction. Converting
+        // to unsigned without losing an extra bit of precision prior to AVX512
+        // is left to the compiler below.
+
+        v = fabs(v);
+        __m128d mv = _mm_set_sd(v);
+
+        // use explicit round control and suppress exceptions
+        if (sizeof(T) > 4)
+            *value = T(_mm_cvtt_roundsd_u64(mv, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC));
+        else
+            *value = _mm_cvtt_roundsd_u32(mv, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
+
+        if (*value == Tmax) {
+            // no double can have an exact value of quint64(-1), but they can
+            // quint32(-1), so we need to compare for that
+            if (TypeIsLarger || _mm_ucomieq_sd(mv, _mm_set_sd(Tmax)))
+                return false;
+        }
+
+        // return true if it was an exact conversion
+        __m128d mi = _mm_setzero_pd();
+        mi = sizeof(T) > 4 ? _mm_cvtu64_sd(mv, *value) : _mm_cvtu32_sd(mv, *value);
+        return _mm_ucomieq_sd(mv, mi);
+    }
+#  endif
+#endif
+
     double supremum;
     if (std::numeric_limits<T>::is_signed) {
-        supremum = -1.0 * std::numeric_limits<T>::min();    // -1 * (-2^63) = 2^63, exact (for T = qint64)
-        *value = std::numeric_limits<T>::min();
-        if (v < std::numeric_limits<T>::min())
+        supremum = -1.0 * Tmin;     // -1 * (-2^63) = 2^63, exact (for T = qint64)
+        *value = Tmin;
+        if (v < Tmin)
             return false;
     } else {
         using ST = typename std::make_signed<T>::type;
@@ -225,7 +299,7 @@ static inline bool convertDoubleTo(double v, T *value, bool allow_precision_upgr
         v = fabs(v);
     }
 
-    *value = std::numeric_limits<T>::max();
+    *value = Tmax;
     if (v >= supremum)
         return false;
author	Thiago Macieira <thiago.macieira@intel.com>	2021-11-27 21:35:31 -0800
committer	Thiago Macieira <thiago.macieira@intel.com>	2022-01-19 20:57:51 -0800
commit	df8456061ef0d57ea6be37746951c50f38a65101 (patch)
tree	4cdc004c06266e58a88a1419db7bf247662dbcde /src/corelib
parent	69731bec5796beb53b5ab00388c7c21c6a01d822 (diff)