From beab4d30e24442fa7c0c3af5056b0e064b3acc95 Mon Sep 17 00:00:00 2001
From: Thiago Macieira <thiago.macieira@intel.com>
Date: Fri, 5 Aug 2022 15:58:46 -0700
Subject: qfloat16: add a couple faster implementations of qSqrt

Instead of going through float.

Change-Id: Ie1b556d9ebca4ccaadd2fffd170895088a5d2dec
Reviewed-by: Allan Sandfeld Jensen <allan.jensen@qt.io>
---
 src/corelib/global/qfloat16.h | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

(limited to 'src')

diff --git a/src/corelib/global/qfloat16.h b/src/corelib/global/qfloat16.h
index 10dcfe0f8d..02ad5f303f 100644
--- a/src/corelib/global/qfloat16.h
+++ b/src/corelib/global/qfloat16.h
@@ -243,13 +243,35 @@ Q_CORE_EXPORT void qFloatFromFloat16(float *, const qfloat16 *, qsizetype length
     // https://wg21.link/p1467 - disabled until tested
     using namespace std;
     return sqrt(f);
-#endif
+#elif QFLOAT16_IS_NATIVE && defined(__HAVE_FLOAT16) && __HAVE_FLOAT16
+    // This C library (glibc) has sqrtf16().
+    return sqrtf16(f);
+#else
+    bool mathUpdatesErrno = true;
+#  if defined(__NO_MATH_ERRNO__) || defined(_M_FP_FAST)
+    mathUpdatesErrno = false;
+#  elif defined(math_errhandling)
+    mathUpdatesErrno = (math_errhandling & MATH_ERRNO);
+#  endif
+
+    // We don't need to set errno to EDOM if (f >= 0 && f != -0 && !isnan(f))
+    // (or if we don't care about errno in the first place). We can merge the
+    // NaN check with the sign check by negating and inverting: !(0 > f), and
+    // leaving zero to sqrtf().
+    if (!mathUpdatesErrno || !(0 > f)) {
+#  if defined(__AVX512FP16__)
+        __m128h v = _mm_set_sh(f);
+        v = _mm_sqrt_sh(v, v);
+        return _mm_cvtsh_h(v);
+#  endif
+    }
 
     // WG14's N2601 does not provide a way to tell which types an
     // implementation supports, so we assume it doesn't and fall back to FP32
     float f32 = float(f);
     f32 = sqrtf(f32);
     return qfloat16::NearestFloat(f32);
+#endif
 }
 
 // The remainder of these utility functions complement qglobal.h
-- 
cgit v1.2.3