Add functions for fast bulk conversion of qfloat16

Both ARM and x86 can convert fp16 much faster in bulk than one at a time. This also enables hardware accelerated conversion on x86, when F16C isn't unconditionally available at compile time. This code is implemented in C to ensure that there's no leakage of inline symbols from the .obj file that was compiled by Visual Studio with AVX support. Unfortunately, simd.prf uses $(CXX) instead of $(CC) for all its sources, which means the file gets interpreted as C++ by g++, clang++ and icpc. Those compilers at least don't leak any symbols. Done-with: Thiago Macieira <thiago.macieira@intel.com> Change-Id: I9d26d99e83392861fb09564e0e8e8d76cd8483b3 Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
author: Allan Sandfeld Jensen <allan.jensen@qt.io> 2017-10-16 13:54:38 +0200
committer: Thiago Macieira <thiago.macieira@intel.com> 2017-11-21 20:01:23 +0000
commit: b8e352ad378ce4ef7a517971533b02ec9c3768cb (patch)
tree: 1edc11dddc8784285c0ee1914e7f33d2569e6d7e /src/corelib/global/qfloat16.cpp
parent: 0ac2dca977ecc4020f51af57908a2640d00bcd9e (diff)
1 files changed, 85 insertions, 0 deletions
diff --git a/src/corelib/global/qfloat16.cpp b/src/corelib/global/qfloat16.cpp
index 89edfc8787..b2924233b5 100644
--- a/src/corelib/global/qfloat16.cpp
+++ b/src/corelib/global/qfloat16.cpp
@@ -38,6 +38,7 @@
 ****************************************************************************/
 
 #include "qfloat16_p.h"
+#include "private/qsimd_p.h"
 
 QT_BEGIN_NAMESPACE
 
@@ -113,4 +114,88 @@ Q_REQUIRED_RESULT bool qIsFinite(qfloat16 f) Q_DECL_NOTHROW { return qt_is_finit
     exactness is stronger the smaller the numbers are.
  */
 
+#if QT_COMPILER_SUPPORTS(F16C)
+static inline bool hasFastF16()
+{
+    if (qCpuHasFeature(F16C))        // every F16C processor also has AVX in hardware ...
+        return qCpuHasFeature(AVX);  // ... but the OS may not support YMM registers, or may have them disabled
+    return false;
+}
+
+extern "C" { // C linkage: only declared here; per the commit message the code is implemented in C, compiled separately
+extern void qFloatToFloat16_fast(quint16 *out, const float *in, qssize_t len) Q_DECL_NOTHROW; // bulk float -> 16-bit float, len elements
+extern void qFloatFromFloat16_fast(float *out, const quint16 *in, qssize_t len) Q_DECL_NOTHROW; // bulk 16-bit float -> float, len elements
+}
+
+#elif defined(__ARM_FP16_FORMAT_IEEE) && defined(__ARM_NEON__)
+static inline bool hasFastF16()
+{
+    return true; // no runtime check: this branch is only compiled when __ARM_FP16_FORMAT_IEEE and __ARM_NEON__ are defined
+}
+
+static void qFloatToFloat16_fast(quint16 *out, const float *in, qssize_t len) Q_DECL_NOTHROW
+{
+    __fp16 *half = reinterpret_cast<__fp16 *>(out); // treat the 16-bit output slots as native half floats
+    qssize_t i = 0;
+    for (; i + 3 < len; i += 4) // bulk path: narrow four floats to four halves per iteration
+        vst1_f16(&half[i], vcvt_f16_f32(vld1q_f32(&in[i])));
+    SIMD_EPILOGUE(i, len, 3) // presumably iterates over the up-to-3 elements the bulk loop left over
+        half[i] = __fp16(in[i]);
+}
+
+static void qFloatFromFloat16_fast(float *out, const quint16 *in, qssize_t len) Q_DECL_NOTHROW
+{   // out is float *: vst1q_f32() stores floats and qFloatFromFloat16() hands over a float buffer
+    const __fp16 *in_f16 = reinterpret_cast<const __fp16 *>(in); // treat the 16-bit input slots as native half floats
+    qssize_t i = 0;
+    for (; i < len - 3; i += 4) // bulk path: widen four halves to four floats per iteration
+        vst1q_f32(out + i, vcvt_f32_f16(vld1_f16(in_f16 + i)));
+    SIMD_EPILOGUE(i, len, 3) // presumably iterates over the up-to-3 elements the bulk loop left over
+        out[i] = float(in_f16[i]);
+}
+#else
+static inline bool hasFastF16()
+{
+    return false; // fallback branch: neither the F16C nor the ARM NEON fp16 condition above matched
+}
+
+static void qFloatToFloat16_fast(quint16 *, const float *, qssize_t) Q_DECL_NOTHROW
+{
+    Q_UNREACHABLE(); // stub: qFloatToFloat16() checks hasFastF16() first, which returns false in this branch
+}
+
+static void qFloatFromFloat16_fast(float *, const quint16 *, qssize_t) Q_DECL_NOTHROW
+{
+    Q_UNREACHABLE(); // stub: qFloatFromFloat16() checks hasFastF16() first, which returns false in this branch
+}
+#endif
+/*!
+    Reads \a len floats from \a in, converts each one to qfloat16 and writes
+    the results to \a out. Both arrays must hold at least \a len entries.
+    \since 5.11
+*/
+Q_CORE_EXPORT void qFloatToFloat16(qfloat16 *out, const float *in, qssize_t len) Q_DECL_NOTHROW
+{
+    if (hasFastF16()) { // bulk routine available: let it convert the whole range
+        qFloatToFloat16_fast(reinterpret_cast<quint16 *>(out), in, len);
+        return;
+    }
+    for (qssize_t idx = 0; idx < len; ++idx) // otherwise convert element by element
+        out[idx] = qfloat16(in[idx]);
+}
+
+/*!
+    Reads \a len qfloat16 values from \a in, converts each one to float and
+    writes the results to \a out. Both arrays must hold at least \a len entries.
+    \since 5.11
+*/
+Q_CORE_EXPORT void qFloatFromFloat16(float *out, const qfloat16 *in, qssize_t len) Q_DECL_NOTHROW
+{
+    if (hasFastF16()) { // bulk routine available: let it convert the whole range
+        qFloatFromFloat16_fast(out, reinterpret_cast<const quint16 *>(in), len);
+        return;
+    }
+    for (qssize_t idx = 0; idx < len; ++idx) // otherwise convert element by element
+        out[idx] = float(in[idx]);
+}
+
 QT_END_NAMESPACE
author	Allan Sandfeld Jensen <allan.jensen@qt.io>	2017-10-16 13:54:38 +0200
committer	Thiago Macieira <thiago.macieira@intel.com>	2017-11-21 20:01:23 +0000
commit	b8e352ad378ce4ef7a517971533b02ec9c3768cb (patch)
tree	1edc11dddc8784285c0ee1914e7f33d2569e6d7e /src/corelib/global/qfloat16.cpp
parent	0ac2dca977ecc4020f51af57908a2640d00bcd9e (diff)