1 files changed, 213 insertions, 81 deletions
diff --git a/src/corelib/global/qfloat16.cpp b/src/corelib/global/qfloat16.cpp
index c97331748b..f6f782e364 100644
--- a/src/corelib/global/qfloat16.cpp
+++ b/src/corelib/global/qfloat16.cpp
@@ -1,49 +1,20 @@
-/****************************************************************************
-**
-** Copyright (C) 2019 The Qt Company Ltd.
-** Copyright (C) 2016 by Southwest Research Institute (R)
-** Contact: http://www.qt-project.org/legal
-**
-** This file is part of the QtCore module of the Qt Toolkit.
-**
-** $QT_BEGIN_LICENSE:LGPL$
-** Commercial License Usage
-** Licensees holding valid commercial Qt licenses may use this file in
-** accordance with the commercial license agreement provided with the
-** Software or, alternatively, in accordance with the terms contained in
-** a written agreement between you and The Qt Company. For licensing terms
-** and conditions see https://www.qt.io/terms-conditions. For further
-** information use the contact form at https://www.qt.io/contact-us.
-**
-** GNU Lesser General Public License Usage
-** Alternatively, this file may be used under the terms of the GNU Lesser
-** General Public License version 3 as published by the Free Software
-** Foundation and appearing in the file LICENSE.LGPL3 included in the
-** packaging of this file. Please review the following information to
-** ensure the GNU Lesser General Public License version 3 requirements
-** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
-**
-** GNU General Public License Usage
-** Alternatively, this file may be used under the terms of the GNU
-** General Public License version 2.0 or (at your option) the GNU General
-** Public license version 3 or any later version approved by the KDE Free
-** Qt Foundation. The licenses are as published by the Free Software
-** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
-** included in the packaging of this file. Please review the following
-** information to ensure the GNU General Public License requirements will
-** be met: https://www.gnu.org/licenses/gpl-2.0.html and
-** https://www.gnu.org/licenses/gpl-3.0.html.
-**
-** $QT_END_LICENSE$
-**
-****************************************************************************/
+// Copyright (C) 2020 The Qt Company Ltd.
+// Copyright (C) 2016 by Southwest Research Institute (R)
+// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
 
 #include "qfloat16.h"
 #include "private/qsimd_p.h"
 #include <cmath> // for fpclassify()'s return values
 
+#include <QtCore/qdatastream.h>
+#include <QtCore/qmetatype.h>
+#include <QtCore/qtextstream.h>
+
+QT_DECL_METATYPE_EXTERN(qfloat16, Q_CORE_EXPORT)
 QT_BEGIN_NAMESPACE
 
+QT_IMPL_METATYPE_EXTERN(qfloat16)
+
 /*!
     \class qfloat16
     \keyword 16-bit Floating Point Support
@@ -52,6 +23,15 @@ QT_BEGIN_NAMESPACE
     \inheaderfile QFloat16
     \brief Provides 16-bit floating point support.
 
+    \compares partial
+    \compareswith partial float double {long double} qint8 quint8 qint16 quint16 \
+                  qint32 quint32 long {unsigned long} qint64 quint64
+    \endcompareswith
+    \compareswith partial qint128 quint128
+    Comparison with 128-bit integral types is only supported if Qt provides
+    these types.
+    \endcompareswith
+
     The \c qfloat16 class provides support for half-precision (16-bit) floating
     point data.  It is fully compliant with IEEE 754 as a storage type.  This
     implies that any arithmetic operation on a \c qfloat16 instance results in
@@ -73,43 +53,34 @@ QT_BEGIN_NAMESPACE
 */
 
 /*!
-    \macro QT_NO_FLOAT16_OPERATORS
-    \relates qfloat16
-    \since 5.12.4
-
-    Defining this macro disables the arithmetic operators for qfloat16.
+    \fn qfloat16::qfloat16(Qt::Initialization)
+    \since 6.1
 
-    This is only necessary on Visual Studio 2017 (and earlier) when including
-    \c {<QFloat16>} and \c{<bitset>} in the same translation unit, which would
-    otherwise cause a compilation error due to a toolchain bug (see
-    [QTBUG-72073]).
+    Constructs a qfloat16 without initializing the value.
 */
 
 /*!
     \fn bool qIsInf(qfloat16 f)
     \relates qfloat16
+    \overload qIsInf(float)
 
     Returns true if the \c qfloat16 \a {f} is equivalent to infinity.
-
-    \sa qIsInf
 */
 
 /*!
     \fn bool qIsNaN(qfloat16 f)
     \relates qfloat16
+    \overload qIsNaN(float)
 
     Returns true if the \c qfloat16 \a {f} is not a number (NaN).
-
-    \sa qIsNaN
 */
 
 /*!
     \fn bool qIsFinite(qfloat16 f)
     \relates qfloat16
+    \overload qIsFinite(float)
 
     Returns true if the \c qfloat16 \a {f} is a finite number.
-
-    \sa qIsFinite
 */
 
 /*!
@@ -118,8 +89,6 @@ QT_BEGIN_NAMESPACE
     \fn bool qfloat16::isInf() const noexcept
 
     Tests whether this \c qfloat16 value is an infinity.
-
-    \sa qIsInf()
 */
 
 /*!
@@ -128,8 +97,6 @@ QT_BEGIN_NAMESPACE
     \fn bool qfloat16::isNaN() const noexcept
 
     Tests whether this \c qfloat16 value is "not a number".
-
-    \sa qIsNaN()
 */
 
 /*!
@@ -147,8 +114,6 @@ QT_BEGIN_NAMESPACE
     \fn bool qfloat16::isFinite() const noexcept
 
     Tests whether this \c qfloat16 value is finite.
-
-    \sa qIsFinite()
 */
 
 /*!
@@ -160,11 +125,18 @@ QT_BEGIN_NAMESPACE
 */
 
 /*!
+    \fn int qFpClassify(qfloat16 val)
+    \relates qfloat16
+    \since 5.14
+    \overload qFpClassify(float)
+
+    Returns the floating-point class of \a val.
+*/
+
+/*!
     \internal
     \since 5.14
     Implements qFpClassify() for qfloat16.
-
-    \sa qFpClassify()
 */
 int qfloat16::fpClassify() const noexcept
 {
@@ -174,22 +146,21 @@ int qfloat16::fpClassify() const noexcept
 
 /*! \fn int qRound(qfloat16 value)
     \relates qfloat16
+    \overload qRound(float)
 
     Rounds \a value to the nearest integer.
-
-    \sa qRound
 */
 
 /*! \fn qint64 qRound64(qfloat16 value)
     \relates qfloat16
+    \overload qRound64(float)
 
     Rounds \a value to the nearest 64-bit integer.
-
-    \sa qRound64
 */
 
 /*! \fn bool qFuzzyCompare(qfloat16 p1, qfloat16 p2)
     \relates qfloat16
+    \overload qFuzzyCompare(float, float)
 
     Compares the floating point value \a p1 and \a p2 and
     returns \c true if they are considered equal, otherwise \c false.
@@ -198,25 +169,128 @@ int qfloat16::fpClassify() const noexcept
     exactness is stronger the smaller the numbers are.
  */
 
-#if QT_COMPILER_SUPPORTS(F16C)
+#if QT_COMPILER_SUPPORTS_HERE(F16C)
 static inline bool hasFastF16()
 {
-    // All processors with F16C also support AVX, but YMM registers
-    // might not be supported by the OS, or they might be disabled.
-    return qCpuHasFeature(F16C) && qCpuHasFeature(AVX);
+    // qsimd.cpp:detectProcessorFeatures() turns off the F16C feature if AVX
+    // state-saving is not enabled by the OS, so no separate AVX check is made here
+    return qCpuHasFeature(F16C);
 }
 
-extern "C" {
-#ifdef QFLOAT16_INCLUDE_FAST
-#  define f16cextern    static
-#else
-#  define f16cextern    extern
+#if QT_COMPILER_SUPPORTS_HERE(AVX512VL) && QT_COMPILER_SUPPORTS_HERE(AVX512BW)
+static bool hasFastF16Avx256()
+{
+    // 256-bit AVX512 instructions don't have a performance penalty (see qstring.cpp for more info)
+    return qCpuHasFeature(ArchSkylakeAvx512);
+}
+
+static QT_FUNCTION_TARGET(ARCH_SKYLAKE_AVX512) // masked float -> half conversion of a short tail (callers pass len < 8)
+void qFloatToFloat16_tail_avx256(quint16 *out, const float *in, qsizetype len) noexcept
+{
+    __mmask16 mask = _bzhi_u32(-1, len); // keep only the low len bits set: one mask bit per element
+    __m256 f32 = _mm256_maskz_loadu_ps(mask, in); // lanes outside the mask are zeroed
+    __m128i f16 = _mm256_maskz_cvtps_ph(mask, f32, _MM_FROUND_TO_NEAREST_INT);
+    _mm_mask_storeu_epi16(out, mask, f16); // only the elements selected by the mask are written
+}
+
+static QT_FUNCTION_TARGET(ARCH_SKYLAKE_AVX512) // masked half -> float conversion of a short tail (callers pass len < 8)
+void qFloatFromFloat16_tail_avx256(float *out, const quint16 *in, qsizetype len) noexcept
+{
+    __mmask16 mask = _bzhi_u32(-1, len); // keep only the low len bits set: one mask bit per element
+    __m128i f16 = _mm_maskz_loadu_epi16(mask, in); // lanes outside the mask are zeroed
+    __m256 f32 = _mm256_cvtph_ps(f16);
+    _mm256_mask_storeu_ps(out, mask, f32); // only the elements selected by the mask are written
+}
 #endif
 
-f16cextern void qFloatToFloat16_fast(quint16 *out, const float *in, qsizetype len) noexcept;
-f16cextern void qFloatFromFloat16_fast(float *out, const quint16 *in, qsizetype len) noexcept;
+QT_FUNCTION_TARGET(F16C) // F16C path: converts len floats read from in into 16-bit half values written to out
+static void qFloatToFloat16_fast(quint16 *out, const float *in, qsizetype len) noexcept
+{
+    constexpr qsizetype Step = sizeof(__m256i) / sizeof(float); // floats per 256-bit vector (8)
+    constexpr qsizetype HalfStep = sizeof(__m128i) / sizeof(float); // floats per 128-bit vector (4)
+    qsizetype i = 0;
+
+    if (len >= Step) {
+        auto convertOneChunk = [=](qsizetype offset) QT_FUNCTION_TARGET(F16C) {
+            __m256 f32 = _mm256_loadu_ps(in + offset);
+            __m128i f16 = _mm256_cvtps_ph(f32, _MM_FROUND_TO_NEAREST_INT);
+            _mm_storeu_si128(reinterpret_cast<__m128i *>(out + offset), f16);
+        };
+
+        // main loop: convert Step (8) floats per iteration, leaving the final chunk to the epilogue
+        for ( ; i + Step < len; i += Step)
+            convertOneChunk(i);
+
+        // epilogue: convert the last Step elements, possibly overlapping with the last
+        // iteration of the loop
+        return convertOneChunk(len - Step);
+    }
+
+#if QT_COMPILER_SUPPORTS_HERE(AVX512VL) && QT_COMPILER_SUPPORTS_HERE(AVX512BW)
+    if (hasFastF16Avx256())
+        return qFloatToFloat16_tail_avx256(out, in, len);
+#endif
 
-#undef f16cextern
+    if (len >= HalfStep) {
+        auto convertOneChunk = [=](qsizetype offset) QT_FUNCTION_TARGET(F16C) {
+            __m128 f32 = _mm_loadu_ps(in + offset);
+            __m128i f16 = _mm_cvtps_ph(f32, _MM_FROUND_TO_NEAREST_INT);
+            _mm_storel_epi64(reinterpret_cast<__m128i *>(out + offset), f16); // low 64 bits only: four halves
+        };
+
+        // len is in [HalfStep, Step) here: first and last HalfStep elements, possibly overlapping
+        convertOneChunk(0);
+        return convertOneChunk(len - HalfStep);
+    }
+
+    // fewer than HalfStep elements: one at a time, inlining "qfloat16::qfloat16(float f)":
+    for ( ; i < len; ++i)
+        out[i] = _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(in[i]), 0), 0);
+}
+
+QT_FUNCTION_TARGET(F16C) // F16C path: converts len 16-bit half values read from in into floats written to out
+static void qFloatFromFloat16_fast(float *out, const quint16 *in, qsizetype len) noexcept
+{
+    constexpr qsizetype Step = sizeof(__m256i) / sizeof(float); // floats per 256-bit vector (8)
+    constexpr qsizetype HalfStep = sizeof(__m128i) / sizeof(float); // floats per 128-bit vector (4)
+    qsizetype i = 0;
+
+    if (len >= Step) {
+        auto convertOneChunk = [=](qsizetype offset) QT_FUNCTION_TARGET(F16C) {
+            __m128i f16 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + offset));
+            __m256 f32 = _mm256_cvtph_ps(f16);
+            _mm256_storeu_ps(out + offset, f32);
+        };
+
+        // main loop: convert Step (8) halves per iteration, leaving the final chunk to the epilogue
+        for ( ; i + Step < len; i += Step)
+            convertOneChunk(i);
+
+        // epilogue: convert the last Step elements, possibly overlapping with the last
+        // iteration of the loop
+        return convertOneChunk(len - Step);
+    }
+
+#if QT_COMPILER_SUPPORTS_HERE(AVX512VL) && QT_COMPILER_SUPPORTS_HERE(AVX512BW)
+    if (hasFastF16Avx256())
+        return qFloatFromFloat16_tail_avx256(out, in, len);
+#endif
+
+    if (len >= HalfStep) {
+        auto convertOneChunk = [=](qsizetype offset) QT_FUNCTION_TARGET(F16C) {
+            __m128i f16 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(in + offset)); // low 64 bits only: four halves
+            __m128 f32 = _mm_cvtph_ps(f16);
+            _mm_storeu_ps(out + offset, f32);
+        };
+
+        // len is in [HalfStep, Step) here: first and last HalfStep elements, possibly overlapping
+        convertOneChunk(0);
+        return convertOneChunk(len - HalfStep);
+    }
+
+    // fewer than HalfStep elements: one at a time, inlining "qfloat16::operator float()":
+    for ( ; i < len; ++i)
+        out[i] = _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(in[i])));
 }
 
 #elif defined(__ARM_FP16_FORMAT_IEEE) && defined(__ARM_NEON__) && (__ARM_FP & 2)
@@ -298,9 +372,67 @@ Q_CORE_EXPORT void qFloatFromFloat16(float *out, const qfloat16 *in, qsizetype l
         out[i] = float(in[i]);
 }
 
+/*!
+    \fn size_t qfloat16::qHash(qfloat16 key, size_t seed)
+    \since 6.5.3
+
+    Returns the hash value for the \a key, using \a seed to seed the
+    calculation.
+
+    \note In Qt versions before 6.5, this operation was provided by the
+    qHash(float) overload. In Qt versions 6.5.0 to 6.5.2, this functionality
+    was broken in various ways. In Qt versions 6.5.3 and 6.6 onwards, this
+    overload restores the Qt 6.4 behavior.
+*/
+
+#ifndef QT_NO_DATASTREAM
+/*!
+    \fn qfloat16::operator<<(QDataStream &ds, qfloat16 f)
+    \relates QDataStream
+    \since 5.9
+
+    Writes a floating point number, \a f, to the stream \a ds using
+    the standard IEEE 754 format. Returns a reference to the stream.
+
+    \note In Qt versions prior to 6.3, this was a member function on
+    QDataStream.
+*/
+QDataStream &operator<<(QDataStream &ds, qfloat16 f)
+{
+    return ds << f.b16; // streams the b16 member; presumably the raw 16-bit pattern (confirm in qfloat16.h)
+}
+
+/*!
+    \fn qfloat16::operator>>(QDataStream &ds, qfloat16 &f)
+    \relates QDataStream
+    \since 5.9
+
+    Reads a floating point number from the stream \a ds into \a f,
+    using the standard IEEE 754 format. Returns a reference to the
+    stream.
+
+    \note In Qt versions prior to 6.3, this was a member function on
+    QDataStream.
+*/
+QDataStream &operator>>(QDataStream &ds, qfloat16 &f)
+{
+    return ds >> f.b16; // reads straight into the b16 member; presumably the raw 16-bit pattern (confirm in qfloat16.h)
+}
+#endif
+
+QTextStream &operator>>(QTextStream &ts, qfloat16 &f16) // text extraction: reads a float, stores it as qfloat16
+{
+    float parsed; // assigned by the stream's float extraction on the next line
+    ts >> parsed;
+    f16 = static_cast<qfloat16>(parsed); // convert the parsed float to qfloat16
+    return ts;
+}
+
+QTextStream &operator<<(QTextStream &ts, qfloat16 f) // text insertion: writes the value as a float
+{
+    return ts << static_cast<float>(f); // convert to float and use the stream's float insertion
+}
+
 QT_END_NAMESPACE
 
 #include "qfloat16tables.cpp"
-#ifdef QFLOAT16_INCLUDE_FAST
-#  include "qfloat16_f16c.c"
-#endif