summaryrefslogtreecommitdiffstats
path: root/src/corelib/global/qfloat16.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/corelib/global/qfloat16.cpp')
-rw-r--r--src/corelib/global/qfloat16.cpp249
1 files changed, 195 insertions, 54 deletions
diff --git a/src/corelib/global/qfloat16.cpp b/src/corelib/global/qfloat16.cpp
index 9ef197b3a1..f6f782e364 100644
--- a/src/corelib/global/qfloat16.cpp
+++ b/src/corelib/global/qfloat16.cpp
@@ -1,49 +1,20 @@
-/****************************************************************************
-**
-** Copyright (C) 2020 The Qt Company Ltd.
-** Copyright (C) 2016 by Southwest Research Institute (R)
-** Contact: http://www.qt-project.org/legal
-**
-** This file is part of the QtCore module of the Qt Toolkit.
-**
-** $QT_BEGIN_LICENSE:LGPL$
-** Commercial License Usage
-** Licensees holding valid commercial Qt licenses may use this file in
-** accordance with the commercial license agreement provided with the
-** Software or, alternatively, in accordance with the terms contained in
-** a written agreement between you and The Qt Company. For licensing terms
-** and conditions see https://www.qt.io/terms-conditions. For further
-** information use the contact form at https://www.qt.io/contact-us.
-**
-** GNU Lesser General Public License Usage
-** Alternatively, this file may be used under the terms of the GNU Lesser
-** General Public License version 3 as published by the Free Software
-** Foundation and appearing in the file LICENSE.LGPL3 included in the
-** packaging of this file. Please review the following information to
-** ensure the GNU Lesser General Public License version 3 requirements
-** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
-**
-** GNU General Public License Usage
-** Alternatively, this file may be used under the terms of the GNU
-** General Public License version 2.0 or (at your option) the GNU General
-** Public license version 3 or any later version approved by the KDE Free
-** Qt Foundation. The licenses are as published by the Free Software
-** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
-** included in the packaging of this file. Please review the following
-** information to ensure the GNU General Public License requirements will
-** be met: https://www.gnu.org/licenses/gpl-2.0.html and
-** https://www.gnu.org/licenses/gpl-3.0.html.
-**
-** $QT_END_LICENSE$
-**
-****************************************************************************/
+// Copyright (C) 2020 The Qt Company Ltd.
+// Copyright (C) 2016 by Southwest Research Institute (R)
+// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
#include "qfloat16.h"
#include "private/qsimd_p.h"
#include <cmath> // for fpclassify()'s return values
+#include <QtCore/qdatastream.h>
+#include <QtCore/qmetatype.h>
+#include <QtCore/qtextstream.h>
+
+QT_DECL_METATYPE_EXTERN(qfloat16, Q_CORE_EXPORT)
QT_BEGIN_NAMESPACE
+QT_IMPL_METATYPE_EXTERN(qfloat16)
+
/*!
\class qfloat16
\keyword 16-bit Floating Point Support
@@ -52,6 +23,15 @@ QT_BEGIN_NAMESPACE
\inheaderfile QFloat16
\brief Provides 16-bit floating point support.
+ \compares partial
+ \compareswith partial float double {long double} qint8 quint8 qint16 quint16 \
+ qint32 quint32 long {unsigned long} qint64 quint64
+ \endcompareswith
+ \compareswith partial qint128 quint128
+ Comparison with 128-bit integral types is only supported if Qt provides
+ these types.
+ \endcompareswith
+
The \c qfloat16 class provides support for half-precision (16-bit) floating
point data. It is fully compliant with IEEE 754 as a storage type. This
implies that any arithmetic operation on a \c qfloat16 instance results in
@@ -189,25 +169,128 @@ int qfloat16::fpClassify() const noexcept
exactness is stronger the smaller the numbers are.
*/
-#if QT_COMPILER_SUPPORTS(F16C)
+#if QT_COMPILER_SUPPORTS_HERE(F16C)
static inline bool hasFastF16()
{
- // All processors with F16C also support AVX, but YMM registers
- // might not be supported by the OS, or they might be disabled.
- return qCpuHasFeature(F16C) && qCpuHasFeature(AVX);
+ // A separate AVX check is unnecessary: qsimd.cpp:detectProcessorFeatures()
+ // turns off the F16C feature if AVX state-saving is not enabled by the OS.
+ return qCpuHasFeature(F16C);
}
-extern "C" {
-#ifdef QFLOAT16_INCLUDE_FAST
-# define f16cextern static
-#else
-# define f16cextern extern
+#if QT_COMPILER_SUPPORTS_HERE(AVX512VL) && QT_COMPILER_SUPPORTS_HERE(AVX512BW)
+static bool hasFastF16Avx256()
+{
+    // The 256-bit AVX512 instructions don't have a performance penalty
+    // (see qstring.cpp for more info), so the CPU feature bit alone decides.
+    const bool skylakeAvx512 = qCpuHasFeature(ArchSkylakeAvx512);
+    return skylakeAvx512;
+}
+
+// Converts the first len floats of in to half-precision values in out using a
+// single masked 256-bit conversion. The callers in this file only reach this
+// helper when len is smaller than one full 8-element chunk.
+static QT_FUNCTION_TARGET(ARCH_SKYLAKE_AVX512)
+void qFloatToFloat16_tail_avx256(quint16 *out, const float *in, qsizetype len) noexcept
+{
+    // set the len lowest bits: one mask bit per element to convert
+    __mmask16 mask = _bzhi_u32(-1, len);
+    __m256 f32 = _mm256_maskz_loadu_ps(mask, in);
+    __m128i f16 = _mm256_maskz_cvtps_ph(mask, f32, _MM_FROUND_TO_NEAREST_INT);
+    // only the elements selected by the mask are written to out
+    _mm_mask_storeu_epi16(out, mask, f16);
+}
+
+// Converts the first len half-precision values of in to floats in out using a
+// single masked 256-bit conversion. The callers in this file only reach this
+// helper when len is smaller than one full 8-element chunk.
+static QT_FUNCTION_TARGET(ARCH_SKYLAKE_AVX512)
+void qFloatFromFloat16_tail_avx256(float *out, const quint16 *in, qsizetype len) noexcept
+{
+    // set the len lowest bits: one mask bit per element to convert
+    __mmask16 mask = _bzhi_u32(-1, len);
+    __m128i f16 = _mm_maskz_loadu_epi16(mask, in);
+    __m256 f32 = _mm256_cvtph_ps(f16);
+    // only the elements selected by the mask are written to out
+    _mm256_mask_storeu_ps(out, mask, f32);
+}
+#endif
+
+QT_FUNCTION_TARGET(F16C)
+static void qFloatToFloat16_fast(quint16 *out, const float *in, qsizetype len) noexcept
+{
+ constexpr qsizetype Step = sizeof(__m256i) / sizeof(float);
+ constexpr qsizetype HalfStep = sizeof(__m128i) / sizeof(float);
+ qsizetype i = 0;
+
+ if (len >= Step) {
+ auto convertOneChunk = [=](qsizetype offset) QT_FUNCTION_TARGET(F16C) {
+ __m256 f32 = _mm256_loadu_ps(in + offset);
+ __m128i f16 = _mm256_cvtps_ph(f32, _MM_FROUND_TO_NEAREST_INT);
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(out + offset), f16);
+ };
+
+ // main loop: convert Step (8) floats per iteration, stopping before the final chunk
+ for ( ; i + Step < len; i += Step)
+ convertOneChunk(i);
+
+ // epilogue: convert the last Step elements, ending exactly at len; this chunk may
+ // overlap with the last iteration of the loop, which only repeats the same conversions
+ return convertOneChunk(len - Step);
+ }
+
+#if QT_COMPILER_SUPPORTS_HERE(AVX512VL) && QT_COMPILER_SUPPORTS_HERE(AVX512BW)
+ if (hasFastF16Avx256())
+ return qFloatToFloat16_tail_avx256(out, in, len);
#endif
-f16cextern void qFloatToFloat16_fast(quint16 *out, const float *in, qsizetype len) noexcept;
-f16cextern void qFloatFromFloat16_fast(float *out, const quint16 *in, qsizetype len) noexcept;
+ if (len >= HalfStep) {
+ auto convertOneChunk = [=](qsizetype offset) QT_FUNCTION_TARGET(F16C) {
+ __m128 f32 = _mm_loadu_ps(in + offset);
+ __m128i f16 = _mm_cvtps_ph(f32, _MM_FROUND_TO_NEAREST_INT);
+ _mm_storel_epi64(reinterpret_cast<__m128i *>(out + offset), f16);
+ };
+
+ // HalfStep <= len < Step here: two HalfStep-wide conversions, possibly overlapping, cover the input
+ convertOneChunk(0);
+ return convertOneChunk(len - HalfStep);
+ }
+
+ // Fewer than HalfStep elements: convert one at a time, inlining "qfloat16::qfloat16(float f)":
+ for ( ; i < len; ++i)
+ out[i] = _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(in[i]), 0), 0);
+}
+
+QT_FUNCTION_TARGET(F16C)
+static void qFloatFromFloat16_fast(float *out, const quint16 *in, qsizetype len) noexcept
+{
+ constexpr qsizetype Step = sizeof(__m256i) / sizeof(float);
+ constexpr qsizetype HalfStep = sizeof(__m128i) / sizeof(float);
+ qsizetype i = 0;
-#undef f16cextern
+ if (len >= Step) {
+ auto convertOneChunk = [=](qsizetype offset) QT_FUNCTION_TARGET(F16C) {
+ __m128i f16 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + offset));
+ __m256 f32 = _mm256_cvtph_ps(f16);
+ _mm256_storeu_ps(out + offset, f32);
+ };
+
+ // main loop: convert Step (8) half-precision values per iteration, stopping before the final chunk
+ for ( ; i + Step < len; i += Step)
+ convertOneChunk(i);
+
+ // epilogue: convert the last Step elements, ending exactly at len; this chunk may
+ // overlap with the last iteration of the loop, which only repeats the same conversions
+ return convertOneChunk(len - Step);
+ }
+
+#if QT_COMPILER_SUPPORTS_HERE(AVX512VL) && QT_COMPILER_SUPPORTS_HERE(AVX512BW)
+ if (hasFastF16Avx256())
+ return qFloatFromFloat16_tail_avx256(out, in, len);
+#endif
+
+ if (len >= HalfStep) {
+ auto convertOneChunk = [=](qsizetype offset) QT_FUNCTION_TARGET(F16C) {
+ __m128i f16 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(in + offset));
+ __m128 f32 = _mm_cvtph_ps(f16);
+ _mm_storeu_ps(out + offset, f32);
+ };
+
+ // HalfStep <= len < Step here: two HalfStep-wide conversions, possibly overlapping, cover the input
+ convertOneChunk(0);
+ return convertOneChunk(len - HalfStep);
+ }
+
+ // Fewer than HalfStep elements: convert one at a time, inlining "qfloat16::operator float()":
+ for ( ; i < len; ++i)
+ out[i] = _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(in[i])));
}
#elif defined(__ARM_FP16_FORMAT_IEEE) && defined(__ARM_NEON__) && (__ARM_FP & 2)
@@ -289,9 +372,67 @@ Q_CORE_EXPORT void qFloatFromFloat16(float *out, const qfloat16 *in, qsizetype l
out[i] = float(in[i]);
}
+/*!
+ \fn size_t qfloat16::qHash(qfloat16 key, size_t seed)
+ \since 6.5.3
+
+ Returns the hash value for the \a key, using \a seed to seed the
+ calculation.
+
+ \note In Qt versions before 6.5, this operation was provided by the
+ qHash(float) overload. In Qt versions 6.5.0 to 6.5.2, this functionality
+ was broken in various ways. In Qt versions 6.5.3 and 6.6 onwards, this
+ overload restores the Qt 6.4 behavior.
+*/
+
+#ifndef QT_NO_DATASTREAM
+/*!
+ \fn qfloat16::operator<<(QDataStream &ds, qfloat16 f)
+ \relates QDataStream
+ \since 5.9
+
+ Writes a floating point number, \a f, to the stream \a ds using
+ the standard IEEE 754 format. Returns a reference to the stream.
+
+ \note In Qt versions prior to 6.3, this was a member function on
+ QDataStream.
+*/
+QDataStream &operator<<(QDataStream &ds, qfloat16 f)
+{
+    // serialize the raw 16-bit representation of the value
+    ds << f.b16;
+    return ds;
+}
+
+/*!
+ \fn qfloat16::operator>>(QDataStream &ds, qfloat16 &f)
+ \relates QDataStream
+ \since 5.9
+
+ Reads a floating point number from the stream \a ds into \a f,
+ using the standard IEEE 754 format. Returns a reference to the
+ stream.
+
+ \note In Qt versions prior to 6.3, this was a member function on
+ QDataStream.
+*/
+QDataStream &operator>>(QDataStream &ds, qfloat16 &f)
+{
+    // read the raw 16-bit representation straight into the value
+    ds >> f.b16;
+    return ds;
+}
+#endif
+
+QTextStream &operator>>(QTextStream &ts, qfloat16 &f16)
+{
+    // parse the text as a float, then convert it to half precision
+    float parsed;
+    ts >> parsed;
+    f16 = qfloat16(parsed);
+    return ts;
+}
+
+QTextStream &operator<<(QTextStream &ts, qfloat16 f)
+{
+    // widen to float and write it with the stream's float formatting
+    const float widened = float(f);
+    ts << widened;
+    return ts;
+}
+
QT_END_NAMESPACE
#include "qfloat16tables.cpp"
-#ifdef QFLOAT16_INCLUDE_FAST
-# include "qfloat16_f16c.c"
-#endif