summaryrefslogtreecommitdiffstats
path: root/src/corelib/global/qfloat16.cpp
diff options
context:
space:
mode:
authorThiago Macieira <thiago.macieira@intel.com>2020-07-29 22:11:46 -0700
committerThiago Macieira <thiago.macieira@intel.com>2022-02-01 08:31:15 -0800
commitac7f73e507be238a9c573c1e082713c4f3449e36 (patch)
treec14f66c22dfc4d1931e1f7339922226515218f9a /src/corelib/global/qfloat16.cpp
parent8247c0dac3a54c838385c5d10eaabc1a084993ea (diff)
qfloat16: add AVX512VL qfloat16 conversion tail
Slightly simpler implementation for the tail because of the OpMask support. Change-Id: I60fdef243d0c4e04890dfffd16266facd53d78aa Reviewed-by: Allan Sandfeld Jensen <allan.jensen@qt.io>
Diffstat (limited to 'src/corelib/global/qfloat16.cpp')
-rw-r--r--src/corelib/global/qfloat16.cpp37
1 files changed, 37 insertions, 0 deletions
diff --git a/src/corelib/global/qfloat16.cpp b/src/corelib/global/qfloat16.cpp
index 366edc2ad9..33f5535a79 100644
--- a/src/corelib/global/qfloat16.cpp
+++ b/src/corelib/global/qfloat16.cpp
@@ -192,6 +192,19 @@ int qfloat16::fpClassify() const noexcept
*/
#if QT_COMPILER_SUPPORTS(F16C)
+#define QT_FUNCTION_TARGET_STRING_AVX512VLBW \
+ QT_FUNCTION_TARGET_STRING_AVX512VL "," \
+ QT_FUNCTION_TARGET_STRING_AVX512BW "," \
+ QT_FUNCTION_TARGET_STRING_F16C "," \
+ QT_FUNCTION_TARGET_STRING_BMI2 /* BMI2 for BZHI */
+
+static bool hasFastF16Avx256()
+{
+ // 256-bit AVX512 don't have a performance penalty (see qstring.cpp for more info)
+ constexpr quint64 CpuFeatureAVX512VLBW = CpuFeatureAVX512BW | CpuFeatureAVX512VL;
+ return qCpuHasFeature(AVX512VLBW);
+}
+
static inline bool hasFastF16()
{
// qsimd.cpp:detectProcessorFeatures() turns off this feature if AVX
@@ -199,6 +212,24 @@ static inline bool hasFastF16()
return qCpuHasFeature(F16C);
}
+static QT_FUNCTION_TARGET(AVX512VLBW)
+void qFloatToFloat16_tail_avx256(quint16 *out, const float *in, qsizetype len) noexcept
+{
+ __mmask16 mask = _bzhi_u32(-1, len);
+ __m256 f32 = _mm256_maskz_loadu_ps(mask, in );
+ __m128i f16 = _mm256_maskz_cvtps_ph(mask, f32, _MM_FROUND_TO_NEAREST_INT);
+ _mm_mask_storeu_epi16(out, mask, f16);
+};
+
+static QT_FUNCTION_TARGET(AVX512VLBW)
+void qFloatFromFloat16_tail_avx256(float *out, const quint16 *in, qsizetype len) noexcept
+{
+ __mmask16 mask = _bzhi_u32(-1, len);
+ __m128i f16 = _mm_maskz_loadu_epi16(mask, in);
+ __m256 f32 = _mm256_cvtph_ps(f16);
+ _mm256_mask_storeu_ps(out, mask, f32);
+};
+
QT_FUNCTION_TARGET(F16C)
static void qFloatToFloat16_fast(quint16 *out, const float *in, qsizetype len) noexcept
{
@@ -222,6 +253,9 @@ static void qFloatToFloat16_fast(quint16 *out, const float *in, qsizetype len) n
return convertOneChunk(len - Step);
}
+ if (hasFastF16Avx256())
+ return qFloatToFloat16_tail_avx256(out, in, len);
+
if (len >= HalfStep) {
auto convertOneChunk = [=](qsizetype offset) QT_FUNCTION_TARGET(F16C) {
__m128 f32 = _mm_loadu_ps(in + offset);
@@ -262,6 +296,9 @@ static void qFloatFromFloat16_fast(float *out, const quint16 *in, qsizetype len)
return convertOneChunk(len - Step);
}
+ if (hasFastF16Avx256())
+ return qFloatFromFloat16_tail_avx256(out, in, len);
+
if (len >= HalfStep) {
auto convertOneChunk = [=](qsizetype offset) QT_FUNCTION_TARGET(F16C) {
__m128i f16 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(in + offset));