From ac7f73e507be238a9c573c1e082713c4f3449e36 Mon Sep 17 00:00:00 2001 From: Thiago Macieira Date: Wed, 29 Jul 2020 22:11:46 -0700 Subject: qfloat16: add AVX512VL qfloat16 conversion tail Slightly simpler implementation for the tail because of the OpMask support. Change-Id: I60fdef243d0c4e04890dfffd16266facd53d78aa Reviewed-by: Allan Sandfeld Jensen --- src/corelib/global/qfloat16.cpp | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'src/corelib/global/qfloat16.cpp') diff --git a/src/corelib/global/qfloat16.cpp b/src/corelib/global/qfloat16.cpp index 366edc2ad9..33f5535a79 100644 --- a/src/corelib/global/qfloat16.cpp +++ b/src/corelib/global/qfloat16.cpp @@ -192,6 +192,19 @@ int qfloat16::fpClassify() const noexcept */ #if QT_COMPILER_SUPPORTS(F16C) +#define QT_FUNCTION_TARGET_STRING_AVX512VLBW \ + QT_FUNCTION_TARGET_STRING_AVX512VL "," \ + QT_FUNCTION_TARGET_STRING_AVX512BW "," \ + QT_FUNCTION_TARGET_STRING_F16C "," \ + QT_FUNCTION_TARGET_STRING_BMI2 /* BMI2 for BZHI */ + +static bool hasFastF16Avx256() +{ + // 256-bit AVX512 don't have a performance penalty (see qstring.cpp for more info) + constexpr quint64 CpuFeatureAVX512VLBW = CpuFeatureAVX512BW | CpuFeatureAVX512VL; + return qCpuHasFeature(AVX512VLBW); +} + static inline bool hasFastF16() { // qsimd.cpp:detectProcessorFeatures() turns off this feature if AVX @@ -199,6 +212,24 @@ static inline bool hasFastF16() return qCpuHasFeature(F16C); } +static QT_FUNCTION_TARGET(AVX512VLBW) +void qFloatToFloat16_tail_avx256(quint16 *out, const float *in, qsizetype len) noexcept +{ + __mmask16 mask = _bzhi_u32(-1, len); + __m256 f32 = _mm256_maskz_loadu_ps(mask, in ); + __m128i f16 = _mm256_maskz_cvtps_ph(mask, f32, _MM_FROUND_TO_NEAREST_INT); + _mm_mask_storeu_epi16(out, mask, f16); +}; + +static QT_FUNCTION_TARGET(AVX512VLBW) +void qFloatFromFloat16_tail_avx256(float *out, const quint16 *in, qsizetype len) noexcept +{ + __mmask16 mask = _bzhi_u32(-1, len); + __m128i f16 = _mm_maskz_loadu_epi16(mask, in); + __m256 f32 = _mm256_cvtph_ps(f16); + _mm256_mask_storeu_ps(out, mask, f32); +}; + QT_FUNCTION_TARGET(F16C) static void qFloatToFloat16_fast(quint16 *out, const float *in, qsizetype len) noexcept { @@ -222,6 +253,9 @@ static void qFloatToFloat16_fast(quint16 *out, const float *in, qsizetype len) n return convertOneChunk(len - Step); } + if (hasFastF16Avx256()) + return qFloatToFloat16_tail_avx256(out, in, len); + if (len >= HalfStep) { auto convertOneChunk = [=](qsizetype offset) QT_FUNCTION_TARGET(F16C) { __m128 f32 = _mm_loadu_ps(in + offset); @@ -262,6 +296,9 @@ static void qFloatFromFloat16_fast(float *out, const quint16 *in, qsizetype len) return convertOneChunk(len - Step); } + if (hasFastF16Avx256()) + return qFloatFromFloat16_tail_avx256(out, in, len); + if (len >= HalfStep) { auto convertOneChunk = [=](qsizetype offset) QT_FUNCTION_TARGET(F16C) { __m128i f16 = _mm_loadl_epi64(reinterpret_cast(in + offset)); -- cgit v1.2.3