diff options
author | Thiago Macieira <thiago.macieira@intel.com> | 2020-07-29 22:11:46 -0700 |
---|---|---|
committer | Thiago Macieira <thiago.macieira@intel.com> | 2022-02-01 08:31:15 -0800 |
commit | ac7f73e507be238a9c573c1e082713c4f3449e36 (patch) | |
tree | c14f66c22dfc4d1931e1f7339922226515218f9a /src/corelib/global/qfloat16.cpp | |
parent | 8247c0dac3a54c838385c5d10eaabc1a084993ea (diff) |
qfloat16: add AVX512VL qfloat16 conversion tail
Slightly simpler implementation for the tail because of the OpMask
support.
Change-Id: I60fdef243d0c4e04890dfffd16266facd53d78aa
Reviewed-by: Allan Sandfeld Jensen <allan.jensen@qt.io>
Diffstat (limited to 'src/corelib/global/qfloat16.cpp')
-rw-r--r-- | src/corelib/global/qfloat16.cpp | 37 |
1 files changed, 37 insertions, 0 deletions
diff --git a/src/corelib/global/qfloat16.cpp b/src/corelib/global/qfloat16.cpp index 366edc2ad9..33f5535a79 100644 --- a/src/corelib/global/qfloat16.cpp +++ b/src/corelib/global/qfloat16.cpp @@ -192,6 +192,19 @@ int qfloat16::fpClassify() const noexcept */ #if QT_COMPILER_SUPPORTS(F16C) +#define QT_FUNCTION_TARGET_STRING_AVX512VLBW \ + QT_FUNCTION_TARGET_STRING_AVX512VL "," \ + QT_FUNCTION_TARGET_STRING_AVX512BW "," \ + QT_FUNCTION_TARGET_STRING_F16C "," \ + QT_FUNCTION_TARGET_STRING_BMI2 /* BMI2 for BZHI */ + +static bool hasFastF16Avx256() +{ + // 256-bit AVX512 don't have a performance penalty (see qstring.cpp for more info) + constexpr quint64 CpuFeatureAVX512VLBW = CpuFeatureAVX512BW | CpuFeatureAVX512VL; + return qCpuHasFeature(AVX512VLBW); +} + static inline bool hasFastF16() { // qsimd.cpp:detectProcessorFeatures() turns off this feature if AVX @@ -199,6 +212,24 @@ static inline bool hasFastF16() return qCpuHasFeature(F16C); } +static QT_FUNCTION_TARGET(AVX512VLBW) +void qFloatToFloat16_tail_avx256(quint16 *out, const float *in, qsizetype len) noexcept +{ + __mmask16 mask = _bzhi_u32(-1, len); + __m256 f32 = _mm256_maskz_loadu_ps(mask, in ); + __m128i f16 = _mm256_maskz_cvtps_ph(mask, f32, _MM_FROUND_TO_NEAREST_INT); + _mm_mask_storeu_epi16(out, mask, f16); +}; + +static QT_FUNCTION_TARGET(AVX512VLBW) +void qFloatFromFloat16_tail_avx256(float *out, const quint16 *in, qsizetype len) noexcept +{ + __mmask16 mask = _bzhi_u32(-1, len); + __m128i f16 = _mm_maskz_loadu_epi16(mask, in); + __m256 f32 = _mm256_cvtph_ps(f16); + _mm256_mask_storeu_ps(out, mask, f32); +}; + QT_FUNCTION_TARGET(F16C) static void qFloatToFloat16_fast(quint16 *out, const float *in, qsizetype len) noexcept { @@ -222,6 +253,9 @@ static void qFloatToFloat16_fast(quint16 *out, const float *in, qsizetype len) n return convertOneChunk(len - Step); } + if (hasFastF16Avx256()) + return qFloatToFloat16_tail_avx256(out, in, len); + if (len >= HalfStep) { auto convertOneChunk = [=](qsizetype offset) QT_FUNCTION_TARGET(F16C) { __m128 f32 = _mm_loadu_ps(in + offset); @@ -262,6 +296,9 @@ static void qFloatFromFloat16_fast(float *out, const quint16 *in, qsizetype len) return convertOneChunk(len - Step); } + if (hasFastF16Avx256()) + return qFloatFromFloat16_tail_avx256(out, in, len); + if (len >= HalfStep) { auto convertOneChunk = [=](qsizetype offset) QT_FUNCTION_TARGET(F16C) { __m128i f16 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(in + offset)); |