From ce411512151e7cab12ba64b53fac72b9c578369d Mon Sep 17 00:00:00 2001 From: Thiago Macieira Date: Thu, 27 Jan 2022 21:08:04 -0800 Subject: qfloat16/F16C: rewrite the F16C conversion with overlapping We don't have to be completely sequential. It's perfectly fine to overlap loads and stores and this leads to simpler code. Change-Id: I6fcda969a9e9427198bffffd16ce56ebbc948379 Reviewed-by: Allan Sandfeld Jensen --- src/corelib/global/qfloat16.cpp | 76 +++++++++++++++++++++++++++++++++-------- 1 file changed, 62 insertions(+), 14 deletions(-) diff --git a/src/corelib/global/qfloat16.cpp b/src/corelib/global/qfloat16.cpp index 902a1aebe2..366edc2ad9 100644 --- a/src/corelib/global/qfloat16.cpp +++ b/src/corelib/global/qfloat16.cpp @@ -202,32 +202,80 @@ static inline bool hasFastF16() QT_FUNCTION_TARGET(F16C) static void qFloatToFloat16_fast(quint16 *out, const float *in, qsizetype len) noexcept { + constexpr qsizetype Step = sizeof(__m256i) / sizeof(float); + constexpr qsizetype HalfStep = sizeof(__m128i) / sizeof(float); qsizetype i = 0; - int epilog_i; - for (; i < len - 7; i += 8) - _mm_storeu_si128((__m128i *)(out + i), _mm256_cvtps_ph(_mm256_loadu_ps(in + i), 0)); - if (i < len - 3) { - _mm_storel_epi64((__m128i *)(out + i), _mm_cvtps_ph(_mm_loadu_ps(in + i), 0)); - i += 4; + + if (len >= Step) { + auto convertOneChunk = [=](qsizetype offset) QT_FUNCTION_TARGET(F16C) { + __m256 f32 = _mm256_loadu_ps(in + offset); + __m128i f16 = _mm256_cvtps_ph(f32, _MM_FROUND_TO_NEAREST_INT); + _mm_storeu_si128(reinterpret_cast<__m128i *>(out + offset), f16); + }; + + // main loop: convert Step (8) floats per iteration + for ( ; i + Step < len; i += Step) + convertOneChunk(i); + + // epilogue: convert the last chunk, possibly overlapping with the last + // iteration of the loop + return convertOneChunk(len - Step); } + + if (len >= HalfStep) { + auto convertOneChunk = [=](qsizetype offset) QT_FUNCTION_TARGET(F16C) { + __m128 f32 = _mm_loadu_ps(in + offset); + __m128i f16 = _mm_cvtps_ph(f32, _MM_FROUND_TO_NEAREST_INT); + _mm_storel_epi64(reinterpret_cast<__m128i *>(out + offset), f16); + }; + + // two conversions, possibly overlapping + convertOneChunk(0); + return convertOneChunk(len - HalfStep); + } + // Inlining "qfloat16::qfloat16(float f)": - for (epilog_i = 0; i < len && epilog_i < 3; ++i, ++epilog_i) + for ( ; i < len; ++i) out[i] = _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(in[i]), 0), 0); } QT_FUNCTION_TARGET(F16C) static void qFloatFromFloat16_fast(float *out, const quint16 *in, qsizetype len) noexcept { + constexpr qsizetype Step = sizeof(__m256i) / sizeof(float); + constexpr qsizetype HalfStep = sizeof(__m128i) / sizeof(float); qsizetype i = 0; - int epilog_i; - for (; i < len - 7; i += 8) - _mm256_storeu_ps(out + i, _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(in + i)))); - if (i < len - 3) { - _mm_storeu_ps(out + i, _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)(in + i)))); - i += 4; + + if (len >= Step) { + auto convertOneChunk = [=](qsizetype offset) QT_FUNCTION_TARGET(F16C) { + __m128i f16 = _mm_loadu_si128(reinterpret_cast(in + offset)); + __m256 f32 = _mm256_cvtph_ps(f16); + _mm256_storeu_ps(out + offset, f32); + }; + + // main loop: convert Step (8) floats per iteration + for ( ; i + Step < len; i += Step) + convertOneChunk(i); + + // epilogue: convert the last chunk, possibly overlapping with the last + // iteration of the loop + return convertOneChunk(len - Step); } + + if (len >= HalfStep) { + auto convertOneChunk = [=](qsizetype offset) QT_FUNCTION_TARGET(F16C) { + __m128i f16 = _mm_loadl_epi64(reinterpret_cast(in + offset)); + __m128 f32 = _mm_cvtph_ps(f16); + _mm_storeu_ps(out + offset, f32); + }; + + // two conversions, possibly overlapping + convertOneChunk(0); + return convertOneChunk(len - HalfStep); + } + // Inlining "qfloat16::operator float()": - for (epilog_i = 0; i < len && epilog_i < 3; ++i, ++epilog_i) + for ( ; i < len; ++i) out[i] = _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(in[i]))); } -- cgit v1.2.3