summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorThiago Macieira <thiago.macieira@intel.com>2022-01-27 21:08:04 -0800
committerThiago Macieira <thiago.macieira@intel.com>2022-01-28 22:40:25 -0800
commitce411512151e7cab12ba64b53fac72b9c578369d (patch)
treea7570e29ad35732bc78abb5a0fede1d2b013b7aa
parent073454901de1b66795b29a708ac939026ea2f41d (diff)
qfloat16/F16C: rewrite the F16C conversion with overlapping
We don't have to be completely sequential. It's perfectly fine to overlap loads and stores and this leads to simpler code. Change-Id: I6fcda969a9e9427198bffffd16ce56ebbc948379 Reviewed-by: Allan Sandfeld Jensen <allan.jensen@qt.io>
-rw-r--r--src/corelib/global/qfloat16.cpp76
1 files changed, 62 insertions, 14 deletions
diff --git a/src/corelib/global/qfloat16.cpp b/src/corelib/global/qfloat16.cpp
index 902a1aebe2..366edc2ad9 100644
--- a/src/corelib/global/qfloat16.cpp
+++ b/src/corelib/global/qfloat16.cpp
@@ -202,32 +202,80 @@ static inline bool hasFastF16()
QT_FUNCTION_TARGET(F16C)
static void qFloatToFloat16_fast(quint16 *out, const float *in, qsizetype len) noexcept
{
+ constexpr qsizetype Step = sizeof(__m256i) / sizeof(float);
+ constexpr qsizetype HalfStep = sizeof(__m128i) / sizeof(float);
qsizetype i = 0;
- int epilog_i;
- for (; i < len - 7; i += 8)
- _mm_storeu_si128((__m128i *)(out + i), _mm256_cvtps_ph(_mm256_loadu_ps(in + i), 0));
- if (i < len - 3) {
- _mm_storel_epi64((__m128i *)(out + i), _mm_cvtps_ph(_mm_loadu_ps(in + i), 0));
- i += 4;
+
+ if (len >= Step) {
+ auto convertOneChunk = [=](qsizetype offset) QT_FUNCTION_TARGET(F16C) {
+ __m256 f32 = _mm256_loadu_ps(in + offset);
+ __m128i f16 = _mm256_cvtps_ph(f32, _MM_FROUND_TO_NEAREST_INT);
+ _mm_storeu_si128(reinterpret_cast<__m128i *>(out + offset), f16);
+ };
+
+ // main loop: convert Step (8) floats per iteration
+ for ( ; i + Step < len; i += Step)
+ convertOneChunk(i);
+
+ // epilogue: convert the last chunk, possibly overlapping with the last
+ // iteration of the loop
+ return convertOneChunk(len - Step);
}
+
+ if (len >= HalfStep) {
+ auto convertOneChunk = [=](qsizetype offset) QT_FUNCTION_TARGET(F16C) {
+ __m128 f32 = _mm_loadu_ps(in + offset);
+ __m128i f16 = _mm_cvtps_ph(f32, _MM_FROUND_TO_NEAREST_INT);
+ _mm_storel_epi64(reinterpret_cast<__m128i *>(out + offset), f16);
+ };
+
+ // two conversions, possibly overlapping
+ convertOneChunk(0);
+ return convertOneChunk(len - HalfStep);
+ }
+
// Inlining "qfloat16::qfloat16(float f)":
- for (epilog_i = 0; i < len && epilog_i < 3; ++i, ++epilog_i)
+ for ( ; i < len; ++i)
out[i] = _mm_extract_epi16(_mm_cvtps_ph(_mm_set_ss(in[i]), 0), 0);
}
QT_FUNCTION_TARGET(F16C)
static void qFloatFromFloat16_fast(float *out, const quint16 *in, qsizetype len) noexcept
{
+ constexpr qsizetype Step = sizeof(__m256i) / sizeof(float);
+ constexpr qsizetype HalfStep = sizeof(__m128i) / sizeof(float);
qsizetype i = 0;
- int epilog_i;
- for (; i < len - 7; i += 8)
- _mm256_storeu_ps(out + i, _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(in + i))));
- if (i < len - 3) {
- _mm_storeu_ps(out + i, _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)(in + i))));
- i += 4;
+
+ if (len >= Step) {
+ auto convertOneChunk = [=](qsizetype offset) QT_FUNCTION_TARGET(F16C) {
+ __m128i f16 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(in + offset));
+ __m256 f32 = _mm256_cvtph_ps(f16);
+ _mm256_storeu_ps(out + offset, f32);
+ };
+
+ // main loop: convert Step (8) floats per iteration
+ for ( ; i + Step < len; i += Step)
+ convertOneChunk(i);
+
+ // epilogue: convert the last chunk, possibly overlapping with the last
+ // iteration of the loop
+ return convertOneChunk(len - Step);
}
+
+ if (len >= HalfStep) {
+ auto convertOneChunk = [=](qsizetype offset) QT_FUNCTION_TARGET(F16C) {
+ __m128i f16 = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(in + offset));
+ __m128 f32 = _mm_cvtph_ps(f16);
+ _mm_storeu_ps(out + offset, f32);
+ };
+
+ // two conversions, possibly overlapping
+ convertOneChunk(0);
+ return convertOneChunk(len - HalfStep);
+ }
+
// Inlining "qfloat16::operator float()":
- for (epilog_i = 0; i < len && epilog_i < 3; ++i, ++epilog_i)
+ for ( ; i < len; ++i)
out[i] = _mm_cvtss_f32(_mm_cvtph_ps(_mm_cvtsi32_si128(in[i])));
}