From 612e4c5233c489603165f046d2d76935153b4169 Mon Sep 17 00:00:00 2001
From: Thiago Macieira <thiago.macieira@intel.com>
Date: Wed, 9 Jan 2019 20:40:18 -0800
Subject: Use VPMASKMOV in the ARGB->ARGB{32,64} AVX2 epilogues

Instead of stepping down to 4 pixels, then 2, then 1, with essentially
the same code, let's use maskload and maskstore to only load and store
the effective portions (instructions new in AVX2). The secondary loop
gets run at most twice, since there can be at most 7 pixels left.

This fixes an off-by-4 bug in the previous implementation (lines 1041
and 1186 should have had 7 instead of 3).

Change-Id: I4d4dadb709f1482fa8ccfffd157862e77ac508f6
Reviewed-by: Allan Sandfeld Jensen <allan.jensen@qt.io>
---
 src/gui/painting/qdrawhelper_avx2.cpp | 144 +++++++++++-----------------
 1 file changed, 47 insertions(+), 97 deletions(-)

(limited to 'src/gui/painting/qdrawhelper_avx2.cpp')
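
Before the diff itself, a rough standalone sketch of the masking trick the new
maskFromCount() helper relies on. This sketch is not part of the patch and not
Qt API: maskForBytes(), src and dst below are made-up illustrative names; only
the <immintrin.h> intrinsics are real AVX2 calls, and the code assumes a build
with AVX2 enabled (e.g. -mavx2). The idea: slide a 16-byte load window over a
32-byte table of 0xff bytes followed by 0x00 bytes so that exactly the
requested number of low mask bytes are set, then let VPMASKMOV
(_mm_maskload_epi32 / _mm_maskstore_epi32) touch only those lanes.

    #include <immintrin.h>
    #include <cstdint>
    #include <cstdio>

    // Build a mask whose low 'bytes' lanes are 0xff and the rest 0x00 by
    // loading 16 bytes from a sliding window over a {-1, -1, 0, 0} table.
    static inline __m128i maskForBytes(int bytes)
    {
        static const int64_t table[] = { -1, -1, 0, 0 };
        auto p = reinterpret_cast<const unsigned char *>(table) + sizeof(__m128i);
        if (bytes >= int(sizeof(__m128i)))
            return _mm_set1_epi8(-1);       // 16 bytes or more: all lanes enabled
        return _mm_loadu_si128(reinterpret_cast<const __m128i *>(p - bytes));
    }

    int main()
    {
        uint32_t src[7] = { 1, 2, 3, 4, 5, 6, 7 };  // 7 leftover pixels, as in the epilogue
        uint32_t dst[7] = {};

        for (int i = 0; i < 7; i += 4) {
            // Only (7 - i) pixels are valid; masked-off lanes are neither read
            // nor written, so the last pass cannot run past the arrays' end.
            __m128i mask = maskForBytes((7 - i) * int(sizeof(*src)));
            __m128i v = _mm_maskload_epi32(reinterpret_cast<const int *>(src + i), mask);
            _mm_maskstore_epi32(reinterpret_cast<int *>(dst + i), mask, v);
        }

        for (uint32_t v : dst)
            std::printf("%u ", v);                  // prints: 1 2 3 4 5 6 7
        std::printf("\n");
    }

With 7 leftover pixels this loop runs twice (4 pixels, then 3), which is the
"at most twice" behaviour the commit message describes.
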
diff --git a/src/gui/painting/qdrawhelper_avx2.cpp b/src/gui/painting/qdrawhelper_avx2.cpp
index 21e07bb2dc..8c69e21569 100644
--- a/src/gui/painting/qdrawhelper_avx2.cpp
+++ b/src/gui/painting/qdrawhelper_avx2.cpp
@@ -995,6 +995,18 @@ void QT_FASTCALL fetchTransformedBilinearARGB32PM_fast_rotate_helper_avx2(uint *
     }
 }
 
+static inline __m128i maskFromCount(qsizetype count)
+{
+    Q_ASSERT(count > 0);
+    static const qint64 data[] = { -1, -1, 0, 0 };
+    auto ptr = reinterpret_cast<const uchar *>(data) + sizeof(__m128i);
+
+    if (count > int(sizeof(__m128i)))
+        return _mm_set1_epi8(-1);
+
+    return _mm_loadu_si128(reinterpret_cast<const __m128i *>(ptr - count));
+}
+
 template<bool RGBA>
 static void convertARGBToARGB32PM_avx2(uint *buffer, const uint *src, qsizetype count)
 {
@@ -1008,9 +1020,11 @@ static void convertARGBToARGB32PM_avx2(uint *buffer, const uint *src, qsizetype
     for (; i < count - 7; i += 8) {
         __m256i srcVector = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + i));
         if (!_mm256_testz_si256(srcVector, alphaMask)) {
-            if (!_mm256_testc_si256(srcVector, alphaMask)) {
-                if (RGBA)
-                    srcVector = _mm256_shuffle_epi8(srcVector, rgbaMask);
+            // keep the two _mm_test[zc]_siXXX next to each other
+            bool cf = _mm256_testc_si256(srcVector, alphaMask);
+            if (RGBA)
+                srcVector = _mm256_shuffle_epi8(srcVector, rgbaMask);
+            if (!cf) {
                 __m256i src1 = _mm256_unpacklo_epi8(srcVector, zero);
                 __m256i src2 = _mm256_unpackhi_epi8(srcVector, zero);
                 __m256i alpha1 = _mm256_shuffle_epi8(src1, shuffleMask);
@@ -1028,8 +1042,6 @@ static void convertARGBToARGB32PM_avx2(uint *buffer, const uint *src, qsizetype
                 srcVector = _mm256_packus_epi16(src1, src2);
                 _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + i), srcVector);
             } else {
-                if (RGBA)
-                    srcVector = _mm256_shuffle_epi8(srcVector, rgbaMask);
                 if (buffer != src || RGBA)
                     _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + i), srcVector);
             }
@@ -1038,12 +1050,18 @@ static void convertARGBToARGB32PM_avx2(uint *buffer, const uint *src, qsizetype
             }
         }
     }
-    if (i < count - 3) {
-        __m128i srcVector = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + i));
-        if (!_mm_testz_si128(srcVector, _mm256_castsi256_si128(alphaMask))) {
-            if (!_mm_testc_si128(srcVector, _mm256_castsi256_si128(alphaMask))) {
-                if (RGBA)
-                    srcVector = _mm_shuffle_epi8(srcVector, _mm256_castsi256_si128(rgbaMask));
+    for ( ; i < count; i += 4) {
+        __m128i maskedAlphaMask = _mm256_castsi256_si128(alphaMask);
+        __m128i mask = maskFromCount((count - i) * sizeof(*src));
+        maskedAlphaMask = _mm_and_si128(mask, maskedAlphaMask);
+        __m128i srcVector = _mm_maskload_epi32(reinterpret_cast<const int *>(src + i), mask);
+
+        if (!_mm_testz_si128(srcVector, maskedAlphaMask)) {
+            // keep the two _mm_test[zc]_siXXX next to each other
+            bool cf = _mm_testc_si128(srcVector, maskedAlphaMask);
+            if (RGBA)
+                srcVector = _mm_shuffle_epi8(srcVector, _mm256_castsi256_si128(rgbaMask));
+            if (!cf) {
                 __m128i src1 = _mm_unpacklo_epi8(srcVector, _mm256_castsi256_si128(zero));
                 __m128i src2 = _mm_unpackhi_epi8(srcVector, _mm256_castsi256_si128(zero));
                 __m128i alpha1 = _mm_shuffle_epi8(src1, _mm256_castsi256_si128(shuffleMask));
@@ -1058,54 +1076,15 @@ static void convertARGBToARGB32PM_avx2(uint *buffer, const uint *src, qsizetype
                 src2 = _mm_srli_epi16(src2, 8);
                 src1 = _mm_blend_epi16(src1, alpha1, 0x88);
                 src2 = _mm_blend_epi16(src2, alpha2, 0x88);
-                _mm_storeu_si128(reinterpret_cast<__m128i *>(buffer + i), srcVector);
+                srcVector = _mm_packus_epi16(src1, src2);
+                _mm_maskstore_epi32(reinterpret_cast<int *>(buffer + i), mask, srcVector);
             } else {
-                if (RGBA)
-                    srcVector = _mm_shuffle_epi8(srcVector, _mm256_castsi256_si128(rgbaMask));
                 if (buffer != src || RGBA)
-                    _mm_storeu_si128(reinterpret_cast<__m128i *>(buffer + i), srcVector);
-            }
-        } else {
-            _mm_storeu_si128(reinterpret_cast<__m128i *>(buffer + i), _mm256_castsi256_si128(zero));
-        }
-        i += 4;
-    }
-
-    auto convert_half = [=](__m128i &srcVector) {
-        if (!_mm_testz_si128(srcVector, _mm256_castsi256_si128(alphaMask))) {
-            if (!_mm_testc_si128(srcVector, _mm256_castsi256_si128(alphaMask))) {
-                if (RGBA)
-                    srcVector = _mm_shuffle_epi8(srcVector, _mm256_castsi256_si128(rgbaMask));
-                __m128i src1 = _mm_unpacklo_epi8(srcVector, _mm256_castsi256_si128(zero));
-                __m128i alpha1 = _mm_shuffle_epi8(src1, _mm256_castsi256_si128(shuffleMask));
-                src1 = _mm_mullo_epi16(src1, alpha1);
-                src1 = _mm_add_epi16(src1, _mm_srli_epi16(src1, 8));
-                src1 = _mm_add_epi16(src1, _mm256_castsi256_si128(half));
-                src1 = _mm_srli_epi16(src1, 8);
-                src1 = _mm_blend_epi16(src1, alpha1, 0x88);
-                srcVector = _mm_packus_epi16(src1, src1);
-                return true;
-            } else {
-                if (RGBA)
-                    srcVector = _mm_shuffle_epi8(srcVector, _mm256_castsi256_si128(rgbaMask));
-                return buffer != src || RGBA;
+                    _mm_maskstore_epi32(reinterpret_cast<int *>(buffer + i), mask, srcVector);
             }
         } else {
-            srcVector = _mm256_castsi256_si128(zero);
-            return true;
+            _mm_maskstore_epi32(reinterpret_cast<int *>(buffer + i), mask, _mm256_castsi256_si128(zero));
         }
-    };
-    if (i < count - 1) {
-        __m128i srcVector = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(src + i));
-        if (convert_half(srcVector))
-            _mm_storel_epi64(reinterpret_cast<__m128i *>(buffer + i), srcVector);
-        i += 2;
-    }
-
-    if (i != count) {
-        __m128i srcVector = _mm_cvtsi32_si128(src[i]);
-        if (convert_half(srcVector))
-            buffer[i] = _mm_cvtsi128_si32(srcVector);
     }
 }
 
@@ -1183,13 +1162,19 @@ static void convertARGBToRGBA64PM_avx2(QRgba64 *buffer, const uint *src, qsizety
         _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + i) + 1, src2);
     }
 
-    if (i < count - 3) {
-        __m128i srcVector = _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + i));
+    for ( ; i < count; i += 4) {
+        __m128i maskedAlphaMask = _mm256_castsi256_si128(alphaMask);
+        __m128i mask = maskFromCount((count - i) * sizeof(*src));
+        maskedAlphaMask = _mm_and_si128(mask, maskedAlphaMask);
+        __m128i srcVector = _mm_maskload_epi32(reinterpret_cast<const int *>(src + i), mask);
         __m256i src;
-        if (!_mm_testz_si128(srcVector, _mm256_castsi256_si128(alphaMask))) {
-            if (!_mm_testc_si128(srcVector, _mm256_castsi256_si128(alphaMask))) {
-                if (!RGBA)
-                    srcVector = _mm_shuffle_epi8(srcVector, _mm256_castsi256_si128(rgbaMask));
+
+        if (!_mm_testz_si128(srcVector, maskedAlphaMask)) {
+            // keep the two _mm_test[zc]_siXXX next to each other
+            bool cf = _mm_testc_si128(srcVector, maskedAlphaMask);
+            if (!RGBA)
+                srcVector = _mm_shuffle_epi8(srcVector, _mm256_castsi256_si128(rgbaMask));
+            if (!cf) {
                 src = _mm256_cvtepu8_epi16(srcVector);
                 __m256i alpha = _mm256_shuffle_epi8(src, shuffleMask);
                 src = _mm256_mullo_epi16(src, alpha);
@@ -1200,8 +1185,6 @@ static void convertARGBToRGBA64PM_avx2(QRgba64 *buffer, const uint *src, qsizety
             src = _mm256_add_epi16(src, _mm256_srli_epi16(src, 7));
             src = _mm256_blend_epi16(src, alpha, 0x88);
         } else {
-            if (!RGBA)
-                srcVector = _mm_shuffle_epi8(srcVector, _mm256_castsi256_si128(rgbaMask));
             const __m128i src1 = _mm_unpacklo_epi8(srcVector, srcVector);
             const __m128i src2 = _mm_unpackhi_epi8(srcVector, srcVector);
             src = _mm256_castsi128_si256(src1);
@@ -1210,42 +1193,9 @@ static void convertARGBToRGBA64PM_avx2(QRgba64 *buffer, const uint *src, qsizety
         } else {
             src = zero;
         }
-        _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + i), src);
-        i += 4;
-    }
-
-    auto convert_half = [=](__m128i &srcVector) {
-        if (!_mm_testz_si128(srcVector, _mm256_castsi256_si128(alphaMask))) {
-            if (!_mm_testc_si128(srcVector, _mm256_castsi256_si128(alphaMask))) {
-                if (!RGBA)
-                    srcVector = _mm_shuffle_epi8(srcVector, _mm256_castsi256_si128(rgbaMask));
-                __m128i src1 = _mm_unpacklo_epi8(srcVector, _mm256_castsi256_si128(zero));
-                __m128i alpha1 = _mm_shuffle_epi8(src1, _mm256_castsi256_si128(shuffleMask));
-                src1 = _mm_mullo_epi16(src1, alpha1);
-                alpha1 = _mm_unpacklo_epi8(srcVector, srcVector);
-                src1 = _mm_add_epi16(src1, _mm_srli_epi16(src1, 7));
-                src1 = _mm_blend_epi16(src1, alpha1, 0x88);
-                return src1;
-            } else {
-                if (!RGBA)
-                    srcVector = _mm_shuffle_epi8(srcVector, _mm256_castsi256_si128(rgbaMask));
-                const __m128i src1 = _mm_unpacklo_epi8(srcVector, srcVector);
-                return src1;
-            }
-        } else {
-            return _mm256_castsi256_si128(zero);
-        }
+        __m256i xmask = _mm256_cvtepi32_epi64(mask);
+        _mm256_maskstore_epi64(reinterpret_cast<qint64 *>(buffer + i), xmask, src);
     };
-    if (i < count - 1) {
-        __m128i srcVector = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(src + i));
-        _mm_storeu_si128(reinterpret_cast<__m128i *>(buffer + i), convert_half(srcVector));
-        i += 2;
-    }
-
-    if (i != count) {
-        __m128i srcVector = _mm_cvtsi32_si128(src[i]);
-        _mm_storel_epi64(reinterpret_cast<__m128i *>(buffer + i), convert_half(srcVector));
-    }
 }
 
 const QRgba64 * QT_FASTCALL convertARGB32ToRGBA64PM_avx2(QRgba64 *buffer, const uint *src, int count,
-- 
cgit v1.2.3