summary refs log tree commit diff stats
diff options
context:
space:
mode:
authorThiago Macieira <thiago.macieira@intel.com>2019-01-09 20:24:32 -0800
committerAllan Sandfeld Jensen <allan.jensen@qt.io>2019-01-15 21:52:46 +0000
commitf7a7a49f9235c9375fc515a3062341f285f3c2c3 (patch)
treedf90884c90d1d129f05d3ec9657d487c9bab1218
parentcacf2ad9229a6842dbc0e002ed8ba4d04db026ae (diff)
Fix the AVX2 ARGB->ARGB64 conversion code
Commit c8c5ff19de1c34a99b8315e59015d115957b3584 introduced the solution as a simple scaling up of the code in qdrawhelper_sse4.cpp, but it's bad due to the way that the 256-bit unpack instructions work: the unpack-low instruction unpacks the lower half of each half of the 256-bit register. So we fix it up by inserting a permute4x64 that swaps the middle two quarters of the 256-bit register (permute8x32 requires a __m256i parameter, instead of an immediate). This introduces an instruction that costs 3 cycles in each loop, but since the AVX2 code has double the throughput compared to SSE4 code, it should still be faster. This problem does not affect the ARGB->ARGB32 code because that repacks at the end. Change-Id: I4d4dadb709f1482fa8ccfffd1578620b45166a4f Reviewed-by: Allan Sandfeld Jensen <allan.jensen@qt.io>
-rw-r--r--src/gui/painting/qdrawhelper_avx2.cpp19
1 file changed, 14 insertions, 5 deletions
diff --git a/src/gui/painting/qdrawhelper_avx2.cpp b/src/gui/painting/qdrawhelper_avx2.cpp
index d8732dc29f..21e07bb2dc 100644
--- a/src/gui/painting/qdrawhelper_avx2.cpp
+++ b/src/gui/painting/qdrawhelper_avx2.cpp
@@ -1146,9 +1146,20 @@ static void convertARGBToRGBA64PM_avx2(QRgba64 *buffer, const uint *src, qsizety
__m256i src1, src2;
__m256i srcVector = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + i));
if (!_mm256_testz_si256(srcVector, alphaMask)) {
- if (!_mm256_testc_si256(srcVector, alphaMask)) {
- if (!RGBA)
- srcVector = _mm256_shuffle_epi8(srcVector, rgbaMask);
+ // keep the two _mm_test[zc]_siXXX next to each other
+ bool cf = _mm256_testc_si256(srcVector, alphaMask);
+ if (!RGBA)
+ srcVector = _mm256_shuffle_epi8(srcVector, rgbaMask);
+
+ // The two unpack instructions unpack the low and upper halves of
+ // each 128-bit half of the 256-bit register. Here's the tracking
+ // of what's where: (p is 32-bit, P is 64-bit)
+ // as loaded: [ p1, p2, p3, p4; p5, p6, p7, p8 ]
+ // after permute4x64 [ p1, p2, p5, p6; p3, p4, p7, p8 ]
+ // after unpacklo/hi [ P1, P2; P3, P4 ] [ P5, P6; P7, P8 ]
+ srcVector = _mm256_permute4x64_epi64(srcVector, _MM_SHUFFLE(3, 1, 2, 0));
+
+ if (!cf) {
src1 = _mm256_unpacklo_epi8(srcVector, zero);
src2 = _mm256_unpackhi_epi8(srcVector, zero);
__m256i alpha1 = _mm256_shuffle_epi8(src1, shuffleMask);
@@ -1162,8 +1173,6 @@ static void convertARGBToRGBA64PM_avx2(QRgba64 *buffer, const uint *src, qsizety
src1 = _mm256_blend_epi16(src1, alpha1, 0x88);
src2 = _mm256_blend_epi16(src2, alpha2, 0x88);
} else {
- if (!RGBA)
- srcVector = _mm256_shuffle_epi8(srcVector, rgbaMask);
src1 = _mm256_unpacklo_epi8(srcVector, srcVector);
src2 = _mm256_unpackhi_epi8(srcVector, srcVector);
}