From 506aa694a9e466f04c392d814b02c2130611dce6 Mon Sep 17 00:00:00 2001 From: Allan Sandfeld Jensen Date: Wed, 16 Nov 2016 16:25:11 +0100 Subject: Cleanup convert_ARGB_to_ARGB_PM_inplace_sse2 Changes it to follow standard SIMD patterns so it can use ALIGNMENT_PROLOGUE_16BYTES and SIMD_EPILOGUE helpers. Should also improve performance by using aligned memory access. Change-Id: I14a48b82e3f3de83bd7572aa82bed07f28ad944c Reviewed-by: Erik Verbruggen --- src/gui/image/qimage_sse2.cpp | 59 +++++++++++++++++++++++++++---------------- 1 file changed, 37 insertions(+), 22 deletions(-) (limited to 'src') diff --git a/src/gui/image/qimage_sse2.cpp b/src/gui/image/qimage_sse2.cpp index 0fb92e9d43..8f7195e0b5 100644 --- a/src/gui/image/qimage_sse2.cpp +++ b/src/gui/image/qimage_sse2.cpp @@ -51,51 +51,66 @@ bool convert_ARGB_to_ARGB_PM_inplace_sse2(QImageData *data, Qt::ImageConversionF { Q_ASSERT(data->format == QImage::Format_ARGB32 || data->format == QImage::Format_RGBA8888); - // extra pixels on each line - const int spare = data->width & 3; - // width in pixels of the pad at the end of each line - const int pad = (data->bytes_per_line >> 2) - data->width; - const int iter = data->width >> 2; - int height = data->height; + const int width = data->width; + const int height = data->height; + const int bpl = data->bytes_per_line; const __m128i alphaMask = _mm_set1_epi32(0xff000000); const __m128i nullVector = _mm_setzero_si128(); const __m128i half = _mm_set1_epi16(0x80); const __m128i colorMask = _mm_set1_epi32(0x00ff00ff); - __m128i *d = reinterpret_cast<__m128i*>(data->data); - while (height--) { - const __m128i *end = d + iter; - - for (; d != end; ++d) { - const __m128i srcVector = _mm_loadu_si128(d); + uchar *d = data->data; + for (int y = 0; y < height; ++y) { + int i = 0; + quint32 *d32 = reinterpret_cast<quint32 *>(d); + ALIGNMENT_PROLOGUE_16BYTES(d, i, width) { + const quint32 p = d32[i]; + if (p <= 0x00ffffff) + d32[i] = 0; + else if (p < 0xff000000) + d32[i] = qPremultiply(p); 
+ } + __m128i *d128 = reinterpret_cast<__m128i *>(d32 + i); + for (; i < (width - 3); i += 4) { + const __m128i srcVector = _mm_load_si128(d128); +#ifdef __SSE4_1__ + if (_mm_testc_si128(srcVector, alphaMask)) { + // opaque, data is unchanged + } else if (_mm_testz_si128(srcVector, alphaMask)) { + // fully transparent + _mm_store_si128(d128, nullVector); + } else { + const __m128i srcVectorAlpha = _mm_and_si128(srcVector, alphaMask); +#else const __m128i srcVectorAlpha = _mm_and_si128(srcVector, alphaMask); if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, alphaMask)) == 0xffff) { // opaque, data is unchanged } else if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, nullVector)) == 0xffff) { // fully transparent - _mm_storeu_si128(d, nullVector); + _mm_store_si128(d128, nullVector); } else { +#endif __m128i alphaChannel = _mm_srli_epi32(srcVector, 24); alphaChannel = _mm_or_si128(alphaChannel, _mm_slli_epi32(alphaChannel, 16)); __m128i result; BYTE_MUL_SSE2(result, srcVector, alphaChannel, colorMask, half); result = _mm_or_si128(_mm_andnot_si128(alphaMask, result), srcVectorAlpha); - _mm_storeu_si128(d, result); + _mm_store_si128(d128, result); } + d128++; } - QRgb *p = reinterpret_cast<QRgb*>(d); - QRgb *pe = p+spare; - for (; p != pe; ++p) { - if (*p < 0x00ffffff) - *p = 0; - else if (*p < 0xff000000) - *p = qPremultiply(*p); + SIMD_EPILOGUE(i, width, 3) { + const quint32 p = d32[i]; + if (p <= 0x00ffffff) + d32[i] = 0; + else if (p < 0xff000000) + d32[i] = qPremultiply(p); } - d = reinterpret_cast<__m128i*>(p+pad); + d += bpl; } if (data->format == QImage::Format_ARGB32) -- cgit v1.2.3