summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author: Allan Sandfeld Jensen <allan.jensen@qt.io> 2016-11-16 16:25:11 +0100
committer: Allan Sandfeld Jensen <allan.jensen@qt.io> 2016-12-06 16:16:53 +0000
commit: 506aa694a9e466f04c392d814b02c2130611dce6 (patch)
tree: 7049df4f7e4ba5e9f8521c816075216fda4c42b8
parent: 0382bb2ab830898fa14b6e42d5ca1b105f6693a7 (diff)
Cleanup convert_ARGB_to_ARGB_PM_inplace_sse2
Rewrite the function to follow the standard SIMD loop pattern so that it can use the ALIGNMENT_PROLOGUE_16BYTES and SIMD_EPILOGUE helpers. This should also improve performance, because the vector loop now uses aligned memory access.

Change-Id: I14a48b82e3f3de83bd7572aa82bed07f28ad944c
Reviewed-by: Erik Verbruggen <erik.verbruggen@qt.io>
-rw-r--r-- src/gui/image/qimage_sse2.cpp 59
1 files changed, 37 insertions, 22 deletions
diff --git a/src/gui/image/qimage_sse2.cpp b/src/gui/image/qimage_sse2.cpp
index 0fb92e9d43..8f7195e0b5 100644
--- a/src/gui/image/qimage_sse2.cpp
+++ b/src/gui/image/qimage_sse2.cpp
@@ -51,51 +51,66 @@ bool convert_ARGB_to_ARGB_PM_inplace_sse2(QImageData *data, Qt::ImageConversionF
{
Q_ASSERT(data->format == QImage::Format_ARGB32 || data->format == QImage::Format_RGBA8888);
- // extra pixels on each line
- const int spare = data->width & 3;
- // width in pixels of the pad at the end of each line
- const int pad = (data->bytes_per_line >> 2) - data->width;
- const int iter = data->width >> 2;
- int height = data->height;
+ const int width = data->width;
+ const int height = data->height;
+ const int bpl = data->bytes_per_line;
const __m128i alphaMask = _mm_set1_epi32(0xff000000);
const __m128i nullVector = _mm_setzero_si128();
const __m128i half = _mm_set1_epi16(0x80);
const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
- __m128i *d = reinterpret_cast<__m128i*>(data->data);
- while (height--) {
- const __m128i *end = d + iter;
-
- for (; d != end; ++d) {
- const __m128i srcVector = _mm_loadu_si128(d);
+ uchar *d = data->data;
+ for (int y = 0; y < height; ++y) {
+ int i = 0;
+ quint32 *d32 = reinterpret_cast<quint32 *>(d);
+ ALIGNMENT_PROLOGUE_16BYTES(d, i, width) {
+ const quint32 p = d32[i];
+ if (p <= 0x00ffffff)
+ d32[i] = 0;
+ else if (p < 0xff000000)
+ d32[i] = qPremultiply(p);
+ }
+ __m128i *d128 = reinterpret_cast<__m128i *>(d32 + i);
+ for (; i < (width - 3); i += 4) {
+ const __m128i srcVector = _mm_load_si128(d128);
+#ifdef __SSE4_1__
+ if (_mm_testc_si128(srcVector, alphaMask)) {
+ // opaque, data is unchanged
+ } else if (_mm_testz_si128(srcVector, alphaMask)) {
+ // fully transparent
+ _mm_store_si128(d128, nullVector);
+ } else {
+ const __m128i srcVectorAlpha = _mm_and_si128(srcVector, alphaMask);
+#else
const __m128i srcVectorAlpha = _mm_and_si128(srcVector, alphaMask);
if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, alphaMask)) == 0xffff) {
// opaque, data is unchanged
} else if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, nullVector)) == 0xffff) {
// fully transparent
- _mm_storeu_si128(d, nullVector);
+ _mm_store_si128(d128, nullVector);
} else {
+#endif
__m128i alphaChannel = _mm_srli_epi32(srcVector, 24);
alphaChannel = _mm_or_si128(alphaChannel, _mm_slli_epi32(alphaChannel, 16));
__m128i result;
BYTE_MUL_SSE2(result, srcVector, alphaChannel, colorMask, half);
result = _mm_or_si128(_mm_andnot_si128(alphaMask, result), srcVectorAlpha);
- _mm_storeu_si128(d, result);
+ _mm_store_si128(d128, result);
}
+ d128++;
}
- QRgb *p = reinterpret_cast<QRgb*>(d);
- QRgb *pe = p+spare;
- for (; p != pe; ++p) {
- if (*p < 0x00ffffff)
- *p = 0;
- else if (*p < 0xff000000)
- *p = qPremultiply(*p);
+ SIMD_EPILOGUE(i, width, 3) {
+ const quint32 p = d32[i];
+ if (p <= 0x00ffffff)
+ d32[i] = 0;
+ else if (p < 0xff000000)
+ d32[i] = qPremultiply(p);
}
- d = reinterpret_cast<__m128i*>(p+pad);
+ d += bpl;
}
if (data->format == QImage::Format_ARGB32)