summaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorAllan Sandfeld Jensen <allan.jensen@theqtcompany.com>2015-04-13 16:06:57 +0200
committerAllan Sandfeld Jensen <allan.jensen@theqtcompany.com>2015-04-16 19:24:06 +0000
commit964ccc58534aac436529007000d1c38d76c88834 (patch)
treed452d03dabe7c109527987b9f9385de5374c1b12 /src
parent365c63e7b177701c0bf80a7cb138b7559b92f350 (diff)
Remove separate SSE4 unpremultiply function
Merges the SSE4 specific unpremultiply with the normal version, and adds a SSE2 fallback. There was no reason to split the two since compile time options will ensure the right version is inlined. Also adds short-cut for 0 and 255 values. Change-Id: Ie5aa262f6964219fd3062d4a498f697cf79a4595 Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
Diffstat (limited to 'src')
-rw-r--r--src/gui/image/qimage_conversions.cpp6
-rw-r--r--src/gui/image/qimage_sse4.cpp3
-rw-r--r--src/gui/painting/qdrawhelper.cpp10
-rw-r--r--src/gui/painting/qdrawhelper_sse4.cpp7
-rw-r--r--src/gui/painting/qdrawingprimitive_sse2_p.h21
-rw-r--r--src/gui/painting/qrgb.h37
6 files changed, 45 insertions, 39 deletions
diff --git a/src/gui/image/qimage_conversions.cpp b/src/gui/image/qimage_conversions.cpp
index 28e3a48689..8cb886e09b 100644
--- a/src/gui/image/qimage_conversions.cpp
+++ b/src/gui/image/qimage_conversions.cpp
@@ -117,7 +117,7 @@ static const uint *QT_FASTCALL convertRGB32ToARGB32PM(uint *buffer, const uint *
return buffer;
}
-#ifdef QT_COMPILER_SUPPORTS_SSE4_1
+#if defined(QT_COMPILER_SUPPORTS_SSE4_1) && !defined(__SSE4_1__)
extern const uint *QT_FASTCALL convertRGB32FromARGB32PM_sse4(uint *buffer, const uint *src, int count, const QPixelLayout *, const QRgb *);
#endif
@@ -144,7 +144,7 @@ void convert_generic(QImageData *dest, const QImageData *src, Qt::ImageConversio
if (src->format == QImage::Format_RGB32)
convertToARGB32PM = convertRGB32ToARGB32PM;
if (dest->format == QImage::Format_RGB32) {
-#ifdef QT_COMPILER_SUPPORTS_SSE4_1
+#if defined(QT_COMPILER_SUPPORTS_SSE4_1) && !defined(__SSE4_1__)
if (qCpuHasFeature(SSE4_1))
convertFromARGB32PM = convertRGB32FromARGB32PM_sse4;
else
@@ -193,7 +193,7 @@ bool convert_generic_inplace(QImageData *data, QImage::Format dst_format, Qt::Im
if (data->format == QImage::Format_RGB32)
convertToARGB32PM = convertRGB32ToARGB32PM;
if (dst_format == QImage::Format_RGB32) {
-#ifdef QT_COMPILER_SUPPORTS_SSE4_1
+#if defined(QT_COMPILER_SUPPORTS_SSE4_1) && !defined(__SSE4_1__)
if (qCpuHasFeature(SSE4_1))
convertFromARGB32PM = convertRGB32FromARGB32PM_sse4;
else
diff --git a/src/gui/image/qimage_sse4.cpp b/src/gui/image/qimage_sse4.cpp
index 5fad4f572a..fb63f5bff9 100644
--- a/src/gui/image/qimage_sse4.cpp
+++ b/src/gui/image/qimage_sse4.cpp
@@ -33,7 +33,6 @@
#include <qimage.h>
#include <private/qdrawhelper_p.h>
-#include <private/qdrawingprimitive_sse2_p.h>
#include <private/qimage_p.h>
#include <private/qsimd_p.h>
@@ -45,7 +44,7 @@ const uint *QT_FASTCALL convertRGB32FromARGB32PM_sse4(uint *buffer, const uint *
const QPixelLayout *, const QRgb *)
{
for (int i = 0; i < count; ++i)
- buffer[i] = 0xff000000 | qUnpremultiply_sse4(src[i]);
+ buffer[i] = 0xff000000 | qUnpremultiply(src[i]);
return buffer;
}
diff --git a/src/gui/painting/qdrawhelper.cpp b/src/gui/painting/qdrawhelper.cpp
index 37f2de98b6..2817a4a145 100644
--- a/src/gui/painting/qdrawhelper.cpp
+++ b/src/gui/painting/qdrawhelper.cpp
@@ -6704,24 +6704,22 @@ void qInitDrawhelperAsm()
}
#endif // SSSE3
-#if QT_COMPILER_SUPPORTS_SSE4_1
+#if defined(QT_COMPILER_SUPPORTS_SSE4_1) && !defined(__SSE4_1__)
if (qCpuHasFeature(SSE4_1)) {
-#if !defined(__SSE4_1__)
extern const uint *QT_FASTCALL convertARGB32ToARGB32PM_sse4(uint *buffer, const uint *src, int count, const QPixelLayout *, const QRgb *);
extern const uint *QT_FASTCALL convertRGBA8888ToARGB32PM_sse4(uint *buffer, const uint *src, int count, const QPixelLayout *, const QRgb *);
- qPixelLayouts[QImage::Format_ARGB32].convertToARGB32PM = convertARGB32ToARGB32PM_sse4;
- qPixelLayouts[QImage::Format_RGBA8888].convertToARGB32PM = convertRGBA8888ToARGB32PM_sse4;
-#endif
extern const uint *QT_FASTCALL convertARGB32FromARGB32PM_sse4(uint *buffer, const uint *src, int count, const QPixelLayout *, const QRgb *);
extern const uint *QT_FASTCALL convertRGBA8888FromARGB32PM_sse4(uint *buffer, const uint *src, int count, const QPixelLayout *, const QRgb *);
extern const uint *QT_FASTCALL convertRGBXFromARGB32PM_sse4(uint *buffer, const uint *src, int count, const QPixelLayout *, const QRgb *);
+ qPixelLayouts[QImage::Format_ARGB32].convertToARGB32PM = convertARGB32ToARGB32PM_sse4;
+ qPixelLayouts[QImage::Format_RGBA8888].convertToARGB32PM = convertRGBA8888ToARGB32PM_sse4;
qPixelLayouts[QImage::Format_ARGB32].convertFromARGB32PM = convertARGB32FromARGB32PM_sse4;
qPixelLayouts[QImage::Format_RGBA8888].convertFromARGB32PM = convertRGBA8888FromARGB32PM_sse4;
qPixelLayouts[QImage::Format_RGBX8888].convertFromARGB32PM = convertRGBXFromARGB32PM_sse4;
}
#endif
-#if QT_COMPILER_SUPPORTS_AVX2 && !defined(__AVX2__)
+#if defined(QT_COMPILER_SUPPORTS_AVX2) && !defined(__AVX2__)
if (qCpuHasFeature(AVX2)) {
extern const uint *QT_FASTCALL convertARGB32ToARGB32PM_avx2(uint *buffer, const uint *src, int count, const QPixelLayout *, const QRgb *);
extern const uint *QT_FASTCALL convertRGBA8888ToARGB32PM_avx2(uint *buffer, const uint *src, int count, const QPixelLayout *, const QRgb *);
diff --git a/src/gui/painting/qdrawhelper_sse4.cpp b/src/gui/painting/qdrawhelper_sse4.cpp
index 43a3958997..8e0b2cbb18 100644
--- a/src/gui/painting/qdrawhelper_sse4.cpp
+++ b/src/gui/painting/qdrawhelper_sse4.cpp
@@ -32,7 +32,6 @@
****************************************************************************/
#include <private/qdrawhelper_p.h>
-#include <private/qdrawingprimitive_sse2_p.h>
#if defined(QT_COMPILER_SUPPORTS_SSE4_1)
@@ -54,7 +53,7 @@ const uint *QT_FASTCALL convertARGB32FromARGB32PM_sse4(uint *buffer, const uint
const QPixelLayout *, const QRgb *)
{
for (int i = 0; i < count; ++i)
- buffer[i] = qUnpremultiply_sse4(src[i]);
+ buffer[i] = qUnpremultiply(src[i]);
return buffer;
}
@@ -62,7 +61,7 @@ const uint *QT_FASTCALL convertRGBA8888FromARGB32PM_sse4(uint *buffer, const uin
const QPixelLayout *, const QRgb *)
{
for (int i = 0; i < count; ++i)
- buffer[i] = ARGB2RGBA(qUnpremultiply_sse4(src[i]));
+ buffer[i] = ARGB2RGBA(qUnpremultiply(src[i]));
return buffer;
}
@@ -70,7 +69,7 @@ const uint *QT_FASTCALL convertRGBXFromARGB32PM_sse4(uint *buffer, const uint *s
const QPixelLayout *, const QRgb *)
{
for (int i = 0; i < count; ++i)
- buffer[i] = ARGB2RGBA(0xff000000 | qUnpremultiply_sse4(src[i]));
+ buffer[i] = ARGB2RGBA(0xff000000 | qUnpremultiply(src[i]));
return buffer;
}
diff --git a/src/gui/painting/qdrawingprimitive_sse2_p.h b/src/gui/painting/qdrawingprimitive_sse2_p.h
index 4d0790a502..fbee06e157 100644
--- a/src/gui/painting/qdrawingprimitive_sse2_p.h
+++ b/src/gui/painting/qdrawingprimitive_sse2_p.h
@@ -236,25 +236,4 @@ QT_END_NAMESPACE
#endif // __SSE2__
-QT_BEGIN_NAMESPACE
-#if QT_COMPILER_SUPPORTS_HERE(SSE4_1)
-QT_FUNCTION_TARGET(SSE4_1)
-inline QRgb qUnpremultiply_sse4(QRgb p)
-{
- const uint alpha = qAlpha(p);
- const uint invAlpha = qt_inv_premul_factor[alpha];
- const __m128i via = _mm_set1_epi32(invAlpha);
- const __m128i vr = _mm_set1_epi32(0x8000);
- __m128i vl = _mm_cvtepu8_epi32(_mm_cvtsi32_si128(p));
- vl = _mm_mullo_epi32(vl, via);
- vl = _mm_add_epi32(vl, vr);
- vl = _mm_srai_epi32(vl, 16);
- vl = _mm_insert_epi32(vl, alpha, 3);
- vl = _mm_packus_epi32(vl, _mm_setzero_si128());
- vl = _mm_packus_epi16(vl, _mm_setzero_si128());
- return _mm_cvtsi128_si32(vl);
-}
-#endif
-QT_END_NAMESPACE
-
#endif // QDRAWINGPRIMITIVE_SSE2_P_H
diff --git a/src/gui/painting/qrgb.h b/src/gui/painting/qrgb.h
index f7f2185bef..05b3a76bce 100644
--- a/src/gui/painting/qrgb.h
+++ b/src/gui/painting/qrgb.h
@@ -36,6 +36,11 @@
#include <QtCore/qglobal.h>
#include <QtCore/qprocessordetection.h>
+#if defined(__SSE4_1__)
+#include <smmintrin.h>
+#elif defined(__SSE2__)
+#include <emmintrin.h>
+#endif
QT_BEGIN_NAMESPACE
@@ -87,19 +92,45 @@ inline Q_DECL_RELAXED_CONSTEXPR QRgb qPremultiply(QRgb x)
Q_GUI_EXPORT extern const uint qt_inv_premul_factor[];
+#if defined(__SSE2__)
+inline QRgb qUnpremultiply(QRgb p)
+{
+ const uint alpha = qAlpha(p);
+ if (alpha == 255 || alpha == 0)
+ return p;
+ const uint invAlpha = qt_inv_premul_factor[alpha];
+ const __m128i via = _mm_set1_epi32(invAlpha);
+ const __m128i vr = _mm_set1_epi32(0x8000);
+#ifdef __SSE4_1__
+ __m128i vl = _mm_cvtepu8_epi32(_mm_cvtsi32_si128(p));
+ vl = _mm_mullo_epi32(vl, via);
+#else
+ __m128i vl = _mm_unpacklo_epi8(_mm_cvtsi32_si128(p), _mm_setzero_si128());
+ vl = _mm_unpacklo_epi16(vl, vl);
+ __m128i vll = _mm_mullo_epi16(vl, via);
+ __m128i vlh = _mm_mulhi_epu16(vl, via);
+ vl = _mm_add_epi32(vll, _mm_slli_epi32(vlh, 16));
+#endif
+ vl = _mm_add_epi32(vl, vr);
+ vl = _mm_srli_epi32(vl, 16);
+ vl = _mm_packs_epi32(vl, _mm_setzero_si128());
+ vl = _mm_insert_epi16(vl, alpha, 3);
+ vl = _mm_packus_epi16(vl, _mm_setzero_si128());
+ return _mm_cvtsi128_si32(vl);
+}
+#else
inline QRgb qUnpremultiply(QRgb p)
{
const uint alpha = qAlpha(p);
// Alpha 255 and 0 are the two most common values, which makes them beneficial to short-cut.
- if (alpha == 255)
+ if (alpha == 255 || alpha == 0)
return p;
- if (alpha == 0)
- return 0;
// (p*(0x00ff00ff/alpha)) >> 16 == (p*255)/alpha for all p and alpha <= 256.
const uint invAlpha = qt_inv_premul_factor[alpha];
// We add 0x8000 to get even rounding. The rounding also ensures that qPremultiply(qUnpremultiply(p)) == p for all p.
return qRgba((qRed(p)*invAlpha + 0x8000)>>16, (qGreen(p)*invAlpha + 0x8000)>>16, (qBlue(p)*invAlpha + 0x8000)>>16, alpha);
}
+#endif
QT_END_NAMESPACE