summaryrefslogtreecommitdiffstats
path: root/src/gui/image/qimage_conversions.cpp
diff options
context:
space:
mode:
authorAllan Sandfeld Jensen <allan.jensen@theqtcompany.com>2015-03-09 10:16:18 +0100
committerAllan Sandfeld Jensen <allan.jensen@theqtcompany.com>2015-03-10 12:28:13 +0000
commit3ce88e13b9cc56e00bc3f075e9649c6b291ae01c (patch)
tree2dba67de42eba94d605be744ca63a1903c9048ee /src/gui/image/qimage_conversions.cpp
parent1fc6056ff5526e61419262931de79137bf7c1b4d (diff)
Add AVX2 autovectorized versions of premultiply
Following up on using GCC's autovectorizing for faster SSE4.1 premultiply, this patch adds specialized autovectorized versions of premultiply for AVX2, giving another almost doubling in speed. To make the speed up for AVX2 and also SSE4_1 available to non-GCC compilers, the target-specific methods have been moved to separate files. Change-Id: I97ce05be67f4adeeb9a096eef80fd5fb662099f3 Reviewed-by: Gunnar Sletta <gunnar@sletta.org>
Diffstat (limited to 'src/gui/image/qimage_conversions.cpp')
-rw-r--r--src/gui/image/qimage_conversions.cpp42
1 files changed, 17 insertions, 25 deletions
diff --git a/src/gui/image/qimage_conversions.cpp b/src/gui/image/qimage_conversions.cpp
index 5103d820d6..fe76c6d3ba 100644
--- a/src/gui/image/qimage_conversions.cpp
+++ b/src/gui/image/qimage_conversions.cpp
@@ -32,7 +32,6 @@
****************************************************************************/
#include <private/qdrawhelper_p.h>
-#include <private/qdrawingprimitive_sse2_p.h>
#include <private/qguiapplication_p.h>
#include <private/qsimd_p.h>
#include <private/qimage_p.h>
@@ -110,17 +109,6 @@ static const uint *QT_FASTCALL convertRGB32FromARGB32PM(uint *buffer, const uint
return buffer;
}
-#if QT_COMPILER_SUPPORTS_HERE(SSE4_1)
-QT_FUNCTION_TARGET(SSE4_1)
-static const uint *QT_FASTCALL convertRGB32FromARGB32PM_sse4(uint *buffer, const uint *src, int count,
- const QPixelLayout *, const QRgb *)
-{
- for (int i = 0; i < count; ++i)
- buffer[i] = 0xff000000 | qUnpremultiply_sse4(src[i]);
- return buffer;
-}
-#endif
-
static const uint *QT_FASTCALL convertRGB32ToARGB32PM(uint *buffer, const uint *src, int count,
const QPixelLayout *, const QRgb *)
{
@@ -129,6 +117,10 @@ static const uint *QT_FASTCALL convertRGB32ToARGB32PM(uint *buffer, const uint *
return buffer;
}
+#ifdef QT_COMPILER_SUPPORTS_SSE4_1
+extern const uint *QT_FASTCALL convertRGB32FromARGB32PM_sse4(uint *buffer, const uint *src, int count, const QPixelLayout *, const QRgb *);
+#endif
+
void convert_generic(QImageData *dest, const QImageData *src, Qt::ImageConversionFlags)
{
// Cannot be used with indexed formats.
@@ -152,7 +144,7 @@ void convert_generic(QImageData *dest, const QImageData *src, Qt::ImageConversio
if (src->format == QImage::Format_RGB32)
convertToARGB32PM = convertRGB32ToARGB32PM;
if (dest->format == QImage::Format_RGB32) {
-#if QT_COMPILER_SUPPORTS_HERE(SSE4_1)
+#ifdef QT_COMPILER_SUPPORTS_SSE4_1
if (qCpuHasFeature(SSE4_1))
convertFromARGB32PM = convertRGB32FromARGB32PM_sse4;
else
@@ -201,7 +193,7 @@ bool convert_generic_inplace(QImageData *data, QImage::Format dst_format, Qt::Im
if (data->format == QImage::Format_RGB32)
convertToARGB32PM = convertRGB32ToARGB32PM;
if (dst_format == QImage::Format_RGB32) {
-#if QT_COMPILER_SUPPORTS_HERE(SSE4_1)
+#ifdef QT_COMPILER_SUPPORTS_SSE4_1
if (qCpuHasFeature(SSE4_1))
convertFromARGB32PM = convertRGB32FromARGB32PM_sse4;
else
@@ -257,7 +249,7 @@ static bool convert_passthrough_inplace(QImageData *data, Qt::ImageConversionFla
return true;
}
-static inline void convert_ARGB_to_ARGB_PM(QImageData *dest, const QImageData *src, Qt::ImageConversionFlags)
+static void convert_ARGB_to_ARGB_PM(QImageData *dest, const QImageData *src, Qt::ImageConversionFlags)
{
Q_ASSERT(src->format == QImage::Format_ARGB32 || src->format == QImage::Format_RGBA8888);
Q_ASSERT(dest->format == QImage::Format_ARGB32_Premultiplied || dest->format == QImage::Format_RGBA8888_Premultiplied);
@@ -281,15 +273,6 @@ static inline void convert_ARGB_to_ARGB_PM(QImageData *dest, const QImageData *s
}
}
-#if QT_COMPILER_SUPPORTS_HERE(SSE4_1) && !defined(__SSE4_1__)
-QT_FUNCTION_TARGET(SSE4_1)
-static void convert_ARGB_to_ARGB_PM_sse4(QImageData *dest, const QImageData *src, Qt::ImageConversionFlags flags)
-{
- // Twice as fast autovectorized due to SSE4.1 PMULLD instructions.
- convert_ARGB_to_ARGB_PM(dest, src, flags);
-}
-#endif
-
Q_GUI_EXPORT void QT_FASTCALL qt_convert_rgb888_to_rgb32(quint32 *dest_data, const uchar *src_data, int len)
{
int pixel = 0;
@@ -2804,13 +2787,22 @@ void qInitImageConversions()
}
#endif
-#if QT_COMPILER_SUPPORTS_HERE(SSE4_1) && !defined(__SSE4_1__)
+#if defined(QT_COMPILER_SUPPORTS_SSE4_1) && !defined(__SSE4_1__)
if (qCpuHasFeature(SSE4_1)) {
+ extern void convert_ARGB_to_ARGB_PM_sse4(QImageData *dest, const QImageData *src, Qt::ImageConversionFlags);
qimage_converter_map[QImage::Format_ARGB32][QImage::Format_ARGB32_Premultiplied] = convert_ARGB_to_ARGB_PM_sse4;
qimage_converter_map[QImage::Format_RGBA8888][QImage::Format_RGBA8888_Premultiplied] = convert_ARGB_to_ARGB_PM_sse4;
}
#endif
+#if defined(QT_COMPILER_SUPPORTS_AVX2) && !defined(__AVX2__)
+ if (qCpuHasFeature(AVX2)) {
+ extern void convert_ARGB_to_ARGB_PM_avx2(QImageData *dest, const QImageData *src, Qt::ImageConversionFlags);
+ qimage_converter_map[QImage::Format_ARGB32][QImage::Format_ARGB32_Premultiplied] = convert_ARGB_to_ARGB_PM_avx2;
+ qimage_converter_map[QImage::Format_RGBA8888][QImage::Format_RGBA8888_Premultiplied] = convert_ARGB_to_ARGB_PM_avx2;
+ }
+#endif
+
#if defined(__ARM_NEON__) && !defined(Q_PROCESSOR_ARM_64)
extern void convert_RGB888_to_RGB32_neon(QImageData *dest, const QImageData *src, Qt::ImageConversionFlags);
qimage_converter_map[QImage::Format_RGB888][QImage::Format_RGB32] = convert_RGB888_to_RGB32_neon;