summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAllan Sandfeld Jensen <allan.jensen@theqtcompany.com>2015-01-28 11:07:14 +0100
committerAllan Sandfeld Jensen <allan.jensen@theqtcompany.com>2015-02-04 19:37:22 +0000
commit89edf43c44294888781c308d9b1f1d9bab63645b (patch)
tree7653d09ab651f237bdef38d24917183352847c78
parent5432f2c7a1ff02e1ee0e07e442ceb6c12ca66098 (diff)
Generate SSE4.1 versions of premultiplying methods where convenient
The autovectorized versions of premultiplying conversions are almost twice as fast with SSE4.1 as with SSE2. Therefore this patch lets compilers that can make those versions convenient without duplicating code do that and lets us use them when available. Change-Id: I699035963abe55a38b9ef8ba7b4a8c961c8dfcdd Reviewed-by: Gunnar Sletta <gunnar@sletta.org>
-rw-r--r--src/gui/image/qimage_conversions.cpp32
-rw-r--r--src/gui/painting/qdrawhelper.cpp36
-rw-r--r--tests/benchmarks/gui/image/qimageconversion/tst_qimageconversion.cpp1
3 files changed, 62 insertions, 7 deletions
diff --git a/src/gui/image/qimage_conversions.cpp b/src/gui/image/qimage_conversions.cpp
index 2e8fc1963d..696f95b565 100644
--- a/src/gui/image/qimage_conversions.cpp
+++ b/src/gui/image/qimage_conversions.cpp
@@ -195,7 +195,7 @@ static bool convert_passthrough_inplace(QImageData *data, Qt::ImageConversionFla
return true;
}
-static void convert_ARGB_to_ARGB_PM(QImageData *dest, const QImageData *src, Qt::ImageConversionFlags)
+static inline void convert_ARGB_to_ARGB_PM(QImageData *dest, const QImageData *src, Qt::ImageConversionFlags)
{
Q_ASSERT(src->format == QImage::Format_ARGB32 || src->format == QImage::Format_RGBA8888);
Q_ASSERT(dest->format == QImage::Format_ARGB32_Premultiplied || dest->format == QImage::Format_RGBA8888_Premultiplied);
@@ -219,6 +219,15 @@ static void convert_ARGB_to_ARGB_PM(QImageData *dest, const QImageData *src, Qt:
}
}
+#if QT_COMPILER_SUPPORTS_HERE(SSE4_1) && !defined(__SSE4_1__)
+QT_FUNCTION_TARGET(SSE4_1)
+static void convert_ARGB_to_ARGB_PM_sse4(QImageData *dest, const QImageData *src, Qt::ImageConversionFlags flags)
+{
+ // Twice as fast autovectorized due to SSE4.1 PMULLD instructions.
+ convert_ARGB_to_ARGB_PM(dest, src, flags);
+}
+#endif
+
extern bool convert_ARGB_to_ARGB_PM_inplace_sse2(QImageData *data, Qt::ImageConversionFlags);
#ifndef __SSE2__
@@ -232,7 +241,7 @@ static bool convert_ARGB_to_ARGB_PM_inplace(QImageData *data, Qt::ImageConversio
for (int i = 0; i < data->height; ++i) {
const QRgb *end = rgb_data + data->width;
while (rgb_data < end) {
- *rgb_data = PREMUL(*rgb_data);
+ *rgb_data = qPremultiply(*rgb_data);
++rgb_data;
}
rgb_data += pad;
@@ -312,7 +321,7 @@ static bool convert_ARGB_to_RGBA_inplace(QImageData *data, Qt::ImageConversionFl
return true;
}
-static void convert_ARGB_to_RGBA_PM(QImageData *dest, const QImageData *src, Qt::ImageConversionFlags)
+static inline void convert_ARGB_to_RGBA_PM(QImageData *dest, const QImageData *src, Qt::ImageConversionFlags)
{
Q_ASSERT(src->format == QImage::Format_ARGB32);
Q_ASSERT(dest->format == QImage::Format_RGBA8888_Premultiplied);
@@ -336,6 +345,15 @@ static void convert_ARGB_to_RGBA_PM(QImageData *dest, const QImageData *src, Qt:
}
}
+#if QT_COMPILER_SUPPORTS_HERE(SSE4_1) && !defined(__SSE4_1__)
+QT_FUNCTION_TARGET(SSE4_1)
+static void convert_ARGB_to_RGBA_PM_sse4(QImageData *dest, const QImageData *src, Qt::ImageConversionFlags flags)
+{
+ // Twice as fast autovectorized due to SSE4.1 PMULLD instructions.
+ convert_ARGB_to_RGBA_PM(dest, src, flags);
+}
+#endif
+
static void convert_RGBA_to_ARGB(QImageData *dest, const QImageData *src, Qt::ImageConversionFlags)
{
Q_ASSERT(src->format == QImage::Format_RGBX8888 || src->format == QImage::Format_RGBA8888 || src->format == QImage::Format_RGBA8888_Premultiplied);
@@ -2945,6 +2963,14 @@ void qInitImageConversions()
}
#endif
+#if QT_COMPILER_SUPPORTS_HERE(SSE4_1) && !defined(__SSE4_1__)
+ if (qCpuHasFeature(SSE4_1)) {
+ qimage_converter_map[QImage::Format_ARGB32][QImage::Format_ARGB32_Premultiplied] = convert_ARGB_to_ARGB_PM_sse4;
+ qimage_converter_map[QImage::Format_RGBA8888][QImage::Format_RGBA8888_Premultiplied] = convert_ARGB_to_ARGB_PM_sse4;
+ qimage_converter_map[QImage::Format_ARGB32][QImage::Format_RGBA8888_Premultiplied] = convert_ARGB_to_RGBA_PM_sse4;
+ }
+#endif
+
#if defined(__ARM_NEON__) && !defined(Q_PROCESSOR_ARM_64)
extern void convert_RGB888_to_RGB32_neon(QImageData *dest, const QImageData *src, Qt::ImageConversionFlags);
qimage_converter_map[QImage::Format_RGB888][QImage::Format_RGB32] = convert_RGB888_to_RGB32_neon;
diff --git a/src/gui/painting/qdrawhelper.cpp b/src/gui/painting/qdrawhelper.cpp
index de4be7177b..ac73d78afe 100644
--- a/src/gui/painting/qdrawhelper.cpp
+++ b/src/gui/painting/qdrawhelper.cpp
@@ -503,14 +503,25 @@ static const uint *QT_FASTCALL convertPassThrough(uint *, const uint *src, int,
return src;
}
-static const uint *QT_FASTCALL convertARGB32ToARGB32PM(uint *buffer, const uint *src, int count,
- const QPixelLayout *, const QRgb *)
+static inline const uint *QT_FASTCALL convertARGB32ToARGB32PM(uint *buffer, const uint *src, int count,
+ const QPixelLayout *, const QRgb *)
{
for (int i = 0; i < count; ++i)
buffer[i] = qPremultiply(src[i]);
return buffer;
}
+#if QT_COMPILER_SUPPORTS_HERE(SSE4_1) && !defined(__SSE4_1__)
+QT_FUNCTION_TARGET(SSE4_1)
+static const uint *QT_FASTCALL convertARGB32ToARGB32PM_sse4(uint *buffer, const uint *src, int count,
+ const QPixelLayout *layout, const QRgb *clut)
+{
+ // Twice as fast autovectorized due to SSE4.1 PMULLD instructions.
+ return convertARGB32ToARGB32PM(buffer, src, count, layout, clut);
+}
+#endif
+
+
static const uint *QT_FASTCALL convertRGBA8888PMToARGB32PM(uint *buffer, const uint *src, int count,
const QPixelLayout *, const QRgb *)
{
@@ -519,14 +530,24 @@ static const uint *QT_FASTCALL convertRGBA8888PMToARGB32PM(uint *buffer, const u
return buffer;
}
-static const uint *QT_FASTCALL convertRGBA8888ToARGB32PM(uint *buffer, const uint *src, int count,
- const QPixelLayout *, const QRgb *)
+static inline const uint *QT_FASTCALL convertRGBA8888ToARGB32PM(uint *buffer, const uint *src, int count,
+ const QPixelLayout *, const QRgb *)
{
for (int i = 0; i < count; ++i)
buffer[i] = qPremultiply(RGBA2ARGB(src[i]));
return buffer;
}
+#if QT_COMPILER_SUPPORTS_HERE(SSE4_1) && !defined(__SSE4_1__)
+QT_FUNCTION_TARGET(SSE4_1)
+static const uint *QT_FASTCALL convertRGBA8888ToARGB32PM_sse4(uint *buffer, const uint *src, int count,
+ const QPixelLayout *layout, const QRgb *clut)
+{
+ // Twice as fast autovectorized due to SSE4.1 PMULLD instructions.
+ return convertRGBA8888ToARGB32PM(buffer, src, count, layout, clut);
+}
+#endif
+
static const uint *QT_FASTCALL convertAlpha8ToRGB32(uint *buffer, const uint *src, int count,
const QPixelLayout *, const QRgb *)
{
@@ -6858,6 +6879,13 @@ void qInitDrawhelperAsm()
}
#endif // SSSE3
+#if QT_COMPILER_SUPPORTS_HERE(SSE4_1) && !defined(__SSE4_1__)
+ if (qCpuHasFeature(SSE4_1)) {
+ qPixelLayouts[QImage::Format_ARGB32].convertToARGB32PM = convertARGB32ToARGB32PM_sse4;
+ qPixelLayouts[QImage::Format_RGBA8888].convertToARGB32PM = convertRGBA8888ToARGB32PM_sse4;
+ }
+#endif
+
functionForModeAsm = qt_functionForMode_SSE2;
functionForModeSolidAsm = qt_functionForModeSolid_SSE2;
#endif // SSE2
diff --git a/tests/benchmarks/gui/image/qimageconversion/tst_qimageconversion.cpp b/tests/benchmarks/gui/image/qimageconversion/tst_qimageconversion.cpp
index 9d6bdb28fa..44d7854eb8 100644
--- a/tests/benchmarks/gui/image/qimageconversion/tst_qimageconversion.cpp
+++ b/tests/benchmarks/gui/image/qimageconversion/tst_qimageconversion.cpp
@@ -148,6 +148,7 @@ void tst_QImageConversion::convertGeneric_data()
QTest::newRow("argb32 -> argb8565pm") << argb32 << QImage::Format_ARGB8565_Premultiplied;
QTest::newRow("argb32 -> argb4444pm") << argb32 << QImage::Format_ARGB4444_Premultiplied;
QTest::newRow("argb32 -> argb32pm") << argb32 << QImage::Format_ARGB32_Premultiplied;
+ QTest::newRow("argb32 -> rgba8888pm") << argb32 << QImage::Format_RGBA8888_Premultiplied;
}
void tst_QImageConversion::convertGeneric()