summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAllan Sandfeld Jensen <allan.jensen@qt.io>2018-08-02 18:05:51 +0200
committerAllan Sandfeld Jensen <allan.jensen@qt.io>2018-08-03 09:37:02 +0000
commit1c8f9eb79da837db8e37cf6348de459088c3a20e (patch)
tree44cefd281ff0ebe5835c6292849585b3a56a5cac
parent66be5445e64b54bf60069dfee5dd918459e3deed (diff)
Add missing optimization for loading RGB32 to RGBA64 using NEON
The rest of the RGB64 routines were optimized, but the loading of RGB32 was not, as it was originally not used much. Now that ARGB32 uses the RGB64 backend, optimizing it is essential for decent performance. Task-number: QTBUG-69724 Change-Id: I1c02411ed29d3d993427afde44dfa83689d117e0 Reviewed-by: Lars Knoll <lars.knoll@qt.io>
-rw-r--r--src/gui/painting/qdrawhelper.cpp61
1 files changed, 61 insertions, 0 deletions
diff --git a/src/gui/painting/qdrawhelper.cpp b/src/gui/painting/qdrawhelper.cpp
index 34847daf55..9bb1498ff0 100644
--- a/src/gui/painting/qdrawhelper.cpp
+++ b/src/gui/painting/qdrawhelper.cpp
@@ -618,6 +618,53 @@ static inline void qConvertARGB32PMToARGB64PM_sse2(QRgba64 *buffer, const uint *
*buffer++ = QRgba64::fromArgb32(s);
}
}
+#elif defined(__ARM_NEON__)
+template<bool RGBA, bool maskAlpha> // RGBA: source needs RGBA2ARGB in the scalar tail and no byte swap in the vector loop; maskAlpha: OR 0xff000000 into every pixel
+static inline void qConvertARGB32PMToRGBA64PM_neon(QRgba64 *buffer, const uint *src, int count) // NEON path: expands count 32-bit pixels from src into QRgba64 values written to buffer
+{
+ if (count <= 0) // nothing to convert
+ return;
+
+ const uint32x4_t amask = vdupq_n_u32(0xff000000); // top byte of each 32-bit lane, used when maskAlpha is set
+#if defined(Q_PROCESSOR_ARM_64)
+ const uint8x16_t rgbaMask = { 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15}; // table indices that exchange bytes 0 and 2 of each of the four pixels
+#else
+ const uint8x8_t rgbaMask = { 2, 1, 0, 3, 6, 5, 4, 7 }; // same exchange, but for two pixels (one 64-bit half) at a time
+#endif
+ int i = 0;
+ for (; i < count-3; i += 4) { // vector loop: four source pixels per iteration
+ uint32x4_t vs32 = vld1q_u32(src);
+ src += 4;
+ if (maskAlpha)
+ vs32 = vorrq_u32(vs32, amask); // force the top byte of every pixel to 0xff
+ uint8x16_t vs8 = vreinterpretq_u8_u32(vs32);
+ if (!RGBA) { // only the non-RGBA source order gets bytes 0 and 2 exchanged
+#if defined(Q_PROCESSOR_ARM_64)
+ vs8 = vqtbl1q_u8(vs8, rgbaMask);
+#else
+ // no vqtbl1q_u8 in this branch: run the 8-byte table lookup on each half and recombine
+ const uint8x8_t vlo = vtbl1_u8(vget_low_u8(vs8), rgbaMask);
+ const uint8x8_t vhi = vtbl1_u8(vget_high_u8(vs8), rgbaMask);
+ vs8 = vcombine_u8(vlo, vhi);
+#endif
+ }
+ uint8x16x2_t v = vzipq_u8(vs8, vs8); // zipping the vector with itself duplicates each byte b, giving the 16-bit value b * 0x0101
+
+ vst1q_u16((uint16_t *)buffer, vreinterpretq_u16_u8(v.val[0])); // low half of the zip: eight 16-bit values, stored as two QRgba64
+ buffer += 2;
+ vst1q_u16((uint16_t *)buffer, vreinterpretq_u16_u8(v.val[1])); // high half of the zip: the other two pixels
+ buffer += 2;
+ }
+
+ SIMD_EPILOGUE(i, count, 3) { // scalar tail for the pixels the vector loop left over; NOTE(review): SIMD_EPILOGUE is defined elsewhere - presumably it advances i up to count, confirm
+ uint s = *src++;
+ if (maskAlpha)
+ s = s | 0xff000000;
+ if (RGBA)
+ s = RGBA2ARGB(s); // bring the pixel into the order QRgba64::fromArgb32 expects
+ *buffer++ = QRgba64::fromArgb32(s);
+ }
+}
#endif
static const QRgba64 *QT_FASTCALL convertRGB32ToRGB64(QRgba64 *buffer, const uint *src, int count,
@@ -625,6 +672,8 @@ static const QRgba64 *QT_FASTCALL convertRGB32ToRGB64(QRgba64 *buffer, const uin
{
#ifdef __SSE2__
qConvertARGB32PMToARGB64PM_sse2<false, true>(buffer, src, count);
+#elif defined(__ARM_NEON__)
+ qConvertARGB32PMToRGBA64PM_neon<false, true>(buffer, src, count);
#else
for (int i = 0; i < count; ++i)
buffer[i] = QRgba64::fromArgb32(0xff000000 | src[i]);
@@ -639,6 +688,10 @@ static const QRgba64 *QT_FASTCALL convertARGB32ToARGB64PM(QRgba64 *buffer, const
qConvertARGB32PMToARGB64PM_sse2<false, false>(buffer, src, count);
for (int i = 0; i < count; ++i)
buffer[i] = buffer[i].premultiplied();
+#elif defined(__ARM_NEON__)
+ qConvertARGB32PMToRGBA64PM_neon<false, false>(buffer, src, count);
+ for (int i = 0; i < count; ++i)
+ buffer[i] = buffer[i].premultiplied();
#else
for (int i = 0; i < count; ++i)
buffer[i] = QRgba64::fromArgb32(src[i]).premultiplied();
@@ -651,6 +704,8 @@ static const QRgba64 *QT_FASTCALL convertARGB32PMToARGB64PM(QRgba64 *buffer, con
{
#ifdef __SSE2__
qConvertARGB32PMToARGB64PM_sse2<false, false>(buffer, src, count);
+#elif defined(__ARM_NEON__)
+ qConvertARGB32PMToRGBA64PM_neon<false, false>(buffer, src, count);
#else
for (int i = 0; i < count; ++i)
buffer[i] = QRgba64::fromArgb32(src[i]);
@@ -665,6 +720,10 @@ static const QRgba64 *QT_FASTCALL convertRGBA8888ToARGB64PM(QRgba64 *buffer, con
qConvertARGB32PMToARGB64PM_sse2<true, false>(buffer, src, count);
for (int i = 0; i < count; ++i)
buffer[i] = buffer[i].premultiplied();
+#elif defined(__ARM_NEON__)
+ qConvertARGB32PMToRGBA64PM_neon<true, false>(buffer, src, count);
+ for (int i = 0; i < count; ++i)
+ buffer[i] = buffer[i].premultiplied();
#else
for (int i = 0; i < count; ++i)
buffer[i] = QRgba64::fromArgb32(RGBA2ARGB(src[i])).premultiplied();
@@ -677,6 +736,8 @@ static const QRgba64 *QT_FASTCALL convertRGBA8888PMToARGB64PM(QRgba64 *buffer, c
{
#ifdef __SSE2__
qConvertARGB32PMToARGB64PM_sse2<true, false>(buffer, src, count);
+#elif defined(__ARM_NEON__)
+ qConvertARGB32PMToRGBA64PM_neon<true, false>(buffer, src, count);
#else
for (int i = 0; i < count; ++i)
buffer[i] = QRgba64::fromArgb32(RGBA2ARGB(src[i]));