summaryrefslogtreecommitdiffstats
path: root/src/gui/painting/qdrawhelper_neon.cpp
diff options
context:
space:
mode:
authorAllan Sandfeld Jensen <allan.jensen@qt.io>2016-12-20 16:17:58 +0100
committerAllan Sandfeld Jensen <allan.jensen@qt.io>2017-01-31 00:14:11 +0000
commit85468f7bccb276c2be5801481a6ce10f07581cdb (patch)
treef96ef309303ed0caf91b0c37cabee4d295cb19d3 /src/gui/painting/qdrawhelper_neon.cpp
parentad4f7b59ead6c4eb17e787bce25a7211b866063f (diff)
Manually vectorize ARGB32toARGB32PM for SSE4.1 and NEON
Manually vectorizing is significantly faster because we can optimize for common cases like long stretches of opaque or transparent pixels. This is both smaller and faster than the auto-vectorized version, it is also much faster than the autovectorized version for AVX2 which then can be removed. Change-Id: I0fa80ce273a8387cc6cd084879822ad9bade385c Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
Diffstat (limited to 'src/gui/painting/qdrawhelper_neon.cpp')
-rw-r--r--src/gui/painting/qdrawhelper_neon.cpp61
1 files changed, 61 insertions, 0 deletions
diff --git a/src/gui/painting/qdrawhelper_neon.cpp b/src/gui/painting/qdrawhelper_neon.cpp
index cdb374f823..643c570f65 100644
--- a/src/gui/painting/qdrawhelper_neon.cpp
+++ b/src/gui/painting/qdrawhelper_neon.cpp
@@ -1069,6 +1069,67 @@ const uint * QT_FASTCALL qt_fetchUntransformed_888_neon(uint *buffer, const Oper
return buffer;
}
+#if defined(Q_PROCESSOR_ARM_64) && Q_BYTE_ORDER == Q_LITTLE_ENDIAN
+template<bool RGBA>
+static inline void convertARGBToARGB32PM_neon(uint *buffer, const uint *src, int count)
+{
+ int i = 0;
+ const uint8x16_t rgbaMask = { 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15};
+ const uint8x8_t shuffleMask = { 3, 3, 3, 3, 7, 7, 7, 7};
+ const uint32x4_t blendMask = vdupq_n_u32(0xff000000);
+
+ for (; i < count - 3; i += 4) {
+ uint32x4_t srcVector = vld1q_u32(src + i);
+ uint32x4_t alphaVector = vshrq_n_u32(srcVector, 24);
+ uint32_t alphaSum = vaddvq_u32(alphaVector);
+ if (alphaSum) {
+ if (alphaSum != 255 * 4) {
+ if (RGBA)
+ srcVector = vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(srcVector), rgbaMask));
+ const uint8x8_t s1 = vreinterpret_u8_u32(vget_low_u32(srcVector));
+ const uint8x8_t s2 = vreinterpret_u8_u32(vget_high_u32(srcVector));
+ const uint8x8_t alpha1 = vtbl1_u8(s1, shuffleMask);
+ const uint8x8_t alpha2 = vtbl1_u8(s2, shuffleMask);
+ uint16x8_t src1 = vmull_u8(s1, alpha1);
+ uint16x8_t src2 = vmull_u8(s2, alpha2);
+ src1 = vsraq_n_u16(src1, src1, 8);
+ src2 = vsraq_n_u16(src2, src2, 8);
+ const uint8x8_t d1 = vrshrn_n_u16(src1, 8);
+ const uint8x8_t d2 = vrshrn_n_u16(src2, 8);
+ const uint32x4_t d = vbslq_u32(blendMask, srcVector, vreinterpretq_u32_u8(vcombine_u8(d1, d2)));
+ vst1q_u32(buffer + i, d);
+ } else {
+ if (RGBA)
+ vst1q_u32(buffer + i, vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(srcVector), rgbaMask)));
+ else if (buffer != src)
+ vst1q_u32(buffer + i, srcVector);
+ }
+ } else {
+ vst1q_u32(buffer + i, vdupq_n_u32(0));
+ }
+ }
+
+ SIMD_EPILOGUE(i, count, 3) {
+ uint v = qPremultiply(src[i]);
+ buffer[i] = RGBA ? RGBA2ARGB(v) : v;
+ }
+}
+
+const uint *QT_FASTCALL convertARGB32ToARGB32PM_neon(uint *buffer, const uint *src, int count,
+ const QVector<QRgb> *, QDitherInfo *)
+{
+ convertARGBToARGB32PM_neon<false>(buffer, src, count);
+ return buffer;
+}
+
+const uint *QT_FASTCALL convertRGBA8888ToARGB32PM_neon(uint *buffer, const uint *src, int count,
+ const QVector<QRgb> *, QDitherInfo *)
+{
+ convertARGBToARGB32PM_neon<true>(buffer, src, count);
+ return buffer;
+}
+#endif
+
QT_END_NAMESPACE
#endif // __ARM_NEON__