diff options
-rw-r--r-- | src/gui/painting/qcompositionfunctions.cpp | 100 | ||||
-rw-r--r-- | src/gui/painting/qrgba64_p.h | 100 |
2 files changed, 153 insertions, 47 deletions
diff --git a/src/gui/painting/qcompositionfunctions.cpp b/src/gui/painting/qcompositionfunctions.cpp index 9312ee9540..641550766a 100644 --- a/src/gui/painting/qcompositionfunctions.cpp +++ b/src/gui/painting/qcompositionfunctions.cpp @@ -87,6 +87,32 @@ QT_BEGIN_NAMESPACE }\ } +#if defined __SSE2__ +# define LOAD(ptr) _mm_loadl_epi64((__m128i*)(ptr)) +# define CONVERT(value) _mm_shufflelo_epi16(_mm_cvtsi32_si128(value), _MM_SHUFFLE(0, 0, 0, 0)) +# define STORE(ptr, value) _mm_storel_epi64((__m128i*)(ptr), value) +# define ADD(p, q) _mm_add_epi32(p, q) +# define ALPHA(c) _mm_shufflelo_epi16(c, _MM_SHUFFLE(3, 3, 3, 3)) +# define CONST(n) CONVERT(n) +# define INVALPHA(c) _mm_sub_epi32(CONST(65535), ALPHA(c)) +#elif defined __ARM_NEON__ +# define LOAD(ptr) vreinterpret_u16_u64(vld1_u64((quint64*)(ptr))) +# define CONVERT(value) vreinterpret_u16_u64(vmov_n_u64(value)) +# define STORE(ptr, value) vst1_u64((quint64*)(ptr), vreinterpret_u64_u16(value)) +# define ADD(p, q) vadd_u16(p, q) +# define ALPHA(c) vdup_lane_u16(c, 3) +# define CONST(n) vdup_n_u16(n) +# define INVALPHA(c) vmvn_u16(ALPHA(c)) +#else +# define LOAD(ptr) *ptr +# define CONVERT(value) value +# define STORE(ptr, value) *ptr = value +# define ADD(p, q) (p + q) +# define ALPHA(c) (c).alpha() +# define CONST(n) n +# define INVALPHA(c) (65535 - ALPHA(c)) +#endif + void QT_FASTCALL comp_func_solid_Clear(uint *dest, int length, uint, uint const_alpha) { comp_func_Clear_impl(dest, length, const_alpha); @@ -99,7 +125,7 @@ void QT_FASTCALL comp_func_solid_Clear_rgb64(QRgba64 *dest, int length, QRgba64, else { int ialpha = 255 - const_alpha; for (int i = 0; i < length; ++i) { - dest[i] = multiplyAlpha255(dest[i], ialpha); + STORE(&dest[i], multiplyAlpha255(LOAD(&dest[i]), ialpha)); } } } @@ -116,7 +142,7 @@ void QT_FASTCALL comp_func_Clear_rgb64(QRgba64 *dest, const QRgba64 *, int lengt else { int ialpha = 255 - const_alpha; for (int i = 0; i < length; ++i) { - dest[i] = multiplyAlpha255(dest[i], ialpha); + STORE(&dest[i], multiplyAlpha255(LOAD(&dest[i]), ialpha)); } } } @@ -146,9 +172,9 @@ void QT_FASTCALL comp_func_solid_Source_rgb64(QRgba64 *dest, int length, QRgba64 qt_memfill64((quint64*)dest, color, length); else { int ialpha = 255 - const_alpha; - color = multiplyAlpha255(color, const_alpha); + auto c = multiplyAlpha255(CONVERT(color), const_alpha); for (int i = 0; i < length; ++i) { - dest[i] = color + multiplyAlpha255(dest[i], ialpha); + STORE(&dest[i], ADD(c, multiplyAlpha255(LOAD(&dest[i]), ialpha))); } } } @@ -174,7 +200,7 @@ void QT_FASTCALL comp_func_Source_rgb64(QRgba64 *Q_DECL_RESTRICT dest, const QRg else { int ialpha = 255 - const_alpha; for (int i = 0; i < length; ++i) { - dest[i] = interpolate255(src[i], const_alpha, dest[i], ialpha); + STORE(&dest[i], interpolate255(LOAD(&src[i]), const_alpha, LOAD(&dest[i]), ialpha)); } } } @@ -221,10 +247,12 @@ void QT_FASTCALL comp_func_solid_SourceOver_rgb64(QRgba64 *dest, int length, QRg if (const_alpha == 255 && color.isOpaque()) { qt_memfill64((quint64*)dest, color, length); } else { + auto c = CONVERT(color); if (const_alpha != 255) - color = multiplyAlpha255(color, const_alpha); + c = multiplyAlpha255(c, const_alpha); + auto cAlpha = INVALPHA(c); for (int i = 0; i < length; ++i) { - dest[i] = color + multiplyAlpha65535(dest[i], 65535 - color.alpha()); + STORE(&dest[i], ADD(c, multiplyAlpha65535(LOAD(&dest[i]), cAlpha))); } } } @@ -258,12 +286,12 @@ void QT_FASTCALL comp_func_SourceOver_rgb64(QRgba64 *Q_DECL_RESTRICT dest, const if (s.isOpaque()) dest[i] = s; else if (!s.isTransparent()) - dest[i] = s + multiplyAlpha65535(dest[i], 65535 - s.alpha()); + STORE(&dest[i], ADD(CONVERT(s), multiplyAlpha65535(LOAD(&dest[i]), 65535 - s.alpha()))); } } else { for (int i = 0; i < length; ++i) { - QRgba64 s = multiplyAlpha255(src[i], const_alpha); - dest[i] = s + multiplyAlpha65535(dest[i], 65535 - s.alpha()); + auto s = multiplyAlpha255(LOAD(&src[i]), const_alpha); + STORE(&dest[i], ADD(s, multiplyAlpha65535(LOAD(&dest[i]), INVALPHA(s)))); } } } @@ -287,11 +315,12 @@ void QT_FASTCALL comp_func_solid_DestinationOver(uint *dest, int length, uint co void QT_FASTCALL comp_func_solid_DestinationOver_rgb64(QRgba64 *dest, int length, QRgba64 color, uint const_alpha) { + auto c = CONVERT(color); if (const_alpha != 255) - color = multiplyAlpha255(color, const_alpha); + c = multiplyAlpha255(c, const_alpha); for (int i = 0; i < length; ++i) { - QRgba64 d = dest[i]; - dest[i] = d + multiplyAlpha65535(color, 65535 - d.alpha()); + auto d = LOAD(&dest[i]); + STORE(&dest[i], ADD(d, multiplyAlpha65535(c, INVALPHA(d)))); } } @@ -318,14 +347,14 @@ void QT_FASTCALL comp_func_DestinationOver_rgb64(QRgba64 *Q_DECL_RESTRICT dest, { if (const_alpha == 255) { for (int i = 0; i < length; ++i) { - QRgba64 d = dest[i]; - dest[i] = d + multiplyAlpha65535(src[i], 65535 - d.alpha()); + auto d = LOAD(&dest[i]); + STORE(&dest[i], ADD(d, multiplyAlpha65535(LOAD(&src[i]), INVALPHA(d)))); } } else { for (int i = 0; i < length; ++i) { - QRgba64 d = dest[i]; - QRgba64 s = multiplyAlpha255(src[i], const_alpha); - dest[i] = d + multiplyAlpha65535(s, 65535 - d.alpha()); + auto d = LOAD(&dest[i]); + auto s = multiplyAlpha255(LOAD(&src[i]), const_alpha); + STORE(&dest[i], ADD(d, multiplyAlpha65535(s, INVALPHA(d)))); } } } @@ -393,15 +422,15 @@ void QT_FASTCALL comp_func_SourceIn_rgb64(QRgba64 *Q_DECL_RESTRICT dest, const Q { if (const_alpha == 255) { for (int i = 0; i < length; ++i) { - dest[i] = multiplyAlpha65535(src[i], dest[i].alpha()); + STORE(&dest[i], multiplyAlpha65535(LOAD(&src[i]), dest[i].alpha())); } } else { uint ca = const_alpha * 257; - uint cia = 65535 - ca; + auto cia = CONST(65535 - ca); for (int i = 0; i < length; ++i) { - QRgba64 d = dest[i]; - QRgba64 s = multiplyAlpha65535(src[i], ca); - dest[i] = interpolate65535(s, d.alpha(), d, cia); + auto d = LOAD(&dest[i]); + auto s = multiplyAlpha65535(LOAD(&src[i]), ca); + STORE(&dest[i], interpolate65535(s, ALPHA(d), d, cia)); } } } @@ -431,7 +460,7 @@ void QT_FASTCALL comp_func_solid_DestinationIn_rgb64(QRgba64 *dest, int length, if (const_alpha != 255) a = qt_div_65535(a * ca64k) + 65535 - ca64k; for (int i = 0; i < length; ++i) { - dest[i] = multiplyAlpha65535(dest[i], a); + STORE(&dest[i], multiplyAlpha65535(LOAD(&dest[i]), a)); } } @@ -885,14 +914,19 @@ void QT_FASTCALL comp_func_solid_Plus(uint *dest, int length, uint color, uint c void QT_FASTCALL comp_func_solid_Plus_rgb64(QRgba64 *dest, int length, QRgba64 color, uint const_alpha) { + auto b = CONVERT(color); if (const_alpha == 255) { for (int i = 0; i < length; ++i) { - dest[i] = addWithSaturation(dest[i], color); + auto a = LOAD(&dest[i]); + a = addWithSaturation(a, b); + STORE(&dest[i], a); } } else { for (int i = 0; i < length; ++i) { - QRgba64 d = addWithSaturation(dest[i], color); - dest[i] = interpolate255(d, const_alpha, dest[i], 255 - const_alpha); + auto a = LOAD(&dest[i]); + auto d = addWithSaturation(a, b); + a = interpolate255(d, const_alpha, a, 255 - const_alpha); + STORE(&dest[i], a); } } } @@ -924,12 +958,18 @@ void QT_FASTCALL comp_func_Plus_rgb64(QRgba64 *Q_DECL_RESTRICT dest, const QRgba { if (const_alpha == 255) { for (int i = 0; i < length; ++i) { - dest[i] = addWithSaturation(dest[i], src[i]); + auto a = LOAD(&dest[i]); + auto b = LOAD(&src[i]); + a = addWithSaturation(a, b); + STORE(&dest[i], a); } } else { for (int i = 0; i < length; ++i) { - QRgba64 d = addWithSaturation(dest[i], src[i]); - dest[i] = interpolate255(d, const_alpha, dest[i], 255 - const_alpha); + auto a = LOAD(&dest[i]); + auto b = LOAD(&src[i]); + auto d = addWithSaturation(a, b); + a = interpolate255(d, const_alpha, a, 255 - const_alpha); + STORE(&dest[i], a); } } } diff --git a/src/gui/painting/qrgba64_p.h b/src/gui/painting/qrgba64_p.h index cf3dad5f90..b88a5d4c92 100644 --- a/src/gui/painting/qrgba64_p.h +++ b/src/gui/painting/qrgba64_p.h @@ -72,27 +72,49 @@ inline QRgba64 multiplyAlpha256(QRgba64 rgba64, uint alpha256) inline QRgba64 multiplyAlpha65535(QRgba64 rgba64, uint alpha65535) { + return QRgba64::fromRgba64(qt_div_65535(rgba64.red() * alpha65535), + qt_div_65535(rgba64.green() * alpha65535), + qt_div_65535(rgba64.blue() * alpha65535), + qt_div_65535(rgba64.alpha() * alpha65535)); +} + #ifdef __SSE2__ - const __m128i va = _mm_shufflelo_epi16(_mm_cvtsi32_si128(alpha65535), _MM_SHUFFLE(0, 0, 0, 0)); - __m128i vs = _mm_loadl_epi64((__m128i*)&rgba64); +Q_ALWAYS_INLINE __m128i multiplyAlpha65535(__m128i rgba64, __m128i va) +{ + __m128i vs = rgba64; vs = _mm_unpacklo_epi16(_mm_mullo_epi16(vs, va), _mm_mulhi_epu16(vs, va)); vs = _mm_add_epi32(vs, _mm_srli_epi32(vs, 16)); vs = _mm_add_epi32(vs, _mm_set1_epi32(0x8000)); vs = _mm_srai_epi32(vs, 16); vs = _mm_packs_epi32(vs, _mm_setzero_si128()); - _mm_storel_epi64((__m128i*)&rgba64, vs); - return rgba64; -#else - return QRgba64::fromRgba64(qt_div_65535(rgba64.red() * alpha65535), - qt_div_65535(rgba64.green() * alpha65535), - qt_div_65535(rgba64.blue() * alpha65535), - qt_div_65535(rgba64.alpha() * alpha65535)); + return vs; +} +Q_ALWAYS_INLINE __m128i multiplyAlpha65535(__m128i rgba64, uint alpha65535) +{ + const __m128i va = _mm_shufflelo_epi16(_mm_cvtsi32_si128(alpha65535), _MM_SHUFFLE(0, 0, 0, 0)); + return multiplyAlpha65535(rgba64, va); +} #endif + +#if defined(__ARM_NEON__) +Q_ALWAYS_INLINE uint16x4_t multiplyAlpha65535(uint16x4_t rgba64, uint16x4_t alpha65535) +{ + uint32x4_t vs32 = vmull_u16(rgba64, alpha65535); // vs = vs * alpha + vs32 = vsraq_n_u32(vs32, vs32, 16); // vs = vs + (vs >> 16) + return vrshrn_n_u32(vs32, 16); // vs = (vs + 0x8000) >> 16 } +Q_ALWAYS_INLINE uint16x4_t multiplyAlpha65535(uint16x4_t rgba64, uint alpha65535) +{ + uint32x4_t vs32 = vmull_n_u16(rgba64, alpha65535); // vs = vs * alpha + vs32 = vsraq_n_u32(vs32, vs32, 16); // vs = vs + (vs >> 16) + return vrshrn_n_u32(vs32, 16); // vs = (vs + 0x8000) >> 16 +} +#endif -inline QRgba64 multiplyAlpha255(QRgba64 rgba64, uint alpha255) +template<typename T> +inline T multiplyAlpha255(T rgba64, uint alpha255) { -#ifdef __SSE2__ +#if defined(__SSE2__) || defined(__ARM_NEON__) return multiplyAlpha65535(rgba64, alpha255 * 257); #else return QRgba64::fromRgba64(qt_div_255(rgba64.red() * alpha255), @@ -112,25 +134,69 @@ inline QRgba64 interpolate255(QRgba64 x, uint alpha1, QRgba64 y, uint alpha2) return QRgba64::fromRgba64(multiplyAlpha255(x, alpha1) + multiplyAlpha255(y, alpha2)); } +#if defined __SSE2__ +Q_ALWAYS_INLINE __m128i interpolate255(__m128i x, uint alpha1, __m128i y, uint alpha2) +{ + return _mm_add_epi32(multiplyAlpha255(x, alpha1), multiplyAlpha255(y, alpha2)); +} +#endif + +#if defined __ARM_NEON__ +Q_ALWAYS_INLINE uint16x4_t interpolate255(uint16x4_t x, uint alpha1, uint16x4_t y, uint alpha2) +{ + return vadd_u16(multiplyAlpha255(x, alpha1), multiplyAlpha255(y, alpha2)); +} +#endif + inline QRgba64 interpolate65535(QRgba64 x, uint alpha1, QRgba64 y, uint alpha2) { return QRgba64::fromRgba64(multiplyAlpha65535(x, alpha1) + multiplyAlpha65535(y, alpha2)); } +#if defined __SSE2__ +Q_ALWAYS_INLINE __m128i interpolate65535(__m128i x, uint alpha1, __m128i y, uint alpha2) +{ + return _mm_add_epi32(multiplyAlpha65535(x, alpha1), multiplyAlpha65535(y, alpha2)); +} +// alpha2 below is const-ref because otherwise MSVC2013 complains that it can't 16-byte align the argument. +Q_ALWAYS_INLINE __m128i interpolate65535(__m128i x, __m128i alpha1, __m128i y, const __m128i &alpha2) +{ + return _mm_add_epi32(multiplyAlpha65535(x, alpha1), multiplyAlpha65535(y, alpha2)); +} +#endif + +#if defined __ARM_NEON__ +Q_ALWAYS_INLINE uint16x4_t interpolate65535(uint16x4_t x, uint alpha1, uint16x4_t y, uint alpha2) +{ + return vadd_u16(multiplyAlpha65535(x, alpha1), multiplyAlpha65535(y, alpha2)); +} +Q_ALWAYS_INLINE uint16x4_t interpolate65535(uint16x4_t x, uint16x4_t alpha1, uint16x4_t y, uint16x4_t alpha2) +{ + return vadd_u16(multiplyAlpha65535(x, alpha1), multiplyAlpha65535(y, alpha2)); +} +#endif + inline QRgba64 addWithSaturation(QRgba64 a, QRgba64 b) { -#if defined(__SSE2__) && defined(Q_PROCESSOR_X86_64) - __m128i va = _mm_cvtsi64_si128((quint64)a); - __m128i vb = _mm_cvtsi64_si128((quint64)b); - va = _mm_adds_epu16(va, vb); - return QRgba64::fromRgba64(_mm_cvtsi128_si64(va)); -#else return QRgba64::fromRgba64(qMin(a.red() + b.red(), 65535), qMin(a.green() + b.green(), 65535), qMin(a.blue() + b.blue(), 65535), qMin(a.alpha() + b.alpha(), 65535)); +} + +#if defined(__SSE2__) +Q_ALWAYS_INLINE __m128i addWithSaturation(__m128i a, __m128i b) +{ + return _mm_adds_epu16(a, b); +} #endif + +#if defined(__ARM_NEON__) +Q_ALWAYS_INLINE uint16x4_t addWithSaturation(uint16x4_t a, uint16x4_t b) +{ + return vqmovn_u32(vaddl_u16(a, b)); } +#endif QT_END_NAMESPACE |