diff options
Diffstat (limited to 'src/gui/painting/qrgba64_p.h')
-rw-r--r-- | src/gui/painting/qrgba64_p.h | 101 |
1 files changed, 84 insertions, 17 deletions
diff --git a/src/gui/painting/qrgba64_p.h b/src/gui/painting/qrgba64_p.h index cf3dad5f90..0dadc038fa 100644 --- a/src/gui/painting/qrgba64_p.h +++ b/src/gui/painting/qrgba64_p.h @@ -51,6 +51,7 @@ // We mean it. // +#include <QtGui/private/qtguiglobal_p.h> #include <QtGui/qrgba64.h> #include <QtGui/private/qdrawhelper_p.h> #include <private/qsimd_p.h> @@ -72,27 +73,49 @@ inline QRgba64 multiplyAlpha256(QRgba64 rgba64, uint alpha256) inline QRgba64 multiplyAlpha65535(QRgba64 rgba64, uint alpha65535) { + return QRgba64::fromRgba64(qt_div_65535(rgba64.red() * alpha65535), + qt_div_65535(rgba64.green() * alpha65535), + qt_div_65535(rgba64.blue() * alpha65535), + qt_div_65535(rgba64.alpha() * alpha65535)); +} + #ifdef __SSE2__ - const __m128i va = _mm_shufflelo_epi16(_mm_cvtsi32_si128(alpha65535), _MM_SHUFFLE(0, 0, 0, 0)); - __m128i vs = _mm_loadl_epi64((__m128i*)&rgba64); +Q_ALWAYS_INLINE __m128i multiplyAlpha65535(__m128i rgba64, __m128i va) +{ + __m128i vs = rgba64; vs = _mm_unpacklo_epi16(_mm_mullo_epi16(vs, va), _mm_mulhi_epu16(vs, va)); vs = _mm_add_epi32(vs, _mm_srli_epi32(vs, 16)); vs = _mm_add_epi32(vs, _mm_set1_epi32(0x8000)); vs = _mm_srai_epi32(vs, 16); vs = _mm_packs_epi32(vs, _mm_setzero_si128()); - _mm_storel_epi64((__m128i*)&rgba64, vs); - return rgba64; -#else - return QRgba64::fromRgba64(qt_div_65535(rgba64.red() * alpha65535), - qt_div_65535(rgba64.green() * alpha65535), - qt_div_65535(rgba64.blue() * alpha65535), - qt_div_65535(rgba64.alpha() * alpha65535)); + return vs; +} +Q_ALWAYS_INLINE __m128i multiplyAlpha65535(__m128i rgba64, uint alpha65535) +{ + const __m128i va = _mm_shufflelo_epi16(_mm_cvtsi32_si128(alpha65535), _MM_SHUFFLE(0, 0, 0, 0)); + return multiplyAlpha65535(rgba64, va); +} #endif + +#if defined(__ARM_NEON__) +Q_ALWAYS_INLINE uint16x4_t multiplyAlpha65535(uint16x4_t rgba64, uint16x4_t alpha65535) +{ + uint32x4_t vs32 = vmull_u16(rgba64, alpha65535); // vs = vs * alpha + vs32 = vsraq_n_u32(vs32, vs32, 16); // vs = vs + (vs >> 16) + return vrshrn_n_u32(vs32, 16); // vs = (vs + 0x8000) >> 16 } +Q_ALWAYS_INLINE uint16x4_t multiplyAlpha65535(uint16x4_t rgba64, uint alpha65535) +{ + uint32x4_t vs32 = vmull_n_u16(rgba64, alpha65535); // vs = vs * alpha + vs32 = vsraq_n_u32(vs32, vs32, 16); // vs = vs + (vs >> 16) + return vrshrn_n_u32(vs32, 16); // vs = (vs + 0x8000) >> 16 +} +#endif -inline QRgba64 multiplyAlpha255(QRgba64 rgba64, uint alpha255) +template<typename T> +inline T multiplyAlpha255(T rgba64, uint alpha255) { -#ifdef __SSE2__ +#if defined(__SSE2__) || defined(__ARM_NEON__) return multiplyAlpha65535(rgba64, alpha255 * 257); #else return QRgba64::fromRgba64(qt_div_255(rgba64.red() * alpha255), @@ -112,25 +135,69 @@ inline QRgba64 interpolate255(QRgba64 x, uint alpha1, QRgba64 y, uint alpha2) return QRgba64::fromRgba64(multiplyAlpha255(x, alpha1) + multiplyAlpha255(y, alpha2)); } +#if defined __SSE2__ +Q_ALWAYS_INLINE __m128i interpolate255(__m128i x, uint alpha1, __m128i y, uint alpha2) +{ + return _mm_add_epi32(multiplyAlpha255(x, alpha1), multiplyAlpha255(y, alpha2)); +} +#endif + +#if defined __ARM_NEON__ +Q_ALWAYS_INLINE uint16x4_t interpolate255(uint16x4_t x, uint alpha1, uint16x4_t y, uint alpha2) +{ + return vadd_u16(multiplyAlpha255(x, alpha1), multiplyAlpha255(y, alpha2)); +} +#endif + inline QRgba64 interpolate65535(QRgba64 x, uint alpha1, QRgba64 y, uint alpha2) { return QRgba64::fromRgba64(multiplyAlpha65535(x, alpha1) + multiplyAlpha65535(y, alpha2)); } +#if defined __SSE2__ +Q_ALWAYS_INLINE __m128i interpolate65535(__m128i x, uint alpha1, __m128i y, uint alpha2) +{ + return _mm_add_epi32(multiplyAlpha65535(x, alpha1), multiplyAlpha65535(y, alpha2)); +} +// alpha2 below is const-ref because otherwise MSVC2013 complains that it can't 16-byte align the argument. +Q_ALWAYS_INLINE __m128i interpolate65535(__m128i x, __m128i alpha1, __m128i y, const __m128i &alpha2) +{ + return _mm_add_epi32(multiplyAlpha65535(x, alpha1), multiplyAlpha65535(y, alpha2)); +} +#endif + +#if defined __ARM_NEON__ +Q_ALWAYS_INLINE uint16x4_t interpolate65535(uint16x4_t x, uint alpha1, uint16x4_t y, uint alpha2) +{ + return vadd_u16(multiplyAlpha65535(x, alpha1), multiplyAlpha65535(y, alpha2)); +} +Q_ALWAYS_INLINE uint16x4_t interpolate65535(uint16x4_t x, uint16x4_t alpha1, uint16x4_t y, uint16x4_t alpha2) +{ + return vadd_u16(multiplyAlpha65535(x, alpha1), multiplyAlpha65535(y, alpha2)); +} +#endif + inline QRgba64 addWithSaturation(QRgba64 a, QRgba64 b) { -#if defined(__SSE2__) && defined(Q_PROCESSOR_X86_64) - __m128i va = _mm_cvtsi64_si128((quint64)a); - __m128i vb = _mm_cvtsi64_si128((quint64)b); - va = _mm_adds_epu16(va, vb); - return QRgba64::fromRgba64(_mm_cvtsi128_si64(va)); -#else return QRgba64::fromRgba64(qMin(a.red() + b.red(), 65535), qMin(a.green() + b.green(), 65535), qMin(a.blue() + b.blue(), 65535), qMin(a.alpha() + b.alpha(), 65535)); +} + +#if defined(__SSE2__) +Q_ALWAYS_INLINE __m128i addWithSaturation(__m128i a, __m128i b) +{ + return _mm_adds_epu16(a, b); +} #endif + +#if defined(__ARM_NEON__) +Q_ALWAYS_INLINE uint16x4_t addWithSaturation(uint16x4_t a, uint16x4_t b) +{ + return vqmovn_u32(vaddl_u16(a, b)); } +#endif QT_END_NAMESPACE |