diff options
Diffstat (limited to 'src/gui/painting/qrgba64_p.h')
-rw-r--r-- | src/gui/painting/qrgba64_p.h | 219 |
1 files changed, 129 insertions, 90 deletions
diff --git a/src/gui/painting/qrgba64_p.h b/src/gui/painting/qrgba64_p.h index d145dbfbea..ae8b6fd8cb 100644 --- a/src/gui/painting/qrgba64_p.h +++ b/src/gui/painting/qrgba64_p.h @@ -1,41 +1,5 @@ -/**************************************************************************** -** -** Copyright (C) 2016 The Qt Company Ltd. -** Contact: https://www.qt.io/licensing/ -** -** This file is part of the QtGui module of the Qt Toolkit. -** -** $QT_BEGIN_LICENSE:LGPL$ -** Commercial License Usage -** Licensees holding valid commercial Qt licenses may use this file in -** accordance with the commercial license agreement provided with the -** Software or, alternatively, in accordance with the terms contained in -** a written agreement between you and The Qt Company. For licensing terms -** and conditions see https://www.qt.io/terms-conditions. For further -** information use the contact form at https://www.qt.io/contact-us. -** -** GNU Lesser General Public License Usage -** Alternatively, this file may be used under the terms of the GNU Lesser -** General Public License version 3 as published by the Free Software -** Foundation and appearing in the file LICENSE.LGPL3 included in the -** packaging of this file. Please review the following information to -** ensure the GNU Lesser General Public License version 3 requirements -** will be met: https://www.gnu.org/licenses/lgpl-3.0.html. -** -** GNU General Public License Usage -** Alternatively, this file may be used under the terms of the GNU -** General Public License version 2.0 or (at your option) the GNU General -** Public license version 3 or any later version approved by the KDE Free -** Qt Foundation. The licenses are as published by the Free Software -** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3 -** included in the packaging of this file. Please review the following -** information to ensure the GNU General Public License requirements will -** be met: https://www.gnu.org/licenses/gpl-2.0.html and -** https://www.gnu.org/licenses/gpl-3.0.html. -** -** $QT_END_LICENSE$ -** -****************************************************************************/ +// Copyright (C) 2020 The Qt Company Ltd. +// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only #ifndef QRGBA64_P_H #define QRGBA64_P_H @@ -64,40 +28,30 @@ inline QRgba64 combineAlpha256(QRgba64 rgba64, uint alpha256) return QRgba64::fromRgba64(rgba64.red(), rgba64.green(), rgba64.blue(), (rgba64.alpha() * alpha256) >> 8); } -inline QRgba64 multiplyAlpha65535(QRgba64 rgba64, uint alpha65535) -{ - return QRgba64::fromRgba64(qt_div_65535(rgba64.red() * alpha65535), - qt_div_65535(rgba64.green() * alpha65535), - qt_div_65535(rgba64.blue() * alpha65535), - qt_div_65535(rgba64.alpha() * alpha65535)); -} - -#ifdef __SSE2__ -Q_ALWAYS_INLINE __m128i multiplyAlpha65535(__m128i rgba64, __m128i va) +#if defined(__SSE2__) +static inline __m128i Q_DECL_VECTORCALL multiplyAlpha65535(__m128i rgba64, __m128i va) { __m128i vs = rgba64; vs = _mm_unpacklo_epi16(_mm_mullo_epi16(vs, va), _mm_mulhi_epu16(vs, va)); vs = _mm_add_epi32(vs, _mm_srli_epi32(vs, 16)); vs = _mm_add_epi32(vs, _mm_set1_epi32(0x8000)); vs = _mm_srai_epi32(vs, 16); - vs = _mm_packs_epi32(vs, _mm_setzero_si128()); + vs = _mm_packs_epi32(vs, vs); return vs; } -Q_ALWAYS_INLINE __m128i multiplyAlpha65535(__m128i rgba64, uint alpha65535) +static inline __m128i Q_DECL_VECTORCALL multiplyAlpha65535(__m128i rgba64, uint alpha65535) { const __m128i va = _mm_shufflelo_epi16(_mm_cvtsi32_si128(alpha65535), _MM_SHUFFLE(0, 0, 0, 0)); return multiplyAlpha65535(rgba64, va); } -#endif - -#if defined(__ARM_NEON__) -Q_ALWAYS_INLINE uint16x4_t multiplyAlpha65535(uint16x4_t rgba64, uint16x4_t alpha65535) +#elif defined(__ARM_NEON__) +static inline uint16x4_t multiplyAlpha65535(uint16x4_t rgba64, uint16x4_t alpha65535) { uint32x4_t vs32 = vmull_u16(rgba64, alpha65535); // vs = vs * alpha vs32 = vsraq_n_u32(vs32, vs32, 16); // vs = vs + (vs >> 16) return vrshrn_n_u32(vs32, 16); // vs = (vs + 0x8000) >> 16 } -Q_ALWAYS_INLINE uint16x4_t multiplyAlpha65535(uint16x4_t rgba64, uint alpha65535) +static inline uint16x4_t multiplyAlpha65535(uint16x4_t rgba64, uint alpha65535) { uint32x4_t vs32 = vmull_n_u16(rgba64, alpha65535); // vs = vs * alpha vs32 = vsraq_n_u32(vs32, vs32, 16); // vs = vs + (vs >> 16) @@ -105,77 +59,144 @@ Q_ALWAYS_INLINE uint16x4_t multiplyAlpha65535(uint16x4_t rgba64, uint alpha65535 } #endif -template<typename T> -inline T multiplyAlpha255(T rgba64, uint alpha255) +static inline QRgba64 multiplyAlpha65535(QRgba64 rgba64, uint alpha65535) { +#if defined(__SSE2__) + const __m128i v = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&rgba64)); + const __m128i vr = multiplyAlpha65535(v, alpha65535); + QRgba64 r; + _mm_storel_epi64(reinterpret_cast<__m128i *>(&r), vr); + return r; +#elif defined(__ARM_NEON__) + const uint16x4_t v = vreinterpret_u16_u64(vld1_u64(reinterpret_cast<const uint64_t *>(&rgba64))); + const uint16x4_t vr = multiplyAlpha65535(v, alpha65535); + QRgba64 r; + vst1_u64(reinterpret_cast<uint64_t *>(&r), vreinterpret_u64_u16(vr)); + return r; +#else + return QRgba64::fromRgba64(qt_div_65535(rgba64.red() * alpha65535), + qt_div_65535(rgba64.green() * alpha65535), + qt_div_65535(rgba64.blue() * alpha65535), + qt_div_65535(rgba64.alpha() * alpha65535)); +#endif +} + #if defined(__SSE2__) || defined(__ARM_NEON__) +template<typename T> +static inline T Q_DECL_VECTORCALL multiplyAlpha255(T rgba64, uint alpha255) +{ return multiplyAlpha65535(rgba64, alpha255 * 257); +} #else +template<typename T> +static inline T multiplyAlpha255(T rgba64, uint alpha255) +{ return QRgba64::fromRgba64(qt_div_255(rgba64.red() * alpha255), qt_div_255(rgba64.green() * alpha255), qt_div_255(rgba64.blue() * alpha255), qt_div_255(rgba64.alpha() * alpha255)); -#endif -} - -inline QRgba64 interpolate255(QRgba64 x, uint alpha1, QRgba64 y, uint alpha2) -{ - return QRgba64::fromRgba64(multiplyAlpha255(x, alpha1) + multiplyAlpha255(y, alpha2)); } +#endif #if defined __SSE2__ -Q_ALWAYS_INLINE __m128i interpolate255(__m128i x, uint alpha1, __m128i y, uint alpha2) +static inline __m128i Q_DECL_VECTORCALL interpolate255(__m128i x, uint alpha1, __m128i y, uint alpha2) { - return _mm_add_epi32(multiplyAlpha255(x, alpha1), multiplyAlpha255(y, alpha2)); + return _mm_add_epi16(multiplyAlpha255(x, alpha1), multiplyAlpha255(y, alpha2)); } #endif #if defined __ARM_NEON__ -Q_ALWAYS_INLINE uint16x4_t interpolate255(uint16x4_t x, uint alpha1, uint16x4_t y, uint alpha2) +inline uint16x4_t interpolate255(uint16x4_t x, uint alpha1, uint16x4_t y, uint alpha2) { return vadd_u16(multiplyAlpha255(x, alpha1), multiplyAlpha255(y, alpha2)); } #endif -inline QRgba64 interpolate65535(QRgba64 x, uint alpha1, QRgba64 y, uint alpha2) +static inline QRgba64 interpolate255(QRgba64 x, uint alpha1, QRgba64 y, uint alpha2) { - return QRgba64::fromRgba64(multiplyAlpha65535(x, alpha1) + multiplyAlpha65535(y, alpha2)); +#if defined(__SSE2__) + const __m128i vx = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&x)); + const __m128i vy = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&y)); + const __m128i vr = interpolate255(vx, alpha1, vy, alpha2); + QRgba64 r; + _mm_storel_epi64(reinterpret_cast<__m128i *>(&r), vr); + return r; +#elif defined(__ARM_NEON__) + const uint16x4_t vx = vreinterpret_u16_u64(vld1_u64(reinterpret_cast<const uint64_t *>(&x))); + const uint16x4_t vy = vreinterpret_u16_u64(vld1_u64(reinterpret_cast<const uint64_t *>(&y))); + const uint16x4_t vr = interpolate255(vx, alpha1, vy, alpha2); + QRgba64 r; + vst1_u64(reinterpret_cast<uint64_t *>(&r), vreinterpret_u64_u16(vr)); + return r; +#else + return QRgba64::fromRgba64(multiplyAlpha255(x, alpha1) + multiplyAlpha255(y, alpha2)); +#endif } #if defined __SSE2__ -Q_ALWAYS_INLINE __m128i interpolate65535(__m128i x, uint alpha1, __m128i y, uint alpha2) +static inline __m128i Q_DECL_VECTORCALL interpolate65535(__m128i x, uint alpha1, __m128i y, uint alpha2) { - return _mm_add_epi32(multiplyAlpha65535(x, alpha1), multiplyAlpha65535(y, alpha2)); + return _mm_add_epi16(multiplyAlpha65535(x, alpha1), multiplyAlpha65535(y, alpha2)); } -// alpha2 below is const-ref because otherwise MSVC2015 complains that it can't 16-byte align the argument. -Q_ALWAYS_INLINE __m128i interpolate65535(__m128i x, __m128i alpha1, __m128i y, const __m128i &alpha2) + +static inline __m128i Q_DECL_VECTORCALL interpolate65535(__m128i x, __m128i alpha1, __m128i y, __m128i alpha2) { - return _mm_add_epi32(multiplyAlpha65535(x, alpha1), multiplyAlpha65535(y, alpha2)); + return _mm_add_epi16(multiplyAlpha65535(x, alpha1), multiplyAlpha65535(y, alpha2)); } #endif #if defined __ARM_NEON__ -Q_ALWAYS_INLINE uint16x4_t interpolate65535(uint16x4_t x, uint alpha1, uint16x4_t y, uint alpha2) +inline uint16x4_t interpolate65535(uint16x4_t x, uint alpha1, uint16x4_t y, uint alpha2) { return vadd_u16(multiplyAlpha65535(x, alpha1), multiplyAlpha65535(y, alpha2)); } -Q_ALWAYS_INLINE uint16x4_t interpolate65535(uint16x4_t x, uint16x4_t alpha1, uint16x4_t y, uint16x4_t alpha2) +inline uint16x4_t interpolate65535(uint16x4_t x, uint16x4_t alpha1, uint16x4_t y, uint16x4_t alpha2) { return vadd_u16(multiplyAlpha65535(x, alpha1), multiplyAlpha65535(y, alpha2)); } #endif -inline QRgba64 addWithSaturation(QRgba64 a, QRgba64 b) +static inline QRgba64 interpolate65535(QRgba64 x, uint alpha1, QRgba64 y, uint alpha2) { +#if defined(__SSE2__) + const __m128i vx = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&x)); + const __m128i vy = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&y)); + const __m128i vr = interpolate65535(vx, alpha1, vy, alpha2); + QRgba64 r; + _mm_storel_epi64(reinterpret_cast<__m128i *>(&r), vr); + return r; +#elif defined(__ARM_NEON__) + const uint16x4_t vx = vreinterpret_u16_u64(vld1_u64(reinterpret_cast<const uint64_t *>(&x))); + const uint16x4_t vy = vreinterpret_u16_u64(vld1_u64(reinterpret_cast<const uint64_t *>(&y))); + const uint16x4_t vr = interpolate65535(vx, alpha1, vy, alpha2); + QRgba64 r; + vst1_u64(reinterpret_cast<uint64_t *>(&r), vreinterpret_u64_u16(vr)); + return r; +#else + return QRgba64::fromRgba64(multiplyAlpha65535(x, alpha1) + multiplyAlpha65535(y, alpha2)); +#endif +} + +static inline QRgba64 addWithSaturation(QRgba64 a, QRgba64 b) +{ +#if defined(__SSE2__) + const __m128i va = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&a)); + const __m128i vb = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&b)); + const __m128i vr = _mm_adds_epu16(va, vb); + QRgba64 r; + _mm_storel_epi64(reinterpret_cast<__m128i *>(&r), vr); + return r; +#else return QRgba64::fromRgba64(qMin(a.red() + b.red(), 65535), qMin(a.green() + b.green(), 65535), qMin(a.blue() + b.blue(), 65535), qMin(a.alpha() + b.alpha(), 65535)); +#endif } #if QT_COMPILER_SUPPORTS_HERE(SSE2) QT_FUNCTION_TARGET(SSE2) -Q_ALWAYS_INLINE uint toArgb32(__m128i v) +static inline uint Q_DECL_VECTORCALL toArgb32(__m128i v) { v = _mm_unpacklo_epi16(v, _mm_setzero_si128()); v = _mm_add_epi32(v, _mm_set1_epi32(128)); @@ -186,7 +207,7 @@ Q_ALWAYS_INLINE uint toArgb32(__m128i v) return _mm_cvtsi128_si32(v); } #elif defined __ARM_NEON__ -Q_ALWAYS_INLINE uint toArgb32(uint16x4_t v) +static inline uint toArgb32(uint16x4_t v) { v = vsub_u16(v, vrshr_n_u16(v, 8)); v = vrshr_n_u16(v, 8); @@ -195,10 +216,10 @@ Q_ALWAYS_INLINE uint toArgb32(uint16x4_t v) } #endif -inline uint toArgb32(QRgba64 rgba64) +static inline uint toArgb32(QRgba64 rgba64) { #if defined __SSE2__ - __m128i v = _mm_loadl_epi64((const __m128i *)&rgba64); + __m128i v = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&rgba64)); v = _mm_shufflelo_epi16(v, _MM_SHUFFLE(3, 0, 1, 2)); return toArgb32(v); #elif defined __ARM_NEON__ @@ -215,10 +236,10 @@ inline uint toArgb32(QRgba64 rgba64) #endif } -inline uint toRgba8888(QRgba64 rgba64) +static inline uint toRgba8888(QRgba64 rgba64) { #if defined __SSE2__ - __m128i v = _mm_loadl_epi64((const __m128i *)&rgba64); + __m128i v = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&rgba64)); return toArgb32(v); #elif defined __ARM_NEON__ uint16x4_t v = vreinterpret_u16_u64(vld1_u64(reinterpret_cast<const uint64_t *>(&rgba64))); @@ -228,12 +249,12 @@ inline uint toRgba8888(QRgba64 rgba64) #endif } -inline QRgba64 rgbBlend(QRgba64 d, QRgba64 s, uint rgbAlpha) +static inline QRgba64 rgbBlend(QRgba64 d, QRgba64 s, uint rgbAlpha) { QRgba64 blend; #if defined(__SSE2__) - __m128i vd = _mm_loadl_epi64((const __m128i *)&d); - __m128i vs = _mm_loadl_epi64((const __m128i *)&s); + __m128i vd = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&d)); + __m128i vs = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&s)); __m128i va = _mm_cvtsi32_si128(rgbAlpha); va = _mm_unpacklo_epi8(va, va); va = _mm_shufflelo_epi16(va, _MM_SHUFFLE(3, 0, 1, 2)); @@ -245,9 +266,9 @@ inline QRgba64 rgbBlend(QRgba64 d, QRgba64 s, uint rgbAlpha) vd = _mm_add_epi32(vd, _mm_srli_epi32(vd, 16)); vd = _mm_add_epi32(vd, _mm_set1_epi32(0x8000)); vd = _mm_srai_epi32(vd, 16); - vd = _mm_packs_epi32(vd, _mm_setzero_si128()); + vd = _mm_packs_epi32(vd, vd); - _mm_storel_epi64((__m128i *)&blend, vd); + _mm_storel_epi64(reinterpret_cast<__m128i *>(&blend), vd); #elif defined(__ARM_NEON__) uint16x4_t vd = vreinterpret_u16_u64(vmov_n_u64(d)); uint16x4_t vs = vreinterpret_u16_u64(vmov_n_u64(s)); @@ -274,21 +295,39 @@ inline QRgba64 rgbBlend(QRgba64 d, QRgba64 s, uint rgbAlpha) return blend; } -static Q_ALWAYS_INLINE void blend_pixel(QRgba64 &dst, QRgba64 src) +static inline void blend_pixel(QRgba64 &dst, QRgba64 src) { if (src.isOpaque()) dst = src; - else if (!src.isTransparent()) + else if (!src.isTransparent()) { +#if defined(__SSE2__) + const __m128i vd = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&dst)); + const __m128i vs = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&src)); + const __m128i via = _mm_xor_si128(_mm_set1_epi16(-1), _mm_shufflelo_epi16(vs, _MM_SHUFFLE(3, 3, 3, 3))); + const __m128i vr = _mm_add_epi16(vs, multiplyAlpha65535(vd, via)); + _mm_storel_epi64(reinterpret_cast<__m128i *>(&dst), vr); +#else dst = src + multiplyAlpha65535(dst, 65535 - src.alpha()); +#endif + } } -static Q_ALWAYS_INLINE void blend_pixel(QRgba64 &dst, QRgba64 src, const int const_alpha) +static inline void blend_pixel(QRgba64 &dst, QRgba64 src, const int const_alpha) { if (const_alpha == 255) return blend_pixel(dst, src); if (!src.isTransparent()) { +#if defined(__SSE2__) + const __m128i vd = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&dst)); + __m128i vs = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&src)); + vs = multiplyAlpha255(vs, const_alpha); + const __m128i via = _mm_xor_si128(_mm_set1_epi16(-1), _mm_shufflelo_epi16(vs, _MM_SHUFFLE(3, 3, 3, 3))); + const __m128i vr = _mm_add_epi16(vs, multiplyAlpha65535(vd, via)); + _mm_storel_epi64(reinterpret_cast<__m128i *>(&dst), vr); +#else src = multiplyAlpha255(src, const_alpha); dst = src + multiplyAlpha65535(dst, 65535 - src.alpha()); +#endif } } |