1 files changed, 129 insertions, 90 deletions
diff --git a/src/gui/painting/qrgba64_p.h b/src/gui/painting/qrgba64_p.h
index d145dbfbea..ae8b6fd8cb 100644
--- a/src/gui/painting/qrgba64_p.h
+++ b/src/gui/painting/qrgba64_p.h
@@ -1,41 +1,5 @@
-/****************************************************************************
-**
-** Copyright (C) 2016 The Qt Company Ltd.
-** Contact: https://www.qt.io/licensing/
-**
-** This file is part of the QtGui module of the Qt Toolkit.
-**
-** $QT_BEGIN_LICENSE:LGPL$
-** Commercial License Usage
-** Licensees holding valid commercial Qt licenses may use this file in
-** accordance with the commercial license agreement provided with the
-** Software or, alternatively, in accordance with the terms contained in
-** a written agreement between you and The Qt Company. For licensing terms
-** and conditions see https://www.qt.io/terms-conditions. For further
-** information use the contact form at https://www.qt.io/contact-us.
-**
-** GNU Lesser General Public License Usage
-** Alternatively, this file may be used under the terms of the GNU Lesser
-** General Public License version 3 as published by the Free Software
-** Foundation and appearing in the file LICENSE.LGPL3 included in the
-** packaging of this file. Please review the following information to
-** ensure the GNU Lesser General Public License version 3 requirements
-** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
-**
-** GNU General Public License Usage
-** Alternatively, this file may be used under the terms of the GNU
-** General Public License version 2.0 or (at your option) the GNU General
-** Public license version 3 or any later version approved by the KDE Free
-** Qt Foundation. The licenses are as published by the Free Software
-** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
-** included in the packaging of this file. Please review the following
-** information to ensure the GNU General Public License requirements will
-** be met: https://www.gnu.org/licenses/gpl-2.0.html and
-** https://www.gnu.org/licenses/gpl-3.0.html.
-**
-** $QT_END_LICENSE$
-**
-****************************************************************************/
+// Copyright (C) 2020 The Qt Company Ltd.
+// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
 
 #ifndef QRGBA64_P_H
 #define QRGBA64_P_H
@@ -64,40 +28,30 @@ inline QRgba64 combineAlpha256(QRgba64 rgba64, uint alpha256)
     return QRgba64::fromRgba64(rgba64.red(), rgba64.green(), rgba64.blue(), (rgba64.alpha() * alpha256) >> 8);
 }
 
-inline QRgba64 multiplyAlpha65535(QRgba64 rgba64, uint alpha65535)
-{
-    return QRgba64::fromRgba64(qt_div_65535(rgba64.red()   * alpha65535),
-                               qt_div_65535(rgba64.green() * alpha65535),
-                               qt_div_65535(rgba64.blue()  * alpha65535),
-                               qt_div_65535(rgba64.alpha() * alpha65535));
-}
-
-#ifdef __SSE2__
-Q_ALWAYS_INLINE __m128i multiplyAlpha65535(__m128i rgba64, __m128i va)
+#if defined(__SSE2__)
+static inline __m128i Q_DECL_VECTORCALL multiplyAlpha65535(__m128i rgba64, __m128i va)
 {
     __m128i vs = rgba64;
     vs = _mm_unpacklo_epi16(_mm_mullo_epi16(vs, va), _mm_mulhi_epu16(vs, va));
     vs = _mm_add_epi32(vs, _mm_srli_epi32(vs, 16));
     vs = _mm_add_epi32(vs, _mm_set1_epi32(0x8000));
     vs = _mm_srai_epi32(vs, 16);
-    vs = _mm_packs_epi32(vs, _mm_setzero_si128());
+    vs = _mm_packs_epi32(vs, vs);
     return vs;
 }
-Q_ALWAYS_INLINE __m128i multiplyAlpha65535(__m128i rgba64, uint alpha65535)
+static inline __m128i Q_DECL_VECTORCALL multiplyAlpha65535(__m128i rgba64, uint alpha65535)
 {
     const __m128i va = _mm_shufflelo_epi16(_mm_cvtsi32_si128(alpha65535), _MM_SHUFFLE(0, 0, 0, 0));
     return multiplyAlpha65535(rgba64, va);
 }
-#endif
-
-#if defined(__ARM_NEON__)
-Q_ALWAYS_INLINE uint16x4_t multiplyAlpha65535(uint16x4_t rgba64, uint16x4_t alpha65535)
+#elif defined(__ARM_NEON__)
+static inline uint16x4_t multiplyAlpha65535(uint16x4_t rgba64, uint16x4_t alpha65535)
 {
     uint32x4_t vs32 = vmull_u16(rgba64, alpha65535); // vs = vs * alpha
     vs32 = vsraq_n_u32(vs32, vs32, 16); // vs = vs + (vs >> 16)
     return vrshrn_n_u32(vs32, 16); // vs = (vs + 0x8000) >> 16
 }
-Q_ALWAYS_INLINE uint16x4_t multiplyAlpha65535(uint16x4_t rgba64, uint alpha65535)
+static inline uint16x4_t multiplyAlpha65535(uint16x4_t rgba64, uint alpha65535)
 {
     uint32x4_t vs32 = vmull_n_u16(rgba64, alpha65535); // vs = vs * alpha
     vs32 = vsraq_n_u32(vs32, vs32, 16); // vs = vs + (vs >> 16)
@@ -105,77 +59,144 @@ Q_ALWAYS_INLINE uint16x4_t multiplyAlpha65535(uint16x4_t rgba64, uint alpha65535
 }
 #endif
 
-template<typename T>
-inline T multiplyAlpha255(T rgba64, uint alpha255)
+static inline QRgba64 multiplyAlpha65535(QRgba64 rgba64, uint alpha65535)
 {
+#if defined(__SSE2__)
+    const __m128i v = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&rgba64));
+    const __m128i vr = multiplyAlpha65535(v, alpha65535);
+    QRgba64 r;
+    _mm_storel_epi64(reinterpret_cast<__m128i *>(&r), vr);
+    return r;
+#elif defined(__ARM_NEON__)
+    const uint16x4_t v = vreinterpret_u16_u64(vld1_u64(reinterpret_cast<const uint64_t *>(&rgba64)));
+    const uint16x4_t vr = multiplyAlpha65535(v, alpha65535);
+    QRgba64 r;
+    vst1_u64(reinterpret_cast<uint64_t *>(&r), vreinterpret_u64_u16(vr));
+    return r;
+#else
+    return QRgba64::fromRgba64(qt_div_65535(rgba64.red()   * alpha65535),
+                               qt_div_65535(rgba64.green() * alpha65535),
+                               qt_div_65535(rgba64.blue()  * alpha65535),
+                               qt_div_65535(rgba64.alpha() * alpha65535));
+#endif
+}
+
 #if defined(__SSE2__) || defined(__ARM_NEON__)
+template<typename T>
+static inline T Q_DECL_VECTORCALL multiplyAlpha255(T rgba64, uint alpha255)
+{
     return multiplyAlpha65535(rgba64, alpha255 * 257);
+}
 #else
+template<typename T>
+static inline T multiplyAlpha255(T rgba64, uint alpha255)
+{
     return QRgba64::fromRgba64(qt_div_255(rgba64.red()   * alpha255),
                                qt_div_255(rgba64.green() * alpha255),
                                qt_div_255(rgba64.blue()  * alpha255),
                                qt_div_255(rgba64.alpha() * alpha255));
-#endif
-}
-
-inline QRgba64 interpolate255(QRgba64 x, uint alpha1, QRgba64 y, uint alpha2)
-{
-    return QRgba64::fromRgba64(multiplyAlpha255(x, alpha1) + multiplyAlpha255(y, alpha2));
 }
+#endif
 
 #if defined __SSE2__
-Q_ALWAYS_INLINE __m128i interpolate255(__m128i x, uint alpha1, __m128i y, uint alpha2)
+static inline __m128i Q_DECL_VECTORCALL interpolate255(__m128i x, uint alpha1, __m128i y, uint alpha2)
 {
-    return _mm_add_epi32(multiplyAlpha255(x, alpha1), multiplyAlpha255(y, alpha2));
+    return _mm_add_epi16(multiplyAlpha255(x, alpha1), multiplyAlpha255(y, alpha2));
 }
 #endif
 
 #if defined __ARM_NEON__
-Q_ALWAYS_INLINE uint16x4_t interpolate255(uint16x4_t x, uint alpha1, uint16x4_t y, uint alpha2)
+inline uint16x4_t interpolate255(uint16x4_t x, uint alpha1, uint16x4_t y, uint alpha2)
 {
     return vadd_u16(multiplyAlpha255(x, alpha1), multiplyAlpha255(y, alpha2));
 }
 #endif
 
-inline QRgba64 interpolate65535(QRgba64 x, uint alpha1, QRgba64 y, uint alpha2)
+static inline QRgba64 interpolate255(QRgba64 x, uint alpha1, QRgba64 y, uint alpha2)
 {
-    return QRgba64::fromRgba64(multiplyAlpha65535(x, alpha1) + multiplyAlpha65535(y, alpha2));
+#if defined(__SSE2__)
+    const __m128i vx = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&x));
+    const __m128i vy = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&y));
+    const __m128i vr = interpolate255(vx, alpha1, vy, alpha2);
+    QRgba64 r;
+    _mm_storel_epi64(reinterpret_cast<__m128i *>(&r), vr);
+    return r;
+#elif defined(__ARM_NEON__)
+    const uint16x4_t vx = vreinterpret_u16_u64(vld1_u64(reinterpret_cast<const uint64_t *>(&x)));
+    const uint16x4_t vy = vreinterpret_u16_u64(vld1_u64(reinterpret_cast<const uint64_t *>(&y)));
+    const uint16x4_t vr = interpolate255(vx, alpha1, vy, alpha2);
+    QRgba64 r;
+    vst1_u64(reinterpret_cast<uint64_t *>(&r), vreinterpret_u64_u16(vr));
+    return r;
+#else
+    return QRgba64::fromRgba64(multiplyAlpha255(x, alpha1) + multiplyAlpha255(y, alpha2));
+#endif
 }
 
 #if defined __SSE2__
-Q_ALWAYS_INLINE __m128i interpolate65535(__m128i x, uint alpha1, __m128i y, uint alpha2)
+static inline __m128i Q_DECL_VECTORCALL interpolate65535(__m128i x, uint alpha1, __m128i y, uint alpha2)
 {
-    return _mm_add_epi32(multiplyAlpha65535(x, alpha1), multiplyAlpha65535(y, alpha2));
+    return _mm_add_epi16(multiplyAlpha65535(x, alpha1), multiplyAlpha65535(y, alpha2));
 }
-// alpha2 below is const-ref because otherwise MSVC2015 complains that it can't 16-byte align the argument.
-Q_ALWAYS_INLINE __m128i interpolate65535(__m128i x, __m128i alpha1, __m128i y, const __m128i &alpha2)
+
+static inline __m128i Q_DECL_VECTORCALL interpolate65535(__m128i x, __m128i alpha1, __m128i y, __m128i alpha2)
 {
-    return _mm_add_epi32(multiplyAlpha65535(x, alpha1), multiplyAlpha65535(y, alpha2));
+    return _mm_add_epi16(multiplyAlpha65535(x, alpha1), multiplyAlpha65535(y, alpha2));
 }
 #endif
 
 #if defined __ARM_NEON__
-Q_ALWAYS_INLINE uint16x4_t interpolate65535(uint16x4_t x, uint alpha1, uint16x4_t y, uint alpha2)
+inline uint16x4_t interpolate65535(uint16x4_t x, uint alpha1, uint16x4_t y, uint alpha2)
 {
     return vadd_u16(multiplyAlpha65535(x, alpha1), multiplyAlpha65535(y, alpha2));
 }
-Q_ALWAYS_INLINE uint16x4_t interpolate65535(uint16x4_t x, uint16x4_t alpha1, uint16x4_t y, uint16x4_t alpha2)
+inline uint16x4_t interpolate65535(uint16x4_t x, uint16x4_t alpha1, uint16x4_t y, uint16x4_t alpha2)
 {
     return vadd_u16(multiplyAlpha65535(x, alpha1), multiplyAlpha65535(y, alpha2));
 }
 #endif
 
-inline QRgba64 addWithSaturation(QRgba64 a, QRgba64 b)
+static inline QRgba64 interpolate65535(QRgba64 x, uint alpha1, QRgba64 y, uint alpha2)
 {
+#if defined(__SSE2__)
+    const __m128i vx = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&x));
+    const __m128i vy = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&y));
+    const __m128i vr = interpolate65535(vx, alpha1, vy, alpha2);
+    QRgba64 r;
+    _mm_storel_epi64(reinterpret_cast<__m128i *>(&r), vr);
+    return r;
+#elif defined(__ARM_NEON__)
+    const uint16x4_t vx = vreinterpret_u16_u64(vld1_u64(reinterpret_cast<const uint64_t *>(&x)));
+    const uint16x4_t vy = vreinterpret_u16_u64(vld1_u64(reinterpret_cast<const uint64_t *>(&y)));
+    const uint16x4_t vr = interpolate65535(vx, alpha1, vy, alpha2);
+    QRgba64 r;
+    vst1_u64(reinterpret_cast<uint64_t *>(&r), vreinterpret_u64_u16(vr));
+    return r;
+#else
+    return QRgba64::fromRgba64(multiplyAlpha65535(x, alpha1) + multiplyAlpha65535(y, alpha2));
+#endif
+}
+
+static inline QRgba64 addWithSaturation(QRgba64 a, QRgba64 b)
+{
+#if defined(__SSE2__)
+    const __m128i va = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&a));
+    const __m128i vb = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&b));
+    const __m128i vr = _mm_adds_epu16(va, vb);
+    QRgba64 r;
+    _mm_storel_epi64(reinterpret_cast<__m128i *>(&r), vr);
+    return r;
+#else
     return QRgba64::fromRgba64(qMin(a.red() + b.red(), 65535),
                                qMin(a.green() + b.green(), 65535),
                                qMin(a.blue() + b.blue(), 65535),
                                qMin(a.alpha() + b.alpha(), 65535));
+#endif
 }
 
 #if QT_COMPILER_SUPPORTS_HERE(SSE2)
 QT_FUNCTION_TARGET(SSE2)
-Q_ALWAYS_INLINE uint toArgb32(__m128i v)
+static inline uint Q_DECL_VECTORCALL toArgb32(__m128i v)
 {
     v = _mm_unpacklo_epi16(v, _mm_setzero_si128());
     v = _mm_add_epi32(v, _mm_set1_epi32(128));
@@ -186,7 +207,7 @@ Q_ALWAYS_INLINE uint toArgb32(__m128i v)
     return _mm_cvtsi128_si32(v);
 }
 #elif defined __ARM_NEON__
-Q_ALWAYS_INLINE uint toArgb32(uint16x4_t v)
+static inline uint toArgb32(uint16x4_t v)
 {
     v = vsub_u16(v, vrshr_n_u16(v, 8));
     v = vrshr_n_u16(v, 8);
@@ -195,10 +216,10 @@ Q_ALWAYS_INLINE uint toArgb32(uint16x4_t v)
 }
 #endif
 
-inline uint toArgb32(QRgba64 rgba64)
+static inline uint toArgb32(QRgba64 rgba64)
 {
 #if defined __SSE2__
-    __m128i v = _mm_loadl_epi64((const __m128i *)&rgba64);
+    __m128i v = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&rgba64));
     v = _mm_shufflelo_epi16(v, _MM_SHUFFLE(3, 0, 1, 2));
     return toArgb32(v);
 #elif defined __ARM_NEON__
@@ -215,10 +236,10 @@ inline uint toArgb32(QRgba64 rgba64)
 #endif
 }
 
-inline uint toRgba8888(QRgba64 rgba64)
+static inline uint toRgba8888(QRgba64 rgba64)
 {
 #if defined __SSE2__
-    __m128i v = _mm_loadl_epi64((const __m128i *)&rgba64);
+    __m128i v = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&rgba64));
     return toArgb32(v);
 #elif defined __ARM_NEON__
     uint16x4_t v = vreinterpret_u16_u64(vld1_u64(reinterpret_cast<const uint64_t *>(&rgba64)));
@@ -228,12 +249,12 @@ inline uint toRgba8888(QRgba64 rgba64)
 #endif
 }
 
-inline QRgba64 rgbBlend(QRgba64 d, QRgba64 s, uint rgbAlpha)
+static inline QRgba64 rgbBlend(QRgba64 d, QRgba64 s, uint rgbAlpha)
 {
     QRgba64 blend;
 #if defined(__SSE2__)
-    __m128i vd = _mm_loadl_epi64((const __m128i *)&d);
-    __m128i vs = _mm_loadl_epi64((const __m128i *)&s);
+    __m128i vd = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&d));
+    __m128i vs = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&s));
     __m128i va =  _mm_cvtsi32_si128(rgbAlpha);
     va = _mm_unpacklo_epi8(va, va);
     va = _mm_shufflelo_epi16(va, _MM_SHUFFLE(3, 0, 1, 2));
@@ -245,9 +266,9 @@ inline QRgba64 rgbBlend(QRgba64 d, QRgba64 s, uint rgbAlpha)
     vd = _mm_add_epi32(vd, _mm_srli_epi32(vd, 16));
     vd = _mm_add_epi32(vd, _mm_set1_epi32(0x8000));
     vd = _mm_srai_epi32(vd, 16);
-    vd = _mm_packs_epi32(vd, _mm_setzero_si128());
+    vd = _mm_packs_epi32(vd, vd);
 
-    _mm_storel_epi64((__m128i *)&blend, vd);
+    _mm_storel_epi64(reinterpret_cast<__m128i *>(&blend), vd);
 #elif defined(__ARM_NEON__)
     uint16x4_t vd = vreinterpret_u16_u64(vmov_n_u64(d));
     uint16x4_t vs = vreinterpret_u16_u64(vmov_n_u64(s));
@@ -274,21 +295,39 @@ inline QRgba64 rgbBlend(QRgba64 d, QRgba64 s, uint rgbAlpha)
     return blend;
 }
 
-static Q_ALWAYS_INLINE void blend_pixel(QRgba64 &dst, QRgba64 src)
+static inline void blend_pixel(QRgba64 &dst, QRgba64 src)
 {
     if (src.isOpaque())
         dst = src;
-    else if (!src.isTransparent())
+    else if (!src.isTransparent()) {
+#if defined(__SSE2__)
+        const __m128i vd = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&dst));
+        const __m128i vs = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&src));
+        const __m128i via = _mm_xor_si128(_mm_set1_epi16(-1), _mm_shufflelo_epi16(vs, _MM_SHUFFLE(3, 3, 3, 3)));
+        const __m128i vr = _mm_add_epi16(vs, multiplyAlpha65535(vd, via));
+        _mm_storel_epi64(reinterpret_cast<__m128i *>(&dst), vr);
+#else
         dst = src + multiplyAlpha65535(dst, 65535 - src.alpha());
+#endif
+    }
 }
 
-static Q_ALWAYS_INLINE void blend_pixel(QRgba64 &dst, QRgba64 src, const int const_alpha)
+static inline void blend_pixel(QRgba64 &dst, QRgba64 src, const int const_alpha)
 {
     if (const_alpha == 255)
         return blend_pixel(dst, src);
     if (!src.isTransparent()) {
+#if defined(__SSE2__)
+        const __m128i vd = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&dst));
+        __m128i vs = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&src));
+        vs = multiplyAlpha255(vs, const_alpha);
+        const __m128i via = _mm_xor_si128(_mm_set1_epi16(-1), _mm_shufflelo_epi16(vs, _MM_SHUFFLE(3, 3, 3, 3)));
+        const __m128i vr = _mm_add_epi16(vs, multiplyAlpha65535(vd, via));
+        _mm_storel_epi64(reinterpret_cast<__m128i *>(&dst), vr);
+#else
         src = multiplyAlpha255(src, const_alpha);
         dst = src + multiplyAlpha65535(dst, 65535 - src.alpha());
+#endif
     }
 }