diff options
Diffstat (limited to 'src/gui/painting/qdrawhelper_avx2.cpp')
-rw-r--r-- | src/gui/painting/qdrawhelper_avx2.cpp | 77 |
1 files changed, 22 insertions, 55 deletions
diff --git a/src/gui/painting/qdrawhelper_avx2.cpp b/src/gui/painting/qdrawhelper_avx2.cpp index c58d0804c7..34de69ecf4 100644 --- a/src/gui/painting/qdrawhelper_avx2.cpp +++ b/src/gui/painting/qdrawhelper_avx2.cpp @@ -1,42 +1,6 @@ -/**************************************************************************** -** -** Copyright (C) 2018 The Qt Company Ltd. -** Copyright (C) 2018 Intel Corporation. -** Contact: https://www.qt.io/licensing/ -** -** This file is part of the QtGui module of the Qt Toolkit. -** -** $QT_BEGIN_LICENSE:LGPL$ -** Commercial License Usage -** Licensees holding valid commercial Qt licenses may use this file in -** accordance with the commercial license agreement provided with the -** Software or, alternatively, in accordance with the terms contained in -** a written agreement between you and The Qt Company. For licensing terms -** and conditions see https://www.qt.io/terms-conditions. For further -** information use the contact form at https://www.qt.io/contact-us. -** -** GNU Lesser General Public License Usage -** Alternatively, this file may be used under the terms of the GNU Lesser -** General Public License version 3 as published by the Free Software -** Foundation and appearing in the file LICENSE.LGPL3 included in the -** packaging of this file. Please review the following information to -** ensure the GNU Lesser General Public License version 3 requirements -** will be met: https://www.gnu.org/licenses/lgpl-3.0.html. -** -** GNU General Public License Usage -** Alternatively, this file may be used under the terms of the GNU -** General Public License version 2.0 or (at your option) the GNU General -** Public license version 3 or any later version approved by the KDE Free -** Qt Foundation. The licenses are as published by the Free Software -** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3 -** included in the packaging of this file. Please review the following -** information to ensure the GNU General Public License requirements will -** be met: https://www.gnu.org/licenses/gpl-2.0.html and -** https://www.gnu.org/licenses/gpl-3.0.html. -** -** $QT_END_LICENSE$ -** -****************************************************************************/ +// Copyright (C) 2018 The Qt Company Ltd. +// Copyright (C) 2018 Intel Corporation. +// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only #include "qdrawhelper_p.h" #include "qdrawhelper_x86_p.h" @@ -359,7 +323,7 @@ void Q_DECL_VECTORCALL qt_memfillXX_avx2(uchar *dest, __m256i value256, qsizetyp void qt_memfill64_avx2(quint64 *dest, quint64 value, qsizetype count) { -#if defined(Q_CC_GNU) && !defined(Q_CC_CLANG) && !defined(Q_CC_INTEL) +#if defined(Q_CC_GNU) && !defined(Q_CC_CLANG) // work around https://gcc.gnu.org/bugzilla/show_bug.cgi?id=80820 __m128i value64 = _mm_set_epi64x(0, value); // _mm_cvtsi64_si128(value); # ifdef Q_PROCESSOR_X86_64 @@ -478,14 +442,14 @@ void QT_FASTCALL comp_func_SourceOver_rgbafp_avx2(QRgbaFloat32 *dst, const QRgba _mm256_storeu_ps((float *)(dst + x), dstVector); } if (x < length) { - __m128 srcVector = _mm_load_ps((float *)(src + x)); - __m128 dstVector = _mm_load_ps((const float *)(dst + x)); + __m128 srcVector = _mm_loadu_ps((const float *)&src[x]); + __m128 dstVector = _mm_loadu_ps((const float *)&dst[x]); srcVector = _mm_mul_ps(srcVector, constAlphaVector); __m128 alphaChannel = _mm_permute_ps(srcVector, _MM_SHUFFLE(3, 3, 3, 3)); alphaChannel = _mm_sub_ps(one, alphaChannel); dstVector = _mm_mul_ps(dstVector, alphaChannel); dstVector = _mm_add_ps(dstVector, srcVector); - _mm_store_ps((float *)(dst + x), dstVector); + _mm_storeu_ps((float *)(dst + x), dstVector); } } #endif @@ -580,12 +544,12 @@ void QT_FASTCALL comp_func_Source_rgbafp_avx2(QRgbaFloat32 *dst, const QRgbaFloa _mm256_storeu_ps((float *)&dst[x], dstVector); } if (x < length) { - __m128 srcVector = _mm_load_ps((const float *)&src[x]); - __m128 dstVector = _mm_load_ps((const float *)&dst[x]); + __m128 srcVector = _mm_loadu_ps((const float *)&src[x]); + __m128 dstVector = _mm_loadu_ps((const float *)&dst[x]); srcVector = _mm_mul_ps(srcVector, constAlphaVector); dstVector = _mm_mul_ps(dstVector, oneMinusConstAlpha); dstVector = _mm_add_ps(dstVector, srcVector); - _mm_store_ps((float *)&dst[x], dstVector); + _mm_storeu_ps((float *)&dst[x], dstVector); } } } @@ -666,7 +630,7 @@ void QT_FASTCALL comp_func_solid_Source_rgbafp_avx2(QRgbaFloat32 *dst, int lengt const float a = const_alpha / 255.0f; const __m128 alphaVector = _mm_set1_ps(a); const __m128 minusAlphaVector = _mm_set1_ps(1.0f - a); - __m128 colorVector = _mm_load_ps((const float *)&color); + __m128 colorVector = _mm_loadu_ps((const float *)&color); colorVector = _mm_mul_ps(colorVector, alphaVector); const __m256 colorVector256 = _mm256_insertf128_ps(_mm256_castps128_ps256(colorVector), colorVector, 1); const __m256 minusAlphaVector256 = _mm256_set1_ps(1.0f - a); @@ -678,10 +642,10 @@ void QT_FASTCALL comp_func_solid_Source_rgbafp_avx2(QRgbaFloat32 *dst, int lengt _mm256_storeu_ps((float *)&dst[x], dstVector); } if (x < length) { - __m128 dstVector = _mm_load_ps((const float *)&dst[x]); + __m128 dstVector = _mm_loadu_ps((const float *)&dst[x]); dstVector = _mm_mul_ps(dstVector, minusAlphaVector); dstVector = _mm_add_ps(dstVector, colorVector); - _mm_store_ps((float *)&dst[x], dstVector); + _mm_storeu_ps((float *)&dst[x], dstVector); } } } @@ -693,7 +657,7 @@ void QT_FASTCALL comp_func_solid_SourceOver_rgbafp_avx2(QRgbaFloat32 *dst, int l for (int i = 0; i < length; ++i) dst[i] = color; } else { - __m128 colorVector = _mm_load_ps((const float *)&color); + __m128 colorVector = _mm_loadu_ps((const float *)&color); if (const_alpha != 255) colorVector = _mm_mul_ps(colorVector, _mm_set1_ps(const_alpha / 255.f)); __m128 minusAlphaOfColorVector = @@ -709,10 +673,10 @@ void QT_FASTCALL comp_func_solid_SourceOver_rgbafp_avx2(QRgbaFloat32 *dst, int l _mm256_storeu_ps((float *)&dst[x], dstVector); } if (x < length) { - __m128 dstVector = _mm_load_ps((const float *)&dst[x]); + __m128 dstVector = _mm_loadu_ps((const float *)&dst[x]); dstVector = _mm_mul_ps(dstVector, minusAlphaOfColorVector); dstVector = _mm_add_ps(dstVector, colorVector); - _mm_store_ps((float *)&dst[x], dstVector); + _mm_storeu_ps((float *)&dst[x], dstVector); } } } @@ -1381,13 +1345,16 @@ const QRgba64 *QT_FASTCALL fetchRGBA64ToRGBA64PM_avx2(QRgba64 *buffer, const uch vslo = _mm256_srli_epi32(vslo, 16); vshi = _mm256_srli_epi32(vshi, 16); vs256 = _mm256_packus_epi32(vslo, vshi); + vs256 = _mm256_blend_epi16(vs256, va256, 0x88); _mm256_storeu_si256((__m256i *)(buffer + i), vs256); } for (; i < count; ++i) { + const auto a = s[i].alpha(); __m128i vs = _mm_loadl_epi64((const __m128i *)(s + i)); __m128i va = _mm_shufflelo_epi16(vs, _MM_SHUFFLE(3, 3, 3, 3)); vs = multiplyAlpha65535(vs, va); _mm_storel_epi64((__m128i *)(buffer + i), vs); + buffer[i].setAlpha(a); } return buffer; } @@ -1590,7 +1557,7 @@ const QRgbaFloat32 *QT_FASTCALL fetchRGBA16FToRGBA32F_avx2(QRgbaFloat32 *buffer, __m128 vsa = _mm_permute_ps(vsf, _MM_SHUFFLE(3, 3, 3, 3)); vsf = _mm_mul_ps(vsf, vsa); vsf = _mm_insert_ps(vsf, vsa, 0x30); - _mm_store_ps((float *)(buffer + i), vsf); + _mm_storeu_ps((float *)(buffer + i), vsf); } return buffer; } @@ -1602,7 +1569,7 @@ void QT_FASTCALL storeRGBX16FFromRGBA32F_avx2(uchar *dest, const QRgbaFloat32 *s const __m128 *s = reinterpret_cast<const __m128 *>(src); const __m128 zero = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f); for (int i = 0; i < count; ++i) { - __m128 vsf = _mm_load_ps(reinterpret_cast<const float *>(s + i)); + __m128 vsf = _mm_loadu_ps(reinterpret_cast<const float *>(s + i)); const __m128 vsa = _mm_permute_ps(vsf, _MM_SHUFFLE(3, 3, 3, 3)); const float a = _mm_cvtss_f32(vsa); if (a == 1.0f) @@ -1626,7 +1593,7 @@ void QT_FASTCALL storeRGBA16FFromRGBA32F_avx2(uchar *dest, const QRgbaFloat32 *s const __m128 *s = reinterpret_cast<const __m128 *>(src); const __m128 zero = _mm_set1_ps(0.0f); for (int i = 0; i < count; ++i) { - __m128 vsf = _mm_load_ps(reinterpret_cast<const float *>(s + i)); + __m128 vsf = _mm_loadu_ps(reinterpret_cast<const float *>(s + i)); const __m128 vsa = _mm_permute_ps(vsf, _MM_SHUFFLE(3, 3, 3, 3)); const float a = _mm_cvtss_f32(vsa); if (a == 1.0f) |