From b46cc98eca89a5adef4b03fb990c782c61ec6bfb Mon Sep 17 00:00:00 2001 From: Paul Lemire Date: Mon, 13 Mar 2017 16:14:36 +0100 Subject: Matrix4x4: simd (AVX2 and SSE) matrix classes Change-Id: I8a4d8542a12a9ec6631f76515c0da41cd1c679fe Reviewed-by: Sean Harmer --- src/core/transforms/matrix4x4_avx2.cpp | 64 ++++ src/core/transforms/matrix4x4_avx2_p.h | 574 +++++++++++++++++++++++++++++++++ src/core/transforms/matrix4x4_p.h | 82 +++++ src/core/transforms/matrix4x4_sse.cpp | 64 ++++ src/core/transforms/matrix4x4_sse_p.h | 504 +++++++++++++++++++++++++++++ src/core/transforms/transforms.pri | 23 +- src/core/transforms/vector3d_sse.cpp | 75 +++++ src/core/transforms/vector3d_sse_p.h | 16 +- src/core/transforms/vector4d_sse_p.h | 8 +- 9 files changed, 1400 insertions(+), 10 deletions(-) create mode 100644 src/core/transforms/matrix4x4_avx2.cpp create mode 100644 src/core/transforms/matrix4x4_avx2_p.h create mode 100644 src/core/transforms/matrix4x4_p.h create mode 100644 src/core/transforms/matrix4x4_sse.cpp create mode 100644 src/core/transforms/matrix4x4_sse_p.h (limited to 'src/core/transforms') diff --git a/src/core/transforms/matrix4x4_avx2.cpp b/src/core/transforms/matrix4x4_avx2.cpp new file mode 100644 index 000000000..556e778d0 --- /dev/null +++ b/src/core/transforms/matrix4x4_avx2.cpp @@ -0,0 +1,64 @@ +/**************************************************************************** +** +** Copyright (C) 2016 Paul Lemire +** Contact: https://www.qt.io/licensing/ +** +** This file is part of the Qt3D module of the Qt Toolkit. +** +** $QT_BEGIN_LICENSE:LGPL$ +** Commercial License Usage +** Licensees holding valid commercial Qt licenses may use this file in +** accordance with the commercial license agreement provided with the +** Software or, alternatively, in accordance with the terms contained in +** a written agreement between you and The Qt Company. For licensing terms +** and conditions see https://www.qt.io/terms-conditions. For further +** information use the contact form at https://www.qt.io/contact-us. +** +** GNU Lesser General Public License Usage +** Alternatively, this file may be used under the terms of the GNU Lesser +** General Public License version 3 as published by the Free Software +** Foundation and appearing in the file LICENSE.LGPL3 included in the +** packaging of this file. Please review the following information to +** ensure the GNU Lesser General Public License version 3 requirements +** will be met: https://www.gnu.org/licenses/lgpl-3.0.html. +** +** GNU General Public License Usage +** Alternatively, this file may be used under the terms of the GNU +** General Public License version 2.0 or (at your option) the GNU General +** Public license version 3 or any later version approved by the KDE Free +** Qt Foundation. The licenses are as published by the Free Software +** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3 +** included in the packaging of this file. Please review the following +** information to ensure the GNU General Public License requirements will +** be met: https://www.gnu.org/licenses/gpl-2.0.html and +** https://www.gnu.org/licenses/gpl-3.0.html. +** +** $QT_END_LICENSE$ +** +****************************************************************************/ + +#include "matrix4x4_avx2_p.h" + +#ifdef QT_COMPILER_SUPPORTS_AVX2 + +QT_BEGIN_NAMESPACE + +namespace Qt3DCore { + +QDebug operator<<(QDebug dbg, const Matrix4x4_AVX2 &m) +{ + dbg.nospace() << "Matrix4x4_AVX2(" << endl + << qSetFieldWidth(10) + << m.m11() << m.m12() << m.m13() << m.m14() << endl + << m.m21() << m.m22() << m.m23() << m.m24() << endl + << m.m31() << m.m32() << m.m33() << m.m34() << endl + << m.m41() << m.m42() << m.m43() << m.m44() << endl + << qSetFieldWidth(0) << ')'; + return dbg; +} + +} // Qt3DCore + +QT_END_NAMESPACE + +#endif diff --git a/src/core/transforms/matrix4x4_avx2_p.h b/src/core/transforms/matrix4x4_avx2_p.h new file mode 100644 index 000000000..d5f77041b --- /dev/null +++ b/src/core/transforms/matrix4x4_avx2_p.h @@ -0,0 +1,574 @@ +/**************************************************************************** +** +** Copyright (C) 2016 Paul Lemire +** Contact: https://www.qt.io/licensing/ +** +** This file is part of the Qt3D module of the Qt Toolkit. +** +** $QT_BEGIN_LICENSE:LGPL$ +** Commercial License Usage +** Licensees holding valid commercial Qt licenses may use this file in +** accordance with the commercial license agreement provided with the +** Software or, alternatively, in accordance with the terms contained in +** a written agreement between you and The Qt Company. For licensing terms +** and conditions see https://www.qt.io/terms-conditions. For further +** information use the contact form at https://www.qt.io/contact-us. +** +** GNU Lesser General Public License Usage +** Alternatively, this file may be used under the terms of the GNU Lesser +** General Public License version 3 as published by the Free Software +** Foundation and appearing in the file LICENSE.LGPL3 included in the +** packaging of this file. Please review the following information to +** ensure the GNU Lesser General Public License version 3 requirements +** will be met: https://www.gnu.org/licenses/lgpl-3.0.html. +** +** GNU General Public License Usage +** Alternatively, this file may be used under the terms of the GNU +** General Public License version 2.0 or (at your option) the GNU General +** Public license version 3 or any later version approved by the KDE Free +** Qt Foundation. The licenses are as published by the Free Software +** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3 +** included in the packaging of this file. Please review the following +** information to ensure the GNU General Public License requirements will +** be met: https://www.gnu.org/licenses/gpl-2.0.html and +** https://www.gnu.org/licenses/gpl-3.0.html. +** +** $QT_END_LICENSE$ +** +****************************************************************************/ + +#ifndef QT3DCORE_MATRIX4X4_AVX2_P_H +#define QT3DCORE_MATRIX4X4_AVX2_P_H + +// +// W A R N I N G +// ------------- +// +// This file is not part of the Qt3D API. It exists purely as an +// implementation detail. This header file may change from version to +// version without notice, or even be removed. +// +// We mean it. +// + +#include +#include +#include +#include + +#ifdef QT_COMPILER_SUPPORTS_AVX2 + +// Some GCC versions don't have _mm256_set_m128 available +// Work around that +#define _mm256_set_m128(va, vb) \ + _mm256_insertf128_ps(_mm256_castps128_ps256(vb), va, 1) + +QT_BEGIN_NAMESPACE + +namespace Qt3DCore { + +class Matrix4x4_AVX2 +{ +public: + + Q_ALWAYS_INLINE Matrix4x4_AVX2() { setToIdentity(); } + explicit Q_ALWAYS_INLINE Matrix4x4_AVX2(Qt::Initialization) {} + + // Assumes data is 32 bytes aligned (and in column major order) + explicit Q_ALWAYS_INLINE Matrix4x4_AVX2(float *data) + { + m_col12 = _mm256_load_ps(data); + m_col34 = _mm256_load_ps(data + 8); + } + + // QMatrix4x4::constData returns in column major order + explicit Q_ALWAYS_INLINE Matrix4x4_AVX2(const QMatrix4x4 &mat) + { + // data may not be properly aligned, using unaligned loads + const float *data = mat.constData(); + m_col12 = _mm256_loadu_ps(data); + m_col34 = _mm256_loadu_ps(data + 8); + } + + // In (row major) but we store in column major order + explicit Q_ALWAYS_INLINE Matrix4x4_AVX2(float m11, float m12, float m13, float m14, + float m21, float m22, float m23, float m24, + float m31, float m32, float m33, float m34, + float m41, float m42, float m43, float m44) + { + m_col12 = _mm256_set_ps(m42, m32, m22, m12, m41, m31, m21, m11); + m_col34 = _mm256_set_ps(m44, m34, m24, m14, m43, m33, m23, m13); + } + + Q_ALWAYS_INLINE void setToIdentity() + { + // 23 instructions + m_col12 = _mm256_set_ps(0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f); + m_col34 = _mm256_set_ps(1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f); + + // 23 instructions + // 1, 0, 0, 0 + // __m128 vec = _mm_set_ss(1.0f); + // // 0, 1, 0, 0 + // // 0b01010001 == 0x51 + // __m128 tmp = _mm_permute_ps(vec, 0x51); + + // // 1, 0, 0, 0, 0, 1, 0, 0 + // m_col12 = _mm256_set_m128(tmp, vec); + + // // 0, 0, 1, 0 + // // 0b01000101 == 0x45 + // tmp = _mm_permute_ps(vec, 0x45); + + // // 0, 0, 0, 1 + // // 0b00010101 == 0x15 + // vec = _mm_permute_ps(vec, 0x15); + + // // 0, 0, 1, 0, 0, 0, 0, 1 + // m_col34 = _mm256_set_m128(vec, tmp); + + // Using a static identity matrix and assigning it is 27 instructions + } + + Q_ALWAYS_INLINE Matrix4x4_AVX2 operator*(const Matrix4x4_AVX2 &other) const + { + // Shuffling: (Latency 1) + // (8 bits -> first two pairs used to select from the first vector, second pairs from second vector) + + // 00 01 10 11, 00 01 10 11 + // v1 = m11 m12 m13 m14 m21 m22 m23 m24 + // v2 = m11 m12 m13 m14 m21 m22 m23 m24 + + // shuffled with 00 00 00 00 + // v1[0] v1[0] v2[0] v2[0] v1[0] v1[0] v2[0] v2[0] + // -> m11 m11 m11 m11 m21 m21 m21 m21 + + // Broadcasting: (Latency 1) + // -> n11 n12 n13 n14 broadcasted + // n11 n12 n13 n14 n11 n12 n13 n14 + + // Multiplying (Latency 5): + // m11 m11 m11 m11 m21 m21 m21 m21 * n11 n12 n13 n14 n11 n12 n13 n14 + + // -> m11 n11, m11 n12, m11 n13, m11 n14 + // m21 n11, m21 n12, m21 n13, m21 n14 + + // 00 01 10 11, 00 01 10 11 + // v1 = m11 m12 m13 m14 m21 m22 m23 m24 + // v2 = m11 m12 m13 m14 m21 m22 m23 m24 + const __m256 otherCol12 = other.m_col12; + const __m256 otherCol34 = other.m_col34; + const __m128 col1 = _mm256_extractf128_ps(m_col12, 0); + const __m128 col2 = _mm256_extractf128_ps(m_col12, 1); + const __m128 col3 = _mm256_extractf128_ps(m_col34, 0); + const __m128 col4 = _mm256_extractf128_ps(m_col34, 1); + + // const __m256 col12 = _mm256_load_ps(m); + // const __m256 col34 = _mm256_load_ps(m + 8); + // const __m128 otherCol1 = _mm_load_ps(other.m); + // const __m128 otherCol2 = _mm_load_ps(other.m + 4); + // const __m128 otherCol3 = _mm_load_ps(other.m + 8); + // const __m128 otherCol4 = _mm_load_ps(other.m + 12); + + __m256 tmp = _mm256_mul_ps(_mm256_shuffle_ps(otherCol12, otherCol12, 0x00), _mm256_broadcast_ps(&col1)); + + // shuffled with 01 01 01 01 + // v1[1] v1[1] v2[1] v2[1] v1[1] v1[1] v2[1] v2[1] + // -> m12 m12 m12 m12 m22 m22 m22 m22 + + + // 00 01 10 11, 00 01 10 11 + // m11 m12 m13 m14, m21 m22 m23 m24 shuffled with 01 01 01 01 + // -> m12 m12 m12 m12 m22 m22 m22 m22 x n21 n22 n23 n24 n21 n22 n23 n24 + + // -> m12 n21, m12 n22, m12 n23, m12 n24, + // -> m22 n21, m22 n22, m22 n23, m22 n24 + tmp = _mm256_add_ps(_mm256_mul_ps(_mm256_shuffle_ps(otherCol12, otherCol12, 0x55), _mm256_broadcast_ps(&col2)), tmp); + + // m11 m12 m13 m14 m11 m12 m13 m14 shuffled with 10 10 10 10 + // m13 m13 m13 m13, m23 m23 m23 m23 + + // Multiplying with other.col3 + // -> m13 n31, m13 n32, m13 n33, m13 n34 + // -> m23 n31, m23 n32, m23 n33, m23 n34 + tmp = _mm256_add_ps(_mm256_mul_ps(_mm256_shuffle_ps(otherCol12, otherCol12, 0xaa), _mm256_broadcast_ps(&col3)), tmp); + + // m11 m12 m13 m14 m11 m12 m13 m14 shuffled with 11 11 11 11 + // m14 m14 m14 m14 m24 m24 m24 m24 + + // -> m14 n41, m14 n42, m14 n43, m14 n44 + // -> m24 n41, m24 n42, m24 n43, m24 n44 + tmp = _mm256_add_ps(_mm256_mul_ps(_mm256_shuffle_ps(otherCol12, otherCol12, 0xff), _mm256_broadcast_ps(&col4)), tmp); + + // Which finally gives + // c11 -> m11 n11 + m12 n21 + m13 n31 + m14 n41, + // c12 -> m11 n12 + m12 n22 + m13 n32 + m14 n42 + // c13 -> m11 n13 + m12 n23 + m13 n33 + m14 n43 + // c14 -> m11 n14 + m12 n24 + m13 n34 + m14 n44 + + // c21 -> m21 n11 + m22 n21 + m23 n31 + m24 n41, + // c12 -> m21 n12 + m22 n22 + m23 n32 + m24 n42 + // c13 -> m21 n13 + m22 n23 + m23 n33 + m24 n43 + // c14 -> m21 n14 + m22 n24 + m23 n34 + m24 n44 + + __m256 tmp2 = _mm256_mul_ps(_mm256_shuffle_ps(otherCol34, otherCol34, 0x00), _mm256_broadcast_ps(&col1)); + tmp2 = _mm256_add_ps(_mm256_mul_ps(_mm256_shuffle_ps(otherCol34, otherCol34, 0x55), _mm256_broadcast_ps(&col2)), tmp2); + tmp2 = _mm256_add_ps(_mm256_mul_ps(_mm256_shuffle_ps(otherCol34, otherCol34, 0xaa), _mm256_broadcast_ps(&col3)), tmp2); + tmp2 = _mm256_add_ps(_mm256_mul_ps(_mm256_shuffle_ps(otherCol34, otherCol34, 0xff), _mm256_broadcast_ps(&col4)), tmp2); + + Matrix4x4_AVX2 c(Qt::Uninitialized); + c.m_col12 = tmp; + c.m_col34 = tmp2; + return c; + } + + Q_ALWAYS_INLINE Matrix4x4_AVX2 operator-(const Matrix4x4_AVX2 &other) const + { + Matrix4x4_AVX2 c(Qt::Uninitialized); + + c.m_col12 = _mm256_sub_ps(m_col12, other.m_col12); + c.m_col34 = _mm256_sub_ps(m_col34, other.m_col34); + return c; + } + + Q_ALWAYS_INLINE Matrix4x4_AVX2 operator+(const Matrix4x4_AVX2 &other) const + { + Matrix4x4_AVX2 c(Qt::Uninitialized); + + c.m_col12 = _mm256_add_ps(m_col12, other.m_col12); + c.m_col34 = _mm256_add_ps(m_col34, other.m_col34); + return c; + } + + Q_ALWAYS_INLINE Matrix4x4_AVX2 &operator*=(const Matrix4x4_AVX2 &other) + { + *this = *this * other; + return *this; + } + + Q_ALWAYS_INLINE Matrix4x4_AVX2 &operator-=(const Matrix4x4_AVX2 &other) + { + *this = *this - other; + return *this; + } + + Q_ALWAYS_INLINE Matrix4x4_AVX2 &operator+=(const Matrix4x4_AVX2 &other) + { + *this = *this + other; + return *this; + } + + Q_ALWAYS_INLINE Matrix4x4_AVX2 transposed() const + { + Matrix4x4_AVX2 c(Qt::Uninitialized); + const __m128 col1 = _mm256_extractf128_ps(m_col12, 0); + const __m128 col2 = _mm256_extractf128_ps(m_col12, 1); + const __m128 col3 = _mm256_extractf128_ps(m_col34, 0); + const __m128 col4 = _mm256_extractf128_ps(m_col34, 1); + + // ~117 instructions + // Matrix4x4_AVX2 c = *this; + // _MM_TRANSPOSE4_PS(c.m_col1, c.m_col2, c.m_col3, c.m_col4); + + // ~131 instructions - AVX2 + // const __m256i indexes = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0); + // c.m_col12 = _mm256_permutevar8x32_ps(_mm256_unpacklo_ps(m_col12, m_col34), indexes); + // c.m_col34 = _mm256_permutevar8x32_ps(_mm256_unpackhi_ps(m_col12, m_col34), indexes); + + // ~193 instructions + // c.m_col12 = _mm256_setr_ps(m_m11, m_m21, m_m31, m_m41, m_m12, m_m22, m_m32, m_m42); + // c.m_col34 = _mm256_setr_ps(m_m13, m_m23, m_m33, m_m43, m_m14, m_m24, m_m34, m_m44); + + // ~113 instructions + // union { + // struct + // { + // __m256 twin; + // }; + // struct + // { + // __m128 col1; + // __m128 col2; + // }; + // } u; + + // u.twin = _mm256_shuffle_ps(m_col12, m_col34, 0b01000100); + // c.m_col1 = _mm_permute_ps(_mm_shuffle_ps(u.col1, u.col2, 0b10001000), 0b11011000); + // c.m_col2 = _mm_permute_ps(_mm_shuffle_ps(u.col1, u.col2, 0b11011101), 0b11011000); + + // u.twin = _mm256_shuffle_ps(m_col12, m_col34, 0b11101110); + // c.m_col3 = _mm_permute_ps(_mm_shuffle_ps(u.col1, u.col2, 0b10001000), 0b11011000); + // c.m_col4 = _mm_permute_ps(_mm_shuffle_ps(u.col1, u.col2, 0b11011101), 0b11011000); + + // ~113 instructions + // 0b11011101 == 0xdd + // 0b10001000 == 0x88 + const __m128 tmp1 = _mm_shuffle_ps(col1, col2, 0xdd); + const __m128 tmp2 = _mm_shuffle_ps(col1, col2, 0x88); + const __m128 tmp3 = _mm_shuffle_ps(col3, col4, 0xdd); + const __m128 tmp4 = _mm_shuffle_ps(col3, col4, 0x88); + c.m_col12 = _mm256_set_m128(_mm_shuffle_ps(tmp1, tmp3, 0x88), _mm_shuffle_ps(tmp2, tmp4, 0x88)); + c.m_col34 = _mm256_set_m128(_mm_shuffle_ps(tmp1, tmp3, 0xdd), _mm_shuffle_ps(tmp2, tmp4, 0xdd)); + + return c; + } + + Q_ALWAYS_INLINE Matrix4x4_AVX2 inverted() const + { + // TO DO: Optimize + const QMatrix4x4 mat = toQMatrix4x4(); + return Matrix4x4_AVX2(mat.inverted()); + } + + Q_ALWAYS_INLINE bool operator==(const Matrix4x4_AVX2 &other) const + { + // cmp returns (-1, -1, -1, -1, -1, -1, -1, -1) if the two m256 are equals + // movemask takes the most significant bits (8x 1 in this case) which equals 0xff + return (_mm256_movemask_ps(_mm256_cmp_ps(m_col12, other.m_col12, _CMP_EQ_OQ)) == 0xff && + _mm256_movemask_ps(_mm256_cmp_ps(m_col34, other.m_col34, _CMP_EQ_OQ)) == 0xff); + + } + + Q_ALWAYS_INLINE bool operator!=(const Matrix4x4_AVX2 &other) const + { + return !(*this == other); + } + + // For some reason _mm256_cvtss_f32 doesn't seem to be defined + Q_ALWAYS_INLINE float m11() const { return _mm_cvtss_f32(_mm256_extractf128_ps(m_col12, 0)); } + Q_ALWAYS_INLINE float m12() const { return _mm_cvtss_f32(_mm256_extractf128_ps(m_col12, 1)); } + Q_ALWAYS_INLINE float m13() const { return _mm_cvtss_f32(_mm256_extractf128_ps(m_col34, 0)); } + Q_ALWAYS_INLINE float m14() const { return _mm_cvtss_f32(_mm256_extractf128_ps(m_col34, 1)); } + + Q_ALWAYS_INLINE float m21() const + { + // 0b01010101 = 0x55 + const __m128 v = _mm256_extractf128_ps(m_col12, 0); + return _mm_cvtss_f32(_mm_shuffle_ps(v, v, 0x55)); + } + Q_ALWAYS_INLINE float m22() const + { + // 0b01010101 = 0x55 + const __m128 v = _mm256_extractf128_ps(m_col12, 1); + return _mm_cvtss_f32(_mm_shuffle_ps(v, v, 0x55)); + } + Q_ALWAYS_INLINE float m23() const + { + // 0b01010101 = 0x55 + const __m128 v = _mm256_extractf128_ps(m_col34, 0); + return _mm_cvtss_f32(_mm_shuffle_ps(v, v, 0x55)); + } + Q_ALWAYS_INLINE float m24() const + { + // 0b01010101 = 0x55 + const __m128 v = _mm256_extractf128_ps(m_col34, 1); + return _mm_cvtss_f32(_mm_shuffle_ps(v, v, 0x55)); + } + + Q_ALWAYS_INLINE float m31() const + { + // 0b10101010 = 0xaa + const __m128 v = _mm256_extractf128_ps(m_col12, 0); + return _mm_cvtss_f32(_mm_shuffle_ps(v, v, 0xaa)); + } + Q_ALWAYS_INLINE float m32() const + { + // 0b10101010 = 0xaa + const __m128 v = _mm256_extractf128_ps(m_col12, 1); + return _mm_cvtss_f32(_mm_shuffle_ps(v, v, 0xaa)); + } + Q_ALWAYS_INLINE float m33() const + { + // 0b10101010 = 0xaa + const __m128 v = _mm256_extractf128_ps(m_col34, 0); + return _mm_cvtss_f32(_mm_shuffle_ps(v, v, 0xaa)); + } + Q_ALWAYS_INLINE float m34() const + { + // 0b10101010 = 0xaa + const __m128 v = _mm256_extractf128_ps(m_col34, 1); + return _mm_cvtss_f32(_mm_shuffle_ps(v, v, 0xaa)); + } + + Q_ALWAYS_INLINE float m41() const + { + // 0b11111111 = 0xff + const __m128 v = _mm256_extractf128_ps(m_col12, 0); + return _mm_cvtss_f32(_mm_shuffle_ps(v, v, 0xff)); + } + Q_ALWAYS_INLINE float m42() const + { + // 0b11111111 = 0xff + const __m128 v = _mm256_extractf128_ps(m_col12, 1); + return _mm_cvtss_f32(_mm_shuffle_ps(v, v, 0xff)); + } + Q_ALWAYS_INLINE float m43() const + { + // 0b11111111 = 0xff + const __m128 v = _mm256_extractf128_ps(m_col34, 0); + return _mm_cvtss_f32(_mm_shuffle_ps(v, v, 0xff)); + } + Q_ALWAYS_INLINE float m44() const + { + // 0b11111111 = 0xff + const __m128 v = _mm256_extractf128_ps(m_col34, 1); + return _mm_cvtss_f32(_mm_shuffle_ps(v, v, 0xff)); + } + Q_ALWAYS_INLINE QMatrix4x4 toQMatrix4x4() const { return QMatrix4x4(m11(), m12(), m13(), m14(), + m21(), m22(), m23(), m24(), + m31(), m32(), m33(), m34(), + m41(), m42(), m43(), m44()); } + + Q_ALWAYS_INLINE Vector4D row(int index) const + { + switch (index) { + case 0: + return Vector4D(m11(), m12(), m13(), m14()); + case 1: + return Vector4D(m21(), m22(), m23(), m24()); + case 2: + return Vector4D(m31(), m32(), m33(), m34()); + case 3: + return Vector4D(m41(), m42(), m43(), m44()); + default: + Q_UNREACHABLE(); + return Vector4D(); + } + } + + Q_ALWAYS_INLINE Vector4D column(int index) const + { + Vector4D c(Qt::Uninitialized); + switch (index) { + case 0: + c.m_xyzw = _mm256_extractf128_ps(m_col12, 0); + break; + case 1: + c.m_xyzw = _mm256_extractf128_ps(m_col12, 1); + break; + case 2: + c.m_xyzw = _mm256_extractf128_ps(m_col34, 0); + break; + case 3: + c.m_xyzw = _mm256_extractf128_ps(m_col34, 1); + break; + default: + Q_UNREACHABLE(); + return Vector4D(); + } + return c; + } + + Q_ALWAYS_INLINE Vector3D_SSE map(const Vector3D_SSE &point) const + { + return *this * point; + } + + Q_ALWAYS_INLINE Vector4D_SSE map(const Vector4D_SSE &point) const + { + return *this * point; + } + + Vector3D_SSE mapVector(const Vector3D_SSE &vector) const + { + const __m128 row1 = _mm_set_ps(0.0f, m13(), m12(), m11()); + const __m128 row2 = _mm_set_ps(0.0f, m23(), m22(), m21()); + const __m128 row3 = _mm_set_ps(0.0f, m33(), m32(), m31()); + + const __m128 tmp = _mm_add_ps(_mm_mul_ps(vector.m_xyzw, row1), _mm_mul_ps(vector.m_xyzw, row2)); + + Vector3D_SSE v(Qt::Uninitialized); + v.m_xyzw = _mm_add_ps(tmp, _mm_mul_ps(vector.m_xyzw, row3)); + return v; + } + + friend Vector4D operator*(const Vector4D &vector, const Matrix4x4_AVX2 &matrix); + friend Vector4D operator*(const Matrix4x4_AVX2 &matrix, const Vector4D &vector); + + friend Vector3D operator*(const Vector3D &vector, const Matrix4x4_AVX2 &matrix); + friend Vector3D operator*(const Matrix4x4_AVX2 &matrix, const Vector3D &vector); + + friend QT3DCORE_PRIVATE_EXPORT QDebug operator<<(QDebug dbg, const Matrix4x4_AVX2 &m); +private: + // column major order + // aligned on 32 bytes boundaries for AVX, compatible with 16 bytes boundary for SSE + // union Q_DECL_ALIGN(32) + // { + // float m[16]; + // struct + // { + // float m_m11, m_m21, m_m31, m_m41; + // float m_m12, m_m22, m_m32, m_m42; + // float m_m13, m_m23, m_m33, m_m43; + // float m_m14, m_m24, m_m34, m_m44; + // }; + // }; + __m256 m_col12; + __m256 m_col34; +}; + +Q_ALWAYS_INLINE Vector4D operator*(const Vector4D &vector, const Matrix4x4_AVX2 &matrix) +{ + const __m256 vecMultiplier = _mm256_broadcast_ps(&vector.m_xyzw); + // a1 a2 a3 a4 b1 b2 b3 b4, c1 c2 c3 c4 d1 d2 d3 d4 + // a1 + a2, a3 + a4, c1 + c2, c3 + c4 + // b1 + b2, b3 + b3, d1 + d2, d3 + d4 + + const __m256 partialSum = _mm256_hadd_ps(_mm256_mul_ps(matrix.m_col12, vecMultiplier), + _mm256_mul_ps(matrix.m_col34, vecMultiplier)); + + Vector4D v(Qt::Uninitialized); + // a12 + a34, b12 + b34, c12 + c34, d12 + d34 + // _mm256_permute4x64_pd is AVX2 + // 0b11011000 == 0xd8 + const __m256 shuffledSum = _mm256_castpd_ps(_mm256_permute4x64_pd(_mm256_castps_pd(partialSum), 0xd8)); + v.m_xyzw = _mm_hadd_ps(_mm256_extractf128_ps(shuffledSum, 0), _mm256_extractf128_ps(shuffledSum, 1)); + return v; +} + +Q_ALWAYS_INLINE Vector4D operator*(const Matrix4x4_AVX2 &matrix, const Vector4D &vector) +{ + const Matrix4x4_AVX2 transposed = matrix.transposed(); + return vector * transposed; +} + +Q_ALWAYS_INLINE Vector3D operator*(const Vector3D &vector, const Matrix4x4_AVX2 &matrix) +{ + const __m128 vec4 = _mm_set_ps(1.0f, vector.z(), vector.y(), vector.x()); + const __m256 vecMultiplier = _mm256_broadcast_ps(&vec4); + // a1 a2 a3 a4 b1 b2 b3 b4, c1 c2 c3 c4 d1 d2 d3 d4 + // a1 + a2, a3 + a4, c1 + c2, c3 + c4 + // b1 + b2, b3 + b3, d1 + d2, d3 + d4 + const __m256 partialSum = _mm256_hadd_ps(_mm256_mul_ps(matrix.m_col12, vecMultiplier), + _mm256_mul_ps(matrix.m_col34, vecMultiplier)); + + // _mm256_permute4x64_pd is AVX2 + // 0b11011000 == 0xd8 + const __m256 shuffledSum = _mm256_castpd_ps(_mm256_permute4x64_pd(_mm256_castps_pd(partialSum), 0xd8)); + // a12 + a34, b12 + b34, c12 + c34, d12 + d34 + const __m128 result = _mm_hadd_ps(_mm256_extractf128_ps(shuffledSum, 0), _mm256_extractf128_ps(shuffledSum, 1)); + // 0b11111111 = 0xff + const __m128 divisor = _mm_shuffle_ps(result, result, 0xff); + + Vector3D v(Qt::Uninitialized); + v.m_xyzw = _mm_div_ps(result, divisor);; + return v; +} + +Q_ALWAYS_INLINE Vector3D operator*(const Matrix4x4_AVX2 &matrix, const Vector3D &vector) +{ + const Matrix4x4_AVX2 transposed = matrix.transposed(); + return vector * transposed; +} + +} // Qt3DCore + +Q_DECLARE_TYPEINFO(Qt3DCore::Matrix4x4_AVX2, Q_PRIMITIVE_TYPE); + +QT_END_NAMESPACE + +Q_DECLARE_METATYPE(Qt3DCore::Matrix4x4_AVX2) + +#endif // QT_COMPILER_SUPPORTS_AVX + +#endif // QT3DCORE_MATRIX4X4_AVX2_P_H diff --git a/src/core/transforms/matrix4x4_p.h b/src/core/transforms/matrix4x4_p.h new file mode 100644 index 000000000..1aded0f9c --- /dev/null +++ b/src/core/transforms/matrix4x4_p.h @@ -0,0 +1,82 @@ +/**************************************************************************** +** +** Copyright (C) 2016 Paul Lemire +** Contact: https://www.qt.io/licensing/ +** +** This file is part of the Qt3D module of the Qt Toolkit. +** +** $QT_BEGIN_LICENSE:LGPL$ +** Commercial License Usage +** Licensees holding valid commercial Qt licenses may use this file in +** accordance with the commercial license agreement provided with the +** Software or, alternatively, in accordance with the terms contained in +** a written agreement between you and The Qt Company. For licensing terms +** and conditions see https://www.qt.io/terms-conditions. For further +** information use the contact form at https://www.qt.io/contact-us. +** +** GNU Lesser General Public License Usage +** Alternatively, this file may be used under the terms of the GNU Lesser +** General Public License version 3 as published by the Free Software +** Foundation and appearing in the file LICENSE.LGPL3 included in the +** packaging of this file. Please review the following information to +** ensure the GNU Lesser General Public License version 3 requirements +** will be met: https://www.gnu.org/licenses/lgpl-3.0.html. +** +** GNU General Public License Usage +** Alternatively, this file may be used under the terms of the GNU +** General Public License version 2.0 or (at your option) the GNU General +** Public license version 3 or any later version approved by the KDE Free +** Qt Foundation. The licenses are as published by the Free Software +** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3 +** included in the packaging of this file. Please review the following +** information to ensure the GNU General Public License requirements will +** be met: https://www.gnu.org/licenses/gpl-2.0.html and +** https://www.gnu.org/licenses/gpl-3.0.html. +** +** $QT_END_LICENSE$ +** +****************************************************************************/ + +#ifndef QT3DCORE_MATRIX4X4_P_H +#define QT3DCORE_MATRIX4X4_P_H + +// +// W A R N I N G +// ------------- +// +// This file is not part of the Qt3D API. It exists purely as an +// implementation detail. This header file may change from version to +// version without notice, or even be removed. +// +// We mean it. +// + +#include + +#if defined(__AVX2__) && defined(QT_COMPILER_SUPPORTS_AVX2) + +#include + +QT_BEGIN_NAMESPACE +using Matrix4x4 = Qt3DCore::Matrix4x4_AVX2; +QT_END_NAMESPACE + +#elif defined(__SSE2__) && defined(QT_COMPILER_SUPPORTS_SSE2) + +#include + +QT_BEGIN_NAMESPACE +using Matrix4x4 = Qt3DCore::Matrix4x4_SSE; +QT_END_NAMESPACE + +#else + +#include + +QT_BEGIN_NAMESPACE +using Matrix4x4 = QMatrix4x4; +QT_END_NAMESPACE + +#endif + +#endif // QT3DCORE_MATRIX4X4_P_H diff --git a/src/core/transforms/matrix4x4_sse.cpp b/src/core/transforms/matrix4x4_sse.cpp new file mode 100644 index 000000000..d35cc2e35 --- /dev/null +++ b/src/core/transforms/matrix4x4_sse.cpp @@ -0,0 +1,64 @@ +/**************************************************************************** +** +** Copyright (C) 2017 Klaralvdalens Datakonsult AB (KDAB). +** Contact: https://www.qt.io/licensing/ +** +** This file is part of the Qt3D module of the Qt Toolkit. +** +** $QT_BEGIN_LICENSE:LGPL$ +** Commercial License Usage +** Licensees holding valid commercial Qt licenses may use this file in +** accordance with the commercial license agreement provided with the +** Software or, alternatively, in accordance with the terms contained in +** a written agreement between you and The Qt Company. For licensing terms +** and conditions see https://www.qt.io/terms-conditions. For further +** information use the contact form at https://www.qt.io/contact-us. +** +** GNU Lesser General Public License Usage +** Alternatively, this file may be used under the terms of the GNU Lesser +** General Public License version 3 as published by the Free Software +** Foundation and appearing in the file LICENSE.LGPL3 included in the +** packaging of this file. Please review the following information to +** ensure the GNU Lesser General Public License version 3 requirements +** will be met: https://www.gnu.org/licenses/lgpl-3.0.html. +** +** GNU General Public License Usage +** Alternatively, this file may be used under the terms of the GNU +** General Public License version 2.0 or (at your option) the GNU General +** Public license version 3 or any later version approved by the KDE Free +** Qt Foundation. The licenses are as published by the Free Software +** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3 +** included in the packaging of this file. Please review the following +** information to ensure the GNU General Public License requirements will +** be met: https://www.gnu.org/licenses/gpl-2.0.html and +** https://www.gnu.org/licenses/gpl-3.0.html. +** +** $QT_END_LICENSE$ +** +****************************************************************************/ + +#include "matrix4x4_sse_p.h" + +#ifdef QT_COMPILER_SUPPORTS_SSE2 + +QT_BEGIN_NAMESPACE + +namespace Qt3DCore { + +QDebug operator<<(QDebug dbg, const Matrix4x4_SSE &m) +{ + dbg.nospace() << "Matrix4x4_SSE(" << endl + << qSetFieldWidth(10) + << m.m11() << m.m12() << m.m13() << m.m14() << endl + << m.m21() << m.m22() << m.m23() << m.m24() << endl + << m.m31() << m.m32() << m.m33() << m.m34() << endl + << m.m41() << m.m42() << m.m43() << m.m44() << endl + << qSetFieldWidth(0) << ')'; + return dbg; +} + +} // Qt3DCore + +QT_END_NAMESPACE + +#endif // QT_COMPILER_SUPPORTS_SSE2 diff --git a/src/core/transforms/matrix4x4_sse_p.h b/src/core/transforms/matrix4x4_sse_p.h new file mode 100644 index 000000000..be314ca4d --- /dev/null +++ b/src/core/transforms/matrix4x4_sse_p.h @@ -0,0 +1,504 @@ +/**************************************************************************** +** +** Copyright (C) 2016 Paul Lemire +** Contact: https://www.qt.io/licensing/ +** +** This file is part of the Qt3D module of the Qt Toolkit. +** +** $QT_BEGIN_LICENSE:LGPL$ +** Commercial License Usage +** Licensees holding valid commercial Qt licenses may use this file in +** accordance with the commercial license agreement provided with the +** Software or, alternatively, in accordance with the terms contained in +** a written agreement between you and The Qt Company. For licensing terms +** and conditions see https://www.qt.io/terms-conditions. For further +** information use the contact form at https://www.qt.io/contact-us. +** +** GNU Lesser General Public License Usage +** Alternatively, this file may be used under the terms of the GNU Lesser +** General Public License version 3 as published by the Free Software +** Foundation and appearing in the file LICENSE.LGPL3 included in the +** packaging of this file. Please review the following information to +** ensure the GNU Lesser General Public License version 3 requirements +** will be met: https://www.gnu.org/licenses/lgpl-3.0.html. +** +** GNU General Public License Usage +** Alternatively, this file may be used under the terms of the GNU +** General Public License version 2.0 or (at your option) the GNU General +** Public license version 3 or any later version approved by the KDE Free +** Qt Foundation. The licenses are as published by the Free Software +** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3 +** included in the packaging of this file. Please review the following +** information to ensure the GNU General Public License requirements will +** be met: https://www.gnu.org/licenses/gpl-2.0.html and +** https://www.gnu.org/licenses/gpl-3.0.html. +** +** $QT_END_LICENSE$ +** +****************************************************************************/ + +#ifndef QT3DCORE_MATRIX4X4_SSE_P_H +#define QT3DCORE_MATRIX4X4_SSE_P_H + +// +// W A R N I N G +// ------------- +// +// This file is not part of the Qt3D API. It exists purely as an +// implementation detail. This header file may change from version to +// version without notice, or even be removed. +// +// We mean it. +// + +#include +#include +#include +#include + +#ifdef QT_COMPILER_SUPPORTS_SSE2 + +QT_BEGIN_NAMESPACE + +namespace Qt3DCore { + +class Matrix4x4_SSE +{ +public: + + Q_ALWAYS_INLINE Matrix4x4_SSE() { setToIdentity(); } + explicit Q_ALWAYS_INLINE Matrix4x4_SSE(Qt::Initialization) {} + + // QMatrix4x4::constData returns in column major order + explicit Q_ALWAYS_INLINE Matrix4x4_SSE(const QMatrix4x4 &mat) + { + // data may not be properly aligned, using unaligned loads + const float *data = mat.constData(); + m_col1 = _mm_loadu_ps(data); + m_col2 = _mm_loadu_ps(data + 4); + m_col3 = _mm_loadu_ps(data + 8); + m_col4 = _mm_loadu_ps(data + 12); + } + + // Assumes data is 16 bytes aligned (and in column major order) + explicit Q_ALWAYS_INLINE Matrix4x4_SSE(float *data) + { + m_col1 = _mm_load_ps(data); + m_col2 = _mm_load_ps(data + 4); + m_col3 = _mm_load_ps(data + 8); + m_col4 = _mm_load_ps(data + 12); + } + + // In (row major) but we store in column major order + explicit Q_ALWAYS_INLINE Matrix4x4_SSE(float m11, float m12, float m13, float m14, + float m21, float m22, float m23, float m24, + float m31, float m32, float m33, float m34, + float m41, float m42, float m43, float m44) + { + m_col1 = _mm_set_ps(m41, m31, m21, m11); + m_col2 = _mm_set_ps(m42, m32, m22, m12); + m_col3 = _mm_set_ps(m43, m33, m23, m13); + m_col4 = _mm_set_ps(m44, m34, m24, m14); + } + + Q_ALWAYS_INLINE void setToIdentity() + { + m_col1 = _mm_set_ss(1.0f); + m_col2 = _mm_set_ps(0.0f, 0.0f, 1.0f, 0.0f); + m_col3 = _mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f); + m_col4 = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f); + } + + Q_ALWAYS_INLINE Matrix4x4_SSE operator*(const Matrix4x4_SSE &other) const + { + Matrix4x4_SSE c(Qt::Uninitialized); + + const __m128 c1 = m_col1; + const __m128 c2 = m_col2; + const __m128 c3 = m_col3; + const __m128 c4 = m_col4; + + // c11, c21, c31, c41 + // 1) (m11 x n11), (m11 x n21), (m11 x n31), (m11 x n41) + // 2) (m11 x n11) + (m21 x n12), (m11 x n21) + (m21 x n22), (m11 x n31) + (m21 x n32), (m11 x n41) + (m21 x n42) + // 3) (m11 x n11) + (m21 x n21) + (m31 x n13), (m11 x n21) + (m21 x n22) + (m31 x n 23), (m11 x n31) + (m21 x n32) + (m31 x n33), (m11 x n41) + (m21 x n42) (m31 x n43) + // 4) (m11 x n11) + (m21 x n21) + (m31 x n13) + (m41 x n14), (m11 x n21) + (m21 x n22) + (m31 x n 23) + (m41 x n24), (m11 x n31) + (m21 x n32) + (m31 x n33) + (m41 x n34), (m11 x n41) + (m21 x n42) (m31 x n43) + (m41 x n44) + __m128 tmp = _mm_mul_ps(_mm_set1_ps(other.m11()), c1); + tmp = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m21()), c2), tmp); + tmp = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m31()), c3), tmp); + c.m_col1 = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m41()), c4), tmp); + + // c21, c22, c23, c24 + tmp = _mm_mul_ps(_mm_set1_ps(other.m12()), c1); + tmp = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m22()), c2), tmp); + tmp = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m32()), c3), tmp); + c.m_col2 = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m42()), c4), tmp); + + // c31, c32, c33, c34 + tmp = _mm_mul_ps(_mm_set1_ps(other.m13()), c1); + tmp = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m23()), c2), tmp); + tmp = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m33()), c3), tmp); + c.m_col3 = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m43()), c4), tmp); + + // c41, c42, c43, c44 + tmp = _mm_mul_ps(_mm_set1_ps(other.m14()), c1); + tmp = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m24()), c2), tmp); + tmp = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m34()), c3), tmp); + c.m_col4 = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m44()), c4), tmp); + + return c; + } + + Q_ALWAYS_INLINE Matrix4x4_SSE operator-(const Matrix4x4_SSE &other) const + { + Matrix4x4_SSE c(Qt::Uninitialized); + + c.m_col1 = _mm_sub_ps(m_col1, other.m_col1); + c.m_col2 = _mm_sub_ps(m_col2, other.m_col2); + c.m_col3 = _mm_sub_ps(m_col3, other.m_col3); + c.m_col4 = _mm_sub_ps(m_col4, other.m_col4); + + return c; + } + + Q_ALWAYS_INLINE Matrix4x4_SSE operator+(const Matrix4x4_SSE &other) const + { + Matrix4x4_SSE c(Qt::Uninitialized); + + c.m_col1 = _mm_add_ps(m_col1, other.m_col1); + c.m_col2 = _mm_add_ps(m_col2, other.m_col2); + c.m_col3 = _mm_add_ps(m_col3, other.m_col3); + c.m_col4 = _mm_add_ps(m_col4, other.m_col4); + + return c; + } + + Q_ALWAYS_INLINE Matrix4x4_SSE &operator*=(const Matrix4x4_SSE &other) + { + *this = *this * other; + return *this; + } + + Q_ALWAYS_INLINE Matrix4x4_SSE &operator-=(const Matrix4x4_SSE &other) + { + *this = *this - other; + return *this; + } + + Q_ALWAYS_INLINE Matrix4x4_SSE &operator+=(const Matrix4x4_SSE &other) + { + *this = *this + other; + return *this; + } + + Q_ALWAYS_INLINE Matrix4x4_SSE transposed() const + { + Matrix4x4_SSE c(Qt::Uninitialized); + + // ~113 instructions + // 0b11011101 == 0xdd + // 0b10001000 == 0x88 + const __m128 tmp1 = _mm_shuffle_ps(m_col1, m_col2, 0xdd); + const __m128 tmp2 = _mm_shuffle_ps(m_col1, m_col2, 0x88); + const __m128 tmp3 = _mm_shuffle_ps(m_col3, m_col4, 0xdd); + const __m128 tmp4 = _mm_shuffle_ps(m_col3, m_col4, 0x88); + c.m_col1 = _mm_shuffle_ps(tmp2, tmp4, 0x88); + c.m_col2 = _mm_shuffle_ps(tmp1, tmp3, 0x88); + c.m_col3 = _mm_shuffle_ps(tmp2, tmp4, 0xdd); + c.m_col4 = _mm_shuffle_ps(tmp1, tmp3, 0xdd); + + return c; + } + + Q_ALWAYS_INLINE Matrix4x4_SSE inverted() const + { + // TO DO: Optimize + const QMatrix4x4 mat = toQMatrix4x4(); + return Matrix4x4_SSE(mat.inverted()); + } + + Q_ALWAYS_INLINE bool operator==(const Matrix4x4_SSE &other) const + { + // 0b1111 == 0xf + return (_mm_movemask_ps(_mm_cmpeq_ps(m_col1, other.m_col1)) == 0xf && + _mm_movemask_ps(_mm_cmpeq_ps(m_col2, other.m_col2)) == 0xf && + _mm_movemask_ps(_mm_cmpeq_ps(m_col3, other.m_col3)) == 0xf && + _mm_movemask_ps(_mm_cmpeq_ps(m_col4, other.m_col4)) == 0xf); + } + + Q_ALWAYS_INLINE bool operator!=(const Matrix4x4_SSE &other) const + { + return !(*this == other); + } + + Q_ALWAYS_INLINE float m11() const { return _mm_cvtss_f32(m_col1); } + Q_ALWAYS_INLINE float m12() const { return _mm_cvtss_f32(m_col2); } + Q_ALWAYS_INLINE float m13() const { return _mm_cvtss_f32(m_col3); } + Q_ALWAYS_INLINE float m14() const { return _mm_cvtss_f32(m_col4); } + + Q_ALWAYS_INLINE float m21() const + { + // 0b01010101 = 0x55 + return _mm_cvtss_f32(_mm_shuffle_ps(m_col1, m_col1, 0x55)); + } + Q_ALWAYS_INLINE float m22() const + { + // 0b01010101 = 0x55 + return _mm_cvtss_f32(_mm_shuffle_ps(m_col2, m_col2, 0x55)); + } + Q_ALWAYS_INLINE float m23() const + { + // 0b01010101 = 0x55 + return _mm_cvtss_f32(_mm_shuffle_ps(m_col3, m_col3, 0x55)); + } + Q_ALWAYS_INLINE float m24() const + { + // 0b01010101 = 0x55 + return _mm_cvtss_f32(_mm_shuffle_ps(m_col4, m_col4, 0x55)); + } + + Q_ALWAYS_INLINE float m31() const + { + // 0b10101010 = 0xaa + return _mm_cvtss_f32(_mm_shuffle_ps(m_col1, m_col1, 0xaa)); + } + Q_ALWAYS_INLINE float m32() const + { + // 0b10101010 = 0xaa + return _mm_cvtss_f32(_mm_shuffle_ps(m_col2, m_col2, 0xaa)); + } + Q_ALWAYS_INLINE float m33() const + { + // 0b10101010 = 0xaa + return _mm_cvtss_f32(_mm_shuffle_ps(m_col3, m_col3, 0xaa)); + } + Q_ALWAYS_INLINE float m34() const + { + // 0b10101010 = 0xaa + return _mm_cvtss_f32(_mm_shuffle_ps(m_col4, m_col4, 0xaa)); + } + + Q_ALWAYS_INLINE float m41() const + { + // 0b11111111 = 0xff + return _mm_cvtss_f32(_mm_shuffle_ps(m_col1, m_col1, 0xff)); + } + Q_ALWAYS_INLINE float m42() const + { + // 0b11111111 = 0xff + return _mm_cvtss_f32(_mm_shuffle_ps(m_col2, m_col2, 0xff)); + } + Q_ALWAYS_INLINE float m43() const + { + // 0b11111111 = 0xff + return _mm_cvtss_f32(_mm_shuffle_ps(m_col3, m_col3, 0xff)); + } + Q_ALWAYS_INLINE float m44() const + { + // 0b11111111 = 0xff + return _mm_cvtss_f32(_mm_shuffle_ps(m_col4, m_col4, 0xff)); + } + + Q_ALWAYS_INLINE Vector4D row(int index) const + { + switch (index) { + case 0: + return Vector4D(m11(), m12(), m13(), m14()); + case 1: + return Vector4D(m21(), m22(), m23(), m24()); + case 2: + return Vector4D(m31(), m32(), m33(), m34()); + case 3: + return Vector4D(m41(), m42(), m43(), m44()); + default: + Q_UNREACHABLE(); + return Vector4D(); + } + } + + Q_ALWAYS_INLINE Vector4D column(int index) const + { + Vector4D c(Qt::Uninitialized); + switch (index) { + case 0: + c.m_xyzw = m_col1; + break; + case 1: + c.m_xyzw = m_col2; + break; + case 2: + c.m_xyzw = m_col3; + break; + case 3: + c.m_xyzw = m_col4; + break; + default: + Q_UNREACHABLE(); + return Vector4D(); + } + return c; + } + + Q_ALWAYS_INLINE QMatrix4x4 toQMatrix4x4() const { return QMatrix4x4(m11(), m12(), m13(), m14(), + m21(), m22(), m23(), m24(), + m31(), m32(), m33(), m34(), + m41(), m42(), m43(), m44()); } + + Q_ALWAYS_INLINE Vector3D_SSE map(const Vector3D_SSE &point) const + { + return *this * point; + } + + Q_ALWAYS_INLINE Vector4D_SSE map(const Vector4D_SSE &point) const + { + return *this * point; + } + + Q_ALWAYS_INLINE Vector3D_SSE mapVector(const Vector3D_SSE &vector) const + { + const __m128 row1 = _mm_set_ps(0.0f, m13(), m12(), m11()); + const __m128 row2 = _mm_set_ps(0.0f, m23(), m22(), m21()); + const __m128 row3 = _mm_set_ps(0.0f, m33(), m32(), m31()); + + const __m128 tmp = _mm_add_ps(_mm_mul_ps(vector.m_xyzw, row1), _mm_mul_ps(vector.m_xyzw, row2)); + + Vector3D_SSE v(Qt::Uninitialized); + v.m_xyzw = _mm_add_ps(tmp, _mm_mul_ps(vector.m_xyzw, row3)); + return v; + } + + friend Q_ALWAYS_INLINE Vector4D operator*(const Vector4D &vector, const Matrix4x4_SSE &matrix); + friend Q_ALWAYS_INLINE Vector4D operator*(const Matrix4x4_SSE &matrix, const Vector4D &vector); + + friend Q_ALWAYS_INLINE Vector3D operator*(const Vector3D &vector, const Matrix4x4_SSE &matrix); + friend Q_ALWAYS_INLINE Vector3D operator*(const Matrix4x4_SSE &matrix, const Vector3D &vector); + + friend QT3DCORE_PRIVATE_EXPORT QDebug operator<<(QDebug dbg, const Matrix4x4_SSE &m); +private: + // Internally we will store the matrix as indicated below + // Q_DECL_ALIGN(16) // aligned on 16 bytes boundary for SSE (column major) + // struct + // { + // float m_m11, m_m21, m_m31, m_m41; + // float m_m12, m_m22, m_m32, m_m42; + // float m_m13, m_m23, m_m33, m_m43; + // float m_m14, m_m24, m_m34, m_m44; + // }; + // struct + // { + // float m[16]; + // }; + __m128 m_col1; + __m128 m_col2; + __m128 m_col3; + __m128 m_col4; +}; + +Q_ALWAYS_INLINE Vector4D operator*(const Vector4D &vector, const Matrix4x4_SSE &matrix) +{ + const __m128 vCol1 = _mm_mul_ps(matrix.m_col1, vector.m_xyzw); + const __m128 vCol2 = _mm_mul_ps(matrix.m_col2, vector.m_xyzw); + const __m128 vCol3 = _mm_mul_ps(matrix.m_col3, vector.m_xyzw); + const __m128 vCol4 = _mm_mul_ps(matrix.m_col4, vector.m_xyzw); + + + // 0b01000100 == 0x44 + // 0b11101110 == 0xee + + // vCol1.x, vCol1.y, vCol2.x, vCol2.y + __m128 tmp1 = _mm_shuffle_ps(vCol1, vCol2, 0x44); + // vCol1.z, vCol1.w, vCol2.z, vCol2.w + __m128 tmp2 = _mm_shuffle_ps(vCol1, vCol2, 0xee); + + // vCol1.x + vCol1.z, vCol1.y + vCol1.w, vCol2.x + vCol2.z, vCol2.y + vCol2.w, + const __m128 tmpSum01 = _mm_add_ps(tmp1, tmp2); + + // vCol3.x, vCol3.y, vCol4.x, vCol4.y + tmp1 = _mm_shuffle_ps(vCol3, vCol4, 0x44); + // vCol3.z, vCol3.w, vCol4.z, vCol4.w + tmp2 = _mm_shuffle_ps(vCol3, vCol4, 0xee); + + // vCol3.x + vCol3.z, vCol3.y + vCol3.w, vCol4.x + vCol4.z, vCol4.y + vCol4.w, + const __m128 tmpSum02 = _mm_add_ps(tmp1, tmp2); + + // 0b10001000 == 0x88 + // 0b11011101 == 0xdd + + // vCol1.x + vCol1.z, vCol2.x + vCol2.z, vCol3.x + vCol3.z, vCol4.x + vCol4.z, + tmp1 = _mm_shuffle_ps(tmpSum01, tmpSum02, 0x88); + // vCol1.y + vCol1.w, vCol2.y + vCol2.w, vCol3.y + vCol3.w, vCol4.y + vCol4.w, + tmp2 = _mm_shuffle_ps(tmpSum01, tmpSum02, 0xdd); + + Vector4D v(Qt::Uninitialized); + v.m_xyzw = _mm_add_ps(tmp1, tmp2); + return v; +} + +Q_ALWAYS_INLINE Vector4D operator*(const Matrix4x4_SSE &matrix, const Vector4D &vector) +{ + const Matrix4x4_SSE transposed = matrix.transposed(); + return vector * transposed; +} + +Q_ALWAYS_INLINE Vector3D operator*(const Vector3D &vector, const Matrix4x4_SSE &matrix) +{ + const __m128 vec4 = _mm_set_ps(1.0f, vector.z(), vector.y(), vector.x()); + + const __m128 vCol1 = _mm_mul_ps(matrix.m_col1, vec4); + const __m128 vCol2 = _mm_mul_ps(matrix.m_col2, vec4); + const __m128 vCol3 = _mm_mul_ps(matrix.m_col3, vec4); + const __m128 vCol4 = _mm_mul_ps(matrix.m_col4, vec4); + + // 0b01000100 == 0x44 + // 0b11101110 == 0xee + + // vCol1.x, vCol1.y, vCol2.x, vCol2.y + __m128 tmp1 = _mm_shuffle_ps(vCol1, vCol2, 0x44); + // vCol1.z, vCol1.w, vCol2.z, vCol2.w + __m128 tmp2 = _mm_shuffle_ps(vCol1, vCol2, 0xee); + + // vCol1.x + vCol1.z, vCol1.y + vCol1.w, vCol2.x + vCol2.z, vCol2.y + vCol2.w, + const __m128 tmpSum01 = _mm_add_ps(tmp1, tmp2); + + // vCol3.x, vCol3.y, vCol4.x, vCol4.y + tmp1 = _mm_shuffle_ps(vCol3, vCol4, 0x44); + // vCol3.z, vCol3.w, vCol4.z, vCol4.w + tmp2 = _mm_shuffle_ps(vCol3, vCol4, 0xee); + + // vCol3.x + vCol3.z, vCol3.y + vCol3.w, vCol4.x + vCol4.z, vCol4.y + vCol4.w, + const __m128 tmpSum02 = _mm_add_ps(tmp1, tmp2); + + // 0b10001000 == 0x88 + // 0b11011101 == 0xdd + + // vCol1.x + vCol1.z, vCol2.x + vCol2.z, vCol3.x + vCol3.z, vCol4.x + vCol4.z, + tmp1 = _mm_shuffle_ps(tmpSum01, tmpSum02, 0x88); + // vCol1.y + vCol1.w, vCol2.y + vCol2.w, vCol3.y + vCol3.w, vCol4.y + vCol4.w, + tmp2 = _mm_shuffle_ps(tmpSum01, tmpSum02, 0xdd); + + const __m128 result = _mm_add_ps(tmp1, tmp2); + // 0b11111111 = 0xff + const __m128 divisor = _mm_shuffle_ps(result, result, 0xff); + Vector3D v(Qt::Uninitialized); + v.m_xyzw = _mm_div_ps(result, divisor); + return v; +} + +QT3DCORE_PRIVATE_EXPORT Q_ALWAYS_INLINE Vector3D operator*(const Matrix4x4_SSE &matrix, const Vector3D &vector) +{ + const Matrix4x4_SSE transposed = matrix.transposed(); + return vector * transposed; +} + +} // Qt3DCore + + +Q_DECLARE_TYPEINFO(Qt3DCore::Matrix4x4_SSE, Q_PRIMITIVE_TYPE); + +QT_END_NAMESPACE + +Q_DECLARE_METATYPE(Qt3DCore::Matrix4x4_SSE) + +#endif // QT_COMPILER_SUPPORTS_SSE2 + +#endif // QT3DCORE_MATRIX4X4_SSE_P_H diff --git a/src/core/transforms/transforms.pri b/src/core/transforms/transforms.pri index d12fb1b92..aee2dad3f 100644 --- a/src/core/transforms/transforms.pri +++ b/src/core/transforms/transforms.pri @@ -21,7 +21,8 @@ HEADERS += \ $$PWD/qarmature.h \ $$PWD/qarmature_p.h \ $$PWD/vector4d_p.h \ - $$PWD/vector3d_p.h + $$PWD/vector3d_p.h \ + $$PWD/matrix4x4_p.h INCLUDEPATH += $$PWD @@ -30,9 +31,27 @@ qtConfig(qt3d-simd-sse2) { SSE2_HEADERS += \ $$PWD/vector4d_sse_p.h \ - $$PWD/vector3d_sse_p.h + $$PWD/vector3d_sse_p.h \ + $$PWD/matrix4x4_sse_p.h SSE2_SOURCES += \ + $$PWD/matrix4x4_sse.cpp + + # These files contain AVX2 code, only add them to SSE2 if AVX2 not available + !qtConfig(qt3d-simd-avx2) { + SSE2_SOURCES += \ + $$PWD/vector4d_sse.cpp \ + $$PWD/vector3d_sse.cpp + } +} + +qtConfig(qt3d-simd-avx2) { + CONFIG += simd + + AVX2_HEADERS += \ + $$PWD/matrix4x4_avx2_p.h + AVX2_SOURCES += \ + $$PWD/matrix4x4_avx2.cpp \ $$PWD/vector4d_sse.cpp \ $$PWD/vector3d_sse.cpp } diff --git a/src/core/transforms/vector3d_sse.cpp b/src/core/transforms/vector3d_sse.cpp index 7e20a2f77..151cbb959 100644 --- a/src/core/transforms/vector3d_sse.cpp +++ b/src/core/transforms/vector3d_sse.cpp @@ -38,6 +38,13 @@ ****************************************************************************/ #include + +#ifdef __AVX2__ +#include "matrix4x4_avx2_p.h" +#else +#include "matrix4x4_sse_p.h" +#endif + #include "vector3d_sse_p.h" #include "vector4d_sse_p.h" #include @@ -59,6 +66,74 @@ Vector3D_SSE::Vector3D_SSE(const Vector4D_SSE &v) m_xyzw = _mm_mul_ps(v.m_xyzw, _mm_set_ps(0.0f, 1.0f, 1.0f, 1.0f)); } +#ifdef __AVX2__ + +Vector3D_SSE Vector3D_SSE::unproject(const Matrix4x4_AVX2 &modelView, const Matrix4x4_AVX2 &projection, const QRect &viewport) const +{ + const Matrix4x4_AVX2 inverse = (projection * modelView).inverted(); + + Vector4D_SSE tmp(*this, 1.0f); + tmp.setX((tmp.x() - float(viewport.x())) / float(viewport.width())); + tmp.setY((tmp.y() - float(viewport.y())) / float(viewport.height())); + tmp = tmp * 2.0f - Vector4D_SSE(1.0f, 1.0f, 1.0f, 1.0f); + + Vector4D_SSE obj = inverse * tmp; + if (qFuzzyIsNull(obj.w())) + obj.setW(1.0f); + obj /= obj.w(); + return Vector3D_SSE(obj); +} + +Vector3D_SSE Vector3D_SSE::project(const Matrix4x4_AVX2 &modelView, const Matrix4x4_AVX2 &projection, const QRect &viewport) const +{ + Vector4D_SSE tmp(*this, 1.0f); + tmp = projection * modelView * tmp; + if (qFuzzyIsNull(tmp.w())) + tmp.setW(1.0f); + tmp /= tmp.w(); + + tmp = tmp * 0.5f + Vector4D_SSE(0.5f, 0.5f, 0.5f, 0.5f); + tmp.setX(tmp.x() * viewport.width() + viewport.x()); + tmp.setY(tmp.y() * viewport.height() + viewport.y()); + + return Vector3D_SSE(tmp); +} + +#else + +Vector3D_SSE Vector3D_SSE::unproject(const Matrix4x4_SSE &modelView, const Matrix4x4_SSE &projection, const QRect &viewport) const +{ + const Matrix4x4_SSE inverse = (projection * modelView).inverted(); + + Vector4D_SSE tmp(*this, 1.0f); + tmp.setX((tmp.x() - float(viewport.x())) / float(viewport.width())); + tmp.setY((tmp.y() - float(viewport.y())) / float(viewport.height())); + tmp = tmp * 2.0f - Vector4D_SSE(1.0f, 1.0f, 1.0f, 1.0f); + + Vector4D_SSE obj = inverse * tmp; + if (qFuzzyIsNull(obj.w())) + obj.setW(1.0f); + obj /= obj.w(); + return Vector3D_SSE(obj); +} + +Vector3D_SSE Vector3D_SSE::project(const Matrix4x4_SSE &modelView, const Matrix4x4_SSE &projection, const QRect &viewport) const +{ + Vector4D_SSE tmp(*this, 1.0f); + tmp = projection * modelView * tmp; + if (qFuzzyIsNull(tmp.w())) + tmp.setW(1.0f); + tmp /= tmp.w(); + + tmp = tmp * 0.5f + Vector4D_SSE(0.5f, 0.5f, 0.5f, 0.5f); + tmp.setX(tmp.x() * viewport.width() + viewport.x()); + tmp.setY(tmp.y() * viewport.height() + viewport.y()); + + return Vector3D_SSE(tmp); +} + +#endif + } // Qt3DCore QT_END_NAMESPACE diff --git a/src/core/transforms/vector3d_sse_p.h b/src/core/transforms/vector3d_sse_p.h index bf0f2c1ac..a299d1db9 100644 --- a/src/core/transforms/vector3d_sse_p.h +++ b/src/core/transforms/vector3d_sse_p.h @@ -178,6 +178,14 @@ public: return ((_mm_movemask_ps(_mm_cmpeq_ps(m_xyzw, _mm_set_ps1(0.0f))) & 0x7) == 0x7); } +#ifdef __AVX2__ + QT3DCORE_PRIVATE_EXPORT Vector3D_SSE unproject(const Matrix4x4_AVX2 &modelView, const Matrix4x4_AVX2 &projection, const QRect &viewport) const; + QT3DCORE_PRIVATE_EXPORT Vector3D_SSE project(const Matrix4x4_AVX2 &modelView, const Matrix4x4_AVX2 &projection, const QRect &viewport) const; +#else + QT3DCORE_PRIVATE_EXPORT Vector3D_SSE unproject(const Matrix4x4_SSE &modelView, const Matrix4x4_SSE &projection, const QRect &viewport) const; + QT3DCORE_PRIVATE_EXPORT Vector3D_SSE project(const Matrix4x4_SSE &modelView, const Matrix4x4_SSE &projection, const QRect &viewport) const; +#endif + Q_ALWAYS_INLINE float x() const { return _mm_cvtss_f32(m_xyzw); } Q_ALWAYS_INLINE float y() const @@ -342,11 +350,11 @@ public: friend class Matrix4x4_SSE; friend class Vector4D_SSE; - friend QT3DCORE_PRIVATE_EXPORT Vector3D_SSE operator*(Vector3D_SSE vector, Matrix4x4_SSE matrix); - friend QT3DCORE_PRIVATE_EXPORT Vector3D_SSE operator*(Matrix4x4_SSE matrix, Vector3D_SSE vector); + friend QT3DCORE_PRIVATE_EXPORT Vector3D_SSE operator*(const Vector3D_SSE &vector, const Matrix4x4_SSE &matrix); + friend QT3DCORE_PRIVATE_EXPORT Vector3D_SSE operator*(const Matrix4x4_SSE &matrix, const Vector3D_SSE &vector); - friend QT3DCORE_PRIVATE_EXPORT Vector3D_SSE operator*(Vector3D_SSE vector, Matrix4x4_AVX2 matrix); - friend QT3DCORE_PRIVATE_EXPORT Vector3D_SSE operator*(Matrix4x4_AVX2 matrix, Vector3D_SSE vector); + friend QT3DCORE_PRIVATE_EXPORT Vector3D_SSE operator*(const Vector3D_SSE &vector, const Matrix4x4_AVX2 &matrix); + friend QT3DCORE_PRIVATE_EXPORT Vector3D_SSE operator*(const Matrix4x4_AVX2 &matrix, const Vector3D_SSE &vector); friend Q_ALWAYS_INLINE const Vector3D_SSE operator+(Vector3D_SSE v1, Vector3D_SSE v2) { return v1 += v2; } friend Q_ALWAYS_INLINE const Vector3D_SSE operator-(Vector3D_SSE v1, Vector3D_SSE v2) { return v1 -= v2; } diff --git a/src/core/transforms/vector4d_sse_p.h b/src/core/transforms/vector4d_sse_p.h index b1c58c826..ffeca0946 100644 --- a/src/core/transforms/vector4d_sse_p.h +++ b/src/core/transforms/vector4d_sse_p.h @@ -346,11 +346,11 @@ public: friend class Matrix4x4_SSE; friend class Vector3D_SSE; - friend Vector4D_SSE operator*(Vector4D_SSE vector, Matrix4x4_SSE matrix); - friend Vector4D_SSE operator*(Matrix4x4_SSE matrix, Vector4D_SSE vector); + friend Vector4D_SSE operator*(const Vector4D_SSE &vector, const Matrix4x4_SSE &matrix); + friend Vector4D_SSE operator*(const Matrix4x4_SSE &matrix, const Vector4D_SSE &vector); - friend Vector4D_SSE operator*(Vector4D_SSE vector, Matrix4x4_AVX2 matrix); - friend Vector4D_SSE operator*(Matrix4x4_AVX2 matrix, Vector4D_SSE vector); + friend Vector4D_SSE operator*(const Vector4D_SSE &vector, const Matrix4x4_AVX2 &matrix); + friend Vector4D_SSE operator*(const Matrix4x4_AVX2 &matrix, const Vector4D_SSE &vector); friend Q_ALWAYS_INLINE const Vector4D_SSE operator+(Vector4D_SSE v1, Vector4D_SSE v2) { return v1 += v2; } friend Q_ALWAYS_INLINE const Vector4D_SSE operator-(Vector4D_SSE v1, Vector4D_SSE v2) { return v1 -= v2; } -- cgit v1.2.3