Diffstat (limited to 'src/core/transforms/matrix4x4_avx2_p.h')
-rw-r--r--  src/core/transforms/matrix4x4_avx2_p.h  574
1 file changed, 574 insertions, 0 deletions
diff --git a/src/core/transforms/matrix4x4_avx2_p.h b/src/core/transforms/matrix4x4_avx2_p.h
new file mode 100644
index 000000000..d5f77041b
--- /dev/null
+++ b/src/core/transforms/matrix4x4_avx2_p.h
@@ -0,0 +1,574 @@
+/****************************************************************************
+**
+** Copyright (C) 2016 Paul Lemire <paul.lemire350@gmail.com>
+** Contact: https://www.qt.io/licensing/
+**
+** This file is part of the Qt3D module of the Qt Toolkit.
+**
+** $QT_BEGIN_LICENSE:LGPL$
+** Commercial License Usage
+** Licensees holding valid commercial Qt licenses may use this file in
+** accordance with the commercial license agreement provided with the
+** Software or, alternatively, in accordance with the terms contained in
+** a written agreement between you and The Qt Company. For licensing terms
+** and conditions see https://www.qt.io/terms-conditions. For further
+** information use the contact form at https://www.qt.io/contact-us.
+**
+** GNU Lesser General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU Lesser
+** General Public License version 3 as published by the Free Software
+** Foundation and appearing in the file LICENSE.LGPL3 included in the
+** packaging of this file. Please review the following information to
+** ensure the GNU Lesser General Public License version 3 requirements
+** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
+**
+** GNU General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU
+** General Public License version 2.0 or (at your option) the GNU General
+** Public license version 3 or any later version approved by the KDE Free
+** Qt Foundation. The licenses are as published by the Free Software
+** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
+** included in the packaging of this file. Please review the following
+** information to ensure the GNU General Public License requirements will
+** be met: https://www.gnu.org/licenses/gpl-2.0.html and
+** https://www.gnu.org/licenses/gpl-3.0.html.
+**
+** $QT_END_LICENSE$
+**
+****************************************************************************/
+
+#ifndef QT3DCORE_MATRIX4X4_AVX2_P_H
+#define QT3DCORE_MATRIX4X4_AVX2_P_H
+
+//
+// W A R N I N G
+// -------------
+//
+// This file is not part of the Qt3D API. It exists purely as an
+// implementation detail. This header file may change from version to
+// version without notice, or even be removed.
+//
+// We mean it.
+//
+
+#include <Qt3DCore/private/vector4d_p.h>
+#include <Qt3DCore/private/vector3d_p.h>
+#include <private/qsimd_p.h>
+#include <QMatrix4x4>
+
+#ifdef QT_COMPILER_SUPPORTS_AVX2
+
+// Some GCC versions don't have _mm256_set_m128 available
+// Work around that
+#define _mm256_set_m128(va, vb) \
+ _mm256_insertf128_ps(_mm256_castps128_ps256(vb), va, 1)
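+// (builds a __m256 with vb in the low 128 bits and va in the high 128 bits,
+// matching the (high, low) argument order of the real intrinsic)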
+
+QT_BEGIN_NAMESPACE
+
+namespace Qt3DCore {
+
+class Matrix4x4_AVX2
+{
+public:
+
+ Q_ALWAYS_INLINE Matrix4x4_AVX2() { setToIdentity(); }
+ explicit Q_ALWAYS_INLINE Matrix4x4_AVX2(Qt::Initialization) {}
+
+    // Assumes data is 32-byte aligned (and in column-major order)
+    explicit Q_ALWAYS_INLINE Matrix4x4_AVX2(const float *data)
+ {
+ m_col12 = _mm256_load_ps(data);
+ m_col34 = _mm256_load_ps(data + 8);
+ }
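+
+    // For instance (an illustrative sketch only), feeding 32-byte aligned,
+    // column-major data to the constructor above:
+    //
+    //     alignas(32) float colMajor[16] = {
+    //         1.0f, 0.0f, 0.0f, 0.0f,   // column 1
+    //         0.0f, 1.0f, 0.0f, 0.0f,   // column 2
+    //         0.0f, 0.0f, 1.0f, 0.0f,   // column 3
+    //         0.0f, 0.0f, 0.0f, 1.0f }; // column 4
+    //     Matrix4x4_AVX2 m(colMajor);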
+
+    // QMatrix4x4::constData() returns a pointer to the data in column-major order
+ explicit Q_ALWAYS_INLINE Matrix4x4_AVX2(const QMatrix4x4 &mat)
+ {
+        // The data may not be 32-byte aligned, so use unaligned loads
+ const float *data = mat.constData();
+ m_col12 = _mm256_loadu_ps(data);
+ m_col34 = _mm256_loadu_ps(data + 8);
+ }
+
+    // Input values are in row-major order, but we store them in column-major order
+ explicit Q_ALWAYS_INLINE Matrix4x4_AVX2(float m11, float m12, float m13, float m14,
+ float m21, float m22, float m23, float m24,
+ float m31, float m32, float m33, float m34,
+ float m41, float m42, float m43, float m44)
+ {
+ m_col12 = _mm256_set_ps(m42, m32, m22, m12, m41, m31, m21, m11);
+ m_col34 = _mm256_set_ps(m44, m34, m24, m14, m43, m33, m23, m13);
+ }
+
+ Q_ALWAYS_INLINE void setToIdentity()
+ {
+ // 23 instructions
+ m_col12 = _mm256_set_ps(0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f);
+ m_col34 = _mm256_set_ps(1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f);
+
+ // 23 instructions
+ // 1, 0, 0, 0
+ // __m128 vec = _mm_set_ss(1.0f);
+ // // 0, 1, 0, 0
+ // // 0b01010001 == 0x51
+ // __m128 tmp = _mm_permute_ps(vec, 0x51);
+
+ // // 1, 0, 0, 0, 0, 1, 0, 0
+ // m_col12 = _mm256_set_m128(tmp, vec);
+
+ // // 0, 0, 1, 0
+ // // 0b01000101 == 0x45
+ // tmp = _mm_permute_ps(vec, 0x45);
+
+ // // 0, 0, 0, 1
+ // // 0b00010101 == 0x15
+ // vec = _mm_permute_ps(vec, 0x15);
+
+ // // 0, 0, 1, 0, 0, 0, 0, 1
+ // m_col34 = _mm256_set_m128(vec, tmp);
+
+ // Using a static identity matrix and assigning it is 27 instructions
+ }
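+
+    // For instance (illustrative only):
+    //     Matrix4x4_AVX2 m;
+    //     Q_ASSERT(m.toQMatrix4x4().isIdentity());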
+
+ Q_ALWAYS_INLINE Matrix4x4_AVX2 operator*(const Matrix4x4_AVX2 &other) const
+ {
+        // Computes C = M * N, where this matrix is M and other is N, both
+        // stored column-major: m_col12 = col1 | col2, m_col34 = col3 | col4.
+        // Column j of the result is a linear combination of the columns of M:
+        // C.colj = n1j * M.col1 + n2j * M.col2 + n3j * M.col3 + n4j * M.col4
+
+        // Shuffling: (Latency 1)
+        // _mm256_shuffle_ps shuffles each 128-bit lane independently; the 8-bit
+        // immediate holds four 2-bit indices, the low two selecting from the
+        // first operand, the high two from the second
+
+        // otherCol12 = n11 n21 n31 n41 | n12 n22 n32 n42
+        // shuffled with 0x00 (0b00000000)
+        // -> n11 n11 n11 n11 | n12 n12 n12 n12
+
+        // Broadcasting: (Latency 1)
+        // col1 broadcast
+        // -> m11 m21 m31 m41 | m11 m21 m31 m41
+
+        // Multiplying (Latency 5):
+        // n11 n11 n11 n11 | n12 n12 n12 n12 * m11 m21 m31 m41 | m11 m21 m31 m41
+        // -> n11 m11, n11 m21, n11 m31, n11 m41 | n12 m11, n12 m21, n12 m31, n12 m41
+        // which is the first partial term of C.col1 | C.col2
+ const __m256 otherCol12 = other.m_col12;
+ const __m256 otherCol34 = other.m_col34;
+ const __m128 col1 = _mm256_extractf128_ps(m_col12, 0);
+ const __m128 col2 = _mm256_extractf128_ps(m_col12, 1);
+ const __m128 col3 = _mm256_extractf128_ps(m_col34, 0);
+ const __m128 col4 = _mm256_extractf128_ps(m_col34, 1);
+
+ // const __m256 col12 = _mm256_load_ps(m);
+ // const __m256 col34 = _mm256_load_ps(m + 8);
+ // const __m128 otherCol1 = _mm_load_ps(other.m);
+ // const __m128 otherCol2 = _mm_load_ps(other.m + 4);
+ // const __m128 otherCol3 = _mm_load_ps(other.m + 8);
+ // const __m128 otherCol4 = _mm_load_ps(other.m + 12);
+
+ __m256 tmp = _mm256_mul_ps(_mm256_shuffle_ps(otherCol12, otherCol12, 0x00), _mm256_broadcast_ps(&col1));
+
+        // otherCol12 shuffled with 0x55 (0b01010101)
+        // -> n21 n21 n21 n21 | n22 n22 n22 n22
+        // multiplied by col2 broadcast (m12 m22 m32 m42 | m12 m22 m32 m42)
+        // -> n21 m12, n21 m22, n21 m32, n21 m42 | n22 m12, n22 m22, n22 m32, n22 m42
+ tmp = _mm256_add_ps(_mm256_mul_ps(_mm256_shuffle_ps(otherCol12, otherCol12, 0x55), _mm256_broadcast_ps(&col2)), tmp);
+
+        // otherCol12 shuffled with 0xaa (0b10101010)
+        // -> n31 n31 n31 n31 | n32 n32 n32 n32
+        // multiplied by col3 broadcast
+        // -> n31 m13, n31 m23, n31 m33, n31 m43 | n32 m13, n32 m23, n32 m33, n32 m43
+ tmp = _mm256_add_ps(_mm256_mul_ps(_mm256_shuffle_ps(otherCol12, otherCol12, 0xaa), _mm256_broadcast_ps(&col3)), tmp);
+
+        // otherCol12 shuffled with 0xff (0b11111111)
+        // -> n41 n41 n41 n41 | n42 n42 n42 n42
+        // multiplied by col4 broadcast
+        // -> n41 m14, n41 m24, n41 m34, n41 m44 | n42 m14, n42 m24, n42 m34, n42 m44
+ tmp = _mm256_add_ps(_mm256_mul_ps(_mm256_shuffle_ps(otherCol12, otherCol12, 0xff), _mm256_broadcast_ps(&col4)), tmp);
+
+        // Which finally gives columns 1 and 2 of the product in tmp:
+        // c11 = m11 n11 + m12 n21 + m13 n31 + m14 n41
+        // c21 = m21 n11 + m22 n21 + m23 n31 + m24 n41
+        // c31 = m31 n11 + m32 n21 + m33 n31 + m34 n41
+        // c41 = m41 n11 + m42 n21 + m43 n31 + m44 n41
+
+        // c12 = m11 n12 + m12 n22 + m13 n32 + m14 n42
+        // c22 = m21 n12 + m22 n22 + m23 n32 + m24 n42
+        // c32 = m31 n12 + m32 n22 + m33 n32 + m34 n42
+        // c42 = m41 n12 + m42 n22 + m43 n32 + m44 n42
+
+ __m256 tmp2 = _mm256_mul_ps(_mm256_shuffle_ps(otherCol34, otherCol34, 0x00), _mm256_broadcast_ps(&col1));
+ tmp2 = _mm256_add_ps(_mm256_mul_ps(_mm256_shuffle_ps(otherCol34, otherCol34, 0x55), _mm256_broadcast_ps(&col2)), tmp2);
+ tmp2 = _mm256_add_ps(_mm256_mul_ps(_mm256_shuffle_ps(otherCol34, otherCol34, 0xaa), _mm256_broadcast_ps(&col3)), tmp2);
+ tmp2 = _mm256_add_ps(_mm256_mul_ps(_mm256_shuffle_ps(otherCol34, otherCol34, 0xff), _mm256_broadcast_ps(&col4)), tmp2);
+
+ Matrix4x4_AVX2 c(Qt::Uninitialized);
+ c.m_col12 = tmp;
+ c.m_col34 = tmp2;
+ return c;
+ }
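+
+    // A quick sanity check for the scheme above (an illustrative sketch, not
+    // part of the class): given two arbitrary QMatrix4x4 values qa and qb, the
+    // AVX2 product should match the QMatrix4x4 one up to float rounding:
+    //
+    //     const Matrix4x4_AVX2 a(qa);
+    //     const Matrix4x4_AVX2 b(qb);
+    //     Q_ASSERT(qFuzzyCompare((a * b).toQMatrix4x4(), qa * qb));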
+
+ Q_ALWAYS_INLINE Matrix4x4_AVX2 operator-(const Matrix4x4_AVX2 &other) const
+ {
+ Matrix4x4_AVX2 c(Qt::Uninitialized);
+
+ c.m_col12 = _mm256_sub_ps(m_col12, other.m_col12);
+ c.m_col34 = _mm256_sub_ps(m_col34, other.m_col34);
+ return c;
+ }
+
+ Q_ALWAYS_INLINE Matrix4x4_AVX2 operator+(const Matrix4x4_AVX2 &other) const
+ {
+ Matrix4x4_AVX2 c(Qt::Uninitialized);
+
+ c.m_col12 = _mm256_add_ps(m_col12, other.m_col12);
+ c.m_col34 = _mm256_add_ps(m_col34, other.m_col34);
+ return c;
+ }
+
+ Q_ALWAYS_INLINE Matrix4x4_AVX2 &operator*=(const Matrix4x4_AVX2 &other)
+ {
+ *this = *this * other;
+ return *this;
+ }
+
+ Q_ALWAYS_INLINE Matrix4x4_AVX2 &operator-=(const Matrix4x4_AVX2 &other)
+ {
+ *this = *this - other;
+ return *this;
+ }
+
+ Q_ALWAYS_INLINE Matrix4x4_AVX2 &operator+=(const Matrix4x4_AVX2 &other)
+ {
+ *this = *this + other;
+ return *this;
+ }
+
+ Q_ALWAYS_INLINE Matrix4x4_AVX2 transposed() const
+ {
+ Matrix4x4_AVX2 c(Qt::Uninitialized);
+ const __m128 col1 = _mm256_extractf128_ps(m_col12, 0);
+ const __m128 col2 = _mm256_extractf128_ps(m_col12, 1);
+ const __m128 col3 = _mm256_extractf128_ps(m_col34, 0);
+ const __m128 col4 = _mm256_extractf128_ps(m_col34, 1);
+
+ // ~117 instructions
+ // Matrix4x4_AVX2 c = *this;
+ // _MM_TRANSPOSE4_PS(c.m_col1, c.m_col2, c.m_col3, c.m_col4);
+
+ // ~131 instructions - AVX2
+ // const __m256i indexes = _mm256_set_epi32(7, 3, 6, 2, 5, 1, 4, 0);
+ // c.m_col12 = _mm256_permutevar8x32_ps(_mm256_unpacklo_ps(m_col12, m_col34), indexes);
+ // c.m_col34 = _mm256_permutevar8x32_ps(_mm256_unpackhi_ps(m_col12, m_col34), indexes);
+
+ // ~193 instructions
+ // c.m_col12 = _mm256_setr_ps(m_m11, m_m21, m_m31, m_m41, m_m12, m_m22, m_m32, m_m42);
+ // c.m_col34 = _mm256_setr_ps(m_m13, m_m23, m_m33, m_m43, m_m14, m_m24, m_m34, m_m44);
+
+ // ~113 instructions
+ // union {
+ // struct
+ // {
+ // __m256 twin;
+ // };
+ // struct
+ // {
+ // __m128 col1;
+ // __m128 col2;
+ // };
+ // } u;
+
+ // u.twin = _mm256_shuffle_ps(m_col12, m_col34, 0b01000100);
+ // c.m_col1 = _mm_permute_ps(_mm_shuffle_ps(u.col1, u.col2, 0b10001000), 0b11011000);
+ // c.m_col2 = _mm_permute_ps(_mm_shuffle_ps(u.col1, u.col2, 0b11011101), 0b11011000);
+
+ // u.twin = _mm256_shuffle_ps(m_col12, m_col34, 0b11101110);
+ // c.m_col3 = _mm_permute_ps(_mm_shuffle_ps(u.col1, u.col2, 0b10001000), 0b11011000);
+ // c.m_col4 = _mm_permute_ps(_mm_shuffle_ps(u.col1, u.col2, 0b11011101), 0b11011000);
+
+ // ~113 instructions
+ // 0b11011101 == 0xdd
+ // 0b10001000 == 0x88
+ const __m128 tmp1 = _mm_shuffle_ps(col1, col2, 0xdd);
+ const __m128 tmp2 = _mm_shuffle_ps(col1, col2, 0x88);
+ const __m128 tmp3 = _mm_shuffle_ps(col3, col4, 0xdd);
+ const __m128 tmp4 = _mm_shuffle_ps(col3, col4, 0x88);
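+        // tmp2 = m11 m31 m12 m32    tmp1 = m21 m41 m22 m42
+        // tmp4 = m13 m33 m14 m34    tmp3 = m23 m43 m24 m44
+        // shuffling tmp2/tmp4 with 0x88 -> m11 m12 m13 m14 (row 1, the new column 1)
+        // shuffling tmp1/tmp3 with 0x88 -> m21 m22 m23 m24 (row 2, the new column 2)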
+ c.m_col12 = _mm256_set_m128(_mm_shuffle_ps(tmp1, tmp3, 0x88), _mm_shuffle_ps(tmp2, tmp4, 0x88));
+ c.m_col34 = _mm256_set_m128(_mm_shuffle_ps(tmp1, tmp3, 0xdd), _mm_shuffle_ps(tmp2, tmp4, 0xdd));
+
+ return c;
+ }
+
+ Q_ALWAYS_INLINE Matrix4x4_AVX2 inverted() const
+ {
+ // TO DO: Optimize
+ const QMatrix4x4 mat = toQMatrix4x4();
+ return Matrix4x4_AVX2(mat.inverted());
+ }
+
+ Q_ALWAYS_INLINE bool operator==(const Matrix4x4_AVX2 &other) const
+ {
+        // _mm256_cmp_ps returns all-ones in each lane where the two __m256 are equal
+        // _mm256_movemask_ps gathers the most significant bit of each of the 8
+        // lanes, giving 0xff when every lane matched
+        // (note: _CMP_EQ_OQ is an ordered compare, so NaN lanes compare unequal)
+        return (_mm256_movemask_ps(_mm256_cmp_ps(m_col12, other.m_col12, _CMP_EQ_OQ)) == 0xff &&
+                _mm256_movemask_ps(_mm256_cmp_ps(m_col34, other.m_col34, _CMP_EQ_OQ)) == 0xff);
+ }
+
+ Q_ALWAYS_INLINE bool operator!=(const Matrix4x4_AVX2 &other) const
+ {
+ return !(*this == other);
+ }
+
+    // _mm256_cvtss_f32 is not available on all supported compilers, so we go
+    // through _mm256_extractf128_ps + _mm_cvtss_f32 instead
+ Q_ALWAYS_INLINE float m11() const { return _mm_cvtss_f32(_mm256_extractf128_ps(m_col12, 0)); }
+ Q_ALWAYS_INLINE float m12() const { return _mm_cvtss_f32(_mm256_extractf128_ps(m_col12, 1)); }
+ Q_ALWAYS_INLINE float m13() const { return _mm_cvtss_f32(_mm256_extractf128_ps(m_col34, 0)); }
+ Q_ALWAYS_INLINE float m14() const { return _mm_cvtss_f32(_mm256_extractf128_ps(m_col34, 1)); }
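+
+    // For the remaining entries, the _mm_shuffle_ps immediates below broadcast
+    // a single lane (0x55 -> lane 1, 0xaa -> lane 2, 0xff -> lane 3) so that
+    // _mm_cvtss_f32 can read the value from lane 0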
+
+ Q_ALWAYS_INLINE float m21() const
+ {
+ // 0b01010101 = 0x55
+ const __m128 v = _mm256_extractf128_ps(m_col12, 0);
+ return _mm_cvtss_f32(_mm_shuffle_ps(v, v, 0x55));
+ }
+ Q_ALWAYS_INLINE float m22() const
+ {
+ // 0b01010101 = 0x55
+ const __m128 v = _mm256_extractf128_ps(m_col12, 1);
+ return _mm_cvtss_f32(_mm_shuffle_ps(v, v, 0x55));
+ }
+ Q_ALWAYS_INLINE float m23() const
+ {
+ // 0b01010101 = 0x55
+ const __m128 v = _mm256_extractf128_ps(m_col34, 0);
+ return _mm_cvtss_f32(_mm_shuffle_ps(v, v, 0x55));
+ }
+ Q_ALWAYS_INLINE float m24() const
+ {
+ // 0b01010101 = 0x55
+ const __m128 v = _mm256_extractf128_ps(m_col34, 1);
+ return _mm_cvtss_f32(_mm_shuffle_ps(v, v, 0x55));
+ }
+
+ Q_ALWAYS_INLINE float m31() const
+ {
+ // 0b10101010 = 0xaa
+ const __m128 v = _mm256_extractf128_ps(m_col12, 0);
+ return _mm_cvtss_f32(_mm_shuffle_ps(v, v, 0xaa));
+ }
+ Q_ALWAYS_INLINE float m32() const
+ {
+ // 0b10101010 = 0xaa
+ const __m128 v = _mm256_extractf128_ps(m_col12, 1);
+ return _mm_cvtss_f32(_mm_shuffle_ps(v, v, 0xaa));
+ }
+ Q_ALWAYS_INLINE float m33() const
+ {
+ // 0b10101010 = 0xaa
+ const __m128 v = _mm256_extractf128_ps(m_col34, 0);
+ return _mm_cvtss_f32(_mm_shuffle_ps(v, v, 0xaa));
+ }
+ Q_ALWAYS_INLINE float m34() const
+ {
+ // 0b10101010 = 0xaa
+ const __m128 v = _mm256_extractf128_ps(m_col34, 1);
+ return _mm_cvtss_f32(_mm_shuffle_ps(v, v, 0xaa));
+ }
+
+ Q_ALWAYS_INLINE float m41() const
+ {
+ // 0b11111111 = 0xff
+ const __m128 v = _mm256_extractf128_ps(m_col12, 0);
+ return _mm_cvtss_f32(_mm_shuffle_ps(v, v, 0xff));
+ }
+ Q_ALWAYS_INLINE float m42() const
+ {
+ // 0b11111111 = 0xff
+ const __m128 v = _mm256_extractf128_ps(m_col12, 1);
+ return _mm_cvtss_f32(_mm_shuffle_ps(v, v, 0xff));
+ }
+ Q_ALWAYS_INLINE float m43() const
+ {
+ // 0b11111111 = 0xff
+ const __m128 v = _mm256_extractf128_ps(m_col34, 0);
+ return _mm_cvtss_f32(_mm_shuffle_ps(v, v, 0xff));
+ }
+ Q_ALWAYS_INLINE float m44() const
+ {
+ // 0b11111111 = 0xff
+ const __m128 v = _mm256_extractf128_ps(m_col34, 1);
+ return _mm_cvtss_f32(_mm_shuffle_ps(v, v, 0xff));
+ }
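+
+    // Note: QMatrix4x4's value constructor takes its arguments in row-major
+    // order, hence the accessors are passed row by row below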
+ Q_ALWAYS_INLINE QMatrix4x4 toQMatrix4x4() const { return QMatrix4x4(m11(), m12(), m13(), m14(),
+ m21(), m22(), m23(), m24(),
+ m31(), m32(), m33(), m34(),
+ m41(), m42(), m43(), m44()); }
+
+ Q_ALWAYS_INLINE Vector4D row(int index) const
+ {
+ switch (index) {
+ case 0:
+ return Vector4D(m11(), m12(), m13(), m14());
+ case 1:
+ return Vector4D(m21(), m22(), m23(), m24());
+ case 2:
+ return Vector4D(m31(), m32(), m33(), m34());
+ case 3:
+ return Vector4D(m41(), m42(), m43(), m44());
+ default:
+ Q_UNREACHABLE();
+ return Vector4D();
+ }
+ }
+
+ Q_ALWAYS_INLINE Vector4D column(int index) const
+ {
+ Vector4D c(Qt::Uninitialized);
+ switch (index) {
+ case 0:
+ c.m_xyzw = _mm256_extractf128_ps(m_col12, 0);
+ break;
+ case 1:
+ c.m_xyzw = _mm256_extractf128_ps(m_col12, 1);
+ break;
+ case 2:
+ c.m_xyzw = _mm256_extractf128_ps(m_col34, 0);
+ break;
+ case 3:
+ c.m_xyzw = _mm256_extractf128_ps(m_col34, 1);
+ break;
+ default:
+ Q_UNREACHABLE();
+ return Vector4D();
+ }
+ return c;
+ }
+
+ Q_ALWAYS_INLINE Vector3D_SSE map(const Vector3D_SSE &point) const
+ {
+ return *this * point;
+ }
+
+ Q_ALWAYS_INLINE Vector4D_SSE map(const Vector4D_SSE &point) const
+ {
+ return *this * point;
+ }
+
+    // Applies only the upper-left 3x3 part of the matrix (no translation):
+    // each result component is the dot product of a matrix row with the vector
+    Q_ALWAYS_INLINE Vector3D_SSE mapVector(const Vector3D_SSE &vector) const
+    {
+        const __m128 row1 = _mm_set_ps(0.0f, m13(), m12(), m11());
+        const __m128 row2 = _mm_set_ps(0.0f, m23(), m22(), m21());
+        const __m128 row3 = _mm_set_ps(0.0f, m33(), m32(), m31());
+
+        // AVX2 implies SSE4.1, so _mm_dp_ps is available
+        // Immediate 0x71: multiply lanes 0-2, store the sum in lane 0, zero the rest
+        const __m128 dotX = _mm_dp_ps(vector.m_xyzw, row1, 0x71);
+        const __m128 dotY = _mm_dp_ps(vector.m_xyzw, row2, 0x71);
+        const __m128 dotZ = _mm_dp_ps(vector.m_xyzw, row3, 0x71);
+
+        Vector3D_SSE v(Qt::Uninitialized);
+        // unpacklo(dotX, dotY) -> x' y' 0 0; movelh with dotZ -> x' y' z' 0
+        v.m_xyzw = _mm_movelh_ps(_mm_unpacklo_ps(dotX, dotY), dotZ);
+        return v;
+    }
+
+ friend Vector4D operator*(const Vector4D &vector, const Matrix4x4_AVX2 &matrix);
+ friend Vector4D operator*(const Matrix4x4_AVX2 &matrix, const Vector4D &vector);
+
+ friend Vector3D operator*(const Vector3D &vector, const Matrix4x4_AVX2 &matrix);
+ friend Vector3D operator*(const Matrix4x4_AVX2 &matrix, const Vector3D &vector);
+
+ friend QT3DCORE_PRIVATE_EXPORT QDebug operator<<(QDebug dbg, const Matrix4x4_AVX2 &m);
+private:
+ // column major order
+ // aligned on 32 bytes boundaries for AVX, compatible with 16 bytes boundary for SSE
+ // union Q_DECL_ALIGN(32)
+ // {
+ // float m[16];
+ // struct
+ // {
+ // float m_m11, m_m21, m_m31, m_m41;
+ // float m_m12, m_m22, m_m32, m_m42;
+ // float m_m13, m_m23, m_m33, m_m43;
+ // float m_m14, m_m24, m_m34, m_m44;
+ // };
+ // };
+ __m256 m_col12;
+ __m256 m_col34;
+};
+
+Q_ALWAYS_INLINE Vector4D operator*(const Vector4D &vector, const Matrix4x4_AVX2 &matrix)
+{
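+    // Treats 'vector' as a row vector: component j of the result is
+    // dot(vector, column j of matrix), i.e. this computes v * M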
+ const __m256 vecMultiplier = _mm256_broadcast_ps(&vector.m_xyzw);
+    // matrix.m_col12 * vm = a1 a2 a3 a4 | b1 b2 b3 b4, matrix.m_col34 * vm = c1 c2 c3 c4 | d1 d2 d3 d4
+    // hadd -> a1 + a2, a3 + a4, c1 + c2, c3 + c4,
+    //         b1 + b2, b3 + b4, d1 + d2, d3 + d4
+
+ const __m256 partialSum = _mm256_hadd_ps(_mm256_mul_ps(matrix.m_col12, vecMultiplier),
+ _mm256_mul_ps(matrix.m_col34, vecMultiplier));
+
+    Vector4D v(Qt::Uninitialized);
+    // _mm256_permute4x64_pd is AVX2; 0b11011000 == 0xd8 reorders the 64-bit
+    // pairs from (a, c, b, d) to (a, b, c, d)
+    const __m256 shuffledSum = _mm256_castpd_ps(_mm256_permute4x64_pd(_mm256_castps_pd(partialSum), 0xd8));
+    // final hadd -> a12 + a34, b12 + b34, c12 + c34, d12 + d34
+    v.m_xyzw = _mm_hadd_ps(_mm256_extractf128_ps(shuffledSum, 0), _mm256_extractf128_ps(shuffledSum, 1));
+ return v;
+}
+
+Q_ALWAYS_INLINE Vector4D operator*(const Matrix4x4_AVX2 &matrix, const Vector4D &vector)
+{
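+    // M * v needs dot(row i of M, v); transposing first turns the rows into
+    // columns so that the row-vector operator above can be reused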
+ const Matrix4x4_AVX2 transposed = matrix.transposed();
+ return vector * transposed;
+}
+
+Q_ALWAYS_INLINE Vector3D operator*(const Vector3D &vector, const Matrix4x4_AVX2 &matrix)
+{
+ const __m128 vec4 = _mm_set_ps(1.0f, vector.z(), vector.y(), vector.x());
+ const __m256 vecMultiplier = _mm256_broadcast_ps(&vec4);
+    // matrix.m_col12 * vm = a1 a2 a3 a4 | b1 b2 b3 b4, matrix.m_col34 * vm = c1 c2 c3 c4 | d1 d2 d3 d4
+    // hadd -> a1 + a2, a3 + a4, c1 + c2, c3 + c4,
+    //         b1 + b2, b3 + b4, d1 + d2, d3 + d4
+ const __m256 partialSum = _mm256_hadd_ps(_mm256_mul_ps(matrix.m_col12, vecMultiplier),
+ _mm256_mul_ps(matrix.m_col34, vecMultiplier));
+
+ // _mm256_permute4x64_pd is AVX2
+ // 0b11011000 == 0xd8
+ const __m256 shuffledSum = _mm256_castpd_ps(_mm256_permute4x64_pd(_mm256_castps_pd(partialSum), 0xd8));
+ // a12 + a34, b12 + b34, c12 + c34, d12 + d34
+ const __m128 result = _mm_hadd_ps(_mm256_extractf128_ps(shuffledSum, 0), _mm256_extractf128_ps(shuffledSum, 1));
+    // 0b11111111 == 0xff -> broadcast lane 3 (the w component) to all lanes
+    const __m128 divisor = _mm_shuffle_ps(result, result, 0xff);
+
+    Vector3D v(Qt::Uninitialized);
+    // perspective division by w
+    v.m_xyzw = _mm_div_ps(result, divisor);
+ return v;
+}
+
+Q_ALWAYS_INLINE Vector3D operator*(const Matrix4x4_AVX2 &matrix, const Vector3D &vector)
+{
+ const Matrix4x4_AVX2 transposed = matrix.transposed();
+ return vector * transposed;
+}
+
+} // Qt3DCore
+
+Q_DECLARE_TYPEINFO(Qt3DCore::Matrix4x4_AVX2, Q_PRIMITIVE_TYPE);
+
+QT_END_NAMESPACE
+
+Q_DECLARE_METATYPE(Qt3DCore::Matrix4x4_AVX2)
+
+#endif // QT_COMPILER_SUPPORTS_AVX2
+
+#endif // QT3DCORE_MATRIX4X4_AVX2_P_H