1 files changed, 679 insertions, 0 deletions
diff --git a/src/gui/painting/qcolortransform.cpp b/src/gui/painting/qcolortransform.cpp
new file mode 100644
index 0000000000..b677c4b36b
--- /dev/null
+++ b/src/gui/painting/qcolortransform.cpp
@@ -0,0 +1,679 @@
+/****************************************************************************
+**
+** Copyright (C) 2018 The Qt Company Ltd.
+** Contact: https://www.qt.io/licensing/
+**
+** This file is part of the QtGui module of the Qt Toolkit.
+**
+** $QT_BEGIN_LICENSE:LGPL$
+** Commercial License Usage
+** Licensees holding valid commercial Qt licenses may use this file in
+** accordance with the commercial license agreement provided with the
+** Software or, alternatively, in accordance with the terms contained in
+** a written agreement between you and The Qt Company. For licensing terms
+** and conditions see https://www.qt.io/terms-conditions. For further
+** information use the contact form at https://www.qt.io/contact-us.
+**
+** GNU Lesser General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU Lesser
+** General Public License version 3 as published by the Free Software
+** Foundation and appearing in the file LICENSE.LGPL3 included in the
+** packaging of this file. Please review the following information to
+** ensure the GNU Lesser General Public License version 3 requirements
+** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
+**
+** GNU General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU
+** General Public License version 2.0 or (at your option) the GNU General
+** Public license version 3 or any later version approved by the KDE Free
+** Qt Foundation. The licenses are as published by the Free Software
+** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
+** included in the packaging of this file. Please review the following
+** information to ensure the GNU General Public License requirements will
+** be met: https://www.gnu.org/licenses/gpl-2.0.html and
+** https://www.gnu.org/licenses/gpl-3.0.html.
+**
+** $QT_END_LICENSE$
+**
+****************************************************************************/
+
+
+#include "qcolortransform.h"
+#include "qcolortransform_p.h"
+
+#include "qcolormatrix_p.h"
+#include "qcolorspace_p.h"
+#include "qcolortrc_p.h"
+#include "qcolortrclut_p.h"
+
+#include <QtCore/qatomic.h>
+#include <QtCore/qmath.h>
+#include <QtGui/qcolor.h>
+#include <QtGui/qtransform.h>
+#include <QtCore/private/qsimd_p.h>
+
+#include <qdebug.h>
+
+QT_BEGIN_NAMESPACE
+
+QColorTrcLut *lutFromTrc(const QColorTrc &trc)
+{
+    switch (trc.m_type) { // choose the LUT builder that matches how the curve is stored
+    case QColorTrc::Type::Table: return QColorTrcLut::fromTransferTable(trc.m_table);
+    case QColorTrc::Type::Function: return QColorTrcLut::fromTransferFunction(trc.m_fun);
+    default: qWarning() << "TRC uninitialized"; // neither a table nor a function: nothing to build from
+    }
+    return nullptr;
+}
+
+void QColorTransformPrivate::updateLutsIn() const
+{
+    // Lazily builds the input color space's LUTs; returns early if they exist or a TRC is invalid.
+    auto &space = *colorSpaceIn;
+    if (space.lutsGenerated.loadAcquire())
+        return;
+    if (!space.trc[0].isValid() || !space.trc[1].isValid() || !space.trc[2].isValid())
+        return;
+
+    // When all three curves are identical, channels 1 and 2 share the table built for channel 0.
+    const bool sharedCurve = space.trc[0] == space.trc[1] && space.trc[0] == space.trc[2];
+    space.lut[0].reset(lutFromTrc(space.trc[0]));
+    for (int ch = 1; ch < 3; ++ch) {
+        if (sharedCurve)
+            space.lut[ch] = space.lut[0];
+        else
+            space.lut[ch].reset(lutFromTrc(space.trc[ch]));
+    }
+    space.lutsGenerated.storeRelease(1); // set the flag only after all three tables are assigned
+}
+
+void QColorTransformPrivate::updateLutsOut() const
+{
+    // Lazily builds the output color space's LUTs; returns early if they exist or a TRC is invalid.
+    auto &space = *colorSpaceOut;
+    if (space.lutsGenerated.loadAcquire())
+        return;
+    if (!space.trc[0].isValid() || !space.trc[1].isValid() || !space.trc[2].isValid())
+        return;
+
+    // When all three curves are identical, channels 1 and 2 share the table built for channel 0.
+    const bool sharedCurve = space.trc[0] == space.trc[1] && space.trc[0] == space.trc[2];
+    space.lut[0].reset(lutFromTrc(space.trc[0]));
+    for (int ch = 1; ch < 3; ++ch) {
+        if (sharedCurve)
+            space.lut[ch] = space.lut[0];
+        else
+            space.lut[ch].reset(lutFromTrc(space.trc[ch]));
+    }
+    space.lutsGenerated.storeRelease(1); // set the flag only after all three tables are assigned
+}
+
+/*!
+    \class QColorTransform
+    \brief The QColorTransform class is a transformation between color spaces.
+    \since 5.14
+
+    \ingroup painting
+    \ingroup appearance
+    \inmodule QtGui
+
+    QColorTransform is an instantiation of a transformation between color spaces.
+    It can be applied to colors and pixels to convert them from one color space to
+    another.
+
+    Setting up a QColorTransform takes some preprocessing, so keeping around
+    QColorTransforms that you need often is recommended, instead of generating
+    them on the fly.
+*/
+
+
+// Intentionally empty: the destructor performs no explicit cleanup.
+QColorTransform::~QColorTransform() noexcept
+{}
+
+/*!
+    Applies the color transformation to the QRgb value \a argb.
+
+    The input should be opaque or unpremultiplied.
+*/
+QRgb QColorTransform::map(const QRgb &argb) const
+{
+    if (!d_ptr)
+        return argb; // no transform data attached: pass the value through
+    Q_D(const QColorTransform);
+    const auto &out = *d->colorSpaceOut;
+    const auto clamp01 = [](float v) { return std::max(0.0f, std::min(1.0f, v)); };
+    // Scale the 8-bit channels by 1/255 and run them through the input TRCs.
+    constexpr float toUnit = 1.0f / 255.0f;
+    QColorVector linear = { qRed(argb) * toUnit, qGreen(argb) * toUnit, qBlue(argb) * toUnit };
+    linear.x = d->colorSpaceIn->trc[0].apply(linear.x);
+    linear.y = d->colorSpaceIn->trc[1].apply(linear.y);
+    linear.z = d->colorSpaceIn->trc[2].apply(linear.z);
+    // Apply the color matrix, then clamp each component to [0, 1].
+    linear = d->colorMatrix.map(linear);
+    linear.x = clamp01(linear.x);
+    linear.y = clamp01(linear.y);
+    linear.z = clamp01(linear.z);
+    // Encode with the output LUTs once they have been generated, otherwise with the inverse TRCs.
+    const bool useLut = out.lutsGenerated.loadAcquire();
+    const float r = useLut ? out.lut[0]->fromLinear(linear.x) : out.trc[0].applyInverse(linear.x);
+    const float g = useLut ? out.lut[1]->fromLinear(linear.y) : out.trc[1].applyInverse(linear.y);
+    const float b = useLut ? out.lut[2]->fromLinear(linear.z) : out.trc[2].applyInverse(linear.z);
+    // Round to the nearest 8-bit value; alpha is copied from the input.
+    return qRgba(r * 255 + 0.5f, g * 255 + 0.5f, b * 255 + 0.5f, qAlpha(argb));
+}
+
+/*!
+    Applies the color transformation to the QRgba64 value \a rgba64.
+
+    The input should be opaque or unpremultiplied.
+*/
+QRgba64 QColorTransform::map(const QRgba64 &rgba64) const
+{
+    if (!d_ptr)
+        return rgba64; // no transform data attached: pass the value through
+    Q_D(const QColorTransform);
+    constexpr float f = 1.0f / 65535.0f;
+    QColorVector c = { rgba64.red() * f, rgba64.green() * f, rgba64.blue() * f };
+    c.x = d->colorSpaceIn->trc[0].apply(c.x); // run each channel through the input TRC
+    c.y = d->colorSpaceIn->trc[1].apply(c.y);
+    c.z = d->colorSpaceIn->trc[2].apply(c.z);
+    c = d->colorMatrix.map(c);
+    c.x = std::max(0.0f, std::min(1.0f, c.x)); // clamp to [0, 1] before encoding
+    c.y = std::max(0.0f, std::min(1.0f, c.y));
+    c.z = std::max(0.0f, std::min(1.0f, c.z));
+    if (d->colorSpaceOut->lutsGenerated.loadAcquire()) { // use the output LUTs once they exist
+        c.x = d->colorSpaceOut->lut[0]->fromLinear(c.x);
+        c.y = d->colorSpaceOut->lut[1]->fromLinear(c.y);
+        c.z = d->colorSpaceOut->lut[2]->fromLinear(c.z);
+    } else { // otherwise apply the inverse TRCs directly
+        c.x = d->colorSpaceOut->trc[0].applyInverse(c.x);
+        c.y = d->colorSpaceOut->trc[1].applyInverse(c.y);
+        c.z = d->colorSpaceOut->trc[2].applyInverse(c.z);
+    }
+    // Round to the nearest 16-bit value, as the QRgb overload does, instead of truncating.
+    return QRgba64::fromRgba64(c.x * 65535.f + 0.5f, c.y * 65535.f + 0.5f, c.z * 65535.f + 0.5f, rgba64.alpha());
+}
+
+/*!
+    Applies the color transformation to the QColor value \a color.
+*/
+QColor QColorTransform::map(const QColor &color) const
+{
+    if (!d_ptr)
+        return color;
+    Q_D(const QColorTransform);
+    QColorVector c = { (float)color.redF(), (float)color.greenF(), (float)color.blueF() };
+    c.x = d->colorSpaceIn->trc[0].apply(c.x);
+    c.y = d->colorSpaceIn->trc[1].apply(c.y);
+    c.z = d->colorSpaceIn->trc[2].apply(c.z);
+    c = d->colorMatrix.map(c); // unlike the integer overloads, the result is not clamped here
+    const bool inGamut = c.x >= 0.0f && c.x <= 1.0f && c.y >= 0.0f && c.y <= 1.0f && c.z >= 0.0f && c.z <= 1.0f;
+    if (inGamut && d->colorSpaceOut->lutsGenerated.loadAcquire()) { // LUTs only ever see values in [0, 1]
+        c.x = d->colorSpaceOut->lut[0]->fromLinear(c.x);
+        c.y = d->colorSpaceOut->lut[1]->fromLinear(c.y);
+        c.z = d->colorSpaceOut->lut[2]->fromLinear(c.z);
+    } else { // out of gamut (or NaN), or no LUTs yet: apply the inverse TRCs directly
+        c.x = d->colorSpaceOut->trc[0].applyInverse(c.x);
+        c.y = d->colorSpaceOut->trc[1].applyInverse(c.y);
+        c.z = d->colorSpaceOut->trc[2].applyInverse(c.z);
+    }
+    QColor out;
+    out.setRgbF(c.x, c.y, c.z, color.alphaF());
+    return out;
+}
+
+// Optimized sub-routines for fast block based conversion:
+
+static void applyMatrix(QColorVector *buffer, const qsizetype len, const QColorMatrix &colorMatrix)
+{ // Maps buffer[0..len) through colorMatrix in place, clamping every component to [0, 1].
+#if defined(__SSE2__)
+    const __m128 minV = _mm_set1_ps(0.0f);
+    const __m128 maxV = _mm_set1_ps(1.0f);
+    const __m128 xMat = _mm_loadu_ps(&colorMatrix.r.x); // four floats are read from each matrix member;
+    const __m128 yMat = _mm_loadu_ps(&colorMatrix.g.x); // assumes QColorVector is 16 bytes wide
+    const __m128 zMat = _mm_loadu_ps(&colorMatrix.b.x); // (x, y, z plus padding) - TODO confirm
+    for (qsizetype j = 0; j < len; ++j) {
+        __m128 c = _mm_loadu_ps(&buffer[j].x);
+        __m128 cx = _mm_shuffle_ps(c, c, _MM_SHUFFLE(0, 0, 0, 0)); // x broadcast to all lanes
+        __m128 cy = _mm_shuffle_ps(c, c, _MM_SHUFFLE(1, 1, 1, 1)); // y broadcast to all lanes
+        __m128 cz = _mm_shuffle_ps(c, c, _MM_SHUFFLE(2, 2, 2, 2)); // z broadcast to all lanes
+        cx = _mm_mul_ps(cx, xMat);
+        cy = _mm_mul_ps(cy, yMat);
+        cz = _mm_mul_ps(cz, zMat);
+        cx = _mm_add_ps(cx, cy); // x * r + y * g ...
+        cx = _mm_add_ps(cx, cz); // ... + z * b
+        // Clamp:
+        cx = _mm_min_ps(cx, maxV);
+        cx = _mm_max_ps(cx, minV);
+        _mm_storeu_ps(&buffer[j].x, cx);
+    }
+#else
+    for (qsizetype j = 0; j < len; ++j) { // index type matches len and the SSE2 loop above
+        const QColorVector cv = colorMatrix.map(buffer[j]);
+        buffer[j].x = std::max(0.0f, std::min(1.0f, cv.x));
+        buffer[j].y = std::max(0.0f, std::min(1.0f, cv.y));
+        buffer[j].z = std::max(0.0f, std::min(1.0f, cv.z));
+    }
+#endif
+}
+
+template<typename T> // declared here; defined per build configuration (SSE2 or scalar) further down
+static void loadPremultiplied(QColorVector *buffer, const T *src, const qsizetype len, const QColorTransformPrivate *d_ptr);
+template<typename T> // counterpart of loadPremultiplied() for unpremultiplied source pixels
+static void loadUnpremultiplied(QColorVector *buffer, const T *src, const qsizetype len, const QColorTransformPrivate *d_ptr);
+
+#if defined(__SSE2__)
+// Loads one pixel into \a v as four 32-bit lanes; values are left unscaled, i.e. in [0, alpha] for premultiplied input.
+template<typename T>
+static inline void loadP(const T &p, __m128i &v);
+
+template<>
+inline void loadP<QRgb>(const QRgb &p, __m128i &v)
+{ // Widens the four bytes of a QRgb to four 32-bit lanes, lowest byte in lane 0.
+    v = _mm_cvtsi32_si128(p); // the 32-bit pixel goes into the lowest lane
+#if defined(__SSE4_1__)
+    v = _mm_cvtepu8_epi32(v); // zero-extend 8 -> 32 bits in one step
+#else
+    v = _mm_unpacklo_epi8(v, _mm_setzero_si128()); // zero-extend 8 -> 16 bits ...
+    v = _mm_unpacklo_epi16(v, _mm_setzero_si128()); // ... then 16 -> 32 bits
+#endif
+}
+
+template<>
+inline void loadP<QRgba64>(const QRgba64 &p, __m128i &v)
+{ // Widens the four 16-bit channels of a QRgba64 to 32-bit lanes and reorders them to match loadP<QRgb>().
+    v = _mm_loadl_epi64((const __m128i *)&p); // 64-bit pixel into the low half of the register
+#if defined(__SSE4_1__)
+    v = _mm_cvtepu16_epi32(v); // zero-extend 16 -> 32 bits
+#else
+    v = _mm_unpacklo_epi16(v, _mm_setzero_si128()); // zero-extend 16 -> 32 bits
+#endif
+    // Shuffle to ARGB as the template below expects it
+    v = _mm_shuffle_epi32(v, _MM_SHUFFLE(3, 0, 1, 2)); // swaps lanes 0 and 2; lanes 1 and 3 stay put
+}
+
+template<typename T>
+static void loadPremultiplied(QColorVector *buffer, const T *src, const qsizetype len, const QColorTransformPrivate *d_ptr)
+{ // SSE2 path: divides each pixel by its alpha, then maps it through the input LUTs' m_toLinear tables.
+    const __m128 v4080 = _mm_set1_ps(4080.f); // scales a [0, 1] value to a table index
+    const __m128 iFF00 = _mm_set1_ps(1.0f / (255 * 256)); // scales a table entry back to float
+    for (qsizetype i = 0; i < len; ++i) {
+        __m128i v;
+        loadP<T>(src[i], v);
+        __m128 vf = _mm_cvtepi32_ps(v);
+        // Approximate 1/a (reciprocal estimate refined by one Newton-Raphson step):
+        __m128 va = _mm_shuffle_ps(vf, vf, _MM_SHUFFLE(3, 3, 3, 3)); // lane 3 (alpha) broadcast to all lanes
+        __m128 via = _mm_rcp_ps(va);
+        via = _mm_sub_ps(_mm_add_ps(via, via), _mm_mul_ps(via, _mm_mul_ps(via, va))); // 2 * via - via * via * va
+        // v * (1/a)
+        vf = _mm_mul_ps(vf, via);
+
+        // Handle zero alpha
+        __m128 vAlphaMask = _mm_cmpeq_ps(va, _mm_set1_ps(0.0f));
+        vf = _mm_andnot_ps(vAlphaMask, vf); // clears the result wherever alpha compared equal to 0
+
+        // LUT
+        v = _mm_cvtps_epi32(_mm_mul_ps(vf, v4080)); // integer table indices
+        const int ridx = _mm_extract_epi16(v, 4); // low 16 bits of lane 2
+        const int gidx = _mm_extract_epi16(v, 2); // low 16 bits of lane 1
+        const int bidx = _mm_extract_epi16(v, 0); // low 16 bits of lane 0
+        v = _mm_insert_epi16(v, d_ptr->colorSpaceIn->lut[0]->m_toLinear[ridx], 0); // lane 0 ends up in buffer[i].x
+        v = _mm_insert_epi16(v, d_ptr->colorSpaceIn->lut[1]->m_toLinear[gidx], 2);
+        v = _mm_insert_epi16(v, d_ptr->colorSpaceIn->lut[2]->m_toLinear[bidx], 4);
+        vf = _mm_mul_ps(_mm_cvtepi32_ps(v), iFF00);
+
+        _mm_storeu_ps(&buffer[i].x, vf); // writes four floats starting at x
+    }
+}
+
+// Loads one pixel into \a v as four 32-bit lanes, each scaled to the table index range [0, 4080].
+template<typename T>
+static inline void loadPU(const T &p, __m128i &v);
+
+template<>
+inline void loadPU<QRgb>(const QRgb &p, __m128i &v)
+{ // Widens the four bytes of a QRgb to 32-bit lanes and scales each from [0, 255] to [0, 4080].
+    v = _mm_cvtsi32_si128(p); // the 32-bit pixel goes into the lowest lane
+#if defined(__SSE4_1__)
+    v = _mm_cvtepu8_epi32(v); // zero-extend 8 -> 32 bits in one step
+#else
+    v = _mm_unpacklo_epi8(v, _mm_setzero_si128()); // zero-extend 8 -> 16 bits ...
+    v = _mm_unpacklo_epi16(v, _mm_setzero_si128()); // ... then 16 -> 32 bits
+#endif
+    v = _mm_slli_epi32(v, 4); // multiply by 16: 255 * 16 == 4080
+}
+
+template<>
+inline void loadPU<QRgba64>(const QRgba64 &p, __m128i &v)
+{ // Scales the four 16-bit channels of a QRgba64 to [0, 4080] in 32-bit lanes, reordered to match loadPU<QRgb>().
+    v = _mm_loadl_epi64((const __m128i *)&p); // 64-bit pixel into the low half of the register
+    v = _mm_sub_epi16(v, _mm_srli_epi16(v, 8)); // x - (x >> 8): maps 65535 to 65280 (255 * 256)
+#if defined(__SSE4_1__)
+    v = _mm_cvtepu16_epi32(v); // zero-extend 16 -> 32 bits
+#else
+    v = _mm_unpacklo_epi16(v, _mm_setzero_si128()); // zero-extend 16 -> 32 bits
+#endif
+    v = _mm_srli_epi32(v, 4); // divide by 16: 65280 / 16 == 4080
+    // Shuffle to ARGB as the template below expects it
+    v = _mm_shuffle_epi32(v, _MM_SHUFFLE(3, 0, 1, 2)); // swaps lanes 0 and 2; lanes 1 and 3 stay put
+}
+
+template<typename T>
+void loadUnpremultiplied(QColorVector *buffer, const T *src, const qsizetype len, const QColorTransformPrivate *d_ptr)
+{ // SSE2 path: maps each unpremultiplied pixel through the input LUTs' m_toLinear tables.
+    const __m128 iFF00 = _mm_set1_ps(1.0f / (255 * 256)); // scales a table entry back to float
+    for (qsizetype i = 0; i < len; ++i) {
+        __m128i v;
+        loadPU<T>(src[i], v); // lanes already hold table indices
+        const int ridx = _mm_extract_epi16(v, 4); // low 16 bits of lane 2
+        const int gidx = _mm_extract_epi16(v, 2); // low 16 bits of lane 1
+        const int bidx = _mm_extract_epi16(v, 0); // low 16 bits of lane 0
+        v = _mm_insert_epi16(v, d_ptr->colorSpaceIn->lut[0]->m_toLinear[ridx], 0); // lane 0 ends up in buffer[i].x
+        v = _mm_insert_epi16(v, d_ptr->colorSpaceIn->lut[1]->m_toLinear[gidx], 2);
+        v = _mm_insert_epi16(v, d_ptr->colorSpaceIn->lut[2]->m_toLinear[bidx], 4);
+        __m128 vf = _mm_mul_ps(_mm_cvtepi32_ps(v), iFF00);
+        _mm_storeu_ps(&buffer[i].x, vf); // writes four floats starting at x
+    }
+}
+
+#else
+template<>
+void loadPremultiplied<QRgb>(QColorVector *buffer, const QRgb *src, const qsizetype len, const QColorTransformPrivate *d_ptr)
+{
+    // Scalar path: divide each channel by alpha, then look the result up in the input LUTs.
+    constexpr float tableScale = 1.0f / (255 * 256);
+    for (qsizetype n = 0; n < len; ++n) {
+        const QRgb pixel = src[n];
+        QColorVector &linear = buffer[n];
+        const int alpha = qAlpha(pixel);
+        if (!alpha) { // zero alpha: store zeros instead of dividing by zero
+            linear.x = linear.y = linear.z = 0.0f;
+            continue;
+        }
+        const float toIndex = 4080.0f / alpha; // maps [0, alpha] onto table indices [0, 4080]
+        linear.x = d_ptr->colorSpaceIn->lut[0]->m_toLinear[int(qRed(pixel) * toIndex + 0.5f)] * tableScale;
+        linear.y = d_ptr->colorSpaceIn->lut[1]->m_toLinear[int(qGreen(pixel) * toIndex + 0.5f)] * tableScale;
+        linear.z = d_ptr->colorSpaceIn->lut[2]->m_toLinear[int(qBlue(pixel) * toIndex + 0.5f)] * tableScale;
+    }
+}
+
+template<>
+void loadPremultiplied<QRgba64>(QColorVector *buffer, const QRgba64 *src, const qsizetype len, const QColorTransformPrivate *d_ptr)
+{
+    // Scalar path: divide each channel by alpha, then look the result up in the input LUTs.
+    constexpr float tableScale = 1.0f / (255 * 256);
+    for (qsizetype n = 0; n < len; ++n) {
+        const QRgba64 &pixel = src[n];
+        QColorVector &linear = buffer[n];
+        const int alpha = pixel.alpha();
+        if (!alpha) { // zero alpha: store zeros instead of dividing by zero
+            linear.x = linear.y = linear.z = 0.0f;
+            continue;
+        }
+        const float toIndex = 4080.0f / alpha; // maps [0, alpha] onto table indices [0, 4080]
+        linear.x = d_ptr->colorSpaceIn->lut[0]->m_toLinear[int(pixel.red() * toIndex + 0.5f)] * tableScale;
+        linear.y = d_ptr->colorSpaceIn->lut[1]->m_toLinear[int(pixel.green() * toIndex + 0.5f)] * tableScale;
+        linear.z = d_ptr->colorSpaceIn->lut[2]->m_toLinear[int(pixel.blue() * toIndex + 0.5f)] * tableScale;
+    }
+}
+
+template<>
+void loadUnpremultiplied<QRgb>(QColorVector *buffer, const QRgb *src, const qsizetype len, const QColorTransformPrivate *d_ptr)
+{
+    for (qsizetype n = 0; n < len; ++n, ++buffer) { // scalar path: convert channel by channel via the input LUT objects
+        const QRgb pixel = src[n];
+        buffer->x = d_ptr->colorSpaceIn->lut[0]->u8ToLinearF32(qRed(pixel));
+        buffer->y = d_ptr->colorSpaceIn->lut[1]->u8ToLinearF32(qGreen(pixel));
+        buffer->z = d_ptr->colorSpaceIn->lut[2]->u8ToLinearF32(qBlue(pixel));
+    }
+}
+
+template<>
+void loadUnpremultiplied<QRgba64>(QColorVector *buffer, const QRgba64 *src, const qsizetype len, const QColorTransformPrivate *d_ptr)
+{
+    for (qsizetype n = 0; n < len; ++n, ++buffer) { // scalar path: convert channel by channel via the input LUT objects
+        const QRgba64 &pixel = src[n];
+        buffer->x = d_ptr->colorSpaceIn->lut[0]->u16ToLinearF32(pixel.red());
+        buffer->y = d_ptr->colorSpaceIn->lut[1]->u16ToLinearF32(pixel.green());
+        buffer->z = d_ptr->colorSpaceIn->lut[2]->u16ToLinearF32(pixel.blue());
+    }
+}
+#endif
+
+static void storePremultiplied(QRgb *dst, const QRgb *src, const QColorVector *buffer, const qsizetype len,
+                               const QColorTransformPrivate *d_ptr)
+{ // Encodes \a len vectors through the output LUTs and writes them to \a dst multiplied by the alpha read from \a src.
+#if defined(__SSE2__)
+    const __m128 v4080 = _mm_set1_ps(4080.f); // scales a [0, 1] value to a table index
+    const __m128 iFF00 = _mm_set1_ps(1.0f / (255 * 256));
+    for (qsizetype i = 0; i < len; ++i) {
+        const int a = qAlpha(src[i]);
+        __m128 vf = _mm_loadu_ps(&buffer[i].x);
+        __m128i v = _mm_cvtps_epi32(_mm_mul_ps(vf, v4080)); // integer table indices for x, y, z
+        __m128 va = _mm_set1_ps(a);
+        va = _mm_mul_ps(va, iFF00); // alpha / (255 * 256)
+        const int ridx = _mm_extract_epi16(v, 0); // index from lane 0 (x)
+        const int gidx = _mm_extract_epi16(v, 2); // index from lane 1 (y)
+        const int bidx = _mm_extract_epi16(v, 4); // index from lane 2 (z)
+        v = _mm_insert_epi16(v, d_ptr->colorSpaceOut->lut[0]->m_fromLinear[ridx], 4); // red goes to lane 2 ...
+        v = _mm_insert_epi16(v, d_ptr->colorSpaceOut->lut[1]->m_fromLinear[gidx], 2);
+        v = _mm_insert_epi16(v, d_ptr->colorSpaceOut->lut[2]->m_fromLinear[bidx], 0); // ... and blue to lane 0
+        vf = _mm_cvtepi32_ps(v);
+        vf = _mm_mul_ps(vf, va); // table value * alpha / (255 * 256)
+        v = _mm_cvtps_epi32(vf);
+        v = _mm_packs_epi32(v, v); // 32 -> 16 bits
+        v = _mm_insert_epi16(v, a, 3); // alpha is written as read, overwriting whatever lane 3 held
+        v = _mm_packus_epi16(v, v); // 16 -> 8 bits
+        dst[i] = _mm_cvtsi128_si32(v);
+    }
+#else
+    for (qsizetype i = 0; i < len; ++i) {
+        const int a = qAlpha(src[i]);
+        const float fa = a / (255.0f * 256.0f); // alpha / (255 * 256), as in the SSE2 branch
+        const float r = d_ptr->colorSpaceOut->lut[0]->m_fromLinear[int(buffer[i].x * 4080.0f + 0.5f)];
+        const float g = d_ptr->colorSpaceOut->lut[1]->m_fromLinear[int(buffer[i].y * 4080.0f + 0.5f)];
+        const float b = d_ptr->colorSpaceOut->lut[2]->m_fromLinear[int(buffer[i].z * 4080.0f + 0.5f)];
+        dst[i] = qRgba(r * fa + 0.5f, g * fa + 0.5f, b * fa + 0.5f, a); // + 0.5f rounds to nearest
+    }
+#endif
+}
+
+static void storeUnpremultiplied(QRgb *dst, const QRgb *src, const QColorVector *buffer, const qsizetype len,
+                                 const QColorTransformPrivate *d_ptr)
+{ // Encodes \a len vectors through the output LUTs and writes them to \a dst with the alpha read from \a src.
+#if defined(__SSE2__)
+    const __m128 v4080 = _mm_set1_ps(4080.f); // scales a [0, 1] value to a table index
+    for (qsizetype i = 0; i < len; ++i) {
+        const int a = qAlpha(src[i]);
+        __m128 vf = _mm_loadu_ps(&buffer[i].x);
+        __m128i v = _mm_cvtps_epi32(_mm_mul_ps(vf, v4080)); // integer table indices for x, y, z
+        const int ridx = _mm_extract_epi16(v, 0); // index from lane 0 (x)
+        const int gidx = _mm_extract_epi16(v, 2); // index from lane 1 (y)
+        const int bidx = _mm_extract_epi16(v, 4); // index from lane 2 (z)
+        v = _mm_setzero_si128();
+        v = _mm_insert_epi16(v, d_ptr->colorSpaceOut->lut[0]->m_fromLinear[ridx], 2); // 16-bit slots: B, G, R from slot 0 up
+        v = _mm_insert_epi16(v, d_ptr->colorSpaceOut->lut[1]->m_fromLinear[gidx], 1);
+        v = _mm_insert_epi16(v, d_ptr->colorSpaceOut->lut[2]->m_fromLinear[bidx], 0);
+        v = _mm_add_epi16(v, _mm_set1_epi16(0x80)); // add half of 256 so the shift below rounds
+        v = _mm_srli_epi16(v, 8); // keep the high byte of each 16-bit table value
+        v = _mm_insert_epi16(v, a, 3); // alpha is written as read
+        v = _mm_packus_epi16(v, v); // 16 -> 8 bits
+        dst[i] = _mm_cvtsi128_si32(v);
+    }
+#else
+    for (qsizetype i = 0; i < len; ++i) {
+        const int r = d_ptr->colorSpaceOut->lut[0]->u8FromLinearF32(buffer[i].x);
+        const int g = d_ptr->colorSpaceOut->lut[1]->u8FromLinearF32(buffer[i].y);
+        const int b = d_ptr->colorSpaceOut->lut[2]->u8FromLinearF32(buffer[i].z);
+        dst[i] = (src[i] & 0xff000000) | (r << 16) | (g << 8) | (b << 0); // top byte (alpha) is kept from src
+    }
+#endif
+}
+
+static void storeOpaque(QRgb *dst, const QRgb *src, const QColorVector *buffer, const qsizetype len,
+                        const QColorTransformPrivate *d_ptr)
+{ // Encodes \a len vectors through the output LUTs and writes them to \a dst with alpha forced to 255.
+    Q_UNUSED(src); // present only so the signature matches the other store functions
+#if defined(__SSE2__)
+    const __m128 v4080 = _mm_set1_ps(4080.f); // scales a [0, 1] value to a table index
+    for (qsizetype i = 0; i < len; ++i) {
+        __m128 vf = _mm_loadu_ps(&buffer[i].x);
+        __m128i v = _mm_cvtps_epi32(_mm_mul_ps(vf, v4080)); // integer table indices for x, y, z
+        const int ridx = _mm_extract_epi16(v, 0); // index from lane 0 (x)
+        const int gidx = _mm_extract_epi16(v, 2); // index from lane 1 (y)
+        const int bidx = _mm_extract_epi16(v, 4); // index from lane 2 (z)
+        v = _mm_setzero_si128();
+        v = _mm_insert_epi16(v, d_ptr->colorSpaceOut->lut[0]->m_fromLinear[ridx], 2); // 16-bit slots: B, G, R from slot 0 up
+        v = _mm_insert_epi16(v, d_ptr->colorSpaceOut->lut[1]->m_fromLinear[gidx], 1);
+        v = _mm_insert_epi16(v, d_ptr->colorSpaceOut->lut[2]->m_fromLinear[bidx], 0);
+        v = _mm_add_epi16(v, _mm_set1_epi16(0x80)); // add half of 256 so the shift below rounds
+        v = _mm_srli_epi16(v, 8); // keep the high byte of each 16-bit table value
+        v = _mm_insert_epi16(v, 255, 3); // fully opaque alpha
+        v = _mm_packus_epi16(v, v); // 16 -> 8 bits
+        dst[i] = _mm_cvtsi128_si32(v);
+    }
+#else
+    for (qsizetype i = 0; i < len; ++i) {
+        const int r = d_ptr->colorSpaceOut->lut[0]->u8FromLinearF32(buffer[i].x);
+        const int g = d_ptr->colorSpaceOut->lut[1]->u8FromLinearF32(buffer[i].y);
+        const int b = d_ptr->colorSpaceOut->lut[2]->u8FromLinearF32(buffer[i].z);
+        dst[i] = 0xff000000 | (r << 16) | (g << 8) | (b << 0); // top byte (alpha) is always 0xff
+    }
+#endif
+}
+
+static void storePremultiplied(QRgba64 *dst, const QRgba64 *src, const QColorVector *buffer, const qsizetype len,
+                               const QColorTransformPrivate *d_ptr)
+{
+    for (qsizetype n = 0; n < len; ++n, ++buffer) { // encode through the output tables, then multiply by alpha
+        const int alpha = src[n].alpha();
+        const float premul = alpha / (255.0f * 256.0f);
+        const float red = d_ptr->colorSpaceOut->lut[0]->m_fromLinear[int(buffer->x * 4080.0f + 0.5f)];
+        const float green = d_ptr->colorSpaceOut->lut[1]->m_fromLinear[int(buffer->y * 4080.0f + 0.5f)];
+        const float blue = d_ptr->colorSpaceOut->lut[2]->m_fromLinear[int(buffer->z * 4080.0f + 0.5f)];
+        dst[n] = qRgba64(red * premul + 0.5f, green * premul + 0.5f, blue * premul + 0.5f, alpha);
+    }
+}
+
+static void storeUnpremultiplied(QRgba64 *dst, const QRgba64 *src, const QColorVector *buffer, const qsizetype len,
+                                 const QColorTransformPrivate *d_ptr)
+{
+    for (qsizetype n = 0; n < len; ++n, ++buffer) { // encode via the output LUT objects; alpha is copied from src
+        const int red = d_ptr->colorSpaceOut->lut[0]->u16FromLinearF32(buffer->x);
+        const int green = d_ptr->colorSpaceOut->lut[1]->u16FromLinearF32(buffer->y);
+        const int blue = d_ptr->colorSpaceOut->lut[2]->u16FromLinearF32(buffer->z);
+        dst[n] = qRgba64(red, green, blue, src[n].alpha());
+    }
+}
+
+static void storeOpaque(QRgba64 *dst, const QRgba64 *src, const QColorVector *buffer, const qsizetype len,
+                        const QColorTransformPrivate *d_ptr)
+{
+    Q_UNUSED(src); // alpha is forced to 0xFFFF below, so the source pixels are never read
+    for (qsizetype n = 0; n < len; ++n, ++buffer) {
+        const int red = d_ptr->colorSpaceOut->lut[0]->u16FromLinearF32(buffer->x);
+        const int green = d_ptr->colorSpaceOut->lut[1]->u16FromLinearF32(buffer->y);
+        const int blue = d_ptr->colorSpaceOut->lut[2]->u16FromLinearF32(buffer->z);
+        dst[n] = qRgba64(red, green, blue, 0xFFFF);
+    }
+}
+
+static constexpr qsizetype WorkBlockSize = 256; // pixels converted per pass; sizes the scratch buffer in apply()
+
+template<typename T>
+void QColorTransformPrivate::apply(T *dst, const T *src, qsizetype count, TransformFlags flags) const
+{
+    if (!colorMatrix.isValid())
+        return;
+
+    // Make sure both sets of lookup tables exist before the load/store helpers use them.
+    updateLutsIn();
+    updateLutsOut();
+
+    const bool needsMatrix = (colorMatrix != QColorMatrix::identity());
+    QColorVector linear[WorkBlockSize];
+    // Convert in chunks that fit the fixed-size scratch buffer above.
+    for (qsizetype done = 0, len = 0; done < count; done += len) {
+        len = qMin(count - done, WorkBlockSize);
+        const T *in = src + done;
+        T *out = dst + done;
+        if (flags & InputPremultiplied)
+            loadPremultiplied(linear, in, len, this);
+        else
+            loadUnpremultiplied(linear, in, len, this);
+
+        if (needsMatrix)
+            applyMatrix(linear, len, colorMatrix);
+
+        if (flags & InputOpaque) // checked first, so it overrides OutputPremultiplied
+            storeOpaque(out, in, linear, len, this);
+        else if (flags & OutputPremultiplied)
+            storePremultiplied(out, in, linear, len, this);
+        else
+            storeUnpremultiplied(out, in, linear, len, this);
+    }
+}
+
+/*!
+    \internal
+    \enum QColorTransformPrivate::TransformFlag
+
+    Defines how the transform is to be applied.
+
+    \value Unpremultiplied The input and output should both be unpremultiplied.
+    \value InputOpaque The input is guaranteed to be opaque.
+    \value InputPremultiplied The input is premultiplied.
+    \value OutputPremultiplied The output should be premultiplied.
+    \value Premultiplied Both input and output should be premultiplied.
+*/
+
+/*!
+    \internal
+    Prepares a color transformation for fast application by updating the lookup
+    tables of both color spaces. Calling this explicitly is optional, since apply()
+    performs the same updates itself, but doing it in advance gives predictable
+    performance on the first transform.
+
+    \sa QColorTransform::map(), apply()
+*/
+void QColorTransformPrivate::prepare()
+{
+    updateLutsIn(); // tables for the input color space
+    updateLutsOut(); // tables for the output color space
+}
+
+/*!
+    \internal
+    Applies the color transformation to \a count QRgb pixels starting from
+    \a src and stores the result in \a dst. Does nothing if the color matrix is invalid.
+
+    Thread-safe if prepare() has been called first.
+
+    Assumes unpremultiplied data by default. Set \a flags to change defaults.
+
+    \sa prepare()
+*/
+void QColorTransformPrivate::apply(QRgb *dst, const QRgb *src, qsizetype count, TransformFlags flags) const
+{
+    apply<QRgb>(dst, src, count, flags); // forwards to the block-wise template implementation
+}
+
+/*!
+    \internal
+    Applies the color transformation to \a count QRgba64 pixels starting from
+    \a src and stores the result in \a dst. Does nothing if the color matrix is invalid.
+
+    Thread-safe if prepare() has been called first.
+
+    Assumes unpremultiplied data by default. Set \a flags to change defaults.
+
+    \sa prepare()
+*/
+void QColorTransformPrivate::apply(QRgba64 *dst, const QRgba64 *src, qsizetype count, TransformFlags flags) const
+{
+    apply<QRgba64>(dst, src, count, flags); // forwards to the block-wise template implementation
+}
+
+
+QT_END_NAMESPACE