23 files changed, 2103 insertions, 1087 deletions
diff --git a/src/gui/painting/painting.pri b/src/gui/painting/painting.pri
index 86e35c39f8..63e345545c 100644
--- a/src/gui/painting/painting.pri
+++ b/src/gui/painting/painting.pri
@@ -8,6 +8,7 @@ HEADERS += \
         painting/qbrush.h \
         painting/qcolor.h \
         painting/qcolor_p.h \
+        painting/qcolorprofile_p.h \
         painting/qcosmeticstroker_p.h \
         painting/qdatabuffer_p.h \
         painting/qdrawhelper_p.h \
@@ -63,11 +64,11 @@ SOURCES += \
         painting/qblittable.cpp \
         painting/qbrush.cpp \
         painting/qcolor.cpp \
+        painting/qcolorprofile.cpp \
         painting/qcompositionfunctions.cpp \
         painting/qcosmeticstroker.cpp \
         painting/qdrawhelper.cpp \
         painting/qemulationpaintengine.cpp \
-        painting/qgammatables.cpp \
         painting/qgrayraster.c \
         painting/qimagescale.cpp \
         painting/qmatrix.cpp \
diff --git a/src/gui/painting/qbrush.cpp b/src/gui/painting/qbrush.cpp
index ebb035a2c1..cc3ee76f0d 100644
--- a/src/gui/painting/qbrush.cpp
+++ b/src/gui/painting/qbrush.cpp
@@ -1006,7 +1006,7 @@ bool QBrush::operator==(const QBrush &b) const
 */
 QDebug operator<<(QDebug dbg, const QBrush &b)
 {
-    static const char *const BRUSH_STYLES[] = {
+    static const char BRUSH_STYLES[][24] = {
      "NoBrush",
      "SolidPattern",
      "Dense1Pattern",
@@ -1025,7 +1025,7 @@ QDebug operator<<(QDebug dbg, const QBrush &b)
      "LinearGradientPattern",
      "RadialGradientPattern",
      "ConicalGradientPattern",
-     0, 0, 0, 0, 0, 0,
+     "", "", "", "", "", "",
      "TexturePattern" // 24
     };
 
@@ -1419,6 +1419,25 @@ void QGradient::setColorAt(qreal pos, const QColor &color)
         m_stops.insert(index, QGradientStop(pos, color));
 }
 
+static inline bool ok(QGradientStop stop)
+{
+    return stop.first >= 0 && stop.first <= 1; // rejects NaNs
+}
+
+static inline bool ok(const QGradientStops &stops)
+{
+    qreal lastPos = -1;
+    for (const QGradientStop &stop : stops) {
+        if (Q_UNLIKELY(!ok(stop)))
+            return false;
+        const bool sorted = stop.first > lastPos; // rejects duplicates
+        if (Q_UNLIKELY(!sorted))
+            return false;
+        lastPos = stop.first;
+    }
+    return true;
+}
+
 /*!
     \fn void QGradient::setStops(const QGradientStops &stopPoints)
 
@@ -1430,6 +1449,14 @@ void QGradient::setColorAt(qreal pos, const QColor &color)
 */
 void QGradient::setStops(const QGradientStops &stops)
 {
+    // ## Qt 6: consider taking \a stops by value, so we can move into m_stops
+    if (Q_LIKELY(ok(stops))) {
+        // fast path for the common case: if everything is ok with the stops, just copy them
+        m_stops = stops;
+        return;
+    }
+    // otherwise, to keep the pre-5.9 behavior, add them one after another,
+    // so each stop is checked, invalid ones are skipped, and the rest are added in order (which may be O(N^2)).
     m_stops.clear();
     for (int i=0; i<stops.size(); ++i)
         setColorAt(stops.at(i).first, stops.at(i).second);
diff --git a/src/gui/painting/qgammatables.cpp b/src/gui/painting/qcolorprofile.cpp
index 1d76f7ee3c..3b7b0a248b 100644
--- a/src/gui/painting/qgammatables.cpp
+++ b/src/gui/painting/qcolorprofile.cpp
@@ -37,28 +37,51 @@
 **
 ****************************************************************************/
 
-#include <private/qdrawhelper_p.h>
+#include "qcolorprofile_p.h"
+#include <qmath.h>
 
 QT_BEGIN_NAMESPACE
 
+QColorProfile *QColorProfile::fromGamma(qreal gamma)
+{
+    QColorProfile *cp = new QColorProfile;
+
+    for (int i = 0; i <= (255 * 16); ++i) {
+        cp->m_toLinear[i] = ushort(qRound(qPow(i / qreal(255 * 16), gamma) * (255 * 256)));
+        cp->m_fromLinear[i] = ushort(qRound(qPow(i / qreal(255 * 16), qreal(1) / gamma) * (255 * 256)));
+    }
+
+    return cp;
+}
 
-QDrawHelperGammaTables::QDrawHelperGammaTables(qreal smoothing)
+static qreal srgbToLinear(qreal v)
 {
-    const qreal gray_gamma = 2.31;
-    for (int i=0; i<256; ++i)
-        qt_pow_gamma[i] = uint(qRound(qPow(i / qreal(255.), gray_gamma) * 2047));
-    for (int i=0; i<2048; ++i)
-        qt_pow_invgamma[i] = uchar(qRound(qPow(i / qreal(2047.0), 1 / gray_gamma) * 255));
+    const qreal a = 0.055;
+    if (v <= qreal(0.04045))
+        return v / qreal(12.92);
+    else
+        return qPow((v + a) / (qreal(1) + a), qreal(2.4));
+}
 
-    refresh(smoothing);
+static qreal linearToSrgb(qreal v)
+{
+    const qreal a = 0.055;
+    if (v <= qreal(0.0031308))
+        return v * qreal(12.92);
+    else
+        return (qreal(1) + a) * qPow(v, qreal(1.0 / 2.4)) - a;
 }
 
-void QDrawHelperGammaTables::refresh(qreal smoothing)
+QColorProfile *QColorProfile::fromSRgb()
 {
-    for (int i=0; i<256; ++i) {
-        qt_pow_rgb_gamma[i] = uchar(qRound(qPow(i / qreal(255.0), smoothing) * 255));
-        qt_pow_rgb_invgamma[i] = uchar(qRound(qPow(i / qreal(255.), 1 / smoothing) * 255));
+    QColorProfile *cp = new QColorProfile;
+
+    for (int i = 0; i <= (255 * 16); ++i) {
+        cp->m_toLinear[i] = ushort(qRound(srgbToLinear(i / qreal(255 * 16)) * (255 * 256)));
+        cp->m_fromLinear[i] = ushort(qRound(linearToSrgb(i / qreal(255 * 16)) * (255 * 256)));
     }
+
+    return cp;
 }
 
 QT_END_NAMESPACE
diff --git a/src/gui/painting/qcolorprofile_p.h b/src/gui/painting/qcolorprofile_p.h
new file mode 100644
index 0000000000..ca1786ee6d
--- /dev/null
+++ b/src/gui/painting/qcolorprofile_p.h
@@ -0,0 +1,157 @@
+/****************************************************************************
+**
+** Copyright (C) 2016 The Qt Company Ltd.
+** Contact: https://www.qt.io/licensing/
+**
+** This file is part of the QtGui module of the Qt Toolkit.
+**
+** $QT_BEGIN_LICENSE:LGPL$
+** Commercial License Usage
+** Licensees holding valid commercial Qt licenses may use this file in
+** accordance with the commercial license agreement provided with the
+** Software or, alternatively, in accordance with the terms contained in
+** a written agreement between you and The Qt Company. For licensing terms
+** and conditions see https://www.qt.io/terms-conditions. For further
+** information use the contact form at https://www.qt.io/contact-us.
+**
+** GNU Lesser General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU Lesser
+** General Public License version 3 as published by the Free Software
+** Foundation and appearing in the file LICENSE.LGPL3 included in the
+** packaging of this file. Please review the following information to
+** ensure the GNU Lesser General Public License version 3 requirements
+** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
+**
+** GNU General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU
+** General Public License version 2.0 or (at your option) the GNU General
+** Public license version 3 or any later version approved by the KDE Free
+** Qt Foundation. The licenses are as published by the Free Software
+** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
+** included in the packaging of this file. Please review the following
+** information to ensure the GNU General Public License requirements will
+** be met: https://www.gnu.org/licenses/gpl-2.0.html and
+** https://www.gnu.org/licenses/gpl-3.0.html.
+**
+** $QT_END_LICENSE$
+**
+****************************************************************************/
+
+#ifndef QCOLORPROFILE_P_H
+#define QCOLORPROFILE_P_H
+
+//
+//  W A R N I N G
+//  -------------
+//
+// This file is not part of the Qt API.  It exists purely as an
+// implementation detail.  This header file may change from version to
+// version without notice, or even be removed.
+//
+// We mean it.
+//
+
+#include <QtGui/private/qtguiglobal_p.h>
+#include <QtGui/qrgb.h>
+#include <QtGui/qrgba64.h>
+
+QT_BEGIN_NAMESPACE
+
+class Q_GUI_EXPORT QColorProfile
+{
+public:
+    static QColorProfile *fromGamma(qreal gamma);
+    static QColorProfile *fromSRgb();
+
+    // The following methods all convert opaque or unpremultiplied colors:
+
+    QRgba64 toLinear64(QRgb rgb32) const
+    {
+        ushort r = m_toLinear[qRed(rgb32) << 4];
+        ushort g = m_toLinear[qGreen(rgb32) << 4];
+        ushort b = m_toLinear[qBlue(rgb32) << 4];
+        r = r + (r >> 8);
+        g = g + (g >> 8);
+        b = b + (b >> 8);
+        return QRgba64::fromRgba64(r, g, b, qAlpha(rgb32) * 257);
+    }
+
+    QRgb toLinear(QRgb rgb32) const
+    {
+        uchar r = (m_toLinear[qRed(rgb32) << 4] + 0x80) >> 8;
+        uchar g = (m_toLinear[qGreen(rgb32) << 4] + 0x80) >> 8;
+        uchar b = (m_toLinear[qBlue(rgb32) << 4] + 0x80) >> 8;
+        return qRgba(r, g, b, qAlpha(rgb32));
+    }
+
+    QRgba64 toLinear(QRgba64 rgb64) const
+    {
+        ushort r = rgb64.red();
+        ushort g = rgb64.green();
+        ushort b = rgb64.blue();
+        r = r - (r >> 8);
+        g = g - (g >> 8);
+        b = b - (b >> 8);
+        r = m_toLinear[r >> 4];
+        g = m_toLinear[g >> 4];
+        b = m_toLinear[b >> 4];
+        r = r + (r >> 8);
+        g = g + (g >> 8);
+        b = b + (b >> 8);
+        return QRgba64::fromRgba64(r, g, b, rgb64.alpha());
+    }
+
+    QRgb fromLinear64(QRgba64 rgb64) const
+    {
+        ushort r = rgb64.red();
+        ushort g = rgb64.green();
+        ushort b = rgb64.blue();
+        r = r - (r >> 8);
+        g = g - (g >> 8);
+        b = b - (b >> 8);
+        r = (m_fromLinear[r >> 4] + 0x80) >> 8;
+        g = (m_fromLinear[g >> 4] + 0x80) >> 8;
+        b = (m_fromLinear[b >> 4] + 0x80) >> 8;
+        return qRgba(r, g, b, rgb64.alpha8());
+    }
+
+    QRgb fromLinear(QRgb rgb32) const
+    {
+        uchar r = (m_fromLinear[qRed(rgb32) << 4] + 0x80) >> 8;
+        uchar g = (m_fromLinear[qGreen(rgb32) << 4] + 0x80) >> 8;
+        uchar b = (m_fromLinear[qBlue(rgb32) << 4] + 0x80) >> 8;
+        return qRgba(r, g, b, qAlpha(rgb32));
+    }
+
+    QRgba64 fromLinear(QRgba64 rgb64) const
+    {
+        ushort r = rgb64.red();
+        ushort g = rgb64.green();
+        ushort b = rgb64.blue();
+        r = r - (r >> 8);
+        g = g - (g >> 8);
+        b = b - (b >> 8);
+        r = m_fromLinear[r >> 4];
+        g = m_fromLinear[g >> 4];
+        b = m_fromLinear[b >> 4];
+        r = r + (r >> 8);
+        g = g + (g >> 8);
+        b = b + (b >> 8);
+        return QRgba64::fromRgba64(r, g, b, rgb64.alpha());
+    }
+
+private:
+    QColorProfile() { }
+
+    // We translate to 0-65280 (255*256) instead of 0-65535 to make simple
+    // shifting an accurate conversion.
+    // We translate from 0-4080 (255*16) for the same speed up, and to keep
+    // the tables small enough to fit in most inner caches.
+    ushort m_toLinear[(255 * 16) + 1]; // [0-4080] -> [0-65280]
+    ushort m_fromLinear[(255 * 16) + 1]; // [0-4080] -> [0-65280]
+
+};
+
+QT_END_NAMESPACE
+
+#endif // QCOLORPROFILE_P_H
diff --git a/src/gui/painting/qcoregraphics.mm b/src/gui/painting/qcoregraphics.mm
index 3753fa4e88..98fdd7f35e 100644
--- a/src/gui/painting/qcoregraphics.mm
+++ b/src/gui/painting/qcoregraphics.mm
@@ -1,31 +1,37 @@
 /****************************************************************************
 **
-** Copyright (C) 2016 The Qt Company Ltd.
-** Contact: http://www.qt.io/licensing/
+** Copyright (C) 2017 The Qt Company Ltd.
+** Contact: https://www.qt.io/licensing/
 **
 ** This file is part of the QtGui module of the Qt Toolkit.
 **
-** $QT_BEGIN_LICENSE:LGPL21$
+** $QT_BEGIN_LICENSE:LGPL$
 ** Commercial License Usage
 ** Licensees holding valid commercial Qt licenses may use this file in
 ** accordance with the commercial license agreement provided with the
 ** Software or, alternatively, in accordance with the terms contained in
 ** a written agreement between you and The Qt Company. For licensing terms
-** and conditions see http://www.qt.io/terms-conditions. For further
-** information use the contact form at http://www.qt.io/contact-us.
+** and conditions see https://www.qt.io/terms-conditions. For further
+** information use the contact form at https://www.qt.io/contact-us.
 **
 ** GNU Lesser General Public License Usage
 ** Alternatively, this file may be used under the terms of the GNU Lesser
-** General Public License version 2.1 or version 3 as published by the Free
-** Software Foundation and appearing in the file LICENSE.LGPLv21 and
-** LICENSE.LGPLv3 included in the packaging of this file. Please review the
-** following information to ensure the GNU Lesser General Public License
-** requirements will be met: https://www.gnu.org/licenses/lgpl.html and
-** http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
+** General Public License version 3 as published by the Free Software
+** Foundation and appearing in the file LICENSE.LGPL3 included in the
+** packaging of this file. Please review the following information to
+** ensure the GNU Lesser General Public License version 3 requirements
+** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
 **
-** As a special exception, The Qt Company gives you certain additional
-** rights. These rights are described in The Qt Company LGPL Exception
-** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
+** GNU General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU
+** General Public License version 2.0 or (at your option) the GNU General
+** Public license version 3 or any later version approved by the KDE Free
+** Qt Foundation. The licenses are as published by the Free Software
+** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
+** included in the packaging of this file. Please review the following
+** information to ensure the GNU General Public License requirements will
+** be met: https://www.gnu.org/licenses/gpl-2.0.html and
+** https://www.gnu.org/licenses/gpl-3.0.html.
 **
 ** $QT_END_LICENSE$
 **
@@ -39,6 +45,7 @@
 #include <QtGui/private/qpaintengine_p.h>
 #include <QtCore/qdebug.h>
 #include <QtCore/qcoreapplication.h>
+#include <QtCore/qoperatingsystemversion.h>
 
 QT_BEGIN_NAMESPACE
 
@@ -106,29 +113,6 @@ QImage qt_mac_toQImage(CGImageRef image)
 
 #ifdef Q_OS_MACOS
 
-QT_END_NAMESPACE
-
-@interface NSGraphicsContext (QtAdditions)
-
-+ (NSGraphicsContext *)qt_graphicsContextWithCGContext:(CGContextRef)graphicsPort flipped:(BOOL)initialFlippedState;
-
-@end
-
-@implementation NSGraphicsContext (QtAdditions)
-
-+ (NSGraphicsContext *)qt_graphicsContextWithCGContext:(CGContextRef)graphicsPort flipped:(BOOL)initialFlippedState
-{
-#if QT_MAC_PLATFORM_SDK_EQUAL_OR_ABOVE(__MAC_10_10, __IPHONE_NA)
-    if (QT_PREPEND_NAMESPACE(QSysInfo::MacintoshVersion) >= QT_PREPEND_NAMESPACE(QSysInfo::MV_10_10))
-        return [self graphicsContextWithCGContext:graphicsPort flipped:initialFlippedState];
-#endif
-    return [self graphicsContextWithGraphicsPort:graphicsPort flipped:initialFlippedState];
-}
-
-@end
-
-QT_BEGIN_NAMESPACE
-
 static NSImage *qt_mac_cgimage_to_nsimage(CGImageRef image)
 {
     NSImage *newImage = [[NSImage alloc] initWithCGImage:image size:NSZeroSize];
@@ -155,7 +139,7 @@ NSImage *qt_mac_create_nsimage(const QIcon &icon, int defaultSize)
     QList<QSize> availableSizes = icon.availableSizes();
     if (availableSizes.isEmpty() && defaultSize > 0)
         availableSizes << QSize(defaultSize, defaultSize);
-    foreach (QSize size, availableSizes) {
+    for (QSize size : qAsConst(availableSizes)) {
         QPixmap pm = icon.pixmap(size);
         if (pm.isNull())
             continue;
@@ -179,7 +163,7 @@ QPixmap qt_mac_toQPixmap(const NSImage *image, const QSizeF &size)
     QMacCGContext ctx(&pixmap);
     if (!ctx)
         return QPixmap();
-    NSGraphicsContext *gc = [NSGraphicsContext qt_graphicsContextWithCGContext:ctx flipped:YES];
+    NSGraphicsContext *gc = [NSGraphicsContext graphicsContextWithCGContext:ctx flipped:YES];
     if (!gc)
         return QPixmap();
     [NSGraphicsContext saveGraphicsState];
diff --git a/src/gui/painting/qcoregraphics_p.h b/src/gui/painting/qcoregraphics_p.h
index 065910222d..54de3f332e 100644
--- a/src/gui/painting/qcoregraphics_p.h
+++ b/src/gui/painting/qcoregraphics_p.h
@@ -1,31 +1,37 @@
 /****************************************************************************
 **
-** Copyright (C) 2016 The Qt Company Ltd.
-** Contact: http://www.qt.io/licensing/
+** Copyright (C) 2017 The Qt Company Ltd.
+** Contact: https://www.qt.io/licensing/
 **
 ** This file is part of the QtGui module of the Qt Toolkit.
 **
-** $QT_BEGIN_LICENSE:LGPL21$
+** $QT_BEGIN_LICENSE:LGPL$
 ** Commercial License Usage
 ** Licensees holding valid commercial Qt licenses may use this file in
 ** accordance with the commercial license agreement provided with the
 ** Software or, alternatively, in accordance with the terms contained in
 ** a written agreement between you and The Qt Company. For licensing terms
-** and conditions see http://www.qt.io/terms-conditions. For further
-** information use the contact form at http://www.qt.io/contact-us.
+** and conditions see https://www.qt.io/terms-conditions. For further
+** information use the contact form at https://www.qt.io/contact-us.
 **
 ** GNU Lesser General Public License Usage
 ** Alternatively, this file may be used under the terms of the GNU Lesser
-** General Public License version 2.1 or version 3 as published by the Free
-** Software Foundation and appearing in the file LICENSE.LGPLv21 and
-** LICENSE.LGPLv3 included in the packaging of this file. Please review the
-** following information to ensure the GNU Lesser General Public License
-** requirements will be met: https://www.gnu.org/licenses/lgpl.html and
-** http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
+** General Public License version 3 as published by the Free Software
+** Foundation and appearing in the file LICENSE.LGPL3 included in the
+** packaging of this file. Please review the following information to
+** ensure the GNU Lesser General Public License version 3 requirements
+** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
 **
-** As a special exception, The Qt Company gives you certain additional
-** rights. These rights are described in The Qt Company LGPL Exception
-** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
+** GNU General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU
+** General Public License version 2.0 or (at your option) the GNU General
+** Public license version 3 or any later version approved by the KDE Free
+** Qt Foundation. The licenses are as published by the Free Software
+** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
+** included in the packaging of this file. Please review the following
+** information to ensure the GNU General Public License requirements will
+** be met: https://www.gnu.org/licenses/gpl-2.0.html and
+** https://www.gnu.org/licenses/gpl-3.0.html.
 **
 ** $QT_END_LICENSE$
 **
diff --git a/src/gui/painting/qdrawhelper.cpp b/src/gui/painting/qdrawhelper.cpp
index 7b3e1b991d..5c38648fe7 100644
--- a/src/gui/painting/qdrawhelper.cpp
+++ b/src/gui/painting/qdrawhelper.cpp
@@ -43,6 +43,7 @@
 #include <qstylehints.h>
 #include <qguiapplication.h>
 #include <qatomic.h>
+#include <private/qcolorprofile_p.h>
 #include <private/qdrawhelper_p.h>
 #include <private/qpaintengine_raster_p.h>
 #include <private/qpainter_p.h>
@@ -1143,6 +1144,11 @@ static QRgba64 *QT_FASTCALL destFetch64uint32(QRgba64 *buffer, QRasterBuffer *ra
     return const_cast<QRgba64 *>(layout->convertToARGB64PM(buffer, src, length, 0, 0));
 }
 
+static QRgba64 * QT_FASTCALL destFetch64Undefined(QRgba64 *buffer, QRasterBuffer *, int, int, int)
+{
+    return buffer;
+}
+
 static DestFetchProc destFetchProc[QImage::NImageFormats] =
 {
     0,                  // Format_Invalid
@@ -1175,8 +1181,8 @@ static DestFetchProc destFetchProc[QImage::NImageFormats] =
 static DestFetchProc64 destFetchProc64[QImage::NImageFormats] =
 {
     0,                  // Format_Invalid
-    destFetch64,        // Format_Mono,
-    destFetch64,        // Format_MonoLSB
+    0,                  // Format_Mono,
+    0,                  // Format_MonoLSB
     0,                  // Format_Indexed8
     destFetch64uint32,  // Format_RGB32
     destFetch64uint32,  // Format_ARGB32,
@@ -1320,7 +1326,7 @@ static void QT_FASTCALL destStore(QRasterBuffer *rasterBuffer, int x, int y, con
 static void QT_FASTCALL convertFromRgb64(uint *dest, const QRgba64 *src, int length)
 {
     for (int i = 0; i < length; ++i) {
-        dest[i] = src[i].toArgb32();
+        dest[i] = toArgb32(src[i]);
     }
 }
 
@@ -1411,7 +1417,7 @@ static void QT_FASTCALL destStore64ARGB32(QRasterBuffer *rasterBuffer, int x, in
 {
     uint *dest = (uint*)rasterBuffer->scanLine(y) + x;
     for (int i = 0; i < length; ++i) {
-        dest[i] = buffer[i].unpremultiplied().toArgb32();
+        dest[i] = toArgb32(buffer[i].unpremultiplied());
     }
 }
 
@@ -1419,7 +1425,7 @@ static void QT_FASTCALL destStore64RGBA8888(QRasterBuffer *rasterBuffer, int x,
 {
     uint *dest = (uint*)rasterBuffer->scanLine(y) + x;
     for (int i = 0; i < length; ++i) {
-        dest[i] = ARGB2RGBA(buffer[i].unpremultiplied().toArgb32());
+        dest[i] = toRgba8888(buffer[i].unpremultiplied());
     }
 }
 
@@ -1914,562 +1920,695 @@ inline void fetchTransformedBilinear_pixelBounds<BlendTransformedBilinear>(int,
     Q_ASSERT(v2 >= l1 && v2 <= l2);
 }
 
-template<TextureBlendType blendType> /* blendType = BlendTransformedBilinear or BlendTransformedBilinearTiled */
-static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, const Operator *,
-                                                                 const QSpanData *data, int y, int x,
-                                                                 int length)
-{
-    int image_width = data->texture.width;
-    int image_height = data->texture.height;
-
-    int image_x1 = data->texture.x1;
-    int image_y1 = data->texture.y1;
-    int image_x2 = data->texture.x2 - 1;
-    int image_y2 = data->texture.y2 - 1;
-
-    const qreal cx = x + qreal(0.5);
-    const qreal cy = y + qreal(0.5);
-
-    uint *end = buffer + length;
-    uint *b = buffer;
-    if (data->fast_matrix) {
-        // The increment pr x in the scanline
-        int fdx = (int)(data->m11 * fixed_scale);
-        int fdy = (int)(data->m12 * fixed_scale);
-
-        int fx = int((data->m21 * cy
-                      + data->m11 * cx + data->dx) * fixed_scale);
-        int fy = int((data->m22 * cy
-                      + data->m12 * cx + data->dy) * fixed_scale);
-
-        fx -= half_point;
-        fy -= half_point;
-
-        if (fdy == 0) { //simple scale, no rotation
-            int y1 = (fy >> 16);
-            int y2;
-            fetchTransformedBilinear_pixelBounds<blendType>(image_height, image_y1, image_y2, y1, y2);
-            const uint *s1 = (const uint *)data->texture.scanLine(y1);
-            const uint *s2 = (const uint *)data->texture.scanLine(y2);
-
-            if (fdx <= fixed_scale && fdx > 0) { // scale up on X
-                int disty = (fy & 0x0000ffff) >> 8;
-                int idisty = 256 - disty;
-                int x = fx >> 16;
+enum FastTransformTypes {
+    SimpleUpscaleTransform,
+    UpscaleTransform,
+    DownscaleTransform,
+    RotateTransform,
+    FastRotateTransform,
+    NFastTransformTypes
+};
 
-                // The idea is first to do the interpolation between the row s1 and the row s2
-                // into an intermediate buffer, then we interpolate between two pixel of this buffer.
+typedef void (QT_FASTCALL *BilinearFastTransformHelper)(uint *b, uint *end, const QTextureData &image, int &fx, int &fy, int fdx, int fdy);
 
-                // intermediate_buffer[0] is a buffer of red-blue component of the pixel, in the form 0x00RR00BB
-                // intermediate_buffer[1] is the alpha-green component of the pixel, in the form 0x00AA00GG
-                // +1 for the last pixel to interpolate with, and +1 for rounding errors.
-                quint32 intermediate_buffer[2][buffer_size + 2];
-                // count is the size used in the intermediate_buffer.
-                int count = (qint64(length) * fdx + fixed_scale - 1) / fixed_scale + 2;
-                Q_ASSERT(count <= buffer_size + 2); //length is supposed to be <= buffer_size and data->m11 < 1 in this case
-                int f = 0;
-                int lim = count;
-                if (blendType == BlendTransformedBilinearTiled) {
-                    x %= image_width;
-                    if (x < 0) x += image_width;
-                } else {
-                    lim = qMin(count, image_x2-x+1);
-                    if (x < image_x1) {
-                        Q_ASSERT(x <= image_x2);
-                        uint t = s1[image_x1];
-                        uint b = s2[image_x1];
-                        quint32 rb = (((t & 0xff00ff) * idisty + (b & 0xff00ff) * disty) >> 8) & 0xff00ff;
-                        quint32 ag = ((((t>>8) & 0xff00ff) * idisty + ((b>>8) & 0xff00ff) * disty) >> 8) & 0xff00ff;
-                        do {
-                            intermediate_buffer[0][f] = rb;
-                            intermediate_buffer[1][f] = ag;
-                            f++;
-                            x++;
-                        } while (x < image_x1 && f < lim);
-                    }
-                }
+template<TextureBlendType blendType>
+static void QT_FASTCALL fetchTransformedBilinearARGB32PM_simple_upscale_helper(uint *b, uint *end, const QTextureData &image,
+                                                                               int &fx, int &fy, int fdx, int /*fdy*/)
+{
+    int y1 = (fy >> 16);
+    int y2;
+    fetchTransformedBilinear_pixelBounds<blendType>(image.height, image.y1, image.y2 - 1, y1, y2);
+    const uint *s1 = (const uint *)image.scanLine(y1);
+    const uint *s2 = (const uint *)image.scanLine(y2);
+
+    int disty = (fy & 0x0000ffff) >> 8;
+    int idisty = 256 - disty;
+    int x = fx >> 16;
+    int length = end - b;
+
+    // The idea is first to do the interpolation between the row s1 and the row s2
+    // into an intermediate buffer, then we interpolate between two pixels of this buffer.
+
+    // intermediate_buffer[0] is a buffer of red-blue component of the pixel, in the form 0x00RR00BB
+    // intermediate_buffer[1] is the alpha-green component of the pixel, in the form 0x00AA00GG
+    // +1 for the last pixel to interpolate with, and +1 for rounding errors.
+    quint32 intermediate_buffer[2][buffer_size + 2];
+    // count is the size used in the intermediate_buffer.
+    int count = (qint64(length) * fdx + fixed_scale - 1) / fixed_scale + 2;
+    Q_ASSERT(count <= buffer_size + 2); //length is supposed to be <= buffer_size and data->m11 < 1 in this case
+    int f = 0;
+    int lim = count;
+    if (blendType == BlendTransformedBilinearTiled) {
+        x %= image.width;
+        if (x < 0) x += image.width;
+    } else {
+        lim = qMin(count, image.x2 - x);
+        if (x < image.x1) {
+            Q_ASSERT(x < image.x2);
+            uint t = s1[image.x1];
+            uint b = s2[image.x1];
+            quint32 rb = (((t & 0xff00ff) * idisty + (b & 0xff00ff) * disty) >> 8) & 0xff00ff;
+            quint32 ag = ((((t>>8) & 0xff00ff) * idisty + ((b>>8) & 0xff00ff) * disty) >> 8) & 0xff00ff;
+            do {
+                intermediate_buffer[0][f] = rb;
+                intermediate_buffer[1][f] = ag;
+                f++;
+                x++;
+            } while (x < image.x1 && f < lim);
+        }
+    }
 
-                if (blendType != BlendTransformedBilinearTiled) {
+    if (blendType != BlendTransformedBilinearTiled) {
 #if defined(__SSE2__)
-                    const __m128i disty_ = _mm_set1_epi16(disty);
-                    const __m128i idisty_ = _mm_set1_epi16(idisty);
-                    const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
-
-                    lim -= 3;
-                    for (; f < lim; x += 4, f += 4) {
-                        // Load 4 pixels from s1, and split the alpha-green and red-blue component
-                        __m128i top = _mm_loadu_si128((const __m128i*)((const uint *)(s1)+x));
-                        __m128i topAG = _mm_srli_epi16(top, 8);
-                        __m128i topRB = _mm_and_si128(top, colorMask);
-                        // Multiplies each colour component by idisty
-                        topAG = _mm_mullo_epi16 (topAG, idisty_);
-                        topRB = _mm_mullo_epi16 (topRB, idisty_);
-
-                        // Same for the s2 vector
-                        __m128i bottom = _mm_loadu_si128((const __m128i*)((const uint *)(s2)+x));
-                        __m128i bottomAG = _mm_srli_epi16(bottom, 8);
-                        __m128i bottomRB = _mm_and_si128(bottom, colorMask);
-                        bottomAG = _mm_mullo_epi16 (bottomAG, disty_);
-                        bottomRB = _mm_mullo_epi16 (bottomRB, disty_);
-
-                        // Add the values, and shift to only keep 8 significant bits per colors
-                        __m128i rAG =_mm_add_epi16(topAG, bottomAG);
-                        rAG = _mm_srli_epi16(rAG, 8);
-                        _mm_storeu_si128((__m128i*)(&intermediate_buffer[1][f]), rAG);
-                        __m128i rRB =_mm_add_epi16(topRB, bottomRB);
-                        rRB = _mm_srli_epi16(rRB, 8);
-                        _mm_storeu_si128((__m128i*)(&intermediate_buffer[0][f]), rRB);
-                    }
+        const __m128i disty_ = _mm_set1_epi16(disty);
+        const __m128i idisty_ = _mm_set1_epi16(idisty);
+        const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
+
+        lim -= 3;
+        for (; f < lim; x += 4, f += 4) {
+            // Load 4 pixels from s1, and split the alpha-green and red-blue component
+            __m128i top = _mm_loadu_si128((const __m128i*)((const uint *)(s1)+x));
+            __m128i topAG = _mm_srli_epi16(top, 8);
+            __m128i topRB = _mm_and_si128(top, colorMask);
+            // Multiplies each color component by idisty
+            topAG = _mm_mullo_epi16 (topAG, idisty_);
+            topRB = _mm_mullo_epi16 (topRB, idisty_);
+
+            // Same for the s2 vector
+            __m128i bottom = _mm_loadu_si128((const __m128i*)((const uint *)(s2)+x));
+            __m128i bottomAG = _mm_srli_epi16(bottom, 8);
+            __m128i bottomRB = _mm_and_si128(bottom, colorMask);
+            bottomAG = _mm_mullo_epi16 (bottomAG, disty_);
+            bottomRB = _mm_mullo_epi16 (bottomRB, disty_);
+
+            // Add the values, and shift to only keep 8 significant bits per color
+            __m128i rAG =_mm_add_epi16(topAG, bottomAG);
+            rAG = _mm_srli_epi16(rAG, 8);
+            _mm_storeu_si128((__m128i*)(&intermediate_buffer[1][f]), rAG);
+            __m128i rRB =_mm_add_epi16(topRB, bottomRB);
+            rRB = _mm_srli_epi16(rRB, 8);
+            _mm_storeu_si128((__m128i*)(&intermediate_buffer[0][f]), rRB);
+        }
 #elif defined(__ARM_NEON__)
-                    const int16x8_t disty_ = vdupq_n_s16(disty);
-                    const int16x8_t idisty_ = vdupq_n_s16(idisty);
-                    const int16x8_t colorMask = vdupq_n_s16(0x00ff);
-
-                    lim -= 3;
-                    for (; f < lim; x += 4, f += 4) {
-                        // Load 4 pixels from s1, and split the alpha-green and red-blue component
-                        int16x8_t top = vld1q_s16((int16_t*)((const uint *)(s1)+x));
-                        int16x8_t topAG = vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(top), 8));
-                        int16x8_t topRB = vandq_s16(top, colorMask);
-                        // Multiplies each colour component by idisty
-                        topAG = vmulq_s16(topAG, idisty_);
-                        topRB = vmulq_s16(topRB, idisty_);
-
-                        // Same for the s2 vector
-                        int16x8_t bottom = vld1q_s16((int16_t*)((const uint *)(s2)+x));
-                        int16x8_t bottomAG = vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(bottom), 8));
-                        int16x8_t bottomRB = vandq_s16(bottom, colorMask);
-                        bottomAG = vmulq_s16(bottomAG, disty_);
-                        bottomRB = vmulq_s16(bottomRB, disty_);
-
-                        // Add the values, and shift to only keep 8 significant bits per colors
-                        int16x8_t rAG = vaddq_s16(topAG, bottomAG);
-                        rAG = vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(rAG), 8));
-                        vst1q_s16((int16_t*)(&intermediate_buffer[1][f]), rAG);
-                        int16x8_t rRB = vaddq_s16(topRB, bottomRB);
-                        rRB = vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(rRB), 8));
-                        vst1q_s16((int16_t*)(&intermediate_buffer[0][f]), rRB);
-                    }
+        const int16x8_t disty_ = vdupq_n_s16(disty);
+        const int16x8_t idisty_ = vdupq_n_s16(idisty);
+        const int16x8_t colorMask = vdupq_n_s16(0x00ff);
+
+        lim -= 3;
+        for (; f < lim; x += 4, f += 4) {
+            // Load 4 pixels from s1, and split the alpha-green and red-blue component
+            int16x8_t top = vld1q_s16((int16_t*)((const uint *)(s1)+x));
+            int16x8_t topAG = vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(top), 8));
+            int16x8_t topRB = vandq_s16(top, colorMask);
+            // Multiplies each color component by idisty
+            topAG = vmulq_s16(topAG, idisty_);
+            topRB = vmulq_s16(topRB, idisty_);
+
+            // Same for the s2 vector
+            int16x8_t bottom = vld1q_s16((int16_t*)((const uint *)(s2)+x));
+            int16x8_t bottomAG = vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(bottom), 8));
+            int16x8_t bottomRB = vandq_s16(bottom, colorMask);
+            bottomAG = vmulq_s16(bottomAG, disty_);
+            bottomRB = vmulq_s16(bottomRB, disty_);
+
+            // Add the values, and shift to only keep 8 significant bits per colors
+            int16x8_t rAG = vaddq_s16(topAG, bottomAG);
+            rAG = vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(rAG), 8));
+            vst1q_s16((int16_t*)(&intermediate_buffer[1][f]), rAG);
+            int16x8_t rRB = vaddq_s16(topRB, bottomRB);
+            rRB = vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(rRB), 8));
+            vst1q_s16((int16_t*)(&intermediate_buffer[0][f]), rRB);
+        }
 #endif
-                }
-                for (; f < count; f++) { // Same as above but without sse2
-                    if (blendType == BlendTransformedBilinearTiled) {
-                        if (x >= image_width) x -= image_width;
-                    } else {
-                        x = qMin(x, image_x2);
-                    }
+    }
+    for (; f < count; f++) { // Same as above but without simd
+        if (blendType == BlendTransformedBilinearTiled) {
+            if (x >= image.width) x -= image.width;
+        } else {
+            x = qMin(x, image.x2 - 1);
+        }
 
-                    uint t = s1[x];
-                    uint b = s2[x];
+        uint t = s1[x];
+        uint b = s2[x];
 
-                    intermediate_buffer[0][f] = (((t & 0xff00ff) * idisty + (b & 0xff00ff) * disty) >> 8) & 0xff00ff;
-                    intermediate_buffer[1][f] = ((((t>>8) & 0xff00ff) * idisty + ((b>>8) & 0xff00ff) * disty) >> 8) & 0xff00ff;
-                    x++;
-                }
-                // Now interpolate the values from the intermediate_buffer to get the final result.
-                fx &= fixed_scale - 1;
-                Q_ASSERT((fx >> 16) == 0);
-                while (b < end) {
-                    int x1 = (fx >> 16);
-                    int x2 = x1 + 1;
-                    Q_ASSERT(x1 >= 0);
-                    Q_ASSERT(x2 < count);
+        intermediate_buffer[0][f] = (((t & 0xff00ff) * idisty + (b & 0xff00ff) * disty) >> 8) & 0xff00ff;
+        intermediate_buffer[1][f] = ((((t>>8) & 0xff00ff) * idisty + ((b>>8) & 0xff00ff) * disty) >> 8) & 0xff00ff;
+        x++;
+    }
+    // Now interpolate the values from the intermediate_buffer to get the final result.
+    fx &= fixed_scale - 1;
+    Q_ASSERT((fx >> 16) == 0);
+    while (b < end) {
+        int x1 = (fx >> 16);
+        int x2 = x1 + 1;
+        Q_ASSERT(x1 >= 0);
+        Q_ASSERT(x2 < count);
+
+        int distx = (fx & 0x0000ffff) >> 8;
+        int idistx = 256 - distx;
+        int rb = ((intermediate_buffer[0][x1] * idistx + intermediate_buffer[0][x2] * distx) >> 8) & 0xff00ff;
+        int ag = (intermediate_buffer[1][x1] * idistx + intermediate_buffer[1][x2] * distx) & 0xff00ff00;
+        *b = rb | ag;
+        b++;
+        fx += fdx;
+    }
+}
 
-                    int distx = (fx & 0x0000ffff) >> 8;
-                    int idistx = 256 - distx;
-                    int rb = ((intermediate_buffer[0][x1] * idistx + intermediate_buffer[0][x2] * distx) >> 8) & 0xff00ff;
-                    int ag = (intermediate_buffer[1][x1] * idistx + intermediate_buffer[1][x2] * distx) & 0xff00ff00;
-                    *b = rb | ag;
-                    b++;
-                    fx += fdx;
-                }
-            } else if ((fdx < 0 && fdx > -(fixed_scale / 8)) || std::abs(data->m22) < (1./8.)) { // scale up more than 8x
-                int y1 = (fy >> 16);
-                int y2;
-                fetchTransformedBilinear_pixelBounds<blendType>(image_height, image_y1, image_y2, y1, y2);
-                const uint *s1 = (const uint *)data->texture.scanLine(y1);
-                const uint *s2 = (const uint *)data->texture.scanLine(y2);
-                int disty = (fy & 0x0000ffff) >> 8;
-                while (b < end) {
-                    int x1 = (fx >> 16);
-                    int x2;
-                    fetchTransformedBilinear_pixelBounds<blendType>(image_width, image_x1, image_x2, x1, x2);
-                    uint tl = s1[x1];
-                    uint tr = s1[x2];
-                    uint bl = s2[x1];
-                    uint br = s2[x2];
-                    int distx = (fx & 0x0000ffff) >> 8;
-                    *b = interpolate_4_pixels(tl, tr, bl, br, distx, disty);
+template<TextureBlendType blendType>
+static void QT_FASTCALL fetchTransformedBilinearARGB32PM_upscale_helper(uint *b, uint *end, const QTextureData &image,
+                                                                        int &fx, int &fy, int fdx, int /*fdy*/)
+{ // Fills [b, end) by blending one pair of source rows; fy is only read, fx is advanced by fdx per pixel written.
+    int y1 = (fy >> 16); // bits above the low 16 of fy select the row
+    int y2;
+    fetchTransformedBilinear_pixelBounds<blendType>(image.height, image.y1, image.y2 - 1, y1, y2); // sets y2 (and may adjust y1)
+    const uint *s1 = (const uint *)image.scanLine(y1); // first of the two rows blended below
+    const uint *s2 = (const uint *)image.scanLine(y2); // second of the two rows blended below
+    const int disty = (fy & 0x0000ffff) >> 8; // upper 8 bits of the 16-bit fraction: weight of s2, 0..255
+
+    if (blendType != BlendTransformedBilinearTiled) { // blendType is a template parameter
+        const qint64 min_fx = qint64(image.x1) * fixed_scale;
+        const qint64 max_fx = qint64(image.x2 - 1) * fixed_scale;
+        while (b < end) { // leading pixels for which pixelBounds returns a single column
+            int x1 = (fx >> 16);
+            int x2;
+            fetchTransformedBilinear_pixelBounds<blendType>(image.width, image.x1, image.x2 - 1, x1, x2);
+            if (x1 != x2) // NOTE(review): presumably x1 == x2 only while clamped at an edge - confirm in pixelBounds
+                break;
+            uint top = s1[x1];
+            uint bot = s2[x1];
+            *b = INTERPOLATE_PIXEL_256(top, 256 - disty, bot, disty); // one column only, so blend the two rows
+            fx += fdx;
+            ++b;
+        }
+        uint *boundedEnd = end; // end of the run that the unchecked loop below may write
+        if (fdx > 0)
+            boundedEnd = qMin(boundedEnd, b + (max_fx - fx) / fdx); // whole fdx steps left before fx passes max_fx
+        else if (fdx < 0)
+            boundedEnd = qMin(boundedEnd, b + (min_fx - fx) / fdx); // whole fdx steps left before fx passes min_fx
+
+        // A fast middle part without boundary checks: no pixelBounds call, s1/s2 are addressed at x directly
+        while (b < boundedEnd) {
+            int x = (fx >> 16);
+            int distx = (fx & 0x0000ffff) >> 8; // 8-bit horizontal weight, same scheme as disty
+            *b = interpolate_4_pixels(s1 + x, s2 + x, distx, disty);
+            fx += fdx;
+            ++b;
+        }
+    }
 
-                    fx += fdx;
-                    ++b;
-                }
-            } else { //scale down
-                int y1 = (fy >> 16);
-                int y2;
-                fetchTransformedBilinear_pixelBounds<blendType>(image_height, image_y1, image_y2, y1, y2);
-                const uint *s1 = (const uint *)data->texture.scanLine(y1);
-                const uint *s2 = (const uint *)data->texture.scanLine(y2);
-                const int disty8 = (fy & 0x0000ffff) >> 8;
-                const int disty4 = (disty8 + 0x08) >> 4;
-
-                if (blendType != BlendTransformedBilinearTiled) {
-#define BILINEAR_DOWNSCALE_BOUNDS_PROLOG \
-                    const qint64 min_fx = qint64(image_x1) * fixed_scale; \
-                    const qint64 max_fx = qint64(image_x2) * fixed_scale; \
-                    while (b < end) { \
-                        int x1 = (fx >> 16); \
-                        int x2; \
-                        fetchTransformedBilinear_pixelBounds<blendType>(image_width, image_x1, image_x2, x1, x2); \
-                        if (x1 != x2) \
-                            break; \
-                        uint top = s1[x1]; \
-                        uint bot = s2[x1]; \
-                        *b = INTERPOLATE_PIXEL_256(top, 256 - disty8, bot, disty8); \
-                        fx += fdx; \
-                        ++b; \
-                    } \
-                    uint *boundedEnd = end; \
-                    if (fdx > 0) \
-                        boundedEnd = qMin(boundedEnd, b + (max_fx - fx) / fdx); \
-                    else if (fdx < 0) \
-                        boundedEnd = qMin(boundedEnd, b + (min_fx - fx) / fdx); \
-                    boundedEnd -= 3;
+    while (b < end) { // everything not written above (all pixels in the tiled case): pixelBounds is called per pixel
+        int x1 = (fx >> 16);
+        int x2;
+        fetchTransformedBilinear_pixelBounds<blendType>(image.width, image.x1, image.x2 - 1 , x1, x2);
+        uint tl = s1[x1];
+        uint tr = s1[x2];
+        uint bl = s2[x1];
+        uint br = s2[x2];
+        int distx = (fx & 0x0000ffff) >> 8;
+        *b = interpolate_4_pixels(tl, tr, bl, br, distx, disty);
+
+        fx += fdx;
+        ++b;
+    }
+}
 
+template<TextureBlendType blendType>
+static void QT_FASTCALL fetchTransformedBilinearARGB32PM_downscale_helper(uint *b, uint *end, const QTextureData &image,
+                                                                          int &fx, int &fy, int fdx, int /*fdy*/)
+{ // Fills [b, end) by blending one pair of source rows; fy is only read, fx is advanced as pixels are written.
+    int y1 = (fy >> 16); // bits above the low 16 of fy select the row
+    int y2;
+    fetchTransformedBilinear_pixelBounds<blendType>(image.height, image.y1, image.y2 - 1, y1, y2); // sets y2 (and may adjust y1)
+    const uint *s1 = (const uint *)image.scanLine(y1);
+    const uint *s2 = (const uint *)image.scanLine(y2);
+    const int disty8 = (fy & 0x0000ffff) >> 8; // 8-bit vertical weight, 0..255
+    const int disty4 = (disty8 + 0x08) >> 4; // the same weight rounded to 4 bits, 0..16
+
+    if (blendType != BlendTransformedBilinearTiled) { // blendType is a template parameter
+        const qint64 min_fx = qint64(image.x1) * fixed_scale;
+        const qint64 max_fx = qint64(image.x2 - 1) * fixed_scale;
+        while (b < end) { // leading pixels for which pixelBounds returns a single column
+            int x1 = (fx >> 16);
+            int x2;
+            fetchTransformedBilinear_pixelBounds<blendType>(image.width, image.x1, image.x2 - 1, x1, x2);
+            if (x1 != x2) // NOTE(review): presumably x1 == x2 only while clamped at an edge - confirm in pixelBounds
+                break;
+            uint top = s1[x1];
+            uint bot = s2[x1];
+            *b = INTERPOLATE_PIXEL_256(top, 256 - disty8, bot, disty8); // one column only, so blend the two rows
+            fx += fdx;
+            ++b;
+        }
+        uint *boundedEnd = end; // end of the run that the unchecked loops below may write
+        if (fdx > 0)
+            boundedEnd = qMin(boundedEnd, b + (max_fx - fx) / fdx); // whole fdx steps left before fx passes max_fx
+        else if (fdx < 0)
+            boundedEnd = qMin(boundedEnd, b + (min_fx - fx) / fdx); // whole fdx steps left before fx passes min_fx
+        // A fast middle part without boundary checks: 4 pixels per iteration with SIMD, then a scalar remainder loop
 #if defined(__SSE2__)
-                    BILINEAR_DOWNSCALE_BOUNDS_PROLOG
-
-                    const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
-                    const __m128i v_256 = _mm_set1_epi16(256);
-                    const __m128i v_disty = _mm_set1_epi16(disty4);
-                    const __m128i v_fdx = _mm_set1_epi32(fdx*4);
-                    const __m128i v_fx_r = _mm_set1_epi32(0x8);
-                    __m128i v_fx = _mm_setr_epi32(fx, fx + fdx, fx + fdx + fdx, fx + fdx + fdx + fdx);
-
-                    while (b < boundedEnd) {
-                        __m128i offset = _mm_srli_epi32(v_fx, 16);
-                        const int offset0 = _mm_cvtsi128_si32(offset); offset = _mm_srli_si128(offset, 4);
-                        const int offset1 = _mm_cvtsi128_si32(offset); offset = _mm_srli_si128(offset, 4);
-                        const int offset2 = _mm_cvtsi128_si32(offset); offset = _mm_srli_si128(offset, 4);
-                        const int offset3 = _mm_cvtsi128_si32(offset);
-                        const __m128i tl = _mm_setr_epi32(s1[offset0], s1[offset1], s1[offset2], s1[offset3]);
-                        const __m128i tr = _mm_setr_epi32(s1[offset0 + 1], s1[offset1 + 1], s1[offset2 + 1], s1[offset3 + 1]);
-                        const __m128i bl = _mm_setr_epi32(s2[offset0], s2[offset1], s2[offset2], s2[offset3]);
-                        const __m128i br = _mm_setr_epi32(s2[offset0 + 1], s2[offset1 + 1], s2[offset2 + 1], s2[offset3 + 1]);
-
-                        __m128i v_distx = _mm_srli_epi16(v_fx, 8);
-                        v_distx = _mm_srli_epi16(_mm_add_epi32(v_distx, v_fx_r), 4);
-                        v_distx = _mm_shufflehi_epi16(v_distx, _MM_SHUFFLE(2,2,0,0));
-                        v_distx = _mm_shufflelo_epi16(v_distx, _MM_SHUFFLE(2,2,0,0));
-
-                        interpolate_4_pixels_16_sse2(tl, tr, bl, br, v_distx, v_disty, colorMask, v_256, b);
-                        b += 4;
-                        v_fx = _mm_add_epi32(v_fx, v_fdx);
-                    }
-                    fx = _mm_cvtsi128_si32(v_fx);
+        const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
+        const __m128i v_256 = _mm_set1_epi16(256);
+        const __m128i v_disty = _mm_set1_epi16(disty4); // 4-bit vertical weight in every 16-bit lane
+        const __m128i v_fdx = _mm_set1_epi32(fdx*4); // step for a group of 4 pixels
+        const __m128i v_fx_r = _mm_set1_epi32(0x8); // rounding term for the 8-bit -> 4-bit weight reduction
+        __m128i v_fx = _mm_setr_epi32(fx, fx + fdx, fx + fdx + fdx, fx + fdx + fdx + fdx); // fx of 4 consecutive pixels
+
+        while (b < boundedEnd - 3) { // runs only while at least 4 pixels remain before boundedEnd
+            __m128i offset = _mm_srli_epi32(v_fx, 16); // the four source column indices
+            const int offset0 = _mm_cvtsi128_si32(offset); offset = _mm_srli_si128(offset, 4);
+            const int offset1 = _mm_cvtsi128_si32(offset); offset = _mm_srli_si128(offset, 4);
+            const int offset2 = _mm_cvtsi128_si32(offset); offset = _mm_srli_si128(offset, 4);
+            const int offset3 = _mm_cvtsi128_si32(offset);
+            const __m128i tl = _mm_setr_epi32(s1[offset0], s1[offset1], s1[offset2], s1[offset3]);
+            const __m128i tr = _mm_setr_epi32(s1[offset0 + 1], s1[offset1 + 1], s1[offset2 + 1], s1[offset3 + 1]);
+            const __m128i bl = _mm_setr_epi32(s2[offset0], s2[offset1], s2[offset2], s2[offset3]);
+            const __m128i br = _mm_setr_epi32(s2[offset0 + 1], s2[offset1 + 1], s2[offset2 + 1], s2[offset3 + 1]);
+
+            __m128i v_distx = _mm_srli_epi16(v_fx, 8); // low 16-bit word of each lane: 8-bit fraction of fx
+            v_distx = _mm_srli_epi16(_mm_add_epi32(v_distx, v_fx_r), 4); // round to a 4-bit weight
+            v_distx = _mm_shufflehi_epi16(v_distx, _MM_SHUFFLE(2,2,0,0)); // copy each lane's low word into its high word
+            v_distx = _mm_shufflelo_epi16(v_distx, _MM_SHUFFLE(2,2,0,0)); // same for the lower two lanes
+
+            interpolate_4_pixels_16_sse2(tl, tr, bl, br, v_distx, v_disty, colorMask, v_256, b);
+            b += 4;
+            v_fx = _mm_add_epi32(v_fx, v_fdx);
+        }
+        fx = _mm_cvtsi128_si32(v_fx); // lane 0 is the fx of the next unwritten pixel
 #elif defined(__ARM_NEON__)
-                    BILINEAR_DOWNSCALE_BOUNDS_PROLOG
-
-                    const int16x8_t colorMask = vdupq_n_s16(0x00ff);
-                    const int16x8_t invColorMask = vmvnq_s16(colorMask);
-                    const int16x8_t v_256 = vdupq_n_s16(256);
-                    const int16x8_t v_disty = vdupq_n_s16(disty4);
-                    const int16x8_t v_disty_ = vshlq_n_s16(v_disty, 4);
-                    int32x4_t v_fdx = vdupq_n_s32(fdx*4);
+        const int16x8_t colorMask = vdupq_n_s16(0x00ff);
+        const int16x8_t invColorMask = vmvnq_s16(colorMask);
+        const int16x8_t v_256 = vdupq_n_s16(256);
+        const int16x8_t v_disty = vdupq_n_s16(disty4); // 4-bit vertical weight in every 16-bit lane
+        const int16x8_t v_disty_ = vshlq_n_s16(v_disty, 4); // the same weight shifted left by 4
+        int32x4_t v_fdx = vdupq_n_s32(fdx*4); // step for a group of 4 pixels
 
-                    int32x4_t v_fx = vmovq_n_s32(fx);
-                    v_fx = vsetq_lane_s32(fx + fdx, v_fx, 1);
-                    v_fx = vsetq_lane_s32(fx + fdx * 2, v_fx, 2);
-                    v_fx = vsetq_lane_s32(fx + fdx * 3, v_fx, 3);
+        int32x4_t v_fx = vmovq_n_s32(fx); // lanes 0..3 end up holding fx of 4 consecutive pixels
+        v_fx = vsetq_lane_s32(fx + fdx, v_fx, 1);
+        v_fx = vsetq_lane_s32(fx + fdx * 2, v_fx, 2);
+        v_fx = vsetq_lane_s32(fx + fdx * 3, v_fx, 3);
 
-                    const int32x4_t v_ffff_mask = vdupq_n_s32(0x0000ffff);
-                    const int32x4_t v_fx_r = vdupq_n_s32(0x0800);
+        const int32x4_t v_ffff_mask = vdupq_n_s32(0x0000ffff); // keeps the 16-bit fraction of each fx
+        const int32x4_t v_fx_r = vdupq_n_s32(0x0800); // rounding term for the 16-bit -> 4-bit weight reduction
 
-                    while (b < boundedEnd) {
-                        uint32x4x2_t v_top, v_bot;
+        while (b < boundedEnd - 3) { // runs only while at least 4 pixels remain before boundedEnd
+            uint32x4x2_t v_top, v_bot; // val[0] receives pixels at x1, val[1] the pixels at x1 + 1
 
-                        int x1 = (fx >> 16);
-                        fx += fdx;
-                        v_top = vld2q_lane_u32(s1 + x1, v_top, 0);
-                        v_bot = vld2q_lane_u32(s2 + x1, v_bot, 0);
-                        x1 = (fx >> 16);
-                        fx += fdx;
-                        v_top = vld2q_lane_u32(s1 + x1, v_top, 1);
-                        v_bot = vld2q_lane_u32(s2 + x1, v_bot, 1);
-                        x1 = (fx >> 16);
-                        fx += fdx;
-                        v_top = vld2q_lane_u32(s1 + x1, v_top, 2);
-                        v_bot = vld2q_lane_u32(s2 + x1, v_bot, 2);
-                        x1 = (fx >> 16);
-                        fx += fdx;
-                        v_top = vld2q_lane_u32(s1 + x1, v_top, 3);
-                        v_bot = vld2q_lane_u32(s2 + x1, v_bot, 3);
-
-                        int32x4_t v_distx = vshrq_n_s32(vaddq_s32(vandq_s32(v_fx, v_ffff_mask), v_fx_r), 12);
-                        v_distx = vorrq_s32(v_distx, vshlq_n_s32(v_distx, 16));
-
-                        interpolate_4_pixels_16_neon(
-                                    vreinterpretq_s16_u32(v_top.val[0]), vreinterpretq_s16_u32(v_top.val[1]),
-                                    vreinterpretq_s16_u32(v_bot.val[0]), vreinterpretq_s16_u32(v_bot.val[1]),
-                                    vreinterpretq_s16_s32(v_distx), v_disty, v_disty_,
-                                    colorMask, invColorMask, v_256, b);
-                        b+=4;
-                        v_fx = vaddq_s32(v_fx, v_fdx);
-                    }
+            int x1 = (fx >> 16);
+            fx += fdx; // the scalar fx is stepped per pixel here, alongside v_fx which is stepped per group below
+            v_top = vld2q_lane_u32(s1 + x1, v_top, 0); // loads s1[x1] and s1[x1 + 1] into lane 0 of val[0] / val[1]
+            v_bot = vld2q_lane_u32(s2 + x1, v_bot, 0);
+            x1 = (fx >> 16);
+            fx += fdx;
+            v_top = vld2q_lane_u32(s1 + x1, v_top, 1);
+            v_bot = vld2q_lane_u32(s2 + x1, v_bot, 1);
+            x1 = (fx >> 16);
+            fx += fdx;
+            v_top = vld2q_lane_u32(s1 + x1, v_top, 2);
+            v_bot = vld2q_lane_u32(s2 + x1, v_bot, 2);
+            x1 = (fx >> 16);
+            fx += fdx;
+            v_top = vld2q_lane_u32(s1 + x1, v_top, 3);
+            v_bot = vld2q_lane_u32(s2 + x1, v_bot, 3);
+
+            int32x4_t v_distx = vshrq_n_s32(vaddq_s32(vandq_s32(v_fx, v_ffff_mask), v_fx_r), 12); // rounded 4-bit weights
+            v_distx = vorrq_s32(v_distx, vshlq_n_s32(v_distx, 16)); // duplicate the weight into both 16-bit halves
+
+            interpolate_4_pixels_16_neon(
+                        vreinterpretq_s16_u32(v_top.val[0]), vreinterpretq_s16_u32(v_top.val[1]),
+                    vreinterpretq_s16_u32(v_bot.val[0]), vreinterpretq_s16_u32(v_bot.val[1]),
+                    vreinterpretq_s16_s32(v_distx), v_disty, v_disty_,
+                    colorMask, invColorMask, v_256, b);
+            b+=4;
+            v_fx = vaddq_s32(v_fx, v_fdx);
+        }
 #endif
-                }
+        while (b < boundedEnd) { // unchecked pixels the SIMD loops left over (all of them without SIMD)
+            int x = (fx >> 16);
+#if defined(__SSE2__) || defined(__ARM_NEON__)
+            int distx8 = (fx & 0x0000ffff) >> 8; // SIMD builds: 8-bit weights with interpolate_4_pixels
+            *b = interpolate_4_pixels(s1 + x, s2 + x, distx8, disty8);
+#else
+            uint tl = s1[x];
+            uint tr = s1[x + 1];
+            uint bl = s2[x];
+            uint br = s2[x + 1];
+            int distx4 = ((fx & 0x0000ffff) + 0x0800) >> 12; // other builds: rounded 4-bit weights
+            *b = interpolate_4_pixels_16(tl, tr, bl, br, distx4, disty4);
+#endif
+            fx += fdx;
+            ++b;
+        }
+    }
 
-                while (b < end) {
-                    int x1 = (fx >> 16);
-                    int x2;
-                    fetchTransformedBilinear_pixelBounds<blendType>(image_width, image_x1, image_x2, x1, x2);
-                    uint tl = s1[x1];
-                    uint tr = s1[x2];
-                    uint bl = s2[x1];
-                    uint br = s2[x2];
+    while (b < end) { // everything not written above (all pixels in the tiled case): pixelBounds is called per pixel
+        int x1 = (fx >> 16);
+        int x2;
+        fetchTransformedBilinear_pixelBounds<blendType>(image.width, image.x1, image.x2 - 1, x1, x2);
+        uint tl = s1[x1];
+        uint tr = s1[x2];
+        uint bl = s2[x1];
+        uint br = s2[x2];
 #if defined(__SSE2__) || defined(__ARM_NEON__)
-                    // The optimized interpolate_4_pixels are faster than interpolate_4_pixels_16.
-                    int distx8 = (fx & 0x0000ffff) >> 8;
-                    *b = interpolate_4_pixels(tl, tr, bl, br, distx8, disty8);
+        // The optimized interpolate_4_pixels is faster than interpolate_4_pixels_16, so SIMD builds use 8-bit weights here.
+        int distx8 = (fx & 0x0000ffff) >> 8;
+        *b = interpolate_4_pixels(tl, tr, bl, br, distx8, disty8);
 #else
-                    int distx4 = ((fx & 0x0000ffff) + 0x0800) >> 12;
-                    *b = interpolate_4_pixels_16(tl, tr, bl, br, distx4, disty4);
+        int distx4 = ((fx & 0x0000ffff) + 0x0800) >> 12; // rounded 4-bit horizontal weight
+        *b = interpolate_4_pixels_16(tl, tr, bl, br, distx4, disty4);
 #endif
-                    fx += fdx;
-                    ++b;
-                }
-            }
-        } else { //rotation
-            if (std::abs(data->m11) < (1./8.) || std::abs(data->m22) < (1./8.)) {
-                //if we are zooming more than 8 times, we use 8bit precision for the position.
-                while (b < end) {
-                    int x1 = (fx >> 16);
-                    int x2;
-                    int y1 = (fy >> 16);
-                    int y2;
+        fx += fdx;
+        ++b;
+    }
+}
 
-                    fetchTransformedBilinear_pixelBounds<blendType>(image_width, image_x1, image_x2, x1, x2);
-                    fetchTransformedBilinear_pixelBounds<blendType>(image_height, image_y1, image_y2, y1, y2);
+template<TextureBlendType blendType>
+static void QT_FASTCALL fetchTransformedBilinearARGB32PM_rotate_helper(uint *b, uint *end, const QTextureData &image,
+                                                                       int &fx, int &fy, int fdx, int fdy)
+{ // Fills [b, end) while stepping both fx and fy per pixel; the two source rows are looked up again for every pixel.
+    // Uses 8bit precision for the position (meant for zooming more than 8 times; that condition is not checked here).
+    while (b < end) {
+        int x1 = (fx >> 16); // bits above the low 16 of fx / fy select column and row
+        int x2;
+        int y1 = (fy >> 16);
+        int y2;
 
-                    const uint *s1 = (const uint *)data->texture.scanLine(y1);
-                    const uint *s2 = (const uint *)data->texture.scanLine(y2);
+        fetchTransformedBilinear_pixelBounds<blendType>(image.width, image.x1, image.x2 - 1, x1, x2); // sets x2 (and may adjust x1)
+        fetchTransformedBilinear_pixelBounds<blendType>(image.height, image.y1, image.y2 - 1, y1, y2); // sets y2 (and may adjust y1)
 
-                    uint tl = s1[x1];
-                    uint tr = s1[x2];
-                    uint bl = s2[x1];
-                    uint br = s2[x2];
+        const uint *s1 = (const uint *)image.scanLine(y1); // row y1
+        const uint *s2 = (const uint *)image.scanLine(y2); // row y2
 
-                    int distx = (fx & 0x0000ffff) >> 8;
-                    int disty = (fy & 0x0000ffff) >> 8;
+        uint tl = s1[x1]; // the four source pixels passed to interpolate_4_pixels
+        uint tr = s1[x2];
+        uint bl = s2[x1];
+        uint br = s2[x2];
 
-                    *b = interpolate_4_pixels(tl, tr, bl, br, distx, disty);
+        int distx = (fx & 0x0000ffff) >> 8; // upper 8 bits of the 16-bit fraction, 0..255
+        int disty = (fy & 0x0000ffff) >> 8;
 
-                    fx += fdx;
-                    fy += fdy;
-                    ++b;
-                }
-            } else {
-                //we are zooming less than 8x, use 4bit precision
-
-                if (blendType != BlendTransformedBilinearTiled) {
-#define BILINEAR_ROTATE_BOUNDS_PROLOG \
-                    const qint64 min_fx = qint64(image_x1) * fixed_scale; \
-                    const qint64 max_fx = qint64(image_x2) * fixed_scale; \
-                    const qint64 min_fy = qint64(image_y1) * fixed_scale; \
-                    const qint64 max_fy = qint64(image_y2) * fixed_scale; \
-                    while (b < end) { \
-                        int x1 = (fx >> 16); \
-                        int x2; \
-                        int y1 = (fy >> 16); \
-                        int y2; \
-                        fetchTransformedBilinear_pixelBounds<blendType>(image_width, image_x1, image_x2, x1, x2); \
-                        fetchTransformedBilinear_pixelBounds<blendType>(image_height, image_y1, image_y2, y1, y2); \
-                        if (x1 != x2 && y1 != y2) \
-                            break; \
-                        const uint *s1 = (const uint *)data->texture.scanLine(y1); \
-                        const uint *s2 = (const uint *)data->texture.scanLine(y2); \
-                        uint tl = s1[x1]; \
-                        uint tr = s1[x2]; \
-                        uint bl = s2[x1]; \
-                        uint br = s2[x2]; \
-                        int distx = (fx & 0x0000ffff) >> 8; \
-                        int disty = (fy & 0x0000ffff) >> 8; \
-                        *b = interpolate_4_pixels(tl, tr, bl, br, distx, disty); \
-                        fx += fdx; \
-                        fy += fdy; \
-                        ++b; \
-                    } \
-                    uint *boundedEnd = end; \
-                    if (fdx > 0) \
-                        boundedEnd = qMin(boundedEnd, b + (max_fx - fx) / fdx); \
-                    else if (fdx < 0) \
-                        boundedEnd = qMin(boundedEnd, b + (min_fx - fx) / fdx); \
-                    if (fdy > 0) \
-                        boundedEnd = qMin(boundedEnd, b + (max_fy - fy) / fdy); \
-                    else if (fdy < 0) \
-                        boundedEnd = qMin(boundedEnd, b + (min_fy - fy) / fdy); \
-                    boundedEnd -= 3;
+        *b = interpolate_4_pixels(tl, tr, bl, br, distx, disty);
 
-#if defined(__SSE2__)
-                    BILINEAR_ROTATE_BOUNDS_PROLOG
+        fx += fdx; // fx and fy are references, so the caller sees the advanced position
+        fy += fdy;
+        ++b;
+    }
+}
 
-                    const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
-                    const __m128i v_256 = _mm_set1_epi16(256);
-                    const __m128i v_fdx = _mm_set1_epi32(fdx*4);
-                    const __m128i v_fdy = _mm_set1_epi32(fdy*4);
-                    const __m128i v_fxy_r = _mm_set1_epi32(0x8);
-                    __m128i v_fx = _mm_setr_epi32(fx, fx + fdx, fx + fdx + fdx, fx + fdx + fdx + fdx);
-                    __m128i v_fy = _mm_setr_epi32(fy, fy + fdy, fy + fdy + fdy, fy + fdy + fdy + fdy);
+template<TextureBlendType blendType>
+static void QT_FASTCALL fetchTransformedBilinearARGB32PM_fast_rotate_helper(uint *b, uint *end, const QTextureData &image,
+                                                                            int &fx, int &fy, int fdx, int fdy)
+{
+    //we are zooming less than 8x, use 4bit precision
+    if (blendType != BlendTransformedBilinearTiled) {
+        const qint64 min_fx = qint64(image.x1) * fixed_scale;
+        const qint64 max_fx = qint64(image.x2 - 1) * fixed_scale;
+        const qint64 min_fy = qint64(image.y1) * fixed_scale;
+        const qint64 max_fy = qint64(image.y2 - 1) * fixed_scale;
+        // first handle the possibly bounded part in the beginning
+        while (b < end) {
+            int x1 = (fx >> 16);
+            int x2;
+            int y1 = (fy >> 16);
+            int y2;
+            fetchTransformedBilinear_pixelBounds<blendType>(image.width, image.x1, image.x2 - 1, x1, x2);
+            fetchTransformedBilinear_pixelBounds<blendType>(image.height, image.y1, image.y2 - 1, y1, y2);
+            if (x1 != x2 && y1 != y2)
+                break;
+            const uint *s1 = (const uint *)image.scanLine(y1);
+            const uint *s2 = (const uint *)image.scanLine(y2);
+            uint tl = s1[x1];
+            uint tr = s1[x2];
+            uint bl = s2[x1];
+            uint br = s2[x2];
+#if defined(__SSE2__) || defined(__ARM_NEON__)
+            int distx = (fx & 0x0000ffff) >> 8;
+            int disty = (fy & 0x0000ffff) >> 8;
+            *b = interpolate_4_pixels(tl, tr, bl, br, distx, disty);
+#else
+            int distx = ((fx & 0x0000ffff) + 0x0800) >> 12;
+            int disty = ((fy & 0x0000ffff) + 0x0800) >> 12;
+            *b = interpolate_4_pixels_16(tl, tr, bl, br, distx, disty);
+#endif
+            fx += fdx;
+            fy += fdy;
+            ++b;
+        }
+        uint *boundedEnd = end; // end of the run the unchecked loops below may write
+        if (fdx > 0)
+            boundedEnd = qMin(boundedEnd, b + (max_fx - fx) / fdx); // whole fdx steps left before fx passes max_fx
+        else if (fdx < 0)
+            boundedEnd = qMin(boundedEnd, b + (min_fx - fx) / fdx); // whole fdx steps left before fx passes min_fx
+        if (fdy > 0)
+            boundedEnd = qMin(boundedEnd, b + (max_fy - fy) / fdy); // whole fdy steps left before fy passes max_fy
+        else if (fdy < 0)
+            boundedEnd = qMin(boundedEnd, b + (min_fy - fy) / fdy); // whole fdy steps left before fy passes min_fy
+
+        // until boundedEnd we can now have a fast middle part without boundary checks
+#if defined(__SSE2__)
+        const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
+        const __m128i v_256 = _mm_set1_epi16(256);
+        const __m128i v_fdx = _mm_set1_epi32(fdx*4);
+        const __m128i v_fdy = _mm_set1_epi32(fdy*4);
+        const __m128i v_fxy_r = _mm_set1_epi32(0x8);
+        __m128i v_fx = _mm_setr_epi32(fx, fx + fdx, fx + fdx + fdx, fx + fdx + fdx + fdx);
+        __m128i v_fy = _mm_setr_epi32(fy, fy + fdy, fy + fdy + fdy, fy + fdy + fdy + fdy);
+
+        const uchar *textureData = image.imageData;
+        const int bytesPerLine = image.bytesPerLine;
+        const __m128i vbpl = _mm_shufflelo_epi16(_mm_cvtsi32_si128(bytesPerLine/4), _MM_SHUFFLE(0, 0, 0, 0));
+
+        while (b < boundedEnd - 3) {
+            const __m128i vy = _mm_packs_epi32(_mm_srli_epi32(v_fy, 16), _mm_setzero_si128());
+            // 4x16bit * 4x16bit -> 4x32bit
+            __m128i offset = _mm_unpacklo_epi16(_mm_mullo_epi16(vy, vbpl), _mm_mulhi_epi16(vy, vbpl));
+            offset = _mm_add_epi32(offset, _mm_srli_epi32(v_fx, 16));
+            const int offset0 = _mm_cvtsi128_si32(offset); offset = _mm_srli_si128(offset, 4);
+            const int offset1 = _mm_cvtsi128_si32(offset); offset = _mm_srli_si128(offset, 4);
+            const int offset2 = _mm_cvtsi128_si32(offset); offset = _mm_srli_si128(offset, 4);
+            const int offset3 = _mm_cvtsi128_si32(offset);
+            const uint *topData = (const uint *)(textureData);
+            const __m128i tl = _mm_setr_epi32(topData[offset0], topData[offset1], topData[offset2], topData[offset3]);
+            const __m128i tr = _mm_setr_epi32(topData[offset0 + 1], topData[offset1 + 1], topData[offset2 + 1], topData[offset3 + 1]);
+            const uint *bottomData = (const uint *)(textureData + bytesPerLine);
+            const __m128i bl = _mm_setr_epi32(bottomData[offset0], bottomData[offset1], bottomData[offset2], bottomData[offset3]);
+            const __m128i br = _mm_setr_epi32(bottomData[offset0 + 1], bottomData[offset1 + 1], bottomData[offset2 + 1], bottomData[offset3 + 1]);
+
+            __m128i v_distx = _mm_srli_epi16(v_fx, 8);
+            __m128i v_disty = _mm_srli_epi16(v_fy, 8);
+            v_distx = _mm_srli_epi16(_mm_add_epi32(v_distx, v_fxy_r), 4);
+            v_disty = _mm_srli_epi16(_mm_add_epi32(v_disty, v_fxy_r), 4);
+            v_distx = _mm_shufflehi_epi16(v_distx, _MM_SHUFFLE(2,2,0,0));
+            v_distx = _mm_shufflelo_epi16(v_distx, _MM_SHUFFLE(2,2,0,0));
+            v_disty = _mm_shufflehi_epi16(v_disty, _MM_SHUFFLE(2,2,0,0));
+            v_disty = _mm_shufflelo_epi16(v_disty, _MM_SHUFFLE(2,2,0,0));
+
+            interpolate_4_pixels_16_sse2(tl, tr, bl, br, v_distx, v_disty, colorMask, v_256, b);
+            b += 4;
+            v_fx = _mm_add_epi32(v_fx, v_fdx);
+            v_fy = _mm_add_epi32(v_fy, v_fdy);
+        }
+        fx = _mm_cvtsi128_si32(v_fx);
+        fy = _mm_cvtsi128_si32(v_fy);
+#elif defined(__ARM_NEON__)
+        const int16x8_t colorMask = vdupq_n_s16(0x00ff);
+        const int16x8_t invColorMask = vmvnq_s16(colorMask);
+        const int16x8_t v_256 = vdupq_n_s16(256);
+        int32x4_t v_fdx = vdupq_n_s32(fdx * 4);
+        int32x4_t v_fdy = vdupq_n_s32(fdy * 4);
+
+        const uchar *textureData = image.imageData;
+        const int bytesPerLine = image.bytesPerLine;
+
+        int32x4_t v_fx = vmovq_n_s32(fx);
+        int32x4_t v_fy = vmovq_n_s32(fy);
+        v_fx = vsetq_lane_s32(fx + fdx, v_fx, 1);
+        v_fy = vsetq_lane_s32(fy + fdy, v_fy, 1);
+        v_fx = vsetq_lane_s32(fx + fdx * 2, v_fx, 2);
+        v_fy = vsetq_lane_s32(fy + fdy * 2, v_fy, 2);
+        v_fx = vsetq_lane_s32(fx + fdx * 3, v_fx, 3);
+        v_fy = vsetq_lane_s32(fy + fdy * 3, v_fy, 3);
+
+        const int32x4_t v_ffff_mask = vdupq_n_s32(0x0000ffff);
+        const int32x4_t v_round = vdupq_n_s32(0x0800);
+
+        while (b < boundedEnd - 3) {
+            uint32x4x2_t v_top, v_bot;
+
+            int x1 = (fx >> 16);
+            int y1 = (fy >> 16);
+            fx += fdx; fy += fdy;
+            const uchar *sl = textureData + bytesPerLine * y1;
+            const uint *s1 = reinterpret_cast<const uint *>(sl);
+            const uint *s2 = reinterpret_cast<const uint *>(sl + bytesPerLine);
+            v_top = vld2q_lane_u32(s1 + x1, v_top, 0);
+            v_bot = vld2q_lane_u32(s2 + x1, v_bot, 0);
+            x1 = (fx >> 16);
+            y1 = (fy >> 16);
+            fx += fdx; fy += fdy;
+            sl = textureData + bytesPerLine * y1;
+            s1 = reinterpret_cast<const uint *>(sl);
+            s2 = reinterpret_cast<const uint *>(sl + bytesPerLine);
+            v_top = vld2q_lane_u32(s1 + x1, v_top, 1);
+            v_bot = vld2q_lane_u32(s2 + x1, v_bot, 1);
+            x1 = (fx >> 16);
+            y1 = (fy >> 16);
+            fx += fdx; fy += fdy;
+            sl = textureData + bytesPerLine * y1;
+            s1 = reinterpret_cast<const uint *>(sl);
+            s2 = reinterpret_cast<const uint *>(sl + bytesPerLine);
+            v_top = vld2q_lane_u32(s1 + x1, v_top, 2);
+            v_bot = vld2q_lane_u32(s2 + x1, v_bot, 2);
+            x1 = (fx >> 16);
+            y1 = (fy >> 16);
+            fx += fdx; fy += fdy;
+            sl = textureData + bytesPerLine * y1;
+            s1 = reinterpret_cast<const uint *>(sl);
+            s2 = reinterpret_cast<const uint *>(sl + bytesPerLine);
+            v_top = vld2q_lane_u32(s1 + x1, v_top, 3);
+            v_bot = vld2q_lane_u32(s2 + x1, v_bot, 3);
+
+            int32x4_t v_distx = vshrq_n_s32(vaddq_s32(vandq_s32(v_fx, v_ffff_mask), v_round), 12);
+            int32x4_t v_disty = vshrq_n_s32(vaddq_s32(vandq_s32(v_fy, v_ffff_mask), v_round), 12);
+            v_distx = vorrq_s32(v_distx, vshlq_n_s32(v_distx, 16));
+            v_disty = vorrq_s32(v_disty, vshlq_n_s32(v_disty, 16));
+            int16x8_t v_disty_ = vshlq_n_s16(vreinterpretq_s16_s32(v_disty), 4);
+
+            interpolate_4_pixels_16_neon(
+                        vreinterpretq_s16_u32(v_top.val[0]), vreinterpretq_s16_u32(v_top.val[1]),
+                        vreinterpretq_s16_u32(v_bot.val[0]), vreinterpretq_s16_u32(v_bot.val[1]),
+                        vreinterpretq_s16_s32(v_distx), vreinterpretq_s16_s32(v_disty),
+                        v_disty_, colorMask, invColorMask, v_256, b);
+            b += 4;
+            v_fx = vaddq_s32(v_fx, v_fdx);
+            v_fy = vaddq_s32(v_fy, v_fdy);
+        }
+#endif
+        while (b < boundedEnd) {
+            int x = (fx >> 16);
+            int y = (fy >> 16);
 
-                    const uchar *textureData = data->texture.imageData;
-                    const int bytesPerLine = data->texture.bytesPerLine;
-                    const __m128i vbpl = _mm_shufflelo_epi16(_mm_cvtsi32_si128(bytesPerLine/4), _MM_SHUFFLE(0, 0, 0, 0));
+            const uint *s1 = (const uint *)image.scanLine(y);
+            const uint *s2 = (const uint *)image.scanLine(y + 1);
 
-                    while (b < boundedEnd) {
-                        const __m128i vy = _mm_packs_epi32(_mm_srli_epi32(v_fy, 16), _mm_setzero_si128());
-                        // 4x16bit * 4x16bit -> 4x32bit
-                        __m128i offset = _mm_unpacklo_epi16(_mm_mullo_epi16(vy, vbpl), _mm_mulhi_epi16(vy, vbpl));
-                        offset = _mm_add_epi32(offset, _mm_srli_epi32(v_fx, 16));
-                        const int offset0 = _mm_cvtsi128_si32(offset); offset = _mm_srli_si128(offset, 4);
-                        const int offset1 = _mm_cvtsi128_si32(offset); offset = _mm_srli_si128(offset, 4);
-                        const int offset2 = _mm_cvtsi128_si32(offset); offset = _mm_srli_si128(offset, 4);
-                        const int offset3 = _mm_cvtsi128_si32(offset);
-                        const uint *topData = (const uint *)(textureData);
-                        const __m128i tl = _mm_setr_epi32(topData[offset0], topData[offset1], topData[offset2], topData[offset3]);
-                        const __m128i tr = _mm_setr_epi32(topData[offset0 + 1], topData[offset1 + 1], topData[offset2 + 1], topData[offset3 + 1]);
-                        const uint *bottomData = (const uint *)(textureData + bytesPerLine);
-                        const __m128i bl = _mm_setr_epi32(bottomData[offset0], bottomData[offset1], bottomData[offset2], bottomData[offset3]);
-                        const __m128i br = _mm_setr_epi32(bottomData[offset0 + 1], bottomData[offset1 + 1], bottomData[offset2 + 1], bottomData[offset3 + 1]);
-
-                        __m128i v_distx = _mm_srli_epi16(v_fx, 8);
-                        __m128i v_disty = _mm_srli_epi16(v_fy, 8);
-                        v_distx = _mm_srli_epi16(_mm_add_epi32(v_distx, v_fxy_r), 4);
-                        v_disty = _mm_srli_epi16(_mm_add_epi32(v_disty, v_fxy_r), 4);
-                        v_distx = _mm_shufflehi_epi16(v_distx, _MM_SHUFFLE(2,2,0,0));
-                        v_distx = _mm_shufflelo_epi16(v_distx, _MM_SHUFFLE(2,2,0,0));
-                        v_disty = _mm_shufflehi_epi16(v_disty, _MM_SHUFFLE(2,2,0,0));
-                        v_disty = _mm_shufflelo_epi16(v_disty, _MM_SHUFFLE(2,2,0,0));
-
-                        interpolate_4_pixels_16_sse2(tl, tr, bl, br, v_distx, v_disty, colorMask, v_256, b);
-                        b += 4;
-                        v_fx = _mm_add_epi32(v_fx, v_fdx);
-                        v_fy = _mm_add_epi32(v_fy, v_fdy);
-                    }
-                    fx = _mm_cvtsi128_si32(v_fx);
-                    fy = _mm_cvtsi128_si32(v_fy);
-#elif defined(__ARM_NEON__)
-                    BILINEAR_ROTATE_BOUNDS_PROLOG
+#if defined(__SSE2__) || defined(__ARM_NEON__)
+            int distx = (fx & 0x0000ffff) >> 8;
+            int disty = (fy & 0x0000ffff) >> 8;
+            *b = interpolate_4_pixels(s1 + x, s2 + x, distx, disty);
+#else
+            uint tl = s1[x];
+            uint tr = s1[x + 1];
+            uint bl = s2[x];
+            uint br = s2[x + 1];
+            int distx = ((fx & 0x0000ffff) + 0x0800) >> 12;
+            int disty = ((fy & 0x0000ffff) + 0x0800) >> 12;
+            *b = interpolate_4_pixels_16(tl, tr, bl, br, distx, disty);
+#endif
 
-                    const int16x8_t colorMask = vdupq_n_s16(0x00ff);
-                    const int16x8_t invColorMask = vmvnq_s16(colorMask);
-                    const int16x8_t v_256 = vdupq_n_s16(256);
-                    int32x4_t v_fdx = vdupq_n_s32(fdx * 4);
-                    int32x4_t v_fdy = vdupq_n_s32(fdy * 4);
+            fx += fdx;
+            fy += fdy;
+            ++b;
+        }
+    }
 
-                    const uchar *textureData = data->texture.imageData;
-                    const int bytesPerLine = data->texture.bytesPerLine;
+    while (b < end) {
+        int x1 = (fx >> 16);
+        int x2;
+        int y1 = (fy >> 16);
+        int y2;
 
-                    int32x4_t v_fx = vmovq_n_s32(fx);
-                    int32x4_t v_fy = vmovq_n_s32(fy);
-                    v_fx = vsetq_lane_s32(fx + fdx, v_fx, 1);
-                    v_fy = vsetq_lane_s32(fy + fdy, v_fy, 1);
-                    v_fx = vsetq_lane_s32(fx + fdx * 2, v_fx, 2);
-                    v_fy = vsetq_lane_s32(fy + fdy * 2, v_fy, 2);
-                    v_fx = vsetq_lane_s32(fx + fdx * 3, v_fx, 3);
-                    v_fy = vsetq_lane_s32(fy + fdy * 3, v_fy, 3);
+        fetchTransformedBilinear_pixelBounds<blendType>(image.width, image.x1, image.x2 - 1, x1, x2);
+        fetchTransformedBilinear_pixelBounds<blendType>(image.height, image.y1, image.y2 - 1, y1, y2);
 
-                    const int32x4_t v_ffff_mask = vdupq_n_s32(0x0000ffff);
-                    const int32x4_t v_round = vdupq_n_s32(0x0800);
+        const uint *s1 = (const uint *)image.scanLine(y1);
+        const uint *s2 = (const uint *)image.scanLine(y2);
 
-                    while (b < boundedEnd) {
-                        uint32x4x2_t v_top, v_bot;
+        uint tl = s1[x1];
+        uint tr = s1[x2];
+        uint bl = s2[x1];
+        uint br = s2[x2];
 
-                        int x1 = (fx >> 16);
-                        int y1 = (fy >> 16);
-                        fx += fdx; fy += fdy;
-                        const uchar *sl = textureData + bytesPerLine * y1;
-                        const uint *s1 = reinterpret_cast<const uint *>(sl);
-                        const uint *s2 = reinterpret_cast<const uint *>(sl + bytesPerLine);
-                        v_top = vld2q_lane_u32(s1 + x1, v_top, 0);
-                        v_bot = vld2q_lane_u32(s2 + x1, v_bot, 0);
-                        x1 = (fx >> 16);
-                        y1 = (fy >> 16);
-                        fx += fdx; fy += fdy;
-                        sl = textureData + bytesPerLine * y1;
-                        s1 = reinterpret_cast<const uint *>(sl);
-                        s2 = reinterpret_cast<const uint *>(sl + bytesPerLine);
-                        v_top = vld2q_lane_u32(s1 + x1, v_top, 1);
-                        v_bot = vld2q_lane_u32(s2 + x1, v_bot, 1);
-                        x1 = (fx >> 16);
-                        y1 = (fy >> 16);
-                        fx += fdx; fy += fdy;
-                        sl = textureData + bytesPerLine * y1;
-                        s1 = reinterpret_cast<const uint *>(sl);
-                        s2 = reinterpret_cast<const uint *>(sl + bytesPerLine);
-                        v_top = vld2q_lane_u32(s1 + x1, v_top, 2);
-                        v_bot = vld2q_lane_u32(s2 + x1, v_bot, 2);
-                        x1 = (fx >> 16);
-                        y1 = (fy >> 16);
-                        fx += fdx; fy += fdy;
-                        sl = textureData + bytesPerLine * y1;
-                        s1 = reinterpret_cast<const uint *>(sl);
-                        s2 = reinterpret_cast<const uint *>(sl + bytesPerLine);
-                        v_top = vld2q_lane_u32(s1 + x1, v_top, 3);
-                        v_bot = vld2q_lane_u32(s2 + x1, v_bot, 3);
-
-                        int32x4_t v_distx = vshrq_n_s32(vaddq_s32(vandq_s32(v_fx, v_ffff_mask), v_round), 12);
-                        int32x4_t v_disty = vshrq_n_s32(vaddq_s32(vandq_s32(v_fy, v_ffff_mask), v_round), 12);
-                        v_distx = vorrq_s32(v_distx, vshlq_n_s32(v_distx, 16));
-                        v_disty = vorrq_s32(v_disty, vshlq_n_s32(v_disty, 16));
-                        int16x8_t v_disty_ = vshlq_n_s16(vreinterpretq_s16_s32(v_disty), 4);
-
-                        interpolate_4_pixels_16_neon(
-                                    vreinterpretq_s16_u32(v_top.val[0]), vreinterpretq_s16_u32(v_top.val[1]),
-                                    vreinterpretq_s16_u32(v_bot.val[0]), vreinterpretq_s16_u32(v_bot.val[1]),
-                                    vreinterpretq_s16_s32(v_distx), vreinterpretq_s16_s32(v_disty),
-                                    v_disty_, colorMask, invColorMask, v_256, b);
-                        b += 4;
-                        v_fx = vaddq_s32(v_fx, v_fdx);
-                        v_fy = vaddq_s32(v_fy, v_fdy);
-                    }
+#if defined(__SSE2__) || defined(__ARM_NEON__)
+        // The optimized interpolate_4_pixels are faster than interpolate_4_pixels_16.
+        int distx = (fx & 0x0000ffff) >> 8;
+        int disty = (fy & 0x0000ffff) >> 8;
+        *b = interpolate_4_pixels(tl, tr, bl, br, distx, disty);
+#else
+        int distx = ((fx & 0x0000ffff) + 0x0800) >> 12;
+        int disty = ((fy & 0x0000ffff) + 0x0800) >> 12;
+        *b = interpolate_4_pixels_16(tl, tr, bl, br, distx, disty);
 #endif
-                }
 
-                while (b < end) {
-                    int x1 = (fx >> 16);
-                    int x2;
-                    int y1 = (fy >> 16);
-                    int y2;
+        fx += fdx;
+        fy += fdy;
+        ++b;
+    }
+}
 
-                    fetchTransformedBilinear_pixelBounds<blendType>(image_width, image_x1, image_x2, x1, x2);
-                    fetchTransformedBilinear_pixelBounds<blendType>(image_height, image_y1, image_y2, y1, y2);
 
-                    const uint *s1 = (const uint *)data->texture.scanLine(y1);
-                    const uint *s2 = (const uint *)data->texture.scanLine(y2);
+static BilinearFastTransformHelper bilinearFastTransformHelperARGB32PM[2][NFastTransformTypes] = { // indexed [tiled][transform type]
+    { // row 0: helpers instantiated for BlendTransformedBilinear
+        fetchTransformedBilinearARGB32PM_simple_upscale_helper<BlendTransformedBilinear>,
+        fetchTransformedBilinearARGB32PM_upscale_helper<BlendTransformedBilinear>,
+        fetchTransformedBilinearARGB32PM_downscale_helper<BlendTransformedBilinear>,
+        fetchTransformedBilinearARGB32PM_rotate_helper<BlendTransformedBilinear>,
+        fetchTransformedBilinearARGB32PM_fast_rotate_helper<BlendTransformedBilinear>
+    },
+    { // row 1: helpers instantiated for BlendTransformedBilinearTiled
+        fetchTransformedBilinearARGB32PM_simple_upscale_helper<BlendTransformedBilinearTiled>,
+        fetchTransformedBilinearARGB32PM_upscale_helper<BlendTransformedBilinearTiled>,
+        fetchTransformedBilinearARGB32PM_downscale_helper<BlendTransformedBilinearTiled>,
+        fetchTransformedBilinearARGB32PM_rotate_helper<BlendTransformedBilinearTiled>,
+        fetchTransformedBilinearARGB32PM_fast_rotate_helper<BlendTransformedBilinearTiled>
+    }
+};
 
-                    uint tl = s1[x1];
-                    uint tr = s1[x2];
-                    uint bl = s2[x1];
-                    uint br = s2[x2];
+template<TextureBlendType blendType> /* blendType = BlendTransformedBilinear or BlendTransformedBilinearTiled */
+static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, const Operator *,
+                                                                 const QSpanData *data, int y, int x,
+                                                                 int length)
+{
+    const qreal cx = x + qreal(0.5);
+    const qreal cy = y + qreal(0.5);
+    Q_CONSTEXPR int tiled = (blendType == BlendTransformedBilinearTiled) ? 1 : 0;
 
-#if defined(__SSE2__) || defined(__ARM_NEON__)
-                    // The optimized interpolate_4_pixels are faster than interpolate_4_pixels_16.
-                    int distx = (fx & 0x0000ffff) >> 8;
-                    int disty = (fy & 0x0000ffff) >> 8;
-                    *b = interpolate_4_pixels(tl, tr, bl, br, distx, disty);
-#else
-                    int distx = ((fx & 0x0000ffff) + 0x0800) >> 12;
-                    int disty = ((fy & 0x0000ffff) + 0x0800) >> 12;
-                    *b = interpolate_4_pixels_16(tl, tr, bl, br, distx, disty);
-#endif
+    uint *end = buffer + length;
+    uint *b = buffer;
+    if (data->fast_matrix) {
+        // The increment per x in the scanline
+        int fdx = (int)(data->m11 * fixed_scale);
+        int fdy = (int)(data->m12 * fixed_scale);
 
-                    fx += fdx;
-                    fy += fdy;
-                    ++b;
-                }
+        int fx = int((data->m21 * cy
+                      + data->m11 * cx + data->dx) * fixed_scale);
+        int fy = int((data->m22 * cy
+                      + data->m12 * cx + data->dy) * fixed_scale);
+
+        fx -= half_point;
+        fy -= half_point;
+
+        if (fdy == 0) { // simple scale, no rotation or shear
+            if (fdx <= fixed_scale && fdx > 0) {
+                // simple scale up on X without mirroring
+                bilinearFastTransformHelperARGB32PM[tiled][SimpleUpscaleTransform](b, end, data->texture, fx, fy, fdx, fdy);
+            } else if ((fdx < 0 && fdx > -(fixed_scale / 8)) || qAbs(data->m22) < qreal(1./8.)) {
+                // scale up more than 8x (on either Y or on X mirrored)
+                bilinearFastTransformHelperARGB32PM[tiled][UpscaleTransform](b, end, data->texture, fx, fy, fdx, fdy);
+            } else {
+                // scale down on X (or up on X mirrored less than 8x)
+                bilinearFastTransformHelperARGB32PM[tiled][DownscaleTransform](b, end, data->texture, fx, fy, fdx, fdy);
+            }
+        } else { // rotation or shear
+            if (qAbs(data->m11) < qreal(1./8.) || qAbs(data->m22) < qreal(1./8.) ) {
+                // if we are zooming more than 8 times, we use 8bit precision for the position.
+                bilinearFastTransformHelperARGB32PM[tiled][RotateTransform](b, end, data->texture, fx, fy, fdx, fdy);
+            } else {
+                // we are zooming less than 8x, use 4bit precision
+                bilinearFastTransformHelperARGB32PM[tiled][FastRotateTransform](b, end, data->texture, fx, fy, fdx, fdy);
             }
         }
     } else {
+        const QTextureData &image = data->texture;
+
         const qreal fdx = data->m11;
         const qreal fdy = data->m12;
         const qreal fdw = data->m13;
@@ -2491,8 +2630,8 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c
             int distx = int((px - x1) * 256);
             int disty = int((py - y1) * 256);
 
-            fetchTransformedBilinear_pixelBounds<blendType>(image_width, image_x1, image_x2, x1, x2);
-            fetchTransformedBilinear_pixelBounds<blendType>(image_height, image_y1, image_y2, y1, y2);
+            fetchTransformedBilinear_pixelBounds<blendType>(image.width, image.x1, image.x2 - 1, x1, x2);
+            fetchTransformedBilinear_pixelBounds<blendType>(image.height, image.y1, image.y2 - 1, y1, y2);
 
             const uint *s1 = (const uint *)data->texture.scanLine(y1);
             const uint *s2 = (const uint *)data->texture.scanLine(y2);
@@ -2674,7 +2813,7 @@ static const uint *QT_FASTCALL fetchTransformedBilinear(uint *buffer, const Oper
                     layout->convertToARGB32PM(buf1, buf1, len * 2, clut, 0);
                     layout->convertToARGB32PM(buf2, buf2, len * 2, clut, 0);
 
-                    if ((fdx < 0 && fdx > -(fixed_scale / 8)) || std::abs(data->m22) < (1./8.)) { // scale up more than 8x
+                    if ((fdx < 0 && fdx > -(fixed_scale / 8)) || qAbs(data->m22) < qreal(1./8.)) { // scale up more than 8x
                         int disty = (fy & 0x0000ffff) >> 8;
                         for (int i = 0; i < len; ++i) {
                             int distx = (fracX & 0x0000ffff) >> 8;
@@ -2726,7 +2865,7 @@ static const uint *QT_FASTCALL fetchTransformedBilinear(uint *buffer, const Oper
                 layout->convertToARGB32PM(buf1, buf1, len * 2, clut, 0);
                 layout->convertToARGB32PM(buf2, buf2, len * 2, clut, 0);
 
-                if (std::abs(data->m11) < (1./8.) || std::abs(data->m22) < (1./8.)) {
+                if (qAbs(data->m11) < qreal(1./8.) || qAbs(data->m22) < qreal(1./8.) ) {
                     //if we are zooming more than 8 times, we use 8bit precision for the position.
                     for (int i = 0; i < len; ++i) {
                         int distx = (fracX & 0x0000ffff) >> 8;
@@ -3603,27 +3742,23 @@ static inline Operator getOperator(const QSpanData *data, const QSpan *spans, in
 
     op.destFetch = destFetchProc[data->rasterBuffer->format];
     op.destFetch64 = destFetchProc64[data->rasterBuffer->format];
-    if (op.mode == QPainter::CompositionMode_Source) {
-        switch (data->rasterBuffer->format) {
-        case QImage::Format_RGB32:
-        case QImage::Format_ARGB32_Premultiplied:
-            // don't clear destFetch as it sets up the pointer correctly to save one copy
-            break;
-        default: {
-            if (data->type == QSpanData::Texture && data->texture.const_alpha != 256)
+    if (op.mode == QPainter::CompositionMode_Source &&
+            (data->type != QSpanData::Texture || data->texture.const_alpha == 256)) {
+        const QSpan *lastSpan = spans + spanCount;
+        bool alphaSpans = false;
+        while (spans < lastSpan) {
+            if (spans->coverage != 255) {
+                alphaSpans = true;
                 break;
-            const QSpan *lastSpan = spans + spanCount;
-            bool alphaSpans = false;
-            while (spans < lastSpan) {
-                if (spans->coverage != 255) {
-                    alphaSpans = true;
-                    break;
-                }
-                ++spans;
             }
-            if (!alphaSpans)
-                op.destFetch = 0;
+            ++spans;
         }
+        if (!alphaSpans) {
+            // If all spans are opaque we do not need to fetch dest.
+            // But don't clear passthrough destFetch as they are just as fast and save destStore.
+            if (op.destFetch != destFetchARGB32P)
+                op.destFetch = 0;
+            op.destFetch64 = destFetch64Undefined;
         }
     }
 
@@ -5191,6 +5326,8 @@ void qBlendTexture(int count, const QSpan *spans, void *userData)
     case QImage::Format_RGB16:
         proc = processTextureSpansRGB16[blendType];
         break;
+    case QImage::Format_ARGB32:
+    case QImage::Format_RGBA8888:
     case QImage::Format_BGR30:
     case QImage::Format_A2BGR30_Premultiplied:
     case QImage::Format_RGB30:
@@ -5403,134 +5540,200 @@ inline static void qt_bitmapblit_quint16(QRasterBuffer *rasterBuffer,
                                     map, mapWidth, mapHeight, mapStride);
 }
 
-static void qt_alphamapblit_quint16(QRasterBuffer *rasterBuffer,
+static inline void alphamapblend_generic(int coverage, QRgba64 *dest, int x, const QRgba64 &srcLinear, const QRgba64 &src, const QColorProfile *colorProfile)
+{ // Blends one source pixel into dest[x] weighted by coverage; colorProfile may be null (no conversion).
+    if (coverage == 0) {
+        // nothing: zero coverage leaves dest[x] unchanged
+    } else if (coverage == 255) {
+        dest[x] = src; // full coverage: overwrite with src (not srcLinear), no blending
+    } else {
+        QRgba64 dstColor = dest[x];
+        if (colorProfile) { // convert the destination through the profile's toLinear() first
+            if (dstColor.isOpaque())
+                dstColor = colorProfile->toLinear(dstColor);
+            else if (!dstColor.isTransparent()) // translucent: unpremultiply, convert, premultiply again
+                dstColor = colorProfile->toLinear(dstColor.unpremultiplied()).premultiplied();
+        }
+
+        dstColor = interpolate255(srcLinear, coverage, dstColor, 255 - coverage); // weights: coverage for source, 255 - coverage for destination
+        if (colorProfile) { // convert the blended result back through fromLinear()
+            if (dstColor.isOpaque())
+                dstColor = colorProfile->fromLinear(dstColor);
+            else if (!dstColor.isTransparent())
+                dstColor = colorProfile->fromLinear(dstColor.unpremultiplied()).premultiplied();
+        }
+        dest[x] = dstColor;
+    }
+}
+
+static void qt_alphamapblit_generic(QRasterBuffer *rasterBuffer,
                                     int x, int y, const QRgba64 &color,
                                     const uchar *map,
                                     int mapWidth, int mapHeight, int mapStride,
-                                    const QClipData *)
+                                    const QClipData *clip, bool useGammaCorrection)
 {
-    const quint16 c = color.toRgb16();
-    quint16 *dest = reinterpret_cast<quint16*>(rasterBuffer->scanLine(y)) + x;
-    const int destStride = rasterBuffer->bytesPerLine() / sizeof(quint16);
+    if (color.isTransparent())
+        return; // nothing to draw
 
-    while (mapHeight--) {
-        for (int i = 0; i < mapWidth; ++i) {
-            const int coverage = map[i];
+    const QColorProfile *colorProfile = nullptr; // stays null unless gamma correction is requested
 
-            if (coverage == 0) {
-                // nothing
-            } else if (coverage == 255) {
-                dest[i] = c;
-            } else {
-                int ialpha = 255 - coverage;
-                dest[i] = BYTE_MUL_RGB16(c, coverage)
-                          + BYTE_MUL_RGB16(dest[i], ialpha);
+    if (useGammaCorrection)
+        colorProfile = QGuiApplicationPrivate::instance()->colorProfileForA8Text();
+
+    QRgba64 srcColor = color; // source used for partial coverage; converted with toLinear() when a profile is set
+    if (colorProfile) {
+        if (color.isOpaque())
+            srcColor = colorProfile->toLinear(srcColor);
+        else
+            srcColor = colorProfile->toLinear(srcColor.unpremultiplied()).premultiplied();
+    }
+
+    quint64 buffer[buffer_size]; // scratch space handed to destFetch64 for one chunk of 64-bit pixels
+    const DestFetchProc64 destFetch64 = destFetchProc64[rasterBuffer->format];
+    const DestStoreProc64 destStore64 = destStoreProc64[rasterBuffer->format];
+
+    if (!clip) { // unclipped: process each map row in chunks of at most buffer_size pixels
+        for (int ly = 0; ly < mapHeight; ++ly) {
+            int i = x;
+            int length = mapWidth;
+            while (length > 0) {
+                int l = qMin(buffer_size, length);
+                QRgba64 *dest = destFetch64((QRgba64*)buffer, rasterBuffer, i, y + ly, l);
+                for (int j=0; j < l; ++j) {
+                    const int coverage = map[j + (i - x)];
+                    alphamapblend_generic(coverage, dest, j, srcColor, color, colorProfile);
+                }
+                destStore64(rasterBuffer, i, y + ly, dest, l);
+                length -= l;
+                i += l;
             }
+            map += mapStride;
         }
-        dest += destStride;
-        map += mapStride;
-    }
-}
+    } else { // clipped: only touch the overlap of the map with each clip span
+        int bottom = qMin(y + mapHeight, rasterBuffer->height());
 
-static inline void rgbBlendPixel(quint32 *dst, int coverage, int sr, int sg, int sb, const uchar *gamma, const uchar *invgamma)
-{
-    // Do a gray alphablend...
-    int da = qAlpha(*dst);
-    int dr = qRed(*dst);
-    int dg = qGreen(*dst);
-    int db = qBlue(*dst);
+        int top = qMax(y, 0);
+        map += (top - y) * mapStride; // skip the map rows above 'top'
+
+        const_cast<QClipData *>(clip)->initialize();
+        for (int yp = top; yp<bottom; ++yp) {
+            const QClipData::ClipLine &line = clip->m_clipLines[yp];
 
-    if (da != 255
-        ) {
+            for (int i=0; i<line.count; ++i) {
+                const QSpan &clip = line.spans[i];
 
-        int a = qGray(coverage);
-        sr = qt_div_255(invgamma[sr] * a);
-        sg = qt_div_255(invgamma[sg] * a);
-        sb = qt_div_255(invgamma[sb] * a);
+                const int start = qMax<int>(x, clip.x);
+                const int end = qMin<int>(x + mapWidth, clip.x + clip.len);
+                if (end <= start) // this clip span does not overlap the alpha map horizontally
+                    continue;
 
-        int ia = 255 - a;
-        dr = qt_div_255(dr * ia);
-        dg = qt_div_255(dg * ia);
-        db = qt_div_255(db * ia);
+                Q_ASSERT(end - start <= buffer_size); // only the overlap [start, end) is fetched, blended and stored
+                QRgba64 *dest = destFetch64((QRgba64*)buffer, rasterBuffer, start, clip.y, end - start);
+                for (int xp = start; xp < end; ++xp)
+                    alphamapblend_generic(map[xp - x], dest, xp - start, srcColor, color, colorProfile);
+                destStore64(rasterBuffer, start, clip.y, dest, end - start);
+            } // for (i -> line.count)
+            map += mapStride;
+        } // for (yp -> bottom)
+    }
+}
 
-        *dst = ((a + qt_div_255((255 - a) * da)) << 24)
-            |  ((sr + dr) << 16)
-            |  ((sg + dg) << 8)
-            |  ((sb + db));
+static inline void alphamapblend_quint16(int coverage, quint16 *dest, int x, const quint16 srcColor)
+{ // Blends srcColor into dest[x] weighted by coverage; 0 and 255 are special-cased.
+    if (coverage == 0) {
+        // nothing: zero coverage leaves dest[x] unchanged
+    } else if (coverage == 255) {
+        dest[x] = srcColor; // full coverage: plain overwrite
+    } else {
+        dest[x] = BYTE_MUL_RGB16(srcColor, coverage) // source term, weight = coverage
+                + BYTE_MUL_RGB16(dest[x], 255 - coverage); // destination term, weight = 255 - coverage
+    }
+}
+
+void qt_alphamapblit_quint16(QRasterBuffer *rasterBuffer,
+                             int x, int y, const QRgba64 &color,
+                             const uchar *map,
+                             int mapWidth, int mapHeight, int mapStride,
+                             const QClipData *clip, bool useGammaCorrection)
+{ // Blends 'color' through the 8-bit coverage map onto scanlines addressed as quint16 pixels.
+    if (useGammaCorrection) { // gamma-corrected blending is delegated to the generic path
+        qt_alphamapblit_generic(rasterBuffer, x, y, color, map, mapWidth, mapHeight, mapStride, clip, useGammaCorrection);
         return;
     }
 
-    int mr = qRed(coverage);
-    int mg = qGreen(coverage);
-    int mb = qBlue(coverage);
+    const quint16 c = color.toRgb16();
 
-    dr = gamma[dr];
-    dg = gamma[dg];
-    db = gamma[db];
+    if (!clip) { // unclipped: walk the map rows directly over the destination scanlines
+        quint16 *dest = reinterpret_cast<quint16*>(rasterBuffer->scanLine(y)) + x;
+        const int destStride = rasterBuffer->bytesPerLine() / sizeof(quint16);
+        while (mapHeight--) {
+            for (int i = 0; i < mapWidth; ++i)
+                alphamapblend_quint16(map[i], dest, i, c);
+            dest += destStride;
+            map += mapStride;
+        }
+    } else { // clipped: blend only where the map overlaps a clip span
+        int top = qMax(y, 0);
+        int bottom = qMin(y + mapHeight, rasterBuffer->height());
+        map += (top - y) * mapStride; // skip the map rows above 'top'
 
-    int nr = qt_div_255(sr * mr + dr * (255 - mr));
-    int ng = qt_div_255(sg * mg + dg * (255 - mg));
-    int nb = qt_div_255(sb * mb + db * (255 - mb));
+        const_cast<QClipData *>(clip)->initialize();
+        for (int yp = top; yp<bottom; ++yp) {
+            const QClipData::ClipLine &line = clip->m_clipLines[yp];
 
-    nr = invgamma[nr];
-    ng = invgamma[ng];
-    nb = invgamma[nb];
+            quint16 *dest = reinterpret_cast<quint16*>(rasterBuffer->scanLine(yp)); // row start: indexed by absolute x below
 
-    *dst = qRgb(nr, ng, nb);
-}
+            for (int i=0; i<line.count; ++i) {
+                const QSpan &clip = line.spans[i];
+
+                int start = qMax<int>(x, clip.x); // intersect the map's x-range with the clip span
+                int end = qMin<int>(x + mapWidth, clip.x + clip.len);
 
-#if defined(Q_OS_WIN)
-Q_GUI_EXPORT bool qt_needs_a8_gamma_correction = false;
+                for (int xp=start; xp<end; ++xp)
+                    alphamapblend_quint16(map[xp - x], dest, xp, c);
+            } // for (i -> line.count)
+            map += mapStride;
+        } // for (yp -> bottom)
+    }
+}
 
-static inline void grayBlendPixel(quint32 *dst, int coverage, int sr, int sg, int sb, const uint *gamma, const uchar *invgamma)
+static inline void rgbBlendPixel(quint32 *dst, int coverage, QRgba64 slinear, const QColorProfile *colorProfile, bool useGammaCorrection)
 {
-    // Do a gammacorrected gray alphablend...
-    int dr = qRed(*dst);
-    int dg = qGreen(*dst);
-    int db = qBlue(*dst);
+    // Do an RGB alphablend; *dst goes through colorProfile (toLinear64/fromLinear64) only when useGammaCorrection is set...
+    const QRgba64 dlinear = useGammaCorrection ? colorProfile->toLinear64(*dst) : QRgba64::fromArgb32(*dst);
 
-    dr = gamma[dr];
-    dg = gamma[dg];
-    db = gamma[db];
+    QRgba64 blend = rgbBlend(dlinear, slinear, coverage);
 
-    int alpha = coverage;
-    int ialpha = 255 - alpha;
-    int nr = qt_div_255(sr * alpha + dr * ialpha);
-    int ng = qt_div_255(sg * alpha + dg * ialpha);
-    int nb = qt_div_255(sb * alpha + db * ialpha);
+    *dst = useGammaCorrection ? colorProfile->fromLinear64(blend) : toArgb32(blend);
+}
+
+static inline void grayBlendPixel(quint32 *dst, int coverage, QRgba64 slinear, const QColorProfile *colorProfile)
+{
+    // Do a gamma-corrected gray alphablend: convert *dst with toLinear64(), blend, convert back with fromLinear64()...
+    const QRgba64 dlinear = colorProfile->toLinear64(*dst);
 
-    nr = invgamma[nr];
-    ng = invgamma[ng];
-    nb = invgamma[nb];
+    QRgba64 blend = interpolate255(slinear, coverage, dlinear, 255 - coverage); // weights: coverage for source, 255 - coverage for destination
 
-    *dst = qRgb(nr, ng, nb);
+    *dst = colorProfile->fromLinear64(blend);
 }
-#endif
 
 static void qt_alphamapblit_uint32(QRasterBuffer *rasterBuffer,
                                    int x, int y, quint32 color,
                                    const uchar *map,
                                    int mapWidth, int mapHeight, int mapStride,
-                                   const QClipData *clip)
+                                   const QClipData *clip, bool useGammaCorrection)
 {
     const quint32 c = color;
     const int destStride = rasterBuffer->bytesPerLine() / sizeof(quint32);
 
-#if defined(Q_OS_WIN)
-    const QDrawHelperGammaTables *tables = QGuiApplicationPrivate::instance()->gammaTables();
-    if (!tables)
+    const QColorProfile *colorProfile = QGuiApplicationPrivate::instance()->colorProfileForA8Text();
+    if (!colorProfile)
         return;
 
-    const uint *gamma = tables->qt_pow_gamma;
-    const uchar *invgamma = tables->qt_pow_invgamma;
-
-    int sr = gamma[qRed(color)];
-    int sg = gamma[qGreen(color)];
-    int sb = gamma[qBlue(color)];
+    const QRgba64 slinear = colorProfile->toLinear64(c);
 
     bool opaque_src = (qAlpha(color) == 255);
-    bool doGrayBlendPixel = opaque_src && qt_needs_a8_gamma_correction;
-#endif
+    bool doGrayBlendPixel = opaque_src && useGammaCorrection;
 
     if (!clip) {
         quint32 *dest = reinterpret_cast<quint32*>(rasterBuffer->scanLine(y)) + x;
@@ -5543,13 +5746,9 @@ static void qt_alphamapblit_uint32(QRasterBuffer *rasterBuffer,
                 } else if (coverage == 255) {
                     dest[i] = c;
                 } else {
-#if defined(Q_OS_WIN)
-                    if (QSysInfo::WindowsVersion >= QSysInfo::WV_XP && doGrayBlendPixel
-                        && qAlpha(dest[i]) == 255) {
-                        grayBlendPixel(dest+i, coverage, sr, sg, sb, gamma, invgamma);
-                    } else
-#endif
-                    {
+                    if (doGrayBlendPixel && qAlpha(dest[i]) == 255) {
+                        grayBlendPixel(dest+i, coverage, slinear, colorProfile);
+                    } else {
                         int ialpha = 255 - coverage;
                         dest[i] = INTERPOLATE_PIXEL_255(c, coverage, dest[i], ialpha);
                     }
@@ -5584,13 +5783,9 @@ static void qt_alphamapblit_uint32(QRasterBuffer *rasterBuffer,
                     } else if (coverage == 255) {
                         dest[xp] = c;
                     } else {
-#if defined(Q_OS_WIN)
-                        if (QSysInfo::WindowsVersion >= QSysInfo::WV_XP && doGrayBlendPixel
-                            && qAlpha(dest[xp]) == 255) {
-                            grayBlendPixel(dest+xp, coverage, sr, sg, sb, gamma, invgamma);
-                        } else
-#endif
-                        {
+                        if (doGrayBlendPixel && qAlpha(dest[xp]) == 255) {
+                            grayBlendPixel(dest+xp, coverage, slinear, colorProfile);
+                        } else {
                             int ialpha = 255 - coverage;
                             dest[xp] = INTERPOLATE_PIXEL_255(c, coverage, dest[xp], ialpha);
                         }
@@ -5608,9 +5803,9 @@ static void qt_alphamapblit_argb32(QRasterBuffer *rasterBuffer,
                                    int x, int y, const QRgba64 &color,
                                    const uchar *map,
                                    int mapWidth, int mapHeight, int mapStride,
-                                   const QClipData *clip)
+                                   const QClipData *clip, bool useGammaCorrection)
 {
-    qt_alphamapblit_uint32(rasterBuffer, x, y, color.toArgb32(), map, mapWidth, mapHeight, mapStride, clip);
+    qt_alphamapblit_uint32(rasterBuffer, x, y, color.toArgb32(), map, mapWidth, mapHeight, mapStride, clip, useGammaCorrection);
 }
 
 #if Q_BYTE_ORDER == Q_LITTLE_ENDIAN
@@ -5618,38 +5813,132 @@ static void qt_alphamapblit_rgba8888(QRasterBuffer *rasterBuffer,
                                      int x, int y, const QRgba64 &color,
                                      const uchar *map,
                                      int mapWidth, int mapHeight, int mapStride,
-                                     const QClipData *clip)
+                                     const QClipData *clip, bool useGammaCorrection)
 {
-    qt_alphamapblit_uint32(rasterBuffer, x, y, ARGB2RGBA(color.toArgb32()), map, mapWidth, mapHeight, mapStride, clip);
+    qt_alphamapblit_uint32(rasterBuffer, x, y, ARGB2RGBA(color.toArgb32()), map, mapWidth, mapHeight, mapStride, clip, useGammaCorrection);
 }
 #endif
 
-static void qt_alphargbblit_argb32(QRasterBuffer *rasterBuffer,
-                                   int x, int y, const QRgba64 &color,
-                                   const uint *src, int mapWidth, int mapHeight, int srcStride,
-                                   const QClipData *clip)
+static inline int qRgbAvg(QRgb rgb) // collapses the red/green/blue channels of rgb into a single value
 {
-    const quint32 c = color.toArgb32();
+    return (qRed(rgb) * 5 + qGreen(rgb) * 6 + qBlue(rgb) * 5) / 16; // weights 5/16, 6/16, 5/16 (sum 1); integer division truncates
+}
 
-    int sr = qRed(c);
-    int sg = qGreen(c);
-    int sb = qBlue(c);
-    int sa = qAlpha(c);
+static inline void alphargbblend_generic(uint coverage, QRgba64 *dest, int x, const QRgba64 &srcLinear, const QRgba64 &src, const QColorProfile *colorProfile) // blends one pixel, dest[x], under a packed per-channel coverage mask; colorProfile may be null
+{
+    if (coverage == 0xff000000) {
+        // No coverage in any colour channel: dest[x] is left untouched.
+    } else if (coverage == 0xffffffff) {
+        dest[x] = src; // full coverage: store the unconverted source colour directly
+    } else {
+        QRgba64 dstColor = dest[x];
+        if (dstColor.isOpaque()) {
+            if (colorProfile)
+                dstColor = colorProfile->toLinear(dstColor);
+            dstColor = rgbBlend(dstColor, srcLinear, coverage); // per-channel blend driven by the packed mask
+            if (colorProfile)
+                dstColor = colorProfile->fromLinear(dstColor);
+            dest[x] = dstColor;
+        } else {
+            // Non-opaque destination: give up on per-channel blending and do a gray alphablend.
+            if (colorProfile && !dstColor.isTransparent())
+                dstColor = colorProfile->toLinear(dstColor.unpremultiplied()).premultiplied();
+            const int a = qRgbAvg(coverage); // single weight derived from the mask's colour channels
+            dstColor = interpolate255(srcLinear, a, dstColor, 255 - a); // weights must sum to 255; the packed mask itself is not a weight
+            if (colorProfile && !dstColor.isTransparent())
+                dstColor = colorProfile->fromLinear(dstColor.unpremultiplied()).premultiplied();
+            dest[x] = dstColor;
+        }
+    }
+}
 
-    const QDrawHelperGammaTables *tables = QGuiApplicationPrivate::instance()->gammaTables();
-    if (!tables)
+static void qt_alphargbblit_generic(QRasterBuffer *rasterBuffer,
+                                    int x, int y, const QRgba64 &color,
+                                    const uint *src, int mapWidth, int mapHeight, int srcStride,
+                                    const QClipData *clip, bool useGammaCorrection)
+{
+    if (color.isTransparent())
         return;
 
-    const uchar *gamma = tables->qt_pow_rgb_gamma;
-    const uchar *invgamma = tables->qt_pow_rgb_invgamma;
+    const QColorProfile *colorProfile = nullptr; // stays null when no gamma correction is requested
+
+    if (useGammaCorrection)
+        colorProfile = QGuiApplicationPrivate::instance()->colorProfileForA8Text();
+
+    QRgba64 srcColor = color; // source colour handed to the blend; converted below when a profile is in use
+    if (colorProfile) {
+        if (color.isOpaque())
+            srcColor = colorProfile->toLinear(srcColor);
+        else
+            srcColor = colorProfile->toLinear(srcColor.unpremultiplied()).premultiplied();
+    }
+
+    quint64 buffer[buffer_size]; // scratch storage for at most buffer_size 64-bit pixels per fetch
+    const DestFetchProc64 destFetch64 = destFetchProc64[rasterBuffer->format];
+    const DestStoreProc64 destStore64 = destStoreProc64[rasterBuffer->format];
+
+    if (!clip) {
+        for (int ly = 0; ly < mapHeight; ++ly) {
+            int i = x;
+            int length = mapWidth;
+            while (length > 0) { // walk the row in chunks of at most buffer_size pixels
+                int l = qMin(buffer_size, length);
+                QRgba64 *dest = destFetch64((QRgba64*)buffer, rasterBuffer, i, y + ly, l);
+                for (int j=0; j < l; ++j) {
+                    const uint coverage = src[j + (i - x)];
+                    alphargbblend_generic(coverage, dest, j, srcColor, color, colorProfile);
+                }
+                destStore64(rasterBuffer, i, y + ly, dest, l);
+                length -= l;
+                i += l;
+            }
+            src += srcStride;
+        }
+    } else {
+        int bottom = qMin(y + mapHeight, rasterBuffer->height());
+
+        int top = qMax(y, 0);
+        src += (top - y) * srcStride; // skip coverage rows that lie above row 0
+
+        const_cast<QClipData *>(clip)->initialize();
+        for (int yp = top; yp<bottom; ++yp) {
+            const QClipData::ClipLine &line = clip->m_clipLines[yp];
+
+            for (int i=0; i<line.count; ++i) {
+                const QSpan &clip = line.spans[i]; // note: shadows the function parameter of the same name
+
+                int start = qMax<int>(x, clip.x);
+                int end = qMin<int>(x + mapWidth, clip.x + clip.len);
+                if (end <= start) continue; // this span does not overlap the glyph horizontally
+                Q_ASSERT(end - start <= buffer_size);
+                QRgba64 *dest = destFetch64((QRgba64*)buffer, rasterBuffer, start, clip.y, end - start); // only the overlap, never past the span end
+                for (int xp=start; xp<end; ++xp) {
+                    const uint coverage = src[xp - x];
+                    alphargbblend_generic(coverage, dest, xp - start, srcColor, color, colorProfile);
+                }
+                destStore64(rasterBuffer, start, clip.y, dest, end - start); // write back exactly what was fetched
+            } // for (i -> line.count)
+            src += srcStride;
+        } // for (yp -> bottom)
+    }
+}
+
+static void qt_alphargbblit_argb32(QRasterBuffer *rasterBuffer,
+                                   int x, int y, const QRgba64 &color,
+                                   const uint *src, int mapWidth, int mapHeight, int srcStride,
+                                   const QClipData *clip, bool useGammaCorrection)
+{
+    if (color.isTransparent())
+        return;
 
-    sr = gamma[sr];
-    sg = gamma[sg];
-    sb = gamma[sb];
+    const quint32 c = color.toArgb32();
 
-    if (sa == 0)
+    const QColorProfile *colorProfile = QGuiApplicationPrivate::instance()->colorProfileForA32Text();
+    if (!colorProfile)
         return;
 
+    const QRgba64 slinear = useGammaCorrection ? colorProfile->toLinear64(c) : color;
+
     if (!clip) {
         quint32 *dst = reinterpret_cast<quint32*>(rasterBuffer->scanLine(y)) + x;
         const int destStride = rasterBuffer->bytesPerLine() / sizeof(quint32);
@@ -5659,7 +5948,16 @@ static void qt_alphargbblit_argb32(QRasterBuffer *rasterBuffer,
                 if (coverage == 0xffffffff) {
                     dst[i] = c;
                 } else if (coverage != 0xff000000) {
-                    rgbBlendPixel(dst+i, coverage, sr, sg, sb, gamma, invgamma);
+                    if (dst[i] >= 0xff000000) {
+                        rgbBlendPixel(dst+i, coverage, slinear, colorProfile, useGammaCorrection);
+                    } else {
+                        // Give up and do a gray blend.
+                        const int a = qRgbAvg(coverage);
+                        if (useGammaCorrection)
+                            grayBlendPixel(dst+i, a, slinear, colorProfile);
+                        else
+                            dst[i] = INTERPOLATE_PIXEL_255(c, a, dst[i], 255 - a);
+                    }
                 }
             }
 
@@ -5689,7 +5987,16 @@ static void qt_alphargbblit_argb32(QRasterBuffer *rasterBuffer,
                     if (coverage == 0xffffffff) {
                         dst[xp] = c;
                     } else if (coverage != 0xff000000) {
-                        rgbBlendPixel(dst+xp, coverage, sr, sg, sb, gamma, invgamma);
+                        if (dst[xp] >= 0xff000000) { // top byte is 0xff: per-channel blend
+                            rgbBlendPixel(dst+xp, coverage, slinear, colorProfile, useGammaCorrection);
+                        } else {
+                            // Top byte of the destination is not 0xff: give up and do a gray blend.
+                            const int a = qRgbAvg(coverage); // single weight derived from the packed mask
+                            if (useGammaCorrection)
+                                grayBlendPixel(dst+xp, a, slinear, colorProfile);
+                            else
+                                dst[xp] = INTERPOLATE_PIXEL_255(c, a, dst[xp], 255 - a); // complement of a, not of the packed mask
+                        }
                     }
                 }
             } // for (i -> line.count)
@@ -5822,56 +6129,80 @@ DrawHelper qDrawHelper[QImage::NImageFormats] =
         qt_gradient_quint16,
         qt_bitmapblit_quint16,
         qt_alphamapblit_quint16,
-        0,
+        qt_alphargbblit_generic,
         qt_rectfill_quint16
     },
     // Format_ARGB8565_Premultiplied
     {
         blend_color_generic,
         blend_src_generic,
-        0, 0, 0, 0
+        0,
+        qt_alphamapblit_generic,
+        qt_alphargbblit_generic,
+        0
     },
     // Format_RGB666
     {
         blend_color_generic,
         blend_src_generic,
-        0, 0, 0, 0
+        0,
+        qt_alphamapblit_generic,
+        qt_alphargbblit_generic,
+        0
     },
     // Format_ARGB6666_Premultiplied
     {
         blend_color_generic,
         blend_src_generic,
-        0, 0, 0, 0
+        0,
+        qt_alphamapblit_generic,
+        qt_alphargbblit_generic,
+        0
     },
     // Format_RGB555
     {
         blend_color_generic,
         blend_src_generic,
-        0, 0, 0, 0
+        0,
+        qt_alphamapblit_generic,
+        qt_alphargbblit_generic,
+        0
     },
     // Format_ARGB8555_Premultiplied
     {
         blend_color_generic,
         blend_src_generic,
-        0, 0, 0, 0
+        0,
+        qt_alphamapblit_generic,
+        qt_alphargbblit_generic,
+        0
     },
     // Format_RGB888
     {
         blend_color_generic,
         blend_src_generic,
-        0, 0, 0, 0
+        0,
+        qt_alphamapblit_generic,
+        qt_alphargbblit_generic,
+        0
     },
     // Format_RGB444
     {
         blend_color_generic,
         blend_src_generic,
-        0, 0, 0, 0
+        0,
+        qt_alphamapblit_generic,
+        qt_alphargbblit_generic,
+        0
     },
     // Format_ARGB4444_Premultiplied
     {
         blend_color_generic,
         blend_src_generic,
-        0, 0, 0, 0
+        0,
+        qt_alphamapblit_generic,
+        qt_alphargbblit_generic,
+        0
     },
     // Format_RGBX8888
     {
@@ -5881,9 +6212,9 @@ DrawHelper qDrawHelper[QImage::NImageFormats] =
 #if Q_BYTE_ORDER == Q_LITTLE_ENDIAN
         qt_alphamapblit_rgba8888,
 #else
-        0,
+        qt_alphamapblit_generic,
 #endif
-        0,
+        qt_alphargbblit_generic,
         qt_rectfill_rgba
     },
     // Format_RGBA8888
@@ -5894,9 +6225,9 @@ DrawHelper qDrawHelper[QImage::NImageFormats] =
 #if Q_BYTE_ORDER == Q_LITTLE_ENDIAN
         qt_alphamapblit_rgba8888,
 #else
-        0,
+        qt_alphamapblit_generic,
 #endif
-        0,
+        qt_alphargbblit_generic,
         qt_rectfill_nonpremul_rgba
     },
     // Format_RGB8888_Premultiplied
@@ -5907,9 +6238,9 @@ DrawHelper qDrawHelper[QImage::NImageFormats] =
 #if Q_BYTE_ORDER == Q_LITTLE_ENDIAN
         qt_alphamapblit_rgba8888,
 #else
-        0,
+        qt_alphamapblit_generic,
 #endif
-        0,
+        qt_alphargbblit_generic,
         qt_rectfill_rgba
     },
     // Format_BGR30
@@ -5917,8 +6248,8 @@ DrawHelper qDrawHelper[QImage::NImageFormats] =
         blend_color_generic_rgb64,
         blend_src_generic_rgb64,
         qt_bitmapblit_rgb30<PixelOrderBGR>,
-        0,
-        0,
+        qt_alphamapblit_generic,
+        qt_alphargbblit_generic,
         qt_rectfill_rgb30<PixelOrderBGR>
     },
     // Format_A2BGR30_Premultiplied
@@ -5926,8 +6257,8 @@ DrawHelper qDrawHelper[QImage::NImageFormats] =
         blend_color_generic_rgb64,
         blend_src_generic_rgb64,
         qt_bitmapblit_rgb30<PixelOrderBGR>,
-        0,
-        0,
+        qt_alphamapblit_generic,
+        qt_alphargbblit_generic,
         qt_rectfill_rgb30<PixelOrderBGR>
     },
     // Format_RGB30
@@ -5935,8 +6266,8 @@ DrawHelper qDrawHelper[QImage::NImageFormats] =
         blend_color_generic_rgb64,
         blend_src_generic_rgb64,
         qt_bitmapblit_rgb30<PixelOrderRGB>,
-        0,
-        0,
+        qt_alphamapblit_generic,
+        qt_alphargbblit_generic,
         qt_rectfill_rgb30<PixelOrderRGB>
     },
     // Format_A2RGB30_Premultiplied
@@ -5944,22 +6275,26 @@ DrawHelper qDrawHelper[QImage::NImageFormats] =
         blend_color_generic_rgb64,
         blend_src_generic_rgb64,
         qt_bitmapblit_rgb30<PixelOrderRGB>,
-        0,
-        0,
+        qt_alphamapblit_generic,
+        qt_alphargbblit_generic,
         qt_rectfill_rgb30<PixelOrderRGB>
     },
     // Format_Alpha8
     {
         blend_color_generic,
         blend_src_generic,
-        0, 0, 0,
+        0,
+        qt_alphamapblit_generic,
+        qt_alphargbblit_generic,
         qt_rectfill_alpha
     },
     // Format_Grayscale8
     {
         blend_color_generic,
         blend_src_generic,
-        0, 0, 0,
+        0,
+        qt_alphamapblit_generic,
+        qt_alphargbblit_generic,
         qt_rectfill_gray
     },
 };
@@ -6117,20 +6452,18 @@ static void qInitDrawhelperFunctions()
 
 #if defined(QT_COMPILER_SUPPORTS_SSE4_1)
     if (qCpuHasFeature(SSE4_1)) {
-#if !defined(__SSE4_1__)
         extern const uint *QT_FASTCALL convertARGB32ToARGB32PM_sse4(uint *buffer, const uint *src, int count,
                                                                     const QVector<QRgb> *, QDitherInfo *);
         extern const uint *QT_FASTCALL convertRGBA8888ToARGB32PM_sse4(uint *buffer, const uint *src, int count,
                                                                       const QVector<QRgb> *, QDitherInfo *);
-        qPixelLayouts[QImage::Format_ARGB32].convertToARGB32PM = convertARGB32ToARGB32PM_sse4;
-        qPixelLayouts[QImage::Format_RGBA8888].convertToARGB32PM = convertRGBA8888ToARGB32PM_sse4;
-#endif
         extern const uint *QT_FASTCALL convertARGB32FromARGB32PM_sse4(uint *buffer, const uint *src, int count,
                                                                       const QVector<QRgb> *, QDitherInfo *);
         extern const uint *QT_FASTCALL convertRGBA8888FromARGB32PM_sse4(uint *buffer, const uint *src, int count,
                                                                         const QVector<QRgb> *, QDitherInfo *);
         extern const uint *QT_FASTCALL convertRGBXFromARGB32PM_sse4(uint *buffer, const uint *src, int count,
                                                                     const QVector<QRgb> *, QDitherInfo *);
+        qPixelLayouts[QImage::Format_ARGB32].convertToARGB32PM = convertARGB32ToARGB32PM_sse4;
+        qPixelLayouts[QImage::Format_RGBA8888].convertToARGB32PM = convertRGBA8888ToARGB32PM_sse4;
         qPixelLayouts[QImage::Format_ARGB32].convertFromARGB32PM = convertARGB32FromARGB32PM_sse4;
         qPixelLayouts[QImage::Format_RGBA8888].convertFromARGB32PM = convertRGBA8888FromARGB32PM_sse4;
         qPixelLayouts[QImage::Format_RGBX8888].convertFromARGB32PM = convertRGBXFromARGB32PM_sse4;
@@ -6141,14 +6474,6 @@ static void qInitDrawhelperFunctions()
 
 #if defined(QT_COMPILER_SUPPORTS_AVX2)
     if (qCpuHasFeature(AVX2)) {
-#if !defined(__AVX2__)
-        extern const uint *QT_FASTCALL convertARGB32ToARGB32PM_avx2(uint *buffer, const uint *src, int count,
-                                                                    const QVector<QRgb> *, QDitherInfo *);
-        extern const uint *QT_FASTCALL convertRGBA8888ToARGB32PM_avx2(uint *buffer, const uint *src, int count,
-                                                                      const QVector<QRgb> *, QDitherInfo *);
-        qPixelLayouts[QImage::Format_ARGB32].convertToARGB32PM = convertARGB32ToARGB32PM_avx2;
-        qPixelLayouts[QImage::Format_RGBA8888].convertToARGB32PM = convertRGBA8888ToARGB32PM_avx2;
-#endif
         extern void qt_blend_rgb32_on_rgb32_avx2(uchar *destPixels, int dbpl,
                                                  const uchar *srcPixels, int sbpl,
                                                  int w, int h, int const_alpha);
@@ -6170,6 +6495,17 @@ static void qInitDrawhelperFunctions()
         qt_functionForMode_C[QPainter::CompositionMode_SourceOver] = comp_func_SourceOver_avx2;
         qt_functionForModeSolid_C[QPainter::CompositionMode_SourceOver] = comp_func_solid_SourceOver_avx2;
         qt_functionForMode_C[QPainter::CompositionMode_Source] = comp_func_Source_avx2;
+
+        extern void QT_FASTCALL fetchTransformedBilinearARGB32PM_simple_upscale_helper_avx2(uint *b, uint *end, const QTextureData &image,
+                                                                                            int &fx, int &fy, int fdx, int /*fdy*/);
+        extern void QT_FASTCALL fetchTransformedBilinearARGB32PM_downscale_helper_avx2(uint *b, uint *end, const QTextureData &image,
+                                                                                       int &fx, int &fy, int fdx, int /*fdy*/);
+        extern void QT_FASTCALL fetchTransformedBilinearARGB32PM_fast_rotate_helper_avx2(uint *b, uint *end, const QTextureData &image,
+                                                                                         int &fx, int &fy, int fdx, int fdy);
+
+        bilinearFastTransformHelperARGB32PM[0][SimpleUpscaleTransform] = fetchTransformedBilinearARGB32PM_simple_upscale_helper_avx2;
+        bilinearFastTransformHelperARGB32PM[0][DownscaleTransform] = fetchTransformedBilinearARGB32PM_downscale_helper_avx2;
+        bilinearFastTransformHelperARGB32PM[0][FastRotateTransform] = fetchTransformedBilinearARGB32PM_fast_rotate_helper_avx2;
     }
 #endif
 
@@ -6198,6 +6534,15 @@ static void qInitDrawhelperFunctions()
 
     sourceFetchUntransformed[QImage::Format_RGB888] = qt_fetchUntransformed_888_neon;
 
+#if defined(Q_PROCESSOR_ARM_64) && Q_BYTE_ORDER == Q_LITTLE_ENDIAN
+    extern const uint *QT_FASTCALL convertARGB32ToARGB32PM_neon(uint *buffer, const uint *src, int count,
+                                                                const QVector<QRgb> *, QDitherInfo *);
+    extern const uint *QT_FASTCALL convertRGBA8888ToARGB32PM_neon(uint *buffer, const uint *src, int count,
+                                                                  const QVector<QRgb> *, QDitherInfo *);
+    qPixelLayouts[QImage::Format_ARGB32].convertToARGB32PM = convertARGB32ToARGB32PM_neon;
+    qPixelLayouts[QImage::Format_RGBA8888].convertToARGB32PM = convertRGBA8888ToARGB32PM_neon;
+#endif
+
 #if defined(ENABLE_PIXMAN_DRAWHELPERS)
     // The RGB16 helpers are using Arm32 assemblythat has not been ported to AArch64
     qBlendFunctions[QImage::Format_RGB16][QImage::Format_ARGB32_Premultiplied] = qt_blend_argb32_on_rgb16_neon;
@@ -6215,8 +6560,8 @@ static void qInitDrawhelperFunctions()
     destFetchProc[QImage::Format_RGB16] = qt_destFetchRGB16_neon;
     destStoreProc[QImage::Format_RGB16] = qt_destStoreRGB16_neon;
 
-    qMemRotateFunctions[QImage::Format_RGB16][0] = qt_memrotate90_16_neon;
-    qMemRotateFunctions[QImage::Format_RGB16][2] = qt_memrotate270_16_neon;
+    qMemRotateFunctions[QPixelLayout::BPP16][0] = qt_memrotate90_16_neon;
+    qMemRotateFunctions[QPixelLayout::BPP16][2] = qt_memrotate270_16_neon;
 #endif
 #endif // defined(__ARM_NEON__)
 
diff --git a/src/gui/painting/qdrawhelper_avx2.cpp b/src/gui/painting/qdrawhelper_avx2.cpp
index 9c1335298e..a7e03a7bb3 100644
--- a/src/gui/painting/qdrawhelper_avx2.cpp
+++ b/src/gui/painting/qdrawhelper_avx2.cpp
@@ -44,18 +44,12 @@
 
 QT_BEGIN_NAMESPACE
 
-// Autovectorized premultiply functions:
-const uint *QT_FASTCALL convertARGB32ToARGB32PM_avx2(uint *buffer, const uint *src, int count,
-                                                     const QVector<QRgb> *, QDitherInfo *)
-{
-    return qt_convertARGB32ToARGB32PM(buffer, src, count);
-}
+static Q_CONSTEXPR int BufferSize = 2048;
 
-const uint *QT_FASTCALL convertRGBA8888ToARGB32PM_avx2(uint *buffer, const uint *src, int count,
-                                                       const QVector<QRgb> *, QDitherInfo *)
-{
-    return qt_convertRGBA8888ToARGB32PM(buffer, src, count);
-}
+enum { // fixed-point constants for the fx/fy coordinates used in this file
+    FixedScale = 1 << 16, // 1.0 in this scale: the code here takes fx >> 16 as the integer part and fx & (FixedScale - 1) as the fraction
+    HalfPoint = 1 << 15 // half of FixedScale, i.e. 0.5 in the same scale
+};
 
 // Vectorized blend functions:
 
@@ -356,6 +350,413 @@ void QT_FASTCALL comp_func_solid_SourceOver_avx2(uint *destPixels, int length, u
     }
 }
 
+#define interpolate_4_pixels_16_avx2(tlr1, tlr2, blr1, blr2, distx, disty, colorMask, v_256, b)  \
+{ \
+    /* Swap the two middle 64-bit quarters up front: the unpacklo/unpackhi below operate per 128-bit lane */ \
+    const __m256i vdistx = _mm256_permute4x64_epi64(distx, _MM_SHUFFLE(3, 1, 2, 0)); \
+    const __m256i vdisty = _mm256_permute4x64_epi64(disty, _MM_SHUFFLE(3, 1, 2, 0)); \
+    \
+    __m256i dxdy = _mm256_mullo_epi16 (vdistx, vdisty); /* dx*dy */ \
+    const __m256i distx_ = _mm256_slli_epi16(vdistx, 4); /* 16*dx */ \
+    const __m256i disty_ = _mm256_slli_epi16(vdisty, 4); /* 16*dy */ \
+    __m256i idxidy =  _mm256_add_epi16(dxdy, _mm256_sub_epi16(v_256, _mm256_add_epi16(distx_, disty_))); /* v_256 - 16*dx - 16*dy + dx*dy, i.e. (16-dx)*(16-dy) when v_256 holds 256 */ \
+    __m256i dxidy =  _mm256_sub_epi16(distx_, dxdy); /* 16*dx - dx*dy = dx*(16-dy) */ \
+    __m256i idxdy =  _mm256_sub_epi16(disty_, dxdy); /* 16*dy - dx*dy = dy*(16-dx) */ \
+ \
+    __m256i tlr1AG = _mm256_srli_epi16(tlr1, 8); /* high byte of every 16-bit half */ \
+    __m256i tlr1RB = _mm256_and_si256(tlr1, colorMask); /* bytes kept by colorMask */ \
+    __m256i tlr2AG = _mm256_srli_epi16(tlr2, 8); \
+    __m256i tlr2RB = _mm256_and_si256(tlr2, colorMask); \
+    __m256i blr1AG = _mm256_srli_epi16(blr1, 8); \
+    __m256i blr1RB = _mm256_and_si256(blr1, colorMask); \
+    __m256i blr2AG = _mm256_srli_epi16(blr2, 8); \
+    __m256i blr2RB = _mm256_and_si256(blr2, colorMask); \
+ \
+    __m256i odxidy1 = _mm256_unpacklo_epi32(idxidy, dxidy); /* interleave the two top-row weights */ \
+    __m256i odxidy2 = _mm256_unpackhi_epi32(idxidy, dxidy); \
+    tlr1AG = _mm256_mullo_epi16(tlr1AG, odxidy1); \
+    tlr1RB = _mm256_mullo_epi16(tlr1RB, odxidy1); \
+    tlr2AG = _mm256_mullo_epi16(tlr2AG, odxidy2); \
+    tlr2RB = _mm256_mullo_epi16(tlr2RB, odxidy2); \
+    __m256i odxdy1 = _mm256_unpacklo_epi32(idxdy, dxdy); /* interleave the two bottom-row weights */ \
+    __m256i odxdy2 = _mm256_unpackhi_epi32(idxdy, dxdy); \
+    blr1AG = _mm256_mullo_epi16(blr1AG, odxdy1); \
+    blr1RB = _mm256_mullo_epi16(blr1RB, odxdy1); \
+    blr2AG = _mm256_mullo_epi16(blr2AG, odxdy2); \
+    blr2RB = _mm256_mullo_epi16(blr2RB, odxdy2); \
+ \
+    /* Add the weighted values, then shift so that only 8 significant bits remain per colour channel */ \
+    __m256i topAG = _mm256_hadd_epi32(tlr1AG, tlr2AG); \
+    __m256i topRB = _mm256_hadd_epi32(tlr1RB, tlr2RB); \
+    __m256i botAG = _mm256_hadd_epi32(blr1AG, blr2AG); \
+    __m256i botRB = _mm256_hadd_epi32(blr1RB, blr2RB); \
+    __m256i rAG = _mm256_add_epi16(topAG, botAG); \
+    __m256i rRB = _mm256_add_epi16(topRB, botRB); \
+    rRB = _mm256_srli_epi16(rRB, 8); /* rAG is not shifted: the blend below takes its bytes in place */ \
+    /* hadd works per 128-bit lane, so swap the middle 64-bit quarters to restore the order */ \
+    rAG = _mm256_permute4x64_epi64(rAG, _MM_SHUFFLE(3, 1, 2, 0)); \
+    rRB = _mm256_permute4x64_epi64(rRB, _MM_SHUFFLE(3, 1, 2, 0)); \
+    _mm256_storeu_si256((__m256i*)(b), _mm256_blendv_epi8(rAG, rRB, colorMask)); /* bytes flagged by colorMask come from rRB, the rest from rAG; unaligned 256-bit store to b */ \
+}
+
+inline void fetchTransformedBilinear_pixelBounds(int, int l1, int l2, int &v1, int &v2)
+{
+    // Clamp v1 into [l1, l2] and make v2 its successor; at or beyond either
+    // limit both values collapse onto that limit. The first argument is unused.
+    const bool interior = (v1 >= l1) && (v1 < l2);
+    if (!interior)
+        v1 = (v1 < l1) ? l1 : l2;
+    v2 = interior ? v1 + 1 : v1;
+    Q_ASSERT(v1 >= l1 && v1 <= l2);
+    Q_ASSERT(v2 >= l1 && v2 <= l2);
+}
+
+void QT_FASTCALL fetchTransformedBilinearARGB32PM_simple_upscale_helper_avx2(uint *b, uint *end, const QTextureData &image,
+                                                                             int &fx, int &fy, int fdx, int /*fdy*/)
+{ // Fills [b, end) with bilinearly interpolated pixels from one pair of rows; fx is advanced by fdx per pixel, fy is only read.
+    int y1 = (fy >> 16); // integer row taken from the fixed-point fy
+    int y2;
+    fetchTransformedBilinear_pixelBounds(image.height, image.y1, image.y2 - 1, y1, y2); // clamp y1; y2 is the next row or y1 at an edge
+    const uint *s1 = (const uint *)image.scanLine(y1);
+    const uint *s2 = (const uint *)image.scanLine(y2);
+
+    int disty = (fy & 0x0000ffff) >> 8; // top 8 bits of the fractional part of fy
+    int idisty = 256 - disty;
+    int x = fx >> 16;
+    int length = end - b;
+
+    // Two passes: first interpolate vertically between row s1 and row s2 into an
+    // intermediate buffer, then interpolate horizontally between two adjacent entries of that buffer.
+
+    // intermediate_buffer[0] holds the red-blue components of each pixel, in the form 0x00RR00BB.
+    // intermediate_buffer[1] holds the alpha-green components of each pixel, in the form 0x00AA00GG.
+    // The size is BufferSize + 2: +1 for the last pixel to interpolate with, and +1 for rounding errors.
+    quint32 intermediate_buffer[2][BufferSize + 2];
+    // count is the number of intermediate_buffer entries that get filled.
+    int count = (qint64(length) * fdx + FixedScale - 1) / FixedScale + 2;
+    Q_ASSERT(count <= BufferSize + 2); // original note: length is supposed to be <= buffer_size and data->m11 < 1 in this case
+    int f = 0;
+    int lim = qMin(count, image.x2 - x);
+    if (x < image.x1) { // left of the valid columns: replicate the blended column image.x1
+        Q_ASSERT(x < image.x2);
+        uint t = s1[image.x1];
+        uint b = s2[image.x1]; // note: shadows the output pointer b inside this scope
+        quint32 rb = (((t & 0xff00ff) * idisty + (b & 0xff00ff) * disty) >> 8) & 0xff00ff;
+        quint32 ag = ((((t>>8) & 0xff00ff) * idisty + ((b>>8) & 0xff00ff) * disty) >> 8) & 0xff00ff;
+        do {
+            intermediate_buffer[0][f] = rb;
+            intermediate_buffer[1][f] = ag;
+            f++;
+            x++;
+        } while (x < image.x1 && f < lim);
+    }
+
+    const __m256i disty_ = _mm256_set1_epi16(disty);
+    const __m256i idisty_ = _mm256_set1_epi16(idisty);
+    const __m256i colorMask = _mm256_set1_epi32(0x00ff00ff);
+
+    lim -= 7; // the vector loop consumes 8 entries per iteration
+    for (; f < lim; x += 8, f += 8) {
+        // Load 8 pixels from s1 and split them into alpha-green and red-blue components
+        __m256i top = _mm256_loadu_si256((const __m256i*)((const uint *)(s1)+x));
+        __m256i topAG = _mm256_srli_epi16(top, 8);
+        __m256i topRB = _mm256_and_si256(top, colorMask);
+        // Multiply each color component by idisty
+        topAG = _mm256_mullo_epi16 (topAG, idisty_);
+        topRB = _mm256_mullo_epi16 (topRB, idisty_);
+
+        // Same for the s2 vector, weighted by disty
+        __m256i bottom = _mm256_loadu_si256((const __m256i*)((const uint *)(s2)+x));
+        __m256i bottomAG = _mm256_srli_epi16(bottom, 8);
+        __m256i bottomRB = _mm256_and_si256(bottom, colorMask);
+        bottomAG = _mm256_mullo_epi16 (bottomAG, disty_);
+        bottomRB = _mm256_mullo_epi16 (bottomRB, disty_);
+
+        // Add the values, and shift to keep only 8 significant bits per color component
+        __m256i rAG =_mm256_add_epi16(topAG, bottomAG);
+        rAG = _mm256_srli_epi16(rAG, 8);
+        _mm256_storeu_si256((__m256i*)(&intermediate_buffer[1][f]), rAG);
+        __m256i rRB =_mm256_add_epi16(topRB, bottomRB);
+        rRB = _mm256_srli_epi16(rRB, 8);
+        _mm256_storeu_si256((__m256i*)(&intermediate_buffer[0][f]), rRB);
+    }
+
+    for (; f < count; f++) { // Same as above but without SIMD, one entry at a time
+        x = qMin(x, image.x2 - 1); // never read past column image.x2 - 1
+
+        uint t = s1[x];
+        uint b = s2[x];
+
+        intermediate_buffer[0][f] = (((t & 0xff00ff) * idisty + (b & 0xff00ff) * disty) >> 8) & 0xff00ff;
+        intermediate_buffer[1][f] = ((((t>>8) & 0xff00ff) * idisty + ((b>>8) & 0xff00ff) * disty) >> 8) & 0xff00ff;
+        x++;
+    }
+    // Now interpolate the values from the intermediate_buffer to get the final result.
+    fx &= FixedScale - 1; // keep only the fraction: from here on fx >> 16 indexes intermediate_buffer from 0
+    Q_ASSERT((fx >> 16) == 0);
+
+    const __m128i v_fdx = _mm_set1_epi32(fdx * 4); // step for four output pixels
+    const __m128i v_blend = _mm_set1_epi32(0x00800080);
+    __m128i v_fx = _mm_setr_epi32(fx, fx + fdx, fx + fdx + fdx, fx + fdx + fdx + fdx);
+
+    while (b < end - 3) { // four output pixels per iteration
+        const __m128i offset = _mm_srli_epi32(v_fx, 16);
+        __m256i vrb = _mm256_i32gather_epi64((const long long *)intermediate_buffer[0], offset, 4); // each 64-bit load covers entries [offset] and [offset + 1]
+        __m256i vag = _mm256_i32gather_epi64((const long long *)intermediate_buffer[1], offset, 4);
+
+        __m128i vdx = _mm_and_si128(v_fx, _mm_set1_epi32(0x0000ffff));
+        vdx = _mm_srli_epi16(vdx, 8);
+        __m128i vidx = _mm_sub_epi32(_mm_set1_epi32(256), vdx);
+        __m256i vmulx = _mm256_castsi128_si256(_mm_unpacklo_epi32(vidx, vdx)); // weights interleaved as (256 - dx, dx) per pixel
+        vmulx = _mm256_inserti128_si256(vmulx, _mm_unpackhi_epi32(vidx, vdx), 1);
+
+        vrb = _mm256_mullo_epi32(vrb, vmulx);
+        vag = _mm256_mullo_epi32(vag, vmulx);
+
+        __m256i vrbag = _mm256_hadd_epi32(vrb, vag); // sums each weighted pair of neighbours
+        vrbag = _mm256_permute4x64_epi64(vrbag, _MM_SHUFFLE(3, 1, 2, 0));
+
+        __m128i rb = _mm256_castsi256_si128(vrbag);
+        __m128i ag = _mm256_extracti128_si256(vrbag, 1);
+        rb = _mm_srli_epi16(rb, 8);
+
+        _mm_storeu_si128((__m128i*)b, _mm_blendv_epi8(ag, rb, v_blend)); // bytes 0 and 2 of each pixel from rb, bytes 1 and 3 from ag
+
+        b += 4;
+        fx += 4 * fdx;
+        v_fx = _mm_add_epi32(v_fx, v_fdx);
+    }
+    while (b < end) { // scalar tail for the remaining output pixels
+        int x = (fx >> 16);
+
+        uint distx = (fx & 0x0000ffff) >> 8;
+        uint idistx = 256 - distx;
+
+        uint rb = ((intermediate_buffer[0][x] * idistx + intermediate_buffer[0][x + 1] * distx) >> 8) & 0xff00ff;
+        uint ag = (intermediate_buffer[1][x] * idistx + intermediate_buffer[1][x + 1] * distx) & 0xff00ff00;
+        *b = rb | ag;
+        b++;
+        fx += fdx;
+    }
+}
+
+void QT_FASTCALL fetchTransformedBilinearARGB32PM_downscale_helper_avx2(uint *b, uint *end, const QTextureData &image,
+                                                                        int &fx, int &fy, int fdx, int /*fdy*/)
+{ // Fills [b, end) with bilinear samples along one row pair: fy is read once, only fx advances by fdx per pixel.
+    int y1 = (fy >> 16); // integer part of the 16.16 fixed-point y coordinate
+    int y2;
+    fetchTransformedBilinear_pixelBounds(image.height, image.y1, image.y2 - 1, y1, y2); // helper not shown here; presumably bounds y1 and picks the second row y2
+    const uint *s1 = (const uint *)image.scanLine(y1);
+    const uint *s2 = (const uint *)image.scanLine(y2);
+    const int disty8 = (fy & 0x0000ffff) >> 8; // upper 8 bits of the y fraction, 0..255
+    const int disty4 = (disty8 + 0x08) >> 4; // same weight rounded to 0..16, used by the vector loop
+
+    const qint64 min_fx = qint64(image.x1) * FixedScale; // x limits expressed on the same scale as fx
+    const qint64 max_fx = qint64(image.x2 - 1) * FixedScale;
+    while (b < end) { // leading pixels: runs while the bounds helper returns one column for both taps
+        int x1 = (fx >> 16);
+        int x2;
+        fetchTransformedBilinear_pixelBounds(image.width, image.x1, image.x2 - 1, x1, x2);
+        if (x1 != x2)
+            break; // two distinct columns available: continue with the loops below
+        uint top = s1[x1];
+        uint bot = s2[x1];
+        *b = INTERPOLATE_PIXEL_256(top, 256 - disty8, bot, disty8); // vertical blend only
+        fx += fdx;
+        ++b;
+    }
+    uint *boundedEnd = end;
+    if (fdx > 0)
+        boundedEnd = qMin(boundedEnd, b + (max_fx - fx) / fdx); // number of steps before fx passes max_fx
+    else if (fdx < 0)
+        boundedEnd = qMin(boundedEnd, b + (min_fx - fx) / fdx); // number of steps before fx passes min_fx
+
+    // A fast middle part without boundary checks
+    const __m256i vdistShuffle = // per 128-bit half: bytes 0/4/8/12 each copied into two adjacent 16-bit slots; 0x80 selectors write zero
+        _mm256_setr_epi8(0, char(0x80), 0, char(0x80), 4, char(0x80), 4, char(0x80), 8, char(0x80), 8, char(0x80), 12, char(0x80), 12, char(0x80),
+                         0, char(0x80), 0, char(0x80), 4, char(0x80), 4, char(0x80), 8, char(0x80), 8, char(0x80), 12, char(0x80), 12, char(0x80));
+    const __m256i colorMask = _mm256_set1_epi32(0x00ff00ff);
+    const __m256i v_256 = _mm256_set1_epi16(256);
+    const __m256i v_disty = _mm256_set1_epi16(disty4);
+    const __m256i v_fdx = _mm256_set1_epi32(fdx * 8); // fx advance for one 8-pixel iteration
+    const __m256i v_fx_r = _mm256_set1_epi32(0x08); // rounding term added before the >> 4 below
+    const __m256i v_index = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+    __m256i v_fx = _mm256_set1_epi32(fx);
+    v_fx = _mm256_add_epi32(v_fx, _mm256_mullo_epi32(_mm256_set1_epi32(fdx), v_index)); // lane i = fx + i * fdx
+
+    while (b < boundedEnd - 7) { // eight output pixels per iteration
+        const __m256i offset = _mm256_srli_epi32(v_fx, 16); // integer x of each lane
+        const __m128i offsetLo = _mm256_castsi256_si128(offset);
+        const __m128i offsetHi = _mm256_extracti128_si256(offset, 1);
+        const __m256i toplo = _mm256_i32gather_epi64((const long long *)s1, offsetLo, 4); // 64-bit loads at byte offset 4 * x: pixels x and x + 1
+        const __m256i tophi = _mm256_i32gather_epi64((const long long *)s1, offsetHi, 4);
+        const __m256i botlo = _mm256_i32gather_epi64((const long long *)s2, offsetLo, 4);
+        const __m256i bothi = _mm256_i32gather_epi64((const long long *)s2, offsetHi, 4);
+
+        __m256i v_distx = _mm256_srli_epi16(v_fx, 8); // low 16-bit half of each lane now holds the 8-bit x fraction
+        v_distx = _mm256_srli_epi16(_mm256_add_epi32(v_distx, v_fx_r), 4); // rounded to a 0..16 weight
+        v_distx = _mm256_shuffle_epi8(v_distx, vdistShuffle); // each lane's weight duplicated into two 16-bit slots
+
+        interpolate_4_pixels_16_avx2(toplo, tophi, botlo, bothi, v_distx, v_disty, colorMask, v_256, b); // defined elsewhere; presumably stores 8 pixels at b
+        b += 8;
+        v_fx = _mm256_add_epi32(v_fx, v_fdx);
+    }
+    fx = _mm_extract_epi32(_mm256_castsi256_si128(v_fx) , 0); // lane 0 is the fx of the next pixel to produce
+
+    while (b < boundedEnd) { // scalar remainder, still without bounds checks
+        int x = (fx >> 16);
+        int distx8 = (fx & 0x0000ffff) >> 8;
+        *b = interpolate_4_pixels(s1 + x, s2 + x, distx8, disty8);
+        fx += fdx;
+        ++b;
+    }
+
+    while (b < end) { // trailing pixels: tap columns come from the bounds helper again
+        int x1 = (fx >> 16);
+        int x2;
+        fetchTransformedBilinear_pixelBounds(image.width, image.x1, image.x2 - 1, x1, x2);
+        uint tl = s1[x1];
+        uint tr = s1[x2];
+        uint bl = s2[x1];
+        uint br = s2[x2];
+        int distx8 = (fx & 0x0000ffff) >> 8;
+        *b = interpolate_4_pixels(tl, tr, bl, br, distx8, disty8);
+        fx += fdx;
+        ++b;
+    }
+}
+
+void QT_FASTCALL fetchTransformedBilinearARGB32PM_fast_rotate_helper_avx2(uint *b, uint *end, const QTextureData &image,
+                                                                          int &fx, int &fy, int fdx, int fdy)
+{ // Fills [b, end) with bilinear samples while both fx and fy advance (by fdx / fdy) for every output pixel.
+    const qint64 min_fx = qint64(image.x1) * FixedScale; // coordinate limits expressed on the same scale as fx / fy
+    const qint64 max_fx = qint64(image.x2 - 1) * FixedScale;
+    const qint64 min_fy = qint64(image.y1) * FixedScale;
+    const qint64 max_fy = qint64(image.y2 - 1) * FixedScale;
+    // first handle the possibly bounded part in the beginning
+    while (b < end) {
+        int x1 = (fx >> 16); // integer parts of the 16.16 fixed-point coordinates
+        int x2;
+        int y1 = (fy >> 16);
+        int y2;
+        fetchTransformedBilinear_pixelBounds(image.width, image.x1, image.x2 - 1, x1, x2); // helper not shown here; presumably bounds the first tap and picks the second
+        fetchTransformedBilinear_pixelBounds(image.height, image.y1, image.y2 - 1, y1, y2);
+        if (x1 != x2 && y1 != y2)
+            break; // both axes yield two distinct taps: continue with the loops below
+        const uint *s1 = (const uint *)image.scanLine(y1);
+        const uint *s2 = (const uint *)image.scanLine(y2);
+        uint tl = s1[x1];
+        uint tr = s1[x2];
+        uint bl = s2[x1];
+        uint br = s2[x2];
+        int distx = (fx & 0x0000ffff) >> 8; // upper 8 bits of the fractions, 0..255
+        int disty = (fy & 0x0000ffff) >> 8;
+        *b = interpolate_4_pixels(tl, tr, bl, br, distx, disty);
+        fx += fdx;
+        fy += fdy;
+        ++b;
+    }
+    uint *boundedEnd = end;
+    if (fdx > 0)
+        boundedEnd = qMin(boundedEnd, b + (max_fx - fx) / fdx); // number of steps before fx passes max_fx
+    else if (fdx < 0)
+        boundedEnd = qMin(boundedEnd, b + (min_fx - fx) / fdx); // number of steps before fx passes min_fx
+    if (fdy > 0)
+        boundedEnd = qMin(boundedEnd, b + (max_fy - fy) / fdy); // same limit for the y direction
+    else if (fdy < 0)
+        boundedEnd = qMin(boundedEnd, b + (min_fy - fy) / fdy);
+
+    // until boundedEnd we can now have a fast middle part without boundary checks
+    const __m256i vdistShuffle = // per 128-bit half: bytes 0/4/8/12 each copied into two adjacent 16-bit slots; 0x80 selectors write zero
+        _mm256_setr_epi8(0, char(0x80), 0, char(0x80), 4, char(0x80), 4, char(0x80), 8, char(0x80), 8, char(0x80), 12, char(0x80), 12, char(0x80),
+                         0, char(0x80), 0, char(0x80), 4, char(0x80), 4, char(0x80), 8, char(0x80), 8, char(0x80), 12, char(0x80), 12, char(0x80));
+    const __m256i colorMask = _mm256_set1_epi32(0x00ff00ff);
+    const __m256i v_256 = _mm256_set1_epi16(256);
+    const __m256i v_fdx = _mm256_set1_epi32(fdx * 8); // coordinate advance for one 8-pixel iteration
+    const __m256i v_fdy = _mm256_set1_epi32(fdy * 8);
+    const __m256i v_fxy_r = _mm256_set1_epi32(0x08); // rounding term added before the >> 4 below
+    const __m256i v_index = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
+    __m256i v_fx = _mm256_set1_epi32(fx);
+    __m256i v_fy = _mm256_set1_epi32(fy);
+    v_fx = _mm256_add_epi32(v_fx, _mm256_mullo_epi32(_mm256_set1_epi32(fdx), v_index)); // lane i = fx + i * fdx
+    v_fy = _mm256_add_epi32(v_fy, _mm256_mullo_epi32(_mm256_set1_epi32(fdy), v_index)); // lane i = fy + i * fdy
+
+    const uchar *textureData = image.imageData;
+    const int bytesPerLine = image.bytesPerLine;
+    const __m256i vbpl = _mm256_set1_epi16(bytesPerLine/4); // row stride in pixels as 16-bit values - NOTE(review): exact only if it fits in 16 bits; confirm callers guarantee this
+
+    while (b < boundedEnd - 7) { // eight output pixels per iteration
+        const __m256i vy = _mm256_packs_epi32(_mm256_srli_epi32(v_fy, 16), _mm256_setzero_si256()); // integer y values packed to 16 bit, four per 128-bit half
+        // 8x16bit * 8x16bit -> 8x32bit
+        __m256i offset = _mm256_unpacklo_epi16(_mm256_mullo_epi16(vy, vbpl), _mm256_mulhi_epi16(vy, vbpl)); // low/high product halves interleaved into 32-bit y * stride
+        offset = _mm256_add_epi32(offset, _mm256_srli_epi32(v_fx, 16)); // pixel index = y * stride + x
+        const __m128i offsetLo = _mm256_castsi256_si128(offset);
+        const __m128i offsetHi = _mm256_extracti128_si256(offset, 1);
+        const uint *topData = (const uint *)(textureData);
+        const uint *botData = (const uint *)(textureData + bytesPerLine); // one row further, so the same offsets address row y + 1
+        const __m256i toplo = _mm256_i32gather_epi64((const long long *)topData, offsetLo, 4); // 64-bit loads at byte offset 4 * index: pixels x and x + 1
+        const __m256i tophi = _mm256_i32gather_epi64((const long long *)topData, offsetHi, 4);
+        const __m256i botlo = _mm256_i32gather_epi64((const long long *)botData, offsetLo, 4);
+        const __m256i bothi = _mm256_i32gather_epi64((const long long *)botData, offsetHi, 4);
+
+        __m256i v_distx = _mm256_srli_epi16(v_fx, 8); // low 16-bit half of each lane now holds the 8-bit fraction
+        __m256i v_disty = _mm256_srli_epi16(v_fy, 8);
+        v_distx = _mm256_srli_epi16(_mm256_add_epi32(v_distx, v_fxy_r), 4); // rounded to 0..16 weights
+        v_disty = _mm256_srli_epi16(_mm256_add_epi32(v_disty, v_fxy_r), 4);
+        v_distx = _mm256_shuffle_epi8(v_distx, vdistShuffle); // each lane's weight duplicated into two 16-bit slots
+        v_disty = _mm256_shuffle_epi8(v_disty, vdistShuffle);
+
+        interpolate_4_pixels_16_avx2(toplo, tophi, botlo, bothi, v_distx, v_disty, colorMask, v_256, b); // defined elsewhere; presumably stores 8 pixels at b
+        b += 8;
+        v_fx = _mm256_add_epi32(v_fx, v_fdx);
+        v_fy = _mm256_add_epi32(v_fy, v_fdy);
+    }
+    fx = _mm_extract_epi32(_mm256_castsi256_si128(v_fx) , 0); // lane 0 holds the coordinates of the next pixel to produce
+    fy = _mm_extract_epi32(_mm256_castsi256_si128(v_fy) , 0);
+
+    while (b < boundedEnd) { // scalar remainder, still without bounds checks
+        int x = (fx >> 16);
+        int y = (fy >> 16);
+
+        const uint *s1 = (const uint *)image.scanLine(y);
+        const uint *s2 = (const uint *)image.scanLine(y + 1);
+
+        int distx = (fx & 0x0000ffff) >> 8;
+        int disty = (fy & 0x0000ffff) >> 8;
+        *b = interpolate_4_pixels(s1 + x, s2 + x, distx, disty);
+
+        fx += fdx;
+        fy += fdy;
+        ++b;
+    }
+
+    while (b < end) { // trailing pixels: taps come from the bounds helper again
+        int x1 = (fx >> 16);
+        int x2;
+        int y1 = (fy >> 16);
+        int y2;
+
+        fetchTransformedBilinear_pixelBounds(image.width, image.x1, image.x2 - 1, x1, x2);
+        fetchTransformedBilinear_pixelBounds(image.height, image.y1, image.y2 - 1, y1, y2);
+
+        const uint *s1 = (const uint *)image.scanLine(y1);
+        const uint *s2 = (const uint *)image.scanLine(y2);
+
+        uint tl = s1[x1];
+        uint tr = s1[x2];
+        uint bl = s2[x1];
+        uint br = s2[x2];
+
+        int distx = (fx & 0x0000ffff) >> 8;
+        int disty = (fy & 0x0000ffff) >> 8;
+        *b = interpolate_4_pixels(tl, tr, bl, br, distx, disty);
+
+        fx += fdx;
+        fy += fdy;
+        ++b;
+    }
+}
+
 QT_END_NAMESPACE
 
 #endif
diff --git a/src/gui/painting/qdrawhelper_neon.cpp b/src/gui/painting/qdrawhelper_neon.cpp
index a833520b00..4cbac009d8 100644
--- a/src/gui/painting/qdrawhelper_neon.cpp
+++ b/src/gui/painting/qdrawhelper_neon.cpp
@@ -535,12 +535,23 @@ void qt_blend_rgb32_on_rgb32_neon(uchar *destPixels, int dbpl,
 }
 
 #if defined(ENABLE_PIXMAN_DRAWHELPERS)
+extern void qt_alphamapblit_quint16(QRasterBuffer *rasterBuffer, // defined in another file; qt_alphamapblit_quint16_neon forwards to it when clip or useGammaCorrection is set
+                                    int x, int y, const QRgba64 &color,
+                                    const uchar *map,
+                                    int mapWidth, int mapHeight, int mapStride,
+                                    const QClipData *clip, bool useGammaCorrection);
+
 void qt_alphamapblit_quint16_neon(QRasterBuffer *rasterBuffer,
                                   int x, int y, const QRgba64 &color,
                                   const uchar *bitmap,
                                   int mapWidth, int mapHeight, int mapStride,
-                                  const QClipData *)
+                                  const QClipData *clip, bool useGammaCorrection)
 {
+    if (clip || useGammaCorrection) {
+        qt_alphamapblit_quint16(rasterBuffer, x, y, color, bitmap, mapWidth, mapHeight, mapStride, clip, useGammaCorrection);
+        return;
+    }
+
     quint16 *dest = reinterpret_cast<quint16*>(rasterBuffer->scanLine(y)) + x;
     const int destStride = rasterBuffer->bytesPerLine() / sizeof(quint16);
 
@@ -1069,6 +1080,67 @@ const uint * QT_FASTCALL qt_fetchUntransformed_888_neon(uint *buffer, const Oper
     return buffer;
 }
 
+#if defined(Q_PROCESSOR_ARM_64) && Q_BYTE_ORDER == Q_LITTLE_ENDIAN
+template<bool RGBA>
+static inline void convertARGBToARGB32PM_neon(uint *buffer, const uint *src, int count)
+{ // Writes count premultiplied pixels from src to buffer; with RGBA set the vector path also swaps bytes 0 and 2 of each pixel.
+    int i = 0;
+    const uint8x16_t rgbaMask  = { 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15}; // table indices that swap bytes 0 and 2 of every 32-bit pixel
+    const uint8x8_t shuffleMask = { 3, 3, 3, 3, 7, 7, 7, 7}; // replicates byte 3 (alpha) of each of two pixels over its four bytes
+    const uint32x4_t blendMask = vdupq_n_u32(0xff000000); // selects the alpha byte in the final bit-select
+
+    for (; i < count - 3; i += 4) { // four pixels per iteration
+        uint32x4_t srcVector = vld1q_u32(src + i);
+        uint32x4_t alphaVector = vshrq_n_u32(srcVector, 24);
+        uint32_t alphaSum = vaddvq_u32(alphaVector); // horizontal sum of the four alpha values
+        if (alphaSum) { // at least one pixel is not fully transparent
+            if (alphaSum != 255 * 4) { // at least one pixel is not fully opaque: multiply
+                if (RGBA)
+                    srcVector = vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(srcVector), rgbaMask));
+                const uint8x8_t s1 = vreinterpret_u8_u32(vget_low_u32(srcVector));
+                const uint8x8_t s2 = vreinterpret_u8_u32(vget_high_u32(srcVector));
+                const uint8x8_t alpha1 = vtbl1_u8(s1, shuffleMask);
+                const uint8x8_t alpha2 = vtbl1_u8(s2, shuffleMask);
+                uint16x8_t src1 = vmull_u8(s1, alpha1); // widening multiply: t = channel * alpha in 16 bits
+                uint16x8_t src2 = vmull_u8(s2, alpha2);
+                src1 = vsraq_n_u16(src1, src1, 8); // t + (t >> 8)
+                src2 = vsraq_n_u16(src2, src2, 8);
+                const uint8x8_t d1 = vrshrn_n_u16(src1, 8); // rounding narrow by 8: together approximates t / 255
+                const uint8x8_t d2 = vrshrn_n_u16(src2, 8);
+                const uint32x4_t d = vbslq_u32(blendMask, srcVector, vreinterpretq_u32_u8(vcombine_u8(d1, d2))); // keep source alpha, take multiplied colour bytes
+                vst1q_u32(buffer + i, d);
+            } else {
+                if (RGBA)
+                    vst1q_u32(buffer + i, vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(srcVector), rgbaMask)));
+                else if (buffer != src) // in-place conversion of opaque pixels needs no store
+                    vst1q_u32(buffer + i, srcVector);
+            }
+        } else {
+            vst1q_u32(buffer + i, vdupq_n_u32(0)); // all four alphas are zero: result is zero
+        }
+    }
+
+    SIMD_EPILOGUE(i, count, 3) { // macro defined elsewhere; presumably iterates over the at most 3 remaining pixels
+        uint v = qPremultiply(src[i]);
+        buffer[i] = RGBA ? RGBA2ARGB(v) : v;
+    }
+}
+
+const uint *QT_FASTCALL convertARGB32ToARGB32PM_neon(uint *buffer, const uint *src, int count,
+                                                     const QVector<QRgb> *, QDitherInfo *)
+{ // The colour-table and dither arguments are unnamed and unused.
+    convertARGBToARGB32PM_neon<false>(buffer, src, count); // RGBA = false: premultiply only, no byte swap
+    return buffer; // the result always lives in buffer
+}
+
+const uint *QT_FASTCALL convertRGBA8888ToARGB32PM_neon(uint *buffer, const uint *src, int count,
+                                                       const QVector<QRgb> *, QDitherInfo *)
+{ // The colour-table and dither arguments are unnamed and unused.
+    convertARGBToARGB32PM_neon<true>(buffer, src, count); // RGBA = true: premultiply and swap bytes 0 and 2
+    return buffer; // the result always lives in buffer
+}
+#endif
+
 QT_END_NAMESPACE
 
 #endif // __ARM_NEON__
diff --git a/src/gui/painting/qdrawhelper_neon_p.h b/src/gui/painting/qdrawhelper_neon_p.h
index 3cf949fc32..40475a9bde 100644
--- a/src/gui/painting/qdrawhelper_neon_p.h
+++ b/src/gui/painting/qdrawhelper_neon_p.h
@@ -91,7 +91,7 @@ void qt_alphamapblit_quint16_neon(QRasterBuffer *rasterBuffer,
                                   int x, int y, const QRgba64 &color,
                                   const uchar *bitmap,
                                   int mapWidth, int mapHeight, int mapStride,
-                                  const QClipData *clip);
+                                  const QClipData *clip, bool /*useGammaCorrection*/);
 
 void qt_scale_image_argb32_on_rgb16_neon(uchar *destPixels, int dbpl,
                                          const uchar *srcPixels, int sbpl, int srch,
diff --git a/src/gui/painting/qdrawhelper_p.h b/src/gui/painting/qdrawhelper_p.h
index 0e46962784..cf2213042d 100644
--- a/src/gui/painting/qdrawhelper_p.h
+++ b/src/gui/painting/qdrawhelper_p.h
@@ -113,13 +113,13 @@ typedef void (*AlphamapBlitFunc)(QRasterBuffer *rasterBuffer,
                                  int x, int y, const QRgba64 &color,
                                  const uchar *bitmap,
                                  int mapWidth, int mapHeight, int mapStride,
-                                 const QClipData *clip);
+                                 const QClipData *clip, bool useGammaCorrection);
 
 typedef void (*AlphaRGBBlitFunc)(QRasterBuffer *rasterBuffer,
                                  int x, int y, const QRgba64 &color,
                                  const uint *rgbmask,
                                  int mapWidth, int mapHeight, int mapStride,
-                                 const QClipData *clip);
+                                 const QClipData *clip, bool useGammaCorrection);
 
 typedef void (*RectFillFunc)(QRasterBuffer *rasterBuffer,
                              int x, int y, int width, int height,
@@ -159,7 +159,6 @@ struct DrawHelper {
 extern SrcOverBlendFunc qBlendFunctions[QImage::NImageFormats][QImage::NImageFormats];
 extern SrcOverScaleFunc qScaleFunctions[QImage::NImageFormats][QImage::NImageFormats];
 extern SrcOverTransformFunc qTransformFunctions[QImage::NImageFormats][QImage::NImageFormats];
-extern MemRotateFunc qMemRotateFunctions[QImage::NImageFormats][3];
 
 extern DrawHelper qDrawHelper[QImage::NImageFormats];
 
@@ -351,18 +350,6 @@ struct QSpanData
     void adjustSpanMethods();
 };
 
-struct QDrawHelperGammaTables
-{
-    explicit QDrawHelperGammaTables(qreal smoothing);
-
-    void refresh(qreal smoothing);
-
-    uchar qt_pow_rgb_gamma[256];
-    uchar qt_pow_rgb_invgamma[256];
-    uint qt_pow_gamma[256];
-    uchar qt_pow_invgamma[2048];
-};
-
 static inline uint qt_gradient_clamp(const QGradientData *data, int ipos)
 {
     if (ipos < 0 || ipos >= GRADIENT_STOPTABLE_SIZE) {
@@ -1244,6 +1231,7 @@ extern QPixelLayout qPixelLayouts[QImage::NImageFormats];
 extern const FetchPixelsFunc qFetchPixels[QPixelLayout::BPPCount];
 extern StorePixelsFunc qStorePixels[QPixelLayout::BPPCount];
 
+extern MemRotateFunc qMemRotateFunctions[QPixelLayout::BPPCount][3]; // first dimension is sized by QPixelLayout::BPPCount; the three columns presumably map to 90/180/270 degrees - confirm at the definition
 
 
 QT_END_NAMESPACE
diff --git a/src/gui/painting/qdrawhelper_sse4.cpp b/src/gui/painting/qdrawhelper_sse4.cpp
index 257bad9eca..14bfaabf09 100644
--- a/src/gui/painting/qdrawhelper_sse4.cpp
+++ b/src/gui/painting/qdrawhelper_sse4.cpp
@@ -44,16 +44,67 @@
 
 QT_BEGIN_NAMESPACE
 
+template<bool RGBA>
+static inline void convertARGBToARGB32PM_sse4(uint *buffer, const uint *src, int count)
+{ // Writes count premultiplied pixels from src to buffer; with RGBA set the vector path also swaps bytes 0 and 2 of each pixel.
+    int i = 0;
+    const __m128i alphaMask = _mm_set1_epi32(0xff000000);
+    const __m128i rgbaMask = _mm_setr_epi8(2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15); // swaps bytes 0 and 2 of every 32-bit pixel
+    const __m128i shuffleMask = _mm_setr_epi8(6, 7, 6, 7, 6, 7, 6, 7, 14, 15, 14, 15, 14, 15, 14, 15); // after unpacking to 16 bit: replicates each pixel's alpha word over its four channels
+    const __m128i half = _mm_set1_epi16(0x0080); // rounding term for the final >> 8
+    const __m128i zero = _mm_setzero_si128();
+
+    for (; i < count - 3; i += 4) { // four pixels per iteration
+        __m128i srcVector = _mm_loadu_si128((const __m128i *)&src[i]);
+        if (!_mm_testz_si128(srcVector, alphaMask)) { // some alpha byte is non-zero
+            if (!_mm_testc_si128(srcVector, alphaMask)) { // some alpha byte is not 0xff: multiply
+                if (RGBA)
+                    srcVector = _mm_shuffle_epi8(srcVector, rgbaMask);
+                __m128i src1 = _mm_unpacklo_epi8(srcVector, zero); // widen channels to 16 bit, two pixels per register
+                __m128i src2 = _mm_unpackhi_epi8(srcVector, zero);
+                __m128i alpha1 = _mm_shuffle_epi8(src1, shuffleMask);
+                __m128i alpha2 = _mm_shuffle_epi8(src2, shuffleMask);
+                src1 = _mm_mullo_epi16(src1, alpha1); // t = channel * alpha
+                src2 = _mm_mullo_epi16(src2, alpha2);
+                src1 = _mm_add_epi16(src1, _mm_srli_epi16(src1, 8)); // t + (t >> 8)
+                src2 = _mm_add_epi16(src2, _mm_srli_epi16(src2, 8));
+                src1 = _mm_add_epi16(src1, half);
+                src2 = _mm_add_epi16(src2, half);
+                src1 = _mm_srli_epi16(src1, 8); // together with the two adds above: approximates t / 255
+                src2 = _mm_srli_epi16(src2, 8);
+                src1 = _mm_blend_epi16(src1, alpha1, 0x88); // 0x88 selects words 3 and 7: put the unmodified alpha back
+                src2 = _mm_blend_epi16(src2, alpha2, 0x88);
+                srcVector = _mm_packus_epi16(src1, src2); // narrow back to 8 bit per channel
+                _mm_storeu_si128((__m128i *)&buffer[i], srcVector);
+            } else {
+                if (RGBA)
+                    _mm_storeu_si128((__m128i *)&buffer[i], _mm_shuffle_epi8(srcVector, rgbaMask));
+                else if (buffer != src) // in-place conversion of opaque pixels needs no store
+                    _mm_storeu_si128((__m128i *)&buffer[i], srcVector);
+            }
+        } else {
+            _mm_storeu_si128((__m128i *)&buffer[i], _mm_setzero_si128()); // all four alphas are zero: result is zero
+        }
+    }
+
+    SIMD_EPILOGUE(i, count, 3) { // macro defined elsewhere; presumably iterates over the at most 3 remaining pixels
+        uint v = qPremultiply(src[i]);
+        buffer[i] = RGBA ? RGBA2ARGB(v) : v;
+    }
+}
+
 const uint *QT_FASTCALL convertARGB32ToARGB32PM_sse4(uint *buffer, const uint *src, int count,
                                                      const QVector<QRgb> *, QDitherInfo *)
 {
-    return qt_convertARGB32ToARGB32PM(buffer, src, count);
+    convertARGBToARGB32PM_sse4<false>(buffer, src, count); // RGBA = false: premultiply only, no byte swap
+    return buffer; // the result always lives in buffer
 }
 
 const uint *QT_FASTCALL convertRGBA8888ToARGB32PM_sse4(uint *buffer, const uint *src, int count,
                                                        const QVector<QRgb> *, QDitherInfo *)
 {
-    return qt_convertRGBA8888ToARGB32PM(buffer, src, count);
+    convertARGBToARGB32PM_sse4<true>(buffer, src, count); // RGBA = true: premultiply and swap bytes 0 and 2
+    return buffer; // the result always lives in buffer
 }
 
 const uint *QT_FASTCALL convertARGB32FromARGB32PM_sse4(uint *buffer, const uint *src, int count,
diff --git a/src/gui/painting/qmemrotate.cpp b/src/gui/painting/qmemrotate.cpp
index 3fbae76de5..25aa6a3122 100644
--- a/src/gui/painting/qmemrotate.cpp
+++ b/src/gui/painting/qmemrotate.cpp
@@ -41,164 +41,10 @@
 
 QT_BEGIN_NAMESPACE
 
-#if QT_ROTATION_ALGORITHM == QT_ROTATION_TILED
 static const int tileSize = 32;
-#endif
-
-#if Q_BYTE_ORDER == Q_BIG_ENDIAN
-#if QT_ROTATION_ALGORITHM == QT_ROTATION_PACKED || QT_ROTATION_ALGORITHM == QT_ROTATION_TILED
-#error Big endian version not implemented for the transformed driver!
-#endif
-#endif
-
-template <class T>
-Q_STATIC_TEMPLATE_FUNCTION
-inline void qt_memrotate90_cachedRead(const T *src, int w, int h, int sstride, T *dest,
-                                      int dstride)
-{
-    const char *s = reinterpret_cast<const char*>(src);
-    char *d = reinterpret_cast<char*>(dest);
-    for (int y = 0; y < h; ++y) {
-        for (int x = w - 1; x >= 0; --x) {
-            T *destline = reinterpret_cast<T *>(d + (w - x - 1) * dstride);
-            destline[y] = src[x];
-        }
-        s += sstride;
-        src = reinterpret_cast<const T*>(s);
-    }
-}
 
 template <class T>
 Q_STATIC_TEMPLATE_FUNCTION
-inline void qt_memrotate270_cachedRead(const T *src, int w, int h, int sstride, T *dest,
-                                       int dstride)
-{
-    const char *s = reinterpret_cast<const char*>(src);
-    char *d = reinterpret_cast<char*>(dest);
-    s += (h - 1) * sstride;
-    for (int y = h - 1; y >= 0; --y) {
-        src = reinterpret_cast<const T*>(s);
-        for (int x = 0; x < w; ++x) {
-            T *destline = reinterpret_cast<T *>(d + x * dstride);
-            destline[h - y - 1] = src[x];
-        }
-        s -= sstride;
-    }
-}
-
-#if QT_ROTATION_ALGORITHM == QT_ROTATION_CACHEDWRITE
-
-template <class T>
-Q_STATIC_TEMPLATE_FUNCTION
-inline void qt_memrotate90_cachedWrite(const T *src, int w, int h, int sstride, T *dest,
-                                       int dstride)
-{
-    for (int x = w - 1; x >= 0; --x) {
-        T *d = dest + (w - x - 1) * dstride;
-        for (int y = 0; y < h; ++y) {
-            *d++ = src[y * sstride + x];
-        }
-    }
-
-}
-
-template <class T>
-Q_STATIC_TEMPLATE_FUNCTION
-inline void qt_memrotate270_cachedWrite(const T *src, int w, int h, int sstride, T *dest,
-                                        int dstride)
-{
-    for (int x = 0; x < w; ++x) {
-        T *d = dest + x * dstride;
-        for (int y = h - 1; y >= 0; --y) {
-            *d++ = src[y * sstride + x];
-        }
-    }
-}
-
-#endif // QT_ROTATION_CACHEDWRITE
-
-#if QT_ROTATION_ALGORITHM == QT_ROTATION_PACKING
-
-// TODO: packing algorithms should probably be modified on 64-bit architectures
-
-template <class T>
-Q_STATIC_TEMPLATE_FUNCTION
-inline void qt_memrotate90_packing(const T *src, int w, int h, int sstride, T *dest, int dstride)
-{
-    sstride /= sizeof(T);
-    dstride /= sizeof(T);
-
-    const int pack = sizeof(quint32) / sizeof(T);
-    const int unaligned = int((long(dest) & (sizeof(quint32)-1))) / sizeof(T);
-
-    for (int x = w - 1; x >= 0; --x) {
-        int y = 0;
-
-        for (int i = 0; i < unaligned; ++i) {
-            dest[(w - x - 1) * dstride + y] = src[y * sstride + x];
-            ++y;
-        }
-
-        quint32 *d = reinterpret_cast<quint32*>(dest + (w - x - 1) * dstride
-                                                + unaligned);
-        const int rest = (h - unaligned) % pack;
-        while (y < h - rest) {
-            quint32 c = src[y * sstride + x];
-            for (int i = 1; i < pack; ++i) {
-                c |= src[(y + i) * sstride + x] << (sizeof(int) * 8 / pack * i);
-            }
-            *d++ = c;
-            y += pack;
-        }
-
-        while (y < h) {
-            dest[(w - x - 1) * dstride + y] = src[y * sstride + x];
-            ++y;
-        }
-    }
-}
-
-template <class T>
-Q_STATIC_TEMPLATE_FUNCTION
-inline void qt_memrotate270_packing(const T *src, int w, int h, int sstride, T *dest, int dstride)
-{
-    sstride /= sizeof(T);
-    dstride /= sizeof(T);
-
-    const int pack = sizeof(quint32) / sizeof(T);
-    const int unaligned = int((long(dest) & (sizeof(quint32)-1))) / sizeof(T);
-
-    for (int x = 0; x < w; ++x) {
-        int y = h - 1;
-
-        for (int i = 0; i < unaligned; ++i) {
-            dest[x * dstride + h - y - 1] = src[y * sstride + x];
-            --y;
-        }
-
-        quint32 *d = reinterpret_cast<quint32*>(dest + x * dstride
-                                                + unaligned);
-        const int rest = (h - unaligned) % pack;
-        while (y > rest) {
-            quint32 c = src[y * sstride + x];
-            for (int i = 1; i < pack; ++i) {
-                c |= src[(y - i) * sstride + x] << (sizeof(int) * 8 / pack * i);
-            }
-            *d++ = c;
-            y -= pack;
-        }
-        while (y >= 0) {
-            dest[x * dstride + h - y - 1] = src[y * sstride + x];
-            --y;
-        }
-    }
-}
-
-#endif // QT_ROTATION_PACKING
-
-#if QT_ROTATION_ALGORITHM == QT_ROTATION_TILED
-template <class T>
-Q_STATIC_TEMPLATE_FUNCTION
 inline void qt_memrotate90_tiled(const T *src, int w, int h, int sstride, T *dest, int dstride)
 {
     sstride /= sizeof(T);
@@ -235,7 +81,7 @@ inline void qt_memrotate90_tiled(const T *src, int w, int h, int sstride, T *des
                 for (int y = starty; y < stopy; y += pack) {
                     quint32 c = src[y * sstride + x];
                     for (int i = 1; i < pack; ++i) {
-                        const int shift = (sizeof(int) * 8 / pack * i);
+                        const int shift = (sizeof(T) * 8 * i);
                         const T color = src[(y + i) * sstride + x];
                         c |= color << shift;
                     }
@@ -293,7 +139,7 @@ inline void qt_memrotate270_tiled(const T *src, int w, int h, int sstride, T *de
 
     const int pack = sizeof(quint32) / sizeof(T);
     const int unaligned =
-        qMin(uint((long(dest) & (sizeof(quint32)-1)) / sizeof(T)), uint(h));
+        qMin(uint((quintptr(dest) & (sizeof(quint32)-1)) / sizeof(T)), uint(h));
     const int restX = w % tileSize;
     const int restY = (h - unaligned) % tileSize;
     const int unoptimizedY = restY % pack;
@@ -320,10 +166,10 @@ inline void qt_memrotate270_tiled(const T *src, int w, int h, int sstride, T *de
             for (int x = startx; x < stopx; ++x) {
                 quint32 *d = reinterpret_cast<quint32*>(dest + x * dstride
                                                         + h - 1 - starty);
-                for (int y = starty; y > stopy; y -= pack) {
+                for (int y = starty; y >= stopy; y -= pack) {
                     quint32 c = src[y * sstride + x];
                     for (int i = 1; i < pack; ++i) {
-                        const int shift = (sizeof(int) * 8 / pack * i);
+                        const int shift = (sizeof(T) * 8 * i);
                         const T color = src[(y - i) * sstride + x];
                         c |= color << shift;
                     }
@@ -371,22 +217,26 @@ inline void qt_memrotate270_tiled_unpacked(const T *src, int w, int h, int sstri
     }
 }
 
-#endif // QT_ROTATION_ALGORITHM
 
 template <class T>
 Q_STATIC_TEMPLATE_FUNCTION
 inline void qt_memrotate90_template(const T *src, int srcWidth, int srcHeight, int srcStride,
                                     T *dest, int dstStride)
 {
-#if QT_ROTATION_ALGORITHM == QT_ROTATION_CACHEDREAD
-    qt_memrotate90_cachedRead<T>(src, srcWidth, srcHeight, srcStride, dest, dstStride);
-#elif QT_ROTATION_ALGORITHM == QT_ROTATION_CACHEDWRITE
-    qt_memrotate90_cachedWrite<T>(src, srcWidth, srcHeight, srcStride, dest, dstStride);
-#elif QT_ROTATION_ALGORITHM == QT_ROTATION_PACKING
-    qt_memrotate90_packing<T>(src, srcWidth, srcHeight, srcStride, dest, dstStride);
-#elif QT_ROTATION_ALGORITHM == QT_ROTATION_TILED
-    qt_memrotate90_tiled<T>(src, srcWidth, srcHeight, srcStride, dest, dstStride);
+#if Q_BYTE_ORDER == Q_LITTLE_ENDIAN
+    // packed algorithm assumes little endian and that sizeof(quint32)/sizeof(T) is an integer
+    if (sizeof(quint32) % sizeof(T) == 0) // constant for any given T
+        qt_memrotate90_tiled<T>(src, srcWidth, srcHeight, srcStride, dest, dstStride);
+    else
 #endif
+    qt_memrotate90_tiled_unpacked<T>(src, srcWidth, srcHeight, srcStride, dest, dstStride); // else-branch on little endian; the only statement left on other byte orders
+}
+
+template <>
+inline void qt_memrotate90_template<quint32>(const quint32 *src, int w, int h, int sstride, quint32 *dest, int dstride)
+{
+    // packed algorithm doesn't have any benefit for quint32: one pixel already fills a whole quint32 store
+    qt_memrotate90_tiled_unpacked(src, w, h, sstride, dest, dstride);
 }
 
 template <class T>
@@ -394,11 +244,11 @@ Q_STATIC_TEMPLATE_FUNCTION
 inline void qt_memrotate180_template(const T *src, int w, int h, int sstride, T *dest, int dstride)
 {
     const char *s = (const char*)(src) + (h - 1) * sstride;
-    for (int y = h - 1; y >= 0; --y) {
-        T *d = reinterpret_cast<T*>((char *)(dest) + (h - y - 1) * dstride);
+    for (int dy = 0; dy < h; ++dy) {
+        T *d = reinterpret_cast<T*>((char *)(dest) + dy * dstride);
         src = reinterpret_cast<const T*>(s);
-        for (int x = w - 1; x >= 0; --x) {
-            d[w - x - 1] = src[x];
+        for (int dx = 0; dx < w; ++dx) {
+            d[dx] = src[w - 1 - dx];
         }
         s -= sstride;
     }
@@ -409,32 +259,20 @@ Q_STATIC_TEMPLATE_FUNCTION
 inline void qt_memrotate270_template(const T *src, int srcWidth, int srcHeight, int srcStride,
                                      T *dest, int dstStride)
 {
-#if QT_ROTATION_ALGORITHM == QT_ROTATION_CACHEDREAD
-    qt_memrotate270_cachedRead<T>(src, srcWidth, srcHeight, srcStride, dest, dstStride);
-#elif QT_ROTATION_ALGORITHM == QT_ROTATION_CACHEDWRITE
-    qt_memrotate270_cachedWrite<T>(src, srcWidth, srcHeight, srcStride, dest, dstStride);
-#elif QT_ROTATION_ALGORITHM == QT_ROTATION_PACKING
-    qt_memrotate270_packing<T>(src, srcWidth, srcHeight, srcStride, dest, dstStride);
-#elif QT_ROTATION_ALGORITHM == QT_ROTATION_TILED
-    qt_memrotate270_tiled_unpacked<T>(src, srcWidth, srcHeight, srcStride, dest, dstStride);
+#if Q_BYTE_ORDER == Q_LITTLE_ENDIAN
+    // packed algorithm assumes little endian and that sizeof(quint32)/sizeof(T) is an integer
+    if (sizeof(quint32) % sizeof(T) == 0) // constant for any given T
+        qt_memrotate270_tiled<T>(src, srcWidth, srcHeight, srcStride, dest, dstStride);
+    else
 #endif
+    qt_memrotate270_tiled_unpacked<T>(src, srcWidth, srcHeight, srcStride, dest, dstStride); // else-branch on little endian; the only statement left on other byte orders
 }
 
 template <>
-inline void qt_memrotate90_template<quint24>(const quint24 *src, int srcWidth, int srcHeight,
-                                             int srcStride, quint24 *dest, int dstStride)
+inline void qt_memrotate270_template<quint32>(const quint32 *src, int w, int h, int sstride, quint32 *dest, int dstride)
 {
-#if QT_ROTATION_ALGORITHM == QT_ROTATION_CACHEDREAD
-    qt_memrotate90_cachedRead<quint24>(src, srcWidth, srcHeight, srcStride, dest, dstStride);
-#elif QT_ROTATION_ALGORITHM == QT_ROTATION_CACHEDWRITE
-    qt_memrotate90_cachedWrite<quint24>(src, srcWidth, srcHeight, srcStride, dest, dstStride);
-#elif QT_ROTATION_ALGORITHM == QT_ROTATION_PACKING
-    // packed algorithm not implemented
-    qt_memrotate90_cachedRead<quint24>(src, srcWidth, srcHeight, srcStride, dest, dstStride);
-#elif QT_ROTATION_ALGORITHM == QT_ROTATION_TILED
-    // packed algorithm not implemented
-    qt_memrotate90_tiled_unpacked<quint24>(src, srcWidth, srcHeight, srcStride, dest, dstStride);
-#endif
+    // packed algorithm doesn't have any benefit for quint32: with pack == sizeof(quint32) / sizeof(T) == 1 there is nothing to combine
+    qt_memrotate270_tiled_unpacked(src, w, h, sstride, dest, dstride);
 }
 
 #define QT_IMPL_MEMROTATE(type)                                     \
@@ -458,7 +296,7 @@ Q_GUI_EXPORT void qt_memrotate270(const type *src, int w, int h, int sstride, \
 Q_GUI_EXPORT void qt_memrotate90(const type *src, int w, int h, int sstride,  \
                                  type *dest, int dstride)           \
 {                                                                   \
-    qt_memrotate90_tiled_unpacked<type>(src, w, h, sstride, dest, dstride); \
+    qt_memrotate90_tiled_unpacked(src, w, h, sstride, dest, dstride); \
 }                                                                   \
 Q_GUI_EXPORT void qt_memrotate180(const type *src, int w, int h, int sstride, \
                                   type *dest, int dstride)          \
@@ -468,7 +306,7 @@ Q_GUI_EXPORT void qt_memrotate180(const type *src, int w, int h, int sstride, \
 Q_GUI_EXPORT void qt_memrotate270(const type *src, int w, int h, int sstride, \
                                   type *dest, int dstride)          \
 {                                                                   \
-    qt_memrotate270_tiled_unpacked<type>(src, w, h, sstride, dest, dstride); \
+    qt_memrotate270_tiled_unpacked(src, w, h, sstride, dest, dstride); \
 }
 
 
@@ -509,6 +347,21 @@ void qt_memrotate270_16(const uchar *srcPixels, int w, int h, int sbpl, uchar *d
     qt_memrotate270((const ushort *)srcPixels, w, h, sbpl, (ushort *)destPixels, dbpl);
 }
 
+void qt_memrotate90_24(const uchar *srcPixels, int w, int h, int sbpl, uchar *destPixels, int dbpl)
+{
+    qt_memrotate90((const quint24 *)srcPixels, w, h, sbpl, (quint24 *)destPixels, dbpl);
+}
+
+void qt_memrotate180_24(const uchar *srcPixels, int w, int h, int sbpl, uchar *destPixels, int dbpl)
+{
+    qt_memrotate180((const quint24 *)srcPixels, w, h, sbpl, (quint24 *)destPixels, dbpl);
+}
+
+void qt_memrotate270_24(const uchar *srcPixels, int w, int h, int sbpl, uchar *destPixels, int dbpl)
+{
+    qt_memrotate270((const quint24 *)srcPixels, w, h, sbpl, (quint24 *)destPixels, dbpl);
+}
+
 void qt_memrotate90_32(const uchar *srcPixels, int w, int h, int sbpl, uchar *destPixels, int dbpl)
 {
     qt_memrotate90((const uint *)srcPixels, w, h, sbpl, (uint *)destPixels, dbpl);
@@ -524,34 +377,16 @@ void qt_memrotate270_32(const uchar *srcPixels, int w, int h, int sbpl, uchar *d
     qt_memrotate270((const uint *)srcPixels, w, h, sbpl, (uint *)destPixels, dbpl);
 }
 
-MemRotateFunc qMemRotateFunctions[QImage::NImageFormats][3] =
+MemRotateFunc qMemRotateFunctions[QPixelLayout::BPPCount][3] =
 // 90, 180, 270
 {
-    { 0, 0, 0 },      // Format_Invalid,
-    { 0, 0, 0 },      // Format_Mono,
-    { 0, 0, 0 },      // Format_MonoLSB,
-    { 0, 0, 0 },      // Format_Indexed8,
-    { qt_memrotate90_32, qt_memrotate180_32, qt_memrotate270_32 },      // Format_RGB32,
-    { qt_memrotate90_32, qt_memrotate180_32, qt_memrotate270_32 },      // Format_ARGB32,
-    { qt_memrotate90_32, qt_memrotate180_32, qt_memrotate270_32 },      // Format_ARGB32_Premultiplied,
-    { qt_memrotate90_16, qt_memrotate180_16, qt_memrotate270_16 },      // Format_RGB16,
-    { 0, 0, 0 },      // Format_ARGB8565_Premultiplied,
-    { 0, 0, 0 },      // Format_RGB666,
-    { 0, 0, 0 },      // Format_ARGB6666_Premultiplied,
-    { 0, 0, 0 },      // Format_RGB555,
-    { 0, 0, 0 },      // Format_ARGB8555_Premultiplied,
-    { 0, 0, 0 },      // Format_RGB888,
-    { 0, 0, 0 },      // Format_RGB444,
-    { 0, 0, 0 },      // Format_ARGB4444_Premultiplied,
-    { qt_memrotate90_32, qt_memrotate180_32, qt_memrotate270_32 },      // Format_RGBX8888,
-    { qt_memrotate90_32, qt_memrotate180_32, qt_memrotate270_32 },      // Format_RGBA8888,
-    { qt_memrotate90_32, qt_memrotate180_32, qt_memrotate270_32 },      // Format_RGBA8888_Premultiplied,
-    { qt_memrotate90_32, qt_memrotate180_32, qt_memrotate270_32 },      // Format_BGB30,
-    { qt_memrotate90_32, qt_memrotate180_32, qt_memrotate270_32 },      // Format_A2BGR30_Premultiplied,
-    { qt_memrotate90_32, qt_memrotate180_32, qt_memrotate270_32 },      // Format_RGB30,
-    { qt_memrotate90_32, qt_memrotate180_32, qt_memrotate270_32 },      // Format_A2RGB30_Premultiplied,
-    { qt_memrotate90_8, qt_memrotate180_8, qt_memrotate270_8 },         // Format_Alpha8,
-    { qt_memrotate90_8, qt_memrotate180_8, qt_memrotate270_8 },         // Format_Grayscale8,
+    { 0, 0, 0 },      // BPPNone,
+    { 0, 0, 0 },      // BPP1MSB,
+    { 0, 0, 0 },      // BPP1LSB,
+    { qt_memrotate90_8, qt_memrotate180_8, qt_memrotate270_8 },         // BPP8,
+    { qt_memrotate90_16, qt_memrotate180_16, qt_memrotate270_16 },      // BPP16,
+    { qt_memrotate90_24, qt_memrotate180_24, qt_memrotate270_24 },      // BPP24
+    { qt_memrotate90_32, qt_memrotate180_32, qt_memrotate270_32 },      // BPP32
 };
 
 QT_END_NAMESPACE
diff --git a/src/gui/painting/qmemrotate_p.h b/src/gui/painting/qmemrotate_p.h
index 62613d301a..9bc3fd1010 100644
--- a/src/gui/painting/qmemrotate_p.h
+++ b/src/gui/painting/qmemrotate_p.h
@@ -56,19 +56,6 @@
 
 QT_BEGIN_NAMESPACE
 
-#define QT_ROTATION_CACHEDREAD 1
-#define QT_ROTATION_CACHEDWRITE 2
-#define QT_ROTATION_PACKING 3
-#define QT_ROTATION_TILED 4
-
-#ifndef QT_ROTATION_ALGORITHM
-#if Q_BYTE_ORDER == Q_LITTLE_ENDIAN
-#define QT_ROTATION_ALGORITHM QT_ROTATION_TILED
-#else
-#define QT_ROTATION_ALGORITHM QT_ROTATION_CACHEDREAD
-#endif
-#endif
-
 #define QT_DECL_MEMROTATE(type)                            \
     void Q_GUI_EXPORT qt_memrotate90(const type*, int, int, int, type*, int); \
     void Q_GUI_EXPORT qt_memrotate180(const type*, int, int, int, type*, int); \
diff --git a/src/gui/painting/qpaintengine_raster.cpp b/src/gui/painting/qpaintengine_raster.cpp
index 6d5eaf5aed..eb43453ddb 100644
--- a/src/gui/painting/qpaintengine_raster.cpp
+++ b/src/gui/painting/qpaintengine_raster.cpp
@@ -272,6 +272,35 @@ static void qt_debug_path(const QPainterPath &path)
 }
 #endif
 
+// QRect::normalized() will change the width/height of the rectangle due to
+// its inclusive-integer definition of left/right vs width. This is not
+// something we want to change in QRect as that would potentially introduce
+// regressions all over the place, so we implement a straightforward
+// normalized here. QRectF already does this, so QRectF::normalized() is ok to
+// use.
+static QRect qrect_normalized(const QRect &rect)
+{
+    int x, y, w, h;
+    if (Q_UNLIKELY(rect.width() < 0)) {
+        x = rect.x() + rect.width();
+        w = -rect.width();
+    } else {
+        x = rect.x();
+        w = rect.width();
+    }
+
+    if (Q_UNLIKELY(rect.height() < 0)) {
+        y = rect.y() + rect.height();
+        h = -rect.height();
+    } else {
+        y = rect.y();
+        h = rect.height();
+    }
+
+    return QRect(x, y, w, h);
+}
+
+
 QRasterPaintEnginePrivate::QRasterPaintEnginePrivate() :
     QPaintEngineExPrivate(),
     cachedLines(0)
@@ -1236,7 +1265,9 @@ void QRasterPaintEngine::clip(const QRect &rect, Qt::ClipOperation op)
 bool QRasterPaintEngine::setClipRectInDeviceCoords(const QRect &r, Qt::ClipOperation op)
 {
     Q_D(QRasterPaintEngine);
-    QRect clipRect = r & d->deviceRect;
+    // Normalize before using the & operator, which uses QRect::normalized()
+    // internally and would give us the wrong values.
+    QRect clipRect = qrect_normalized(r) & d->deviceRect;
     QRasterPaintEngineState *s = state();
 
     if (op == Qt::ReplaceClip || s->clip == 0) {
@@ -1471,7 +1502,7 @@ void QRasterPaintEngine::drawRects(const QRect *rects, int rectCount)
             int offset_x = int(s->matrix.dx());
             int offset_y = int(s->matrix.dy());
             while (r < lastRect) {
-                QRect rect = r->normalized();
+                QRect rect = qrect_normalized(*r);
                 QRect rr = rect.translated(offset_x, offset_y);
                 fillRect_normalized(rr, &s->brushData, d);
                 ++r;
@@ -2266,8 +2297,9 @@ void QRasterPaintEngine::drawImage(const QRectF &r, const QImage &img, const QRe
                 && d->rasterBuffer->compositionMode == QPainter::CompositionMode_Source)))
     {
         RotationType rotationType = qRotationType(s->matrix);
+        const QPixelLayout::BPP plBpp = qPixelLayouts[d->rasterBuffer->format].bpp;
 
-        if (rotationType != NoRotation && qMemRotateFunctions[d->rasterBuffer->format][rotationType] && img.rect().contains(sr.toAlignedRect())) {
+        if (rotationType != NoRotation && qMemRotateFunctions[plBpp][rotationType] && img.rect().contains(sr.toAlignedRect())) {
             QRectF transformedTargetRect = s->matrix.mapRect(r);
 
             if ((!(s->renderHints & QPainter::SmoothPixmapTransform) && !(s->renderHints & QPainter::Antialiasing))
@@ -2297,7 +2329,7 @@ void QRasterPaintEngine::drawImage(const QRectF &r, const QImage &img, const QRe
                 uint cw = clippedSourceRect.width();
                 uint ch = clippedSourceRect.height();
 
-                qMemRotateFunctions[d->rasterBuffer->format][rotationType](srcBase, cw, ch, sbpl, dstBase, dbpl);
+                qMemRotateFunctions[plBpp][rotationType](srcBase, cw, ch, sbpl, dstBase, dbpl);
 
                 return;
             }
@@ -2500,7 +2532,7 @@ void QRasterPaintEngine::drawTiledPixmap(const QRectF &r, const QPixmap &pixmap,
 
         QRectF rr = r;
         rr.translate(s->matrix.dx(), s->matrix.dy());
-        fillRect_normalized(rr.toRect().normalized(), &d->image_filler, d);
+        fillRect_normalized(rr.normalized().toRect(), &d->image_filler, d);
     }
 }
 
@@ -2523,7 +2555,7 @@ QRasterBuffer *QRasterPaintEngine::rasterBuffer()
 /*!
     \internal
 */
-void QRasterPaintEngine::alphaPenBlt(const void* src, int bpl, int depth, int rx,int ry,int w,int h)
+void QRasterPaintEngine::alphaPenBlt(const void* src, int bpl, int depth, int rx,int ry,int w,int h, bool useGammaCorrection)
 {
     Q_D(QRasterPaintEngine);
     QRasterPaintEngineState *s = state();
@@ -2578,18 +2610,18 @@ void QRasterPaintEngine::alphaPenBlt(const void* src, int bpl, int depth, int rx
             } else if (depth == 8) {
                 if (s->penData.alphamapBlit) {
                     s->penData.alphamapBlit(rb, rx, ry, s->penData.solid.color,
-                                            scanline, w, h, bpl, 0);
+                                            scanline, w, h, bpl, 0, useGammaCorrection);
                     return;
                 }
             } else if (depth == 32) {
                 // (A)RGB Alpha mask where the alpha component is not used.
                 if (s->penData.alphaRGBBlit) {
                     s->penData.alphaRGBBlit(rb, rx, ry, s->penData.solid.color,
-                                            (const uint *) scanline, w, h, bpl / 4, 0);
+                                            (const uint *) scanline, w, h, bpl / 4, 0, useGammaCorrection);
                     return;
                 }
             }
-        } else if (d->deviceDepth == 32 && ((depth == 8 && s->penData.alphamapBlit) || (depth == 32 && s->penData.alphaRGBBlit))) {
+        } else if ((depth == 8 && s->penData.alphamapBlit) || (depth == 32 && s->penData.alphaRGBBlit)) {
             // (A)RGB Alpha mask where the alpha component is not used.
             if (!clip) {
                 int nx = qMax(0, rx);
@@ -2614,10 +2646,10 @@ void QRasterPaintEngine::alphaPenBlt(const void* src, int bpl, int depth, int rx
             }
             if (depth == 8)
                 s->penData.alphamapBlit(rb, rx, ry, s->penData.solid.color,
-                                        scanline, w, h, bpl, clip);
+                                        scanline, w, h, bpl, clip, useGammaCorrection);
             else if (depth == 32)
                 s->penData.alphaRGBBlit(rb, rx, ry, s->penData.solid.color,
-                                        (const uint *) scanline, w, h, bpl / 4, clip);
+                                        (const uint *) scanline, w, h, bpl / 4, clip, useGammaCorrection);
             return;
         }
     }
@@ -2775,7 +2807,8 @@ bool QRasterPaintEngine::drawCachedGlyphs(int numGlyphs, const glyph_t *glyphs,
             alphaPenBlt(alphaMap->constBits(), alphaMap->bytesPerLine(), alphaMap->depth(),
                         qFloor(positions[i].x) + offset.x(),
                         qRound(positions[i].y) + offset.y(),
-                        alphaMap->width(), alphaMap->height());
+                        alphaMap->width(), alphaMap->height(),
+                        fontEngine->expectsGammaCorrectedBlending());
 
             fontEngine->unlockAlphaMapForGlyph();
         }
@@ -2836,7 +2869,7 @@ bool QRasterPaintEngine::drawCachedGlyphs(int numGlyphs, const glyph_t *glyphs,
                 drawImage(QPoint(x, y), QImage(glyphBits, c.w, c.h, bpl, image.format()));
                 s->matrix = originalTransform;
             } else {
-                alphaPenBlt(glyphBits, bpl, depth, x, y, c.w, c.h);
+                alphaPenBlt(glyphBits, bpl, depth, x, y, c.w, c.h, fontEngine->expectsGammaCorrectedBlending());
             }
         }
     }
@@ -2880,7 +2913,7 @@ bool QRasterPaintEnginePrivate::isUnclipped(const QRect &rect,
     const QRasterPaintEngineState *s = q->state();
     const QClipData *cl = clip();
     if (!cl) {
-        QRect r = rect.normalized();
+        QRect r = qrect_normalized(rect);
         // inline contains() for performance (we know the rects are normalized)
         const QRect &r1 = deviceRect;
         return (r.left() >= r1.left() && r.right() <= r1.right()
@@ -2895,7 +2928,7 @@ bool QRasterPaintEnginePrivate::isUnclipped(const QRect &rect,
     if (s->flags.antialiased)
         ++penWidth;
 
-    QRect r = rect.normalized();
+    QRect r = qrect_normalized(rect);
     if (penWidth > 0) {
         r.setX(r.x() - penWidth);
         r.setY(r.y() - penWidth);
@@ -4439,9 +4472,9 @@ void QSpanData::setup(const QBrush &brush, int alpha, QPainter::CompositionMode
             gradient.alphaColor = !brush.isOpaque() || alpha != 256;
 
             auto cacheInfo = qt_gradient_cache()->getBuffer(*g, alpha);
-            cachedGradient = cacheInfo;
             gradient.colorTable32 = cacheInfo->buffer32;
             gradient.colorTable64 = cacheInfo->buffer64;
+            cachedGradient = std::move(cacheInfo);
 
             gradient.spread = g->spread();
 
@@ -4461,9 +4494,9 @@ void QSpanData::setup(const QBrush &brush, int alpha, QPainter::CompositionMode
             gradient.alphaColor = !brush.isOpaque() || alpha != 256;
 
             auto cacheInfo = qt_gradient_cache()->getBuffer(*g, alpha);
-            cachedGradient = cacheInfo;
             gradient.colorTable32 = cacheInfo->buffer32;
             gradient.colorTable64 = cacheInfo->buffer64;
+            cachedGradient = std::move(cacheInfo);
 
             gradient.spread = g->spread();
 
@@ -4487,9 +4520,9 @@ void QSpanData::setup(const QBrush &brush, int alpha, QPainter::CompositionMode
             gradient.alphaColor = !brush.isOpaque() || alpha != 256;
 
             auto cacheInfo = qt_gradient_cache()->getBuffer(*g, alpha);
-            cachedGradient = cacheInfo;
             gradient.colorTable32 = cacheInfo->buffer32;
             gradient.colorTable64 = cacheInfo->buffer64;
+            cachedGradient = std::move(cacheInfo);
 
             gradient.spread = QGradient::RepeatSpread;
 
diff --git a/src/gui/painting/qpaintengine_raster_p.h b/src/gui/painting/qpaintengine_raster_p.h
index 59213220a6..d0b82b3a93 100644
--- a/src/gui/painting/qpaintengine_raster_p.h
+++ b/src/gui/painting/qpaintengine_raster_p.h
@@ -225,7 +225,7 @@ public:
 #endif
 
     QRasterBuffer *rasterBuffer();
-    void alphaPenBlt(const void* src, int bpl, int depth, int rx,int ry,int w,int h);
+    void alphaPenBlt(const void* src, int bpl, int depth, int rx,int ry,int w,int h, bool useGammaCorrection);
 
     Type type() const Q_DECL_OVERRIDE { return Raster; }
 
diff --git a/src/gui/painting/qpainter.h b/src/gui/painting/qpainter.h
index 46817b9c73..64d15d5296 100644
--- a/src/gui/painting/qpainter.h
+++ b/src/gui/painting/qpainter.h
@@ -83,7 +83,6 @@ class Q_GUI_EXPORT QPainter
 {
     Q_DECLARE_PRIVATE(QPainter)
     Q_GADGET
-    Q_FLAGS(RenderHint RenderHints)
 
 public:
     enum RenderHint {
@@ -94,8 +93,10 @@ public:
         NonCosmeticDefaultPen = 0x10,
         Qt4CompatiblePainting = 0x20
     };
+    Q_FLAG(RenderHint)
 
     Q_DECLARE_FLAGS(RenderHints, RenderHint)
+    Q_FLAG(RenderHints)
 
     class PixmapFragment {
     public:
diff --git a/src/gui/painting/qpdf.cpp b/src/gui/painting/qpdf.cpp
index 84e18a64dd..7b8bae1642 100644
--- a/src/gui/painting/qpdf.cpp
+++ b/src/gui/painting/qpdf.cpp
@@ -1504,16 +1504,25 @@ void QPdfEnginePrivate::writeInfo()
     printString(creator);
     xprintf("\n/Producer ");
     printString(QString::fromLatin1("Qt " QT_VERSION_STR));
-    QDateTime now = QDateTime::currentDateTimeUtc();
+    QDateTime now = QDateTime::currentDateTime();
     QTime t = now.time();
     QDate d = now.date();
-    xprintf("\n/CreationDate (D:%d%02d%02d%02d%02d%02d)\n",
+    xprintf("\n/CreationDate (D:%d%02d%02d%02d%02d%02d",
             d.year(),
             d.month(),
             d.day(),
             t.hour(),
             t.minute(),
             t.second());
+    int offset = now.offsetFromUtc();
+    int hours  = (offset / 60) / 60;
+    int mins   = (offset / 60) % 60;
+    if (offset < 0)
+        xprintf("-%02d'%02d')\n", -hours, -mins);
+    else if (offset > 0)
+        xprintf("+%02d'%02d')\n", hours , mins);
+    else
+        xprintf("Z)\n");
     xprintf(">>\n"
             "endobj\n");
 }
diff --git a/src/gui/painting/qregion.cpp b/src/gui/painting/qregion.cpp
index 0571e1a328..3fb6f925b3 100644
--- a/src/gui/painting/qregion.cpp
+++ b/src/gui/painting/qregion.cpp
@@ -739,7 +739,7 @@ bool QRegion::intersects(const QRegion &region) const
 */
 
 
-#if !defined (Q_OS_UNIX) && !defined (Q_OS_WIN)
+#if !defined (Q_OS_UNIX) && !defined (Q_OS_WIN) || defined(Q_CLANG_QDOC)
 /*!
     \overload
     \since 4.4
diff --git a/src/gui/painting/qrgba64_p.h b/src/gui/painting/qrgba64_p.h
index 0dadc038fa..7776a5b08a 100644
--- a/src/gui/painting/qrgba64_p.h
+++ b/src/gui/painting/qrgba64_p.h
@@ -185,6 +185,60 @@ inline QRgba64 addWithSaturation(QRgba64 a, QRgba64 b)
                                qMin(a.alpha() + b.alpha(), 65535));
 }
 
+#if defined __SSE2__
+Q_ALWAYS_INLINE uint toArgb32(__m128i v)
+{
+    v = _mm_unpacklo_epi16(v, _mm_setzero_si128());
+    v = _mm_add_epi32(v, _mm_set1_epi32(128));
+    v = _mm_sub_epi32(v, _mm_srli_epi32(v, 8));
+    v = _mm_srli_epi32(v, 8);
+    v = _mm_packs_epi32(v, v);
+    v = _mm_packus_epi16(v, v);
+    return _mm_cvtsi128_si32(v);
+}
+#elif defined __ARM_NEON__
+Q_ALWAYS_INLINE uint toArgb32(uint16x4_t v)
+{
+    v = vsub_u16(v, vrshr_n_u16(v, 8));
+    v = vrshr_n_u16(v, 8);
+    uint8x8_t v8 = vmovn_u16(vcombine_u16(v, v));
+    return vget_lane_u32(vreinterpret_u32_u8(v8), 0);
+}
+#endif
+
+inline uint toArgb32(QRgba64 rgba64)
+{
+#if defined __SSE2__
+    __m128i v = _mm_loadl_epi64((const __m128i *)&rgba64);
+    v = _mm_shufflelo_epi16(v, _MM_SHUFFLE(3, 0, 1, 2));
+    return toArgb32(v);
+#elif defined __ARM_NEON__
+    uint16x4_t v = vreinterpret_u16_u64(vld1_u64(reinterpret_cast<const uint64_t *>(&rgba64)));
+#if Q_BYTE_ORDER == Q_LITTLE_ENDIAN
+    const uint8x8_t shuffleMask = { 4, 5, 2, 3, 0, 1, 6, 7 };
+    v = vreinterpret_u16_u8(vtbl1_u8(vreinterpret_u8_u16(v), shuffleMask));
+#else
+    v = vext_u16(v, v, 3);
+#endif
+    return toArgb32(v);
+#else
+    return rgba64.toArgb32();
+#endif
+}
+
+inline uint toRgba8888(QRgba64 rgba64)
+{
+#if defined __SSE2__
+    __m128i v = _mm_loadl_epi64((const __m128i *)&rgba64);
+    return toArgb32(v);
+#elif defined __ARM_NEON__
+    uint16x4_t v = vreinterpret_u16_u64(vld1_u64(reinterpret_cast<const uint64_t *>(&rgba64)));
+    return toArgb32(v);
+#else
+    return ARGB2RGBA(toArgb32(rgba64));
+#endif
+}
+
 #if defined(__SSE2__)
 Q_ALWAYS_INLINE __m128i addWithSaturation(__m128i a, __m128i b)
 {
@@ -199,6 +253,53 @@ Q_ALWAYS_INLINE uint16x4_t addWithSaturation(uint16x4_t a, uint16x4_t b)
 }
 #endif
 
+inline QRgba64 rgbBlend(QRgba64 d, QRgba64 s, uint rgbAlpha)
+{
+    QRgba64 blend;
+#if defined(__SSE2__)
+    __m128i vd = _mm_loadl_epi64((const __m128i *)&d);
+    __m128i vs = _mm_loadl_epi64((const __m128i *)&s);
+    __m128i va =  _mm_cvtsi32_si128(rgbAlpha);
+    va = _mm_unpacklo_epi8(va, va);
+    va = _mm_shufflelo_epi16(va, _MM_SHUFFLE(3, 0, 1, 2));
+    __m128i vb = _mm_xor_si128(_mm_set1_epi16(-1), va);
+
+    vs = _mm_unpacklo_epi16(_mm_mullo_epi16(vs, va), _mm_mulhi_epu16(vs, va));
+    vd = _mm_unpacklo_epi16(_mm_mullo_epi16(vd, vb), _mm_mulhi_epu16(vd, vb));
+    vd = _mm_add_epi32(vd, vs);
+    vd = _mm_add_epi32(vd, _mm_srli_epi32(vd, 16));
+    vd = _mm_add_epi32(vd, _mm_set1_epi32(0x8000));
+    vd = _mm_srai_epi32(vd, 16);
+    vd = _mm_packs_epi32(vd, _mm_setzero_si128());
+
+    _mm_storel_epi64((__m128i *)&blend, vd);
+#elif defined(__ARM_NEON__)
+    uint16x4_t vd = vreinterpret_u16_u64(vmov_n_u64(d));
+    uint16x4_t vs = vreinterpret_u16_u64(vmov_n_u64(s));
+    uint8x8_t va8 = vreinterpret_u8_u32(vmov_n_u32(ARGB2RGBA(rgbAlpha)));
+    uint16x4_t va = vreinterpret_u16_u8(vzip_u8(va8, va8).val[0]);
+    uint16x4_t vb = vdup_n_u16(0xffff);
+    vb = vsub_u16(vb, va);
+
+    uint32x4_t vs32 = vmull_u16(vs, va);
+    uint32x4_t vd32 = vmull_u16(vd, vb);
+    vd32 = vaddq_u32(vd32, vs32);
+    vd32 = vsraq_n_u32(vd32, vd32, 16);
+    vd = vrshrn_n_u32(vd32, 16);
+    vst1_u64(reinterpret_cast<uint64_t *>(&blend), vreinterpret_u64_u16(vd));
+#else
+    const int mr = qRed(rgbAlpha);
+    const int mg = qGreen(rgbAlpha);
+    const int mb = qBlue(rgbAlpha);
+    blend.setRed  (qt_div_255(s.red()   * mr + d.red()   * (255 - mr)));
+    blend.setGreen(qt_div_255(s.green() * mg + d.green() * (255 - mg)));
+    blend.setBlue (qt_div_255(s.blue()  * mb + d.blue()  * (255 - mb)));
+    blend.setAlpha(s.alpha());
+#endif
+    return blend;
+}
+
+
 QT_END_NAMESPACE
 
 #endif // QRGBA64_P_H
diff --git a/src/gui/painting/qtransform.cpp b/src/gui/painting/qtransform.cpp
index 2d841b2953..673f64fbca 100644
--- a/src/gui/painting/qtransform.cpp
+++ b/src/gui/painting/qtransform.cpp
@@ -1118,16 +1118,16 @@ QDataStream & operator>>(QDataStream &s, QTransform &t)
 #ifndef QT_NO_DEBUG_STREAM
 QDebug operator<<(QDebug dbg, const QTransform &m)
 {
-    static const char *const typeStr[] =
+    static const char typeStr[][12] =
     {
         "TxNone",
         "TxTranslate",
         "TxScale",
-        0,
+        "",
         "TxRotate",
-        0, 0, 0,
+        "", "", "",
         "TxShear",
-        0, 0, 0, 0, 0, 0, 0,
+        "", "", "", "", "", "", "",
         "TxProject"
     };
 
diff --git a/src/gui/painting/qtriangulator.cpp b/src/gui/painting/qtriangulator.cpp
index 6604d407f0..6d57eba123 100644
--- a/src/gui/painting/qtriangulator.cpp
+++ b/src/gui/painting/qtriangulator.cpp
@@ -50,10 +50,6 @@
 #include <QtCore/qglobal.h>
 #include <QtCore/qpoint.h>
 #include <QtCore/qalgorithms.h>
-#ifndef QT_NO_OPENGL
-# include <private/qopenglcontext_p.h>
-# include <private/qopenglextensions_p.h>
-#endif
 #include <private/qrbtree_p.h>
 
 QT_BEGIN_NAMESPACE
@@ -2266,23 +2262,12 @@ void QTriangulator<T>::MonotoneToTriangles::decompose()
 //                                qTriangulate                                //
 //============================================================================//
 
-static bool hasElementIndexUint()
-{
-#ifndef QT_NO_OPENGL
-    QOpenGLContext *context = QOpenGLContext::currentContext();
-    if (!context)
-        return false;
-    return static_cast<QOpenGLExtensions *>(context->functions())->hasOpenGLExtension(QOpenGLExtensions::ElementIndexUint);
-#else
-    return false;
-#endif
-}
-
 Q_GUI_EXPORT QTriangleSet qTriangulate(const qreal *polygon,
-                          int count, uint hint, const QTransform &matrix)
+                                       int count, uint hint, const QTransform &matrix,
+                                       bool allowUintIndices)
 {
     QTriangleSet triangleSet;
-    if (hasElementIndexUint()) {
+    if (allowUintIndices) {
         QTriangulator<quint32> triangulator;
         triangulator.initialize(polygon, count, hint, matrix);
         QVertexSet<quint32> vertexSet = triangulator.triangulate();
@@ -2300,10 +2285,13 @@ Q_GUI_EXPORT QTriangleSet qTriangulate(const qreal *polygon,
 }
 
 Q_GUI_EXPORT QTriangleSet qTriangulate(const QVectorPath &path,
-                          const QTransform &matrix, qreal lod)
+                                       const QTransform &matrix, qreal lod, bool allowUintIndices)
 {
     QTriangleSet triangleSet;
-    if (hasElementIndexUint()) {
+    // For now systems that support 32-bit index values will always get 32-bit
+    // index values. This is not necessarily ideal since 16-bit would be enough in
+    // many cases. TODO revisit this at a later point.
+    if (allowUintIndices) {
         QTriangulator<quint32> triangulator;
         triangulator.initialize(path, matrix, lod);
         QVertexSet<quint32> vertexSet = triangulator.triangulate();
@@ -2320,10 +2308,10 @@ Q_GUI_EXPORT QTriangleSet qTriangulate(const QVectorPath &path,
 }
 
 QTriangleSet qTriangulate(const QPainterPath &path,
-                          const QTransform &matrix, qreal lod)
+                          const QTransform &matrix, qreal lod, bool allowUintIndices)
 {
     QTriangleSet triangleSet;
-    if (hasElementIndexUint()) {
+    if (allowUintIndices) {
         QTriangulator<quint32> triangulator;
         triangulator.initialize(path, matrix, lod);
         QVertexSet<quint32> vertexSet = triangulator.triangulate();
@@ -2340,10 +2328,10 @@ QTriangleSet qTriangulate(const QPainterPath &path,
 }
 
 QPolylineSet qPolyline(const QVectorPath &path,
-                       const QTransform &matrix, qreal lod)
+                       const QTransform &matrix, qreal lod, bool allowUintIndices)
 {
     QPolylineSet polyLineSet;
-    if (hasElementIndexUint()) {
+    if (allowUintIndices) {
         QTriangulator<quint32> triangulator;
         triangulator.initialize(path, matrix, lod);
         QVertexSet<quint32> vertexSet = triangulator.polyline();
@@ -2360,10 +2348,10 @@ QPolylineSet qPolyline(const QVectorPath &path,
 }
 
 QPolylineSet qPolyline(const QPainterPath &path,
-                       const QTransform &matrix, qreal lod)
+                       const QTransform &matrix, qreal lod, bool allowUintIndices)
 {
     QPolylineSet polyLineSet;
-    if (hasElementIndexUint()) {
+    if (allowUintIndices) {
         QTriangulator<quint32> triangulator;
         triangulator.initialize(path, matrix, lod);
         QVertexSet<quint32> vertexSet = triangulator.polyline();
diff --git a/src/gui/painting/qtriangulator_p.h b/src/gui/painting/qtriangulator_p.h
index 4d1aba099c..8f043fc925 100644
--- a/src/gui/painting/qtriangulator_p.h
+++ b/src/gui/painting/qtriangulator_p.h
@@ -137,11 +137,18 @@ struct Q_GUI_EXPORT QPolylineSet
 // integers, the polygon is triangulated, and then scaled back by 1/32.
 // 'hint' should be a combination of QVectorPath::Hints.
 // 'lod' is the level of detail. Default is 1. Curves are split into more lines when 'lod' is higher.
-QTriangleSet Q_GUI_EXPORT qTriangulate(const qreal *polygon, int count, uint hint = QVectorPath::PolygonHint | QVectorPath::OddEvenFill, const QTransform &matrix = QTransform());
-QTriangleSet Q_GUI_EXPORT qTriangulate(const QVectorPath &path, const QTransform &matrix = QTransform(), qreal lod = 1);
-QTriangleSet Q_GUI_EXPORT qTriangulate(const QPainterPath &path, const QTransform &matrix = QTransform(), qreal lod = 1);
-QPolylineSet qPolyline(const QVectorPath &path, const QTransform &matrix = QTransform(), qreal lod = 1);
-QPolylineSet Q_GUI_EXPORT qPolyline(const QPainterPath &path, const QTransform &matrix = QTransform(), qreal lod = 1);
+QTriangleSet Q_GUI_EXPORT qTriangulate(const qreal *polygon, int count,
+                                       uint hint = QVectorPath::PolygonHint | QVectorPath::OddEvenFill,
+                                       const QTransform &matrix = QTransform(),
+                                       bool allowUintIndices = true);
+QTriangleSet Q_GUI_EXPORT qTriangulate(const QVectorPath &path, const QTransform &matrix = QTransform(),
+                                       qreal lod = 1, bool allowUintIndices = true);
+QTriangleSet Q_GUI_EXPORT qTriangulate(const QPainterPath &path, const QTransform &matrix = QTransform(),
+                                       qreal lod = 1, bool allowUintIndices = true);
+QPolylineSet qPolyline(const QVectorPath &path, const QTransform &matrix = QTransform(),
+                       qreal lod = 1, bool allowUintIndices = true);
+QPolylineSet Q_GUI_EXPORT qPolyline(const QPainterPath &path, const QTransform &matrix = QTransform(),
+                                    qreal lod = 1, bool allowUintIndices = true);
 
 QT_END_NAMESPACE