19 files changed, 1173 insertions, 964 deletions
diff --git a/src/gui/painting/painting.pri b/src/gui/painting/painting.pri
index 86e35c39f8..63e345545c 100644
--- a/src/gui/painting/painting.pri
+++ b/src/gui/painting/painting.pri
@@ -8,6 +8,7 @@ HEADERS += \
         painting/qbrush.h \
         painting/qcolor.h \
         painting/qcolor_p.h \
+        painting/qcolorprofile_p.h \
         painting/qcosmeticstroker_p.h \
         painting/qdatabuffer_p.h \
         painting/qdrawhelper_p.h \
@@ -63,11 +64,11 @@ SOURCES += \
         painting/qblittable.cpp \
         painting/qbrush.cpp \
         painting/qcolor.cpp \
+        painting/qcolorprofile.cpp \
         painting/qcompositionfunctions.cpp \
         painting/qcosmeticstroker.cpp \
         painting/qdrawhelper.cpp \
         painting/qemulationpaintengine.cpp \
-        painting/qgammatables.cpp \
         painting/qgrayraster.c \
         painting/qimagescale.cpp \
         painting/qmatrix.cpp \
diff --git a/src/gui/painting/qbrush.cpp b/src/gui/painting/qbrush.cpp
index ebb035a2c1..06a820a859 100644
--- a/src/gui/painting/qbrush.cpp
+++ b/src/gui/painting/qbrush.cpp
@@ -1419,6 +1419,25 @@ void QGradient::setColorAt(qreal pos, const QColor &color)
         m_stops.insert(index, QGradientStop(pos, color));
 }
 
+static inline bool ok(QGradientStop stop)
+{
+    return stop.first >= 0 && stop.first <= 1; // rejects NaNs
+}
+
+static inline bool ok(const QGradientStops &stops)
+{
+    qreal lastPos = -1;
+    for (const QGradientStop &stop : stops) {
+        if (Q_UNLIKELY(!ok(stop)))
+            return false;
+        const bool sorted = stop.first > lastPos; // rejects duplicates
+        if (Q_UNLIKELY(!sorted))
+            return false;
+        lastPos = stop.first;
+    }
+    return true;
+}
+
 /*!
     \fn void QGradient::setStops(const QGradientStops &stopPoints)
 
@@ -1430,6 +1449,14 @@ void QGradient::setColorAt(qreal pos, const QColor &color)
 */
 void QGradient::setStops(const QGradientStops &stops)
 {
+    // ## Qt 6: consider taking \a stops by value, so we can move into m_stops
+    if (Q_LIKELY(ok(stops))) {
+        // fast path for the common case: if everything is ok with the stops, just copy them
+        m_stops = stops;
+        return;
+    }
+    // Otherwise, to keep the pre-5.9 behavior, add them one after another, so that
+    // each stop is checked, invalid ones are skipped, and they are added in order (which may be O(N^2)).
     m_stops.clear();
     for (int i=0; i<stops.size(); ++i)
         setColorAt(stops.at(i).first, stops.at(i).second);
diff --git a/src/gui/painting/qgammatables.cpp b/src/gui/painting/qcolorprofile.cpp
index 1d76f7ee3c..3b7b0a248b 100644
--- a/src/gui/painting/qgammatables.cpp
+++ b/src/gui/painting/qcolorprofile.cpp
@@ -37,28 +37,51 @@
 **
 ****************************************************************************/
 
-#include <private/qdrawhelper_p.h>
+#include "qcolorprofile_p.h"
+#include <qmath.h>
 
 QT_BEGIN_NAMESPACE
 
+QColorProfile *QColorProfile::fromGamma(qreal gamma)
+{
+    QColorProfile *cp = new QColorProfile;
+
+    for (int i = 0; i <= (255 * 16); ++i) {
+        cp->m_toLinear[i] = ushort(qRound(qPow(i / qreal(255 * 16), gamma) * (255 * 256)));
+        cp->m_fromLinear[i] = ushort(qRound(qPow(i / qreal(255 * 16), qreal(1) / gamma) * (255 * 256)));
+    }
+
+    return cp;
+}
 
-QDrawHelperGammaTables::QDrawHelperGammaTables(qreal smoothing)
+static qreal srgbToLinear(qreal v)
 {
-    const qreal gray_gamma = 2.31;
-    for (int i=0; i<256; ++i)
-        qt_pow_gamma[i] = uint(qRound(qPow(i / qreal(255.), gray_gamma) * 2047));
-    for (int i=0; i<2048; ++i)
-        qt_pow_invgamma[i] = uchar(qRound(qPow(i / qreal(2047.0), 1 / gray_gamma) * 255));
+    const qreal a = 0.055;
+    if (v <= qreal(0.04045))
+        return v / qreal(12.92);
+    else
+        return qPow((v + a) / (qreal(1) + a), qreal(2.4));
+}
 
-    refresh(smoothing);
+static qreal linearToSrgb(qreal v)
+{
+    const qreal a = 0.055;
+    if (v <= qreal(0.0031308))
+        return v * qreal(12.92);
+    else
+        return (qreal(1) + a) * qPow(v, qreal(1.0 / 2.4)) - a;
 }
 
-void QDrawHelperGammaTables::refresh(qreal smoothing)
+QColorProfile *QColorProfile::fromSRgb()
 {
-    for (int i=0; i<256; ++i) {
-        qt_pow_rgb_gamma[i] = uchar(qRound(qPow(i / qreal(255.0), smoothing) * 255));
-        qt_pow_rgb_invgamma[i] = uchar(qRound(qPow(i / qreal(255.), 1 / smoothing) * 255));
+    QColorProfile *cp = new QColorProfile;
+
+    for (int i = 0; i <= (255 * 16); ++i) {
+        cp->m_toLinear[i] = ushort(qRound(srgbToLinear(i / qreal(255 * 16)) * (255 * 256)));
+        cp->m_fromLinear[i] = ushort(qRound(linearToSrgb(i / qreal(255 * 16)) * (255 * 256)));
     }
+
+    return cp;
 }
 
 QT_END_NAMESPACE
diff --git a/src/gui/painting/qcolorprofile_p.h b/src/gui/painting/qcolorprofile_p.h
new file mode 100644
index 0000000000..ca1786ee6d
--- /dev/null
+++ b/src/gui/painting/qcolorprofile_p.h
@@ -0,0 +1,157 @@
+/****************************************************************************
+**
+** Copyright (C) 2016 The Qt Company Ltd.
+** Contact: https://www.qt.io/licensing/
+**
+** This file is part of the QtGui module of the Qt Toolkit.
+**
+** $QT_BEGIN_LICENSE:LGPL$
+** Commercial License Usage
+** Licensees holding valid commercial Qt licenses may use this file in
+** accordance with the commercial license agreement provided with the
+** Software or, alternatively, in accordance with the terms contained in
+** a written agreement between you and The Qt Company. For licensing terms
+** and conditions see https://www.qt.io/terms-conditions. For further
+** information use the contact form at https://www.qt.io/contact-us.
+**
+** GNU Lesser General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU Lesser
+** General Public License version 3 as published by the Free Software
+** Foundation and appearing in the file LICENSE.LGPL3 included in the
+** packaging of this file. Please review the following information to
+** ensure the GNU Lesser General Public License version 3 requirements
+** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
+**
+** GNU General Public License Usage
+** Alternatively, this file may be used under the terms of the GNU
+** General Public License version 2.0 or (at your option) the GNU General
+** Public license version 3 or any later version approved by the KDE Free
+** Qt Foundation. The licenses are as published by the Free Software
+** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
+** included in the packaging of this file. Please review the following
+** information to ensure the GNU General Public License requirements will
+** be met: https://www.gnu.org/licenses/gpl-2.0.html and
+** https://www.gnu.org/licenses/gpl-3.0.html.
+**
+** $QT_END_LICENSE$
+**
+****************************************************************************/
+
+#ifndef QCOLORPROFILE_P_H
+#define QCOLORPROFILE_P_H
+
+//
+//  W A R N I N G
+//  -------------
+//
+// This file is not part of the Qt API.  It exists purely as an
+// implementation detail.  This header file may change from version to
+// version without notice, or even be removed.
+//
+// We mean it.
+//
+
+#include <QtGui/private/qtguiglobal_p.h>
+#include <QtGui/qrgb.h>
+#include <QtGui/qrgba64.h>
+
+QT_BEGIN_NAMESPACE
+
+class Q_GUI_EXPORT QColorProfile
+{
+public:
+    static QColorProfile *fromGamma(qreal gamma);
+    static QColorProfile *fromSRgb();
+
+    // The following methods all convert opaque or unpremultiplied colors:
+
+    QRgba64 toLinear64(QRgb rgb32) const
+    {
+        ushort r = m_toLinear[qRed(rgb32) << 4];
+        ushort g = m_toLinear[qGreen(rgb32) << 4];
+        ushort b = m_toLinear[qBlue(rgb32) << 4];
+        r = r + (r >> 8);
+        g = g + (g >> 8);
+        b = b + (b >> 8);
+        return QRgba64::fromRgba64(r, g, b, qAlpha(rgb32) * 257);
+    }
+
+    QRgb toLinear(QRgb rgb32) const
+    {
+        uchar r = (m_toLinear[qRed(rgb32) << 4] + 0x80) >> 8;
+        uchar g = (m_toLinear[qGreen(rgb32) << 4] + 0x80) >> 8;
+        uchar b = (m_toLinear[qBlue(rgb32) << 4] + 0x80) >> 8;
+        return qRgba(r, g, b, qAlpha(rgb32));
+    }
+
+    QRgba64 toLinear(QRgba64 rgb64) const
+    {
+        ushort r = rgb64.red();
+        ushort g = rgb64.green();
+        ushort b = rgb64.blue();
+        r = r - (r >> 8);
+        g = g - (g >> 8);
+        b = b - (b >> 8);
+        r = m_toLinear[r >> 4];
+        g = m_toLinear[g >> 4];
+        b = m_toLinear[b >> 4];
+        r = r + (r >> 8);
+        g = g + (g >> 8);
+        b = b + (b >> 8);
+        return QRgba64::fromRgba64(r, g, b, rgb64.alpha());
+    }
+
+    QRgb fromLinear64(QRgba64 rgb64) const
+    {
+        ushort r = rgb64.red();
+        ushort g = rgb64.green();
+        ushort b = rgb64.blue();
+        r = r - (r >> 8);
+        g = g - (g >> 8);
+        b = b - (b >> 8);
+        r = (m_fromLinear[r >> 4] + 0x80) >> 8;
+        g = (m_fromLinear[g >> 4] + 0x80) >> 8;
+        b = (m_fromLinear[b >> 4] + 0x80) >> 8;
+        return qRgba(r, g, b, rgb64.alpha8());
+    }
+
+    QRgb fromLinear(QRgb rgb32) const
+    {
+        uchar r = (m_fromLinear[qRed(rgb32) << 4] + 0x80) >> 8;
+        uchar g = (m_fromLinear[qGreen(rgb32) << 4] + 0x80) >> 8;
+        uchar b = (m_fromLinear[qBlue(rgb32) << 4] + 0x80) >> 8;
+        return qRgba(r, g, b, qAlpha(rgb32));
+    }
+
+    QRgba64 fromLinear(QRgba64 rgb64) const
+    {
+        ushort r = rgb64.red();
+        ushort g = rgb64.green();
+        ushort b = rgb64.blue();
+        r = r - (r >> 8);
+        g = g - (g >> 8);
+        b = b - (b >> 8);
+        r = m_fromLinear[r >> 4];
+        g = m_fromLinear[g >> 4];
+        b = m_fromLinear[b >> 4];
+        r = r + (r >> 8);
+        g = g + (g >> 8);
+        b = b + (b >> 8);
+        return QRgba64::fromRgba64(r, g, b, rgb64.alpha());
+    }
+
+private:
+    QColorProfile() { }
+
+    // We translate to 0-65280 (255*256) instead of 0-65535 to make simple
+    // shifting an accurate conversion.
+    // We translate from 0-4080 (255*16) for the same speed-up, and to keep
+    // the tables small enough to fit in most inner caches.
+    ushort m_toLinear[(255 * 16) + 1]; // [0-4080] -> [0-65280]
+    ushort m_fromLinear[(255 * 16) + 1]; // [0-4080] -> [0-65280]
+
+};
+
+QT_END_NAMESPACE
+
+#endif // QCOLORPROFILE_P_H
diff --git a/src/gui/painting/qcoregraphics.mm b/src/gui/painting/qcoregraphics.mm
index 3753fa4e88..a64a184e25 100644
--- a/src/gui/painting/qcoregraphics.mm
+++ b/src/gui/painting/qcoregraphics.mm
@@ -39,6 +39,7 @@
 #include <QtGui/private/qpaintengine_p.h>
 #include <QtCore/qdebug.h>
 #include <QtCore/qcoreapplication.h>
+#include <QtCore/qoperatingsystemversion.h>
 
 QT_BEGIN_NAMESPACE
 
@@ -106,29 +107,6 @@ QImage qt_mac_toQImage(CGImageRef image)
 
 #ifdef Q_OS_MACOS
 
-QT_END_NAMESPACE
-
-@interface NSGraphicsContext (QtAdditions)
-
-+ (NSGraphicsContext *)qt_graphicsContextWithCGContext:(CGContextRef)graphicsPort flipped:(BOOL)initialFlippedState;
-
-@end
-
-@implementation NSGraphicsContext (QtAdditions)
-
-+ (NSGraphicsContext *)qt_graphicsContextWithCGContext:(CGContextRef)graphicsPort flipped:(BOOL)initialFlippedState
-{
-#if QT_MAC_PLATFORM_SDK_EQUAL_OR_ABOVE(__MAC_10_10, __IPHONE_NA)
-    if (QT_PREPEND_NAMESPACE(QSysInfo::MacintoshVersion) >= QT_PREPEND_NAMESPACE(QSysInfo::MV_10_10))
-        return [self graphicsContextWithCGContext:graphicsPort flipped:initialFlippedState];
-#endif
-    return [self graphicsContextWithGraphicsPort:graphicsPort flipped:initialFlippedState];
-}
-
-@end
-
-QT_BEGIN_NAMESPACE
-
 static NSImage *qt_mac_cgimage_to_nsimage(CGImageRef image)
 {
     NSImage *newImage = [[NSImage alloc] initWithCGImage:image size:NSZeroSize];
@@ -179,7 +157,7 @@ QPixmap qt_mac_toQPixmap(const NSImage *image, const QSizeF &size)
     QMacCGContext ctx(&pixmap);
     if (!ctx)
         return QPixmap();
-    NSGraphicsContext *gc = [NSGraphicsContext qt_graphicsContextWithCGContext:ctx flipped:YES];
+    NSGraphicsContext *gc = [NSGraphicsContext graphicsContextWithCGContext:ctx flipped:YES];
     if (!gc)
         return QPixmap();
     [NSGraphicsContext saveGraphicsState];
diff --git a/src/gui/painting/qdrawhelper.cpp b/src/gui/painting/qdrawhelper.cpp
index 298304c4ef..9b5f15470e 100644
--- a/src/gui/painting/qdrawhelper.cpp
+++ b/src/gui/painting/qdrawhelper.cpp
@@ -43,6 +43,7 @@
 #include <qstylehints.h>
 #include <qguiapplication.h>
 #include <qatomic.h>
+#include <private/qcolorprofile_p.h>
 #include <private/qdrawhelper_p.h>
 #include <private/qpaintengine_raster_p.h>
 #include <private/qpainter_p.h>
@@ -1320,7 +1321,7 @@ static void QT_FASTCALL destStore(QRasterBuffer *rasterBuffer, int x, int y, con
 static void QT_FASTCALL convertFromRgb64(uint *dest, const QRgba64 *src, int length)
 {
     for (int i = 0; i < length; ++i) {
-        dest[i] = src[i].toArgb32();
+        dest[i] = toArgb32(src[i]);
     }
 }
 
@@ -1411,7 +1412,7 @@ static void QT_FASTCALL destStore64ARGB32(QRasterBuffer *rasterBuffer, int x, in
 {
     uint *dest = (uint*)rasterBuffer->scanLine(y) + x;
     for (int i = 0; i < length; ++i) {
-        dest[i] = buffer[i].unpremultiplied().toArgb32();
+        dest[i] = toArgb32(buffer[i].unpremultiplied());
     }
 }
 
@@ -1419,7 +1420,7 @@ static void QT_FASTCALL destStore64RGBA8888(QRasterBuffer *rasterBuffer, int x,
 {
     uint *dest = (uint*)rasterBuffer->scanLine(y) + x;
     for (int i = 0; i < length; ++i) {
-        dest[i] = ARGB2RGBA(buffer[i].unpremultiplied().toArgb32());
+        dest[i] = toRgba8888(buffer[i].unpremultiplied());
     }
 }
 
@@ -1918,562 +1919,695 @@ inline void fetchTransformedBilinear_pixelBounds<BlendTransformedBilinear>(int,
     Q_ASSERT(v2 >= l1 && v2 <= l2);
 }
 
-template<TextureBlendType blendType> /* blendType = BlendTransformedBilinear or BlendTransformedBilinearTiled */
-static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, const Operator *,
-                                                                 const QSpanData *data, int y, int x,
-                                                                 int length)
-{
-    int image_width = data->texture.width;
-    int image_height = data->texture.height;
-
-    int image_x1 = data->texture.x1;
-    int image_y1 = data->texture.y1;
-    int image_x2 = data->texture.x2 - 1;
-    int image_y2 = data->texture.y2 - 1;
-
-    const qreal cx = x + qreal(0.5);
-    const qreal cy = y + qreal(0.5);
-
-    uint *end = buffer + length;
-    uint *b = buffer;
-    if (data->fast_matrix) {
-        // The increment pr x in the scanline
-        int fdx = (int)(data->m11 * fixed_scale);
-        int fdy = (int)(data->m12 * fixed_scale);
-
-        int fx = int((data->m21 * cy
-                      + data->m11 * cx + data->dx) * fixed_scale);
-        int fy = int((data->m22 * cy
-                      + data->m12 * cx + data->dy) * fixed_scale);
-
-        fx -= half_point;
-        fy -= half_point;
-
-        if (fdy == 0) { //simple scale, no rotation
-            int y1 = (fy >> 16);
-            int y2;
-            fetchTransformedBilinear_pixelBounds<blendType>(image_height, image_y1, image_y2, y1, y2);
-            const uint *s1 = (const uint *)data->texture.scanLine(y1);
-            const uint *s2 = (const uint *)data->texture.scanLine(y2);
-
-            if (fdx <= fixed_scale && fdx > 0) { // scale up on X
-                int disty = (fy & 0x0000ffff) >> 8;
-                int idisty = 256 - disty;
-                int x = fx >> 16;
+enum FastTransformTypes {
+    SimpleUpscaleTransform,
+    UpscaleTransform,
+    DownscaleTransform,
+    RotateTransform,
+    FastRotateTransform,
+    NFastTransformTypes
+};
 
-                // The idea is first to do the interpolation between the row s1 and the row s2
-                // into an intermediate buffer, then we interpolate between two pixel of this buffer.
+typedef void (QT_FASTCALL *BilinearFastTransformHelper)(uint *b, uint *end, const QTextureData &image, int &fx, int &fy, int fdx, int fdy);
 
-                // intermediate_buffer[0] is a buffer of red-blue component of the pixel, in the form 0x00RR00BB
-                // intermediate_buffer[1] is the alpha-green component of the pixel, in the form 0x00AA00GG
-                // +1 for the last pixel to interpolate with, and +1 for rounding errors.
-                quint32 intermediate_buffer[2][buffer_size + 2];
-                // count is the size used in the intermediate_buffer.
-                int count = (qint64(length) * fdx + fixed_scale - 1) / fixed_scale + 2;
-                Q_ASSERT(count <= buffer_size + 2); //length is supposed to be <= buffer_size and data->m11 < 1 in this case
-                int f = 0;
-                int lim = count;
-                if (blendType == BlendTransformedBilinearTiled) {
-                    x %= image_width;
-                    if (x < 0) x += image_width;
-                } else {
-                    lim = qMin(count, image_x2-x+1);
-                    if (x < image_x1) {
-                        Q_ASSERT(x <= image_x2);
-                        uint t = s1[image_x1];
-                        uint b = s2[image_x1];
-                        quint32 rb = (((t & 0xff00ff) * idisty + (b & 0xff00ff) * disty) >> 8) & 0xff00ff;
-                        quint32 ag = ((((t>>8) & 0xff00ff) * idisty + ((b>>8) & 0xff00ff) * disty) >> 8) & 0xff00ff;
-                        do {
-                            intermediate_buffer[0][f] = rb;
-                            intermediate_buffer[1][f] = ag;
-                            f++;
-                            x++;
-                        } while (x < image_x1 && f < lim);
-                    }
-                }
+template<TextureBlendType blendType>
+static void QT_FASTCALL fetchTransformedBilinearARGB32PM_simple_upscale_helper(uint *b, uint *end, const QTextureData &image,
+                                                                               int &fx, int &fy, int fdx, int /*fdy*/)
+{
+    int y1 = (fy >> 16);
+    int y2;
+    fetchTransformedBilinear_pixelBounds<blendType>(image.height, image.y1, image.y2 - 1, y1, y2);
+    const uint *s1 = (const uint *)image.scanLine(y1);
+    const uint *s2 = (const uint *)image.scanLine(y2);
+
+    int disty = (fy & 0x0000ffff) >> 8;
+    int idisty = 256 - disty;
+    int x = fx >> 16;
+    int length = end - b;
+
+    // The idea is first to do the interpolation between the row s1 and the row s2
+    // into an intermediate buffer, then we interpolate between two pixels of this buffer.
+
+    // intermediate_buffer[0] is a buffer of red-blue component of the pixel, in the form 0x00RR00BB
+    // intermediate_buffer[1] is the alpha-green component of the pixel, in the form 0x00AA00GG
+    // +1 for the last pixel to interpolate with, and +1 for rounding errors.
+    quint32 intermediate_buffer[2][buffer_size + 2];
+    // count is the size used in the intermediate_buffer.
+    int count = (qint64(length) * fdx + fixed_scale - 1) / fixed_scale + 2;
+    Q_ASSERT(count <= buffer_size + 2); //length is supposed to be <= buffer_size and data->m11 < 1 in this case
+    int f = 0;
+    int lim = count;
+    if (blendType == BlendTransformedBilinearTiled) {
+        x %= image.width;
+        if (x < 0) x += image.width;
+    } else {
+        lim = qMin(count, image.x2 - x);
+        if (x < image.x1) {
+            Q_ASSERT(x < image.x2);
+            uint t = s1[image.x1];
+            uint b = s2[image.x1];
+            quint32 rb = (((t & 0xff00ff) * idisty + (b & 0xff00ff) * disty) >> 8) & 0xff00ff;
+            quint32 ag = ((((t>>8) & 0xff00ff) * idisty + ((b>>8) & 0xff00ff) * disty) >> 8) & 0xff00ff;
+            do {
+                intermediate_buffer[0][f] = rb;
+                intermediate_buffer[1][f] = ag;
+                f++;
+                x++;
+            } while (x < image.x1 && f < lim);
+        }
+    }
 
-                if (blendType != BlendTransformedBilinearTiled) {
+    if (blendType != BlendTransformedBilinearTiled) {
 #if defined(__SSE2__)
-                    const __m128i disty_ = _mm_set1_epi16(disty);
-                    const __m128i idisty_ = _mm_set1_epi16(idisty);
-                    const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
-
-                    lim -= 3;
-                    for (; f < lim; x += 4, f += 4) {
-                        // Load 4 pixels from s1, and split the alpha-green and red-blue component
-                        __m128i top = _mm_loadu_si128((const __m128i*)((const uint *)(s1)+x));
-                        __m128i topAG = _mm_srli_epi16(top, 8);
-                        __m128i topRB = _mm_and_si128(top, colorMask);
-                        // Multiplies each colour component by idisty
-                        topAG = _mm_mullo_epi16 (topAG, idisty_);
-                        topRB = _mm_mullo_epi16 (topRB, idisty_);
-
-                        // Same for the s2 vector
-                        __m128i bottom = _mm_loadu_si128((const __m128i*)((const uint *)(s2)+x));
-                        __m128i bottomAG = _mm_srli_epi16(bottom, 8);
-                        __m128i bottomRB = _mm_and_si128(bottom, colorMask);
-                        bottomAG = _mm_mullo_epi16 (bottomAG, disty_);
-                        bottomRB = _mm_mullo_epi16 (bottomRB, disty_);
-
-                        // Add the values, and shift to only keep 8 significant bits per colors
-                        __m128i rAG =_mm_add_epi16(topAG, bottomAG);
-                        rAG = _mm_srli_epi16(rAG, 8);
-                        _mm_storeu_si128((__m128i*)(&intermediate_buffer[1][f]), rAG);
-                        __m128i rRB =_mm_add_epi16(topRB, bottomRB);
-                        rRB = _mm_srli_epi16(rRB, 8);
-                        _mm_storeu_si128((__m128i*)(&intermediate_buffer[0][f]), rRB);
-                    }
+        const __m128i disty_ = _mm_set1_epi16(disty);
+        const __m128i idisty_ = _mm_set1_epi16(idisty);
+        const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
+
+        lim -= 3;
+        for (; f < lim; x += 4, f += 4) {
+            // Load 4 pixels from s1, and split the alpha-green and red-blue components
+            __m128i top = _mm_loadu_si128((const __m128i*)((const uint *)(s1)+x));
+            __m128i topAG = _mm_srli_epi16(top, 8);
+            __m128i topRB = _mm_and_si128(top, colorMask);
+            // Multiplies each color component by idisty
+            topAG = _mm_mullo_epi16 (topAG, idisty_);
+            topRB = _mm_mullo_epi16 (topRB, idisty_);
+
+            // Same for the s2 vector
+            __m128i bottom = _mm_loadu_si128((const __m128i*)((const uint *)(s2)+x));
+            __m128i bottomAG = _mm_srli_epi16(bottom, 8);
+            __m128i bottomRB = _mm_and_si128(bottom, colorMask);
+            bottomAG = _mm_mullo_epi16 (bottomAG, disty_);
+            bottomRB = _mm_mullo_epi16 (bottomRB, disty_);
+
+            // Add the values, and shift to only keep 8 significant bits per color
+            __m128i rAG =_mm_add_epi16(topAG, bottomAG);
+            rAG = _mm_srli_epi16(rAG, 8);
+            _mm_storeu_si128((__m128i*)(&intermediate_buffer[1][f]), rAG);
+            __m128i rRB =_mm_add_epi16(topRB, bottomRB);
+            rRB = _mm_srli_epi16(rRB, 8);
+            _mm_storeu_si128((__m128i*)(&intermediate_buffer[0][f]), rRB);
+        }
 #elif defined(__ARM_NEON__)
-                    const int16x8_t disty_ = vdupq_n_s16(disty);
-                    const int16x8_t idisty_ = vdupq_n_s16(idisty);
-                    const int16x8_t colorMask = vdupq_n_s16(0x00ff);
-
-                    lim -= 3;
-                    for (; f < lim; x += 4, f += 4) {
-                        // Load 4 pixels from s1, and split the alpha-green and red-blue component
-                        int16x8_t top = vld1q_s16((int16_t*)((const uint *)(s1)+x));
-                        int16x8_t topAG = vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(top), 8));
-                        int16x8_t topRB = vandq_s16(top, colorMask);
-                        // Multiplies each colour component by idisty
-                        topAG = vmulq_s16(topAG, idisty_);
-                        topRB = vmulq_s16(topRB, idisty_);
-
-                        // Same for the s2 vector
-                        int16x8_t bottom = vld1q_s16((int16_t*)((const uint *)(s2)+x));
-                        int16x8_t bottomAG = vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(bottom), 8));
-                        int16x8_t bottomRB = vandq_s16(bottom, colorMask);
-                        bottomAG = vmulq_s16(bottomAG, disty_);
-                        bottomRB = vmulq_s16(bottomRB, disty_);
-
-                        // Add the values, and shift to only keep 8 significant bits per colors
-                        int16x8_t rAG = vaddq_s16(topAG, bottomAG);
-                        rAG = vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(rAG), 8));
-                        vst1q_s16((int16_t*)(&intermediate_buffer[1][f]), rAG);
-                        int16x8_t rRB = vaddq_s16(topRB, bottomRB);
-                        rRB = vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(rRB), 8));
-                        vst1q_s16((int16_t*)(&intermediate_buffer[0][f]), rRB);
-                    }
+        const int16x8_t disty_ = vdupq_n_s16(disty);
+        const int16x8_t idisty_ = vdupq_n_s16(idisty);
+        const int16x8_t colorMask = vdupq_n_s16(0x00ff);
+
+        lim -= 3;
+        for (; f < lim; x += 4, f += 4) {
+            // Load 4 pixels from s1, and split the alpha-green and red-blue components
+            int16x8_t top = vld1q_s16((int16_t*)((const uint *)(s1)+x));
+            int16x8_t topAG = vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(top), 8));
+            int16x8_t topRB = vandq_s16(top, colorMask);
+            // Multiplies each color component by idisty
+            topAG = vmulq_s16(topAG, idisty_);
+            topRB = vmulq_s16(topRB, idisty_);
+
+            // Same for the s2 vector
+            int16x8_t bottom = vld1q_s16((int16_t*)((const uint *)(s2)+x));
+            int16x8_t bottomAG = vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(bottom), 8));
+            int16x8_t bottomRB = vandq_s16(bottom, colorMask);
+            bottomAG = vmulq_s16(bottomAG, disty_);
+            bottomRB = vmulq_s16(bottomRB, disty_);
+
+            // Add the values, and shift to only keep 8 significant bits per color
+            int16x8_t rAG = vaddq_s16(topAG, bottomAG);
+            rAG = vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(rAG), 8));
+            vst1q_s16((int16_t*)(&intermediate_buffer[1][f]), rAG);
+            int16x8_t rRB = vaddq_s16(topRB, bottomRB);
+            rRB = vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(rRB), 8));
+            vst1q_s16((int16_t*)(&intermediate_buffer[0][f]), rRB);
+        }
 #endif
-                }
-                for (; f < count; f++) { // Same as above but without sse2
-                    if (blendType == BlendTransformedBilinearTiled) {
-                        if (x >= image_width) x -= image_width;
-                    } else {
-                        x = qMin(x, image_x2);
-                    }
+    }
+    for (; f < count; f++) { // Same as above but without simd
+        if (blendType == BlendTransformedBilinearTiled) {
+            if (x >= image.width) x -= image.width;
+        } else {
+            x = qMin(x, image.x2 - 1);
+        }
 
-                    uint t = s1[x];
-                    uint b = s2[x];
+        uint t = s1[x];
+        uint b = s2[x];
 
-                    intermediate_buffer[0][f] = (((t & 0xff00ff) * idisty + (b & 0xff00ff) * disty) >> 8) & 0xff00ff;
-                    intermediate_buffer[1][f] = ((((t>>8) & 0xff00ff) * idisty + ((b>>8) & 0xff00ff) * disty) >> 8) & 0xff00ff;
-                    x++;
-                }
-                // Now interpolate the values from the intermediate_buffer to get the final result.
-                fx &= fixed_scale - 1;
-                Q_ASSERT((fx >> 16) == 0);
-                while (b < end) {
-                    int x1 = (fx >> 16);
-                    int x2 = x1 + 1;
-                    Q_ASSERT(x1 >= 0);
-                    Q_ASSERT(x2 < count);
+        intermediate_buffer[0][f] = (((t & 0xff00ff) * idisty + (b & 0xff00ff) * disty) >> 8) & 0xff00ff;
+        intermediate_buffer[1][f] = ((((t>>8) & 0xff00ff) * idisty + ((b>>8) & 0xff00ff) * disty) >> 8) & 0xff00ff;
+        x++;
+    }
+    // Now interpolate the values from the intermediate_buffer to get the final result.
+    fx &= fixed_scale - 1;
+    Q_ASSERT((fx >> 16) == 0);
+    while (b < end) {
+        int x1 = (fx >> 16);
+        int x2 = x1 + 1;
+        Q_ASSERT(x1 >= 0);
+        Q_ASSERT(x2 < count);
+
+        int distx = (fx & 0x0000ffff) >> 8;
+        int idistx = 256 - distx;
+        int rb = ((intermediate_buffer[0][x1] * idistx + intermediate_buffer[0][x2] * distx) >> 8) & 0xff00ff;
+        int ag = (intermediate_buffer[1][x1] * idistx + intermediate_buffer[1][x2] * distx) & 0xff00ff00;
+        *b = rb | ag;
+        b++;
+        fx += fdx;
+    }
+}
 
-                    int distx = (fx & 0x0000ffff) >> 8;
-                    int idistx = 256 - distx;
-                    int rb = ((intermediate_buffer[0][x1] * idistx + intermediate_buffer[0][x2] * distx) >> 8) & 0xff00ff;
-                    int ag = (intermediate_buffer[1][x1] * idistx + intermediate_buffer[1][x2] * distx) & 0xff00ff00;
-                    *b = rb | ag;
-                    b++;
-                    fx += fdx;
-                }
-            } else if ((fdx < 0 && fdx > -(fixed_scale / 8)) || std::abs(data->m22) < (1./8.)) { // scale up more than 8x
-                int y1 = (fy >> 16);
-                int y2;
-                fetchTransformedBilinear_pixelBounds<blendType>(image_height, image_y1, image_y2, y1, y2);
-                const uint *s1 = (const uint *)data->texture.scanLine(y1);
-                const uint *s2 = (const uint *)data->texture.scanLine(y2);
-                int disty = (fy & 0x0000ffff) >> 8;
-                while (b < end) {
-                    int x1 = (fx >> 16);
-                    int x2;
-                    fetchTransformedBilinear_pixelBounds<blendType>(image_width, image_x1, image_x2, x1, x2);
-                    uint tl = s1[x1];
-                    uint tr = s1[x2];
-                    uint bl = s2[x1];
-                    uint br = s2[x2];
-                    int distx = (fx & 0x0000ffff) >> 8;
-                    *b = interpolate_4_pixels(tl, tr, bl, br, distx, disty);
+template<TextureBlendType blendType>
+static void QT_FASTCALL fetchTransformedBilinearARGB32PM_upscale_helper(uint *b, uint *end, const QTextureData &image,
+                                                                        int &fx, int &fy, int fdx, int /*fdy*/)
+{ // Scale-only bilinear fetch into [b, end): one source row pair serves the whole span, only fx is stepped (by fdx).
+    int y1 = (fy >> 16);
+    int y2;
+    fetchTransformedBilinear_pixelBounds<blendType>(image.height, image.y1, image.y2 - 1, y1, y2);
+    const uint *s1 = (const uint *)image.scanLine(y1);
+    const uint *s2 = (const uint *)image.scanLine(y2);
+    const int disty = (fy & 0x0000ffff) >> 8; // top 8 bits of the low 16 bits of fy: the vertical blend weight
+    // Non-tiled blend types: lead-in while the bounds helper yields x1 == x2 (presumably an edge clamp), then an unchecked middle.
+    if (blendType != BlendTransformedBilinearTiled) {
+        const qint64 min_fx = qint64(image.x1) * fixed_scale;
+        const qint64 max_fx = qint64(image.x2 - 1) * fixed_scale;
+        while (b < end) {
+            int x1 = (fx >> 16);
+            int x2;
+            fetchTransformedBilinear_pixelBounds<blendType>(image.width, image.x1, image.x2 - 1, x1, x2);
+            if (x1 != x2)
+                break;
+            uint top = s1[x1];
+            uint bot = s2[x1];
+            *b = INTERPOLATE_PIXEL_256(top, 256 - disty, bot, disty);
+            fx += fdx;
+            ++b;
+        }
+        uint *boundedEnd = end;
+        if (fdx > 0)
+            boundedEnd = qMin(boundedEnd, b + (max_fx - fx) / fdx);
+        else if (fdx < 0)
+            boundedEnd = qMin(boundedEnd, b + (min_fx - fx) / fdx);
+
+        // A fast middle part without boundary checks; boundedEnd caps it at the fdx steps that fit before max_fx (min_fx if fdx < 0)
+        while (b < boundedEnd) {
+            int x = (fx >> 16);
+            int distx = (fx & 0x0000ffff) >> 8;
+            *b = interpolate_4_pixels(s1 + x, s2 + x, distx, disty);
+            fx += fdx;
+            ++b;
+        }
+    }
 
-                    fx += fdx;
-                    ++b;
-                }
-            } else { //scale down
-                int y1 = (fy >> 16);
-                int y2;
-                fetchTransformedBilinear_pixelBounds<blendType>(image_height, image_y1, image_y2, y1, y2);
-                const uint *s1 = (const uint *)data->texture.scanLine(y1);
-                const uint *s2 = (const uint *)data->texture.scanLine(y2);
-                const int disty8 = (fy & 0x0000ffff) >> 8;
-                const int disty4 = (disty8 + 0x08) >> 4;
-
-                if (blendType != BlendTransformedBilinearTiled) {
-#define BILINEAR_DOWNSCALE_BOUNDS_PROLOG \
-                    const qint64 min_fx = qint64(image_x1) * fixed_scale; \
-                    const qint64 max_fx = qint64(image_x2) * fixed_scale; \
-                    while (b < end) { \
-                        int x1 = (fx >> 16); \
-                        int x2; \
-                        fetchTransformedBilinear_pixelBounds<blendType>(image_width, image_x1, image_x2, x1, x2); \
-                        if (x1 != x2) \
-                            break; \
-                        uint top = s1[x1]; \
-                        uint bot = s2[x1]; \
-                        *b = INTERPOLATE_PIXEL_256(top, 256 - disty8, bot, disty8); \
-                        fx += fdx; \
-                        ++b; \
-                    } \
-                    uint *boundedEnd = end; \
-                    if (fdx > 0) \
-                        boundedEnd = qMin(boundedEnd, b + (max_fx - fx) / fdx); \
-                    else if (fdx < 0) \
-                        boundedEnd = qMin(boundedEnd, b + (min_fx - fx) / fdx); \
-                    boundedEnd -= 3;
+    while (b < end) { // whatever is left (the whole span for the tiled blend type): bounds helper applied per pixel
+        int x1 = (fx >> 16);
+        int x2;
+        fetchTransformedBilinear_pixelBounds<blendType>(image.width, image.x1, image.x2 - 1 , x1, x2);
+        uint tl = s1[x1];
+        uint tr = s1[x2];
+        uint bl = s2[x1];
+        uint br = s2[x2];
+        int distx = (fx & 0x0000ffff) >> 8;
+        *b = interpolate_4_pixels(tl, tr, bl, br, distx, disty);
+
+        fx += fdx;
+        ++b;
+    }
+}
 
+template<TextureBlendType blendType>
+static void QT_FASTCALL fetchTransformedBilinearARGB32PM_downscale_helper(uint *b, uint *end, const QTextureData &image,
+                                                                          int &fx, int &fy, int fdx, int /*fdy*/)
+{ // Scale-only bilinear fetch into [b, end): one source row pair serves the whole span, only fx is stepped (by fdx).
+    int y1 = (fy >> 16);
+    int y2;
+    fetchTransformedBilinear_pixelBounds<blendType>(image.height, image.y1, image.y2 - 1, y1, y2);
+    const uint *s1 = (const uint *)image.scanLine(y1);
+    const uint *s2 = (const uint *)image.scanLine(y2);
+    const int disty8 = (fy & 0x0000ffff) >> 8; // 8-bit vertical weight, for interpolate_4_pixels / INTERPOLATE_PIXEL_256
+    const int disty4 = (disty8 + 0x08) >> 4; // rounded to 4 bits, for the interpolate_4_pixels_16* variants
+
+    if (blendType != BlendTransformedBilinearTiled) {
+        const qint64 min_fx = qint64(image.x1) * fixed_scale;
+        const qint64 max_fx = qint64(image.x2 - 1) * fixed_scale;
+        while (b < end) { // lead-in: runs while the bounds helper yields x1 == x2 (presumably an edge clamp)
+            int x1 = (fx >> 16);
+            int x2;
+            fetchTransformedBilinear_pixelBounds<blendType>(image.width, image.x1, image.x2 - 1, x1, x2);
+            if (x1 != x2)
+                break;
+            uint top = s1[x1];
+            uint bot = s2[x1];
+            *b = INTERPOLATE_PIXEL_256(top, 256 - disty8, bot, disty8);
+            fx += fdx;
+            ++b;
+        }
+        uint *boundedEnd = end;
+        if (fdx > 0)
+            boundedEnd = qMin(boundedEnd, b + (max_fx - fx) / fdx);
+        else if (fdx < 0)
+            boundedEnd = qMin(boundedEnd, b + (min_fx - fx) / fdx);
+        // A fast middle part without boundary checks; the SIMD loops write 4 pixels per iteration, hence "boundedEnd - 3"
 #if defined(__SSE2__)
-                    BILINEAR_DOWNSCALE_BOUNDS_PROLOG
-
-                    const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
-                    const __m128i v_256 = _mm_set1_epi16(256);
-                    const __m128i v_disty = _mm_set1_epi16(disty4);
-                    const __m128i v_fdx = _mm_set1_epi32(fdx*4);
-                    const __m128i v_fx_r = _mm_set1_epi32(0x8);
-                    __m128i v_fx = _mm_setr_epi32(fx, fx + fdx, fx + fdx + fdx, fx + fdx + fdx + fdx);
-
-                    while (b < boundedEnd) {
-                        __m128i offset = _mm_srli_epi32(v_fx, 16);
-                        const int offset0 = _mm_cvtsi128_si32(offset); offset = _mm_srli_si128(offset, 4);
-                        const int offset1 = _mm_cvtsi128_si32(offset); offset = _mm_srli_si128(offset, 4);
-                        const int offset2 = _mm_cvtsi128_si32(offset); offset = _mm_srli_si128(offset, 4);
-                        const int offset3 = _mm_cvtsi128_si32(offset);
-                        const __m128i tl = _mm_setr_epi32(s1[offset0], s1[offset1], s1[offset2], s1[offset3]);
-                        const __m128i tr = _mm_setr_epi32(s1[offset0 + 1], s1[offset1 + 1], s1[offset2 + 1], s1[offset3 + 1]);
-                        const __m128i bl = _mm_setr_epi32(s2[offset0], s2[offset1], s2[offset2], s2[offset3]);
-                        const __m128i br = _mm_setr_epi32(s2[offset0 + 1], s2[offset1 + 1], s2[offset2 + 1], s2[offset3 + 1]);
-
-                        __m128i v_distx = _mm_srli_epi16(v_fx, 8);
-                        v_distx = _mm_srli_epi16(_mm_add_epi32(v_distx, v_fx_r), 4);
-                        v_distx = _mm_shufflehi_epi16(v_distx, _MM_SHUFFLE(2,2,0,0));
-                        v_distx = _mm_shufflelo_epi16(v_distx, _MM_SHUFFLE(2,2,0,0));
-
-                        interpolate_4_pixels_16_sse2(tl, tr, bl, br, v_distx, v_disty, colorMask, v_256, b);
-                        b += 4;
-                        v_fx = _mm_add_epi32(v_fx, v_fdx);
-                    }
-                    fx = _mm_cvtsi128_si32(v_fx);
+        const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
+        const __m128i v_256 = _mm_set1_epi16(256);
+        const __m128i v_disty = _mm_set1_epi16(disty4);
+        const __m128i v_fdx = _mm_set1_epi32(fdx*4);
+        const __m128i v_fx_r = _mm_set1_epi32(0x8);
+        __m128i v_fx = _mm_setr_epi32(fx, fx + fdx, fx + fdx + fdx, fx + fdx + fdx + fdx);
+
+        while (b < boundedEnd - 3) {
+            __m128i offset = _mm_srli_epi32(v_fx, 16);
+            const int offset0 = _mm_cvtsi128_si32(offset); offset = _mm_srli_si128(offset, 4);
+            const int offset1 = _mm_cvtsi128_si32(offset); offset = _mm_srli_si128(offset, 4);
+            const int offset2 = _mm_cvtsi128_si32(offset); offset = _mm_srli_si128(offset, 4);
+            const int offset3 = _mm_cvtsi128_si32(offset);
+            const __m128i tl = _mm_setr_epi32(s1[offset0], s1[offset1], s1[offset2], s1[offset3]);
+            const __m128i tr = _mm_setr_epi32(s1[offset0 + 1], s1[offset1 + 1], s1[offset2 + 1], s1[offset3 + 1]);
+            const __m128i bl = _mm_setr_epi32(s2[offset0], s2[offset1], s2[offset2], s2[offset3]);
+            const __m128i br = _mm_setr_epi32(s2[offset0 + 1], s2[offset1 + 1], s2[offset2 + 1], s2[offset3 + 1]);
+
+            __m128i v_distx = _mm_srli_epi16(v_fx, 8);
+            v_distx = _mm_srli_epi16(_mm_add_epi32(v_distx, v_fx_r), 4);
+            v_distx = _mm_shufflehi_epi16(v_distx, _MM_SHUFFLE(2,2,0,0));
+            v_distx = _mm_shufflelo_epi16(v_distx, _MM_SHUFFLE(2,2,0,0));
+
+            interpolate_4_pixels_16_sse2(tl, tr, bl, br, v_distx, v_disty, colorMask, v_256, b);
+            b += 4;
+            v_fx = _mm_add_epi32(v_fx, v_fdx);
+        }
+        fx = _mm_cvtsi128_si32(v_fx); // carry the vector position (lane 0) back into the scalar fx
 #elif defined(__ARM_NEON__)
-                    BILINEAR_DOWNSCALE_BOUNDS_PROLOG
-
-                    const int16x8_t colorMask = vdupq_n_s16(0x00ff);
-                    const int16x8_t invColorMask = vmvnq_s16(colorMask);
-                    const int16x8_t v_256 = vdupq_n_s16(256);
-                    const int16x8_t v_disty = vdupq_n_s16(disty4);
-                    const int16x8_t v_disty_ = vshlq_n_s16(v_disty, 4);
-                    int32x4_t v_fdx = vdupq_n_s32(fdx*4);
+        const int16x8_t colorMask = vdupq_n_s16(0x00ff);
+        const int16x8_t invColorMask = vmvnq_s16(colorMask);
+        const int16x8_t v_256 = vdupq_n_s16(256);
+        const int16x8_t v_disty = vdupq_n_s16(disty4);
+        const int16x8_t v_disty_ = vshlq_n_s16(v_disty, 4);
+        int32x4_t v_fdx = vdupq_n_s32(fdx*4);
 
-                    int32x4_t v_fx = vmovq_n_s32(fx);
-                    v_fx = vsetq_lane_s32(fx + fdx, v_fx, 1);
-                    v_fx = vsetq_lane_s32(fx + fdx * 2, v_fx, 2);
-                    v_fx = vsetq_lane_s32(fx + fdx * 3, v_fx, 3);
+        int32x4_t v_fx = vmovq_n_s32(fx);
+        v_fx = vsetq_lane_s32(fx + fdx, v_fx, 1);
+        v_fx = vsetq_lane_s32(fx + fdx * 2, v_fx, 2);
+        v_fx = vsetq_lane_s32(fx + fdx * 3, v_fx, 3);
 
-                    const int32x4_t v_ffff_mask = vdupq_n_s32(0x0000ffff);
-                    const int32x4_t v_fx_r = vdupq_n_s32(0x0800);
+        const int32x4_t v_ffff_mask = vdupq_n_s32(0x0000ffff);
+        const int32x4_t v_fx_r = vdupq_n_s32(0x0800);
 
-                    while (b < boundedEnd) {
-                        uint32x4x2_t v_top, v_bot;
+        while (b < boundedEnd - 3) { // scalar fx is stepped four times per iteration, in parallel with v_fx
+            uint32x4x2_t v_top, v_bot;
 
-                        int x1 = (fx >> 16);
-                        fx += fdx;
-                        v_top = vld2q_lane_u32(s1 + x1, v_top, 0);
-                        v_bot = vld2q_lane_u32(s2 + x1, v_bot, 0);
-                        x1 = (fx >> 16);
-                        fx += fdx;
-                        v_top = vld2q_lane_u32(s1 + x1, v_top, 1);
-                        v_bot = vld2q_lane_u32(s2 + x1, v_bot, 1);
-                        x1 = (fx >> 16);
-                        fx += fdx;
-                        v_top = vld2q_lane_u32(s1 + x1, v_top, 2);
-                        v_bot = vld2q_lane_u32(s2 + x1, v_bot, 2);
-                        x1 = (fx >> 16);
-                        fx += fdx;
-                        v_top = vld2q_lane_u32(s1 + x1, v_top, 3);
-                        v_bot = vld2q_lane_u32(s2 + x1, v_bot, 3);
-
-                        int32x4_t v_distx = vshrq_n_s32(vaddq_s32(vandq_s32(v_fx, v_ffff_mask), v_fx_r), 12);
-                        v_distx = vorrq_s32(v_distx, vshlq_n_s32(v_distx, 16));
-
-                        interpolate_4_pixels_16_neon(
-                                    vreinterpretq_s16_u32(v_top.val[0]), vreinterpretq_s16_u32(v_top.val[1]),
-                                    vreinterpretq_s16_u32(v_bot.val[0]), vreinterpretq_s16_u32(v_bot.val[1]),
-                                    vreinterpretq_s16_s32(v_distx), v_disty, v_disty_,
-                                    colorMask, invColorMask, v_256, b);
-                        b+=4;
-                        v_fx = vaddq_s32(v_fx, v_fdx);
-                    }
+            int x1 = (fx >> 16);
+            fx += fdx;
+            v_top = vld2q_lane_u32(s1 + x1, v_top, 0);
+            v_bot = vld2q_lane_u32(s2 + x1, v_bot, 0);
+            x1 = (fx >> 16);
+            fx += fdx;
+            v_top = vld2q_lane_u32(s1 + x1, v_top, 1);
+            v_bot = vld2q_lane_u32(s2 + x1, v_bot, 1);
+            x1 = (fx >> 16);
+            fx += fdx;
+            v_top = vld2q_lane_u32(s1 + x1, v_top, 2);
+            v_bot = vld2q_lane_u32(s2 + x1, v_bot, 2);
+            x1 = (fx >> 16);
+            fx += fdx;
+            v_top = vld2q_lane_u32(s1 + x1, v_top, 3);
+            v_bot = vld2q_lane_u32(s2 + x1, v_bot, 3);
+
+            int32x4_t v_distx = vshrq_n_s32(vaddq_s32(vandq_s32(v_fx, v_ffff_mask), v_fx_r), 12);
+            v_distx = vorrq_s32(v_distx, vshlq_n_s32(v_distx, 16));
+
+            interpolate_4_pixels_16_neon(
+                        vreinterpretq_s16_u32(v_top.val[0]), vreinterpretq_s16_u32(v_top.val[1]),
+                    vreinterpretq_s16_u32(v_bot.val[0]), vreinterpretq_s16_u32(v_bot.val[1]),
+                    vreinterpretq_s16_s32(v_distx), v_disty, v_disty_,
+                    colorMask, invColorMask, v_256, b);
+            b+=4;
+            v_fx = vaddq_s32(v_fx, v_fdx);
+        }
 #endif
-                }
+        while (b < boundedEnd) { // scalar loop: finishes the unchecked range (all of it when no SIMD path is compiled in)
+            int x = (fx >> 16);
+#if defined(__SSE2__) || defined(__ARM_NEON__)
+            int distx8 = (fx & 0x0000ffff) >> 8;
+            *b = interpolate_4_pixels(s1 + x, s2 + x, distx8, disty8);
+#else
+            uint tl = s1[x];
+            uint tr = s1[x + 1];
+            uint bl = s2[x];
+            uint br = s2[x + 1];
+            int distx4 = ((fx & 0x0000ffff) + 0x0800) >> 12;
+            *b = interpolate_4_pixels_16(tl, tr, bl, br, distx4, disty4);
+#endif
+            fx += fdx;
+            ++b;
+        }
+    }
 
-                while (b < end) {
-                    int x1 = (fx >> 16);
-                    int x2;
-                    fetchTransformedBilinear_pixelBounds<blendType>(image_width, image_x1, image_x2, x1, x2);
-                    uint tl = s1[x1];
-                    uint tr = s1[x2];
-                    uint bl = s2[x1];
-                    uint br = s2[x2];
+    while (b < end) { // whatever is left (the whole span for the tiled blend type): bounds helper applied per pixel
+        int x1 = (fx >> 16);
+        int x2;
+        fetchTransformedBilinear_pixelBounds<blendType>(image.width, image.x1, image.x2 - 1, x1, x2);
+        uint tl = s1[x1];
+        uint tr = s1[x2];
+        uint bl = s2[x1];
+        uint br = s2[x2];
 #if defined(__SSE2__) || defined(__ARM_NEON__)
-                    // The optimized interpolate_4_pixels are faster than interpolate_4_pixels_16.
-                    int distx8 = (fx & 0x0000ffff) >> 8;
-                    *b = interpolate_4_pixels(tl, tr, bl, br, distx8, disty8);
+        // The optimized interpolate_4_pixels are faster than interpolate_4_pixels_16.
+        int distx8 = (fx & 0x0000ffff) >> 8;
+        *b = interpolate_4_pixels(tl, tr, bl, br, distx8, disty8);
 #else
-                    int distx4 = ((fx & 0x0000ffff) + 0x0800) >> 12;
-                    *b = interpolate_4_pixels_16(tl, tr, bl, br, distx4, disty4);
+        int distx4 = ((fx & 0x0000ffff) + 0x0800) >> 12;
+        *b = interpolate_4_pixels_16(tl, tr, bl, br, distx4, disty4);
 #endif
-                    fx += fdx;
-                    ++b;
-                }
-            }
-        } else { //rotation
-            if (std::abs(data->m11) < (1./8.) || std::abs(data->m22) < (1./8.)) {
-                //if we are zooming more than 8 times, we use 8bit precision for the position.
-                while (b < end) {
-                    int x1 = (fx >> 16);
-                    int x2;
-                    int y1 = (fy >> 16);
-                    int y2;
+        fx += fdx;
+        ++b;
+    }
+}
 
-                    fetchTransformedBilinear_pixelBounds<blendType>(image_width, image_x1, image_x2, x1, x2);
-                    fetchTransformedBilinear_pixelBounds<blendType>(image_height, image_y1, image_y2, y1, y2);
+template<TextureBlendType blendType>
+static void QT_FASTCALL fetchTransformedBilinearARGB32PM_rotate_helper(uint *b, uint *end, const QTextureData &image,
+                                                                       int &fx, int &fy, int fdx, int fdy)
+{ // General-transform bilinear fetch into [b, end): fx and fy are both stepped, and both axes go through the bounds helper per pixel.
+    // if we are zooming more than 8 times, we use 8bit precision for the position (selection presumably happens in the caller).
+    while (b < end) {
+        int x1 = (fx >> 16);
+        int x2;
+        int y1 = (fy >> 16);
+        int y2;
 
-                    const uint *s1 = (const uint *)data->texture.scanLine(y1);
-                    const uint *s2 = (const uint *)data->texture.scanLine(y2);
+        fetchTransformedBilinear_pixelBounds<blendType>(image.width, image.x1, image.x2 - 1, x1, x2);
+        fetchTransformedBilinear_pixelBounds<blendType>(image.height, image.y1, image.y2 - 1, y1, y2);
 
-                    uint tl = s1[x1];
-                    uint tr = s1[x2];
-                    uint bl = s2[x1];
-                    uint br = s2[x2];
+        const uint *s1 = (const uint *)image.scanLine(y1); // row pointers are looked up per pixel because y changes along the span
+        const uint *s2 = (const uint *)image.scanLine(y2);
 
-                    int distx = (fx & 0x0000ffff) >> 8;
-                    int disty = (fy & 0x0000ffff) >> 8;
+        uint tl = s1[x1];
+        uint tr = s1[x2];
+        uint bl = s2[x1];
+        uint br = s2[x2];
 
-                    *b = interpolate_4_pixels(tl, tr, bl, br, distx, disty);
+        int distx = (fx & 0x0000ffff) >> 8; // top 8 bits of the low 16 bits: horizontal blend weight
+        int disty = (fy & 0x0000ffff) >> 8; // same for the vertical blend weight
 
-                    fx += fdx;
-                    fy += fdy;
-                    ++b;
-                }
-            } else {
-                //we are zooming less than 8x, use 4bit precision
-
-                if (blendType != BlendTransformedBilinearTiled) {
-#define BILINEAR_ROTATE_BOUNDS_PROLOG \
-                    const qint64 min_fx = qint64(image_x1) * fixed_scale; \
-                    const qint64 max_fx = qint64(image_x2) * fixed_scale; \
-                    const qint64 min_fy = qint64(image_y1) * fixed_scale; \
-                    const qint64 max_fy = qint64(image_y2) * fixed_scale; \
-                    while (b < end) { \
-                        int x1 = (fx >> 16); \
-                        int x2; \
-                        int y1 = (fy >> 16); \
-                        int y2; \
-                        fetchTransformedBilinear_pixelBounds<blendType>(image_width, image_x1, image_x2, x1, x2); \
-                        fetchTransformedBilinear_pixelBounds<blendType>(image_height, image_y1, image_y2, y1, y2); \
-                        if (x1 != x2 && y1 != y2) \
-                            break; \
-                        const uint *s1 = (const uint *)data->texture.scanLine(y1); \
-                        const uint *s2 = (const uint *)data->texture.scanLine(y2); \
-                        uint tl = s1[x1]; \
-                        uint tr = s1[x2]; \
-                        uint bl = s2[x1]; \
-                        uint br = s2[x2]; \
-                        int distx = (fx & 0x0000ffff) >> 8; \
-                        int disty = (fy & 0x0000ffff) >> 8; \
-                        *b = interpolate_4_pixels(tl, tr, bl, br, distx, disty); \
-                        fx += fdx; \
-                        fy += fdy; \
-                        ++b; \
-                    } \
-                    uint *boundedEnd = end; \
-                    if (fdx > 0) \
-                        boundedEnd = qMin(boundedEnd, b + (max_fx - fx) / fdx); \
-                    else if (fdx < 0) \
-                        boundedEnd = qMin(boundedEnd, b + (min_fx - fx) / fdx); \
-                    if (fdy > 0) \
-                        boundedEnd = qMin(boundedEnd, b + (max_fy - fy) / fdy); \
-                    else if (fdy < 0) \
-                        boundedEnd = qMin(boundedEnd, b + (min_fy - fy) / fdy); \
-                    boundedEnd -= 3;
+        *b = interpolate_4_pixels(tl, tr, bl, br, distx, disty);
 
-#if defined(__SSE2__)
-                    BILINEAR_ROTATE_BOUNDS_PROLOG
+        fx += fdx;
+        fy += fdy;
+        ++b;
+    }
+}
 
-                    const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
-                    const __m128i v_256 = _mm_set1_epi16(256);
-                    const __m128i v_fdx = _mm_set1_epi32(fdx*4);
-                    const __m128i v_fdy = _mm_set1_epi32(fdy*4);
-                    const __m128i v_fxy_r = _mm_set1_epi32(0x8);
-                    __m128i v_fx = _mm_setr_epi32(fx, fx + fdx, fx + fdx + fdx, fx + fdx + fdx + fdx);
-                    __m128i v_fy = _mm_setr_epi32(fy, fy + fdy, fy + fdy + fdy, fy + fdy + fdy + fdy);
+template<TextureBlendType blendType>
+static void QT_FASTCALL fetchTransformedBilinearARGB32PM_fast_rotate_helper(uint *b, uint *end, const QTextureData &image,
+                                                                            int &fx, int &fy, int fdx, int fdy)
+{
+    //we are zooming less than 8x, use 4bit precision
+    if (blendType != BlendTransformedBilinearTiled) {
+        const qint64 min_fx = qint64(image.x1) * fixed_scale;
+        const qint64 max_fx = qint64(image.x2 - 1) * fixed_scale;
+        const qint64 min_fy = qint64(image.y1) * fixed_scale;
+        const qint64 max_fy = qint64(image.y2 - 1) * fixed_scale;
+        // first handle the possibly bounded part in the beginning
+        while (b < end) {
+            int x1 = (fx >> 16);
+            int x2;
+            int y1 = (fy >> 16);
+            int y2;
+            fetchTransformedBilinear_pixelBounds<blendType>(image.width, image.x1, image.x2 - 1, x1, x2);
+            fetchTransformedBilinear_pixelBounds<blendType>(image.height, image.y1, image.y2 - 1, y1, y2);
+            if (x1 != x2 && y1 != y2)
+                break;
+            const uint *s1 = (const uint *)image.scanLine(y1);
+            const uint *s2 = (const uint *)image.scanLine(y2);
+            uint tl = s1[x1];
+            uint tr = s1[x2];
+            uint bl = s2[x1];
+            uint br = s2[x2];
+#if defined(__SSE2__) || defined(__ARM_NEON__)
+            int distx = (fx & 0x0000ffff) >> 8;
+            int disty = (fy & 0x0000ffff) >> 8;
+            *b = interpolate_4_pixels(tl, tr, bl, br, distx, disty);
+#else
+            int distx = ((fx & 0x0000ffff) + 0x0800) >> 12;
+            int disty = ((fy & 0x0000ffff) + 0x0800) >> 12;
+            *b = interpolate_4_pixels_16(tl, tr, bl, br, distx, disty);
+#endif
+            fx += fdx;
+            fy += fdy;
+            ++b;
+        }
+        uint *boundedEnd = end; // cap the unchecked range at the steps that fit before fx or fy reaches its min/max limit
+        if (fdx > 0)
+            boundedEnd = qMin(boundedEnd, b + (max_fx - fx) / fdx);
+        else if (fdx < 0)
+            boundedEnd = qMin(boundedEnd, b + (min_fx - fx) / fdx);
+        if (fdy > 0)
+            boundedEnd = qMin(boundedEnd, b + (max_fy - fy) / fdy);
+        else if (fdy < 0)
+            boundedEnd = qMin(boundedEnd, b + (min_fy - fy) / fdy);
+
+        // until boundedEnd we can now have a fast middle part without boundary checks
+#if defined(__SSE2__)
+        const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
+        const __m128i v_256 = _mm_set1_epi16(256);
+        const __m128i v_fdx = _mm_set1_epi32(fdx*4);
+        const __m128i v_fdy = _mm_set1_epi32(fdy*4);
+        const __m128i v_fxy_r = _mm_set1_epi32(0x8);
+        __m128i v_fx = _mm_setr_epi32(fx, fx + fdx, fx + fdx + fdx, fx + fdx + fdx + fdx);
+        __m128i v_fy = _mm_setr_epi32(fy, fy + fdy, fy + fdy + fdy, fy + fdy + fdy + fdy);
+
+        const uchar *textureData = image.imageData;
+        const int bytesPerLine = image.bytesPerLine;
+        const __m128i vbpl = _mm_shufflelo_epi16(_mm_cvtsi32_si128(bytesPerLine/4), _MM_SHUFFLE(0, 0, 0, 0));
+
+        while (b < boundedEnd - 3) {
+            const __m128i vy = _mm_packs_epi32(_mm_srli_epi32(v_fy, 16), _mm_setzero_si128());
+            // 4x16bit * 4x16bit -> 4x32bit
+            __m128i offset = _mm_unpacklo_epi16(_mm_mullo_epi16(vy, vbpl), _mm_mulhi_epi16(vy, vbpl));
+            offset = _mm_add_epi32(offset, _mm_srli_epi32(v_fx, 16));
+            const int offset0 = _mm_cvtsi128_si32(offset); offset = _mm_srli_si128(offset, 4);
+            const int offset1 = _mm_cvtsi128_si32(offset); offset = _mm_srli_si128(offset, 4);
+            const int offset2 = _mm_cvtsi128_si32(offset); offset = _mm_srli_si128(offset, 4);
+            const int offset3 = _mm_cvtsi128_si32(offset);
+            const uint *topData = (const uint *)(textureData);
+            const __m128i tl = _mm_setr_epi32(topData[offset0], topData[offset1], topData[offset2], topData[offset3]);
+            const __m128i tr = _mm_setr_epi32(topData[offset0 + 1], topData[offset1 + 1], topData[offset2 + 1], topData[offset3 + 1]);
+            const uint *bottomData = (const uint *)(textureData + bytesPerLine);
+            const __m128i bl = _mm_setr_epi32(bottomData[offset0], bottomData[offset1], bottomData[offset2], bottomData[offset3]);
+            const __m128i br = _mm_setr_epi32(bottomData[offset0 + 1], bottomData[offset1 + 1], bottomData[offset2 + 1], bottomData[offset3 + 1]);
+
+            __m128i v_distx = _mm_srli_epi16(v_fx, 8);
+            __m128i v_disty = _mm_srli_epi16(v_fy, 8);
+            v_distx = _mm_srli_epi16(_mm_add_epi32(v_distx, v_fxy_r), 4);
+            v_disty = _mm_srli_epi16(_mm_add_epi32(v_disty, v_fxy_r), 4);
+            v_distx = _mm_shufflehi_epi16(v_distx, _MM_SHUFFLE(2,2,0,0));
+            v_distx = _mm_shufflelo_epi16(v_distx, _MM_SHUFFLE(2,2,0,0));
+            v_disty = _mm_shufflehi_epi16(v_disty, _MM_SHUFFLE(2,2,0,0));
+            v_disty = _mm_shufflelo_epi16(v_disty, _MM_SHUFFLE(2,2,0,0));
+
+            interpolate_4_pixels_16_sse2(tl, tr, bl, br, v_distx, v_disty, colorMask, v_256, b);
+            b += 4;
+            v_fx = _mm_add_epi32(v_fx, v_fdx);
+            v_fy = _mm_add_epi32(v_fy, v_fdy);
+        }
+        fx = _mm_cvtsi128_si32(v_fx);
+        fy = _mm_cvtsi128_si32(v_fy);
+#elif defined(__ARM_NEON__)
+        const int16x8_t colorMask = vdupq_n_s16(0x00ff);
+        const int16x8_t invColorMask = vmvnq_s16(colorMask);
+        const int16x8_t v_256 = vdupq_n_s16(256);
+        int32x4_t v_fdx = vdupq_n_s32(fdx * 4);
+        int32x4_t v_fdy = vdupq_n_s32(fdy * 4);
+
+        const uchar *textureData = image.imageData;
+        const int bytesPerLine = image.bytesPerLine;
+
+        int32x4_t v_fx = vmovq_n_s32(fx);
+        int32x4_t v_fy = vmovq_n_s32(fy);
+        v_fx = vsetq_lane_s32(fx + fdx, v_fx, 1);
+        v_fy = vsetq_lane_s32(fy + fdy, v_fy, 1);
+        v_fx = vsetq_lane_s32(fx + fdx * 2, v_fx, 2);
+        v_fy = vsetq_lane_s32(fy + fdy * 2, v_fy, 2);
+        v_fx = vsetq_lane_s32(fx + fdx * 3, v_fx, 3);
+        v_fy = vsetq_lane_s32(fy + fdy * 3, v_fy, 3);
+
+        const int32x4_t v_ffff_mask = vdupq_n_s32(0x0000ffff);
+        const int32x4_t v_round = vdupq_n_s32(0x0800);
+
+        while (b < boundedEnd - 3) {
+            uint32x4x2_t v_top, v_bot;
+
+            int x1 = (fx >> 16);
+            int y1 = (fy >> 16);
+            fx += fdx; fy += fdy;
+            const uchar *sl = textureData + bytesPerLine * y1;
+            const uint *s1 = reinterpret_cast<const uint *>(sl);
+            const uint *s2 = reinterpret_cast<const uint *>(sl + bytesPerLine);
+            v_top = vld2q_lane_u32(s1 + x1, v_top, 0);
+            v_bot = vld2q_lane_u32(s2 + x1, v_bot, 0);
+            x1 = (fx >> 16);
+            y1 = (fy >> 16);
+            fx += fdx; fy += fdy;
+            sl = textureData + bytesPerLine * y1;
+            s1 = reinterpret_cast<const uint *>(sl);
+            s2 = reinterpret_cast<const uint *>(sl + bytesPerLine);
+            v_top = vld2q_lane_u32(s1 + x1, v_top, 1);
+            v_bot = vld2q_lane_u32(s2 + x1, v_bot, 1);
+            x1 = (fx >> 16);
+            y1 = (fy >> 16);
+            fx += fdx; fy += fdy;
+            sl = textureData + bytesPerLine * y1;
+            s1 = reinterpret_cast<const uint *>(sl);
+            s2 = reinterpret_cast<const uint *>(sl + bytesPerLine);
+            v_top = vld2q_lane_u32(s1 + x1, v_top, 2);
+            v_bot = vld2q_lane_u32(s2 + x1, v_bot, 2);
+            x1 = (fx >> 16);
+            y1 = (fy >> 16);
+            fx += fdx; fy += fdy;
+            sl = textureData + bytesPerLine * y1;
+            s1 = reinterpret_cast<const uint *>(sl);
+            s2 = reinterpret_cast<const uint *>(sl + bytesPerLine);
+            v_top = vld2q_lane_u32(s1 + x1, v_top, 3);
+            v_bot = vld2q_lane_u32(s2 + x1, v_bot, 3);
+
+            int32x4_t v_distx = vshrq_n_s32(vaddq_s32(vandq_s32(v_fx, v_ffff_mask), v_round), 12);
+            int32x4_t v_disty = vshrq_n_s32(vaddq_s32(vandq_s32(v_fy, v_ffff_mask), v_round), 12);
+            v_distx = vorrq_s32(v_distx, vshlq_n_s32(v_distx, 16));
+            v_disty = vorrq_s32(v_disty, vshlq_n_s32(v_disty, 16));
+            int16x8_t v_disty_ = vshlq_n_s16(vreinterpretq_s16_s32(v_disty), 4);
+
+            interpolate_4_pixels_16_neon(
+                        vreinterpretq_s16_u32(v_top.val[0]), vreinterpretq_s16_u32(v_top.val[1]),
+                        vreinterpretq_s16_u32(v_bot.val[0]), vreinterpretq_s16_u32(v_bot.val[1]),
+                        vreinterpretq_s16_s32(v_distx), vreinterpretq_s16_s32(v_disty),
+                        v_disty_, colorMask, invColorMask, v_256, b);
+            b += 4;
+            v_fx = vaddq_s32(v_fx, v_fdx);
+            v_fy = vaddq_s32(v_fy, v_fdy);
+        }
+#endif
+        while (b < boundedEnd) {
+            int x = (fx >> 16);
+            int y = (fy >> 16);
 
-                    const uchar *textureData = data->texture.imageData;
-                    const int bytesPerLine = data->texture.bytesPerLine;
-                    const __m128i vbpl = _mm_shufflelo_epi16(_mm_cvtsi32_si128(bytesPerLine/4), _MM_SHUFFLE(0, 0, 0, 0));
+            const uint *s1 = (const uint *)image.scanLine(y);
+            const uint *s2 = (const uint *)image.scanLine(y + 1);
 
-                    while (b < boundedEnd) {
-                        const __m128i vy = _mm_packs_epi32(_mm_srli_epi32(v_fy, 16), _mm_setzero_si128());
-                        // 4x16bit * 4x16bit -> 4x32bit
-                        __m128i offset = _mm_unpacklo_epi16(_mm_mullo_epi16(vy, vbpl), _mm_mulhi_epi16(vy, vbpl));
-                        offset = _mm_add_epi32(offset, _mm_srli_epi32(v_fx, 16));
-                        const int offset0 = _mm_cvtsi128_si32(offset); offset = _mm_srli_si128(offset, 4);
-                        const int offset1 = _mm_cvtsi128_si32(offset); offset = _mm_srli_si128(offset, 4);
-                        const int offset2 = _mm_cvtsi128_si32(offset); offset = _mm_srli_si128(offset, 4);
-                        const int offset3 = _mm_cvtsi128_si32(offset);
-                        const uint *topData = (const uint *)(textureData);
-                        const __m128i tl = _mm_setr_epi32(topData[offset0], topData[offset1], topData[offset2], topData[offset3]);
-                        const __m128i tr = _mm_setr_epi32(topData[offset0 + 1], topData[offset1 + 1], topData[offset2 + 1], topData[offset3 + 1]);
-                        const uint *bottomData = (const uint *)(textureData + bytesPerLine);
-                        const __m128i bl = _mm_setr_epi32(bottomData[offset0], bottomData[offset1], bottomData[offset2], bottomData[offset3]);
-                        const __m128i br = _mm_setr_epi32(bottomData[offset0 + 1], bottomData[offset1 + 1], bottomData[offset2 + 1], bottomData[offset3 + 1]);
-
-                        __m128i v_distx = _mm_srli_epi16(v_fx, 8);
-                        __m128i v_disty = _mm_srli_epi16(v_fy, 8);
-                        v_distx = _mm_srli_epi16(_mm_add_epi32(v_distx, v_fxy_r), 4);
-                        v_disty = _mm_srli_epi16(_mm_add_epi32(v_disty, v_fxy_r), 4);
-                        v_distx = _mm_shufflehi_epi16(v_distx, _MM_SHUFFLE(2,2,0,0));
-                        v_distx = _mm_shufflelo_epi16(v_distx, _MM_SHUFFLE(2,2,0,0));
-                        v_disty = _mm_shufflehi_epi16(v_disty, _MM_SHUFFLE(2,2,0,0));
-                        v_disty = _mm_shufflelo_epi16(v_disty, _MM_SHUFFLE(2,2,0,0));
-
-                        interpolate_4_pixels_16_sse2(tl, tr, bl, br, v_distx, v_disty, colorMask, v_256, b);
-                        b += 4;
-                        v_fx = _mm_add_epi32(v_fx, v_fdx);
-                        v_fy = _mm_add_epi32(v_fy, v_fdy);
-                    }
-                    fx = _mm_cvtsi128_si32(v_fx);
-                    fy = _mm_cvtsi128_si32(v_fy);
-#elif defined(__ARM_NEON__)
-                    BILINEAR_ROTATE_BOUNDS_PROLOG
+#if defined(__SSE2__) || defined(__ARM_NEON__)
+            int distx = (fx & 0x0000ffff) >> 8;
+            int disty = (fy & 0x0000ffff) >> 8;
+            *b = interpolate_4_pixels(s1 + x, s2 + x, distx, disty);
+#else
+            uint tl = s1[x];
+            uint tr = s1[x + 1];
+            uint bl = s2[x];
+            uint br = s2[x + 1];
+            int distx = ((fx & 0x0000ffff) + 0x0800) >> 12;
+            int disty = ((fy & 0x0000ffff) + 0x0800) >> 12;
+            *b = interpolate_4_pixels_16(tl, tr, bl, br, distx, disty);
+#endif
 
-                    const int16x8_t colorMask = vdupq_n_s16(0x00ff);
-                    const int16x8_t invColorMask = vmvnq_s16(colorMask);
-                    const int16x8_t v_256 = vdupq_n_s16(256);
-                    int32x4_t v_fdx = vdupq_n_s32(fdx * 4);
-                    int32x4_t v_fdy = vdupq_n_s32(fdy * 4);
+            fx += fdx;
+            fy += fdy;
+            ++b;
+        }
+    }
 
-                    const uchar *textureData = data->texture.imageData;
-                    const int bytesPerLine = data->texture.bytesPerLine;
+    while (b < end) {
+        int x1 = (fx >> 16);
+        int x2;
+        int y1 = (fy >> 16);
+        int y2;
 
-                    int32x4_t v_fx = vmovq_n_s32(fx);
-                    int32x4_t v_fy = vmovq_n_s32(fy);
-                    v_fx = vsetq_lane_s32(fx + fdx, v_fx, 1);
-                    v_fy = vsetq_lane_s32(fy + fdy, v_fy, 1);
-                    v_fx = vsetq_lane_s32(fx + fdx * 2, v_fx, 2);
-                    v_fy = vsetq_lane_s32(fy + fdy * 2, v_fy, 2);
-                    v_fx = vsetq_lane_s32(fx + fdx * 3, v_fx, 3);
-                    v_fy = vsetq_lane_s32(fy + fdy * 3, v_fy, 3);
+        fetchTransformedBilinear_pixelBounds<blendType>(image.width, image.x1, image.x2 - 1, x1, x2);
+        fetchTransformedBilinear_pixelBounds<blendType>(image.height, image.y1, image.y2 - 1, y1, y2);
 
-                    const int32x4_t v_ffff_mask = vdupq_n_s32(0x0000ffff);
-                    const int32x4_t v_round = vdupq_n_s32(0x0800);
+        const uint *s1 = (const uint *)image.scanLine(y1);
+        const uint *s2 = (const uint *)image.scanLine(y2);
 
-                    while (b < boundedEnd) {
-                        uint32x4x2_t v_top, v_bot;
+        uint tl = s1[x1];
+        uint tr = s1[x2];
+        uint bl = s2[x1];
+        uint br = s2[x2];
 
-                        int x1 = (fx >> 16);
-                        int y1 = (fy >> 16);
-                        fx += fdx; fy += fdy;
-                        const uchar *sl = textureData + bytesPerLine * y1;
-                        const uint *s1 = reinterpret_cast<const uint *>(sl);
-                        const uint *s2 = reinterpret_cast<const uint *>(sl + bytesPerLine);
-                        v_top = vld2q_lane_u32(s1 + x1, v_top, 0);
-                        v_bot = vld2q_lane_u32(s2 + x1, v_bot, 0);
-                        x1 = (fx >> 16);
-                        y1 = (fy >> 16);
-                        fx += fdx; fy += fdy;
-                        sl = textureData + bytesPerLine * y1;
-                        s1 = reinterpret_cast<const uint *>(sl);
-                        s2 = reinterpret_cast<const uint *>(sl + bytesPerLine);
-                        v_top = vld2q_lane_u32(s1 + x1, v_top, 1);
-                        v_bot = vld2q_lane_u32(s2 + x1, v_bot, 1);
-                        x1 = (fx >> 16);
-                        y1 = (fy >> 16);
-                        fx += fdx; fy += fdy;
-                        sl = textureData + bytesPerLine * y1;
-                        s1 = reinterpret_cast<const uint *>(sl);
-                        s2 = reinterpret_cast<const uint *>(sl + bytesPerLine);
-                        v_top = vld2q_lane_u32(s1 + x1, v_top, 2);
-                        v_bot = vld2q_lane_u32(s2 + x1, v_bot, 2);
-                        x1 = (fx >> 16);
-                        y1 = (fy >> 16);
-                        fx += fdx; fy += fdy;
-                        sl = textureData + bytesPerLine * y1;
-                        s1 = reinterpret_cast<const uint *>(sl);
-                        s2 = reinterpret_cast<const uint *>(sl + bytesPerLine);
-                        v_top = vld2q_lane_u32(s1 + x1, v_top, 3);
-                        v_bot = vld2q_lane_u32(s2 + x1, v_bot, 3);
-
-                        int32x4_t v_distx = vshrq_n_s32(vaddq_s32(vandq_s32(v_fx, v_ffff_mask), v_round), 12);
-                        int32x4_t v_disty = vshrq_n_s32(vaddq_s32(vandq_s32(v_fy, v_ffff_mask), v_round), 12);
-                        v_distx = vorrq_s32(v_distx, vshlq_n_s32(v_distx, 16));
-                        v_disty = vorrq_s32(v_disty, vshlq_n_s32(v_disty, 16));
-                        int16x8_t v_disty_ = vshlq_n_s16(vreinterpretq_s16_s32(v_disty), 4);
-
-                        interpolate_4_pixels_16_neon(
-                                    vreinterpretq_s16_u32(v_top.val[0]), vreinterpretq_s16_u32(v_top.val[1]),
-                                    vreinterpretq_s16_u32(v_bot.val[0]), vreinterpretq_s16_u32(v_bot.val[1]),
-                                    vreinterpretq_s16_s32(v_distx), vreinterpretq_s16_s32(v_disty),
-                                    v_disty_, colorMask, invColorMask, v_256, b);
-                        b += 4;
-                        v_fx = vaddq_s32(v_fx, v_fdx);
-                        v_fy = vaddq_s32(v_fy, v_fdy);
-                    }
+#if defined(__SSE2__) || defined(__ARM_NEON__)
+        // The optimized interpolate_4_pixels are faster than interpolate_4_pixels_16.
+        int distx = (fx & 0x0000ffff) >> 8;
+        int disty = (fy & 0x0000ffff) >> 8;
+        *b = interpolate_4_pixels(tl, tr, bl, br, distx, disty);
+#else
+        int distx = ((fx & 0x0000ffff) + 0x0800) >> 12;
+        int disty = ((fy & 0x0000ffff) + 0x0800) >> 12;
+        *b = interpolate_4_pixels_16(tl, tr, bl, br, distx, disty);
 #endif
-                }
 
-                while (b < end) {
-                    int x1 = (fx >> 16);
-                    int x2;
-                    int y1 = (fy >> 16);
-                    int y2;
+        fx += fdx;
+        fy += fdy;
+        ++b;
+    }
+}
 
-                    fetchTransformedBilinear_pixelBounds<blendType>(image_width, image_x1, image_x2, x1, x2);
-                    fetchTransformedBilinear_pixelBounds<blendType>(image_height, image_y1, image_y2, y1, y2);
 
-                    const uint *s1 = (const uint *)data->texture.scanLine(y1);
-                    const uint *s2 = (const uint *)data->texture.scanLine(y2);
+static BilinearFastTransformHelper bilinearFastTransformHelperARGB32PM[2][NFastTransformTypes] = { // indexed [tiled][transform type]
+    { // row 0: helpers instantiated for BlendTransformedBilinear
+        fetchTransformedBilinearARGB32PM_simple_upscale_helper<BlendTransformedBilinear>,
+        fetchTransformedBilinearARGB32PM_upscale_helper<BlendTransformedBilinear>,
+        fetchTransformedBilinearARGB32PM_downscale_helper<BlendTransformedBilinear>,
+        fetchTransformedBilinearARGB32PM_rotate_helper<BlendTransformedBilinear>,
+        fetchTransformedBilinearARGB32PM_fast_rotate_helper<BlendTransformedBilinear>
+    },
+    { // row 1: helpers instantiated for BlendTransformedBilinearTiled
+        fetchTransformedBilinearARGB32PM_simple_upscale_helper<BlendTransformedBilinearTiled>,
+        fetchTransformedBilinearARGB32PM_upscale_helper<BlendTransformedBilinearTiled>,
+        fetchTransformedBilinearARGB32PM_downscale_helper<BlendTransformedBilinearTiled>,
+        fetchTransformedBilinearARGB32PM_rotate_helper<BlendTransformedBilinearTiled>,
+        fetchTransformedBilinearARGB32PM_fast_rotate_helper<BlendTransformedBilinearTiled>
+    }
+}; // NOTE(review): column order is assumed to match the transform-type enumerators used as the second index - confirm against the enum
 
-                    uint tl = s1[x1];
-                    uint tr = s1[x2];
-                    uint bl = s2[x1];
-                    uint br = s2[x2];
+template<TextureBlendType blendType> /* blendType = BlendTransformedBilinear or BlendTransformedBilinearTiled */
+static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, const Operator *,
+                                                                 const QSpanData *data, int y, int x,
+                                                                 int length)
+{
+    const qreal cx = x + qreal(0.5);
+    const qreal cy = y + qreal(0.5);
+    Q_CONSTEXPR int tiled = (blendType == BlendTransformedBilinearTiled) ? 1 : 0;
 
-#if defined(__SSE2__) || defined(__ARM_NEON__)
-                    // The optimized interpolate_4_pixels are faster than interpolate_4_pixels_16.
-                    int distx = (fx & 0x0000ffff) >> 8;
-                    int disty = (fy & 0x0000ffff) >> 8;
-                    *b = interpolate_4_pixels(tl, tr, bl, br, distx, disty);
-#else
-                    int distx = ((fx & 0x0000ffff) + 0x0800) >> 12;
-                    int disty = ((fy & 0x0000ffff) + 0x0800) >> 12;
-                    *b = interpolate_4_pixels_16(tl, tr, bl, br, distx, disty);
-#endif
+    uint *end = buffer + length;
+    uint *b = buffer;
+    if (data->fast_matrix) {
+        // The increment per x in the scanline
+        int fdx = (int)(data->m11 * fixed_scale);
+        int fdy = (int)(data->m12 * fixed_scale);
 
-                    fx += fdx;
-                    fy += fdy;
-                    ++b;
-                }
+        int fx = int((data->m21 * cy
+                      + data->m11 * cx + data->dx) * fixed_scale);
+        int fy = int((data->m22 * cy
+                      + data->m12 * cx + data->dy) * fixed_scale);
+
+        fx -= half_point;
+        fy -= half_point;
+
+        if (fdy == 0) { // simple scale, no rotation or shear
+            if (fdx <= fixed_scale && fdx > 0) {
+                // simple scale up on X without mirroring
+                bilinearFastTransformHelperARGB32PM[tiled][SimpleUpscaleTransform](b, end, data->texture, fx, fy, fdx, fdy);
+            } else if ((fdx < 0 && fdx > -(fixed_scale / 8)) || qAbs(data->m22) < qreal(1./8.)) {
+                // scale up more than 8x (on either Y or on X mirrored)
+                bilinearFastTransformHelperARGB32PM[tiled][UpscaleTransform](b, end, data->texture, fx, fy, fdx, fdy);
+            } else {
+                // scale down on X (or up on X mirrored less than 8x)
+                bilinearFastTransformHelperARGB32PM[tiled][DownscaleTransform](b, end, data->texture, fx, fy, fdx, fdy);
+            }
+        } else { // rotation or shear
+            if (qAbs(data->m11) < qreal(1./8.) || qAbs(data->m22) < qreal(1./8.) ) {
+                // if we are zooming more than 8 times, we use 8bit precision for the position.
+                bilinearFastTransformHelperARGB32PM[tiled][RotateTransform](b, end, data->texture, fx, fy, fdx, fdy);
+            } else {
+                // we are zooming less than 8x, use 4bit precision
+                bilinearFastTransformHelperARGB32PM[tiled][FastRotateTransform](b, end, data->texture, fx, fy, fdx, fdy);
             }
         }
     } else {
+        const QTextureData &image = data->texture;
+
         const qreal fdx = data->m11;
         const qreal fdy = data->m12;
         const qreal fdw = data->m13;
@@ -2495,8 +2629,8 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c
             int distx = int((px - x1) * 256);
             int disty = int((py - y1) * 256);
 
-            fetchTransformedBilinear_pixelBounds<blendType>(image_width, image_x1, image_x2, x1, x2);
-            fetchTransformedBilinear_pixelBounds<blendType>(image_height, image_y1, image_y2, y1, y2);
+            fetchTransformedBilinear_pixelBounds<blendType>(image.width, image.x1, image.x2 - 1, x1, x2);
+            fetchTransformedBilinear_pixelBounds<blendType>(image.height, image.y1, image.y2 - 1, y1, y2);
 
             const uint *s1 = (const uint *)data->texture.scanLine(y1);
             const uint *s2 = (const uint *)data->texture.scanLine(y2);
@@ -2678,7 +2812,7 @@ static const uint *QT_FASTCALL fetchTransformedBilinear(uint *buffer, const Oper
                     layout->convertToARGB32PM(buf1, buf1, len * 2, clut, 0);
                     layout->convertToARGB32PM(buf2, buf2, len * 2, clut, 0);
 
-                    if ((fdx < 0 && fdx > -(fixed_scale / 8)) || std::abs(data->m22) < (1./8.)) { // scale up more than 8x
+                    if ((fdx < 0 && fdx > -(fixed_scale / 8)) || qAbs(data->m22) < qreal(1./8.)) { // scale up more than 8x
                         int disty = (fy & 0x0000ffff) >> 8;
                         for (int i = 0; i < len; ++i) {
                             int distx = (fracX & 0x0000ffff) >> 8;
@@ -2730,7 +2864,7 @@ static const uint *QT_FASTCALL fetchTransformedBilinear(uint *buffer, const Oper
                 layout->convertToARGB32PM(buf1, buf1, len * 2, clut, 0);
                 layout->convertToARGB32PM(buf2, buf2, len * 2, clut, 0);
 
-                if (std::abs(data->m11) < (1./8.) || std::abs(data->m22) < (1./8.)) {
+                if (qAbs(data->m11) < qreal(1./8.) || qAbs(data->m22) < qreal(1./8.) ) {
                     //if we are zooming more than 8 times, we use 8bit precision for the position.
                     for (int i = 0; i < len; ++i) {
                         int distx = (fracX & 0x0000ffff) >> 8;
@@ -5195,6 +5329,8 @@ void qBlendTexture(int count, const QSpan *spans, void *userData)
     case QImage::Format_RGB16:
         proc = processTextureSpansRGB16[blendType];
         break;
+    case QImage::Format_ARGB32:
+    case QImage::Format_RGBA8888:
     case QImage::Format_BGR30:
     case QImage::Format_A2BGR30_Premultiplied:
     case QImage::Format_RGB30:
@@ -5411,7 +5547,7 @@ static void qt_alphamapblit_quint16(QRasterBuffer *rasterBuffer,
                                     int x, int y, const QRgba64 &color,
                                     const uchar *map,
                                     int mapWidth, int mapHeight, int mapStride,
-                                    const QClipData *)
+                                    const QClipData *, bool /*useGammaCorrection*/)
 {
     const quint16 c = color.toRgb16();
     quint16 *dest = reinterpret_cast<quint16*>(rasterBuffer->scanLine(y)) + x;
@@ -5436,105 +5572,43 @@ static void qt_alphamapblit_quint16(QRasterBuffer *rasterBuffer,
     }
 }
 
-static inline void rgbBlendPixel(quint32 *dst, int coverage, int sr, int sg, int sb, const uchar *gamma, const uchar *invgamma)
+static inline void rgbBlendPixel(quint32 *dst, int coverage, QRgba64 slinear, const QColorProfile *colorProfile, bool useGammaCorrection)
 {
-    // Do a gray alphablend...
-    int da = qAlpha(*dst);
-    int dr = qRed(*dst);
-    int dg = qGreen(*dst);
-    int db = qBlue(*dst);
-
-    if (da != 255
-        ) {
-
-        int a = qGray(coverage);
-        sr = qt_div_255(invgamma[sr] * a);
-        sg = qt_div_255(invgamma[sg] * a);
-        sb = qt_div_255(invgamma[sb] * a);
-
-        int ia = 255 - a;
-        dr = qt_div_255(dr * ia);
-        dg = qt_div_255(dg * ia);
-        db = qt_div_255(db * ia);
-
-        *dst = ((a + qt_div_255((255 - a) * da)) << 24)
-            |  ((sr + dr) << 16)
-            |  ((sg + dg) << 8)
-            |  ((sb + db));
-        return;
-    }
-
-    int mr = qRed(coverage);
-    int mg = qGreen(coverage);
-    int mb = qBlue(coverage);
-
-    dr = gamma[dr];
-    dg = gamma[dg];
-    db = gamma[db];
-
-    int nr = qt_div_255(sr * mr + dr * (255 - mr));
-    int ng = qt_div_255(sg * mg + dg * (255 - mg));
-    int nb = qt_div_255(sb * mb + db * (255 - mb));
+    // Do a gammacorrected RGB alphablend...
+    const QRgba64 dlinear = useGammaCorrection ? colorProfile->toLinear64(*dst) : QRgba64::fromArgb32(*dst);
 
-    nr = invgamma[nr];
-    ng = invgamma[ng];
-    nb = invgamma[nb];
+    QRgba64 blend = rgbBlend(dlinear, slinear, coverage);
 
-    *dst = qRgb(nr, ng, nb);
+    *dst = useGammaCorrection ? colorProfile->fromLinear64(blend) : toArgb32(blend);
 }
 
-#if defined(Q_OS_WIN)
-Q_GUI_EXPORT bool qt_needs_a8_gamma_correction = false;
-
-static inline void grayBlendPixel(quint32 *dst, int coverage, int sr, int sg, int sb, const uint *gamma, const uchar *invgamma)
+static inline void grayBlendPixel(quint32 *dst, int coverage, QRgba64 slinear, const QColorProfile *colorProfile)
 {
     // Do a gammacorrected gray alphablend...
-    int dr = qRed(*dst);
-    int dg = qGreen(*dst);
-    int db = qBlue(*dst);
-
-    dr = gamma[dr];
-    dg = gamma[dg];
-    db = gamma[db];
-
-    int alpha = coverage;
-    int ialpha = 255 - alpha;
-    int nr = qt_div_255(sr * alpha + dr * ialpha);
-    int ng = qt_div_255(sg * alpha + dg * ialpha);
-    int nb = qt_div_255(sb * alpha + db * ialpha);
+    const QRgba64 dlinear = colorProfile->toLinear64(*dst);
 
-    nr = invgamma[nr];
-    ng = invgamma[ng];
-    nb = invgamma[nb];
+    QRgba64 blend = interpolate255(slinear, coverage, dlinear, 255 - coverage);
 
-    *dst = qRgb(nr, ng, nb);
+    *dst = colorProfile->fromLinear64(blend);
 }
-#endif
 
 static void qt_alphamapblit_uint32(QRasterBuffer *rasterBuffer,
                                    int x, int y, quint32 color,
                                    const uchar *map,
                                    int mapWidth, int mapHeight, int mapStride,
-                                   const QClipData *clip)
+                                   const QClipData *clip, bool useGammaCorrection)
 {
     const quint32 c = color;
     const int destStride = rasterBuffer->bytesPerLine() / sizeof(quint32);
 
-#if defined(Q_OS_WIN)
-    const QDrawHelperGammaTables *tables = QGuiApplicationPrivate::instance()->gammaTables();
-    if (!tables)
+    const QColorProfile *colorProfile = QGuiApplicationPrivate::instance()->colorProfileForA8Text();
+    if (!colorProfile)
         return;
 
-    const uint *gamma = tables->qt_pow_gamma;
-    const uchar *invgamma = tables->qt_pow_invgamma;
-
-    int sr = gamma[qRed(color)];
-    int sg = gamma[qGreen(color)];
-    int sb = gamma[qBlue(color)];
+    const QRgba64 slinear = colorProfile->toLinear64(c);
 
     bool opaque_src = (qAlpha(color) == 255);
-    bool doGrayBlendPixel = opaque_src && qt_needs_a8_gamma_correction;
-#endif
+    bool doGrayBlendPixel = opaque_src && useGammaCorrection;
 
     if (!clip) {
         quint32 *dest = reinterpret_cast<quint32*>(rasterBuffer->scanLine(y)) + x;
@@ -5547,13 +5621,9 @@ static void qt_alphamapblit_uint32(QRasterBuffer *rasterBuffer,
                 } else if (coverage == 255) {
                     dest[i] = c;
                 } else {
-#if defined(Q_OS_WIN)
-                    if (QSysInfo::WindowsVersion >= QSysInfo::WV_XP && doGrayBlendPixel
-                        && qAlpha(dest[i]) == 255) {
-                        grayBlendPixel(dest+i, coverage, sr, sg, sb, gamma, invgamma);
-                    } else
-#endif
-                    {
+                    if (doGrayBlendPixel && qAlpha(dest[i]) == 255) {
+                        grayBlendPixel(dest+i, coverage, slinear, colorProfile);
+                    } else {
                         int ialpha = 255 - coverage;
                         dest[i] = INTERPOLATE_PIXEL_255(c, coverage, dest[i], ialpha);
                     }
@@ -5588,13 +5658,9 @@ static void qt_alphamapblit_uint32(QRasterBuffer *rasterBuffer,
                     } else if (coverage == 255) {
                         dest[xp] = c;
                     } else {
-#if defined(Q_OS_WIN)
-                        if (QSysInfo::WindowsVersion >= QSysInfo::WV_XP && doGrayBlendPixel
-                            && qAlpha(dest[xp]) == 255) {
-                            grayBlendPixel(dest+xp, coverage, sr, sg, sb, gamma, invgamma);
-                        } else
-#endif
-                        {
+                        if (doGrayBlendPixel && qAlpha(dest[xp]) == 255) {
+                            grayBlendPixel(dest+xp, coverage, slinear, colorProfile);
+                        } else {
                             int ialpha = 255 - coverage;
                             dest[xp] = INTERPOLATE_PIXEL_255(c, coverage, dest[xp], ialpha);
                         }
@@ -5612,9 +5678,9 @@ static void qt_alphamapblit_argb32(QRasterBuffer *rasterBuffer,
                                    int x, int y, const QRgba64 &color,
                                    const uchar *map,
                                    int mapWidth, int mapHeight, int mapStride,
-                                   const QClipData *clip)
+                                   const QClipData *clip, bool useGammaCorrection)
 {
-    qt_alphamapblit_uint32(rasterBuffer, x, y, color.toArgb32(), map, mapWidth, mapHeight, mapStride, clip);
+    qt_alphamapblit_uint32(rasterBuffer, x, y, color.toArgb32(), map, mapWidth, mapHeight, mapStride, clip, useGammaCorrection);
 }
 
 #if Q_BYTE_ORDER == Q_LITTLE_ENDIAN
@@ -5622,34 +5688,31 @@ static void qt_alphamapblit_rgba8888(QRasterBuffer *rasterBuffer,
                                      int x, int y, const QRgba64 &color,
                                      const uchar *map,
                                      int mapWidth, int mapHeight, int mapStride,
-                                     const QClipData *clip)
+                                     const QClipData *clip, bool useGammaCorrection)
 {
-    qt_alphamapblit_uint32(rasterBuffer, x, y, ARGB2RGBA(color.toArgb32()), map, mapWidth, mapHeight, mapStride, clip);
+    qt_alphamapblit_uint32(rasterBuffer, x, y, ARGB2RGBA(color.toArgb32()), map, mapWidth, mapHeight, mapStride, clip, useGammaCorrection);
 }
 #endif
 
+static inline int qRgbAvg(QRgb rgb) // 5:6:5 weighted mean of the red, green and blue channels
+{
+    return (5 * (qRed(rgb) + qBlue(rgb)) + 6 * qGreen(rgb)) / 16;
+}
+
 static void qt_alphargbblit_argb32(QRasterBuffer *rasterBuffer,
                                    int x, int y, const QRgba64 &color,
                                    const uint *src, int mapWidth, int mapHeight, int srcStride,
-                                   const QClipData *clip)
+                                   const QClipData *clip, bool useGammaCorrection)
 {
     const quint32 c = color.toArgb32();
 
-    int sr = qRed(c);
-    int sg = qGreen(c);
-    int sb = qBlue(c);
     int sa = qAlpha(c);
 
-    const QDrawHelperGammaTables *tables = QGuiApplicationPrivate::instance()->gammaTables();
-    if (!tables)
+    const QColorProfile *colorProfile = QGuiApplicationPrivate::instance()->colorProfileForA32Text();
+    if (!colorProfile)
         return;
 
-    const uchar *gamma = tables->qt_pow_rgb_gamma;
-    const uchar *invgamma = tables->qt_pow_rgb_invgamma;
-
-    sr = gamma[sr];
-    sg = gamma[sg];
-    sb = gamma[sb];
+    const QRgba64 slinear = useGammaCorrection ? colorProfile->toLinear64(c) : color;
 
     if (sa == 0)
         return;
@@ -5663,7 +5726,13 @@ static void qt_alphargbblit_argb32(QRasterBuffer *rasterBuffer,
                 if (coverage == 0xffffffff) {
                     dst[i] = c;
                 } else if (coverage != 0xff000000) {
-                    rgbBlendPixel(dst+i, coverage, sr, sg, sb, gamma, invgamma);
+                    if (dst[i] >= 0xff000000) {
+                        rgbBlendPixel(dst+i, coverage, slinear, colorProfile, useGammaCorrection);
+                    } else {
+                        // Give up and do a naive blend.
+                        const int a = qRgbAvg(coverage);
+                        dst[i] = INTERPOLATE_PIXEL_255(c, a, dst[i], 255 - a);
+                    }
                 }
             }
 
@@ -5693,7 +5762,13 @@ static void qt_alphargbblit_argb32(QRasterBuffer *rasterBuffer,
                     if (coverage == 0xffffffff) {
                         dst[xp] = c;
                     } else if (coverage != 0xff000000) {
-                        rgbBlendPixel(dst+xp, coverage, sr, sg, sb, gamma, invgamma);
+                        if (dst[xp] >= 0xff000000) { // top (alpha) byte is 0xff: per-channel blend
+                            rgbBlendPixel(dst+xp, coverage, slinear, colorProfile, useGammaCorrection);
+                        } else {
+                            // Give up and do a naive blend, using the averaged coverage as alpha.
+                            const int a = qRgbAvg(coverage);
+                            dst[xp] = INTERPOLATE_PIXEL_255(c, a, dst[xp], 255 - a); // weights must sum to 255
+                        }
                     }
                 }
             } // for (i -> line.count)
diff --git a/src/gui/painting/qdrawhelper_neon.cpp b/src/gui/painting/qdrawhelper_neon.cpp
index a833520b00..cdb374f823 100644
--- a/src/gui/painting/qdrawhelper_neon.cpp
+++ b/src/gui/painting/qdrawhelper_neon.cpp
@@ -539,7 +539,7 @@ void qt_alphamapblit_quint16_neon(QRasterBuffer *rasterBuffer,
                                   int x, int y, const QRgba64 &color,
                                   const uchar *bitmap,
                                   int mapWidth, int mapHeight, int mapStride,
-                                  const QClipData *)
+                                  const QClipData *, bool /*useGammaCorrection*/)
 {
     quint16 *dest = reinterpret_cast<quint16*>(rasterBuffer->scanLine(y)) + x;
     const int destStride = rasterBuffer->bytesPerLine() / sizeof(quint16);
diff --git a/src/gui/painting/qdrawhelper_neon_p.h b/src/gui/painting/qdrawhelper_neon_p.h
index 3cf949fc32..40475a9bde 100644
--- a/src/gui/painting/qdrawhelper_neon_p.h
+++ b/src/gui/painting/qdrawhelper_neon_p.h
@@ -91,7 +91,7 @@ void qt_alphamapblit_quint16_neon(QRasterBuffer *rasterBuffer,
                                   int x, int y, const QRgba64 &color,
                                   const uchar *bitmap,
                                   int mapWidth, int mapHeight, int mapStride,
-                                  const QClipData *clip);
+                                  const QClipData *clip, bool /*useGammaCorrection*/);
 
 void qt_scale_image_argb32_on_rgb16_neon(uchar *destPixels, int dbpl,
                                          const uchar *srcPixels, int sbpl, int srch,
diff --git a/src/gui/painting/qdrawhelper_p.h b/src/gui/painting/qdrawhelper_p.h
index 0e46962784..cf2213042d 100644
--- a/src/gui/painting/qdrawhelper_p.h
+++ b/src/gui/painting/qdrawhelper_p.h
@@ -113,13 +113,13 @@ typedef void (*AlphamapBlitFunc)(QRasterBuffer *rasterBuffer,
                                  int x, int y, const QRgba64 &color,
                                  const uchar *bitmap,
                                  int mapWidth, int mapHeight, int mapStride,
-                                 const QClipData *clip);
+                                 const QClipData *clip, bool useGammaCorrection);
 
 typedef void (*AlphaRGBBlitFunc)(QRasterBuffer *rasterBuffer,
                                  int x, int y, const QRgba64 &color,
                                  const uint *rgbmask,
                                  int mapWidth, int mapHeight, int mapStride,
-                                 const QClipData *clip);
+                                 const QClipData *clip, bool useGammaCorrection);
 
 typedef void (*RectFillFunc)(QRasterBuffer *rasterBuffer,
                              int x, int y, int width, int height,
@@ -159,7 +159,6 @@ struct DrawHelper {
 extern SrcOverBlendFunc qBlendFunctions[QImage::NImageFormats][QImage::NImageFormats];
 extern SrcOverScaleFunc qScaleFunctions[QImage::NImageFormats][QImage::NImageFormats];
 extern SrcOverTransformFunc qTransformFunctions[QImage::NImageFormats][QImage::NImageFormats];
-extern MemRotateFunc qMemRotateFunctions[QImage::NImageFormats][3];
 
 extern DrawHelper qDrawHelper[QImage::NImageFormats];
 
@@ -351,18 +350,6 @@ struct QSpanData
     void adjustSpanMethods();
 };
 
-struct QDrawHelperGammaTables
-{
-    explicit QDrawHelperGammaTables(qreal smoothing);
-
-    void refresh(qreal smoothing);
-
-    uchar qt_pow_rgb_gamma[256];
-    uchar qt_pow_rgb_invgamma[256];
-    uint qt_pow_gamma[256];
-    uchar qt_pow_invgamma[2048];
-};
-
 static inline uint qt_gradient_clamp(const QGradientData *data, int ipos)
 {
     if (ipos < 0 || ipos >= GRADIENT_STOPTABLE_SIZE) {
@@ -1244,6 +1231,7 @@ extern QPixelLayout qPixelLayouts[QImage::NImageFormats];
 extern const FetchPixelsFunc qFetchPixels[QPixelLayout::BPPCount];
 extern StorePixelsFunc qStorePixels[QPixelLayout::BPPCount];
 
+extern MemRotateFunc qMemRotateFunctions[QPixelLayout::BPPCount][3];
 
 
 QT_END_NAMESPACE
diff --git a/src/gui/painting/qmemrotate.cpp b/src/gui/painting/qmemrotate.cpp
index 3fbae76de5..25aa6a3122 100644
--- a/src/gui/painting/qmemrotate.cpp
+++ b/src/gui/painting/qmemrotate.cpp
@@ -41,164 +41,10 @@
 
 QT_BEGIN_NAMESPACE
 
-#if QT_ROTATION_ALGORITHM == QT_ROTATION_TILED
 static const int tileSize = 32;
-#endif
-
-#if Q_BYTE_ORDER == Q_BIG_ENDIAN
-#if QT_ROTATION_ALGORITHM == QT_ROTATION_PACKED || QT_ROTATION_ALGORITHM == QT_ROTATION_TILED
-#error Big endian version not implemented for the transformed driver!
-#endif
-#endif
-
-template <class T>
-Q_STATIC_TEMPLATE_FUNCTION
-inline void qt_memrotate90_cachedRead(const T *src, int w, int h, int sstride, T *dest,
-                                      int dstride)
-{
-    const char *s = reinterpret_cast<const char*>(src);
-    char *d = reinterpret_cast<char*>(dest);
-    for (int y = 0; y < h; ++y) {
-        for (int x = w - 1; x >= 0; --x) {
-            T *destline = reinterpret_cast<T *>(d + (w - x - 1) * dstride);
-            destline[y] = src[x];
-        }
-        s += sstride;
-        src = reinterpret_cast<const T*>(s);
-    }
-}
 
 template <class T>
 Q_STATIC_TEMPLATE_FUNCTION
-inline void qt_memrotate270_cachedRead(const T *src, int w, int h, int sstride, T *dest,
-                                       int dstride)
-{
-    const char *s = reinterpret_cast<const char*>(src);
-    char *d = reinterpret_cast<char*>(dest);
-    s += (h - 1) * sstride;
-    for (int y = h - 1; y >= 0; --y) {
-        src = reinterpret_cast<const T*>(s);
-        for (int x = 0; x < w; ++x) {
-            T *destline = reinterpret_cast<T *>(d + x * dstride);
-            destline[h - y - 1] = src[x];
-        }
-        s -= sstride;
-    }
-}
-
-#if QT_ROTATION_ALGORITHM == QT_ROTATION_CACHEDWRITE
-
-template <class T>
-Q_STATIC_TEMPLATE_FUNCTION
-inline void qt_memrotate90_cachedWrite(const T *src, int w, int h, int sstride, T *dest,
-                                       int dstride)
-{
-    for (int x = w - 1; x >= 0; --x) {
-        T *d = dest + (w - x - 1) * dstride;
-        for (int y = 0; y < h; ++y) {
-            *d++ = src[y * sstride + x];
-        }
-    }
-
-}
-
-template <class T>
-Q_STATIC_TEMPLATE_FUNCTION
-inline void qt_memrotate270_cachedWrite(const T *src, int w, int h, int sstride, T *dest,
-                                        int dstride)
-{
-    for (int x = 0; x < w; ++x) {
-        T *d = dest + x * dstride;
-        for (int y = h - 1; y >= 0; --y) {
-            *d++ = src[y * sstride + x];
-        }
-    }
-}
-
-#endif // QT_ROTATION_CACHEDWRITE
-
-#if QT_ROTATION_ALGORITHM == QT_ROTATION_PACKING
-
-// TODO: packing algorithms should probably be modified on 64-bit architectures
-
-template <class T>
-Q_STATIC_TEMPLATE_FUNCTION
-inline void qt_memrotate90_packing(const T *src, int w, int h, int sstride, T *dest, int dstride)
-{
-    sstride /= sizeof(T);
-    dstride /= sizeof(T);
-
-    const int pack = sizeof(quint32) / sizeof(T);
-    const int unaligned = int((long(dest) & (sizeof(quint32)-1))) / sizeof(T);
-
-    for (int x = w - 1; x >= 0; --x) {
-        int y = 0;
-
-        for (int i = 0; i < unaligned; ++i) {
-            dest[(w - x - 1) * dstride + y] = src[y * sstride + x];
-            ++y;
-        }
-
-        quint32 *d = reinterpret_cast<quint32*>(dest + (w - x - 1) * dstride
-                                                + unaligned);
-        const int rest = (h - unaligned) % pack;
-        while (y < h - rest) {
-            quint32 c = src[y * sstride + x];
-            for (int i = 1; i < pack; ++i) {
-                c |= src[(y + i) * sstride + x] << (sizeof(int) * 8 / pack * i);
-            }
-            *d++ = c;
-            y += pack;
-        }
-
-        while (y < h) {
-            dest[(w - x - 1) * dstride + y] = src[y * sstride + x];
-            ++y;
-        }
-    }
-}
-
-template <class T>
-Q_STATIC_TEMPLATE_FUNCTION
-inline void qt_memrotate270_packing(const T *src, int w, int h, int sstride, T *dest, int dstride)
-{
-    sstride /= sizeof(T);
-    dstride /= sizeof(T);
-
-    const int pack = sizeof(quint32) / sizeof(T);
-    const int unaligned = int((long(dest) & (sizeof(quint32)-1))) / sizeof(T);
-
-    for (int x = 0; x < w; ++x) {
-        int y = h - 1;
-
-        for (int i = 0; i < unaligned; ++i) {
-            dest[x * dstride + h - y - 1] = src[y * sstride + x];
-            --y;
-        }
-
-        quint32 *d = reinterpret_cast<quint32*>(dest + x * dstride
-                                                + unaligned);
-        const int rest = (h - unaligned) % pack;
-        while (y > rest) {
-            quint32 c = src[y * sstride + x];
-            for (int i = 1; i < pack; ++i) {
-                c |= src[(y - i) * sstride + x] << (sizeof(int) * 8 / pack * i);
-            }
-            *d++ = c;
-            y -= pack;
-        }
-        while (y >= 0) {
-            dest[x * dstride + h - y - 1] = src[y * sstride + x];
-            --y;
-        }
-    }
-}
-
-#endif // QT_ROTATION_PACKING
-
-#if QT_ROTATION_ALGORITHM == QT_ROTATION_TILED
-template <class T>
-Q_STATIC_TEMPLATE_FUNCTION
 inline void qt_memrotate90_tiled(const T *src, int w, int h, int sstride, T *dest, int dstride)
 {
     sstride /= sizeof(T);
@@ -235,7 +81,7 @@ inline void qt_memrotate90_tiled(const T *src, int w, int h, int sstride, T *des
                 for (int y = starty; y < stopy; y += pack) {
                     quint32 c = src[y * sstride + x];
                     for (int i = 1; i < pack; ++i) {
-                        const int shift = (sizeof(int) * 8 / pack * i);
+                        const int shift = (sizeof(T) * 8 * i);
                         const T color = src[(y + i) * sstride + x];
                         c |= color << shift;
                     }
@@ -293,7 +139,7 @@ inline void qt_memrotate270_tiled(const T *src, int w, int h, int sstride, T *de
 
     const int pack = sizeof(quint32) / sizeof(T);
     const int unaligned =
-        qMin(uint((long(dest) & (sizeof(quint32)-1)) / sizeof(T)), uint(h));
+        qMin(uint((quintptr(dest) & (sizeof(quint32)-1)) / sizeof(T)), uint(h));
     const int restX = w % tileSize;
     const int restY = (h - unaligned) % tileSize;
     const int unoptimizedY = restY % pack;
@@ -320,10 +166,10 @@ inline void qt_memrotate270_tiled(const T *src, int w, int h, int sstride, T *de
             for (int x = startx; x < stopx; ++x) {
                 quint32 *d = reinterpret_cast<quint32*>(dest + x * dstride
                                                         + h - 1 - starty);
-                for (int y = starty; y > stopy; y -= pack) {
+                for (int y = starty; y >= stopy; y -= pack) {
                     quint32 c = src[y * sstride + x];
                     for (int i = 1; i < pack; ++i) {
-                        const int shift = (sizeof(int) * 8 / pack * i);
+                        const int shift = (sizeof(T) * 8 * i);
                         const T color = src[(y - i) * sstride + x];
                         c |= color << shift;
                     }
@@ -371,22 +217,26 @@ inline void qt_memrotate270_tiled_unpacked(const T *src, int w, int h, int sstri
     }
 }
 
-#endif // QT_ROTATION_ALGORITHM
 
 template <class T>
 Q_STATIC_TEMPLATE_FUNCTION
 inline void qt_memrotate90_template(const T *src, int srcWidth, int srcHeight, int srcStride,
                                     T *dest, int dstStride)
 {
-#if QT_ROTATION_ALGORITHM == QT_ROTATION_CACHEDREAD
-    qt_memrotate90_cachedRead<T>(src, srcWidth, srcHeight, srcStride, dest, dstStride);
-#elif QT_ROTATION_ALGORITHM == QT_ROTATION_CACHEDWRITE
-    qt_memrotate90_cachedWrite<T>(src, srcWidth, srcHeight, srcStride, dest, dstStride);
-#elif QT_ROTATION_ALGORITHM == QT_ROTATION_PACKING
-    qt_memrotate90_packing<T>(src, srcWidth, srcHeight, srcStride, dest, dstStride);
-#elif QT_ROTATION_ALGORITHM == QT_ROTATION_TILED
-    qt_memrotate90_tiled<T>(src, srcWidth, srcHeight, srcStride, dest, dstStride);
+#if Q_BYTE_ORDER == Q_LITTLE_ENDIAN
+    // packed algorithm assumes little endian and that sizeof(quint32)/sizeof(T) is an integer
+    if (sizeof(quint32) % sizeof(T) == 0)
+        qt_memrotate90_tiled<T>(src, srcWidth, srcHeight, srcStride, dest, dstStride);
+    else
 #endif
+    qt_memrotate90_tiled_unpacked<T>(src, srcWidth, srcHeight, srcStride, dest, dstStride);
+}
+
+template <>
+inline void qt_memrotate90_template<quint32>(const quint32 *src, int w, int h, int sstride, quint32 *dest, int dstride)
+{
+    // packed algorithm doesn't have any benefit for quint32
+    qt_memrotate90_tiled_unpacked(src, w, h, sstride, dest, dstride);
 }
 
 template <class T>
@@ -394,11 +244,11 @@ Q_STATIC_TEMPLATE_FUNCTION
 inline void qt_memrotate180_template(const T *src, int w, int h, int sstride, T *dest, int dstride)
 {
     const char *s = (const char*)(src) + (h - 1) * sstride;
-    for (int y = h - 1; y >= 0; --y) {
-        T *d = reinterpret_cast<T*>((char *)(dest) + (h - y - 1) * dstride);
+    for (int dy = 0; dy < h; ++dy) {
+        T *d = reinterpret_cast<T*>((char *)(dest) + dy * dstride);
         src = reinterpret_cast<const T*>(s);
-        for (int x = w - 1; x >= 0; --x) {
-            d[w - x - 1] = src[x];
+        for (int dx = 0; dx < w; ++dx) {
+            d[dx] = src[w - 1 - dx];
         }
         s -= sstride;
     }
@@ -409,32 +259,20 @@ Q_STATIC_TEMPLATE_FUNCTION
 inline void qt_memrotate270_template(const T *src, int srcWidth, int srcHeight, int srcStride,
                                      T *dest, int dstStride)
 {
-#if QT_ROTATION_ALGORITHM == QT_ROTATION_CACHEDREAD
-    qt_memrotate270_cachedRead<T>(src, srcWidth, srcHeight, srcStride, dest, dstStride);
-#elif QT_ROTATION_ALGORITHM == QT_ROTATION_CACHEDWRITE
-    qt_memrotate270_cachedWrite<T>(src, srcWidth, srcHeight, srcStride, dest, dstStride);
-#elif QT_ROTATION_ALGORITHM == QT_ROTATION_PACKING
-    qt_memrotate270_packing<T>(src, srcWidth, srcHeight, srcStride, dest, dstStride);
-#elif QT_ROTATION_ALGORITHM == QT_ROTATION_TILED
-    qt_memrotate270_tiled_unpacked<T>(src, srcWidth, srcHeight, srcStride, dest, dstStride);
+#if Q_BYTE_ORDER == Q_LITTLE_ENDIAN
+    // packed algorithm assumes little endian and that sizeof(quint32)/sizeof(T) is an integer
+    if (sizeof(quint32) % sizeof(T) == 0)
+        qt_memrotate270_tiled<T>(src, srcWidth, srcHeight, srcStride, dest, dstStride);
+    else
 #endif
+    qt_memrotate270_tiled_unpacked<T>(src, srcWidth, srcHeight, srcStride, dest, dstStride);
 }
 
 template <>
-inline void qt_memrotate90_template<quint24>(const quint24 *src, int srcWidth, int srcHeight,
-                                             int srcStride, quint24 *dest, int dstStride)
+inline void qt_memrotate270_template<quint32>(const quint32 *src, int w, int h, int sstride, quint32 *dest, int dstride)
 {
-#if QT_ROTATION_ALGORITHM == QT_ROTATION_CACHEDREAD
-    qt_memrotate90_cachedRead<quint24>(src, srcWidth, srcHeight, srcStride, dest, dstStride);
-#elif QT_ROTATION_ALGORITHM == QT_ROTATION_CACHEDWRITE
-    qt_memrotate90_cachedWrite<quint24>(src, srcWidth, srcHeight, srcStride, dest, dstStride);
-#elif QT_ROTATION_ALGORITHM == QT_ROTATION_PACKING
-    // packed algorithm not implemented
-    qt_memrotate90_cachedRead<quint24>(src, srcWidth, srcHeight, srcStride, dest, dstStride);
-#elif QT_ROTATION_ALGORITHM == QT_ROTATION_TILED
-    // packed algorithm not implemented
-    qt_memrotate90_tiled_unpacked<quint24>(src, srcWidth, srcHeight, srcStride, dest, dstStride);
-#endif
+    // packed algorithm doesn't have any benefit for quint32
+    qt_memrotate270_tiled_unpacked(src, w, h, sstride, dest, dstride);
 }
 
 #define QT_IMPL_MEMROTATE(type)                                     \
@@ -458,7 +296,7 @@ Q_GUI_EXPORT void qt_memrotate270(const type *src, int w, int h, int sstride, \
 Q_GUI_EXPORT void qt_memrotate90(const type *src, int w, int h, int sstride,  \
                                  type *dest, int dstride)           \
 {                                                                   \
-    qt_memrotate90_tiled_unpacked<type>(src, w, h, sstride, dest, dstride); \
+    qt_memrotate90_tiled_unpacked(src, w, h, sstride, dest, dstride); \
 }                                                                   \
 Q_GUI_EXPORT void qt_memrotate180(const type *src, int w, int h, int sstride, \
                                   type *dest, int dstride)          \
@@ -468,7 +306,7 @@ Q_GUI_EXPORT void qt_memrotate180(const type *src, int w, int h, int sstride, \
 Q_GUI_EXPORT void qt_memrotate270(const type *src, int w, int h, int sstride, \
                                   type *dest, int dstride)          \
 {                                                                   \
-    qt_memrotate270_tiled_unpacked<type>(src, w, h, sstride, dest, dstride); \
+    qt_memrotate270_tiled_unpacked(src, w, h, sstride, dest, dstride); \
 }
 
 
@@ -509,6 +347,21 @@ void qt_memrotate270_16(const uchar *srcPixels, int w, int h, int sbpl, uchar *d
     qt_memrotate270((const ushort *)srcPixels, w, h, sbpl, (ushort *)destPixels, dbpl);
 }
 
+void qt_memrotate90_24(const uchar *srcPixels, int w, int h, int sbpl, uchar *destPixels, int dbpl)
+{
+    qt_memrotate90((const quint24 *)srcPixels, w, h, sbpl, (quint24 *)destPixels, dbpl);
+}
+
+void qt_memrotate180_24(const uchar *srcPixels, int w, int h, int sbpl, uchar *destPixels, int dbpl)
+{
+    qt_memrotate180((const quint24 *)srcPixels, w, h, sbpl, (quint24 *)destPixels, dbpl);
+}
+
+void qt_memrotate270_24(const uchar *srcPixels, int w, int h, int sbpl, uchar *destPixels, int dbpl)
+{
+    qt_memrotate270((const quint24 *)srcPixels, w, h, sbpl, (quint24 *)destPixels, dbpl);
+}
+
 void qt_memrotate90_32(const uchar *srcPixels, int w, int h, int sbpl, uchar *destPixels, int dbpl)
 {
     qt_memrotate90((const uint *)srcPixels, w, h, sbpl, (uint *)destPixels, dbpl);
@@ -524,34 +377,16 @@ void qt_memrotate270_32(const uchar *srcPixels, int w, int h, int sbpl, uchar *d
     qt_memrotate270((const uint *)srcPixels, w, h, sbpl, (uint *)destPixels, dbpl);
 }
 
-MemRotateFunc qMemRotateFunctions[QImage::NImageFormats][3] =
+MemRotateFunc qMemRotateFunctions[QPixelLayout::BPPCount][3] =
 // 90, 180, 270
 {
-    { 0, 0, 0 },      // Format_Invalid,
-    { 0, 0, 0 },      // Format_Mono,
-    { 0, 0, 0 },      // Format_MonoLSB,
-    { 0, 0, 0 },      // Format_Indexed8,
-    { qt_memrotate90_32, qt_memrotate180_32, qt_memrotate270_32 },      // Format_RGB32,
-    { qt_memrotate90_32, qt_memrotate180_32, qt_memrotate270_32 },      // Format_ARGB32,
-    { qt_memrotate90_32, qt_memrotate180_32, qt_memrotate270_32 },      // Format_ARGB32_Premultiplied,
-    { qt_memrotate90_16, qt_memrotate180_16, qt_memrotate270_16 },      // Format_RGB16,
-    { 0, 0, 0 },      // Format_ARGB8565_Premultiplied,
-    { 0, 0, 0 },      // Format_RGB666,
-    { 0, 0, 0 },      // Format_ARGB6666_Premultiplied,
-    { 0, 0, 0 },      // Format_RGB555,
-    { 0, 0, 0 },      // Format_ARGB8555_Premultiplied,
-    { 0, 0, 0 },      // Format_RGB888,
-    { 0, 0, 0 },      // Format_RGB444,
-    { 0, 0, 0 },      // Format_ARGB4444_Premultiplied,
-    { qt_memrotate90_32, qt_memrotate180_32, qt_memrotate270_32 },      // Format_RGBX8888,
-    { qt_memrotate90_32, qt_memrotate180_32, qt_memrotate270_32 },      // Format_RGBA8888,
-    { qt_memrotate90_32, qt_memrotate180_32, qt_memrotate270_32 },      // Format_RGBA8888_Premultiplied,
-    { qt_memrotate90_32, qt_memrotate180_32, qt_memrotate270_32 },      // Format_BGB30,
-    { qt_memrotate90_32, qt_memrotate180_32, qt_memrotate270_32 },      // Format_A2BGR30_Premultiplied,
-    { qt_memrotate90_32, qt_memrotate180_32, qt_memrotate270_32 },      // Format_RGB30,
-    { qt_memrotate90_32, qt_memrotate180_32, qt_memrotate270_32 },      // Format_A2RGB30_Premultiplied,
-    { qt_memrotate90_8, qt_memrotate180_8, qt_memrotate270_8 },         // Format_Alpha8,
-    { qt_memrotate90_8, qt_memrotate180_8, qt_memrotate270_8 },         // Format_Grayscale8,
+    { 0, 0, 0 },      // BPPNone,
+    { 0, 0, 0 },      // BPP1MSB,
+    { 0, 0, 0 },      // BPP1LSB,
+    { qt_memrotate90_8, qt_memrotate180_8, qt_memrotate270_8 },         // BPP8,
+    { qt_memrotate90_16, qt_memrotate180_16, qt_memrotate270_16 },      // BPP16,
+    { qt_memrotate90_24, qt_memrotate180_24, qt_memrotate270_24 },      // BPP24
+    { qt_memrotate90_32, qt_memrotate180_32, qt_memrotate270_32 },      // BPP32
 };
 
 QT_END_NAMESPACE
diff --git a/src/gui/painting/qmemrotate_p.h b/src/gui/painting/qmemrotate_p.h
index 62613d301a..9bc3fd1010 100644
--- a/src/gui/painting/qmemrotate_p.h
+++ b/src/gui/painting/qmemrotate_p.h
@@ -56,19 +56,6 @@
 
 QT_BEGIN_NAMESPACE
 
-#define QT_ROTATION_CACHEDREAD 1
-#define QT_ROTATION_CACHEDWRITE 2
-#define QT_ROTATION_PACKING 3
-#define QT_ROTATION_TILED 4
-
-#ifndef QT_ROTATION_ALGORITHM
-#if Q_BYTE_ORDER == Q_LITTLE_ENDIAN
-#define QT_ROTATION_ALGORITHM QT_ROTATION_TILED
-#else
-#define QT_ROTATION_ALGORITHM QT_ROTATION_CACHEDREAD
-#endif
-#endif
-
 #define QT_DECL_MEMROTATE(type)                            \
     void Q_GUI_EXPORT qt_memrotate90(const type*, int, int, int, type*, int); \
     void Q_GUI_EXPORT qt_memrotate180(const type*, int, int, int, type*, int); \
diff --git a/src/gui/painting/qpaintengine_raster.cpp b/src/gui/painting/qpaintengine_raster.cpp
index 6d5eaf5aed..fc4f2a9944 100644
--- a/src/gui/painting/qpaintengine_raster.cpp
+++ b/src/gui/painting/qpaintengine_raster.cpp
@@ -272,6 +272,35 @@ static void qt_debug_path(const QPainterPath &path)
 }
 #endif
 
+// QRect::normalized() will change the width/height of the rectangle due to
+// its inclusive-integer definition of left/right vs width. This is not
+// something we want to change in QRect as that would potentially introduce
+// regressions all over the place, so we implement a straightforward
+// normalized here. QRectF already does this, so QRectF::normalized() is ok to
+// use.
+static QRect qrect_normalized(const QRect &rect)
+{
+    int x, y, w, h;
+    if (Q_UNLIKELY(rect.width() < 0)) {
+        x = rect.x() + rect.width();
+        w = -rect.width();
+    } else {
+        x = rect.x();
+        w = rect.width();
+    }
+
+    if (Q_UNLIKELY(rect.height() < 0)) {
+        y = rect.y() + rect.height();
+        h = -rect.height();
+    } else {
+        y = rect.y();
+        h = rect.height();
+    }
+
+    return QRect(x, y, w, h);
+}
+
+
 QRasterPaintEnginePrivate::QRasterPaintEnginePrivate() :
     QPaintEngineExPrivate(),
     cachedLines(0)
@@ -1236,7 +1265,9 @@ void QRasterPaintEngine::clip(const QRect &rect, Qt::ClipOperation op)
 bool QRasterPaintEngine::setClipRectInDeviceCoords(const QRect &r, Qt::ClipOperation op)
 {
     Q_D(QRasterPaintEngine);
-    QRect clipRect = r & d->deviceRect;
+    // normalize before using the & operator which uses QRect::normalized()
+    // internally which will give us the wrong values.
+    QRect clipRect = qrect_normalized(r) & d->deviceRect;
     QRasterPaintEngineState *s = state();
 
     if (op == Qt::ReplaceClip || s->clip == 0) {
@@ -1471,7 +1502,7 @@ void QRasterPaintEngine::drawRects(const QRect *rects, int rectCount)
             int offset_x = int(s->matrix.dx());
             int offset_y = int(s->matrix.dy());
             while (r < lastRect) {
-                QRect rect = r->normalized();
+                QRect rect = qrect_normalized(*r);
                 QRect rr = rect.translated(offset_x, offset_y);
                 fillRect_normalized(rr, &s->brushData, d);
                 ++r;
@@ -2266,8 +2297,9 @@ void QRasterPaintEngine::drawImage(const QRectF &r, const QImage &img, const QRe
                 && d->rasterBuffer->compositionMode == QPainter::CompositionMode_Source)))
     {
         RotationType rotationType = qRotationType(s->matrix);
+        const QPixelLayout::BPP plBpp = qPixelLayouts[d->rasterBuffer->format].bpp;
 
-        if (rotationType != NoRotation && qMemRotateFunctions[d->rasterBuffer->format][rotationType] && img.rect().contains(sr.toAlignedRect())) {
+        if (rotationType != NoRotation && qMemRotateFunctions[plBpp][rotationType] && img.rect().contains(sr.toAlignedRect())) {
             QRectF transformedTargetRect = s->matrix.mapRect(r);
 
             if ((!(s->renderHints & QPainter::SmoothPixmapTransform) && !(s->renderHints & QPainter::Antialiasing))
@@ -2297,7 +2329,7 @@ void QRasterPaintEngine::drawImage(const QRectF &r, const QImage &img, const QRe
                 uint cw = clippedSourceRect.width();
                 uint ch = clippedSourceRect.height();
 
-                qMemRotateFunctions[d->rasterBuffer->format][rotationType](srcBase, cw, ch, sbpl, dstBase, dbpl);
+                qMemRotateFunctions[plBpp][rotationType](srcBase, cw, ch, sbpl, dstBase, dbpl);
 
                 return;
             }
@@ -2500,7 +2532,7 @@ void QRasterPaintEngine::drawTiledPixmap(const QRectF &r, const QPixmap &pixmap,
 
         QRectF rr = r;
         rr.translate(s->matrix.dx(), s->matrix.dy());
-        fillRect_normalized(rr.toRect().normalized(), &d->image_filler, d);
+        fillRect_normalized(rr.normalized().toRect(), &d->image_filler, d);
     }
 }
 
@@ -2523,7 +2555,7 @@ QRasterBuffer *QRasterPaintEngine::rasterBuffer()
 /*!
     \internal
 */
-void QRasterPaintEngine::alphaPenBlt(const void* src, int bpl, int depth, int rx,int ry,int w,int h)
+void QRasterPaintEngine::alphaPenBlt(const void* src, int bpl, int depth, int rx,int ry,int w,int h, bool useGammaCorrection)
 {
     Q_D(QRasterPaintEngine);
     QRasterPaintEngineState *s = state();
@@ -2578,14 +2610,14 @@ void QRasterPaintEngine::alphaPenBlt(const void* src, int bpl, int depth, int rx
             } else if (depth == 8) {
                 if (s->penData.alphamapBlit) {
                     s->penData.alphamapBlit(rb, rx, ry, s->penData.solid.color,
-                                            scanline, w, h, bpl, 0);
+                                            scanline, w, h, bpl, 0, useGammaCorrection);
                     return;
                 }
             } else if (depth == 32) {
                 // (A)RGB Alpha mask where the alpha component is not used.
                 if (s->penData.alphaRGBBlit) {
                     s->penData.alphaRGBBlit(rb, rx, ry, s->penData.solid.color,
-                                            (const uint *) scanline, w, h, bpl / 4, 0);
+                                            (const uint *) scanline, w, h, bpl / 4, 0, useGammaCorrection);
                     return;
                 }
             }
@@ -2614,10 +2646,10 @@ void QRasterPaintEngine::alphaPenBlt(const void* src, int bpl, int depth, int rx
             }
             if (depth == 8)
                 s->penData.alphamapBlit(rb, rx, ry, s->penData.solid.color,
-                                        scanline, w, h, bpl, clip);
+                                        scanline, w, h, bpl, clip, useGammaCorrection);
             else if (depth == 32)
                 s->penData.alphaRGBBlit(rb, rx, ry, s->penData.solid.color,
-                                        (const uint *) scanline, w, h, bpl / 4, clip);
+                                        (const uint *) scanline, w, h, bpl / 4, clip, useGammaCorrection);
             return;
         }
     }
@@ -2775,7 +2807,8 @@ bool QRasterPaintEngine::drawCachedGlyphs(int numGlyphs, const glyph_t *glyphs,
             alphaPenBlt(alphaMap->constBits(), alphaMap->bytesPerLine(), alphaMap->depth(),
                         qFloor(positions[i].x) + offset.x(),
                         qRound(positions[i].y) + offset.y(),
-                        alphaMap->width(), alphaMap->height());
+                        alphaMap->width(), alphaMap->height(),
+                        fontEngine->expectsGammaCorrectedBlending());
 
             fontEngine->unlockAlphaMapForGlyph();
         }
@@ -2836,7 +2869,7 @@ bool QRasterPaintEngine::drawCachedGlyphs(int numGlyphs, const glyph_t *glyphs,
                 drawImage(QPoint(x, y), QImage(glyphBits, c.w, c.h, bpl, image.format()));
                 s->matrix = originalTransform;
             } else {
-                alphaPenBlt(glyphBits, bpl, depth, x, y, c.w, c.h);
+                alphaPenBlt(glyphBits, bpl, depth, x, y, c.w, c.h, fontEngine->expectsGammaCorrectedBlending());
             }
         }
     }
@@ -2880,7 +2913,7 @@ bool QRasterPaintEnginePrivate::isUnclipped(const QRect &rect,
     const QRasterPaintEngineState *s = q->state();
     const QClipData *cl = clip();
     if (!cl) {
-        QRect r = rect.normalized();
+        QRect r = qrect_normalized(rect);
         // inline contains() for performance (we know the rects are normalized)
         const QRect &r1 = deviceRect;
         return (r.left() >= r1.left() && r.right() <= r1.right()
@@ -2895,7 +2928,7 @@ bool QRasterPaintEnginePrivate::isUnclipped(const QRect &rect,
     if (s->flags.antialiased)
         ++penWidth;
 
-    QRect r = rect.normalized();
+    QRect r = qrect_normalized(rect);
     if (penWidth > 0) {
         r.setX(r.x() - penWidth);
         r.setY(r.y() - penWidth);
@@ -4439,9 +4472,9 @@ void QSpanData::setup(const QBrush &brush, int alpha, QPainter::CompositionMode
             gradient.alphaColor = !brush.isOpaque() || alpha != 256;
 
             auto cacheInfo = qt_gradient_cache()->getBuffer(*g, alpha);
-            cachedGradient = cacheInfo;
             gradient.colorTable32 = cacheInfo->buffer32;
             gradient.colorTable64 = cacheInfo->buffer64;
+            cachedGradient = std::move(cacheInfo);
 
             gradient.spread = g->spread();
 
@@ -4461,9 +4494,9 @@ void QSpanData::setup(const QBrush &brush, int alpha, QPainter::CompositionMode
             gradient.alphaColor = !brush.isOpaque() || alpha != 256;
 
             auto cacheInfo = qt_gradient_cache()->getBuffer(*g, alpha);
-            cachedGradient = cacheInfo;
             gradient.colorTable32 = cacheInfo->buffer32;
             gradient.colorTable64 = cacheInfo->buffer64;
+            cachedGradient = std::move(cacheInfo);
 
             gradient.spread = g->spread();
 
@@ -4487,9 +4520,9 @@ void QSpanData::setup(const QBrush &brush, int alpha, QPainter::CompositionMode
             gradient.alphaColor = !brush.isOpaque() || alpha != 256;
 
             auto cacheInfo = qt_gradient_cache()->getBuffer(*g, alpha);
-            cachedGradient = cacheInfo;
             gradient.colorTable32 = cacheInfo->buffer32;
             gradient.colorTable64 = cacheInfo->buffer64;
+            cachedGradient = std::move(cacheInfo);
 
             gradient.spread = QGradient::RepeatSpread;
 
diff --git a/src/gui/painting/qpaintengine_raster_p.h b/src/gui/painting/qpaintengine_raster_p.h
index 59213220a6..d0b82b3a93 100644
--- a/src/gui/painting/qpaintengine_raster_p.h
+++ b/src/gui/painting/qpaintengine_raster_p.h
@@ -225,7 +225,7 @@ public:
 #endif
 
     QRasterBuffer *rasterBuffer();
-    void alphaPenBlt(const void* src, int bpl, int depth, int rx,int ry,int w,int h);
+    void alphaPenBlt(const void* src, int bpl, int depth, int rx,int ry,int w,int h, bool useGammaCorrection);
 
     Type type() const Q_DECL_OVERRIDE { return Raster; }
 
diff --git a/src/gui/painting/qpainter.h b/src/gui/painting/qpainter.h
index 46817b9c73..64d15d5296 100644
--- a/src/gui/painting/qpainter.h
+++ b/src/gui/painting/qpainter.h
@@ -83,7 +83,6 @@ class Q_GUI_EXPORT QPainter
 {
     Q_DECLARE_PRIVATE(QPainter)
     Q_GADGET
-    Q_FLAGS(RenderHint RenderHints)
 
 public:
     enum RenderHint {
@@ -94,8 +93,10 @@ public:
         NonCosmeticDefaultPen = 0x10,
         Qt4CompatiblePainting = 0x20
     };
+    Q_FLAG(RenderHint)
 
     Q_DECLARE_FLAGS(RenderHints, RenderHint)
+    Q_FLAG(RenderHints)
 
     class PixmapFragment {
     public:
diff --git a/src/gui/painting/qpdf.cpp b/src/gui/painting/qpdf.cpp
index 84e18a64dd..7b8bae1642 100644
--- a/src/gui/painting/qpdf.cpp
+++ b/src/gui/painting/qpdf.cpp
@@ -1504,16 +1504,25 @@ void QPdfEnginePrivate::writeInfo()
     printString(creator);
     xprintf("\n/Producer ");
     printString(QString::fromLatin1("Qt " QT_VERSION_STR));
-    QDateTime now = QDateTime::currentDateTimeUtc();
+    QDateTime now = QDateTime::currentDateTime();
     QTime t = now.time();
     QDate d = now.date();
-    xprintf("\n/CreationDate (D:%d%02d%02d%02d%02d%02d)\n",
+    xprintf("\n/CreationDate (D:%d%02d%02d%02d%02d%02d",
             d.year(),
             d.month(),
             d.day(),
             t.hour(),
             t.minute(),
             t.second());
+    int offset = now.offsetFromUtc();
+    int hours  = (offset / 60) / 60;
+    int mins   = (offset / 60) % 60;
+    if (offset < 0)
+        xprintf("-%02d'%02d')\n", -hours, -mins);
+    else if (offset > 0)
+        xprintf("+%02d'%02d')\n", hours , mins);
+    else
+        xprintf("Z)\n");
     xprintf(">>\n"
             "endobj\n");
 }
diff --git a/src/gui/painting/qregion.cpp b/src/gui/painting/qregion.cpp
index 0571e1a328..3fb6f925b3 100644
--- a/src/gui/painting/qregion.cpp
+++ b/src/gui/painting/qregion.cpp
@@ -739,7 +739,7 @@ bool QRegion::intersects(const QRegion &region) const
 */
 
 
-#if !defined (Q_OS_UNIX) && !defined (Q_OS_WIN)
+#if !defined (Q_OS_UNIX) && !defined (Q_OS_WIN) || defined(Q_CLANG_QDOC)
 /*!
     \overload
     \since 4.4
diff --git a/src/gui/painting/qrgba64_p.h b/src/gui/painting/qrgba64_p.h
index 0dadc038fa..2a17d8a624 100644
--- a/src/gui/painting/qrgba64_p.h
+++ b/src/gui/painting/qrgba64_p.h
@@ -185,6 +185,60 @@ inline QRgba64 addWithSaturation(QRgba64 a, QRgba64 b)
                                qMin(a.alpha() + b.alpha(), 65535));
 }
 
+#if defined __SSE2__
+Q_ALWAYS_INLINE uint toArgb32(__m128i v)
+{
+    v = _mm_unpacklo_epi16(v, _mm_setzero_si128());
+    v = _mm_add_epi32(v, _mm_set1_epi32(128));
+    v = _mm_sub_epi32(v, _mm_srli_epi32(v, 8));
+    v = _mm_srli_epi32(v, 8);
+    v = _mm_packs_epi32(v, v);
+    v = _mm_packus_epi16(v, v);
+    return _mm_cvtsi128_si32(v);
+}
+#elif defined __ARM_NEON__
+Q_ALWAYS_INLINE uint toArgb32(uint16x4_t v)
+{
+    v = vsub_u16(v, vrshr_n_u16(v, 8));
+    v = vrshr_n_u16(v, 8);
+    uint8x8_t v8 = vmovn_u16(vcombine_u16(v, v));
+    return vget_lane_u32(vreinterpret_u32_u8(v8), 0);
+}
+#endif
+
+inline uint toArgb32(QRgba64 rgba64)
+{
+#if defined __SSE2__
+    __m128i v = _mm_loadl_epi64((const __m128i *)&rgba64);
+    v = _mm_shufflelo_epi16(v, _MM_SHUFFLE(3, 0, 1, 2));
+    return toArgb32(v);
+#elif defined __ARM_NEON__
+    uint16x4_t v = vreinterpret_u16_u64(vld1_u64(reinterpret_cast<const uint64_t *>(&rgba64)));
+#if Q_BYTE_ORDER == Q_LITTLE_ENDIAN
+    const uint8x8_t shuffleMask = { 4, 5, 2, 3, 0, 1, 6, 7 };
+    v = vreinterpret_u16_u8(vtbl1_u8(vreinterpret_u8_u16(v), shuffleMask));
+#else
+    v = vext_u16(v, v, 3);
+#endif
+    return toArgb32(v);
+#else
+    return rgba64.toArgb32();
+#endif
+}
+
+inline uint toRgba8888(QRgba64 rgba64)
+{
+#if defined __SSE2__
+    __m128i v = _mm_loadl_epi64((const __m128i *)&rgba64);
+    return toArgb32(v);
+#elif defined __ARM_NEON__
+    uint16x4_t v = vreinterpret_u16_u64(vld1_u64(reinterpret_cast<const uint64_t *>(&rgba64)));
+    return toArgb32(v);
+#else
+    return ARGB2RGBA(toArgb32(rgba64));
+#endif
+}
+
 #if defined(__SSE2__)
 Q_ALWAYS_INLINE __m128i addWithSaturation(__m128i a, __m128i b)
 {
@@ -199,6 +253,52 @@ Q_ALWAYS_INLINE uint16x4_t addWithSaturation(uint16x4_t a, uint16x4_t b)
 }
 #endif
 
+inline QRgba64 rgbBlend(QRgba64 d, QRgba64 s, uint rgbAlpha)
+{
+    QRgba64 blend;
+#ifdef __SSE2__
+    __m128i vd = _mm_loadl_epi64((const __m128i *)&d);
+    __m128i vs = _mm_loadl_epi64((const __m128i *)&s);
+    __m128i va =  _mm_cvtsi32_si128(rgbAlpha);
+    va = _mm_unpacklo_epi8(va, va);
+    __m128i vb = _mm_xor_si128(_mm_set1_epi16(-1), va);
+
+    vs = _mm_unpacklo_epi16(_mm_mullo_epi16(vs, va), _mm_mulhi_epu16(vs, va));
+    vd = _mm_unpacklo_epi16(_mm_mullo_epi16(vd, vb), _mm_mulhi_epu16(vd, vb));
+    vd = _mm_add_epi32(vd, vs);
+    vd = _mm_add_epi32(vd, _mm_srli_epi32(vd, 16));
+    vd = _mm_add_epi32(vd, _mm_set1_epi32(0x8000));
+    vd = _mm_srai_epi32(vd, 16);
+    vd = _mm_packs_epi32(vd, _mm_setzero_si128());
+
+    _mm_storel_epi64((__m128i *)&blend, vd);
+#elif defined(__ARM_NEON__)
+    uint16x4_t vd = vreinterpret_u16_u64(vmov_n_u64(d));
+    uint16x4_t vs = vreinterpret_u16_u64(vmov_n_u64(s));
+    uint8x8_t va8 = vreinterpret_u8_u32(vmov_n_u32(rgbAlpha));
+    uint16x4_t va = vreinterpret_u16_u8(vzip_u8(va8, va8).val[0]);
+    uint16x4_t vb = vdup_n_u16(0xffff);
+    vb = vsub_u16(vb, va);
+
+    uint32x4_t vs32 = vmull_u16(vs, va);
+    uint32x4_t vd32 = vmull_u16(vd, vb);
+    vd32 = vaddq_u32(vd32, vs32);
+    vd32 = vsraq_n_u32(vd32, vd32, 16);
+    vd = vrshrn_n_u32(vd32, 16);
+    vst1_u64(reinterpret_cast<uint64_t *>(&blend), vreinterpret_u64_u16(vd));
+#else
+    const int mr = qRed(rgbAlpha);
+    const int mg = qGreen(rgbAlpha);
+    const int mb = qBlue(rgbAlpha);
+    blend.setRed  (qt_div_255(s.red()   * mr + d.red()   * (255 - mr)));
+    blend.setGreen(qt_div_255(s.green() * mg + d.green() * (255 - mg)));
+    blend.setBlue (qt_div_255(s.blue()  * mb + d.blue()  * (255 - mb)));
+    blend.setAlpha(s.alpha());
+#endif
+    return blend;
+}
+
+
 QT_END_NAMESPACE
 
 #endif // QRGBA64_P_H
diff --git a/src/gui/painting/qtriangulator.cpp b/src/gui/painting/qtriangulator.cpp
index 6604d407f0..6d57eba123 100644
--- a/src/gui/painting/qtriangulator.cpp
+++ b/src/gui/painting/qtriangulator.cpp
@@ -50,10 +50,6 @@
 #include <QtCore/qglobal.h>
 #include <QtCore/qpoint.h>
 #include <QtCore/qalgorithms.h>
-#ifndef QT_NO_OPENGL
-# include <private/qopenglcontext_p.h>
-# include <private/qopenglextensions_p.h>
-#endif
 #include <private/qrbtree_p.h>
 
 QT_BEGIN_NAMESPACE
@@ -2266,23 +2262,12 @@ void QTriangulator<T>::MonotoneToTriangles::decompose()
 //                                qTriangulate                                //
 //============================================================================//
 
-static bool hasElementIndexUint()
-{
-#ifndef QT_NO_OPENGL
-    QOpenGLContext *context = QOpenGLContext::currentContext();
-    if (!context)
-        return false;
-    return static_cast<QOpenGLExtensions *>(context->functions())->hasOpenGLExtension(QOpenGLExtensions::ElementIndexUint);
-#else
-    return false;
-#endif
-}
-
 Q_GUI_EXPORT QTriangleSet qTriangulate(const qreal *polygon,
-                          int count, uint hint, const QTransform &matrix)
+                                       int count, uint hint, const QTransform &matrix,
+                                       bool allowUintIndices)
 {
     QTriangleSet triangleSet;
-    if (hasElementIndexUint()) {
+    if (allowUintIndices) {
         QTriangulator<quint32> triangulator;
         triangulator.initialize(polygon, count, hint, matrix);
         QVertexSet<quint32> vertexSet = triangulator.triangulate();
@@ -2300,10 +2285,13 @@ Q_GUI_EXPORT QTriangleSet qTriangulate(const qreal *polygon,
 }
 
 Q_GUI_EXPORT QTriangleSet qTriangulate(const QVectorPath &path,
-                          const QTransform &matrix, qreal lod)
+                                       const QTransform &matrix, qreal lod, bool allowUintIndices)
 {
     QTriangleSet triangleSet;
-    if (hasElementIndexUint()) {
+    // For now systems that support 32-bit index values will always get 32-bit
+    // index values. This is not necessarily ideal since 16-bit would be enough in
+    // many cases. TODO revisit this at a later point.
+    if (allowUintIndices) {
         QTriangulator<quint32> triangulator;
         triangulator.initialize(path, matrix, lod);
         QVertexSet<quint32> vertexSet = triangulator.triangulate();
@@ -2320,10 +2308,10 @@ Q_GUI_EXPORT QTriangleSet qTriangulate(const QVectorPath &path,
 }
 
 QTriangleSet qTriangulate(const QPainterPath &path,
-                          const QTransform &matrix, qreal lod)
+                          const QTransform &matrix, qreal lod, bool allowUintIndices)
 {
     QTriangleSet triangleSet;
-    if (hasElementIndexUint()) {
+    if (allowUintIndices) {
         QTriangulator<quint32> triangulator;
         triangulator.initialize(path, matrix, lod);
         QVertexSet<quint32> vertexSet = triangulator.triangulate();
@@ -2340,10 +2328,10 @@ QTriangleSet qTriangulate(const QPainterPath &path,
 }
 
 QPolylineSet qPolyline(const QVectorPath &path,
-                       const QTransform &matrix, qreal lod)
+                       const QTransform &matrix, qreal lod, bool allowUintIndices)
 {
     QPolylineSet polyLineSet;
-    if (hasElementIndexUint()) {
+    if (allowUintIndices) {
         QTriangulator<quint32> triangulator;
         triangulator.initialize(path, matrix, lod);
         QVertexSet<quint32> vertexSet = triangulator.polyline();
@@ -2360,10 +2348,10 @@ QPolylineSet qPolyline(const QVectorPath &path,
 }
 
 QPolylineSet qPolyline(const QPainterPath &path,
-                       const QTransform &matrix, qreal lod)
+                       const QTransform &matrix, qreal lod, bool allowUintIndices)
 {
     QPolylineSet polyLineSet;
-    if (hasElementIndexUint()) {
+    if (allowUintIndices) {
         QTriangulator<quint32> triangulator;
         triangulator.initialize(path, matrix, lod);
         QVertexSet<quint32> vertexSet = triangulator.polyline();
diff --git a/src/gui/painting/qtriangulator_p.h b/src/gui/painting/qtriangulator_p.h
index 4d1aba099c..8f043fc925 100644
--- a/src/gui/painting/qtriangulator_p.h
+++ b/src/gui/painting/qtriangulator_p.h
@@ -137,11 +137,18 @@ struct Q_GUI_EXPORT QPolylineSet
 // integers, the polygon is triangulated, and then scaled back by 1/32.
 // 'hint' should be a combination of QVectorPath::Hints.
 // 'lod' is the level of detail. Default is 1. Curves are split into more lines when 'lod' is higher.
-QTriangleSet Q_GUI_EXPORT qTriangulate(const qreal *polygon, int count, uint hint = QVectorPath::PolygonHint | QVectorPath::OddEvenFill, const QTransform &matrix = QTransform());
-QTriangleSet Q_GUI_EXPORT qTriangulate(const QVectorPath &path, const QTransform &matrix = QTransform(), qreal lod = 1);
-QTriangleSet Q_GUI_EXPORT qTriangulate(const QPainterPath &path, const QTransform &matrix = QTransform(), qreal lod = 1);
-QPolylineSet qPolyline(const QVectorPath &path, const QTransform &matrix = QTransform(), qreal lod = 1);
-QPolylineSet Q_GUI_EXPORT qPolyline(const QPainterPath &path, const QTransform &matrix = QTransform(), qreal lod = 1);
+QTriangleSet Q_GUI_EXPORT qTriangulate(const qreal *polygon, int count,
+                                       uint hint = QVectorPath::PolygonHint | QVectorPath::OddEvenFill,
+                                       const QTransform &matrix = QTransform(),
+                                       bool allowUintIndices = true);
+QTriangleSet Q_GUI_EXPORT qTriangulate(const QVectorPath &path, const QTransform &matrix = QTransform(),
+                                       qreal lod = 1, bool allowUintIndices = true);
+QTriangleSet Q_GUI_EXPORT qTriangulate(const QPainterPath &path, const QTransform &matrix = QTransform(),
+                                       qreal lod = 1, bool allowUintIndices = true);
+QPolylineSet qPolyline(const QVectorPath &path, const QTransform &matrix = QTransform(),
+                       qreal lod = 1, bool allowUintIndices = true);
+QPolylineSet Q_GUI_EXPORT qPolyline(const QPainterPath &path, const QTransform &matrix = QTransform(),
+                                    qreal lod = 1, bool allowUintIndices = true);
 
 QT_END_NAMESPACE