diff options
Diffstat (limited to 'src/gui/painting/qdrawhelper.cpp')
-rw-r--r-- | src/gui/painting/qdrawhelper.cpp | 1741 |
1 files changed, 1043 insertions, 698 deletions
diff --git a/src/gui/painting/qdrawhelper.cpp b/src/gui/painting/qdrawhelper.cpp index 7b3e1b991d..5c38648fe7 100644 --- a/src/gui/painting/qdrawhelper.cpp +++ b/src/gui/painting/qdrawhelper.cpp @@ -43,6 +43,7 @@ #include <qstylehints.h> #include <qguiapplication.h> #include <qatomic.h> +#include <private/qcolorprofile_p.h> #include <private/qdrawhelper_p.h> #include <private/qpaintengine_raster_p.h> #include <private/qpainter_p.h> @@ -1143,6 +1144,11 @@ static QRgba64 *QT_FASTCALL destFetch64uint32(QRgba64 *buffer, QRasterBuffer *ra return const_cast<QRgba64 *>(layout->convertToARGB64PM(buffer, src, length, 0, 0)); } +static QRgba64 * QT_FASTCALL destFetch64Undefined(QRgba64 *buffer, QRasterBuffer *, int, int, int) +{ + return buffer; +} + static DestFetchProc destFetchProc[QImage::NImageFormats] = { 0, // Format_Invalid @@ -1175,8 +1181,8 @@ static DestFetchProc destFetchProc[QImage::NImageFormats] = static DestFetchProc64 destFetchProc64[QImage::NImageFormats] = { 0, // Format_Invalid - destFetch64, // Format_Mono, - destFetch64, // Format_MonoLSB + 0, // Format_Mono, + 0, // Format_MonoLSB 0, // Format_Indexed8 destFetch64uint32, // Format_RGB32 destFetch64uint32, // Format_ARGB32, @@ -1320,7 +1326,7 @@ static void QT_FASTCALL destStore(QRasterBuffer *rasterBuffer, int x, int y, con static void QT_FASTCALL convertFromRgb64(uint *dest, const QRgba64 *src, int length) { for (int i = 0; i < length; ++i) { - dest[i] = src[i].toArgb32(); + dest[i] = toArgb32(src[i]); } } @@ -1411,7 +1417,7 @@ static void QT_FASTCALL destStore64ARGB32(QRasterBuffer *rasterBuffer, int x, in { uint *dest = (uint*)rasterBuffer->scanLine(y) + x; for (int i = 0; i < length; ++i) { - dest[i] = buffer[i].unpremultiplied().toArgb32(); + dest[i] = toArgb32(buffer[i].unpremultiplied()); } } @@ -1419,7 +1425,7 @@ static void QT_FASTCALL destStore64RGBA8888(QRasterBuffer *rasterBuffer, int x, { uint *dest = (uint*)rasterBuffer->scanLine(y) + x; for (int i = 0; i < length; ++i) { - dest[i] = 
ARGB2RGBA(buffer[i].unpremultiplied().toArgb32()); + dest[i] = toRgba8888(buffer[i].unpremultiplied()); } } @@ -1914,562 +1920,695 @@ inline void fetchTransformedBilinear_pixelBounds<BlendTransformedBilinear>(int, Q_ASSERT(v2 >= l1 && v2 <= l2); } -template<TextureBlendType blendType> /* blendType = BlendTransformedBilinear or BlendTransformedBilinearTiled */ -static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, const Operator *, - const QSpanData *data, int y, int x, - int length) -{ - int image_width = data->texture.width; - int image_height = data->texture.height; - - int image_x1 = data->texture.x1; - int image_y1 = data->texture.y1; - int image_x2 = data->texture.x2 - 1; - int image_y2 = data->texture.y2 - 1; - - const qreal cx = x + qreal(0.5); - const qreal cy = y + qreal(0.5); - - uint *end = buffer + length; - uint *b = buffer; - if (data->fast_matrix) { - // The increment pr x in the scanline - int fdx = (int)(data->m11 * fixed_scale); - int fdy = (int)(data->m12 * fixed_scale); - - int fx = int((data->m21 * cy - + data->m11 * cx + data->dx) * fixed_scale); - int fy = int((data->m22 * cy - + data->m12 * cx + data->dy) * fixed_scale); - - fx -= half_point; - fy -= half_point; - - if (fdy == 0) { //simple scale, no rotation - int y1 = (fy >> 16); - int y2; - fetchTransformedBilinear_pixelBounds<blendType>(image_height, image_y1, image_y2, y1, y2); - const uint *s1 = (const uint *)data->texture.scanLine(y1); - const uint *s2 = (const uint *)data->texture.scanLine(y2); - - if (fdx <= fixed_scale && fdx > 0) { // scale up on X - int disty = (fy & 0x0000ffff) >> 8; - int idisty = 256 - disty; - int x = fx >> 16; +enum FastTransformTypes { + SimpleUpscaleTransform, + UpscaleTransform, + DownscaleTransform, + RotateTransform, + FastRotateTransform, + NFastTransformTypes +}; - // The idea is first to do the interpolation between the row s1 and the row s2 - // into an intermediate buffer, then we interpolate between two pixel of this 
buffer. +typedef void (QT_FASTCALL *BilinearFastTransformHelper)(uint *b, uint *end, const QTextureData &image, int &fx, int &fy, int fdx, int fdy); - // intermediate_buffer[0] is a buffer of red-blue component of the pixel, in the form 0x00RR00BB - // intermediate_buffer[1] is the alpha-green component of the pixel, in the form 0x00AA00GG - // +1 for the last pixel to interpolate with, and +1 for rounding errors. - quint32 intermediate_buffer[2][buffer_size + 2]; - // count is the size used in the intermediate_buffer. - int count = (qint64(length) * fdx + fixed_scale - 1) / fixed_scale + 2; - Q_ASSERT(count <= buffer_size + 2); //length is supposed to be <= buffer_size and data->m11 < 1 in this case - int f = 0; - int lim = count; - if (blendType == BlendTransformedBilinearTiled) { - x %= image_width; - if (x < 0) x += image_width; - } else { - lim = qMin(count, image_x2-x+1); - if (x < image_x1) { - Q_ASSERT(x <= image_x2); - uint t = s1[image_x1]; - uint b = s2[image_x1]; - quint32 rb = (((t & 0xff00ff) * idisty + (b & 0xff00ff) * disty) >> 8) & 0xff00ff; - quint32 ag = ((((t>>8) & 0xff00ff) * idisty + ((b>>8) & 0xff00ff) * disty) >> 8) & 0xff00ff; - do { - intermediate_buffer[0][f] = rb; - intermediate_buffer[1][f] = ag; - f++; - x++; - } while (x < image_x1 && f < lim); - } - } +template<TextureBlendType blendType> +static void QT_FASTCALL fetchTransformedBilinearARGB32PM_simple_upscale_helper(uint *b, uint *end, const QTextureData &image, + int &fx, int &fy, int fdx, int /*fdy*/) +{ + int y1 = (fy >> 16); + int y2; + fetchTransformedBilinear_pixelBounds<blendType>(image.height, image.y1, image.y2 - 1, y1, y2); + const uint *s1 = (const uint *)image.scanLine(y1); + const uint *s2 = (const uint *)image.scanLine(y2); + + int disty = (fy & 0x0000ffff) >> 8; + int idisty = 256 - disty; + int x = fx >> 16; + int length = end - b; + + // The idea is first to do the interpolation between the row s1 and the row s2 + // into an intermediate buffer, then we interpolate 
between two pixel of this buffer. + + // intermediate_buffer[0] is a buffer of red-blue component of the pixel, in the form 0x00RR00BB + // intermediate_buffer[1] is the alpha-green component of the pixel, in the form 0x00AA00GG + // +1 for the last pixel to interpolate with, and +1 for rounding errors. + quint32 intermediate_buffer[2][buffer_size + 2]; + // count is the size used in the intermediate_buffer. + int count = (qint64(length) * fdx + fixed_scale - 1) / fixed_scale + 2; + Q_ASSERT(count <= buffer_size + 2); //length is supposed to be <= buffer_size and data->m11 < 1 in this case + int f = 0; + int lim = count; + if (blendType == BlendTransformedBilinearTiled) { + x %= image.width; + if (x < 0) x += image.width; + } else { + lim = qMin(count, image.x2 - x); + if (x < image.x1) { + Q_ASSERT(x < image.x2); + uint t = s1[image.x1]; + uint b = s2[image.x1]; + quint32 rb = (((t & 0xff00ff) * idisty + (b & 0xff00ff) * disty) >> 8) & 0xff00ff; + quint32 ag = ((((t>>8) & 0xff00ff) * idisty + ((b>>8) & 0xff00ff) * disty) >> 8) & 0xff00ff; + do { + intermediate_buffer[0][f] = rb; + intermediate_buffer[1][f] = ag; + f++; + x++; + } while (x < image.x1 && f < lim); + } + } - if (blendType != BlendTransformedBilinearTiled) { + if (blendType != BlendTransformedBilinearTiled) { #if defined(__SSE2__) - const __m128i disty_ = _mm_set1_epi16(disty); - const __m128i idisty_ = _mm_set1_epi16(idisty); - const __m128i colorMask = _mm_set1_epi32(0x00ff00ff); - - lim -= 3; - for (; f < lim; x += 4, f += 4) { - // Load 4 pixels from s1, and split the alpha-green and red-blue component - __m128i top = _mm_loadu_si128((const __m128i*)((const uint *)(s1)+x)); - __m128i topAG = _mm_srli_epi16(top, 8); - __m128i topRB = _mm_and_si128(top, colorMask); - // Multiplies each colour component by idisty - topAG = _mm_mullo_epi16 (topAG, idisty_); - topRB = _mm_mullo_epi16 (topRB, idisty_); - - // Same for the s2 vector - __m128i bottom = _mm_loadu_si128((const __m128i*)((const uint 
*)(s2)+x)); - __m128i bottomAG = _mm_srli_epi16(bottom, 8); - __m128i bottomRB = _mm_and_si128(bottom, colorMask); - bottomAG = _mm_mullo_epi16 (bottomAG, disty_); - bottomRB = _mm_mullo_epi16 (bottomRB, disty_); - - // Add the values, and shift to only keep 8 significant bits per colors - __m128i rAG =_mm_add_epi16(topAG, bottomAG); - rAG = _mm_srli_epi16(rAG, 8); - _mm_storeu_si128((__m128i*)(&intermediate_buffer[1][f]), rAG); - __m128i rRB =_mm_add_epi16(topRB, bottomRB); - rRB = _mm_srli_epi16(rRB, 8); - _mm_storeu_si128((__m128i*)(&intermediate_buffer[0][f]), rRB); - } + const __m128i disty_ = _mm_set1_epi16(disty); + const __m128i idisty_ = _mm_set1_epi16(idisty); + const __m128i colorMask = _mm_set1_epi32(0x00ff00ff); + + lim -= 3; + for (; f < lim; x += 4, f += 4) { + // Load 4 pixels from s1, and split the alpha-green and red-blue component + __m128i top = _mm_loadu_si128((const __m128i*)((const uint *)(s1)+x)); + __m128i topAG = _mm_srli_epi16(top, 8); + __m128i topRB = _mm_and_si128(top, colorMask); + // Multiplies each color component by idisty + topAG = _mm_mullo_epi16 (topAG, idisty_); + topRB = _mm_mullo_epi16 (topRB, idisty_); + + // Same for the s2 vector + __m128i bottom = _mm_loadu_si128((const __m128i*)((const uint *)(s2)+x)); + __m128i bottomAG = _mm_srli_epi16(bottom, 8); + __m128i bottomRB = _mm_and_si128(bottom, colorMask); + bottomAG = _mm_mullo_epi16 (bottomAG, disty_); + bottomRB = _mm_mullo_epi16 (bottomRB, disty_); + + // Add the values, and shift to only keep 8 significant bits per colors + __m128i rAG =_mm_add_epi16(topAG, bottomAG); + rAG = _mm_srli_epi16(rAG, 8); + _mm_storeu_si128((__m128i*)(&intermediate_buffer[1][f]), rAG); + __m128i rRB =_mm_add_epi16(topRB, bottomRB); + rRB = _mm_srli_epi16(rRB, 8); + _mm_storeu_si128((__m128i*)(&intermediate_buffer[0][f]), rRB); + } #elif defined(__ARM_NEON__) - const int16x8_t disty_ = vdupq_n_s16(disty); - const int16x8_t idisty_ = vdupq_n_s16(idisty); - const int16x8_t colorMask = 
vdupq_n_s16(0x00ff); - - lim -= 3; - for (; f < lim; x += 4, f += 4) { - // Load 4 pixels from s1, and split the alpha-green and red-blue component - int16x8_t top = vld1q_s16((int16_t*)((const uint *)(s1)+x)); - int16x8_t topAG = vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(top), 8)); - int16x8_t topRB = vandq_s16(top, colorMask); - // Multiplies each colour component by idisty - topAG = vmulq_s16(topAG, idisty_); - topRB = vmulq_s16(topRB, idisty_); - - // Same for the s2 vector - int16x8_t bottom = vld1q_s16((int16_t*)((const uint *)(s2)+x)); - int16x8_t bottomAG = vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(bottom), 8)); - int16x8_t bottomRB = vandq_s16(bottom, colorMask); - bottomAG = vmulq_s16(bottomAG, disty_); - bottomRB = vmulq_s16(bottomRB, disty_); - - // Add the values, and shift to only keep 8 significant bits per colors - int16x8_t rAG = vaddq_s16(topAG, bottomAG); - rAG = vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(rAG), 8)); - vst1q_s16((int16_t*)(&intermediate_buffer[1][f]), rAG); - int16x8_t rRB = vaddq_s16(topRB, bottomRB); - rRB = vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(rRB), 8)); - vst1q_s16((int16_t*)(&intermediate_buffer[0][f]), rRB); - } + const int16x8_t disty_ = vdupq_n_s16(disty); + const int16x8_t idisty_ = vdupq_n_s16(idisty); + const int16x8_t colorMask = vdupq_n_s16(0x00ff); + + lim -= 3; + for (; f < lim; x += 4, f += 4) { + // Load 4 pixels from s1, and split the alpha-green and red-blue component + int16x8_t top = vld1q_s16((int16_t*)((const uint *)(s1)+x)); + int16x8_t topAG = vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(top), 8)); + int16x8_t topRB = vandq_s16(top, colorMask); + // Multiplies each color component by idisty + topAG = vmulq_s16(topAG, idisty_); + topRB = vmulq_s16(topRB, idisty_); + + // Same for the s2 vector + int16x8_t bottom = vld1q_s16((int16_t*)((const uint *)(s2)+x)); + int16x8_t bottomAG = 
vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(bottom), 8)); + int16x8_t bottomRB = vandq_s16(bottom, colorMask); + bottomAG = vmulq_s16(bottomAG, disty_); + bottomRB = vmulq_s16(bottomRB, disty_); + + // Add the values, and shift to only keep 8 significant bits per colors + int16x8_t rAG = vaddq_s16(topAG, bottomAG); + rAG = vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(rAG), 8)); + vst1q_s16((int16_t*)(&intermediate_buffer[1][f]), rAG); + int16x8_t rRB = vaddq_s16(topRB, bottomRB); + rRB = vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(rRB), 8)); + vst1q_s16((int16_t*)(&intermediate_buffer[0][f]), rRB); + } #endif - } - for (; f < count; f++) { // Same as above but without sse2 - if (blendType == BlendTransformedBilinearTiled) { - if (x >= image_width) x -= image_width; - } else { - x = qMin(x, image_x2); - } + } + for (; f < count; f++) { // Same as above but without simd + if (blendType == BlendTransformedBilinearTiled) { + if (x >= image.width) x -= image.width; + } else { + x = qMin(x, image.x2 - 1); + } - uint t = s1[x]; - uint b = s2[x]; + uint t = s1[x]; + uint b = s2[x]; - intermediate_buffer[0][f] = (((t & 0xff00ff) * idisty + (b & 0xff00ff) * disty) >> 8) & 0xff00ff; - intermediate_buffer[1][f] = ((((t>>8) & 0xff00ff) * idisty + ((b>>8) & 0xff00ff) * disty) >> 8) & 0xff00ff; - x++; - } - // Now interpolate the values from the intermediate_buffer to get the final result. - fx &= fixed_scale - 1; - Q_ASSERT((fx >> 16) == 0); - while (b < end) { - int x1 = (fx >> 16); - int x2 = x1 + 1; - Q_ASSERT(x1 >= 0); - Q_ASSERT(x2 < count); + intermediate_buffer[0][f] = (((t & 0xff00ff) * idisty + (b & 0xff00ff) * disty) >> 8) & 0xff00ff; + intermediate_buffer[1][f] = ((((t>>8) & 0xff00ff) * idisty + ((b>>8) & 0xff00ff) * disty) >> 8) & 0xff00ff; + x++; + } + // Now interpolate the values from the intermediate_buffer to get the final result. 
+ fx &= fixed_scale - 1; + Q_ASSERT((fx >> 16) == 0); + while (b < end) { + int x1 = (fx >> 16); + int x2 = x1 + 1; + Q_ASSERT(x1 >= 0); + Q_ASSERT(x2 < count); + + int distx = (fx & 0x0000ffff) >> 8; + int idistx = 256 - distx; + int rb = ((intermediate_buffer[0][x1] * idistx + intermediate_buffer[0][x2] * distx) >> 8) & 0xff00ff; + int ag = (intermediate_buffer[1][x1] * idistx + intermediate_buffer[1][x2] * distx) & 0xff00ff00; + *b = rb | ag; + b++; + fx += fdx; + } +} - int distx = (fx & 0x0000ffff) >> 8; - int idistx = 256 - distx; - int rb = ((intermediate_buffer[0][x1] * idistx + intermediate_buffer[0][x2] * distx) >> 8) & 0xff00ff; - int ag = (intermediate_buffer[1][x1] * idistx + intermediate_buffer[1][x2] * distx) & 0xff00ff00; - *b = rb | ag; - b++; - fx += fdx; - } - } else if ((fdx < 0 && fdx > -(fixed_scale / 8)) || std::abs(data->m22) < (1./8.)) { // scale up more than 8x - int y1 = (fy >> 16); - int y2; - fetchTransformedBilinear_pixelBounds<blendType>(image_height, image_y1, image_y2, y1, y2); - const uint *s1 = (const uint *)data->texture.scanLine(y1); - const uint *s2 = (const uint *)data->texture.scanLine(y2); - int disty = (fy & 0x0000ffff) >> 8; - while (b < end) { - int x1 = (fx >> 16); - int x2; - fetchTransformedBilinear_pixelBounds<blendType>(image_width, image_x1, image_x2, x1, x2); - uint tl = s1[x1]; - uint tr = s1[x2]; - uint bl = s2[x1]; - uint br = s2[x2]; - int distx = (fx & 0x0000ffff) >> 8; - *b = interpolate_4_pixels(tl, tr, bl, br, distx, disty); +template<TextureBlendType blendType> +static void QT_FASTCALL fetchTransformedBilinearARGB32PM_upscale_helper(uint *b, uint *end, const QTextureData &image, + int &fx, int &fy, int fdx, int /*fdy*/) +{ + int y1 = (fy >> 16); + int y2; + fetchTransformedBilinear_pixelBounds<blendType>(image.height, image.y1, image.y2 - 1, y1, y2); + const uint *s1 = (const uint *)image.scanLine(y1); + const uint *s2 = (const uint *)image.scanLine(y2); + const int disty = (fy & 0x0000ffff) >> 8; + + if 
(blendType != BlendTransformedBilinearTiled) { + const qint64 min_fx = qint64(image.x1) * fixed_scale; + const qint64 max_fx = qint64(image.x2 - 1) * fixed_scale; + while (b < end) { + int x1 = (fx >> 16); + int x2; + fetchTransformedBilinear_pixelBounds<blendType>(image.width, image.x1, image.x2 - 1, x1, x2); + if (x1 != x2) + break; + uint top = s1[x1]; + uint bot = s2[x1]; + *b = INTERPOLATE_PIXEL_256(top, 256 - disty, bot, disty); + fx += fdx; + ++b; + } + uint *boundedEnd = end; + if (fdx > 0) + boundedEnd = qMin(boundedEnd, b + (max_fx - fx) / fdx); + else if (fdx < 0) + boundedEnd = qMin(boundedEnd, b + (min_fx - fx) / fdx); + + // A fast middle part without boundary checks + while (b < boundedEnd) { + int x = (fx >> 16); + int distx = (fx & 0x0000ffff) >> 8; + *b = interpolate_4_pixels(s1 + x, s2 + x, distx, disty); + fx += fdx; + ++b; + } + } - fx += fdx; - ++b; - } - } else { //scale down - int y1 = (fy >> 16); - int y2; - fetchTransformedBilinear_pixelBounds<blendType>(image_height, image_y1, image_y2, y1, y2); - const uint *s1 = (const uint *)data->texture.scanLine(y1); - const uint *s2 = (const uint *)data->texture.scanLine(y2); - const int disty8 = (fy & 0x0000ffff) >> 8; - const int disty4 = (disty8 + 0x08) >> 4; - - if (blendType != BlendTransformedBilinearTiled) { -#define BILINEAR_DOWNSCALE_BOUNDS_PROLOG \ - const qint64 min_fx = qint64(image_x1) * fixed_scale; \ - const qint64 max_fx = qint64(image_x2) * fixed_scale; \ - while (b < end) { \ - int x1 = (fx >> 16); \ - int x2; \ - fetchTransformedBilinear_pixelBounds<blendType>(image_width, image_x1, image_x2, x1, x2); \ - if (x1 != x2) \ - break; \ - uint top = s1[x1]; \ - uint bot = s2[x1]; \ - *b = INTERPOLATE_PIXEL_256(top, 256 - disty8, bot, disty8); \ - fx += fdx; \ - ++b; \ - } \ - uint *boundedEnd = end; \ - if (fdx > 0) \ - boundedEnd = qMin(boundedEnd, b + (max_fx - fx) / fdx); \ - else if (fdx < 0) \ - boundedEnd = qMin(boundedEnd, b + (min_fx - fx) / fdx); \ - boundedEnd -= 3; + while 
(b < end) { + int x1 = (fx >> 16); + int x2; + fetchTransformedBilinear_pixelBounds<blendType>(image.width, image.x1, image.x2 - 1 , x1, x2); + uint tl = s1[x1]; + uint tr = s1[x2]; + uint bl = s2[x1]; + uint br = s2[x2]; + int distx = (fx & 0x0000ffff) >> 8; + *b = interpolate_4_pixels(tl, tr, bl, br, distx, disty); + + fx += fdx; + ++b; + } +} +template<TextureBlendType blendType> +static void QT_FASTCALL fetchTransformedBilinearARGB32PM_downscale_helper(uint *b, uint *end, const QTextureData &image, + int &fx, int &fy, int fdx, int /*fdy*/) +{ + int y1 = (fy >> 16); + int y2; + fetchTransformedBilinear_pixelBounds<blendType>(image.height, image.y1, image.y2 - 1, y1, y2); + const uint *s1 = (const uint *)image.scanLine(y1); + const uint *s2 = (const uint *)image.scanLine(y2); + const int disty8 = (fy & 0x0000ffff) >> 8; + const int disty4 = (disty8 + 0x08) >> 4; + + if (blendType != BlendTransformedBilinearTiled) { + const qint64 min_fx = qint64(image.x1) * fixed_scale; + const qint64 max_fx = qint64(image.x2 - 1) * fixed_scale; + while (b < end) { + int x1 = (fx >> 16); + int x2; + fetchTransformedBilinear_pixelBounds<blendType>(image.width, image.x1, image.x2 - 1, x1, x2); + if (x1 != x2) + break; + uint top = s1[x1]; + uint bot = s2[x1]; + *b = INTERPOLATE_PIXEL_256(top, 256 - disty8, bot, disty8); + fx += fdx; + ++b; + } + uint *boundedEnd = end; + if (fdx > 0) + boundedEnd = qMin(boundedEnd, b + (max_fx - fx) / fdx); + else if (fdx < 0) + boundedEnd = qMin(boundedEnd, b + (min_fx - fx) / fdx); + // A fast middle part without boundary checks #if defined(__SSE2__) - BILINEAR_DOWNSCALE_BOUNDS_PROLOG - - const __m128i colorMask = _mm_set1_epi32(0x00ff00ff); - const __m128i v_256 = _mm_set1_epi16(256); - const __m128i v_disty = _mm_set1_epi16(disty4); - const __m128i v_fdx = _mm_set1_epi32(fdx*4); - const __m128i v_fx_r = _mm_set1_epi32(0x8); - __m128i v_fx = _mm_setr_epi32(fx, fx + fdx, fx + fdx + fdx, fx + fdx + fdx + fdx); - - while (b < boundedEnd) { - 
__m128i offset = _mm_srli_epi32(v_fx, 16); - const int offset0 = _mm_cvtsi128_si32(offset); offset = _mm_srli_si128(offset, 4); - const int offset1 = _mm_cvtsi128_si32(offset); offset = _mm_srli_si128(offset, 4); - const int offset2 = _mm_cvtsi128_si32(offset); offset = _mm_srli_si128(offset, 4); - const int offset3 = _mm_cvtsi128_si32(offset); - const __m128i tl = _mm_setr_epi32(s1[offset0], s1[offset1], s1[offset2], s1[offset3]); - const __m128i tr = _mm_setr_epi32(s1[offset0 + 1], s1[offset1 + 1], s1[offset2 + 1], s1[offset3 + 1]); - const __m128i bl = _mm_setr_epi32(s2[offset0], s2[offset1], s2[offset2], s2[offset3]); - const __m128i br = _mm_setr_epi32(s2[offset0 + 1], s2[offset1 + 1], s2[offset2 + 1], s2[offset3 + 1]); - - __m128i v_distx = _mm_srli_epi16(v_fx, 8); - v_distx = _mm_srli_epi16(_mm_add_epi32(v_distx, v_fx_r), 4); - v_distx = _mm_shufflehi_epi16(v_distx, _MM_SHUFFLE(2,2,0,0)); - v_distx = _mm_shufflelo_epi16(v_distx, _MM_SHUFFLE(2,2,0,0)); - - interpolate_4_pixels_16_sse2(tl, tr, bl, br, v_distx, v_disty, colorMask, v_256, b); - b += 4; - v_fx = _mm_add_epi32(v_fx, v_fdx); - } - fx = _mm_cvtsi128_si32(v_fx); + const __m128i colorMask = _mm_set1_epi32(0x00ff00ff); + const __m128i v_256 = _mm_set1_epi16(256); + const __m128i v_disty = _mm_set1_epi16(disty4); + const __m128i v_fdx = _mm_set1_epi32(fdx*4); + const __m128i v_fx_r = _mm_set1_epi32(0x8); + __m128i v_fx = _mm_setr_epi32(fx, fx + fdx, fx + fdx + fdx, fx + fdx + fdx + fdx); + + while (b < boundedEnd - 3) { + __m128i offset = _mm_srli_epi32(v_fx, 16); + const int offset0 = _mm_cvtsi128_si32(offset); offset = _mm_srli_si128(offset, 4); + const int offset1 = _mm_cvtsi128_si32(offset); offset = _mm_srli_si128(offset, 4); + const int offset2 = _mm_cvtsi128_si32(offset); offset = _mm_srli_si128(offset, 4); + const int offset3 = _mm_cvtsi128_si32(offset); + const __m128i tl = _mm_setr_epi32(s1[offset0], s1[offset1], s1[offset2], s1[offset3]); + const __m128i tr = _mm_setr_epi32(s1[offset0 + 1], 
s1[offset1 + 1], s1[offset2 + 1], s1[offset3 + 1]); + const __m128i bl = _mm_setr_epi32(s2[offset0], s2[offset1], s2[offset2], s2[offset3]); + const __m128i br = _mm_setr_epi32(s2[offset0 + 1], s2[offset1 + 1], s2[offset2 + 1], s2[offset3 + 1]); + + __m128i v_distx = _mm_srli_epi16(v_fx, 8); + v_distx = _mm_srli_epi16(_mm_add_epi32(v_distx, v_fx_r), 4); + v_distx = _mm_shufflehi_epi16(v_distx, _MM_SHUFFLE(2,2,0,0)); + v_distx = _mm_shufflelo_epi16(v_distx, _MM_SHUFFLE(2,2,0,0)); + + interpolate_4_pixels_16_sse2(tl, tr, bl, br, v_distx, v_disty, colorMask, v_256, b); + b += 4; + v_fx = _mm_add_epi32(v_fx, v_fdx); + } + fx = _mm_cvtsi128_si32(v_fx); #elif defined(__ARM_NEON__) - BILINEAR_DOWNSCALE_BOUNDS_PROLOG - - const int16x8_t colorMask = vdupq_n_s16(0x00ff); - const int16x8_t invColorMask = vmvnq_s16(colorMask); - const int16x8_t v_256 = vdupq_n_s16(256); - const int16x8_t v_disty = vdupq_n_s16(disty4); - const int16x8_t v_disty_ = vshlq_n_s16(v_disty, 4); - int32x4_t v_fdx = vdupq_n_s32(fdx*4); + const int16x8_t colorMask = vdupq_n_s16(0x00ff); + const int16x8_t invColorMask = vmvnq_s16(colorMask); + const int16x8_t v_256 = vdupq_n_s16(256); + const int16x8_t v_disty = vdupq_n_s16(disty4); + const int16x8_t v_disty_ = vshlq_n_s16(v_disty, 4); + int32x4_t v_fdx = vdupq_n_s32(fdx*4); - int32x4_t v_fx = vmovq_n_s32(fx); - v_fx = vsetq_lane_s32(fx + fdx, v_fx, 1); - v_fx = vsetq_lane_s32(fx + fdx * 2, v_fx, 2); - v_fx = vsetq_lane_s32(fx + fdx * 3, v_fx, 3); + int32x4_t v_fx = vmovq_n_s32(fx); + v_fx = vsetq_lane_s32(fx + fdx, v_fx, 1); + v_fx = vsetq_lane_s32(fx + fdx * 2, v_fx, 2); + v_fx = vsetq_lane_s32(fx + fdx * 3, v_fx, 3); - const int32x4_t v_ffff_mask = vdupq_n_s32(0x0000ffff); - const int32x4_t v_fx_r = vdupq_n_s32(0x0800); + const int32x4_t v_ffff_mask = vdupq_n_s32(0x0000ffff); + const int32x4_t v_fx_r = vdupq_n_s32(0x0800); - while (b < boundedEnd) { - uint32x4x2_t v_top, v_bot; + while (b < boundedEnd - 3) { + uint32x4x2_t v_top, v_bot; - int x1 = (fx 
>> 16); - fx += fdx; - v_top = vld2q_lane_u32(s1 + x1, v_top, 0); - v_bot = vld2q_lane_u32(s2 + x1, v_bot, 0); - x1 = (fx >> 16); - fx += fdx; - v_top = vld2q_lane_u32(s1 + x1, v_top, 1); - v_bot = vld2q_lane_u32(s2 + x1, v_bot, 1); - x1 = (fx >> 16); - fx += fdx; - v_top = vld2q_lane_u32(s1 + x1, v_top, 2); - v_bot = vld2q_lane_u32(s2 + x1, v_bot, 2); - x1 = (fx >> 16); - fx += fdx; - v_top = vld2q_lane_u32(s1 + x1, v_top, 3); - v_bot = vld2q_lane_u32(s2 + x1, v_bot, 3); - - int32x4_t v_distx = vshrq_n_s32(vaddq_s32(vandq_s32(v_fx, v_ffff_mask), v_fx_r), 12); - v_distx = vorrq_s32(v_distx, vshlq_n_s32(v_distx, 16)); - - interpolate_4_pixels_16_neon( - vreinterpretq_s16_u32(v_top.val[0]), vreinterpretq_s16_u32(v_top.val[1]), - vreinterpretq_s16_u32(v_bot.val[0]), vreinterpretq_s16_u32(v_bot.val[1]), - vreinterpretq_s16_s32(v_distx), v_disty, v_disty_, - colorMask, invColorMask, v_256, b); - b+=4; - v_fx = vaddq_s32(v_fx, v_fdx); - } + int x1 = (fx >> 16); + fx += fdx; + v_top = vld2q_lane_u32(s1 + x1, v_top, 0); + v_bot = vld2q_lane_u32(s2 + x1, v_bot, 0); + x1 = (fx >> 16); + fx += fdx; + v_top = vld2q_lane_u32(s1 + x1, v_top, 1); + v_bot = vld2q_lane_u32(s2 + x1, v_bot, 1); + x1 = (fx >> 16); + fx += fdx; + v_top = vld2q_lane_u32(s1 + x1, v_top, 2); + v_bot = vld2q_lane_u32(s2 + x1, v_bot, 2); + x1 = (fx >> 16); + fx += fdx; + v_top = vld2q_lane_u32(s1 + x1, v_top, 3); + v_bot = vld2q_lane_u32(s2 + x1, v_bot, 3); + + int32x4_t v_distx = vshrq_n_s32(vaddq_s32(vandq_s32(v_fx, v_ffff_mask), v_fx_r), 12); + v_distx = vorrq_s32(v_distx, vshlq_n_s32(v_distx, 16)); + + interpolate_4_pixels_16_neon( + vreinterpretq_s16_u32(v_top.val[0]), vreinterpretq_s16_u32(v_top.val[1]), + vreinterpretq_s16_u32(v_bot.val[0]), vreinterpretq_s16_u32(v_bot.val[1]), + vreinterpretq_s16_s32(v_distx), v_disty, v_disty_, + colorMask, invColorMask, v_256, b); + b+=4; + v_fx = vaddq_s32(v_fx, v_fdx); + } #endif - } + while (b < boundedEnd) { + int x = (fx >> 16); +#if defined(__SSE2__) || 
defined(__ARM_NEON__) + int distx8 = (fx & 0x0000ffff) >> 8; + *b = interpolate_4_pixels(s1 + x, s2 + x, distx8, disty8); +#else + uint tl = s1[x]; + uint tr = s1[x + 1]; + uint bl = s2[x]; + uint br = s2[x + 1]; + int distx4 = ((fx & 0x0000ffff) + 0x0800) >> 12; + *b = interpolate_4_pixels_16(tl, tr, bl, br, distx4, disty4); +#endif + fx += fdx; + ++b; + } + } - while (b < end) { - int x1 = (fx >> 16); - int x2; - fetchTransformedBilinear_pixelBounds<blendType>(image_width, image_x1, image_x2, x1, x2); - uint tl = s1[x1]; - uint tr = s1[x2]; - uint bl = s2[x1]; - uint br = s2[x2]; + while (b < end) { + int x1 = (fx >> 16); + int x2; + fetchTransformedBilinear_pixelBounds<blendType>(image.width, image.x1, image.x2 - 1, x1, x2); + uint tl = s1[x1]; + uint tr = s1[x2]; + uint bl = s2[x1]; + uint br = s2[x2]; #if defined(__SSE2__) || defined(__ARM_NEON__) - // The optimized interpolate_4_pixels are faster than interpolate_4_pixels_16. - int distx8 = (fx & 0x0000ffff) >> 8; - *b = interpolate_4_pixels(tl, tr, bl, br, distx8, disty8); + // The optimized interpolate_4_pixels are faster than interpolate_4_pixels_16. + int distx8 = (fx & 0x0000ffff) >> 8; + *b = interpolate_4_pixels(tl, tr, bl, br, distx8, disty8); #else - int distx4 = ((fx & 0x0000ffff) + 0x0800) >> 12; - *b = interpolate_4_pixels_16(tl, tr, bl, br, distx4, disty4); + int distx4 = ((fx & 0x0000ffff) + 0x0800) >> 12; + *b = interpolate_4_pixels_16(tl, tr, bl, br, distx4, disty4); #endif - fx += fdx; - ++b; - } - } - } else { //rotation - if (std::abs(data->m11) < (1./8.) || std::abs(data->m22) < (1./8.)) { - //if we are zooming more than 8 times, we use 8bit precision for the position. 
- while (b < end) { - int x1 = (fx >> 16); - int x2; - int y1 = (fy >> 16); - int y2; + fx += fdx; + ++b; + } +} - fetchTransformedBilinear_pixelBounds<blendType>(image_width, image_x1, image_x2, x1, x2); - fetchTransformedBilinear_pixelBounds<blendType>(image_height, image_y1, image_y2, y1, y2); +template<TextureBlendType blendType> +static void QT_FASTCALL fetchTransformedBilinearARGB32PM_rotate_helper(uint *b, uint *end, const QTextureData &image, + int &fx, int &fy, int fdx, int fdy) +{ + // if we are zooming more than 8 times, we use 8bit precision for the position. + while (b < end) { + int x1 = (fx >> 16); + int x2; + int y1 = (fy >> 16); + int y2; - const uint *s1 = (const uint *)data->texture.scanLine(y1); - const uint *s2 = (const uint *)data->texture.scanLine(y2); + fetchTransformedBilinear_pixelBounds<blendType>(image.width, image.x1, image.x2 - 1, x1, x2); + fetchTransformedBilinear_pixelBounds<blendType>(image.height, image.y1, image.y2 - 1, y1, y2); - uint tl = s1[x1]; - uint tr = s1[x2]; - uint bl = s2[x1]; - uint br = s2[x2]; + const uint *s1 = (const uint *)image.scanLine(y1); + const uint *s2 = (const uint *)image.scanLine(y2); - int distx = (fx & 0x0000ffff) >> 8; - int disty = (fy & 0x0000ffff) >> 8; + uint tl = s1[x1]; + uint tr = s1[x2]; + uint bl = s2[x1]; + uint br = s2[x2]; - *b = interpolate_4_pixels(tl, tr, bl, br, distx, disty); + int distx = (fx & 0x0000ffff) >> 8; + int disty = (fy & 0x0000ffff) >> 8; - fx += fdx; - fy += fdy; - ++b; - } - } else { - //we are zooming less than 8x, use 4bit precision - - if (blendType != BlendTransformedBilinearTiled) { -#define BILINEAR_ROTATE_BOUNDS_PROLOG \ - const qint64 min_fx = qint64(image_x1) * fixed_scale; \ - const qint64 max_fx = qint64(image_x2) * fixed_scale; \ - const qint64 min_fy = qint64(image_y1) * fixed_scale; \ - const qint64 max_fy = qint64(image_y2) * fixed_scale; \ - while (b < end) { \ - int x1 = (fx >> 16); \ - int x2; \ - int y1 = (fy >> 16); \ - int y2; \ - 
fetchTransformedBilinear_pixelBounds<blendType>(image_width, image_x1, image_x2, x1, x2); \ - fetchTransformedBilinear_pixelBounds<blendType>(image_height, image_y1, image_y2, y1, y2); \ - if (x1 != x2 && y1 != y2) \ - break; \ - const uint *s1 = (const uint *)data->texture.scanLine(y1); \ - const uint *s2 = (const uint *)data->texture.scanLine(y2); \ - uint tl = s1[x1]; \ - uint tr = s1[x2]; \ - uint bl = s2[x1]; \ - uint br = s2[x2]; \ - int distx = (fx & 0x0000ffff) >> 8; \ - int disty = (fy & 0x0000ffff) >> 8; \ - *b = interpolate_4_pixels(tl, tr, bl, br, distx, disty); \ - fx += fdx; \ - fy += fdy; \ - ++b; \ - } \ - uint *boundedEnd = end; \ - if (fdx > 0) \ - boundedEnd = qMin(boundedEnd, b + (max_fx - fx) / fdx); \ - else if (fdx < 0) \ - boundedEnd = qMin(boundedEnd, b + (min_fx - fx) / fdx); \ - if (fdy > 0) \ - boundedEnd = qMin(boundedEnd, b + (max_fy - fy) / fdy); \ - else if (fdy < 0) \ - boundedEnd = qMin(boundedEnd, b + (min_fy - fy) / fdy); \ - boundedEnd -= 3; + *b = interpolate_4_pixels(tl, tr, bl, br, distx, disty); -#if defined(__SSE2__) - BILINEAR_ROTATE_BOUNDS_PROLOG + fx += fdx; + fy += fdy; + ++b; + } +} - const __m128i colorMask = _mm_set1_epi32(0x00ff00ff); - const __m128i v_256 = _mm_set1_epi16(256); - const __m128i v_fdx = _mm_set1_epi32(fdx*4); - const __m128i v_fdy = _mm_set1_epi32(fdy*4); - const __m128i v_fxy_r = _mm_set1_epi32(0x8); - __m128i v_fx = _mm_setr_epi32(fx, fx + fdx, fx + fdx + fdx, fx + fdx + fdx + fdx); - __m128i v_fy = _mm_setr_epi32(fy, fy + fdy, fy + fdy + fdy, fy + fdy + fdy + fdy); +template<TextureBlendType blendType> +static void QT_FASTCALL fetchTransformedBilinearARGB32PM_fast_rotate_helper(uint *b, uint *end, const QTextureData &image, + int &fx, int &fy, int fdx, int fdy) +{ + //we are zooming less than 8x, use 4bit precision + if (blendType != BlendTransformedBilinearTiled) { + const qint64 min_fx = qint64(image.x1) * fixed_scale; + const qint64 max_fx = qint64(image.x2 - 1) * fixed_scale; + const qint64 
min_fy = qint64(image.y1) * fixed_scale; + const qint64 max_fy = qint64(image.y2 - 1) * fixed_scale; + // first handle the possibly bounded part in the beginning + while (b < end) { + int x1 = (fx >> 16); + int x2; + int y1 = (fy >> 16); + int y2; + fetchTransformedBilinear_pixelBounds<blendType>(image.width, image.x1, image.x2 - 1, x1, x2); + fetchTransformedBilinear_pixelBounds<blendType>(image.height, image.y1, image.y2 - 1, y1, y2); + if (x1 != x2 && y1 != y2) + break; + const uint *s1 = (const uint *)image.scanLine(y1); + const uint *s2 = (const uint *)image.scanLine(y2); + uint tl = s1[x1]; + uint tr = s1[x2]; + uint bl = s2[x1]; + uint br = s2[x2]; +#if defined(__SSE2__) || defined(__ARM_NEON__) + int distx = (fx & 0x0000ffff) >> 8; + int disty = (fy & 0x0000ffff) >> 8; + *b = interpolate_4_pixels(tl, tr, bl, br, distx, disty); +#else + int distx = ((fx & 0x0000ffff) + 0x0800) >> 12; + int disty = ((fy & 0x0000ffff) + 0x0800) >> 12; + *b = interpolate_4_pixels_16(tl, tr, bl, br, distx, disty); +#endif + fx += fdx; + fy += fdy; + ++b; + } + uint *boundedEnd = end; \ + if (fdx > 0) \ + boundedEnd = qMin(boundedEnd, b + (max_fx - fx) / fdx); \ + else if (fdx < 0) \ + boundedEnd = qMin(boundedEnd, b + (min_fx - fx) / fdx); \ + if (fdy > 0) \ + boundedEnd = qMin(boundedEnd, b + (max_fy - fy) / fdy); \ + else if (fdy < 0) \ + boundedEnd = qMin(boundedEnd, b + (min_fy - fy) / fdy); \ + + // until boundedEnd we can now have a fast middle part without boundary checks +#if defined(__SSE2__) + const __m128i colorMask = _mm_set1_epi32(0x00ff00ff); + const __m128i v_256 = _mm_set1_epi16(256); + const __m128i v_fdx = _mm_set1_epi32(fdx*4); + const __m128i v_fdy = _mm_set1_epi32(fdy*4); + const __m128i v_fxy_r = _mm_set1_epi32(0x8); + __m128i v_fx = _mm_setr_epi32(fx, fx + fdx, fx + fdx + fdx, fx + fdx + fdx + fdx); + __m128i v_fy = _mm_setr_epi32(fy, fy + fdy, fy + fdy + fdy, fy + fdy + fdy + fdy); + + const uchar *textureData = image.imageData; + const int bytesPerLine = 
image.bytesPerLine; + const __m128i vbpl = _mm_shufflelo_epi16(_mm_cvtsi32_si128(bytesPerLine/4), _MM_SHUFFLE(0, 0, 0, 0)); + + while (b < boundedEnd - 3) { + const __m128i vy = _mm_packs_epi32(_mm_srli_epi32(v_fy, 16), _mm_setzero_si128()); + // 4x16bit * 4x16bit -> 4x32bit + __m128i offset = _mm_unpacklo_epi16(_mm_mullo_epi16(vy, vbpl), _mm_mulhi_epi16(vy, vbpl)); + offset = _mm_add_epi32(offset, _mm_srli_epi32(v_fx, 16)); + const int offset0 = _mm_cvtsi128_si32(offset); offset = _mm_srli_si128(offset, 4); + const int offset1 = _mm_cvtsi128_si32(offset); offset = _mm_srli_si128(offset, 4); + const int offset2 = _mm_cvtsi128_si32(offset); offset = _mm_srli_si128(offset, 4); + const int offset3 = _mm_cvtsi128_si32(offset); + const uint *topData = (const uint *)(textureData); + const __m128i tl = _mm_setr_epi32(topData[offset0], topData[offset1], topData[offset2], topData[offset3]); + const __m128i tr = _mm_setr_epi32(topData[offset0 + 1], topData[offset1 + 1], topData[offset2 + 1], topData[offset3 + 1]); + const uint *bottomData = (const uint *)(textureData + bytesPerLine); + const __m128i bl = _mm_setr_epi32(bottomData[offset0], bottomData[offset1], bottomData[offset2], bottomData[offset3]); + const __m128i br = _mm_setr_epi32(bottomData[offset0 + 1], bottomData[offset1 + 1], bottomData[offset2 + 1], bottomData[offset3 + 1]); + + __m128i v_distx = _mm_srli_epi16(v_fx, 8); + __m128i v_disty = _mm_srli_epi16(v_fy, 8); + v_distx = _mm_srli_epi16(_mm_add_epi32(v_distx, v_fxy_r), 4); + v_disty = _mm_srli_epi16(_mm_add_epi32(v_disty, v_fxy_r), 4); + v_distx = _mm_shufflehi_epi16(v_distx, _MM_SHUFFLE(2,2,0,0)); + v_distx = _mm_shufflelo_epi16(v_distx, _MM_SHUFFLE(2,2,0,0)); + v_disty = _mm_shufflehi_epi16(v_disty, _MM_SHUFFLE(2,2,0,0)); + v_disty = _mm_shufflelo_epi16(v_disty, _MM_SHUFFLE(2,2,0,0)); + + interpolate_4_pixels_16_sse2(tl, tr, bl, br, v_distx, v_disty, colorMask, v_256, b); + b += 4; + v_fx = _mm_add_epi32(v_fx, v_fdx); + v_fy = _mm_add_epi32(v_fy, v_fdy); + 
} + fx = _mm_cvtsi128_si32(v_fx); + fy = _mm_cvtsi128_si32(v_fy); +#elif defined(__ARM_NEON__) + const int16x8_t colorMask = vdupq_n_s16(0x00ff); + const int16x8_t invColorMask = vmvnq_s16(colorMask); + const int16x8_t v_256 = vdupq_n_s16(256); + int32x4_t v_fdx = vdupq_n_s32(fdx * 4); + int32x4_t v_fdy = vdupq_n_s32(fdy * 4); + + const uchar *textureData = image.imageData; + const int bytesPerLine = image.bytesPerLine; + + int32x4_t v_fx = vmovq_n_s32(fx); + int32x4_t v_fy = vmovq_n_s32(fy); + v_fx = vsetq_lane_s32(fx + fdx, v_fx, 1); + v_fy = vsetq_lane_s32(fy + fdy, v_fy, 1); + v_fx = vsetq_lane_s32(fx + fdx * 2, v_fx, 2); + v_fy = vsetq_lane_s32(fy + fdy * 2, v_fy, 2); + v_fx = vsetq_lane_s32(fx + fdx * 3, v_fx, 3); + v_fy = vsetq_lane_s32(fy + fdy * 3, v_fy, 3); + + const int32x4_t v_ffff_mask = vdupq_n_s32(0x0000ffff); + const int32x4_t v_round = vdupq_n_s32(0x0800); + + while (b < boundedEnd - 3) { + uint32x4x2_t v_top, v_bot; + + int x1 = (fx >> 16); + int y1 = (fy >> 16); + fx += fdx; fy += fdy; + const uchar *sl = textureData + bytesPerLine * y1; + const uint *s1 = reinterpret_cast<const uint *>(sl); + const uint *s2 = reinterpret_cast<const uint *>(sl + bytesPerLine); + v_top = vld2q_lane_u32(s1 + x1, v_top, 0); + v_bot = vld2q_lane_u32(s2 + x1, v_bot, 0); + x1 = (fx >> 16); + y1 = (fy >> 16); + fx += fdx; fy += fdy; + sl = textureData + bytesPerLine * y1; + s1 = reinterpret_cast<const uint *>(sl); + s2 = reinterpret_cast<const uint *>(sl + bytesPerLine); + v_top = vld2q_lane_u32(s1 + x1, v_top, 1); + v_bot = vld2q_lane_u32(s2 + x1, v_bot, 1); + x1 = (fx >> 16); + y1 = (fy >> 16); + fx += fdx; fy += fdy; + sl = textureData + bytesPerLine * y1; + s1 = reinterpret_cast<const uint *>(sl); + s2 = reinterpret_cast<const uint *>(sl + bytesPerLine); + v_top = vld2q_lane_u32(s1 + x1, v_top, 2); + v_bot = vld2q_lane_u32(s2 + x1, v_bot, 2); + x1 = (fx >> 16); + y1 = (fy >> 16); + fx += fdx; fy += fdy; + sl = textureData + bytesPerLine * y1; + s1 = 
reinterpret_cast<const uint *>(sl); + s2 = reinterpret_cast<const uint *>(sl + bytesPerLine); + v_top = vld2q_lane_u32(s1 + x1, v_top, 3); + v_bot = vld2q_lane_u32(s2 + x1, v_bot, 3); + + int32x4_t v_distx = vshrq_n_s32(vaddq_s32(vandq_s32(v_fx, v_ffff_mask), v_round), 12); + int32x4_t v_disty = vshrq_n_s32(vaddq_s32(vandq_s32(v_fy, v_ffff_mask), v_round), 12); + v_distx = vorrq_s32(v_distx, vshlq_n_s32(v_distx, 16)); + v_disty = vorrq_s32(v_disty, vshlq_n_s32(v_disty, 16)); + int16x8_t v_disty_ = vshlq_n_s16(vreinterpretq_s16_s32(v_disty), 4); + + interpolate_4_pixels_16_neon( + vreinterpretq_s16_u32(v_top.val[0]), vreinterpretq_s16_u32(v_top.val[1]), + vreinterpretq_s16_u32(v_bot.val[0]), vreinterpretq_s16_u32(v_bot.val[1]), + vreinterpretq_s16_s32(v_distx), vreinterpretq_s16_s32(v_disty), + v_disty_, colorMask, invColorMask, v_256, b); + b += 4; + v_fx = vaddq_s32(v_fx, v_fdx); + v_fy = vaddq_s32(v_fy, v_fdy); + } +#endif + while (b < boundedEnd) { + int x = (fx >> 16); + int y = (fy >> 16); - const uchar *textureData = data->texture.imageData; - const int bytesPerLine = data->texture.bytesPerLine; - const __m128i vbpl = _mm_shufflelo_epi16(_mm_cvtsi32_si128(bytesPerLine/4), _MM_SHUFFLE(0, 0, 0, 0)); + const uint *s1 = (const uint *)image.scanLine(y); + const uint *s2 = (const uint *)image.scanLine(y + 1); - while (b < boundedEnd) { - const __m128i vy = _mm_packs_epi32(_mm_srli_epi32(v_fy, 16), _mm_setzero_si128()); - // 4x16bit * 4x16bit -> 4x32bit - __m128i offset = _mm_unpacklo_epi16(_mm_mullo_epi16(vy, vbpl), _mm_mulhi_epi16(vy, vbpl)); - offset = _mm_add_epi32(offset, _mm_srli_epi32(v_fx, 16)); - const int offset0 = _mm_cvtsi128_si32(offset); offset = _mm_srli_si128(offset, 4); - const int offset1 = _mm_cvtsi128_si32(offset); offset = _mm_srli_si128(offset, 4); - const int offset2 = _mm_cvtsi128_si32(offset); offset = _mm_srli_si128(offset, 4); - const int offset3 = _mm_cvtsi128_si32(offset); - const uint *topData = (const uint *)(textureData); - const 
__m128i tl = _mm_setr_epi32(topData[offset0], topData[offset1], topData[offset2], topData[offset3]); - const __m128i tr = _mm_setr_epi32(topData[offset0 + 1], topData[offset1 + 1], topData[offset2 + 1], topData[offset3 + 1]); - const uint *bottomData = (const uint *)(textureData + bytesPerLine); - const __m128i bl = _mm_setr_epi32(bottomData[offset0], bottomData[offset1], bottomData[offset2], bottomData[offset3]); - const __m128i br = _mm_setr_epi32(bottomData[offset0 + 1], bottomData[offset1 + 1], bottomData[offset2 + 1], bottomData[offset3 + 1]); - - __m128i v_distx = _mm_srli_epi16(v_fx, 8); - __m128i v_disty = _mm_srli_epi16(v_fy, 8); - v_distx = _mm_srli_epi16(_mm_add_epi32(v_distx, v_fxy_r), 4); - v_disty = _mm_srli_epi16(_mm_add_epi32(v_disty, v_fxy_r), 4); - v_distx = _mm_shufflehi_epi16(v_distx, _MM_SHUFFLE(2,2,0,0)); - v_distx = _mm_shufflelo_epi16(v_distx, _MM_SHUFFLE(2,2,0,0)); - v_disty = _mm_shufflehi_epi16(v_disty, _MM_SHUFFLE(2,2,0,0)); - v_disty = _mm_shufflelo_epi16(v_disty, _MM_SHUFFLE(2,2,0,0)); - - interpolate_4_pixels_16_sse2(tl, tr, bl, br, v_distx, v_disty, colorMask, v_256, b); - b += 4; - v_fx = _mm_add_epi32(v_fx, v_fdx); - v_fy = _mm_add_epi32(v_fy, v_fdy); - } - fx = _mm_cvtsi128_si32(v_fx); - fy = _mm_cvtsi128_si32(v_fy); -#elif defined(__ARM_NEON__) - BILINEAR_ROTATE_BOUNDS_PROLOG +#if defined(__SSE2__) || defined(__ARM_NEON__) + int distx = (fx & 0x0000ffff) >> 8; + int disty = (fy & 0x0000ffff) >> 8; + *b = interpolate_4_pixels(s1 + x, s2 + x, distx, disty); +#else + uint tl = s1[x]; + uint tr = s1[x + 1]; + uint bl = s2[x]; + uint br = s2[x + 1]; + int distx = ((fx & 0x0000ffff) + 0x0800) >> 12; + int disty = ((fy & 0x0000ffff) + 0x0800) >> 12; + *b = interpolate_4_pixels_16(tl, tr, bl, br, distx, disty); +#endif - const int16x8_t colorMask = vdupq_n_s16(0x00ff); - const int16x8_t invColorMask = vmvnq_s16(colorMask); - const int16x8_t v_256 = vdupq_n_s16(256); - int32x4_t v_fdx = vdupq_n_s32(fdx * 4); - int32x4_t v_fdy = 
vdupq_n_s32(fdy * 4); + fx += fdx; + fy += fdy; + ++b; + } + } - const uchar *textureData = data->texture.imageData; - const int bytesPerLine = data->texture.bytesPerLine; + while (b < end) { + int x1 = (fx >> 16); + int x2; + int y1 = (fy >> 16); + int y2; - int32x4_t v_fx = vmovq_n_s32(fx); - int32x4_t v_fy = vmovq_n_s32(fy); - v_fx = vsetq_lane_s32(fx + fdx, v_fx, 1); - v_fy = vsetq_lane_s32(fy + fdy, v_fy, 1); - v_fx = vsetq_lane_s32(fx + fdx * 2, v_fx, 2); - v_fy = vsetq_lane_s32(fy + fdy * 2, v_fy, 2); - v_fx = vsetq_lane_s32(fx + fdx * 3, v_fx, 3); - v_fy = vsetq_lane_s32(fy + fdy * 3, v_fy, 3); + fetchTransformedBilinear_pixelBounds<blendType>(image.width, image.x1, image.x2 - 1, x1, x2); + fetchTransformedBilinear_pixelBounds<blendType>(image.height, image.y1, image.y2 - 1, y1, y2); - const int32x4_t v_ffff_mask = vdupq_n_s32(0x0000ffff); - const int32x4_t v_round = vdupq_n_s32(0x0800); + const uint *s1 = (const uint *)image.scanLine(y1); + const uint *s2 = (const uint *)image.scanLine(y2); - while (b < boundedEnd) { - uint32x4x2_t v_top, v_bot; + uint tl = s1[x1]; + uint tr = s1[x2]; + uint bl = s2[x1]; + uint br = s2[x2]; - int x1 = (fx >> 16); - int y1 = (fy >> 16); - fx += fdx; fy += fdy; - const uchar *sl = textureData + bytesPerLine * y1; - const uint *s1 = reinterpret_cast<const uint *>(sl); - const uint *s2 = reinterpret_cast<const uint *>(sl + bytesPerLine); - v_top = vld2q_lane_u32(s1 + x1, v_top, 0); - v_bot = vld2q_lane_u32(s2 + x1, v_bot, 0); - x1 = (fx >> 16); - y1 = (fy >> 16); - fx += fdx; fy += fdy; - sl = textureData + bytesPerLine * y1; - s1 = reinterpret_cast<const uint *>(sl); - s2 = reinterpret_cast<const uint *>(sl + bytesPerLine); - v_top = vld2q_lane_u32(s1 + x1, v_top, 1); - v_bot = vld2q_lane_u32(s2 + x1, v_bot, 1); - x1 = (fx >> 16); - y1 = (fy >> 16); - fx += fdx; fy += fdy; - sl = textureData + bytesPerLine * y1; - s1 = reinterpret_cast<const uint *>(sl); - s2 = reinterpret_cast<const uint *>(sl + bytesPerLine); - v_top = 
vld2q_lane_u32(s1 + x1, v_top, 2); - v_bot = vld2q_lane_u32(s2 + x1, v_bot, 2); - x1 = (fx >> 16); - y1 = (fy >> 16); - fx += fdx; fy += fdy; - sl = textureData + bytesPerLine * y1; - s1 = reinterpret_cast<const uint *>(sl); - s2 = reinterpret_cast<const uint *>(sl + bytesPerLine); - v_top = vld2q_lane_u32(s1 + x1, v_top, 3); - v_bot = vld2q_lane_u32(s2 + x1, v_bot, 3); - - int32x4_t v_distx = vshrq_n_s32(vaddq_s32(vandq_s32(v_fx, v_ffff_mask), v_round), 12); - int32x4_t v_disty = vshrq_n_s32(vaddq_s32(vandq_s32(v_fy, v_ffff_mask), v_round), 12); - v_distx = vorrq_s32(v_distx, vshlq_n_s32(v_distx, 16)); - v_disty = vorrq_s32(v_disty, vshlq_n_s32(v_disty, 16)); - int16x8_t v_disty_ = vshlq_n_s16(vreinterpretq_s16_s32(v_disty), 4); - - interpolate_4_pixels_16_neon( - vreinterpretq_s16_u32(v_top.val[0]), vreinterpretq_s16_u32(v_top.val[1]), - vreinterpretq_s16_u32(v_bot.val[0]), vreinterpretq_s16_u32(v_bot.val[1]), - vreinterpretq_s16_s32(v_distx), vreinterpretq_s16_s32(v_disty), - v_disty_, colorMask, invColorMask, v_256, b); - b += 4; - v_fx = vaddq_s32(v_fx, v_fdx); - v_fy = vaddq_s32(v_fy, v_fdy); - } +#if defined(__SSE2__) || defined(__ARM_NEON__) + // The optimized interpolate_4_pixels are faster than interpolate_4_pixels_16. 
+ int distx = (fx & 0x0000ffff) >> 8; + int disty = (fy & 0x0000ffff) >> 8; + *b = interpolate_4_pixels(tl, tr, bl, br, distx, disty); +#else + int distx = ((fx & 0x0000ffff) + 0x0800) >> 12; + int disty = ((fy & 0x0000ffff) + 0x0800) >> 12; + *b = interpolate_4_pixels_16(tl, tr, bl, br, distx, disty); #endif - } - while (b < end) { - int x1 = (fx >> 16); - int x2; - int y1 = (fy >> 16); - int y2; + fx += fdx; + fy += fdy; + ++b; + } +} - fetchTransformedBilinear_pixelBounds<blendType>(image_width, image_x1, image_x2, x1, x2); - fetchTransformedBilinear_pixelBounds<blendType>(image_height, image_y1, image_y2, y1, y2); - const uint *s1 = (const uint *)data->texture.scanLine(y1); - const uint *s2 = (const uint *)data->texture.scanLine(y2); +static BilinearFastTransformHelper bilinearFastTransformHelperARGB32PM[2][NFastTransformTypes] = { + { + fetchTransformedBilinearARGB32PM_simple_upscale_helper<BlendTransformedBilinear>, + fetchTransformedBilinearARGB32PM_upscale_helper<BlendTransformedBilinear>, + fetchTransformedBilinearARGB32PM_downscale_helper<BlendTransformedBilinear>, + fetchTransformedBilinearARGB32PM_rotate_helper<BlendTransformedBilinear>, + fetchTransformedBilinearARGB32PM_fast_rotate_helper<BlendTransformedBilinear> + }, + { + fetchTransformedBilinearARGB32PM_simple_upscale_helper<BlendTransformedBilinearTiled>, + fetchTransformedBilinearARGB32PM_upscale_helper<BlendTransformedBilinearTiled>, + fetchTransformedBilinearARGB32PM_downscale_helper<BlendTransformedBilinearTiled>, + fetchTransformedBilinearARGB32PM_rotate_helper<BlendTransformedBilinearTiled>, + fetchTransformedBilinearARGB32PM_fast_rotate_helper<BlendTransformedBilinearTiled> + } +}; - uint tl = s1[x1]; - uint tr = s1[x2]; - uint bl = s2[x1]; - uint br = s2[x2]; +template<TextureBlendType blendType> /* blendType = BlendTransformedBilinear or BlendTransformedBilinearTiled */ +static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, const Operator *, + const QSpanData 
*data, int y, int x, + int length) +{ + const qreal cx = x + qreal(0.5); + const qreal cy = y + qreal(0.5); + Q_CONSTEXPR int tiled = (blendType == BlendTransformedBilinearTiled) ? 1 : 0; -#if defined(__SSE2__) || defined(__ARM_NEON__) - // The optimized interpolate_4_pixels are faster than interpolate_4_pixels_16. - int distx = (fx & 0x0000ffff) >> 8; - int disty = (fy & 0x0000ffff) >> 8; - *b = interpolate_4_pixels(tl, tr, bl, br, distx, disty); -#else - int distx = ((fx & 0x0000ffff) + 0x0800) >> 12; - int disty = ((fy & 0x0000ffff) + 0x0800) >> 12; - *b = interpolate_4_pixels_16(tl, tr, bl, br, distx, disty); -#endif + uint *end = buffer + length; + uint *b = buffer; + if (data->fast_matrix) { + // The increment pr x in the scanline + int fdx = (int)(data->m11 * fixed_scale); + int fdy = (int)(data->m12 * fixed_scale); - fx += fdx; - fy += fdy; - ++b; - } + int fx = int((data->m21 * cy + + data->m11 * cx + data->dx) * fixed_scale); + int fy = int((data->m22 * cy + + data->m12 * cx + data->dy) * fixed_scale); + + fx -= half_point; + fy -= half_point; + + if (fdy == 0) { // simple scale, no rotation or shear + if (fdx <= fixed_scale && fdx > 0) { + // simple scale up on X without mirroring + bilinearFastTransformHelperARGB32PM[tiled][SimpleUpscaleTransform](b, end, data->texture, fx, fy, fdx, fdy); + } else if ((fdx < 0 && fdx > -(fixed_scale / 8)) || qAbs(data->m22) < qreal(1./8.)) { + // scale up more than 8x (on either Y or on X mirrored) + bilinearFastTransformHelperARGB32PM[tiled][UpscaleTransform](b, end, data->texture, fx, fy, fdx, fdy); + } else { + // scale down on X (or up on X mirrored less than 8x) + bilinearFastTransformHelperARGB32PM[tiled][DownscaleTransform](b, end, data->texture, fx, fy, fdx, fdy); + } + } else { // rotation or shear + if (qAbs(data->m11) < qreal(1./8.) || qAbs(data->m22) < qreal(1./8.) ) { + // if we are zooming more than 8 times, we use 8bit precision for the position. 
+ bilinearFastTransformHelperARGB32PM[tiled][RotateTransform](b, end, data->texture, fx, fy, fdx, fdy); + } else { + // we are zooming less than 8x, use 4bit precision + bilinearFastTransformHelperARGB32PM[tiled][FastRotateTransform](b, end, data->texture, fx, fy, fdx, fdy); } } } else { + const QTextureData &image = data->texture; + const qreal fdx = data->m11; const qreal fdy = data->m12; const qreal fdw = data->m13; @@ -2491,8 +2630,8 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c int distx = int((px - x1) * 256); int disty = int((py - y1) * 256); - fetchTransformedBilinear_pixelBounds<blendType>(image_width, image_x1, image_x2, x1, x2); - fetchTransformedBilinear_pixelBounds<blendType>(image_height, image_y1, image_y2, y1, y2); + fetchTransformedBilinear_pixelBounds<blendType>(image.width, image.x1, image.x2 - 1, x1, x2); + fetchTransformedBilinear_pixelBounds<blendType>(image.height, image.y1, image.y2 - 1, y1, y2); const uint *s1 = (const uint *)data->texture.scanLine(y1); const uint *s2 = (const uint *)data->texture.scanLine(y2); @@ -2674,7 +2813,7 @@ static const uint *QT_FASTCALL fetchTransformedBilinear(uint *buffer, const Oper layout->convertToARGB32PM(buf1, buf1, len * 2, clut, 0); layout->convertToARGB32PM(buf2, buf2, len * 2, clut, 0); - if ((fdx < 0 && fdx > -(fixed_scale / 8)) || std::abs(data->m22) < (1./8.)) { // scale up more than 8x + if ((fdx < 0 && fdx > -(fixed_scale / 8)) || qAbs(data->m22) < qreal(1./8.)) { // scale up more than 8x int disty = (fy & 0x0000ffff) >> 8; for (int i = 0; i < len; ++i) { int distx = (fracX & 0x0000ffff) >> 8; @@ -2726,7 +2865,7 @@ static const uint *QT_FASTCALL fetchTransformedBilinear(uint *buffer, const Oper layout->convertToARGB32PM(buf1, buf1, len * 2, clut, 0); layout->convertToARGB32PM(buf2, buf2, len * 2, clut, 0); - if (std::abs(data->m11) < (1./8.) || std::abs(data->m22) < (1./8.)) { + if (qAbs(data->m11) < qreal(1./8.) || qAbs(data->m22) < qreal(1./8.) 
) { //if we are zooming more than 8 times, we use 8bit precision for the position. for (int i = 0; i < len; ++i) { int distx = (fracX & 0x0000ffff) >> 8; @@ -3603,27 +3742,23 @@ static inline Operator getOperator(const QSpanData *data, const QSpan *spans, in op.destFetch = destFetchProc[data->rasterBuffer->format]; op.destFetch64 = destFetchProc64[data->rasterBuffer->format]; - if (op.mode == QPainter::CompositionMode_Source) { - switch (data->rasterBuffer->format) { - case QImage::Format_RGB32: - case QImage::Format_ARGB32_Premultiplied: - // don't clear destFetch as it sets up the pointer correctly to save one copy - break; - default: { - if (data->type == QSpanData::Texture && data->texture.const_alpha != 256) + if (op.mode == QPainter::CompositionMode_Source && + (data->type != QSpanData::Texture || data->texture.const_alpha == 256)) { + const QSpan *lastSpan = spans + spanCount; + bool alphaSpans = false; + while (spans < lastSpan) { + if (spans->coverage != 255) { + alphaSpans = true; break; - const QSpan *lastSpan = spans + spanCount; - bool alphaSpans = false; - while (spans < lastSpan) { - if (spans->coverage != 255) { - alphaSpans = true; - break; - } - ++spans; } - if (!alphaSpans) - op.destFetch = 0; + ++spans; } + if (!alphaSpans) { + // If all spans are opaque we do not need to fetch dest. + // But don't clear passthrough destFetch as they are just as fast and save destStore. 
+ if (op.destFetch != destFetchARGB32P) + op.destFetch = 0; + op.destFetch64 = destFetch64Undefined; } } @@ -5191,6 +5326,8 @@ void qBlendTexture(int count, const QSpan *spans, void *userData) case QImage::Format_RGB16: proc = processTextureSpansRGB16[blendType]; break; + case QImage::Format_ARGB32: + case QImage::Format_RGBA8888: case QImage::Format_BGR30: case QImage::Format_A2BGR30_Premultiplied: case QImage::Format_RGB30: @@ -5403,134 +5540,200 @@ inline static void qt_bitmapblit_quint16(QRasterBuffer *rasterBuffer, map, mapWidth, mapHeight, mapStride); } -static void qt_alphamapblit_quint16(QRasterBuffer *rasterBuffer, +static inline void alphamapblend_generic(int coverage, QRgba64 *dest, int x, const QRgba64 &srcLinear, const QRgba64 &src, const QColorProfile *colorProfile) +{ + if (coverage == 0) { + // nothing + } else if (coverage == 255) { + dest[x] = src; + } else { + QRgba64 dstColor = dest[x]; + if (colorProfile) { + if (dstColor.isOpaque()) + dstColor = colorProfile->toLinear(dstColor); + else if (!dstColor.isTransparent()) + dstColor = colorProfile->toLinear(dstColor.unpremultiplied()).premultiplied(); + } + + dstColor = interpolate255(srcLinear, coverage, dstColor, 255 - coverage); + if (colorProfile) { + if (dstColor.isOpaque()) + dstColor = colorProfile->fromLinear(dstColor); + else if (!dstColor.isTransparent()) + dstColor = colorProfile->fromLinear(dstColor.unpremultiplied()).premultiplied(); + } + dest[x] = dstColor; + } +} + +static void qt_alphamapblit_generic(QRasterBuffer *rasterBuffer, int x, int y, const QRgba64 &color, const uchar *map, int mapWidth, int mapHeight, int mapStride, - const QClipData *) + const QClipData *clip, bool useGammaCorrection) { - const quint16 c = color.toRgb16(); - quint16 *dest = reinterpret_cast<quint16*>(rasterBuffer->scanLine(y)) + x; - const int destStride = rasterBuffer->bytesPerLine() / sizeof(quint16); + if (color.isTransparent()) + return; - while (mapHeight--) { - for (int i = 0; i < mapWidth; ++i) { - 
const int coverage = map[i]; + const QColorProfile *colorProfile = nullptr; - if (coverage == 0) { - // nothing - } else if (coverage == 255) { - dest[i] = c; - } else { - int ialpha = 255 - coverage; - dest[i] = BYTE_MUL_RGB16(c, coverage) - + BYTE_MUL_RGB16(dest[i], ialpha); + if (useGammaCorrection) + colorProfile = QGuiApplicationPrivate::instance()->colorProfileForA8Text(); + + QRgba64 srcColor = color; + if (colorProfile) { + if (color.isOpaque()) + srcColor = colorProfile->toLinear(srcColor); + else + srcColor = colorProfile->toLinear(srcColor.unpremultiplied()).premultiplied(); + } + + quint64 buffer[buffer_size]; + const DestFetchProc64 destFetch64 = destFetchProc64[rasterBuffer->format]; + const DestStoreProc64 destStore64 = destStoreProc64[rasterBuffer->format]; + + if (!clip) { + for (int ly = 0; ly < mapHeight; ++ly) { + int i = x; + int length = mapWidth; + while (length > 0) { + int l = qMin(buffer_size, length); + QRgba64 *dest = destFetch64((QRgba64*)buffer, rasterBuffer, i, y + ly, l); + for (int j=0; j < l; ++j) { + const int coverage = map[j + (i - x)]; + alphamapblend_generic(coverage, dest, j, srcColor, color, colorProfile); + } + destStore64(rasterBuffer, i, y + ly, dest, l); + length -= l; + i += l; } + map += mapStride; } - dest += destStride; - map += mapStride; - } -} + } else { + int bottom = qMin(y + mapHeight, rasterBuffer->height()); -static inline void rgbBlendPixel(quint32 *dst, int coverage, int sr, int sg, int sb, const uchar *gamma, const uchar *invgamma) -{ - // Do a gray alphablend... 
- int da = qAlpha(*dst); - int dr = qRed(*dst); - int dg = qGreen(*dst); - int db = qBlue(*dst); + int top = qMax(y, 0); + map += (top - y) * mapStride; + + const_cast<QClipData *>(clip)->initialize(); + for (int yp = top; yp<bottom; ++yp) { + const QClipData::ClipLine &line = clip->m_clipLines[yp]; - if (da != 255 - ) { + for (int i=0; i<line.count; ++i) { + const QSpan &clip = line.spans[i]; - int a = qGray(coverage); - sr = qt_div_255(invgamma[sr] * a); - sg = qt_div_255(invgamma[sg] * a); - sb = qt_div_255(invgamma[sb] * a); + int start = qMax<int>(x, clip.x); + int end = qMin<int>(x + mapWidth, clip.x + clip.len); + Q_ASSERT(clip.len <= buffer_size); + QRgba64 *dest = destFetch64((QRgba64*)buffer, rasterBuffer, start, clip.y, clip.len); - int ia = 255 - a; - dr = qt_div_255(dr * ia); - dg = qt_div_255(dg * ia); - db = qt_div_255(db * ia); + for (int xp=start; xp<end; ++xp) { + const int coverage = map[xp - x]; + alphamapblend_generic(coverage, dest, xp - start, srcColor, color, colorProfile); + } + destStore64(rasterBuffer, start, clip.y, dest, clip.len); + } // for (i -> line.count) + map += mapStride; + } // for (yp -> bottom) + } +} - *dst = ((a + qt_div_255((255 - a) * da)) << 24) - | ((sr + dr) << 16) - | ((sg + dg) << 8) - | ((sb + db)); +static inline void alphamapblend_quint16(int coverage, quint16 *dest, int x, const quint16 srcColor) +{ + if (coverage == 0) { + // nothing + } else if (coverage == 255) { + dest[x] = srcColor; + } else { + dest[x] = BYTE_MUL_RGB16(srcColor, coverage) + + BYTE_MUL_RGB16(dest[x], 255 - coverage); + } +} + +void qt_alphamapblit_quint16(QRasterBuffer *rasterBuffer, + int x, int y, const QRgba64 &color, + const uchar *map, + int mapWidth, int mapHeight, int mapStride, + const QClipData *clip, bool useGammaCorrection) +{ + if (useGammaCorrection) { + qt_alphamapblit_generic(rasterBuffer, x, y, color, map, mapWidth, mapHeight, mapStride, clip, useGammaCorrection); return; } - int mr = qRed(coverage); - int mg = 
qGreen(coverage); - int mb = qBlue(coverage); + const quint16 c = color.toRgb16(); - dr = gamma[dr]; - dg = gamma[dg]; - db = gamma[db]; + if (!clip) { + quint16 *dest = reinterpret_cast<quint16*>(rasterBuffer->scanLine(y)) + x; + const int destStride = rasterBuffer->bytesPerLine() / sizeof(quint16); + while (mapHeight--) { + for (int i = 0; i < mapWidth; ++i) + alphamapblend_quint16(map[i], dest, i, c); + dest += destStride; + map += mapStride; + } + } else { + int top = qMax(y, 0); + int bottom = qMin(y + mapHeight, rasterBuffer->height()); + map += (top - y) * mapStride; - int nr = qt_div_255(sr * mr + dr * (255 - mr)); - int ng = qt_div_255(sg * mg + dg * (255 - mg)); - int nb = qt_div_255(sb * mb + db * (255 - mb)); + const_cast<QClipData *>(clip)->initialize(); + for (int yp = top; yp<bottom; ++yp) { + const QClipData::ClipLine &line = clip->m_clipLines[yp]; - nr = invgamma[nr]; - ng = invgamma[ng]; - nb = invgamma[nb]; + quint16 *dest = reinterpret_cast<quint16*>(rasterBuffer->scanLine(yp)); - *dst = qRgb(nr, ng, nb); -} + for (int i=0; i<line.count; ++i) { + const QSpan &clip = line.spans[i]; + + int start = qMax<int>(x, clip.x); + int end = qMin<int>(x + mapWidth, clip.x + clip.len); -#if defined(Q_OS_WIN) -Q_GUI_EXPORT bool qt_needs_a8_gamma_correction = false; + for (int xp=start; xp<end; ++xp) + alphamapblend_quint16(map[xp - x], dest, xp, c); + } // for (i -> line.count) + map += mapStride; + } // for (yp -> bottom) + } +} -static inline void grayBlendPixel(quint32 *dst, int coverage, int sr, int sg, int sb, const uint *gamma, const uchar *invgamma) +static inline void rgbBlendPixel(quint32 *dst, int coverage, QRgba64 slinear, const QColorProfile *colorProfile, bool useGammaCorrection) { - // Do a gammacorrected gray alphablend... - int dr = qRed(*dst); - int dg = qGreen(*dst); - int db = qBlue(*dst); + // Do a gammacorrected RGB alphablend... + const QRgba64 dlinear = useGammaCorrection ? 
colorProfile->toLinear64(*dst) : QRgba64::fromArgb32(*dst); - dr = gamma[dr]; - dg = gamma[dg]; - db = gamma[db]; + QRgba64 blend = rgbBlend(dlinear, slinear, coverage); - int alpha = coverage; - int ialpha = 255 - alpha; - int nr = qt_div_255(sr * alpha + dr * ialpha); - int ng = qt_div_255(sg * alpha + dg * ialpha); - int nb = qt_div_255(sb * alpha + db * ialpha); + *dst = useGammaCorrection ? colorProfile->fromLinear64(blend) : toArgb32(blend); +} + +static inline void grayBlendPixel(quint32 *dst, int coverage, QRgba64 slinear, const QColorProfile *colorProfile) +{ + // Do a gammacorrected gray alphablend... + const QRgba64 dlinear = colorProfile->toLinear64(*dst); - nr = invgamma[nr]; - ng = invgamma[ng]; - nb = invgamma[nb]; + QRgba64 blend = interpolate255(slinear, coverage, dlinear, 255 - coverage); - *dst = qRgb(nr, ng, nb); + *dst = colorProfile->fromLinear64(blend); } -#endif static void qt_alphamapblit_uint32(QRasterBuffer *rasterBuffer, int x, int y, quint32 color, const uchar *map, int mapWidth, int mapHeight, int mapStride, - const QClipData *clip) + const QClipData *clip, bool useGammaCorrection) { const quint32 c = color; const int destStride = rasterBuffer->bytesPerLine() / sizeof(quint32); -#if defined(Q_OS_WIN) - const QDrawHelperGammaTables *tables = QGuiApplicationPrivate::instance()->gammaTables(); - if (!tables) + const QColorProfile *colorProfile = QGuiApplicationPrivate::instance()->colorProfileForA8Text(); + if (!colorProfile) return; - const uint *gamma = tables->qt_pow_gamma; - const uchar *invgamma = tables->qt_pow_invgamma; - - int sr = gamma[qRed(color)]; - int sg = gamma[qGreen(color)]; - int sb = gamma[qBlue(color)]; + const QRgba64 slinear = colorProfile->toLinear64(c); bool opaque_src = (qAlpha(color) == 255); - bool doGrayBlendPixel = opaque_src && qt_needs_a8_gamma_correction; -#endif + bool doGrayBlendPixel = opaque_src && useGammaCorrection; if (!clip) { quint32 *dest = reinterpret_cast<quint32*>(rasterBuffer->scanLine(y)) + 
x; @@ -5543,13 +5746,9 @@ static void qt_alphamapblit_uint32(QRasterBuffer *rasterBuffer, } else if (coverage == 255) { dest[i] = c; } else { -#if defined(Q_OS_WIN) - if (QSysInfo::WindowsVersion >= QSysInfo::WV_XP && doGrayBlendPixel - && qAlpha(dest[i]) == 255) { - grayBlendPixel(dest+i, coverage, sr, sg, sb, gamma, invgamma); - } else -#endif - { + if (doGrayBlendPixel && qAlpha(dest[i]) == 255) { + grayBlendPixel(dest+i, coverage, slinear, colorProfile); + } else { int ialpha = 255 - coverage; dest[i] = INTERPOLATE_PIXEL_255(c, coverage, dest[i], ialpha); } @@ -5584,13 +5783,9 @@ static void qt_alphamapblit_uint32(QRasterBuffer *rasterBuffer, } else if (coverage == 255) { dest[xp] = c; } else { -#if defined(Q_OS_WIN) - if (QSysInfo::WindowsVersion >= QSysInfo::WV_XP && doGrayBlendPixel - && qAlpha(dest[xp]) == 255) { - grayBlendPixel(dest+xp, coverage, sr, sg, sb, gamma, invgamma); - } else -#endif - { + if (doGrayBlendPixel && qAlpha(dest[xp]) == 255) { + grayBlendPixel(dest+xp, coverage, slinear, colorProfile); + } else { int ialpha = 255 - coverage; dest[xp] = INTERPOLATE_PIXEL_255(c, coverage, dest[xp], ialpha); } @@ -5608,9 +5803,9 @@ static void qt_alphamapblit_argb32(QRasterBuffer *rasterBuffer, int x, int y, const QRgba64 &color, const uchar *map, int mapWidth, int mapHeight, int mapStride, - const QClipData *clip) + const QClipData *clip, bool useGammaCorrection) { - qt_alphamapblit_uint32(rasterBuffer, x, y, color.toArgb32(), map, mapWidth, mapHeight, mapStride, clip); + qt_alphamapblit_uint32(rasterBuffer, x, y, color.toArgb32(), map, mapWidth, mapHeight, mapStride, clip, useGammaCorrection); } #if Q_BYTE_ORDER == Q_LITTLE_ENDIAN @@ -5618,38 +5813,132 @@ static void qt_alphamapblit_rgba8888(QRasterBuffer *rasterBuffer, int x, int y, const QRgba64 &color, const uchar *map, int mapWidth, int mapHeight, int mapStride, - const QClipData *clip) + const QClipData *clip, bool useGammaCorrection) { - qt_alphamapblit_uint32(rasterBuffer, x, y, 
ARGB2RGBA(color.toArgb32()), map, mapWidth, mapHeight, mapStride, clip); + qt_alphamapblit_uint32(rasterBuffer, x, y, ARGB2RGBA(color.toArgb32()), map, mapWidth, mapHeight, mapStride, clip, useGammaCorrection); } #endif -static void qt_alphargbblit_argb32(QRasterBuffer *rasterBuffer, - int x, int y, const QRgba64 &color, - const uint *src, int mapWidth, int mapHeight, int srcStride, - const QClipData *clip) +static inline int qRgbAvg(QRgb rgb) { - const quint32 c = color.toArgb32(); + return (qRed(rgb) * 5 + qGreen(rgb) * 6 + qBlue(rgb) * 5) / 16; +} - int sr = qRed(c); - int sg = qGreen(c); - int sb = qBlue(c); - int sa = qAlpha(c); +static inline void alphargbblend_generic(uint coverage, QRgba64 *dest, int x, const QRgba64 &srcLinear, const QRgba64 &src, const QColorProfile *colorProfile) +{ + if (coverage == 0xff000000) { + // nothing + } else if (coverage == 0xffffffff) { + dest[x] = src; + } else { + QRgba64 dstColor = dest[x]; + if (dstColor.isOpaque()) { + if (colorProfile) + dstColor = colorProfile->toLinear(dstColor); + dstColor = rgbBlend(dstColor, srcLinear, coverage); + if (colorProfile) + dstColor = colorProfile->fromLinear(dstColor); + dest[x] = dstColor; + } else { + // Give up and do a gray alphablend. 
+ if (colorProfile && !dstColor.isTransparent()) + dstColor = colorProfile->toLinear(dstColor.unpremultiplied()).premultiplied(); + const int a = qRgbAvg(coverage); + dstColor = interpolate255(srcLinear, a, dstColor, 255 - a); // weight the source by the averaged coverage a, not by the packed ARGB mask + if (colorProfile && !dstColor.isTransparent()) + dstColor = colorProfile->fromLinear(dstColor.unpremultiplied()).premultiplied(); + dest[x] = dstColor; + } + } +} - const QDrawHelperGammaTables *tables = QGuiApplicationPrivate::instance()->gammaTables(); - if (!tables) +static void qt_alphargbblit_generic(QRasterBuffer *rasterBuffer, + int x, int y, const QRgba64 &color, + const uint *src, int mapWidth, int mapHeight, int srcStride, + const QClipData *clip, bool useGammaCorrection) +{ + if (color.isTransparent()) return; - const uchar *gamma = tables->qt_pow_rgb_gamma; - const uchar *invgamma = tables->qt_pow_rgb_invgamma; + const QColorProfile *colorProfile = nullptr; + + if (useGammaCorrection) + colorProfile = QGuiApplicationPrivate::instance()->colorProfileForA32Text(); // same profile as qt_alphargbblit_argb32 + + QRgba64 srcColor = color; + if (colorProfile) { + if (color.isOpaque()) + srcColor = colorProfile->toLinear(srcColor); + else + srcColor = colorProfile->toLinear(srcColor.unpremultiplied()).premultiplied(); + } + + quint64 buffer[buffer_size]; + const DestFetchProc64 destFetch64 = destFetchProc64[rasterBuffer->format]; + const DestStoreProc64 destStore64 = destStoreProc64[rasterBuffer->format]; + + if (!clip) { + for (int ly = 0; ly < mapHeight; ++ly) { + int i = x; + int length = mapWidth; + while (length > 0) { + int l = qMin(buffer_size, length); + QRgba64 *dest = destFetch64((QRgba64*)buffer, rasterBuffer, i, y + ly, l); + for (int j=0; j < l; ++j) { + const uint coverage = src[j + (i - x)]; + alphargbblend_generic(coverage, dest, j, srcColor, color, colorProfile); + } + destStore64(rasterBuffer, i, y + ly, dest, l); + length -= l; + i += l; + } + src += srcStride; + } + } else { + int bottom = qMin(y + mapHeight, rasterBuffer->height()); + + int top = 
qMax(y, 0); + src += (top - y) * srcStride; + + const_cast<QClipData *>(clip)->initialize(); + for (int yp = top; yp<bottom; ++yp) { + const QClipData::ClipLine &line = clip->m_clipLines[yp]; + + for (int i=0; i<line.count; ++i) { + const QSpan &clip = line.spans[i]; + + int start = qMax<int>(x, clip.x); + int end = qMin<int>(x + mapWidth, clip.x + clip.len); + Q_ASSERT(clip.len <= buffer_size); + QRgba64 *dest = destFetch64((QRgba64*)buffer, rasterBuffer, start, clip.y, clip.len); + + for (int xp=start; xp<end; ++xp) { + const uint coverage = src[xp - x]; + alphargbblend_generic(coverage, dest, xp - start, srcColor, color, colorProfile); + } + destStore64(rasterBuffer, start, clip.y, dest, clip.len); + } // for (i -> line.count) + src += srcStride; + } // for (yp -> bottom) + } +} + +static void qt_alphargbblit_argb32(QRasterBuffer *rasterBuffer, + int x, int y, const QRgba64 &color, + const uint *src, int mapWidth, int mapHeight, int srcStride, + const QClipData *clip, bool useGammaCorrection) +{ + if (color.isTransparent()) + return; - sr = gamma[sr]; - sg = gamma[sg]; - sb = gamma[sb]; + const quint32 c = color.toArgb32(); - if (sa == 0) + const QColorProfile *colorProfile = QGuiApplicationPrivate::instance()->colorProfileForA32Text(); + if (!colorProfile) return; + const QRgba64 slinear = useGammaCorrection ? colorProfile->toLinear64(c) : color; + if (!clip) { quint32 *dst = reinterpret_cast<quint32*>(rasterBuffer->scanLine(y)) + x; const int destStride = rasterBuffer->bytesPerLine() / sizeof(quint32); @@ -5659,7 +5948,16 @@ static void qt_alphargbblit_argb32(QRasterBuffer *rasterBuffer, if (coverage == 0xffffffff) { dst[i] = c; } else if (coverage != 0xff000000) { - rgbBlendPixel(dst+i, coverage, sr, sg, sb, gamma, invgamma); + if (dst[i] >= 0xff000000) { + rgbBlendPixel(dst+i, coverage, slinear, colorProfile, useGammaCorrection); + } else { + // Give up and do a gray blend. 
+ const int a = qRgbAvg(coverage); + if (useGammaCorrection) + grayBlendPixel(dst+i, a, slinear, colorProfile); + else + dst[i] = INTERPOLATE_PIXEL_255(c, a, dst[i], 255 - a); + } } } @@ -5689,7 +5987,16 @@ static void qt_alphargbblit_argb32(QRasterBuffer *rasterBuffer, if (coverage == 0xffffffff) { dst[xp] = c; } else if (coverage != 0xff000000) { - rgbBlendPixel(dst+xp, coverage, sr, sg, sb, gamma, invgamma); + if (dst[xp] >= 0xff000000) { + rgbBlendPixel(dst+xp, coverage, slinear, colorProfile, useGammaCorrection); + } else { + // Give up and do a gray blend. + const int a = qRgbAvg(coverage); + if (useGammaCorrection) + grayBlendPixel(dst+xp, a, slinear, colorProfile); + else + dst[xp] = INTERPOLATE_PIXEL_255(c, a, dst[xp], 255 - coverage); + } } } } // for (i -> line.count) @@ -5822,56 +6129,80 @@ DrawHelper qDrawHelper[QImage::NImageFormats] = qt_gradient_quint16, qt_bitmapblit_quint16, qt_alphamapblit_quint16, - 0, + qt_alphargbblit_generic, qt_rectfill_quint16 }, // Format_ARGB8565_Premultiplied { blend_color_generic, blend_src_generic, - 0, 0, 0, 0 + 0, + qt_alphamapblit_generic, + qt_alphargbblit_generic, + 0 }, // Format_RGB666 { blend_color_generic, blend_src_generic, - 0, 0, 0, 0 + 0, + qt_alphamapblit_generic, + qt_alphargbblit_generic, + 0 }, // Format_ARGB6666_Premultiplied { blend_color_generic, blend_src_generic, - 0, 0, 0, 0 + 0, + qt_alphamapblit_generic, + qt_alphargbblit_generic, + 0 }, // Format_RGB555 { blend_color_generic, blend_src_generic, - 0, 0, 0, 0 + 0, + qt_alphamapblit_generic, + qt_alphargbblit_generic, + 0 }, // Format_ARGB8555_Premultiplied { blend_color_generic, blend_src_generic, - 0, 0, 0, 0 + 0, + qt_alphamapblit_generic, + qt_alphargbblit_generic, + 0 }, // Format_RGB888 { blend_color_generic, blend_src_generic, - 0, 0, 0, 0 + 0, + qt_alphamapblit_generic, + qt_alphargbblit_generic, + 0 }, // Format_RGB444 { blend_color_generic, blend_src_generic, - 0, 0, 0, 0 + 0, + qt_alphamapblit_generic, + qt_alphargbblit_generic, + 0 }, 
// Format_ARGB4444_Premultiplied { blend_color_generic, blend_src_generic, - 0, 0, 0, 0 + 0, + qt_alphamapblit_generic, + qt_alphargbblit_generic, + 0 }, // Format_RGBX8888 { @@ -5881,9 +6212,9 @@ DrawHelper qDrawHelper[QImage::NImageFormats] = #if Q_BYTE_ORDER == Q_LITTLE_ENDIAN qt_alphamapblit_rgba8888, #else - 0, + qt_alphamapblit_generic, #endif - 0, + qt_alphargbblit_generic, qt_rectfill_rgba }, // Format_RGBA8888 @@ -5894,9 +6225,9 @@ DrawHelper qDrawHelper[QImage::NImageFormats] = #if Q_BYTE_ORDER == Q_LITTLE_ENDIAN qt_alphamapblit_rgba8888, #else - 0, + qt_alphamapblit_generic, #endif - 0, + qt_alphargbblit_generic, qt_rectfill_nonpremul_rgba }, // Format_RGB8888_Premultiplied @@ -5907,9 +6238,9 @@ DrawHelper qDrawHelper[QImage::NImageFormats] = #if Q_BYTE_ORDER == Q_LITTLE_ENDIAN qt_alphamapblit_rgba8888, #else - 0, + qt_alphamapblit_generic, #endif - 0, + qt_alphargbblit_generic, qt_rectfill_rgba }, // Format_BGR30 @@ -5917,8 +6248,8 @@ DrawHelper qDrawHelper[QImage::NImageFormats] = blend_color_generic_rgb64, blend_src_generic_rgb64, qt_bitmapblit_rgb30<PixelOrderBGR>, - 0, - 0, + qt_alphamapblit_generic, + qt_alphargbblit_generic, qt_rectfill_rgb30<PixelOrderBGR> }, // Format_A2BGR30_Premultiplied @@ -5926,8 +6257,8 @@ DrawHelper qDrawHelper[QImage::NImageFormats] = blend_color_generic_rgb64, blend_src_generic_rgb64, qt_bitmapblit_rgb30<PixelOrderBGR>, - 0, - 0, + qt_alphamapblit_generic, + qt_alphargbblit_generic, qt_rectfill_rgb30<PixelOrderBGR> }, // Format_RGB30 @@ -5935,8 +6266,8 @@ DrawHelper qDrawHelper[QImage::NImageFormats] = blend_color_generic_rgb64, blend_src_generic_rgb64, qt_bitmapblit_rgb30<PixelOrderRGB>, - 0, - 0, + qt_alphamapblit_generic, + qt_alphargbblit_generic, qt_rectfill_rgb30<PixelOrderRGB> }, // Format_A2RGB30_Premultiplied @@ -5944,22 +6275,26 @@ DrawHelper qDrawHelper[QImage::NImageFormats] = blend_color_generic_rgb64, blend_src_generic_rgb64, qt_bitmapblit_rgb30<PixelOrderRGB>, - 0, - 0, + qt_alphamapblit_generic, + 
qt_alphargbblit_generic, qt_rectfill_rgb30<PixelOrderRGB> }, // Format_Alpha8 { blend_color_generic, blend_src_generic, - 0, 0, 0, + 0, + qt_alphamapblit_generic, + qt_alphargbblit_generic, qt_rectfill_alpha }, // Format_Grayscale8 { blend_color_generic, blend_src_generic, - 0, 0, 0, + 0, + qt_alphamapblit_generic, + qt_alphargbblit_generic, qt_rectfill_gray }, }; @@ -6117,20 +6452,18 @@ static void qInitDrawhelperFunctions() #if defined(QT_COMPILER_SUPPORTS_SSE4_1) if (qCpuHasFeature(SSE4_1)) { -#if !defined(__SSE4_1__) extern const uint *QT_FASTCALL convertARGB32ToARGB32PM_sse4(uint *buffer, const uint *src, int count, const QVector<QRgb> *, QDitherInfo *); extern const uint *QT_FASTCALL convertRGBA8888ToARGB32PM_sse4(uint *buffer, const uint *src, int count, const QVector<QRgb> *, QDitherInfo *); - qPixelLayouts[QImage::Format_ARGB32].convertToARGB32PM = convertARGB32ToARGB32PM_sse4; - qPixelLayouts[QImage::Format_RGBA8888].convertToARGB32PM = convertRGBA8888ToARGB32PM_sse4; -#endif extern const uint *QT_FASTCALL convertARGB32FromARGB32PM_sse4(uint *buffer, const uint *src, int count, const QVector<QRgb> *, QDitherInfo *); extern const uint *QT_FASTCALL convertRGBA8888FromARGB32PM_sse4(uint *buffer, const uint *src, int count, const QVector<QRgb> *, QDitherInfo *); extern const uint *QT_FASTCALL convertRGBXFromARGB32PM_sse4(uint *buffer, const uint *src, int count, const QVector<QRgb> *, QDitherInfo *); + qPixelLayouts[QImage::Format_ARGB32].convertToARGB32PM = convertARGB32ToARGB32PM_sse4; + qPixelLayouts[QImage::Format_RGBA8888].convertToARGB32PM = convertRGBA8888ToARGB32PM_sse4; qPixelLayouts[QImage::Format_ARGB32].convertFromARGB32PM = convertARGB32FromARGB32PM_sse4; qPixelLayouts[QImage::Format_RGBA8888].convertFromARGB32PM = convertRGBA8888FromARGB32PM_sse4; qPixelLayouts[QImage::Format_RGBX8888].convertFromARGB32PM = convertRGBXFromARGB32PM_sse4; @@ -6141,14 +6474,6 @@ static void qInitDrawhelperFunctions() #if defined(QT_COMPILER_SUPPORTS_AVX2) if 
(qCpuHasFeature(AVX2)) { -#if !defined(__AVX2__) - extern const uint *QT_FASTCALL convertARGB32ToARGB32PM_avx2(uint *buffer, const uint *src, int count, - const QVector<QRgb> *, QDitherInfo *); - extern const uint *QT_FASTCALL convertRGBA8888ToARGB32PM_avx2(uint *buffer, const uint *src, int count, - const QVector<QRgb> *, QDitherInfo *); - qPixelLayouts[QImage::Format_ARGB32].convertToARGB32PM = convertARGB32ToARGB32PM_avx2; - qPixelLayouts[QImage::Format_RGBA8888].convertToARGB32PM = convertRGBA8888ToARGB32PM_avx2; -#endif extern void qt_blend_rgb32_on_rgb32_avx2(uchar *destPixels, int dbpl, const uchar *srcPixels, int sbpl, int w, int h, int const_alpha); @@ -6170,6 +6495,17 @@ static void qInitDrawhelperFunctions() qt_functionForMode_C[QPainter::CompositionMode_SourceOver] = comp_func_SourceOver_avx2; qt_functionForModeSolid_C[QPainter::CompositionMode_SourceOver] = comp_func_solid_SourceOver_avx2; qt_functionForMode_C[QPainter::CompositionMode_Source] = comp_func_Source_avx2; + + extern void QT_FASTCALL fetchTransformedBilinearARGB32PM_simple_upscale_helper_avx2(uint *b, uint *end, const QTextureData &image, + int &fx, int &fy, int fdx, int /*fdy*/); + extern void QT_FASTCALL fetchTransformedBilinearARGB32PM_downscale_helper_avx2(uint *b, uint *end, const QTextureData &image, + int &fx, int &fy, int fdx, int /*fdy*/); + extern void QT_FASTCALL fetchTransformedBilinearARGB32PM_fast_rotate_helper_avx2(uint *b, uint *end, const QTextureData &image, + int &fx, int &fy, int fdx, int fdy); + + bilinearFastTransformHelperARGB32PM[0][SimpleUpscaleTransform] = fetchTransformedBilinearARGB32PM_simple_upscale_helper_avx2; + bilinearFastTransformHelperARGB32PM[0][DownscaleTransform] = fetchTransformedBilinearARGB32PM_downscale_helper_avx2; + bilinearFastTransformHelperARGB32PM[0][FastRotateTransform] = fetchTransformedBilinearARGB32PM_fast_rotate_helper_avx2; } #endif @@ -6198,6 +6534,15 @@ static void qInitDrawhelperFunctions() 
sourceFetchUntransformed[QImage::Format_RGB888] = qt_fetchUntransformed_888_neon; +#if defined(Q_PROCESSOR_ARM_64) && Q_BYTE_ORDER == Q_LITTLE_ENDIAN + extern const uint *QT_FASTCALL convertARGB32ToARGB32PM_neon(uint *buffer, const uint *src, int count, + const QVector<QRgb> *, QDitherInfo *); + extern const uint *QT_FASTCALL convertRGBA8888ToARGB32PM_neon(uint *buffer, const uint *src, int count, + const QVector<QRgb> *, QDitherInfo *); + qPixelLayouts[QImage::Format_ARGB32].convertToARGB32PM = convertARGB32ToARGB32PM_neon; + qPixelLayouts[QImage::Format_RGBA8888].convertToARGB32PM = convertRGBA8888ToARGB32PM_neon; +#endif + #if defined(ENABLE_PIXMAN_DRAWHELPERS) // The RGB16 helpers are using Arm32 assemblythat has not been ported to AArch64 qBlendFunctions[QImage::Format_RGB16][QImage::Format_ARGB32_Premultiplied] = qt_blend_argb32_on_rgb16_neon; @@ -6215,8 +6560,8 @@ static void qInitDrawhelperFunctions() destFetchProc[QImage::Format_RGB16] = qt_destFetchRGB16_neon; destStoreProc[QImage::Format_RGB16] = qt_destStoreRGB16_neon; - qMemRotateFunctions[QImage::Format_RGB16][0] = qt_memrotate90_16_neon; - qMemRotateFunctions[QImage::Format_RGB16][2] = qt_memrotate270_16_neon; + qMemRotateFunctions[QPixelLayout::BPP16][0] = qt_memrotate90_16_neon; + qMemRotateFunctions[QPixelLayout::BPP16][2] = qt_memrotate270_16_neon; #endif #endif // defined(__ARM_NEON__) |