From 332816779c42e8a3fed34e9295c09338bc9b4945 Mon Sep 17 00:00:00 2001 From: Allan Sandfeld Jensen Date: Fri, 22 Nov 2019 13:02:31 +0100 Subject: Multithread some QImage routines Use QThreadPool to process QImage smooth-scaling, format conversions, and colorspace transforms multithreaded. Change-Id: Ic142b1fa899f56e7e5099d36ca713701a47b681b Reviewed-by: Lars Knoll --- src/gui/painting/qimagescale.cpp | 794 ++++++++++++++++++++++------------ src/gui/painting/qimagescale_neon.cpp | 218 ++++++---- src/gui/painting/qimagescale_p.h | 2 + src/gui/painting/qimagescale_sse4.cpp | 246 +++++++---- 4 files changed, 818 insertions(+), 442 deletions(-) (limited to 'src/gui/painting') diff --git a/src/gui/painting/qimagescale.cpp b/src/gui/painting/qimagescale.cpp index 2e2f65b483..ecb0230e71 100644 --- a/src/gui/painting/qimagescale.cpp +++ b/src/gui/painting/qimagescale.cpp @@ -43,6 +43,11 @@ #include "qcolor.h" #include "qrgba64_p.h" +#if QT_CONFIG(thread) +#include "qsemaphore.h" +#include "qthreadpool.h" +#endif + QT_BEGIN_NAMESPACE /* @@ -239,6 +244,8 @@ static QImageScaleInfo* QImageScale::qimageCalcScaleInfo(const QImage &img, isi = new QImageScaleInfo; if (!isi) return nullptr; + isi->sh = sh; + isi->sw = sw; isi->xup_yup = (qAbs(dw) >= sw) + ((qAbs(dh) >= sh) << 1); @@ -303,33 +310,54 @@ static void qt_qimageScaleAARGBA_up_xy(QImageScaleInfo *isi, unsigned int *dest, int *yapoints = isi->yapoints; /* go through every scanline in the output buffer */ - for (int y = 0; y < dh; y++) { - /* calculate the source line we'll scan from */ - const unsigned int *sptr = ypoints[y]; - unsigned int *dptr = dest + (y * dow); - const int yap = yapoints[y]; - if (yap > 0) { - for (int x = 0; x < dw; x++) { - const unsigned int *pix = sptr + xpoints[x]; - const int xap = xapoints[x]; - if (xap > 0) - *dptr = interpolate_4_pixels(pix, pix + sow, xap, yap); - else - *dptr = INTERPOLATE_PIXEL_256(pix[0], 256 - yap, pix[sow], yap); - dptr++; - } - } else { - for (int x = 0; x < dw; x++) { - const unsigned int *pix = sptr + xpoints[x]; - const int xap = xapoints[x]; - if (xap > 0) - *dptr = INTERPOLATE_PIXEL_256(pix[0], 256 - xap, pix[1], xap); - else - *dptr = pix[0]; - dptr++; + auto scaleSection = [&] (int yStart, int yEnd) { + for (int y = yStart; y < yEnd; ++y) { + /* calculate the source line we'll scan from */ + const unsigned int *sptr = ypoints[y]; + unsigned int *dptr = dest + (y * dow); + const int yap = yapoints[y]; + if (yap > 0) { + for (int x = 0; x < dw; x++) { + const unsigned int *pix = sptr + xpoints[x]; + const int xap = xapoints[x]; + if (xap > 0) + *dptr = interpolate_4_pixels(pix, pix + sow, xap, yap); + else + *dptr = INTERPOLATE_PIXEL_256(pix[0], 256 - yap, pix[sow], yap); + dptr++; + } + } else { + for (int x = 0; x < dw; x++) { + const unsigned int *pix = sptr + xpoints[x]; + const int xap = xapoints[x]; + if (xap > 0) + *dptr = INTERPOLATE_PIXEL_256(pix[0], 256 - xap, pix[1], xap); + else + *dptr = pix[0]; + dptr++; + } } } + }; +#if QT_CONFIG(thread) + int segments = (qsizetype(isi->sh) * isi->sw) / (1<<16); + segments = std::min(segments, dh); + if (segments > 1) { + QSemaphore semaphore; + int y = 0; + for (int i = 0; i < segments; ++i) { + int yn = (dh - y) / (segments - i); + QThreadPool::globalInstance()->start([&, y, yn]() { + scaleSection(y, y + yn); + semaphore.release(1); + }); + y += yn; + } + semaphore.acquire(segments); + return; } +#endif + scaleSection(0, dh); } /* scale by area sampling - with alpha */ @@ -411,33 +439,54 @@ static void qt_qimageScaleAARGBA_up_x_down_y(QImageScaleInfo *isi, unsigned int int *yapoints = isi->yapoints; /* go through every scanline in the output buffer */ - for (int y = 0; y < dh; y++) { - int Cy = yapoints[y] >> 16; - int yap = yapoints[y] & 0xffff; - - unsigned int *dptr = dest + (y * dow); - for (int x = 0; x < dw; x++) { - const unsigned int *sptr = ypoints[y] + xpoints[x]; - int r, g, b, a; - qt_qimageScaleAARGBA_helper(sptr, yap, Cy, sow, r, g, b, a); - - int xap = xapoints[x]; - if (xap > 0) { - int rr, gg, bb, aa; - qt_qimageScaleAARGBA_helper(sptr + 1, yap, Cy, sow, rr, gg, bb, aa); - - r = r * (256 - xap); - g = g * (256 - xap); - b = b * (256 - xap); - a = a * (256 - xap); - r = (r + (rr * xap)) >> 8; - g = (g + (gg * xap)) >> 8; - b = (b + (bb * xap)) >> 8; - a = (a + (aa * xap)) >> 8; + auto scaleSection = [&] (int yStart, int yEnd) { + for (int y = yStart; y < yEnd; ++y) { + int Cy = yapoints[y] >> 16; + int yap = yapoints[y] & 0xffff; + + unsigned int *dptr = dest + (y * dow); + for (int x = 0; x < dw; x++) { + const unsigned int *sptr = ypoints[y] + xpoints[x]; + int r, g, b, a; + qt_qimageScaleAARGBA_helper(sptr, yap, Cy, sow, r, g, b, a); + + int xap = xapoints[x]; + if (xap > 0) { + int rr, gg, bb, aa; + qt_qimageScaleAARGBA_helper(sptr + 1, yap, Cy, sow, rr, gg, bb, aa); + + r = r * (256 - xap); + g = g * (256 - xap); + b = b * (256 - xap); + a = a * (256 - xap); + r = (r + (rr * xap)) >> 8; + g = (g + (gg * xap)) >> 8; + b = (b + (bb * xap)) >> 8; + a = (a + (aa * xap)) >> 8; + } + *dptr++ = qRgba(r >> 14, g >> 14, b >> 14, a >> 14); } - *dptr++ = qRgba(r >> 14, g >> 14, b >> 14, a >> 14); } + }; +#if QT_CONFIG(thread) + int segments = (qsizetype(isi->sh) * isi->sw) / (1<<16); + segments = std::min(segments, dh); + if (segments > 1) { + QSemaphore semaphore; + int y = 0; + for (int i = 0; i < segments; ++i) { + int yn = (dh - y) / (segments - i); + QThreadPool::globalInstance()->start([&, y, yn]() { + scaleSection(y, y + yn); + semaphore.release(1); + }); + y += yn; + } + semaphore.acquire(segments); + return; } +#endif + scaleSection(0, dh); } static void qt_qimageScaleAARGBA_down_x_up_y(QImageScaleInfo *isi, unsigned int *dest, @@ -449,34 +498,55 @@ static void qt_qimageScaleAARGBA_down_x_up_y(QImageScaleInfo *isi, unsigned int int *yapoints = isi->yapoints; /* go through every scanline in the output buffer */ - for (int y = 0; y < dh; y++) { - unsigned int *dptr = dest + (y * dow); - for (int x = 0; x < dw; x++) { - int Cx = xapoints[x] >> 16; - int xap = xapoints[x] & 0xffff; - - const unsigned int *sptr = ypoints[y] + xpoints[x]; - int r, g, b, a; - qt_qimageScaleAARGBA_helper(sptr, xap, Cx, 1, r, g, b, a); - - int yap = yapoints[y]; - if (yap > 0) { - int rr, gg, bb, aa; - qt_qimageScaleAARGBA_helper(sptr + sow, xap, Cx, 1, rr, gg, bb, aa); - - r = r * (256 - yap); - g = g * (256 - yap); - b = b * (256 - yap); - a = a * (256 - yap); - r = (r + (rr * yap)) >> 8; - g = (g + (gg * yap)) >> 8; - b = (b + (bb * yap)) >> 8; - a = (a + (aa * yap)) >> 8; + auto scaleSection = [&] (int yStart, int yEnd) { + for (int y = yStart; y < yEnd; ++y) { + unsigned int *dptr = dest + (y * dow); + for (int x = 0; x < dw; x++) { + int Cx = xapoints[x] >> 16; + int xap = xapoints[x] & 0xffff; + + const unsigned int *sptr = ypoints[y] + xpoints[x]; + int r, g, b, a; + qt_qimageScaleAARGBA_helper(sptr, xap, Cx, 1, r, g, b, a); + + int yap = yapoints[y]; + if (yap > 0) { + int rr, gg, bb, aa; + qt_qimageScaleAARGBA_helper(sptr + sow, xap, Cx, 1, rr, gg, bb, aa); + + r = r * (256 - yap); + g = g * (256 - yap); + b = b * (256 - yap); + a = a * (256 - yap); + r = (r + (rr * yap)) >> 8; + g = (g + (gg * yap)) >> 8; + b = (b + (bb * yap)) >> 8; + a = (a + (aa * yap)) >> 8; + } + *dptr = qRgba(r >> 14, g >> 14, b >> 14, a >> 14); + dptr++; } - *dptr = qRgba(r >> 14, g >> 14, b >> 14, a >> 14); - dptr++; } + }; +#if QT_CONFIG(thread) + int segments = (qsizetype(isi->sh) * isi->sw) / (1<<16); + segments = std::min(segments, dh); + if (segments > 1) { + QSemaphore semaphore; + int y = 0; + for (int i = 0; i < segments; ++i) { + int yn = (dh - y) / (segments - i); + QThreadPool::globalInstance()->start([&, y, yn]() { + scaleSection(y, y + yn); + semaphore.release(1); + }); + y += yn; + } + semaphore.acquire(segments); + return; } +#endif + scaleSection(0, dh); } static void qt_qimageScaleAARGBA_down_xy(QImageScaleInfo *isi, unsigned int *dest, @@ -487,45 +557,66 @@ static void qt_qimageScaleAARGBA_down_xy(QImageScaleInfo *isi, unsigned int *des int *xapoints = isi->xapoints; int *yapoints = isi->yapoints; - for (int y = 0; y < dh; y++) { - int Cy = (yapoints[y]) >> 16; - int yap = (yapoints[y]) & 0xffff; - - unsigned int *dptr = dest + (y * dow); - for (int x = 0; x < dw; x++) { - int Cx = xapoints[x] >> 16; - int xap = xapoints[x] & 0xffff; + auto scaleSection = [&] (int yStart, int yEnd) { + for (int y = yStart; y < yEnd; ++y) { + int Cy = (yapoints[y]) >> 16; + int yap = (yapoints[y]) & 0xffff; - const unsigned int *sptr = ypoints[y] + xpoints[x]; - int rx, gx, bx, ax; - qt_qimageScaleAARGBA_helper(sptr, xap, Cx, 1, rx, gx, bx, ax); + unsigned int *dptr = dest + (y * dow); + for (int x = 0; x < dw; x++) { + int Cx = xapoints[x] >> 16; + int xap = xapoints[x] & 0xffff; - int r = ((rx>>4) * yap); - int g = ((gx>>4) * yap); - int b = ((bx>>4) * yap); - int a = ((ax>>4) * yap); + const unsigned int *sptr = ypoints[y] + xpoints[x]; + int rx, gx, bx, ax; + qt_qimageScaleAARGBA_helper(sptr, xap, Cx, 1, rx, gx, bx, ax); - int j; - for (j = (1 << 14) - yap; j > Cy; j -= Cy) { + int r = ((rx>>4) * yap); + int g = ((gx>>4) * yap); + int b = ((bx>>4) * yap); + int a = ((ax>>4) * yap); + + int j; + for (j = (1 << 14) - yap; j > Cy; j -= Cy) { + sptr += sow; + qt_qimageScaleAARGBA_helper(sptr, xap, Cx, 1, rx, gx, bx, ax); + r += ((rx>>4) * Cy); + g += ((gx>>4) * Cy); + b += ((bx>>4) * Cy); + a += ((ax>>4) * Cy); + } sptr += sow; qt_qimageScaleAARGBA_helper(sptr, xap, Cx, 1, rx, gx, bx, ax); - r += ((rx>>4) * Cy); - g += ((gx>>4) * Cy); - b += ((bx>>4) * Cy); - a += ((ax>>4) * Cy); - } - sptr += sow; - qt_qimageScaleAARGBA_helper(sptr, xap, Cx, 1, rx, gx, bx, ax); - r += ((rx>>4) * j); - g += ((gx>>4) * j); - b += ((bx>>4) * j); - a += ((ax>>4) * j); + r += ((rx>>4) * j); + g += ((gx>>4) * j); + b += ((bx>>4) * j); + a += ((ax>>4) * j); - *dptr = qRgba(r >> 24, g >> 24, b >> 24, a >> 24); - dptr++; + *dptr = qRgba(r >> 24, g >> 24, b >> 24, a >> 24); + dptr++; + } } + }; +#if QT_CONFIG(thread) + int segments = (qsizetype(isi->sh) * isi->sw) / (1<<16); + segments = std::min(segments, dh); + if (segments > 1) { + QSemaphore semaphore; + int y = 0; + for (int i = 0; i < segments; ++i) { + int yn = (dh - y) / (segments - i); + QThreadPool::globalInstance()->start([&, y, yn]() { + scaleSection(y, y + yn); + semaphore.release(1); + }); + y += yn; + } + semaphore.acquire(segments); + return; } +#endif + scaleSection(0, dh); } #if QT_CONFIG(raster_64bit) @@ -546,32 +637,53 @@ static void qt_qimageScaleRgba64_up_xy(QImageScaleInfo *isi, QRgba64 *dest, int *xapoints = isi->xapoints; int *yapoints = isi->yapoints; - for (int y = 0; y < dh; y++) { - const QRgba64 *sptr = ypoints[y]; - QRgba64 *dptr = dest + (y * dow); - const int yap = yapoints[y]; - if (yap > 0) { - for (int x = 0; x < dw; x++) { - const QRgba64 *pix = sptr + xpoints[x]; - const int xap = xapoints[x]; - if (xap > 0) - *dptr = interpolate_4_pixels_rgb64(pix, pix + sow, xap * 256, yap * 256); - else - *dptr = interpolate256(pix[0], 256 - yap, pix[sow], yap); - dptr++; - } - } else { - for (int x = 0; x < dw; x++) { - const QRgba64 *pix = sptr + xpoints[x]; - const int xap = xapoints[x]; - if (xap > 0) - *dptr = interpolate256(pix[0], 256 - xap, pix[1], xap); - else - *dptr = pix[0]; - dptr++; + auto scaleSection = [&] (int yStart, int yEnd) { + for (int y = yStart; y < yEnd; ++y) { + const QRgba64 *sptr = ypoints[y]; + QRgba64 *dptr = dest + (y * dow); + const int yap = yapoints[y]; + if (yap > 0) { + for (int x = 0; x < dw; x++) { + const QRgba64 *pix = sptr + xpoints[x]; + const int xap = xapoints[x]; + if (xap > 0) + *dptr = interpolate_4_pixels_rgb64(pix, pix + sow, xap * 256, yap * 256); + else + *dptr = interpolate256(pix[0], 256 - yap, pix[sow], yap); + dptr++; + } + } else { + for (int x = 0; x < dw; x++) { + const QRgba64 *pix = sptr + xpoints[x]; + const int xap = xapoints[x]; + if (xap > 0) + *dptr = interpolate256(pix[0], 256 - xap, pix[1], xap); + else + *dptr = pix[0]; + dptr++; + } } } + }; +#if QT_CONFIG(thread) + int segments = (qsizetype(isi->sh) * isi->sw) / (1<<16); + segments = std::min(segments, dh); + if (segments > 1) { + QSemaphore semaphore; + int y = 0; + for (int i = 0; i < segments; ++i) { + int yn = (dh - y) / (segments - i); + QThreadPool::globalInstance()->start([&, y, yn]() { + scaleSection(y, y + yn); + semaphore.release(1); + }); + y += yn; + } + semaphore.acquire(segments); + return; } +#endif + scaleSection(0, dh); } void qt_qimageScaleRgba64(QImageScaleInfo *isi, QRgba64 *dest, @@ -616,33 +728,54 @@ static void qt_qimageScaleRgba64_up_x_down_y(QImageScaleInfo *isi, QRgba64 *dest int *xapoints = isi->xapoints; int *yapoints = isi->yapoints; - for (int y = 0; y < dh; y++) { - int Cy = (yapoints[y]) >> 16; - int yap = (yapoints[y]) & 0xffff; - - QRgba64 *dptr = dest + (y * dow); - for (int x = 0; x < dw; x++) { - const QRgba64 *sptr = ypoints[y] + xpoints[x]; - qint64 r, g, b, a; - qt_qimageScaleRgba64_helper(sptr, yap, Cy, sow, r, g, b, a); - - int xap = xapoints[x]; - if (xap > 0) { - qint64 rr, gg, bb, aa; - qt_qimageScaleRgba64_helper(sptr + 1, yap, Cy, sow, rr, gg, bb, aa); - - r = r * (256 - xap); - g = g * (256 - xap); - b = b * (256 - xap); - a = a * (256 - xap); - r = (r + (rr * xap)) >> 8; - g = (g + (gg * xap)) >> 8; - b = (b + (bb * xap)) >> 8; - a = (a + (aa * xap)) >> 8; + auto scaleSection = [&] (int yStart, int yEnd) { + for (int y = yStart; y < yEnd; ++y) { + int Cy = (yapoints[y]) >> 16; + int yap = (yapoints[y]) & 0xffff; + + QRgba64 *dptr = dest + (y * dow); + for (int x = 0; x < dw; x++) { + const QRgba64 *sptr = ypoints[y] + xpoints[x]; + qint64 r, g, b, a; + qt_qimageScaleRgba64_helper(sptr, yap, Cy, sow, r, g, b, a); + + int xap = xapoints[x]; + if (xap > 0) { + qint64 rr, gg, bb, aa; + qt_qimageScaleRgba64_helper(sptr + 1, yap, Cy, sow, rr, gg, bb, aa); + + r = r * (256 - xap); + g = g * (256 - xap); + b = b * (256 - xap); + a = a * (256 - xap); + r = (r + (rr * xap)) >> 8; + g = (g + (gg * xap)) >> 8; + b = (b + (bb * xap)) >> 8; + a = (a + (aa * xap)) >> 8; + } + *dptr++ = qRgba64(r >> 14, g >> 14, b >> 14, a >> 14); } - *dptr++ = qRgba64(r >> 14, g >> 14, b >> 14, a >> 14); } + }; +#if QT_CONFIG(thread) + int segments = (qsizetype(isi->sh) * isi->sw) / (1<<16); + segments = std::min(segments, dh); + if (segments > 1) { + QSemaphore semaphore; + int y = 0; + for (int i = 0; i < segments; ++i) { + int yn = (dh - y) / (segments - i); + QThreadPool::globalInstance()->start([&, y, yn]() { + scaleSection(y, y + yn); + semaphore.release(1); + }); + y += yn; + } + semaphore.acquire(segments); + return; } +#endif + scaleSection(0, dh); } static void qt_qimageScaleRgba64_down_x_up_y(QImageScaleInfo *isi, QRgba64 *dest, @@ -653,34 +786,55 @@ static void qt_qimageScaleRgba64_down_x_up_y(QImageScaleInfo *isi, QRgba64 *dest int *xapoints = isi->xapoints; int *yapoints = isi->yapoints; - for (int y = 0; y < dh; y++) { - QRgba64 *dptr = dest + (y * dow); - for (int x = 0; x < dw; x++) { - int Cx = xapoints[x] >> 16; - int xap = xapoints[x] & 0xffff; - - const QRgba64 *sptr = ypoints[y] + xpoints[x]; - qint64 r, g, b, a; - qt_qimageScaleRgba64_helper(sptr, xap, Cx, 1, r, g, b, a); - - int yap = yapoints[y]; - if (yap > 0) { - qint64 rr, gg, bb, aa; - qt_qimageScaleRgba64_helper(sptr + sow, xap, Cx, 1, rr, gg, bb, aa); - - r = r * (256 - yap); - g = g * (256 - yap); - b = b * (256 - yap); - a = a * (256 - yap); - r = (r + (rr * yap)) >> 8; - g = (g + (gg * yap)) >> 8; - b = (b + (bb * yap)) >> 8; - a = (a + (aa * yap)) >> 8; + auto scaleSection = [&] (int yStart, int yEnd) { + for (int y = yStart; y < yEnd; ++y) { + QRgba64 *dptr = dest + (y * dow); + for (int x = 0; x < dw; x++) { + int Cx = xapoints[x] >> 16; + int xap = xapoints[x] & 0xffff; + + const QRgba64 *sptr = ypoints[y] + xpoints[x]; + qint64 r, g, b, a; + qt_qimageScaleRgba64_helper(sptr, xap, Cx, 1, r, g, b, a); + + int yap = yapoints[y]; + if (yap > 0) { + qint64 rr, gg, bb, aa; + qt_qimageScaleRgba64_helper(sptr + sow, xap, Cx, 1, rr, gg, bb, aa); + + r = r * (256 - yap); + g = g * (256 - yap); + b = b * (256 - yap); + a = a * (256 - yap); + r = (r + (rr * yap)) >> 8; + g = (g + (gg * yap)) >> 8; + b = (b + (bb * yap)) >> 8; + a = (a + (aa * yap)) >> 8; + } + *dptr = qRgba64(r >> 14, g >> 14, b >> 14, a >> 14); + dptr++; } - *dptr = qRgba64(r >> 14, g >> 14, b >> 14, a >> 14); - dptr++; } + }; +#if QT_CONFIG(thread) + int segments = (qsizetype(isi->sh) * isi->sw) / (1<<16); + segments = std::min(segments, dh); + if (segments > 1) { + QSemaphore semaphore; + int y = 0; + for (int i = 0; i < segments; ++i) { + int yn = (dh - y) / (segments - i); + QThreadPool::globalInstance()->start([&, y, yn]() { + scaleSection(y, y + yn); + semaphore.release(1); + }); + y += yn; + } + semaphore.acquire(segments); + return; } +#endif + scaleSection(0, dh); } static void qt_qimageScaleRgba64_down_xy(QImageScaleInfo *isi, QRgba64 *dest, @@ -691,43 +845,64 @@ static void qt_qimageScaleRgba64_down_xy(QImageScaleInfo *isi, QRgba64 *dest, int *xapoints = isi->xapoints; int *yapoints = isi->yapoints; - for (int y = 0; y < dh; y++) { - int Cy = (yapoints[y]) >> 16; - int yap = (yapoints[y]) & 0xffff; - - QRgba64 *dptr = dest + (y * dow); - for (int x = 0; x < dw; x++) { - int Cx = xapoints[x] >> 16; - int xap = xapoints[x] & 0xffff; - - const QRgba64 *sptr = ypoints[y] + xpoints[x]; - qint64 rx, gx, bx, ax; - qt_qimageScaleRgba64_helper(sptr, xap, Cx, 1, rx, gx, bx, ax); - - qint64 r = rx * yap; - qint64 g = gx * yap; - qint64 b = bx * yap; - qint64 a = ax * yap; - int j; - for (j = (1 << 14) - yap; j > Cy; j -= Cy) { + auto scaleSection = [&] (int yStart, int yEnd) { + for (int y = yStart; y < yEnd; ++y) { + int Cy = (yapoints[y]) >> 16; + int yap = (yapoints[y]) & 0xffff; + + QRgba64 *dptr = dest + (y * dow); + for (int x = 0; x < dw; x++) { + int Cx = xapoints[x] >> 16; + int xap = xapoints[x] & 0xffff; + + const QRgba64 *sptr = ypoints[y] + xpoints[x]; + qint64 rx, gx, bx, ax; + qt_qimageScaleRgba64_helper(sptr, xap, Cx, 1, rx, gx, bx, ax); + + qint64 r = rx * yap; + qint64 g = gx * yap; + qint64 b = bx * yap; + qint64 a = ax * yap; + int j; + for (j = (1 << 14) - yap; j > Cy; j -= Cy) { + sptr += sow; + qt_qimageScaleRgba64_helper(sptr, xap, Cx, 1, rx, gx, bx, ax); + r += rx * Cy; + g += gx * Cy; + b += bx * Cy; + a += ax * Cy; + } sptr += sow; qt_qimageScaleRgba64_helper(sptr, xap, Cx, 1, rx, gx, bx, ax); - r += rx * Cy; - g += gx * Cy; - b += bx * Cy; - a += ax * Cy; + r += rx * j; + g += gx * j; + b += bx * j; + a += ax * j; + + *dptr = qRgba64(r >> 28, g >> 28, b >> 28, a >> 28); + dptr++; } - sptr += sow; - qt_qimageScaleRgba64_helper(sptr, xap, Cx, 1, rx, gx, bx, ax); - r += rx * j; - g += gx * j; - b += bx * j; - a += ax * j; - - *dptr = qRgba64(r >> 28, g >> 28, b >> 28, a >> 28); - dptr++; } + }; +#if QT_CONFIG(thread) + int segments = (qsizetype(isi->sh) * isi->sw) / (1<<16); + segments = std::min(segments, dh); + if (segments > 1) { + QSemaphore semaphore; + int y = 0; + for (int i = 0; i < segments; ++i) { + int yn = (dh - y) / (segments - i); + QThreadPool::globalInstance()->start([&, y, yn]() { + scaleSection(y, y + yn); + semaphore.release(1); + }); + y += yn; + } + semaphore.acquire(segments); + return; } +#endif + scaleSection(0, dh); } #endif @@ -817,31 +992,52 @@ static void qt_qimageScaleAARGB_up_x_down_y(QImageScaleInfo *isi, unsigned int * int *yapoints = isi->yapoints; /* go through every scanline in the output buffer */ - for (int y = 0; y < dh; y++) { - int Cy = yapoints[y] >> 16; - int yap = yapoints[y] & 0xffff; - - unsigned int *dptr = dest + (y * dow); - for (int x = 0; x < dw; x++) { - const unsigned int *sptr = ypoints[y] + xpoints[x]; - int r, g, b; - qt_qimageScaleAARGB_helper(sptr, yap, Cy, sow, r, g, b); - - int xap = xapoints[x]; - if (xap > 0) { - int rr, bb, gg; - qt_qimageScaleAARGB_helper(sptr + 1, yap, Cy, sow, rr, gg, bb); - - r = r * (256 - xap); - g = g * (256 - xap); - b = b * (256 - xap); - r = (r + (rr * xap)) >> 8; - g = (g + (gg * xap)) >> 8; - b = (b + (bb * xap)) >> 8; + auto scaleSection = [&] (int yStart, int yEnd) { + for (int y = yStart; y < yEnd; ++y) { + int Cy = yapoints[y] >> 16; + int yap = yapoints[y] & 0xffff; + + unsigned int *dptr = dest + (y * dow); + for (int x = 0; x < dw; x++) { + const unsigned int *sptr = ypoints[y] + xpoints[x]; + int r, g, b; + qt_qimageScaleAARGB_helper(sptr, yap, Cy, sow, r, g, b); + + int xap = xapoints[x]; + if (xap > 0) { + int rr, bb, gg; + qt_qimageScaleAARGB_helper(sptr + 1, yap, Cy, sow, rr, gg, bb); + + r = r * (256 - xap); + g = g * (256 - xap); + b = b * (256 - xap); + r = (r + (rr * xap)) >> 8; + g = (g + (gg * xap)) >> 8; + b = (b + (bb * xap)) >> 8; + } + *dptr++ = qRgb(r >> 14, g >> 14, b >> 14); } - *dptr++ = qRgb(r >> 14, g >> 14, b >> 14); } + }; +#if QT_CONFIG(thread) + int segments = (qsizetype(isi->sh) * isi->sw) / (1<<16); + segments = std::min(segments, dh); + if (segments > 1) { + QSemaphore semaphore; + int y = 0; + for (int i = 0; i < segments; ++i) { + int yn = (dh - y) / (segments - i); + QThreadPool::globalInstance()->start([&, y, yn]() { + scaleSection(y, y + yn); + semaphore.release(1); + }); + y += yn; + } + semaphore.acquire(segments); + return; } +#endif + scaleSection(0, dh); } static void qt_qimageScaleAARGB_down_x_up_y(QImageScaleInfo *isi, unsigned int *dest, @@ -853,31 +1049,52 @@ static void qt_qimageScaleAARGB_down_x_up_y(QImageScaleInfo *isi, unsigned int * int *yapoints = isi->yapoints; /* go through every scanline in the output buffer */ - for (int y = 0; y < dh; y++) { - unsigned int *dptr = dest + (y * dow); - for (int x = 0; x < dw; x++) { - int Cx = xapoints[x] >> 16; - int xap = xapoints[x] & 0xffff; - - const unsigned int *sptr = ypoints[y] + xpoints[x]; - int r, g, b; - qt_qimageScaleAARGB_helper(sptr, xap, Cx, 1, r, g, b); - - int yap = yapoints[y]; - if (yap > 0) { - int rr, bb, gg; - qt_qimageScaleAARGB_helper(sptr + sow, xap, Cx, 1, rr, gg, bb); - - r = r * (256 - yap); - g = g * (256 - yap); - b = b * (256 - yap); - r = (r + (rr * yap)) >> 8; - g = (g + (gg * yap)) >> 8; - b = (b + (bb * yap)) >> 8; + auto scaleSection = [&] (int yStart, int yEnd) { + for (int y = yStart; y < yEnd; ++y) { + unsigned int *dptr = dest + (y * dow); + for (int x = 0; x < dw; x++) { + int Cx = xapoints[x] >> 16; + int xap = xapoints[x] & 0xffff; + + const unsigned int *sptr = ypoints[y] + xpoints[x]; + int r, g, b; + qt_qimageScaleAARGB_helper(sptr, xap, Cx, 1, r, g, b); + + int yap = yapoints[y]; + if (yap > 0) { + int rr, bb, gg; + qt_qimageScaleAARGB_helper(sptr + sow, xap, Cx, 1, rr, gg, bb); + + r = r * (256 - yap); + g = g * (256 - yap); + b = b * (256 - yap); + r = (r + (rr * yap)) >> 8; + g = (g + (gg * yap)) >> 8; + b = (b + (bb * yap)) >> 8; + } + *dptr++ = qRgb(r >> 14, g >> 14, b >> 14); } - *dptr++ = qRgb(r >> 14, g >> 14, b >> 14); } + }; +#if QT_CONFIG(thread) + int segments = (qsizetype(isi->sh) * isi->sw) / (1<<16); + segments = std::min(segments, dh); + if (segments > 1) { + QSemaphore semaphore; + int y = 0; + for (int i = 0; i < segments; ++i) { + int yn = (dh - y) / (segments - i); + QThreadPool::globalInstance()->start([&, y, yn]() { + scaleSection(y, y + yn); + semaphore.release(1); + }); + y += yn; + } + semaphore.acquire(segments); + return; } +#endif + scaleSection(0, dh); } static void qt_qimageScaleAARGB_down_xy(QImageScaleInfo *isi, unsigned int *dest, @@ -888,43 +1105,64 @@ static void qt_qimageScaleAARGB_down_xy(QImageScaleInfo *isi, unsigned int *dest int *xapoints = isi->xapoints; int *yapoints = isi->yapoints; - for (int y = 0; y < dh; y++) { - int Cy = yapoints[y] >> 16; - int yap = yapoints[y] & 0xffff; + auto scaleSection = [&] (int yStart, int yEnd) { + for (int y = yStart; y < yEnd; ++y) { + int Cy = yapoints[y] >> 16; + int yap = yapoints[y] & 0xffff; - unsigned int *dptr = dest + (y * dow); - for (int x = 0; x < dw; x++) { - int Cx = xapoints[x] >> 16; - int xap = xapoints[x] & 0xffff; + unsigned int *dptr = dest + (y * dow); + for (int x = 0; x < dw; x++) { + int Cx = xapoints[x] >> 16; + int xap = xapoints[x] & 0xffff; + + const unsigned int *sptr = ypoints[y] + xpoints[x]; + int rx, gx, bx; + qt_qimageScaleAARGB_helper(sptr, xap, Cx, 1, rx, gx, bx); - const unsigned int *sptr = ypoints[y] + xpoints[x]; - int rx, gx, bx; - qt_qimageScaleAARGB_helper(sptr, xap, Cx, 1, rx, gx, bx); + int r = (rx >> 4) * yap; + int g = (gx >> 4) * yap; + int b = (bx >> 4) * yap; - int r = (rx >> 4) * yap; - int g = (gx >> 4) * yap; - int b = (bx >> 4) * yap; + int j; + for (j = (1 << 14) - yap; j > Cy; j -= Cy) { + sptr += sow; + qt_qimageScaleAARGB_helper(sptr, xap, Cx, 1, rx, gx, bx); - int j; - for (j = (1 << 14) - yap; j > Cy; j -= Cy) { + r += (rx >> 4) * Cy; + g += (gx >> 4) * Cy; + b += (bx >> 4) * Cy; + } sptr += sow; qt_qimageScaleAARGB_helper(sptr, xap, Cx, 1, rx, gx, bx); - r += (rx >> 4) * Cy; - g += (gx >> 4) * Cy; - b += (bx >> 4) * Cy; - } - sptr += sow; - qt_qimageScaleAARGB_helper(sptr, xap, Cx, 1, rx, gx, bx); - - r += (rx >> 4) * j; - g += (gx >> 4) * j; - b += (bx >> 4) * j; + r += (rx >> 4) * j; + g += (gx >> 4) * j; + b += (bx >> 4) * j; - *dptr = qRgb(r >> 24, g >> 24, b >> 24); - dptr++; + *dptr = qRgb(r >> 24, g >> 24, b >> 24); + dptr++; + } } + }; +#if QT_CONFIG(thread) + int segments = (qsizetype(isi->sh) * isi->sw) / (1<<16); + segments = std::min(segments, dh); + if (segments > 1) { + QSemaphore semaphore; + int y = 0; + for (int i = 0; i < segments; ++i) { + int yn = (dh - y) / (segments - i); + QThreadPool::globalInstance()->start([&, y, yn]() { + scaleSection(y, y + yn); + semaphore.release(1); + }); + y += yn; + } + semaphore.acquire(segments); + return; } +#endif + scaleSection(0, dh); } QImage qSmoothScaleImage(const QImage &src, int dw, int dh) diff --git a/src/gui/painting/qimagescale_neon.cpp b/src/gui/painting/qimagescale_neon.cpp index 4ae113b002..416155e139 100644 --- a/src/gui/painting/qimagescale_neon.cpp +++ b/src/gui/painting/qimagescale_neon.cpp @@ -41,6 +41,11 @@ #include "qimage.h" #include +#if QT_CONFIG(thread) +#include "qsemaphore.h" +#include "qthreadpool.h" +#endif + #if defined(__ARM_NEON__) QT_BEGIN_NAMESPACE @@ -76,33 +81,54 @@ void qt_qimageScaleAARGBA_up_x_down_y_neon(QImageScaleInfo *isi, unsigned int *d int *yapoints = isi->yapoints; /* go through every scanline in the output buffer */ - for (int y = 0; y < dh; y++) { - int Cy = yapoints[y] >> 16; - int yap = yapoints[y] & 0xffff; - - unsigned int *dptr = dest + (y * dow); - for (int x = 0; x < dw; x++) { - const unsigned int *sptr = ypoints[y] + xpoints[x]; - uint32x4_t vx = qt_qimageScaleAARGBA_helper(sptr, yap, Cy, sow); - - int xap = xapoints[x]; - if (xap > 0) { - uint32x4_t vr = qt_qimageScaleAARGBA_helper(sptr + 1, yap, Cy, sow); - - vx = vmulq_n_u32(vx, 256 - xap); - vr = vmulq_n_u32(vr, xap); - vx = vaddq_u32(vx, vr); - vx = vshrq_n_u32(vx, 8); + auto scaleSection = [&] (int yStart, int yEnd) { + for (int y = yStart; y < yEnd; ++y) { + int Cy = yapoints[y] >> 16; + int yap = yapoints[y] & 0xffff; + + unsigned int *dptr = dest + (y * dow); + for (int x = 0; x < dw; x++) { + const unsigned int *sptr = ypoints[y] + xpoints[x]; + uint32x4_t vx = qt_qimageScaleAARGBA_helper(sptr, yap, Cy, sow); + + int xap = xapoints[x]; + if (xap > 0) { + uint32x4_t vr = qt_qimageScaleAARGBA_helper(sptr + 1, yap, Cy, sow); + + vx = vmulq_n_u32(vx, 256 - xap); + vr = vmulq_n_u32(vr, xap); + vx = vaddq_u32(vx, vr); + vx = vshrq_n_u32(vx, 8); + } + vx = vshrq_n_u32(vx, 14); + const uint16x4_t vx16 = vmovn_u32(vx); + const uint8x8_t vx8 = vmovn_u16(vcombine_u16(vx16, vx16)); + *dptr = vget_lane_u32(vreinterpret_u32_u8(vx8), 0); + if (RGB) + *dptr |= 0xff000000; + dptr++; } - vx = vshrq_n_u32(vx, 14); - const uint16x4_t vx16 = vmovn_u32(vx); - const uint8x8_t vx8 = vmovn_u16(vcombine_u16(vx16, vx16)); - *dptr = vget_lane_u32(vreinterpret_u32_u8(vx8), 0); - if (RGB) - *dptr |= 0xff000000; - dptr++; } + }; +#if QT_CONFIG(thread) + int segments = (qsizetype(isi->sh) * isi->sw) / (1<<16); + segments = std::min(segments, dh); + if (segments > 1) { + QSemaphore semaphore; + int y = 0; + for (int i = 0; i < segments; ++i) { + int yn = (dh - y) / (segments - i); + QThreadPool::globalInstance()->start([&, y, yn]() { + scaleSection(y, y + yn); + semaphore.release(1); + }); + y += yn; + } + semaphore.acquire(segments); + return; } +#endif + scaleSection(0, dh); } template @@ -115,33 +141,54 @@ void qt_qimageScaleAARGBA_down_x_up_y_neon(QImageScaleInfo *isi, unsigned int *d int *yapoints = isi->yapoints; /* go through every scanline in the output buffer */ - for (int y = 0; y < dh; y++) { - unsigned int *dptr = dest + (y * dow); - for (int x = 0; x < dw; x++) { - int Cx = xapoints[x] >> 16; - int xap = xapoints[x] & 0xffff; - - const unsigned int *sptr = ypoints[y] + xpoints[x]; - uint32x4_t vx = qt_qimageScaleAARGBA_helper(sptr, xap, Cx, 1); - - int yap = yapoints[y]; - if (yap > 0) { - uint32x4_t vr = qt_qimageScaleAARGBA_helper(sptr + sow, xap, Cx, 1); - - vx = vmulq_n_u32(vx, 256 - yap); - vr = vmulq_n_u32(vr, yap); - vx = vaddq_u32(vx, vr); - vx = vshrq_n_u32(vx, 8); + auto scaleSection = [&] (int yStart, int yEnd) { + for (int y = yStart; y < yEnd; ++y) { + unsigned int *dptr = dest + (y * dow); + for (int x = 0; x < dw; x++) { + int Cx = xapoints[x] >> 16; + int xap = xapoints[x] & 0xffff; + + const unsigned int *sptr = ypoints[y] + xpoints[x]; + uint32x4_t vx = qt_qimageScaleAARGBA_helper(sptr, xap, Cx, 1); + + int yap = yapoints[y]; + if (yap > 0) { + uint32x4_t vr = qt_qimageScaleAARGBA_helper(sptr + sow, xap, Cx, 1); + + vx = vmulq_n_u32(vx, 256 - yap); + vr = vmulq_n_u32(vr, yap); + vx = vaddq_u32(vx, vr); + vx = vshrq_n_u32(vx, 8); + } + vx = vshrq_n_u32(vx, 14); + const uint16x4_t vx16 = vmovn_u32(vx); + const uint8x8_t vx8 = vmovn_u16(vcombine_u16(vx16, vx16)); + *dptr = vget_lane_u32(vreinterpret_u32_u8(vx8), 0); + if (RGB) + *dptr |= 0xff000000; + dptr++; } - vx = vshrq_n_u32(vx, 14); - const uint16x4_t vx16 = vmovn_u32(vx); - const uint8x8_t vx8 = vmovn_u16(vcombine_u16(vx16, vx16)); - *dptr = vget_lane_u32(vreinterpret_u32_u8(vx8), 0); - if (RGB) - *dptr |= 0xff000000; - dptr++; } + }; +#if QT_CONFIG(thread) + int segments = (qsizetype(isi->sh) * isi->sw) / (1<<16); + segments = std::min(segments, dh); + if (segments > 1) { + QSemaphore semaphore; + int y = 0; + for (int i = 0; i < segments; ++i) { + int yn = (dh - y) / (segments - i); + QThreadPool::globalInstance()->start([&, y, yn]() { + scaleSection(y, y + yn); + semaphore.release(1); + }); + y += yn; + } + semaphore.acquire(segments); + return; } +#endif + scaleSection(0, dh); } template @@ -153,43 +200,64 @@ void qt_qimageScaleAARGBA_down_xy_neon(QImageScaleInfo *isi, unsigned int *dest, int *xapoints = isi->xapoints; int *yapoints = isi->yapoints; - for (int y = 0; y < dh; y++) { - int Cy = yapoints[y] >> 16; - int yap = yapoints[y] & 0xffff; + auto scaleSection = [&] (int yStart, int yEnd) { + for (int y = yStart; y < yEnd; ++y) { + int Cy = yapoints[y] >> 16; + int yap = yapoints[y] & 0xffff; - unsigned int *dptr = dest + (y * dow); - for (int x = 0; x < dw; x++) { - const int Cx = xapoints[x] >> 16; - const int xap = xapoints[x] & 0xffff; + unsigned int *dptr = dest + (y * dow); + for (int x = 0; x < dw; x++) { + const int Cx = xapoints[x] >> 16; + const int xap = xapoints[x] & 0xffff; - const unsigned int *sptr = ypoints[y] + xpoints[x]; - uint32x4_t vx = qt_qimageScaleAARGBA_helper(sptr, xap, Cx, 1); - vx = vshrq_n_u32(vx, 4); - uint32x4_t vr = vmulq_n_u32(vx, yap); + const unsigned int *sptr = ypoints[y] + xpoints[x]; + uint32x4_t vx = qt_qimageScaleAARGBA_helper(sptr, xap, Cx, 1); + vx = vshrq_n_u32(vx, 4); + uint32x4_t vr = vmulq_n_u32(vx, yap); - int j; - for (j = (1 << 14) - yap; j > Cy; j -= Cy) { + int j; + for (j = (1 << 14) - yap; j > Cy; j -= Cy) { + sptr += sow; + vx = qt_qimageScaleAARGBA_helper(sptr, xap, Cx, 1); + vx = vshrq_n_u32(vx, 4); + vx = vmulq_n_u32(vx, Cy); + vr = vaddq_u32(vr, vx); + } sptr += sow; vx = qt_qimageScaleAARGBA_helper(sptr, xap, Cx, 1); vx = vshrq_n_u32(vx, 4); - vx = vmulq_n_u32(vx, Cy); + vx = vmulq_n_u32(vx, j); vr = vaddq_u32(vr, vx); + + vx = vshrq_n_u32(vr, 24); + const uint16x4_t vx16 = vmovn_u32(vx); + const uint8x8_t vx8 = vmovn_u16(vcombine_u16(vx16, vx16)); + *dptr = vget_lane_u32(vreinterpret_u32_u8(vx8), 0); + if (RGB) + *dptr |= 0xff000000; + dptr++; } - sptr += sow; - vx = qt_qimageScaleAARGBA_helper(sptr, xap, Cx, 1); - vx = vshrq_n_u32(vx, 4); - vx = vmulq_n_u32(vx, j); - vr = vaddq_u32(vr, vx); - - vx = vshrq_n_u32(vr, 24); - const uint16x4_t vx16 = vmovn_u32(vx); - const uint8x8_t vx8 = vmovn_u16(vcombine_u16(vx16, vx16)); - *dptr = vget_lane_u32(vreinterpret_u32_u8(vx8), 0); - if (RGB) - *dptr |= 0xff000000; - dptr++; } + }; +#if QT_CONFIG(thread) + int segments = (qsizetype(isi->sh) * isi->sw) / (1<<16); + segments = std::min(segments, dh); + if (segments > 1) { + QSemaphore semaphore; + int y = 0; + for (int i = 0; i < segments; ++i) { + int yn = (dh - y) / (segments - i); + QThreadPool::globalInstance()->start([&, y, yn]() { + scaleSection(y, y + yn); + semaphore.release(1); + }); + y += yn; + } + semaphore.acquire(segments); + return; } +#endif + scaleSection(0, dh); } template void qt_qimageScaleAARGBA_up_x_down_y_neon(QImageScaleInfo *isi, unsigned int *dest, diff --git a/src/gui/painting/qimagescale_p.h b/src/gui/painting/qimagescale_p.h index 244d681718..a9a4c0f858 100644 --- a/src/gui/painting/qimagescale_p.h +++ b/src/gui/painting/qimagescale_p.h @@ -66,6 +66,8 @@ namespace QImageScale { int *xapoints{nullptr}; int *yapoints{nullptr}; int xup_yup{0}; + int sh = 0; + int sw = 0; }; } diff --git a/src/gui/painting/qimagescale_sse4.cpp b/src/gui/painting/qimagescale_sse4.cpp index 5861a2e2ff..902ae61ed2 100644 --- a/src/gui/painting/qimagescale_sse4.cpp +++ b/src/gui/painting/qimagescale_sse4.cpp @@ -42,6 +42,11 @@ #include #include +#if QT_CONFIG(thread) +#include "qsemaphore.h" +#include "qthreadpool.h" +#endif + #if defined(QT_COMPILER_SUPPORTS_SSE4_1) QT_BEGIN_NAMESPACE @@ -70,44 +75,65 @@ void qt_qimageScaleAARGBA_up_x_down_y_sse4(QImageScaleInfo *isi, unsigned int *d int dw, int dh, int dow, int sow) { const unsigned int **ypoints = isi->ypoints; - int *xpoints = isi->xpoints; - int *xapoints = isi->xapoints; - int *yapoints = isi->yapoints; + const int *xpoints = isi->xpoints; + const int *xapoints = isi->xapoints; + const int *yapoints = isi->yapoints; const __m128i v256 = _mm_set1_epi32(256); /* go through every scanline in the output buffer */ - for (int y = 0; y < dh; y++) { - int Cy = yapoints[y] >> 16; - int yap = yapoints[y] & 0xffff; - const __m128i vCy = _mm_set1_epi32(Cy); - const __m128i vyap = _mm_set1_epi32(yap); - - unsigned int *dptr = dest + (y * dow); - for (int x = 0; x < dw; x++) { - const unsigned int *sptr = ypoints[y] + xpoints[x]; - __m128i vx = qt_qimageScaleAARGBA_helper(sptr, yap, Cy, sow, vyap, vCy); - - int xap = xapoints[x]; - if (xap > 0) { - const __m128i vxap = _mm_set1_epi32(xap); - const __m128i vinvxap = _mm_sub_epi32(v256, vxap); - __m128i vr = qt_qimageScaleAARGBA_helper(sptr + 1, yap, Cy, sow, vyap, vCy); + auto scaleSection = [&] (int yStart, int yEnd) { + for (int y = yStart; y < yEnd; ++y) { + const int Cy = yapoints[y] >> 16; + const int yap = yapoints[y] & 0xffff; + const __m128i vCy = _mm_set1_epi32(Cy); + const __m128i vyap = _mm_set1_epi32(yap); - vx = _mm_mullo_epi32(vx, vinvxap); - vr = _mm_mullo_epi32(vr, vxap); - vx = _mm_add_epi32(vx, vr); - vx = _mm_srli_epi32(vx, 8); + unsigned int *dptr = dest + (y * dow); + for (int x = 0; x < dw; x++) { + const unsigned int *sptr = ypoints[y] + xpoints[x]; + __m128i vx = qt_qimageScaleAARGBA_helper(sptr, yap, Cy, sow, vyap, vCy); + + const int xap = xapoints[x]; + if (xap > 0) { + const __m128i vxap = _mm_set1_epi32(xap); + const __m128i vinvxap = _mm_sub_epi32(v256, vxap); + __m128i vr = qt_qimageScaleAARGBA_helper(sptr + 1, yap, Cy, sow, vyap, vCy); + + vx = _mm_mullo_epi32(vx, vinvxap); + vr = _mm_mullo_epi32(vr, vxap); + vx = _mm_add_epi32(vx, vr); + vx = _mm_srli_epi32(vx, 8); + } + vx = _mm_srli_epi32(vx, 14); + vx = _mm_packus_epi32(vx, vx); + vx = _mm_packus_epi16(vx, vx); + *dptr = _mm_cvtsi128_si32(vx); + if (RGB) + *dptr |= 0xff000000; + dptr++; } - vx = _mm_srli_epi32(vx, 14); - vx = _mm_packus_epi32(vx, _mm_setzero_si128()); - vx = _mm_packus_epi16(vx, _mm_setzero_si128()); - *dptr = _mm_cvtsi128_si32(vx); - if (RGB) - *dptr |= 0xff000000; - dptr++; } + }; +#if QT_CONFIG(thread) + int segments = (qsizetype(isi->sh) * isi->sw) / (1<<16); + segments = std::min(segments, dh); + if (segments > 1) { + QSemaphore semaphore; + int y = 0; + for (int i = 0; i < segments; ++i) { + int yn = (dh - y) / (segments - i); + QThreadPool::globalInstance()->start([&, y, yn]() { + scaleSection(y, y + yn); + semaphore.release(1); + }); + y += yn; + } + semaphore.acquire(segments); + return; } +#endif + scaleSection(0, dh); } template @@ -122,37 +148,58 @@ void qt_qimageScaleAARGBA_down_x_up_y_sse4(QImageScaleInfo *isi, unsigned int *d const __m128i v256 = _mm_set1_epi32(256); /* go through every scanline in the output buffer */ - for (int y = 0; y < dh; y++) { - unsigned int *dptr = dest + (y * dow); - for (int x = 0; x < dw; x++) { - int Cx = xapoints[x] >> 16; - int xap = xapoints[x] & 0xffff; - const __m128i vCx = _mm_set1_epi32(Cx); - const __m128i vxap = _mm_set1_epi32(xap); - - const unsigned int *sptr = ypoints[y] + xpoints[x]; - __m128i vx = qt_qimageScaleAARGBA_helper(sptr, xap, Cx, 1, vxap, vCx); - - int yap = yapoints[y]; - if (yap > 0) { - const __m128i vyap = _mm_set1_epi32(yap); - const __m128i vinvyap = _mm_sub_epi32(v256, vyap); - __m128i vr = qt_qimageScaleAARGBA_helper(sptr + sow, xap, Cx, 1, vxap, vCx); - - vx = _mm_mullo_epi32(vx, vinvyap); - vr = _mm_mullo_epi32(vr, vyap); - vx = _mm_add_epi32(vx, vr); - vx = _mm_srli_epi32(vx, 8); + auto scaleSection = [&] (int yStart, int yEnd) { + for (int y = yStart; y < yEnd; ++y) { + unsigned int *dptr = dest + (y * dow); + for (int x = 0; x < dw; x++) { + int Cx = xapoints[x] >> 16; + int xap = xapoints[x] & 0xffff; + const __m128i vCx = _mm_set1_epi32(Cx); + const __m128i vxap = _mm_set1_epi32(xap); + + const unsigned int *sptr = ypoints[y] + xpoints[x]; + __m128i vx = qt_qimageScaleAARGBA_helper(sptr, xap, Cx, 1, vxap, vCx); + + int yap = yapoints[y]; + if (yap > 0) { + const __m128i vyap = _mm_set1_epi32(yap); + const __m128i vinvyap = _mm_sub_epi32(v256, vyap); + __m128i vr = qt_qimageScaleAARGBA_helper(sptr + sow, xap, Cx, 1, vxap, vCx); + + vx = _mm_mullo_epi32(vx, vinvyap); + vr = _mm_mullo_epi32(vr, vyap); + vx = _mm_add_epi32(vx, vr); + vx = _mm_srli_epi32(vx, 8); + } + vx = _mm_srli_epi32(vx, 14); + vx = _mm_packus_epi32(vx, vx); + vx = _mm_packus_epi16(vx, vx); + *dptr = _mm_cvtsi128_si32(vx); + if (RGB) + *dptr |= 0xff000000; + dptr++; } - vx = _mm_srli_epi32(vx, 14); - vx = _mm_packus_epi32(vx, _mm_setzero_si128()); - vx = _mm_packus_epi16(vx, _mm_setzero_si128()); - *dptr = _mm_cvtsi128_si32(vx); - if (RGB) - *dptr |= 0xff000000; - dptr++; } + }; +#if QT_CONFIG(thread) + int segments = (qsizetype(isi->sh) * isi->sw) / (1<<16); + segments = std::min(segments, dh); + if (segments > 1) { + QSemaphore semaphore; + int y = 0; + for (int i = 0; i < segments; ++i) { + int yn = (dh - y) / (segments - i); + QThreadPool::globalInstance()->start([&, y, yn]() { + scaleSection(y, y + yn); + semaphore.release(1); + }); + y += yn; + } + semaphore.acquire(segments); + return; } +#endif + scaleSection(0, dh); } template @@ -164,42 +211,63 @@ void qt_qimageScaleAARGBA_down_xy_sse4(QImageScaleInfo *isi, unsigned int *dest, int *xapoints = isi->xapoints; int *yapoints = isi->yapoints; - for (int y = 0; y < dh; y++) { - int Cy = yapoints[y] >> 16; - int yap = yapoints[y] & 0xffff; - const __m128i vCy = _mm_set1_epi32(Cy); - const __m128i vyap = _mm_set1_epi32(yap); - - unsigned int *dptr = dest + (y * dow); - for (int x = 0; x < dw; x++) { - const int Cx = xapoints[x] >> 16; - const int xap = xapoints[x] & 0xffff; - const __m128i vCx = _mm_set1_epi32(Cx); - const __m128i vxap = _mm_set1_epi32(xap); - - const unsigned int *sptr = ypoints[y] + xpoints[x]; - __m128i vx = qt_qimageScaleAARGBA_helper(sptr, xap, Cx, 1, vxap, vCx); - __m128i vr = _mm_mullo_epi32(_mm_srli_epi32(vx, 4), vyap); - - int j; - for (j = (1 << 14) - yap; j > Cy; j -= Cy) { + auto scaleSection = [&] (int yStart, int yEnd) { + for (int y = yStart; y < yEnd; ++y) { + int Cy = yapoints[y] >> 16; + int yap = yapoints[y] & 0xffff; + const __m128i vCy = _mm_set1_epi32(Cy); + const __m128i vyap = _mm_set1_epi32(yap); + + unsigned int *dptr = dest + (y * dow); + for (int x = 0; x < dw; x++) { + const int Cx = xapoints[x] >> 16; + const int xap = xapoints[x] & 0xffff; + const __m128i vCx = _mm_set1_epi32(Cx); + const __m128i vxap = _mm_set1_epi32(xap); + + const unsigned int *sptr = ypoints[y] + xpoints[x]; + __m128i vx = qt_qimageScaleAARGBA_helper(sptr, xap, Cx, 1, vxap, vCx); + __m128i vr = _mm_mullo_epi32(_mm_srli_epi32(vx, 4), vyap); + + int j; + for (j = (1 << 14) - yap; j > Cy; j -= Cy) { + sptr += sow; + vx = qt_qimageScaleAARGBA_helper(sptr, xap, Cx, 1, vxap, vCx); + vr = _mm_add_epi32(vr, _mm_mullo_epi32(_mm_srli_epi32(vx, 4), vCy)); + } sptr += sow; vx = qt_qimageScaleAARGBA_helper(sptr, xap, Cx, 1, vxap, vCx); - vr = _mm_add_epi32(vr, _mm_mullo_epi32(_mm_srli_epi32(vx, 4), vCy)); + vr = _mm_add_epi32(vr, _mm_mullo_epi32(_mm_srli_epi32(vx, 4), _mm_set1_epi32(j))); + + vr = _mm_srli_epi32(vr, 24); + vr = _mm_packus_epi32(vr, _mm_setzero_si128()); + vr = _mm_packus_epi16(vr, _mm_setzero_si128()); + *dptr = _mm_cvtsi128_si32(vr); + if (RGB) + *dptr |= 0xff000000; + dptr++; } - sptr += sow; - vx = qt_qimageScaleAARGBA_helper(sptr, xap, Cx, 1, vxap, vCx); - vr = _mm_add_epi32(vr, _mm_mullo_epi32(_mm_srli_epi32(vx, 4), _mm_set1_epi32(j))); - - vr = _mm_srli_epi32(vr, 24); - vr = _mm_packus_epi32(vr, _mm_setzero_si128()); - vr = _mm_packus_epi16(vr, _mm_setzero_si128()); - *dptr = _mm_cvtsi128_si32(vr); - if (RGB) - *dptr |= 0xff000000; - dptr++; } + }; +#if QT_CONFIG(thread) + int segments = (qsizetype(isi->sh) * isi->sw) / (1<<16); + segments = std::min(segments, dh); + if (segments > 1) { + QSemaphore semaphore; + int y = 0; + for (int i = 0; i < segments; ++i) { + int yn = (dh - y) / (segments - i); + QThreadPool::globalInstance()->start([&, y, yn]() { + scaleSection(y, y + yn); + semaphore.release(1); + }); + y += yn; + } + semaphore.acquire(segments); + return; } +#endif + scaleSection(0, dh); } template void qt_qimageScaleAARGBA_up_x_down_y_sse4(QImageScaleInfo *isi, unsigned int *dest, -- cgit v1.2.3