From 7432c7c08a6709a12a143d48fbaa9927962edae8 Mon Sep 17 00:00:00 2001 From: Allan Sandfeld Jensen Date: Tue, 7 Apr 2015 11:20:08 +0200 Subject: Cleanup and optimization of qimage smoothscale Cleaning up smoothscale code. Upscaling is improved using existing optimized interpolation methods, and downscale is given SSE4.1 optimized versions. Change-Id: I7cdc256c26850948aef7dae26fda1622be6b8179 Reviewed-by: Gunnar Sletta --- src/gui/painting/painting.pri | 3 +- src/gui/painting/qdrawhelper.cpp | 38 -- src/gui/painting/qdrawhelper_p.h | 36 + src/gui/painting/qimagescale.cpp | 1155 +++++++++++++-------------------- src/gui/painting/qimagescale_p.h | 9 + src/gui/painting/qimagescale_sse4.cpp | 247 +++++++ 6 files changed, 762 insertions(+), 726 deletions(-) create mode 100644 src/gui/painting/qimagescale_sse4.cpp (limited to 'src/gui/painting') diff --git a/src/gui/painting/painting.pri b/src/gui/painting/painting.pri index 2f2d3daaf8..a861516821 100644 --- a/src/gui/painting/painting.pri +++ b/src/gui/painting/painting.pri @@ -93,7 +93,8 @@ SOURCES += \ SSE2_SOURCES += painting/qdrawhelper_sse2.cpp SSSE3_SOURCES += painting/qdrawhelper_ssse3.cpp -SSE4_1_SOURCES += painting/qdrawhelper_sse4.cpp +SSE4_1_SOURCES += painting/qdrawhelper_sse4.cpp \ + painting/qimagescale_sse4.cpp AVX2_SOURCES += painting/qdrawhelper_avx2.cpp !ios { diff --git a/src/gui/painting/qdrawhelper.cpp b/src/gui/painting/qdrawhelper.cpp index 57bb111538..b75018452a 100644 --- a/src/gui/painting/qdrawhelper.cpp +++ b/src/gui/painting/qdrawhelper.cpp @@ -1245,44 +1245,6 @@ static inline uint interpolate_4_pixels_16(uint tl, uint tr, uint bl, uint br, i } #endif -#if defined(__SSE2__) -static inline uint interpolate_4_pixels(uint tl, uint tr, uint bl, uint br, uint distx, uint disty) -{ - // First interpolate right and left pixels in parallel. 
- __m128i vl = _mm_unpacklo_epi32(_mm_cvtsi32_si128(tl), _mm_cvtsi32_si128(bl)); - __m128i vr = _mm_unpacklo_epi32(_mm_cvtsi32_si128(tr), _mm_cvtsi32_si128(br)); - vl = _mm_unpacklo_epi8(vl, _mm_setzero_si128()); - vr = _mm_unpacklo_epi8(vr, _mm_setzero_si128()); - vl = _mm_mullo_epi16(vl, _mm_set1_epi16(256 - distx)); - vr = _mm_mullo_epi16(vr, _mm_set1_epi16(distx)); - __m128i vtb = _mm_add_epi16(vl, vr); - vtb = _mm_srli_epi16(vtb, 8); - // vtb now contains the result of the first two interpolate calls vtb = unpacked((xbot << 64) | xtop) - - // Now the last interpolate between top and bottom interpolations. - const __m128i vidisty = _mm_shufflelo_epi16(_mm_cvtsi32_si128(256 - disty), _MM_SHUFFLE(0, 0, 0, 0)); - const __m128i vdisty = _mm_shufflelo_epi16(_mm_cvtsi32_si128(disty), _MM_SHUFFLE(0, 0, 0, 0)); - const __m128i vmuly = _mm_unpacklo_epi16(vidisty, vdisty); - vtb = _mm_unpacklo_epi16(vtb, _mm_srli_si128(vtb, 8)); - // vtb now contains the colors of top and bottom interleaved { ta, ba, tr, br, tg, bg, tb, bb } - vtb = _mm_madd_epi16(vtb, vmuly); // Multiply and horizontal add. 
- vtb = _mm_srli_epi32(vtb, 8); - vtb = _mm_packs_epi32(vtb, _mm_setzero_si128()); - vtb = _mm_packus_epi16(vtb, _mm_setzero_si128()); - return _mm_cvtsi128_si32(vtb); -} -#else -static inline uint interpolate_4_pixels(uint tl, uint tr, uint bl, uint br, uint distx, uint disty) -{ - uint idistx = 256 - distx; - uint idisty = 256 - disty; - uint xtop = INTERPOLATE_PIXEL_256(tl, idistx, tr, distx); - uint xbot = INTERPOLATE_PIXEL_256(bl, idistx, br, distx); - return INTERPOLATE_PIXEL_256(xtop, idisty, xbot, disty); -} -#endif - - template void fetchTransformedBilinear_pixelBounds(int max, int l1, int l2, int &v1, int &v2); diff --git a/src/gui/painting/qdrawhelper_p.h b/src/gui/painting/qdrawhelper_p.h index 480ba4c97b..0d391b2cec 100644 --- a/src/gui/painting/qdrawhelper_p.h +++ b/src/gui/painting/qdrawhelper_p.h @@ -605,6 +605,42 @@ static Q_ALWAYS_INLINE uint BYTE_MUL(uint x, uint a) { } #endif +#ifdef __SSE2__ +static inline uint interpolate_4_pixels(uint tl, uint tr, uint bl, uint br, uint distx, uint disty) +{ + // First interpolate right and left pixels in parallel. + __m128i vl = _mm_unpacklo_epi32(_mm_cvtsi32_si128(tl), _mm_cvtsi32_si128(bl)); + __m128i vr = _mm_unpacklo_epi32(_mm_cvtsi32_si128(tr), _mm_cvtsi32_si128(br)); + vl = _mm_unpacklo_epi8(vl, _mm_setzero_si128()); + vr = _mm_unpacklo_epi8(vr, _mm_setzero_si128()); + vl = _mm_mullo_epi16(vl, _mm_set1_epi16(256 - distx)); + vr = _mm_mullo_epi16(vr, _mm_set1_epi16(distx)); + __m128i vtb = _mm_add_epi16(vl, vr); + vtb = _mm_srli_epi16(vtb, 8); + // vtb now contains the result of the first two interpolate calls vtb = unpacked((xbot << 64) | xtop) + + // Now the last interpolate between top and bottom interpolations. 
+ const __m128i vidisty = _mm_shufflelo_epi16(_mm_cvtsi32_si128(256 - disty), _MM_SHUFFLE(0, 0, 0, 0)); + const __m128i vdisty = _mm_shufflelo_epi16(_mm_cvtsi32_si128(disty), _MM_SHUFFLE(0, 0, 0, 0)); + const __m128i vmuly = _mm_unpacklo_epi16(vidisty, vdisty); + vtb = _mm_unpacklo_epi16(vtb, _mm_srli_si128(vtb, 8)); + // vtb now contains the colors of top and bottom interleaved { ta, ba, tr, br, tg, bg, tb, bb } + vtb = _mm_madd_epi16(vtb, vmuly); // Multiply and horizontal add. + vtb = _mm_srli_epi32(vtb, 8); + vtb = _mm_packs_epi32(vtb, _mm_setzero_si128()); + vtb = _mm_packus_epi16(vtb, _mm_setzero_si128()); + return _mm_cvtsi128_si32(vtb); +} +#else +static inline uint interpolate_4_pixels(uint tl, uint tr, uint bl, uint br, uint distx, uint disty) +{ + uint idistx = 256 - distx; + uint idisty = 256 - disty; + uint xtop = INTERPOLATE_PIXEL_256(tl, idistx, tr, distx); + uint xbot = INTERPOLATE_PIXEL_256(bl, idistx, br, distx); + return INTERPOLATE_PIXEL_256(xtop, idisty, xbot, disty); +} +#endif #if Q_BYTE_ORDER == Q_BIG_ENDIAN static Q_ALWAYS_INLINE quint32 RGBA2ARGB(quint32 x) { diff --git a/src/gui/painting/qimagescale.cpp b/src/gui/painting/qimagescale.cpp index 58e9112dd6..5f1b25e189 100644 --- a/src/gui/painting/qimagescale.cpp +++ b/src/gui/painting/qimagescale.cpp @@ -38,10 +38,6 @@ QT_BEGIN_NAMESPACE -namespace QImageScale { - struct QImageScaleInfo; -} - typedef void (*qt_qimageScaleFunc)(QImageScale::QImageScaleInfo *isi, unsigned int *dest, int dxx, int dyy, int dx, int dy, int dw, int dh, int dow, int sow); @@ -105,15 +101,7 @@ qt_qimageScaleFunc qt_qimageScaleRgb = qt_qimageScaleAARGB; namespace QImageScale { - struct QImageScaleInfo { - int *xpoints; - const unsigned int **ypoints; - int *xapoints, *yapoints; - int xup_yup; - }; - - const unsigned int** qimageCalcYPoints(const unsigned int *src, int sw, int sh, - int dh); + const unsigned int** qimageCalcYPoints(const unsigned int *src, int sw, int sh, int dh); int* qimageCalcXPoints(int sw, int 
dw); int* qimageCalcApoints(int s, int d, int up); QImageScaleInfo* qimageFreeScaleInfo(QImageScaleInfo *isi); @@ -134,16 +122,11 @@ using namespace QImageScale; #define G_VAL(p) (qGreen(*p)) #define B_VAL(p) (qBlue(*p)) -#define INV_XAP (256 - xapoints[x]) -#define XAP (xapoints[x]) -#define INV_YAP (256 - yapoints[dyy + y]) -#define YAP (yapoints[dyy + y]) - const unsigned int** QImageScale::qimageCalcYPoints(const unsigned int *src, int sw, int sh, int dh) { const unsigned int **p; - int i, j = 0, rv = 0; + int j = 0, rv = 0; qint64 val, inc; if(dh < 0){ @@ -155,12 +138,12 @@ const unsigned int** QImageScale::qimageCalcYPoints(const unsigned int *src, int up = qAbs(dh) >= sh; val = up ? 0x8000 * sh / dh - 0x8000 : 0; inc = (((qint64)sh) << 16) / dh; - for(i = 0; i < dh; i++){ + for (int i = 0; i < dh; i++) { p[j++] = src + qMax(0LL, val >> 16) * sw; val += inc; } - if(rv){ - for(i = dh / 2; --i >= 0; ){ + if (rv) { + for (int i = dh / 2; --i >= 0; ) { const unsigned int *tmp = p[i]; p[i] = p[dh - i - 1]; p[dh - i - 1] = tmp; @@ -171,7 +154,7 @@ const unsigned int** QImageScale::qimageCalcYPoints(const unsigned int *src, int* QImageScale::qimageCalcXPoints(int sw, int dw) { - int *p, i, j = 0, rv = 0; + int *p, j = 0, rv = 0; qint64 val, inc; if(dw < 0){ @@ -183,13 +166,13 @@ int* QImageScale::qimageCalcXPoints(int sw, int dw) int up = qAbs(dw) >= sw; val = up ? 
0x8000 * sw / dw - 0x8000 : 0; inc = (((qint64)sw) << 16) / dw; - for(i = 0; i < dw; i++){ + for (int i = 0; i < dw; i++) { p[j++] = qMax(0LL, val >> 16); val += inc; } - if(rv){ - for(i = dw / 2; --i >= 0; ){ + if (rv) { + for (int i = dw / 2; --i >= 0; ) { int tmp = p[i]; p[i] = p[dw - i - 1]; p[dw - i - 1] = tmp; @@ -200,7 +183,7 @@ int* QImageScale::qimageCalcXPoints(int sw, int dw) int* QImageScale::qimageCalcApoints(int s, int d, int up) { - int *p, i, j = 0, rv = 0; + int *p, j = 0, rv = 0; if(d < 0){ rv = 1; @@ -214,7 +197,7 @@ int* QImageScale::qimageCalcApoints(int s, int d, int up) val = 0x8000 * s / d - 0x8000; inc = (((qint64)s) << 16) / d; - for(i = 0; i < d; i++){ + for (int i = 0; i < d; i++) { int pos = val >> 16; if (pos < 0) p[j++] = 0; @@ -226,14 +209,12 @@ int* QImageScale::qimageCalcApoints(int s, int d, int up) } } /* scaling down */ - else{ - qint64 val, inc; - int ap, Cp; - val = 0; - inc = (((qint64)s) << 16) / d; - Cp = ((d << 14) / s) + 1; - for(i = 0; i < d; i++){ - ap = ((0x100 - ((val >> 8) & 0xff)) * Cp) >> 8; + else { + qint64 val = 0; + qint64 inc = (((qint64)s) << 16) / d; + int Cp = (((d << 14) + s - 1) / s); + for (int i = 0; i < d; i++) { + int ap = ((0x10000 - (val & 0xffff)) * Cp) >> 16; p[j] = ap | (Cp << 16); j++; val += inc; @@ -241,13 +222,13 @@ int* QImageScale::qimageCalcApoints(int s, int d, int up) } if(rv){ int tmp; - for(i = d / 2; --i >= 0; ){ + for (int i = d / 2; --i >= 0; ) { tmp = p[i]; p[i] = p[d - i - 1]; p[d - i - 1] = tmp; } } - return(p); + return p; } QImageScaleInfo* QImageScale::qimageFreeScaleInfo(QImageScaleInfo *isi) @@ -297,700 +278,500 @@ QImageScaleInfo* QImageScale::qimageCalcScaleInfo(const QImage &img, return(isi); } -/* FIXME: NEED to optimize ScaleAARGBA - currently its "ok" but needs work*/ -/* scale by area sampling */ -static void qt_qimageScaleAARGBA(QImageScaleInfo *isi, unsigned int *dest, - int dxx, int dyy, int dx, int dy, int dw, - int dh, int dow, int sow) +static void 
qt_qimageScaleAARGBA_up_x_down_y(QImageScaleInfo *isi, unsigned int *dest, + int dxx, int dyy, int dx, int dy, + int dw, int dh, int dow, int sow); + +static void qt_qimageScaleAARGBA_down_x_up_y(QImageScaleInfo *isi, unsigned int *dest, + int dxx, int dyy, int dx, int dy, + int dw, int dh, int dow, int sow); + +static void qt_qimageScaleAARGBA_down_xy(QImageScaleInfo *isi, unsigned int *dest, + int dxx, int dyy, int dx, int dy, int dw, + int dh, int dow, int sow); + +#if defined(QT_COMPILER_SUPPORTS_SSE4_1) +template +void qt_qimageScaleAARGBA_up_x_down_y_sse4(QImageScaleInfo *isi, unsigned int *dest, + int dxx, int dyy, int dx, int dy, + int dw, int dh, int dow, int sow); +template +void qt_qimageScaleAARGBA_down_x_up_y_sse4(QImageScaleInfo *isi, unsigned int *dest, + int dxx, int dyy, int dx, int dy, + int dw, int dh, int dow, int sow); +template +void qt_qimageScaleAARGBA_down_xy_sse4(QImageScaleInfo *isi, unsigned int *dest, + int dxx, int dyy, int dx, int dy, + int dw, int dh, int dow, int sow); +#endif + +static void qt_qimageScaleAARGBA_up_xy(QImageScaleInfo *isi, unsigned int *dest, + int dxx, int dyy, int dx, int dy, + int dw, int dh, int dow, int sow) { - const unsigned int *sptr; - unsigned int *dptr; - int x, y, end; const unsigned int **ypoints = isi->ypoints; int *xpoints = isi->xpoints; int *xapoints = isi->xapoints; int *yapoints = isi->yapoints; - end = dxx + dw; - /* scaling up both ways */ - if(isi->xup_yup == 3){ - /* go through every scanline in the output buffer */ - for(y = 0; y < dh; y++){ - /* calculate the source line we'll scan from */ - dptr = dest + dx + ((y + dy) * dow); - sptr = ypoints[dyy + y]; - if(YAP > 0){ - for(x = dxx; x < end; x++){ - int r, g, b, a; - int rr, gg, bb, aa; - const unsigned int *pix; - - if(XAP > 0){ - pix = ypoints[dyy + y] + xpoints[x]; - r = R_VAL(pix) * INV_XAP; - g = G_VAL(pix) * INV_XAP; - b = B_VAL(pix) * INV_XAP; - a = A_VAL(pix) * INV_XAP; - pix++; - r += R_VAL(pix) * XAP; - g += G_VAL(pix) * XAP; - b 
+= B_VAL(pix) * XAP; - a += A_VAL(pix) * XAP; - pix += sow; - rr = R_VAL(pix) * XAP; - gg = G_VAL(pix) * XAP; - bb = B_VAL(pix) * XAP; - aa = A_VAL(pix) * XAP; - pix--; - rr += R_VAL(pix) * INV_XAP; - gg += G_VAL(pix) * INV_XAP; - bb += B_VAL(pix) * INV_XAP; - aa += A_VAL(pix) * INV_XAP; - r = ((rr * YAP) + (r * INV_YAP)) >> 16; - g = ((gg * YAP) + (g * INV_YAP)) >> 16; - b = ((bb * YAP) + (b * INV_YAP)) >> 16; - a = ((aa * YAP) + (a * INV_YAP)) >> 16; - *dptr++ = qRgba(r, g, b, a); - } - else{ - pix = ypoints[dyy + y] + xpoints[x]; - r = R_VAL(pix) * INV_YAP; - g = G_VAL(pix) * INV_YAP; - b = B_VAL(pix) * INV_YAP; - a = A_VAL(pix) * INV_YAP; - pix += sow; - r += R_VAL(pix) * YAP; - g += G_VAL(pix) * YAP; - b += B_VAL(pix) * YAP; - a += A_VAL(pix) * YAP; - r >>= 8; - g >>= 8; - b >>= 8; - a >>= 8; - *dptr++ = qRgba(r, g, b, a); - } - } + int end = dxx + dw; + /* go through every scanline in the output buffer */ + for (int y = 0; y < dh; y++) { + /* calculate the source line we'll scan from */ + const unsigned int *sptr = ypoints[dyy + y]; + unsigned int *dptr = dest + dx + ((y + dy) * dow); + const int yap = yapoints[dyy + y]; + if (yap > 0) { + for (int x = dxx; x < end; x++) { + const unsigned int *pix = sptr + xpoints[x]; + const int xap = xapoints[x]; + if (xap > 0) + *dptr = interpolate_4_pixels(pix[0], pix[1], pix[sow], pix[sow + 1], xap, yap); + else + *dptr = INTERPOLATE_PIXEL_256(pix[0], 256 - yap, pix[sow], yap); + dptr++; } - else{ - for(x = dxx; x < end; x++){ - int r, g, b, a; - const unsigned int *pix; - - if(XAP > 0){ - pix = ypoints[dyy + y] + xpoints[x]; - r = R_VAL(pix) * INV_XAP; - g = G_VAL(pix) * INV_XAP; - b = B_VAL(pix) * INV_XAP; - a = A_VAL(pix) * INV_XAP; - pix++; - r += R_VAL(pix) * XAP; - g += G_VAL(pix) * XAP; - b += B_VAL(pix) * XAP; - a += A_VAL(pix) * XAP; - r >>= 8; - g >>= 8; - b >>= 8; - a >>= 8; - *dptr++ = qRgba(r, g, b, a); - } - else - *dptr++ = sptr[xpoints[x] ]; - } + } else { + for (int x = dxx; x < end; x++) { + const 
unsigned int *pix = sptr + xpoints[x]; + const int xap = xapoints[x]; + *dptr = INTERPOLATE_PIXEL_256(pix[0], 256 - xap, pix[1], xap); + dptr++; } } } +} + +/* scale by area sampling */ +static void qt_qimageScaleAARGBA(QImageScaleInfo *isi, unsigned int *dest, + int dxx, int dyy, int dx, int dy, int dw, + int dh, int dow, int sow) +{ + /* scaling up both ways */ + if (isi->xup_yup == 3){ + qt_qimageScaleAARGBA_up_xy(isi, dest, dxx, dyy, dx, dy, dw, dh, dow, sow); + } /* if we're scaling down vertically */ - else if(isi->xup_yup == 1){ - /*\ 'Correct' version, with math units prepared for MMXification \*/ - int Cy, j; - const unsigned int *pix; - int r, g, b, a, rr, gg, bb, aa; - int yap; - - /* go through every scanline in the output buffer */ - for(y = 0; y < dh; y++){ - Cy = YAP >> 16; - yap = YAP & 0xffff; - - dptr = dest + dx + ((y + dy) * dow); - for(x = dxx; x < end; x++){ - pix = ypoints[dyy + y] + xpoints[x]; - r = R_VAL(pix) * yap; - g = G_VAL(pix) * yap; - b = B_VAL(pix) * yap; - a = A_VAL(pix) * yap; - for(j = (1 << 14) - yap; j > Cy; j -= Cy){ - pix += sow; - r += R_VAL(pix) * Cy; - g += G_VAL(pix) * Cy; - b += B_VAL(pix) * Cy; - a += A_VAL(pix) * Cy; - } - if(j > 0){ - pix += sow; - r += R_VAL(pix) * j; - g += G_VAL(pix) * j; - b += B_VAL(pix) * j; - a += A_VAL(pix) * j; - } - if(XAP > 0){ - pix = ypoints[dyy + y] + xpoints[x] + 1; - rr = R_VAL(pix) * yap; - gg = G_VAL(pix) * yap; - bb = B_VAL(pix) * yap; - aa = A_VAL(pix) * yap; - for(j = (1 << 14) - yap; j > Cy; j -= Cy){ - pix += sow; - rr += R_VAL(pix) * Cy; - gg += G_VAL(pix) * Cy; - bb += B_VAL(pix) * Cy; - aa += A_VAL(pix) * Cy; - } - if(j > 0){ - pix += sow; - rr += R_VAL(pix) * j; - gg += G_VAL(pix) * j; - bb += B_VAL(pix) * j; - aa += A_VAL(pix) * j; - } - r = r * INV_XAP; - g = g * INV_XAP; - b = b * INV_XAP; - a = a * INV_XAP; - r = (r + ((rr * XAP))) >> 12; - g = (g + ((gg * XAP))) >> 12; - b = (b + ((bb * XAP))) >> 12; - a = (a + ((aa * XAP))) >> 12; - } - else{ - r >>= 4; - g >>= 4; - b 
>>= 4; - a >>= 4; - } - *dptr = qRgba(r >> 10, g >> 10, b >> 10, a >> 10); - dptr++; - } - } + else if (isi->xup_yup == 1) { +#ifdef QT_COMPILER_SUPPORTS_SSE4_1 + if (qCpuHasFeature(SSE4_1)) + qt_qimageScaleAARGBA_up_x_down_y_sse4(isi, dest, dxx, dyy, dx, dy, dw, dh, dow, sow); + else +#endif + qt_qimageScaleAARGBA_up_x_down_y(isi, dest, dxx, dyy, dx, dy, dw, dh, dow, sow); } /* if we're scaling down horizontally */ - else if(isi->xup_yup == 2){ - /*\ 'Correct' version, with math units prepared for MMXification \*/ - int Cx, j; - const unsigned int *pix; - int r, g, b, a, rr, gg, bb, aa; - int xap; - - /* go through every scanline in the output buffer */ - for(y = 0; y < dh; y++){ - dptr = dest + dx + ((y + dy) * dow); - for(x = dxx; x < end; x++){ - Cx = XAP >> 16; - xap = XAP & 0xffff; - - pix = ypoints[dyy + y] + xpoints[x]; - r = R_VAL(pix) * xap; - g = G_VAL(pix) * xap; - b = B_VAL(pix) * xap; - a = A_VAL(pix) * xap; - for(j = (1 << 14) - xap; j > Cx; j -= Cx){ - pix++; - r += R_VAL(pix) * Cx; - g += G_VAL(pix) * Cx; - b += B_VAL(pix) * Cx; - a += A_VAL(pix) * Cx; - } - if(j > 0){ - pix++; - r += R_VAL(pix) * j; - g += G_VAL(pix) * j; - b += B_VAL(pix) * j; - a += A_VAL(pix) * j; - } - if(YAP > 0){ - pix = ypoints[dyy + y] + xpoints[x] + sow; - rr = R_VAL(pix) * xap; - gg = G_VAL(pix) * xap; - bb = B_VAL(pix) * xap; - aa = A_VAL(pix) * xap; - for(j = (1 << 14) - xap; j > Cx; j -= Cx){ - pix++; - rr += R_VAL(pix) * Cx; - gg += G_VAL(pix) * Cx; - bb += B_VAL(pix) * Cx; - aa += A_VAL(pix) * Cx; - } - if(j > 0){ - pix++; - rr += R_VAL(pix) * j; - gg += G_VAL(pix) * j; - bb += B_VAL(pix) * j; - aa += A_VAL(pix) * j; - } - r = r * INV_YAP; - g = g * INV_YAP; - b = b * INV_YAP; - a = a * INV_YAP; - r = (r + ((rr * YAP))) >> 12; - g = (g + ((gg * YAP))) >> 12; - b = (b + ((bb * YAP))) >> 12; - a = (a + ((aa * YAP))) >> 12; - } - else{ - r >>= 4; - g >>= 4; - b >>= 4; - a >>= 4; - } - *dptr = qRgba(r >> 10, g >> 10, b >> 10, a >> 10); - dptr++; - } - } + else if 
(isi->xup_yup == 2) { +#ifdef QT_COMPILER_SUPPORTS_SSE4_1 + if (qCpuHasFeature(SSE4_1)) + qt_qimageScaleAARGBA_down_x_up_y_sse4(isi, dest, dxx, dyy, dx, dy, dw, dh, dow, sow); + else +#endif + qt_qimageScaleAARGBA_down_x_up_y(isi, dest, dxx, dyy, dx, dy, dw, dh, dow, sow); } /* if we're scaling down horizontally & vertically */ - else{ - /*\ 'Correct' version, with math units prepared for MMXification: - |*| The operation 'b = (b * c) >> 16' translates to pmulhw, - |*| so the operation 'b = (b * c) >> d' would translate to - |*| psllw (16 - d), %mmb; pmulh %mmc, %mmb - \*/ - int Cx, Cy, i, j; - const unsigned int *pix; - int a, r, g, b, ax, rx, gx, bx; - int xap, yap; - - for(y = 0; y < dh; y++){ - Cy = YAP >> 16; - yap = YAP & 0xffff; - - dptr = dest + dx + ((y + dy) * dow); - for(x = dxx; x < end; x++){ - Cx = XAP >> 16; - xap = XAP & 0xffff; - - sptr = ypoints[dyy + y] + xpoints[x]; - pix = sptr; - sptr += sow; - rx = R_VAL(pix) * xap; - gx = G_VAL(pix) * xap; - bx = B_VAL(pix) * xap; - ax = A_VAL(pix) * xap; - - pix++; - for(i = (1 << 14) - xap; i > Cx; i -= Cx){ - rx += R_VAL(pix) * Cx; - gx += G_VAL(pix) * Cx; - bx += B_VAL(pix) * Cx; - ax += A_VAL(pix) * Cx; - pix++; - } - if(i > 0){ - rx += R_VAL(pix) * i; - gx += G_VAL(pix) * i; - bx += B_VAL(pix) * i; - ax += A_VAL(pix) * i; - } - - r = (rx >> 5) * yap; - g = (gx >> 5) * yap; - b = (bx >> 5) * yap; - a = (ax >> 5) * yap; - - for(j = (1 << 14) - yap; j > Cy; j -= Cy){ - pix = sptr; - sptr += sow; - rx = R_VAL(pix) * xap; - gx = G_VAL(pix) * xap; - bx = B_VAL(pix) * xap; - ax = A_VAL(pix) * xap; - pix++; - for(i = (1 << 14) - xap; i > Cx; i -= Cx){ - rx += R_VAL(pix) * Cx; - gx += G_VAL(pix) * Cx; - bx += B_VAL(pix) * Cx; - ax += A_VAL(pix) * Cx; - pix++; - } - if(i > 0){ - rx += R_VAL(pix) * i; - gx += G_VAL(pix) * i; - bx += B_VAL(pix) * i; - ax += A_VAL(pix) * i; - } - - r += (rx >> 5) * Cy; - g += (gx >> 5) * Cy; - b += (bx >> 5) * Cy; - a += (ax >> 5) * Cy; - } - if(j > 0){ - pix = sptr; - sptr += sow; 
- rx = R_VAL(pix) * xap; - gx = G_VAL(pix) * xap; - bx = B_VAL(pix) * xap; - ax = A_VAL(pix) * xap; - pix++; - for(i = (1 << 14) - xap; i > Cx; i -= Cx){ - rx += R_VAL(pix) * Cx; - gx += G_VAL(pix) * Cx; - bx += B_VAL(pix) * Cx; - ax += A_VAL(pix) * Cx; - pix++; - } - if(i > 0){ - rx += R_VAL(pix) * i; - gx += G_VAL(pix) * i; - bx += B_VAL(pix) * i; - ax += A_VAL(pix) * i; - } - - r += (rx >> 5) * j; - g += (gx >> 5) * j; - b += (bx >> 5) * j; - a += (ax >> 5) * j; - } - - *dptr = qRgba(r >> 23, g >> 23, b >> 23, a >> 23); - dptr++; + else { +#ifdef QT_COMPILER_SUPPORTS_SSE4_1 + if (qCpuHasFeature(SSE4_1)) + qt_qimageScaleAARGBA_down_xy_sse4(isi, dest, dxx, dyy, dx, dy, dw, dh, dow, sow); + else +#endif + qt_qimageScaleAARGBA_down_xy(isi, dest, dxx, dyy, dx, dy, dw, dh, dow, sow); + } +} + +inline static void qt_qimageScaleAARGBA_helper_x(const unsigned int *pix, int xap, int Cx, int &r, int &g, int &b, int &a) +{ + r = R_VAL(pix) * xap; + g = G_VAL(pix) * xap; + b = B_VAL(pix) * xap; + a = A_VAL(pix) * xap; + int j; + for (j = (1 << 14) - xap; j > Cx; j -= Cx ){ + pix++; + r += R_VAL(pix) * Cx; + g += G_VAL(pix) * Cx; + b += B_VAL(pix) * Cx; + a += A_VAL(pix) * Cx; + } + pix++; + r += R_VAL(pix) * j; + g += G_VAL(pix) * j; + b += B_VAL(pix) * j; + a += A_VAL(pix) * j; +} + +inline static void qt_qimageScaleAARGBA_helper_y(const unsigned int *pix, int yap, int Cy, int sow, int &r, int &g, int &b, int &a) +{ + r = R_VAL(pix) * yap; + g = G_VAL(pix) * yap; + b = B_VAL(pix) * yap; + a = A_VAL(pix) * yap; + int j; + for (j = (1 << 14) - yap; j > Cy; j -= Cy ){ + pix += sow; + r += R_VAL(pix) * Cy; + g += G_VAL(pix) * Cy; + b += B_VAL(pix) * Cy; + a += A_VAL(pix) * Cy; + } + pix += sow; + r += R_VAL(pix) * j; + g += G_VAL(pix) * j; + b += B_VAL(pix) * j; + a += A_VAL(pix) * j; +} + +static void qt_qimageScaleAARGBA_up_x_down_y(QImageScaleInfo *isi, unsigned int *dest, + int dxx, int dyy, int dx, int dy, + int dw, int dh, int dow, int sow) +{ + const unsigned int 
**ypoints = isi->ypoints; + int *xpoints = isi->xpoints; + int *xapoints = isi->xapoints; + int *yapoints = isi->yapoints; + + int end = dxx + dw; + + /* go through every scanline in the output buffer */ + for (int y = 0; y < dh; y++) { + int Cy = (yapoints[dyy + y]) >> 16; + int yap = (yapoints[dyy + y]) & 0xffff; + + unsigned int *dptr = dest + dx + ((y + dy) * dow); + for (int x = dxx; x < end; x++) { + const unsigned int *sptr = ypoints[dyy + y] + xpoints[x]; + int r, g, b, a; + qt_qimageScaleAARGBA_helper_y(sptr, yap, Cy, sow, r, g, b, a); + + int xap = xapoints[x]; + if (xap > 0) { + int rr, gg, bb, aa; + qt_qimageScaleAARGBA_helper_y(sptr + 1, yap, Cy, sow, rr, gg, bb, aa); + + r = r * (256 - xap); + g = g * (256 - xap); + b = b * (256 - xap); + a = a * (256 - xap); + r = (r + (rr * xap)) >> 8; + g = (g + (gg * xap)) >> 8; + b = (b + (bb * xap)) >> 8; + a = (a + (aa * xap)) >> 8; } + *dptr++ = qRgba(r >> 14, g >> 14, b >> 14, a >> 14); } } } -/* scale by area sampling - IGNORE the ALPHA byte*/ -static void qt_qimageScaleAARGB(QImageScaleInfo *isi, unsigned int *dest, - int dxx, int dyy, int dx, int dy, int dw, - int dh, int dow, int sow) +static void qt_qimageScaleAARGBA_down_x_up_y(QImageScaleInfo *isi, unsigned int *dest, + int dxx, int dyy, int dx, int dy, + int dw, int dh, int dow, int sow) { - const unsigned int *sptr; - unsigned int *dptr; - int x, y, end; const unsigned int **ypoints = isi->ypoints; int *xpoints = isi->xpoints; int *xapoints = isi->xapoints; int *yapoints = isi->yapoints; - end = dxx + dw; - /* scaling up both ways */ - if(isi->xup_yup == 3){ - /* go through every scanline in the output buffer */ - for(y = 0; y < dh; y++){ - /* calculate the source line we'll scan from */ - dptr = dest + dx + ((y + dy) * dow); - sptr = ypoints[dyy + y]; - if(YAP > 0){ - for(x = dxx; x < end; x++){ - int r = 0, g = 0, b = 0; - int rr = 0, gg = 0, bb = 0; - const unsigned int *pix; - - if(XAP > 0){ - pix = ypoints[dyy + y] + xpoints[x]; - r = R_VAL(pix) 
* INV_XAP; - g = G_VAL(pix) * INV_XAP; - b = B_VAL(pix) * INV_XAP; - pix++; - r += R_VAL(pix) * XAP; - g += G_VAL(pix) * XAP; - b += B_VAL(pix) * XAP; - pix += sow; - rr = R_VAL(pix) * XAP; - gg = G_VAL(pix) * XAP; - bb = B_VAL(pix) * XAP; - pix --; - rr += R_VAL(pix) * INV_XAP; - gg += G_VAL(pix) * INV_XAP; - bb += B_VAL(pix) * INV_XAP; - r = ((rr * YAP) + (r * INV_YAP)) >> 16; - g = ((gg * YAP) + (g * INV_YAP)) >> 16; - b = ((bb * YAP) + (b * INV_YAP)) >> 16; - *dptr++ = qRgba(r, g, b, 0xff); - } - else{ - pix = ypoints[dyy + y] + xpoints[x]; - r = R_VAL(pix) * INV_YAP; - g = G_VAL(pix) * INV_YAP; - b = B_VAL(pix) * INV_YAP; - pix += sow; - r += R_VAL(pix) * YAP; - g += G_VAL(pix) * YAP; - b += B_VAL(pix) * YAP; - r >>= 8; - g >>= 8; - b >>= 8; - *dptr++ = qRgba(r, g, b, 0xff); - } - } - } - else{ - for(x = dxx; x < end; x++){ - int r = 0, g = 0, b = 0; - const unsigned int *pix; - - if(XAP > 0){ - pix = ypoints[dyy + y] + xpoints[x]; - r = R_VAL(pix) * INV_XAP; - g = G_VAL(pix) * INV_XAP; - b = B_VAL(pix) * INV_XAP; - pix++; - r += R_VAL(pix) * XAP; - g += G_VAL(pix) * XAP; - b += B_VAL(pix) * XAP; - r >>= 8; - g >>= 8; - b >>= 8; - *dptr++ = qRgba(r, g, b, 0xff); - } - else - *dptr++ = sptr[xpoints[x] ]; - } + int end = dxx + dw; + + /* go through every scanline in the output buffer */ + for (int y = 0; y < dh; y++) { + unsigned int *dptr = dest + dx + ((y + dy) * dow); + for (int x = dxx; x < end; x++) { + int Cx = xapoints[x] >> 16; + int xap = xapoints[x] & 0xffff; + + const unsigned int *sptr = ypoints[dyy + y] + xpoints[x]; + int r, g, b, a; + qt_qimageScaleAARGBA_helper_x(sptr, xap, Cx, r, g, b, a); + + int yap = yapoints[dyy + y]; + if (yap > 0) { + int rr, gg, bb, aa; + qt_qimageScaleAARGBA_helper_x(sptr + sow, xap, Cx, rr, gg, bb, aa); + + r = r * (256 - yap); + g = g * (256 - yap); + b = b * (256 - yap); + a = a * (256 - yap); + r = (r + (rr * yap)) >> 8; + g = (g + (gg * yap)) >> 8; + b = (b + (bb * yap)) >> 8; + a = (a + (aa * yap)) >> 8; } + *dptr 
= qRgba(r >> 14, g >> 14, b >> 14, a >> 14); + dptr++; } } - /* if we're scaling down vertically */ - else if(isi->xup_yup == 1){ - /*\ 'Correct' version, with math units prepared for MMXification \*/ - int Cy, j; - const unsigned int *pix; - int r, g, b, rr, gg, bb; - int yap; - - /* go through every scanline in the output buffer */ - for(y = 0; y < dh; y++){ - Cy = YAP >> 16; - yap = YAP & 0xffff; - - dptr = dest + dx + ((y + dy) * dow); - for(x = dxx; x < end; x++){ - pix = ypoints[dyy + y] + xpoints[x]; - r = R_VAL(pix) * yap; - g = G_VAL(pix) * yap; - b = B_VAL(pix) * yap; - pix += sow; - for(j = (1 << 14) - yap; j > Cy; j -= Cy){ - r += R_VAL(pix) * Cy; - g += G_VAL(pix) * Cy; - b += B_VAL(pix) * Cy; - pix += sow; - } - if(j > 0){ - r += R_VAL(pix) * j; - g += G_VAL(pix) * j; - b += B_VAL(pix) * j; - } - if(XAP > 0){ - pix = ypoints[dyy + y] + xpoints[x] + 1; - rr = R_VAL(pix) * yap; - gg = G_VAL(pix) * yap; - bb = B_VAL(pix) * yap; - pix += sow; - for(j = (1 << 14) - yap; j > Cy; j -= Cy){ - rr += R_VAL(pix) * Cy; - gg += G_VAL(pix) * Cy; - bb += B_VAL(pix) * Cy; - pix += sow; - } - if(j > 0){ - rr += R_VAL(pix) * j; - gg += G_VAL(pix) * j; - bb += B_VAL(pix) * j; - } - r = r * INV_XAP; - g = g * INV_XAP; - b = b * INV_XAP; - r = (r + ((rr * XAP))) >> 12; - g = (g + ((gg * XAP))) >> 12; - b = (b + ((bb * XAP))) >> 12; - } - else{ - r >>= 4; - g >>= 4; - b >>= 4; - } - *dptr = qRgba(r >> 10, g >> 10, b >> 10, 0xff); - dptr++; +} + +static void qt_qimageScaleAARGBA_down_xy(QImageScaleInfo *isi, unsigned int *dest, + int dxx, int dyy, int dx, int dy, int dw, + int dh, int dow, int sow) +{ + const unsigned int **ypoints = isi->ypoints; + int *xpoints = isi->xpoints; + int *xapoints = isi->xapoints; + int *yapoints = isi->yapoints; + + int end = dxx + dw; + + for (int y = 0; y < dh; y++) { + int Cy = (yapoints[dyy + y]) >> 16; + int yap = (yapoints[dyy + y]) & 0xffff; + + unsigned int *dptr = dest + dx + ((y + dy) * dow); + for (int x = dxx; x < end; x++) { + int 
Cx = xapoints[x] >> 16; + int xap = xapoints[x] & 0xffff; + + const unsigned int *sptr = ypoints[dyy + y] + xpoints[x]; + int rx, gx, bx, ax; + qt_qimageScaleAARGBA_helper_x(sptr, xap, Cx, rx, gx, bx, ax); + + int r = ((rx>>4) * yap); + int g = ((gx>>4) * yap); + int b = ((bx>>4) * yap); + int a = ((ax>>4) * yap); + + int j; + for (j = (1 << 14) - yap; j > Cy; j -= Cy) { + sptr += sow; + qt_qimageScaleAARGBA_helper_x(sptr, xap, Cx, rx, gx, bx, ax); + r += ((rx>>4) * Cy); + g += ((gx>>4) * Cy); + b += ((bx>>4) * Cy); + a += ((ax>>4) * Cy); } + sptr += sow; + qt_qimageScaleAARGBA_helper_x(sptr, xap, Cx, rx, gx, bx, ax); + + r += ((rx>>4) * j); + g += ((gx>>4) * j); + b += ((bx>>4) * j); + a += ((ax>>4) * j); + + *dptr = qRgba(r >> 24, g >> 24, b >> 24, a >> 24); + dptr++; } } +} + +static void qt_qimageScaleAARGB_up_x_down_y(QImageScaleInfo *isi, unsigned int *dest, + int dxx, int dyy, int dx, int dy, int dw, + int dh, int dow, int sow); + +static void qt_qimageScaleAARGB_down_x_up_y(QImageScaleInfo *isi, unsigned int *dest, + int dxx, int dyy, int dx, int dy, int dw, + int dh, int dow, int sow); + +static void qt_qimageScaleAARGB_down_xy(QImageScaleInfo *isi, unsigned int *dest, + int dxx, int dyy, int dx, int dy, int dw, + int dh, int dow, int sow); + +/* scale by area sampling - IGNORE the ALPHA byte*/ +static void qt_qimageScaleAARGB(QImageScaleInfo *isi, unsigned int *dest, + int dxx, int dyy, int dx, int dy, + int dw, int dh, int dow, int sow) +{ + /* scaling up both ways */ + if (isi->xup_yup == 3) { + qt_qimageScaleAARGBA_up_xy(isi, dest, dxx, dyy, dx, dy, dw, dh, dow, sow); + } + /* if we're scaling down vertically */ + else if (isi->xup_yup == 1) { +#ifdef QT_COMPILER_SUPPORTS_SSE4_1 + if (qCpuHasFeature(SSE4_1)) + qt_qimageScaleAARGBA_up_x_down_y_sse4(isi, dest, dxx, dyy, dx, dy, dw, dh, dow, sow); + else +#endif + qt_qimageScaleAARGB_up_x_down_y(isi, dest, dxx, dyy, dx, dy, dw, dh, dow, sow); + } /* if we're scaling down horizontally */ - else 
if(isi->xup_yup == 2){ - /*\ 'Correct' version, with math units prepared for MMXification \*/ - int Cx, j; - const unsigned int *pix; - int r, g, b, rr, gg, bb; - int xap; - - /* go through every scanline in the output buffer */ - for(y = 0; y < dh; y++){ - dptr = dest + dx + ((y + dy) * dow); - for(x = dxx; x < end; x++){ - Cx = XAP >> 16; - xap = XAP & 0xffff; - - pix = ypoints[dyy + y] + xpoints[x]; - r = R_VAL(pix) * xap; - g = G_VAL(pix) * xap; - b = B_VAL(pix) * xap; - pix++; - for(j = (1 << 14) - xap; j > Cx; j -= Cx){ - r += R_VAL(pix) * Cx; - g += G_VAL(pix) * Cx; - b += B_VAL(pix) * Cx; - pix++; - } - if(j > 0){ - r += R_VAL(pix) * j; - g += G_VAL(pix) * j; - b += B_VAL(pix) * j; - } - if(YAP > 0){ - pix = ypoints[dyy + y] + xpoints[x] + sow; - rr = R_VAL(pix) * xap; - gg = G_VAL(pix) * xap; - bb = B_VAL(pix) * xap; - pix++; - for(j = (1 << 14) - xap; j > Cx; j -= Cx){ - rr += R_VAL(pix) * Cx; - gg += G_VAL(pix) * Cx; - bb += B_VAL(pix) * Cx; - pix++; - } - if(j > 0){ - rr += R_VAL(pix) * j; - gg += G_VAL(pix) * j; - bb += B_VAL(pix) * j; - } - r = r * INV_YAP; - g = g * INV_YAP; - b = b * INV_YAP; - r = (r + ((rr * YAP))) >> 12; - g = (g + ((gg * YAP))) >> 12; - b = (b + ((bb * YAP))) >> 12; - } - else{ - r >>= 4; - g >>= 4; - b >>= 4; - } - *dptr = qRgba(r >> 10, g >> 10, b >> 10, 0xff); - dptr++; - } - } + else if (isi->xup_yup == 2) { +#ifdef QT_COMPILER_SUPPORTS_SSE4_1 + if (qCpuHasFeature(SSE4_1)) + qt_qimageScaleAARGBA_down_x_up_y_sse4(isi, dest, dxx, dyy, dx, dy, dw, dh, dow, sow); + else +#endif + qt_qimageScaleAARGB_down_x_up_y(isi, dest, dxx, dyy, dx, dy, dw, dh, dow, sow); } - /* fully optimized (i think) - onyl change of algorithm can help */ /* if we're scaling down horizontally & vertically */ - else{ - /*\ 'Correct' version, with math units prepared for MMXification \*/ - int Cx, Cy, i, j; - const unsigned int *pix; - int r, g, b, rx, gx, bx; - int xap, yap; - - for(y = 0; y < dh; y++){ - Cy = YAP >> 16; - yap = YAP & 0xffff; - - dptr = 
dest + dx + ((y + dy) * dow); - for(x = dxx; x < end; x++){ - Cx = XAP >> 16; - xap = XAP & 0xffff; - - sptr = ypoints[dyy + y] + xpoints[x]; - pix = sptr; - sptr += sow; - rx = R_VAL(pix) * xap; - gx = G_VAL(pix) * xap; - bx = B_VAL(pix) * xap; - pix++; - for(i = (1 << 14) - xap; i > Cx; i -= Cx){ - rx += R_VAL(pix) * Cx; - gx += G_VAL(pix) * Cx; - bx += B_VAL(pix) * Cx; - pix++; - } - if(i > 0){ - rx += R_VAL(pix) * i; - gx += G_VAL(pix) * i; - bx += B_VAL(pix) * i; - } - - r = (rx >> 5) * yap; - g = (gx >> 5) * yap; - b = (bx >> 5) * yap; - - for(j = (1 << 14) - yap; j > Cy; j -= Cy){ - pix = sptr; - sptr += sow; - rx = R_VAL(pix) * xap; - gx = G_VAL(pix) * xap; - bx = B_VAL(pix) * xap; - pix++; - for(i = (1 << 14) - xap; i > Cx; i -= Cx){ - rx += R_VAL(pix) * Cx; - gx += G_VAL(pix) * Cx; - bx += B_VAL(pix) * Cx; - pix++; - } - if(i > 0){ - rx += R_VAL(pix) * i; - gx += G_VAL(pix) * i; - bx += B_VAL(pix) * i; - } - - r += (rx >> 5) * Cy; - g += (gx >> 5) * Cy; - b += (bx >> 5) * Cy; - } - if(j > 0){ - pix = sptr; - sptr += sow; - rx = R_VAL(pix) * xap; - gx = G_VAL(pix) * xap; - bx = B_VAL(pix) * xap; - pix++; - for(i = (1 << 14) - xap; i > Cx; i -= Cx){ - rx += R_VAL(pix) * Cx; - gx += G_VAL(pix) * Cx; - bx += B_VAL(pix) * Cx; - pix++; - } - if(i > 0){ - rx += R_VAL(pix) * i; - gx += G_VAL(pix) * i; - bx += B_VAL(pix) * i; - } - - r += (rx >> 5) * j; - g += (gx >> 5) * j; - b += (bx >> 5) * j; - } - - *dptr = qRgb(r >> 23, g >> 23, b >> 23); - dptr++; + else { +#ifdef QT_COMPILER_SUPPORTS_SSE4_1 + if (qCpuHasFeature(SSE4_1)) + qt_qimageScaleAARGBA_down_xy_sse4(isi, dest, dxx, dyy, dx, dy, dw, dh, dow, sow); + else +#endif + qt_qimageScaleAARGB_down_xy(isi, dest, dxx, dyy, dx, dy, dw, dh, dow, sow); + } +} + + +inline static void qt_qimageScaleAARGB_helper_x(const unsigned int *pix, int xap, int Cx, int &r, int &g, int &b) +{ + r = R_VAL(pix) * xap; + g = G_VAL(pix) * xap; + b = B_VAL(pix) * xap; + int j; + for (j = (1 << 14) - xap; j > Cx; j -= Cx ){ + pix++; + 
r += R_VAL(pix) * Cx; + g += G_VAL(pix) * Cx; + b += B_VAL(pix) * Cx; + } + pix++; + r += R_VAL(pix) * j; + g += G_VAL(pix) * j; + b += B_VAL(pix) * j; +} + +inline static void qt_qimageScaleAARGB_helper_y(const unsigned int *pix, int yap, int Cy, int sow, int &r, int &g, int &b) +{ + r = R_VAL(pix) * yap; + g = G_VAL(pix) * yap; + b = B_VAL(pix) * yap; + int j; + for (j = (1 << 14) - yap; j > Cy; j -= Cy ){ + pix += sow; + r += R_VAL(pix) * Cy; + g += G_VAL(pix) * Cy; + b += B_VAL(pix) * Cy; + } + pix += sow; + r += R_VAL(pix) * j; + g += G_VAL(pix) * j; + b += B_VAL(pix) * j; +} + +static void qt_qimageScaleAARGB_up_x_down_y(QImageScaleInfo *isi, unsigned int *dest, + int dxx, int dyy, int dx, int dy, int dw, + int dh, int dow, int sow) +{ + const unsigned int **ypoints = isi->ypoints; + int *xpoints = isi->xpoints; + int *xapoints = isi->xapoints; + int *yapoints = isi->yapoints; + + int end = dxx + dw; + + /* go through every scanline in the output buffer */ + for (int y = 0; y < dh; y++) { + int Cy = (yapoints[dyy + y]) >> 16; + int yap = (yapoints[dyy + y]) & 0xffff; + + unsigned int *dptr = dest + dx + ((y + dy) * dow); + for (int x = dxx; x < end; x++) { + const unsigned int *sptr = ypoints[dyy + y] + xpoints[x]; + int r, g, b; + qt_qimageScaleAARGB_helper_y(sptr, yap, Cy, sow, r, g, b); + + int xap = xapoints[x]; + if (xap > 0) { + int rr, bb, gg; + qt_qimageScaleAARGB_helper_y(sptr + 1, yap, Cy, sow, rr, gg, bb); + + r = r * (256 - xap); + g = g * (256 - xap); + b = b * (256 - xap); + r = (r + (rr * xap)) >> 8; + g = (g + (gg * xap)) >> 8; + b = (b + (bb * xap)) >> 8; } + *dptr++ = qRgb(r >> 14, g >> 14, b >> 14); } } } -#if 0 -static void qt_qimageScaleAARGBASetup(QImageScaleInfo *isi, unsigned int *dest, - int dxx, int dyy, int dx, int dy, int dw, - int dh, int dow, int sow) +static void qt_qimageScaleAARGB_down_x_up_y(QImageScaleInfo *isi, unsigned int *dest, + int dxx, int dyy, int dx, int dy, int dw, + int dh, int dow, int sow) { - 
qInitDrawhelperAsm(); - qt_qimageScaleAARGBA(isi, dest, dxx, dyy, dx, dy, dw, dh, dow, sow); + const unsigned int **ypoints = isi->ypoints; + int *xpoints = isi->xpoints; + int *xapoints = isi->xapoints; + int *yapoints = isi->yapoints; + + int end = dxx + dw; + + /* go through every scanline in the output buffer */ + for (int y = 0; y < dh; y++) { + unsigned int *dptr = dest + dx + ((y + dy) * dow); + for (int x = dxx; x < end; x++) { + int Cx = xapoints[x] >> 16; + int xap = xapoints[x] & 0xffff; + + const unsigned int *sptr = ypoints[dyy + y] + xpoints[x]; + int r, g, b; + qt_qimageScaleAARGB_helper_x(sptr, xap, Cx, r, g, b); + + int yap = yapoints[dyy + y]; + if (yap > 0) { + int rr, bb, gg; + qt_qimageScaleAARGB_helper_x(sptr + sow, xap, Cx, rr, gg, bb); + + r = r * (256 - yap); + g = g * (256 - yap); + b = b * (256 - yap); + r = (r + (rr * yap)) >> 8; + g = (g + (gg * yap)) >> 8; + b = (b + (bb * yap)) >> 8; + } + *dptr++ = qRgb(r >> 14, g >> 14, b >> 14); + } + } } -static void qt_qimageScaleAARGBSetup(QImageScaleInfo *isi, unsigned int *dest, - int dxx, int dyy, int dx, int dy, int dw, - int dh, int dow, int sow) +static void qt_qimageScaleAARGB_down_xy(QImageScaleInfo *isi, unsigned int *dest, + int dxx, int dyy, int dx, int dy, int dw, + int dh, int dow, int sow) { - qInitDrawhelperAsm(); - qt_qimageScaleAARGB(isi, dest, dxx, dyy, dx, dy, dw, dh, dow, sow); + const unsigned int **ypoints = isi->ypoints; + int *xpoints = isi->xpoints; + int *xapoints = isi->xapoints; + int *yapoints = isi->yapoints; + + int end = dxx + dw; + + for (int y = 0; y < dh; y++) { + int Cy = (yapoints[dyy + y]) >> 16; + int yap = (yapoints[dyy + y]) & 0xffff; + + unsigned int *dptr = dest + dx + ((y + dy) * dow); + for (int x = dxx; x < end; x++) { + int Cx = xapoints[x] >> 16; + int xap = xapoints[x] & 0xffff; + + const unsigned int *sptr = ypoints[dyy + y] + xpoints[x]; + int rx, gx, bx; + qt_qimageScaleAARGB_helper_x(sptr, xap, Cx, rx, gx, bx); + + int r = (rx >> 4) * yap; + 
int g = (gx >> 4) * yap; + int b = (bx >> 4) * yap; + + int j; + for (j = (1 << 14) - yap; j > Cy; j -= Cy) { + sptr += sow; + qt_qimageScaleAARGB_helper_x(sptr, xap, Cx, rx, gx, bx); + + r += (rx >> 4) * Cy; + g += (gx >> 4) * Cy; + b += (bx >> 4) * Cy; + } + sptr += sow; + qt_qimageScaleAARGB_helper_x(sptr, xap, Cx, rx, gx, bx); + + r += (rx >> 4) * j; + g += (gx >> 4) * j; + b += (bx >> 4) * j; + + *dptr = qRgb(r >> 24, g >> 24, b >> 24); + dptr++; + } + } } -#endif QImage qSmoothScaleImage(const QImage &src, int dw, int dh) { @@ -1012,7 +793,7 @@ QImage qSmoothScaleImage(const QImage &src, int dw, int dh) return QImage(); } - if (src.format() == QImage::Format_ARGB32_Premultiplied || src.format() == QImage::Format_RGBA8888_Premultiplied) + if (src.hasAlphaChannel()) qt_qimageScaleArgb(scaleinfo, (unsigned int *)buffer.scanLine(0), 0, 0, 0, 0, dw, dh, dw, src.bytesPerLine() / 4); else diff --git a/src/gui/painting/qimagescale_p.h b/src/gui/painting/qimagescale_p.h index 512ec6488e..c35aea451a 100644 --- a/src/gui/painting/qimagescale_p.h +++ b/src/gui/painting/qimagescale_p.h @@ -53,6 +53,15 @@ QT_BEGIN_NAMESPACE */ QImage qSmoothScaleImage(const QImage &img, int w, int h); +namespace QImageScale { + struct QImageScaleInfo { + int *xpoints; + const unsigned int **ypoints; + int *xapoints, *yapoints; + int xup_yup; + }; +} + QT_END_NAMESPACE #endif diff --git a/src/gui/painting/qimagescale_sse4.cpp b/src/gui/painting/qimagescale_sse4.cpp new file mode 100644 index 0000000000..565ea4daa1 --- /dev/null +++ b/src/gui/painting/qimagescale_sse4.cpp @@ -0,0 +1,247 @@ +/**************************************************************************** +** +** Copyright (C) 2015 The Qt Company Ltd. +** Contact: http://www.qt.io/licensing/ +** +** This file is part of the QtGui module of the Qt Toolkit. 
+** +** $QT_BEGIN_LICENSE:LGPL21$ +** Commercial License Usage +** Licensees holding valid commercial Qt licenses may use this file in +** accordance with the commercial license agreement provided with the +** Software or, alternatively, in accordance with the terms contained in +** a written agreement between you and The Qt Company. For licensing terms +** and conditions see http://www.qt.io/terms-conditions. For further +** information use the contact form at http://www.qt.io/contact-us. +** +** GNU Lesser General Public License Usage +** Alternatively, this file may be used under the terms of the GNU Lesser +** General Public License version 2.1 or version 3 as published by the Free +** Software Foundation and appearing in the file LICENSE.LGPLv21 and +** LICENSE.LGPLv3 included in the packaging of this file. Please review the +** following information to ensure the GNU Lesser General Public License +** requirements will be met: https://www.gnu.org/licenses/lgpl.html and +** http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html. +** +** As a special exception, The Qt Company gives you certain additional +** rights. These rights are described in The Qt Company LGPL Exception +** version 1.1, included in the file LGPL_EXCEPTION.txt in this package. 
+** +** $QT_END_LICENSE$ +** +****************************************************************************/ + +#include "qimagescale_p.h" +#include "qimage.h" +#include <private/qsimd_p.h> + +#if defined(QT_COMPILER_SUPPORTS_SSE4_1) + +QT_BEGIN_NAMESPACE + +using namespace QImageScale; + +inline static __m128i qt_qimageScaleAARGBA_helper_x(const unsigned int *pix, int xap, int Cx, const __m128i vxap, const __m128i vCx) +{ + __m128i vpix = _mm_cvtepu8_epi32(_mm_cvtsi32_si128(*pix)); + __m128i vx = _mm_mullo_epi32(vpix, vxap); + int i; + for (i = (1 << 14) - xap; i > Cx; i -= Cx) { + pix++; + vpix = _mm_cvtepu8_epi32(_mm_cvtsi32_si128(*pix)); + vx = _mm_add_epi32(vx, _mm_mullo_epi32(vpix, vCx)); + } + pix++; + vpix = _mm_cvtepu8_epi32(_mm_cvtsi32_si128(*pix)); + vx = _mm_add_epi32(vx, _mm_mullo_epi32(vpix, _mm_set1_epi32(i))); + return vx; +} + +inline static __m128i qt_qimageScaleAARGBA_helper_y(const unsigned int *pix, int yap, int Cy, int sow, const __m128i vyap, const __m128i vCy) +{ + __m128i vpix = _mm_cvtepu8_epi32(_mm_cvtsi32_si128(*pix)); + __m128i vx = _mm_mullo_epi32(vpix, vyap); + int i; + for (i = (1 << 14) - yap; i > Cy; i -= Cy) { + pix += sow; + vpix = _mm_cvtepu8_epi32(_mm_cvtsi32_si128(*pix)); + vx = _mm_add_epi32(vx, _mm_mullo_epi32(vpix, vCy)); + } + pix += sow; + vpix = _mm_cvtepu8_epi32(_mm_cvtsi32_si128(*pix)); + vx = _mm_add_epi32(vx, _mm_mullo_epi32(vpix, _mm_set1_epi32(i))); + return vx; +} + +template <bool RGB> +void qt_qimageScaleAARGBA_up_x_down_y_sse4(QImageScaleInfo *isi, unsigned int *dest, + int dxx, int dyy, int dx, int dy, + int dw, int dh, int dow, int sow) +{ + const unsigned int **ypoints = isi->ypoints; + int *xpoints = isi->xpoints; + int *xapoints = isi->xapoints; + int *yapoints = isi->yapoints; + + int end = dxx + dw; + + const __m128i v256 = _mm_set1_epi32(256); + + /* go through every scanline in the output buffer */ + for (int y = 0; y < dh; y++) { + int Cy = (yapoints[dyy + y]) >> 16; + int yap = (yapoints[dyy + y]) & 0xffff; + const __m128i vCy = 
_mm_set1_epi32(Cy); + const __m128i vyap = _mm_set1_epi32(yap); + + unsigned int *dptr = dest + dx + ((y + dy) * dow); + for (int x = dxx; x < end; x++) { + const unsigned int *sptr = ypoints[dyy + y] + xpoints[x]; + __m128i vx = qt_qimageScaleAARGBA_helper_y(sptr, yap, Cy, sow, vyap, vCy); + + int xap = xapoints[x]; + if (xap > 0) { + const __m128i vxap = _mm_set1_epi32(xap); + const __m128i vinvxap = _mm_sub_epi32(v256, vxap); + __m128i vr = qt_qimageScaleAARGBA_helper_y(sptr + 1, yap, Cy, sow, vyap, vCy); + + vx = _mm_mullo_epi32(vx, vinvxap); + vr = _mm_mullo_epi32(vr, vxap); + vx = _mm_add_epi32(vx, vr); + vx = _mm_srli_epi32(vx, 8); + } + vx = _mm_srli_epi32(vx, 14); + vx = _mm_packus_epi32(vx, _mm_setzero_si128()); + vx = _mm_packus_epi16(vx, _mm_setzero_si128()); + *dptr = _mm_cvtsi128_si32(vx); + if (RGB) + *dptr |= 0xff000000; + dptr++; + } + } +} + +template <bool RGB> +void qt_qimageScaleAARGBA_down_x_up_y_sse4(QImageScaleInfo *isi, unsigned int *dest, + int dxx, int dyy, int dx, int dy, + int dw, int dh, int dow, int sow) +{ + const unsigned int **ypoints = isi->ypoints; + int *xpoints = isi->xpoints; + int *xapoints = isi->xapoints; + int *yapoints = isi->yapoints; + + int end = dxx + dw; + + const __m128i v256 = _mm_set1_epi32(256); + + /* go through every scanline in the output buffer */ + for (int y = 0; y < dh; y++) { + unsigned int *dptr = dest + dx + ((y + dy) * dow); + for (int x = dxx; x < end; x++) { + int Cx = xapoints[x] >> 16; + int xap = xapoints[x] & 0xffff; + const __m128i vCx = _mm_set1_epi32(Cx); + const __m128i vxap = _mm_set1_epi32(xap); + + const unsigned int *sptr = ypoints[dyy + y] + xpoints[x]; + __m128i vx = qt_qimageScaleAARGBA_helper_x(sptr, xap, Cx, vxap, vCx); + + int yap = yapoints[dyy + y]; + if (yap > 0) { + const __m128i vyap = _mm_set1_epi32(yap); + const __m128i vinvyap = _mm_sub_epi32(v256, vyap); + __m128i vr = qt_qimageScaleAARGBA_helper_x(sptr + sow, xap, Cx, vxap, vCx); + + vx = _mm_mullo_epi32(vx, vinvyap); + vr = 
_mm_mullo_epi32(vr, vyap); + vx = _mm_add_epi32(vx, vr); + vx = _mm_srli_epi32(vx, 8); + } + vx = _mm_srli_epi32(vx, 14); + vx = _mm_packus_epi32(vx, _mm_setzero_si128()); + vx = _mm_packus_epi16(vx, _mm_setzero_si128()); + *dptr = _mm_cvtsi128_si32(vx); + if (RGB) + *dptr |= 0xff000000; + dptr++; + } + } +} + +template <bool RGB> +void qt_qimageScaleAARGBA_down_xy_sse4(QImageScaleInfo *isi, unsigned int *dest, + int dxx, int dyy, int dx, int dy, + int dw, int dh, int dow, int sow) +{ + const unsigned int **ypoints = isi->ypoints; + int *xpoints = isi->xpoints; + int *xapoints = isi->xapoints; + int *yapoints = isi->yapoints; + + for (int y = 0; y < dh; y++) { + int Cy = (yapoints[dyy + y]) >> 16; + int yap = (yapoints[dyy + y]) & 0xffff; + const __m128i vCy = _mm_set1_epi32(Cy); + const __m128i vyap = _mm_set1_epi32(yap); + + unsigned int *dptr = dest + dx + ((y + dy) * dow); + int end = dxx + dw; + for (int x = dxx; x < end; x++) { + const int Cx = xapoints[x] >> 16; + const int xap = xapoints[x] & 0xffff; + const __m128i vCx = _mm_set1_epi32(Cx); + const __m128i vxap = _mm_set1_epi32(xap); + + const unsigned int *sptr = ypoints[dyy + y] + xpoints[x]; + __m128i vx = qt_qimageScaleAARGBA_helper_x(sptr, xap, Cx, vxap, vCx); + __m128i vr = _mm_mullo_epi32(_mm_srli_epi32(vx, 4), vyap); + + int j; + for (j = (1 << 14) - yap; j > Cy; j -= Cy) { + sptr += sow; + vx = qt_qimageScaleAARGBA_helper_x(sptr, xap, Cx, vxap, vCx); + vr = _mm_add_epi32(vr, _mm_mullo_epi32(_mm_srli_epi32(vx, 4), vCy)); + } + sptr += sow; + vx = qt_qimageScaleAARGBA_helper_x(sptr, xap, Cx, vxap, vCx); + vr = _mm_add_epi32(vr, _mm_mullo_epi32(_mm_srli_epi32(vx, 4), _mm_set1_epi32(j))); + + vr = _mm_srli_epi32(vr, 24); + vr = _mm_packus_epi32(vr, _mm_setzero_si128()); + vr = _mm_packus_epi16(vr, _mm_setzero_si128()); + *dptr = _mm_cvtsi128_si32(vr); + if (RGB) + *dptr |= 0xff000000; + dptr++; + } + } +} + +template void qt_qimageScaleAARGBA_up_x_down_y_sse4<false>(QImageScaleInfo *isi, unsigned int *dest, + int dxx, 
int dyy, int dx, int dy, + int dw, int dh, int dow, int sow); + +template void qt_qimageScaleAARGBA_up_x_down_y_sse4<true>(QImageScaleInfo *isi, unsigned int *dest, + int dxx, int dyy, int dx, int dy, + int dw, int dh, int dow, int sow); + +template void qt_qimageScaleAARGBA_down_x_up_y_sse4<false>(QImageScaleInfo *isi, unsigned int *dest, + int dxx, int dyy, int dx, int dy, + int dw, int dh, int dow, int sow); + +template void qt_qimageScaleAARGBA_down_x_up_y_sse4<true>(QImageScaleInfo *isi, unsigned int *dest, + int dxx, int dyy, int dx, int dy, + int dw, int dh, int dow, int sow); + +template void qt_qimageScaleAARGBA_down_xy_sse4<false>(QImageScaleInfo *isi, unsigned int *dest, + int dxx, int dyy, int dx, int dy, + int dw, int dh, int dow, int sow); + +template void qt_qimageScaleAARGBA_down_xy_sse4<true>(QImageScaleInfo *isi, unsigned int *dest, + int dxx, int dyy, int dx, int dy, + int dw, int dh, int dow, int sow); + +QT_END_NAMESPACE + +#endif -- cgit v1.2.3