From 2e20d82de4ee26e862a541e3a7cf85ea54942a66 Mon Sep 17 00:00:00 2001 From: Allan Sandfeld Jensen Date: Tue, 4 Feb 2014 17:33:52 +0100 Subject: Optimize generic bilinear interpolation using SSE2 The drawing code currently only optimizes the bilinear interpolation under specific conditions that allows the optimizations used there. The patch adds a SSE2 version of the fallback 4 pixel interpolation. Change-Id: I4e8a2ba6cb44647105a9b24e38b3ab755a435050 Reviewed-by: Thiago Macieira Reviewed-by: Gunnar Sletta --- src/gui/painting/qdrawhelper.cpp | 78 +++++++++++++++++++++++----------------- 1 file changed, 45 insertions(+), 33 deletions(-) (limited to 'src/gui/painting') diff --git a/src/gui/painting/qdrawhelper.cpp b/src/gui/painting/qdrawhelper.cpp index 928ff4e670..66481d4287 100644 --- a/src/gui/painting/qdrawhelper.cpp +++ b/src/gui/painting/qdrawhelper.cpp @@ -1217,7 +1217,7 @@ static inline uint interpolate_4_pixels_16(uint tl, uint tr, uint bl, uint br, i { uint distxy = distx * disty; //idistx * disty = (16-distx) * disty = 16*disty - distxy - //idistx * idisty = (16-distx) * (16-disty) = 16*16 - 16*distx -16*dity + distxy + //idistx * idisty = (16-distx) * (16-disty) = 16*16 - 16*distx -16*disty + distxy uint tlrb = (tl & 0x00ff00ff) * (16*16 - 16*distx - 16*disty + distxy); uint tlag = ((tl & 0xff00ff00) >> 8) * (16*16 - 16*distx - 16*disty + distxy); uint trrb = ((tr & 0x00ff00ff) * (distx*16 - distxy)); @@ -1299,6 +1299,44 @@ static inline uint interpolate_4_pixels_16(uint tl, uint tr, uint bl, uint br, i } #endif +#if defined(__SSE2__) +static inline uint interpolate_4_pixels(uint tl, uint tr, uint bl, uint br, uint distx, uint disty) +{ + // First interpolate right and left pixels in parallel. + __m128i vl = _mm_unpacklo_epi32(_mm_cvtsi32_si128(tl), _mm_cvtsi32_si128(bl)); + __m128i vr = _mm_unpacklo_epi32(_mm_cvtsi32_si128(tr), _mm_cvtsi32_si128(br)); + vl = _mm_unpacklo_epi8(vl, _mm_setzero_si128()); + vr = _mm_unpacklo_epi8(vr, _mm_setzero_si128()); + vl = _mm_mullo_epi16(vl, _mm_set1_epi16(256 - distx)); + vr = _mm_mullo_epi16(vr, _mm_set1_epi16(distx)); + __m128i vtb = _mm_add_epi16(vl, vr); + vtb = _mm_srli_epi16(vtb, 8); + // vtb now contains the result of the first two interpolate calls vtb = unpacked((xbot << 64) | xtop) + + // Now the last interpolate between top and bottom interpolations. + const __m128i vidisty = _mm_shufflelo_epi16(_mm_cvtsi32_si128(256 - disty), _MM_SHUFFLE(0, 0, 0, 0)); + const __m128i vdisty = _mm_shufflelo_epi16(_mm_cvtsi32_si128(disty), _MM_SHUFFLE(0, 0, 0, 0)); + const __m128i vmuly = _mm_unpacklo_epi16(vidisty, vdisty); + vtb = _mm_unpacklo_epi16(vtb, _mm_srli_si128(vtb, 8)); + // vtb now contains the colors of top and bottom interleaved { ta, ba, tr, br, tg, bg, tb, bb } + vtb = _mm_madd_epi16(vtb, vmuly); // Multiply and horizontal add. + vtb = _mm_srli_epi32(vtb, 8); + vtb = _mm_packs_epi32(vtb, _mm_setzero_si128()); + vtb = _mm_packus_epi16(vtb, _mm_setzero_si128()); + return _mm_cvtsi128_si32(vtb); +} +#else +static inline uint interpolate_4_pixels(uint tl, uint tr, uint bl, uint br, uint distx, uint disty) +{ + uint idistx = 256 - distx; + uint idisty = 256 - disty; + uint xtop = INTERPOLATE_PIXEL_256(tl, idistx, tr, distx); + uint xbot = INTERPOLATE_PIXEL_256(bl, idistx, br, distx); + return INTERPOLATE_PIXEL_256(xtop, idisty, xbot, disty); +} +#endif + + template void fetchTransformedBilinear_pixelBounds(int max, int l1, int l2, int &v1, int &v2); @@ -1503,7 +1541,6 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c const uint *s1 = (const uint *)data->texture.scanLine(y1); const uint *s2 = (const uint *)data->texture.scanLine(y2); int disty = (fy & 0x0000ffff) >> 8; - int idisty = 256 - disty; while (b < end) { int x1 = (fx >> 16); int x2; @@ -1512,13 +1549,8 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c uint tr = s1[x2]; uint bl = s2[x1]; uint br = s2[x2]; - int distx = (fx & 0x0000ffff) >> 8; - int idistx = 256 - distx; - - uint xtop = INTERPOLATE_PIXEL_256(tl, idistx, tr, distx); - uint xbot = INTERPOLATE_PIXEL_256(bl, idistx, br, distx); - *b = INTERPOLATE_PIXEL_256(xtop, idisty, xbot, disty); + *b = interpolate_4_pixels(tl, tr, bl, br, distx, disty); fx += fdx; ++b; @@ -1682,12 +1714,8 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c int distx = (fx & 0x0000ffff) >> 8; int disty = (fy & 0x0000ffff) >> 8; - int idistx = 256 - distx; - int idisty = 256 - disty; - uint xtop = INTERPOLATE_PIXEL_256(tl, idistx, tr, distx); - uint xbot = INTERPOLATE_PIXEL_256(bl, idistx, br, distx); - *b = INTERPOLATE_PIXEL_256(xtop, idisty, xbot, disty); + *b = interpolate_4_pixels(tl, tr, bl, br, distx, disty); fx += fdx; fy += fdy; @@ -1744,8 +1772,6 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c int distx = int((px - x1) * 256); int disty = int((py - y1) * 256); - int idistx = 256 - distx; - int idisty = 256 - disty; fetchTransformedBilinear_pixelBounds(image_width, image_x1, image_x2, x1, x2); fetchTransformedBilinear_pixelBounds(image_height, image_y1, image_y2, y1, y2); @@ -1758,9 +1784,7 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c uint bl = s2[x1]; uint br = s2[x2]; - uint xtop = INTERPOLATE_PIXEL_256(tl, idistx, tr, distx); - uint xbot = INTERPOLATE_PIXEL_256(bl, idistx, br, distx); - *b = INTERPOLATE_PIXEL_256(xtop, idisty, xbot, disty); + *b = interpolate_4_pixels(tl, tr, bl, br, distx, disty); fx += fdx; fy += fdy; @@ -1932,17 +1956,13 @@ static const uint *QT_FASTCALL fetchTransformedBilinear(uint *buffer, const Oper if ((fdx < 0 && fdx > -(fixed_scale / 8)) || fabs(data->m22) < (1./8.)) { // scale up more than 8x int disty = (fy & 0x0000ffff) >> 8; - int idisty = 256 - disty; for (int i = 0; i < len; ++i) { uint tl = buf1[i * 2 + 0]; uint tr = buf1[i * 2 + 1]; uint bl = buf2[i * 2 + 0]; uint br = buf2[i * 2 + 1]; int distx = (fracX & 0x0000ffff) >> 8; - int idistx = 256 - distx; - uint xtop = INTERPOLATE_PIXEL_256(tl, idistx, tr, distx); - uint xbot = INTERPOLATE_PIXEL_256(bl, idistx, br, distx); - b[i] = INTERPOLATE_PIXEL_256(xtop, idisty, xbot, disty); + b[i] = interpolate_4_pixels(tl, tr, bl, br, distx, disty); fracX += fdx; } } else { //scale down @@ -2003,12 +2023,8 @@ static const uint *QT_FASTCALL fetchTransformedBilinear(uint *buffer, const Oper int distx = (fracX & 0x0000ffff) >> 8; int disty = (fracY & 0x0000ffff) >> 8; - int idistx = 256 - distx; - int idisty = 256 - disty; - uint xtop = INTERPOLATE_PIXEL_256(tl, idistx, tr, distx); - uint xbot = INTERPOLATE_PIXEL_256(bl, idistx, br, distx); - b[i] = INTERPOLATE_PIXEL_256(xtop, idisty, xbot, disty); + b[i] = interpolate_4_pixels(tl, tr, bl, br, distx, disty); fracX += fdx; fracY += fdy; } @@ -2090,17 +2106,13 @@ static const uint *QT_FASTCALL fetchTransformedBilinear(uint *buffer, const Oper for (int i = 0; i < len; ++i) { int distx = distxs[i]; int disty = distys[i]; - int idistx = 256 - distx; - int idisty = 256 - disty; uint tl = buf1[i * 2 + 0]; uint tr = buf1[i * 2 + 1]; uint bl = buf2[i * 2 + 0]; uint br = buf2[i * 2 + 1]; - uint xtop = INTERPOLATE_PIXEL_256(tl, idistx, tr, distx); - uint xbot = INTERPOLATE_PIXEL_256(bl, idistx, br, distx); - b[i] = INTERPOLATE_PIXEL_256(xtop, idisty, xbot, disty); + b[i] = interpolate_4_pixels(tl, tr, bl, br, distx, disty); } length -= len; b += len; -- cgit v1.2.3