summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAllan Sandfeld Jensen <allan.jensen@digia.com>2014-02-04 17:33:52 +0100
committerThe Qt Project <gerrit-noreply@qt-project.org>2014-02-17 16:48:54 +0100
commit2e20d82de4ee26e862a541e3a7cf85ea54942a66 (patch)
treee3eebf9b0545aafa9b486a683ca8d5cad2d33168
parent824f080468b02ad0a82e42bbd120b55c0bce1769 (diff)
Optimize generic bilinear interpolation using SSE2
The drawing code currently only optimizes the bilinear interpolation under specific conditions that allows the optimizations used there. The patch adds a SSE2 version of the fallback 4 pixel interpolation. Change-Id: I4e8a2ba6cb44647105a9b24e38b3ab755a435050 Reviewed-by: Thiago Macieira <thiago.macieira@intel.com> Reviewed-by: Gunnar Sletta <gunnar.sletta@jollamobile.com>
-rw-r--r--src/gui/painting/qdrawhelper.cpp78
1 files changed, 45 insertions, 33 deletions
diff --git a/src/gui/painting/qdrawhelper.cpp b/src/gui/painting/qdrawhelper.cpp
index 928ff4e670..66481d4287 100644
--- a/src/gui/painting/qdrawhelper.cpp
+++ b/src/gui/painting/qdrawhelper.cpp
@@ -1217,7 +1217,7 @@ static inline uint interpolate_4_pixels_16(uint tl, uint tr, uint bl, uint br, i
{
uint distxy = distx * disty;
//idistx * disty = (16-distx) * disty = 16*disty - distxy
- //idistx * idisty = (16-distx) * (16-disty) = 16*16 - 16*distx -16*dity + distxy
+ //idistx * idisty = (16-distx) * (16-disty) = 16*16 - 16*distx -16*disty + distxy
uint tlrb = (tl & 0x00ff00ff) * (16*16 - 16*distx - 16*disty + distxy);
uint tlag = ((tl & 0xff00ff00) >> 8) * (16*16 - 16*distx - 16*disty + distxy);
uint trrb = ((tr & 0x00ff00ff) * (distx*16 - distxy));
@@ -1299,6 +1299,44 @@ static inline uint interpolate_4_pixels_16(uint tl, uint tr, uint bl, uint br, i
}
#endif
+#if defined(__SSE2__)
+static inline uint interpolate_4_pixels(uint tl, uint tr, uint bl, uint br, uint distx, uint disty)
+{
+ // First interpolate right and left pixels in parallel.
+ __m128i vl = _mm_unpacklo_epi32(_mm_cvtsi32_si128(tl), _mm_cvtsi32_si128(bl));
+ __m128i vr = _mm_unpacklo_epi32(_mm_cvtsi32_si128(tr), _mm_cvtsi32_si128(br));
+ vl = _mm_unpacklo_epi8(vl, _mm_setzero_si128());
+ vr = _mm_unpacklo_epi8(vr, _mm_setzero_si128());
+ vl = _mm_mullo_epi16(vl, _mm_set1_epi16(256 - distx));
+ vr = _mm_mullo_epi16(vr, _mm_set1_epi16(distx));
+ __m128i vtb = _mm_add_epi16(vl, vr);
+ vtb = _mm_srli_epi16(vtb, 8);
+ // vtb now contains the result of the first two interpolate calls vtb = unpacked((xbot << 64) | xtop)
+
+ // Now the last interpolate between top and bottom interpolations.
+ const __m128i vidisty = _mm_shufflelo_epi16(_mm_cvtsi32_si128(256 - disty), _MM_SHUFFLE(0, 0, 0, 0));
+ const __m128i vdisty = _mm_shufflelo_epi16(_mm_cvtsi32_si128(disty), _MM_SHUFFLE(0, 0, 0, 0));
+ const __m128i vmuly = _mm_unpacklo_epi16(vidisty, vdisty);
+ vtb = _mm_unpacklo_epi16(vtb, _mm_srli_si128(vtb, 8));
+ // vtb now contains the colors of top and bottom interleaved { ta, ba, tr, br, tg, bg, tb, bb }
+ vtb = _mm_madd_epi16(vtb, vmuly); // Multiply and horizontal add.
+ vtb = _mm_srli_epi32(vtb, 8);
+ vtb = _mm_packs_epi32(vtb, _mm_setzero_si128());
+ vtb = _mm_packus_epi16(vtb, _mm_setzero_si128());
+ return _mm_cvtsi128_si32(vtb);
+}
+#else
+static inline uint interpolate_4_pixels(uint tl, uint tr, uint bl, uint br, uint distx, uint disty)
+{
+ uint idistx = 256 - distx;
+ uint idisty = 256 - disty;
+ uint xtop = INTERPOLATE_PIXEL_256(tl, idistx, tr, distx);
+ uint xbot = INTERPOLATE_PIXEL_256(bl, idistx, br, distx);
+ return INTERPOLATE_PIXEL_256(xtop, idisty, xbot, disty);
+}
+#endif
+
+
template<TextureBlendType blendType>
void fetchTransformedBilinear_pixelBounds(int max, int l1, int l2, int &v1, int &v2);
@@ -1503,7 +1541,6 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c
const uint *s1 = (const uint *)data->texture.scanLine(y1);
const uint *s2 = (const uint *)data->texture.scanLine(y2);
int disty = (fy & 0x0000ffff) >> 8;
- int idisty = 256 - disty;
while (b < end) {
int x1 = (fx >> 16);
int x2;
@@ -1512,13 +1549,8 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c
uint tr = s1[x2];
uint bl = s2[x1];
uint br = s2[x2];
-
int distx = (fx & 0x0000ffff) >> 8;
- int idistx = 256 - distx;
-
- uint xtop = INTERPOLATE_PIXEL_256(tl, idistx, tr, distx);
- uint xbot = INTERPOLATE_PIXEL_256(bl, idistx, br, distx);
- *b = INTERPOLATE_PIXEL_256(xtop, idisty, xbot, disty);
+ *b = interpolate_4_pixels(tl, tr, bl, br, distx, disty);
fx += fdx;
++b;
@@ -1682,12 +1714,8 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c
int distx = (fx & 0x0000ffff) >> 8;
int disty = (fy & 0x0000ffff) >> 8;
- int idistx = 256 - distx;
- int idisty = 256 - disty;
- uint xtop = INTERPOLATE_PIXEL_256(tl, idistx, tr, distx);
- uint xbot = INTERPOLATE_PIXEL_256(bl, idistx, br, distx);
- *b = INTERPOLATE_PIXEL_256(xtop, idisty, xbot, disty);
+ *b = interpolate_4_pixels(tl, tr, bl, br, distx, disty);
fx += fdx;
fy += fdy;
@@ -1744,8 +1772,6 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c
int distx = int((px - x1) * 256);
int disty = int((py - y1) * 256);
- int idistx = 256 - distx;
- int idisty = 256 - disty;
fetchTransformedBilinear_pixelBounds<blendType>(image_width, image_x1, image_x2, x1, x2);
fetchTransformedBilinear_pixelBounds<blendType>(image_height, image_y1, image_y2, y1, y2);
@@ -1758,9 +1784,7 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c
uint bl = s2[x1];
uint br = s2[x2];
- uint xtop = INTERPOLATE_PIXEL_256(tl, idistx, tr, distx);
- uint xbot = INTERPOLATE_PIXEL_256(bl, idistx, br, distx);
- *b = INTERPOLATE_PIXEL_256(xtop, idisty, xbot, disty);
+ *b = interpolate_4_pixels(tl, tr, bl, br, distx, disty);
fx += fdx;
fy += fdy;
@@ -1932,17 +1956,13 @@ static const uint *QT_FASTCALL fetchTransformedBilinear(uint *buffer, const Oper
if ((fdx < 0 && fdx > -(fixed_scale / 8)) || fabs(data->m22) < (1./8.)) { // scale up more than 8x
int disty = (fy & 0x0000ffff) >> 8;
- int idisty = 256 - disty;
for (int i = 0; i < len; ++i) {
uint tl = buf1[i * 2 + 0];
uint tr = buf1[i * 2 + 1];
uint bl = buf2[i * 2 + 0];
uint br = buf2[i * 2 + 1];
int distx = (fracX & 0x0000ffff) >> 8;
- int idistx = 256 - distx;
- uint xtop = INTERPOLATE_PIXEL_256(tl, idistx, tr, distx);
- uint xbot = INTERPOLATE_PIXEL_256(bl, idistx, br, distx);
- b[i] = INTERPOLATE_PIXEL_256(xtop, idisty, xbot, disty);
+ b[i] = interpolate_4_pixels(tl, tr, bl, br, distx, disty);
fracX += fdx;
}
} else { //scale down
@@ -2003,12 +2023,8 @@ static const uint *QT_FASTCALL fetchTransformedBilinear(uint *buffer, const Oper
int distx = (fracX & 0x0000ffff) >> 8;
int disty = (fracY & 0x0000ffff) >> 8;
- int idistx = 256 - distx;
- int idisty = 256 - disty;
- uint xtop = INTERPOLATE_PIXEL_256(tl, idistx, tr, distx);
- uint xbot = INTERPOLATE_PIXEL_256(bl, idistx, br, distx);
- b[i] = INTERPOLATE_PIXEL_256(xtop, idisty, xbot, disty);
+ b[i] = interpolate_4_pixels(tl, tr, bl, br, distx, disty);
fracX += fdx;
fracY += fdy;
}
@@ -2090,17 +2106,13 @@ static const uint *QT_FASTCALL fetchTransformedBilinear(uint *buffer, const Oper
for (int i = 0; i < len; ++i) {
int distx = distxs[i];
int disty = distys[i];
- int idistx = 256 - distx;
- int idisty = 256 - disty;
uint tl = buf1[i * 2 + 0];
uint tr = buf1[i * 2 + 1];
uint bl = buf2[i * 2 + 0];
uint br = buf2[i * 2 + 1];
- uint xtop = INTERPOLATE_PIXEL_256(tl, idistx, tr, distx);
- uint xbot = INTERPOLATE_PIXEL_256(bl, idistx, br, distx);
- b[i] = INTERPOLATE_PIXEL_256(xtop, idisty, xbot, disty);
+ b[i] = interpolate_4_pixels(tl, tr, bl, br, distx, disty);
}
length -= len;
b += len;