author     Allan Sandfeld Jensen <allan.jensen@qt.io>    2016-11-25 17:54:49 +0100
committer  Allan Sandfeld Jensen <allan.jensen@qt.io>    2017-01-05 11:16:54 +0000
commit     c5282fc185caee86a3e35499763006c224118185 (patch)
tree       7ef4620dee155d68c17d92c1a5efe0fe63a5b0be
parent     dea91776759b117137435874ee42b68983a47807 (diff)
Split fetchTransformedBilinearARGB32PM
Split out the fast paths of fetchTransformedBilinearARGB32PM, so each can
be more easily read on its own, and to prepare for future AVX2 versions.

Change-Id: I1e9842a8928689823bf6d7d8a352625ed4b4b6c5
Reviewed-by: Eirik Aavitsland <eirik.aavitsland@qt.io>
-rw-r--r--  src/gui/painting/qdrawhelper.cpp | 1157
1 files changed, 645 insertions, 512 deletions
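
The commit's core pattern: each fast path of the monolithic function becomes a standalone helper, and a table indexed by [tiled][transform type] picks one at dispatch time, so individual paths can later be swapped for AVX2 variants. A minimal, self-contained C++ sketch of that pattern follows; every name in it is an illustrative stand-in, not the Qt internals (the real helpers also take the output span and QTextureData, see bilinearFastTransformHelperARGB32PM in the diff below).

// Sketch of the [tiled][transform-type] dispatch table this commit
// introduces. Names and signatures are simplified stand-ins.
#include <cstdio>

enum FastTransformTypes {
    SimpleUpscaleTransform,
    UpscaleTransform,
    DownscaleTransform,
    RotateTransform,
    FastRotateTransform,
    NFastTransformTypes
};

// The real helpers also receive the output span and the texture; this
// sketch keeps only the 16.16 fixed-point coordinates and increments.
typedef void (*BilinearHelper)(int fx, int fy, int fdx, int fdy);

template<bool tiled>
static void simpleUpscale(int fx, int, int fdx, int)
{ std::printf("simple upscale, tiled=%d, fx=%d, fdx=%d\n", tiled, fx, fdx); }
template<bool tiled>
static void upscale(int, int, int, int) { std::printf("upscale, tiled=%d\n", tiled); }
template<bool tiled>
static void downscale(int, int, int, int) { std::printf("downscale, tiled=%d\n", tiled); }
template<bool tiled>
static void rotate(int, int, int, int) { std::printf("rotate, tiled=%d\n", tiled); }
template<bool tiled>
static void fastRotate(int, int, int, int) { std::printf("fast rotate, tiled=%d\n", tiled); }

// One row per blend type, one column per fast path; each cell is an
// independently replaceable (e.g. by a future AVX2 build) function.
static const BilinearHelper helpers[2][NFastTransformTypes] = {
    { simpleUpscale<false>, upscale<false>, downscale<false>, rotate<false>, fastRotate<false> },
    { simpleUpscale<true>,  upscale<true>,  downscale<true>,  rotate<true>,  fastRotate<true>  }
};

int main()
{
    const int fixed_scale = 1 << 16; // 16.16 fixed point, as in qdrawhelper.cpp
    helpers[0][SimpleUpscaleTransform](fixed_scale / 2, 0, fixed_scale / 2, 0);
    helpers[1][FastRotateTransform](0, 0, fixed_scale, fixed_scale);
}

Table dispatch keeps the per-span selection logic in one place while each helper stays short enough to read, and to vectorize, on its own.
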
diff --git a/src/gui/painting/qdrawhelper.cpp b/src/gui/painting/qdrawhelper.cpp
index 772291b22b..8a9f8b8bdc 100644
--- a/src/gui/painting/qdrawhelper.cpp
+++ b/src/gui/painting/qdrawhelper.cpp
@@ -1919,562 +1919,695 @@ inline void fetchTransformedBilinear_pixelBounds<BlendTransformedBilinear>(int,
Q_ASSERT(v2 >= l1 && v2 <= l2);
}
-template<TextureBlendType blendType> /* blendType = BlendTransformedBilinear or BlendTransformedBilinearTiled */
-static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, const Operator *,
- const QSpanData *data, int y, int x,
- int length)
-{
- int image_width = data->texture.width;
- int image_height = data->texture.height;
-
- int image_x1 = data->texture.x1;
- int image_y1 = data->texture.y1;
- int image_x2 = data->texture.x2 - 1;
- int image_y2 = data->texture.y2 - 1;
-
- const qreal cx = x + qreal(0.5);
- const qreal cy = y + qreal(0.5);
-
- uint *end = buffer + length;
- uint *b = buffer;
- if (data->fast_matrix) {
- // The increment pr x in the scanline
- int fdx = (int)(data->m11 * fixed_scale);
- int fdy = (int)(data->m12 * fixed_scale);
-
- int fx = int((data->m21 * cy
- + data->m11 * cx + data->dx) * fixed_scale);
- int fy = int((data->m22 * cy
- + data->m12 * cx + data->dy) * fixed_scale);
-
- fx -= half_point;
- fy -= half_point;
-
- if (fdy == 0) { //simple scale, no rotation
- int y1 = (fy >> 16);
- int y2;
- fetchTransformedBilinear_pixelBounds<blendType>(image_height, image_y1, image_y2, y1, y2);
- const uint *s1 = (const uint *)data->texture.scanLine(y1);
- const uint *s2 = (const uint *)data->texture.scanLine(y2);
+enum FastTransformTypes {
+ SimpleUpscaleTransform,
+ UpscaleTransform,
+ DownscaleTransform,
+ RotateTransform,
+ FastRotateTransform,
+ NFastTransformTypes
+};
- if (fdx <= fixed_scale && fdx > 0) { // scale up on X
- int disty = (fy & 0x0000ffff) >> 8;
- int idisty = 256 - disty;
- int x = fx >> 16;
+typedef void (QT_FASTCALL *BilinearFastTransformHelper)(uint *b, uint *end, const QTextureData &image, int &fx, int &fy, int fdx, int fdy);
- // The idea is first to do the interpolation between the row s1 and the row s2
- // into an intermediate buffer, then we interpolate between two pixel of this buffer.
-
- // intermediate_buffer[0] is a buffer of red-blue component of the pixel, in the form 0x00RR00BB
- // intermediate_buffer[1] is the alpha-green component of the pixel, in the form 0x00AA00GG
- // +1 for the last pixel to interpolate with, and +1 for rounding errors.
- quint32 intermediate_buffer[2][buffer_size + 2];
- // count is the size used in the intermediate_buffer.
- int count = (qint64(length) * fdx + fixed_scale - 1) / fixed_scale + 2;
- Q_ASSERT(count <= buffer_size + 2); //length is supposed to be <= buffer_size and data->m11 < 1 in this case
- int f = 0;
- int lim = count;
- if (blendType == BlendTransformedBilinearTiled) {
- x %= image_width;
- if (x < 0) x += image_width;
- } else {
- lim = qMin(count, image_x2-x+1);
- if (x < image_x1) {
- Q_ASSERT(x <= image_x2);
- uint t = s1[image_x1];
- uint b = s2[image_x1];
- quint32 rb = (((t & 0xff00ff) * idisty + (b & 0xff00ff) * disty) >> 8) & 0xff00ff;
- quint32 ag = ((((t>>8) & 0xff00ff) * idisty + ((b>>8) & 0xff00ff) * disty) >> 8) & 0xff00ff;
- do {
- intermediate_buffer[0][f] = rb;
- intermediate_buffer[1][f] = ag;
- f++;
- x++;
- } while (x < image_x1 && f < lim);
- }
- }
+template<TextureBlendType blendType>
+static void QT_FASTCALL fetchTransformedBilinearARGB32PM_simple_upscale_helper(uint *b, uint *end, const QTextureData &image,
+ int &fx, int &fy, int fdx, int /*fdy*/)
+{
+ int y1 = (fy >> 16);
+ int y2;
+ fetchTransformedBilinear_pixelBounds<blendType>(image.height, image.y1, image.y2 - 1, y1, y2);
+ const uint *s1 = (const uint *)image.scanLine(y1);
+ const uint *s2 = (const uint *)image.scanLine(y2);
+
+ int disty = (fy & 0x0000ffff) >> 8;
+ int idisty = 256 - disty;
+ int x = fx >> 16;
+ int length = end - b;
+
+ // The idea is first to do the interpolation between the row s1 and the row s2
+ // into an intermediate buffer, then we interpolate between two pixels of this buffer.
+
+ // intermediate_buffer[0] is a buffer of red-blue component of the pixel, in the form 0x00RR00BB
+ // intermediate_buffer[1] is the alpha-green component of the pixel, in the form 0x00AA00GG
+ // +1 for the last pixel to interpolate with, and +1 for rounding errors.
+ quint32 intermediate_buffer[2][buffer_size + 2];
+ // count is the size used in the intermediate_buffer.
+ int count = (qint64(length) * fdx + fixed_scale - 1) / fixed_scale + 2;
+ Q_ASSERT(count <= buffer_size + 2); //length is supposed to be <= buffer_size and data->m11 < 1 in this case
+ int f = 0;
+ int lim = count;
+ if (blendType == BlendTransformedBilinearTiled) {
+ x %= image.width;
+ if (x < 0) x += image.width;
+ } else {
+ lim = qMin(count, image.x2 - x);
+ if (x < image.x1) {
+ Q_ASSERT(x < image.x2);
+ uint t = s1[image.x1];
+ uint b = s2[image.x1];
+ quint32 rb = (((t & 0xff00ff) * idisty + (b & 0xff00ff) * disty) >> 8) & 0xff00ff;
+ quint32 ag = ((((t>>8) & 0xff00ff) * idisty + ((b>>8) & 0xff00ff) * disty) >> 8) & 0xff00ff;
+ do {
+ intermediate_buffer[0][f] = rb;
+ intermediate_buffer[1][f] = ag;
+ f++;
+ x++;
+ } while (x < image.x1 && f < lim);
+ }
+ }
- if (blendType != BlendTransformedBilinearTiled) {
+ if (blendType != BlendTransformedBilinearTiled) {
#if defined(__SSE2__)
- const __m128i disty_ = _mm_set1_epi16(disty);
- const __m128i idisty_ = _mm_set1_epi16(idisty);
- const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
-
- lim -= 3;
- for (; f < lim; x += 4, f += 4) {
- // Load 4 pixels from s1, and split the alpha-green and red-blue component
- __m128i top = _mm_loadu_si128((const __m128i*)((const uint *)(s1)+x));
- __m128i topAG = _mm_srli_epi16(top, 8);
- __m128i topRB = _mm_and_si128(top, colorMask);
- // Multiplies each colour component by idisty
- topAG = _mm_mullo_epi16 (topAG, idisty_);
- topRB = _mm_mullo_epi16 (topRB, idisty_);
-
- // Same for the s2 vector
- __m128i bottom = _mm_loadu_si128((const __m128i*)((const uint *)(s2)+x));
- __m128i bottomAG = _mm_srli_epi16(bottom, 8);
- __m128i bottomRB = _mm_and_si128(bottom, colorMask);
- bottomAG = _mm_mullo_epi16 (bottomAG, disty_);
- bottomRB = _mm_mullo_epi16 (bottomRB, disty_);
-
- // Add the values, and shift to only keep 8 significant bits per colors
- __m128i rAG =_mm_add_epi16(topAG, bottomAG);
- rAG = _mm_srli_epi16(rAG, 8);
- _mm_storeu_si128((__m128i*)(&intermediate_buffer[1][f]), rAG);
- __m128i rRB =_mm_add_epi16(topRB, bottomRB);
- rRB = _mm_srli_epi16(rRB, 8);
- _mm_storeu_si128((__m128i*)(&intermediate_buffer[0][f]), rRB);
- }
+ const __m128i disty_ = _mm_set1_epi16(disty);
+ const __m128i idisty_ = _mm_set1_epi16(idisty);
+ const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
+
+ lim -= 3;
+ for (; f < lim; x += 4, f += 4) {
+ // Load 4 pixels from s1, and split the alpha-green and red-blue component
+ __m128i top = _mm_loadu_si128((const __m128i*)((const uint *)(s1)+x));
+ __m128i topAG = _mm_srli_epi16(top, 8);
+ __m128i topRB = _mm_and_si128(top, colorMask);
+ // Multiplies each color component by idisty
+ topAG = _mm_mullo_epi16 (topAG, idisty_);
+ topRB = _mm_mullo_epi16 (topRB, idisty_);
+
+ // Same for the s2 vector
+ __m128i bottom = _mm_loadu_si128((const __m128i*)((const uint *)(s2)+x));
+ __m128i bottomAG = _mm_srli_epi16(bottom, 8);
+ __m128i bottomRB = _mm_and_si128(bottom, colorMask);
+ bottomAG = _mm_mullo_epi16 (bottomAG, disty_);
+ bottomRB = _mm_mullo_epi16 (bottomRB, disty_);
+
+ // Add the values, and shift to only keep 8 significant bits per colors
+ __m128i rAG =_mm_add_epi16(topAG, bottomAG);
+ rAG = _mm_srli_epi16(rAG, 8);
+ _mm_storeu_si128((__m128i*)(&intermediate_buffer[1][f]), rAG);
+ __m128i rRB =_mm_add_epi16(topRB, bottomRB);
+ rRB = _mm_srli_epi16(rRB, 8);
+ _mm_storeu_si128((__m128i*)(&intermediate_buffer[0][f]), rRB);
+ }
#elif defined(__ARM_NEON__)
- const int16x8_t disty_ = vdupq_n_s16(disty);
- const int16x8_t idisty_ = vdupq_n_s16(idisty);
- const int16x8_t colorMask = vdupq_n_s16(0x00ff);
-
- lim -= 3;
- for (; f < lim; x += 4, f += 4) {
- // Load 4 pixels from s1, and split the alpha-green and red-blue component
- int16x8_t top = vld1q_s16((int16_t*)((const uint *)(s1)+x));
- int16x8_t topAG = vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(top), 8));
- int16x8_t topRB = vandq_s16(top, colorMask);
- // Multiplies each colour component by idisty
- topAG = vmulq_s16(topAG, idisty_);
- topRB = vmulq_s16(topRB, idisty_);
-
- // Same for the s2 vector
- int16x8_t bottom = vld1q_s16((int16_t*)((const uint *)(s2)+x));
- int16x8_t bottomAG = vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(bottom), 8));
- int16x8_t bottomRB = vandq_s16(bottom, colorMask);
- bottomAG = vmulq_s16(bottomAG, disty_);
- bottomRB = vmulq_s16(bottomRB, disty_);
-
- // Add the values, and shift to only keep 8 significant bits per colors
- int16x8_t rAG = vaddq_s16(topAG, bottomAG);
- rAG = vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(rAG), 8));
- vst1q_s16((int16_t*)(&intermediate_buffer[1][f]), rAG);
- int16x8_t rRB = vaddq_s16(topRB, bottomRB);
- rRB = vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(rRB), 8));
- vst1q_s16((int16_t*)(&intermediate_buffer[0][f]), rRB);
- }
+ const int16x8_t disty_ = vdupq_n_s16(disty);
+ const int16x8_t idisty_ = vdupq_n_s16(idisty);
+ const int16x8_t colorMask = vdupq_n_s16(0x00ff);
+
+ lim -= 3;
+ for (; f < lim; x += 4, f += 4) {
+ // Load 4 pixels from s1, and split the alpha-green and red-blue component
+ int16x8_t top = vld1q_s16((int16_t*)((const uint *)(s1)+x));
+ int16x8_t topAG = vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(top), 8));
+ int16x8_t topRB = vandq_s16(top, colorMask);
+ // Multiplies each color component by idisty
+ topAG = vmulq_s16(topAG, idisty_);
+ topRB = vmulq_s16(topRB, idisty_);
+
+ // Same for the s2 vector
+ int16x8_t bottom = vld1q_s16((int16_t*)((const uint *)(s2)+x));
+ int16x8_t bottomAG = vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(bottom), 8));
+ int16x8_t bottomRB = vandq_s16(bottom, colorMask);
+ bottomAG = vmulq_s16(bottomAG, disty_);
+ bottomRB = vmulq_s16(bottomRB, disty_);
+
+ // Add the values, and shift to only keep 8 significant bits per colors
+ int16x8_t rAG = vaddq_s16(topAG, bottomAG);
+ rAG = vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(rAG), 8));
+ vst1q_s16((int16_t*)(&intermediate_buffer[1][f]), rAG);
+ int16x8_t rRB = vaddq_s16(topRB, bottomRB);
+ rRB = vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(rRB), 8));
+ vst1q_s16((int16_t*)(&intermediate_buffer[0][f]), rRB);
+ }
#endif
- }
- for (; f < count; f++) { // Same as above but without sse2
- if (blendType == BlendTransformedBilinearTiled) {
- if (x >= image_width) x -= image_width;
- } else {
- x = qMin(x, image_x2);
- }
+ }
+ for (; f < count; f++) { // Same as above but without simd
+ if (blendType == BlendTransformedBilinearTiled) {
+ if (x >= image.width) x -= image.width;
+ } else {
+ x = qMin(x, image.x2 - 1);
+ }
- uint t = s1[x];
- uint b = s2[x];
+ uint t = s1[x];
+ uint b = s2[x];
- intermediate_buffer[0][f] = (((t & 0xff00ff) * idisty + (b & 0xff00ff) * disty) >> 8) & 0xff00ff;
- intermediate_buffer[1][f] = ((((t>>8) & 0xff00ff) * idisty + ((b>>8) & 0xff00ff) * disty) >> 8) & 0xff00ff;
- x++;
- }
- // Now interpolate the values from the intermediate_buffer to get the final result.
- fx &= fixed_scale - 1;
- Q_ASSERT((fx >> 16) == 0);
- while (b < end) {
- int x1 = (fx >> 16);
- int x2 = x1 + 1;
- Q_ASSERT(x1 >= 0);
- Q_ASSERT(x2 < count);
+ intermediate_buffer[0][f] = (((t & 0xff00ff) * idisty + (b & 0xff00ff) * disty) >> 8) & 0xff00ff;
+ intermediate_buffer[1][f] = ((((t>>8) & 0xff00ff) * idisty + ((b>>8) & 0xff00ff) * disty) >> 8) & 0xff00ff;
+ x++;
+ }
+ // Now interpolate the values from the intermediate_buffer to get the final result.
+ fx &= fixed_scale - 1;
+ Q_ASSERT((fx >> 16) == 0);
+ while (b < end) {
+ int x1 = (fx >> 16);
+ int x2 = x1 + 1;
+ Q_ASSERT(x1 >= 0);
+ Q_ASSERT(x2 < count);
+
+ int distx = (fx & 0x0000ffff) >> 8;
+ int idistx = 256 - distx;
+ int rb = ((intermediate_buffer[0][x1] * idistx + intermediate_buffer[0][x2] * distx) >> 8) & 0xff00ff;
+ int ag = (intermediate_buffer[1][x1] * idistx + intermediate_buffer[1][x2] * distx) & 0xff00ff00;
+ *b = rb | ag;
+ b++;
+ fx += fdx;
+ }
+}
- int distx = (fx & 0x0000ffff) >> 8;
- int idistx = 256 - distx;
- int rb = ((intermediate_buffer[0][x1] * idistx + intermediate_buffer[0][x2] * distx) >> 8) & 0xff00ff;
- int ag = (intermediate_buffer[1][x1] * idistx + intermediate_buffer[1][x2] * distx) & 0xff00ff00;
- *b = rb | ag;
- b++;
- fx += fdx;
- }
- } else if ((fdx < 0 && fdx > -(fixed_scale / 8)) || std::abs(data->m22) < (1./8.)) { // scale up more than 8x
- int y1 = (fy >> 16);
- int y2;
- fetchTransformedBilinear_pixelBounds<blendType>(image_height, image_y1, image_y2, y1, y2);
- const uint *s1 = (const uint *)data->texture.scanLine(y1);
- const uint *s2 = (const uint *)data->texture.scanLine(y2);
- int disty = (fy & 0x0000ffff) >> 8;
- while (b < end) {
- int x1 = (fx >> 16);
- int x2;
- fetchTransformedBilinear_pixelBounds<blendType>(image_width, image_x1, image_x2, x1, x2);
- uint tl = s1[x1];
- uint tr = s1[x2];
- uint bl = s2[x1];
- uint br = s2[x2];
- int distx = (fx & 0x0000ffff) >> 8;
- *b = interpolate_4_pixels(tl, tr, bl, br, distx, disty);
+template<TextureBlendType blendType>
+static void QT_FASTCALL fetchTransformedBilinearARGB32PM_upscale_helper(uint *b, uint *end, const QTextureData &image,
+ int &fx, int &fy, int fdx, int /*fdy*/)
+{
+ int y1 = (fy >> 16);
+ int y2;
+ fetchTransformedBilinear_pixelBounds<blendType>(image.height, image.y1, image.y2 - 1, y1, y2);
+ const uint *s1 = (const uint *)image.scanLine(y1);
+ const uint *s2 = (const uint *)image.scanLine(y2);
+ const int disty = (fy & 0x0000ffff) >> 8;
+
+ if (blendType != BlendTransformedBilinearTiled) {
+ const qint64 min_fx = qint64(image.x1) * fixed_scale;
+ const qint64 max_fx = qint64(image.x2 - 1) * fixed_scale;
+ while (b < end) {
+ int x1 = (fx >> 16);
+ int x2;
+ fetchTransformedBilinear_pixelBounds<blendType>(image.width, image.x1, image.x2 - 1, x1, x2);
+ if (x1 != x2)
+ break;
+ uint top = s1[x1];
+ uint bot = s2[x1];
+ *b = INTERPOLATE_PIXEL_256(top, 256 - disty, bot, disty);
+ fx += fdx;
+ ++b;
+ }
+ uint *boundedEnd = end;
+ if (fdx > 0)
+ boundedEnd = qMin(boundedEnd, b + (max_fx - fx) / fdx);
+ else if (fdx < 0)
+ boundedEnd = qMin(boundedEnd, b + (min_fx - fx) / fdx);
+
+ // A fast middle part without boundary checks
+ while (b < boundedEnd) {
+ int x = (fx >> 16);
+ int distx = (fx & 0x0000ffff) >> 8;
+ *b = interpolate_4_pixels(s1 + x, s2 + x, distx, disty);
+ fx += fdx;
+ ++b;
+ }
+ }
- fx += fdx;
- ++b;
- }
- } else { //scale down
- int y1 = (fy >> 16);
- int y2;
- fetchTransformedBilinear_pixelBounds<blendType>(image_height, image_y1, image_y2, y1, y2);
- const uint *s1 = (const uint *)data->texture.scanLine(y1);
- const uint *s2 = (const uint *)data->texture.scanLine(y2);
- const int disty8 = (fy & 0x0000ffff) >> 8;
- const int disty4 = (disty8 + 0x08) >> 4;
-
- if (blendType != BlendTransformedBilinearTiled) {
-#define BILINEAR_DOWNSCALE_BOUNDS_PROLOG \
- const qint64 min_fx = qint64(image_x1) * fixed_scale; \
- const qint64 max_fx = qint64(image_x2) * fixed_scale; \
- while (b < end) { \
- int x1 = (fx >> 16); \
- int x2; \
- fetchTransformedBilinear_pixelBounds<blendType>(image_width, image_x1, image_x2, x1, x2); \
- if (x1 != x2) \
- break; \
- uint top = s1[x1]; \
- uint bot = s2[x1]; \
- *b = INTERPOLATE_PIXEL_256(top, 256 - disty8, bot, disty8); \
- fx += fdx; \
- ++b; \
- } \
- uint *boundedEnd = end; \
- if (fdx > 0) \
- boundedEnd = qMin(boundedEnd, b + (max_fx - fx) / fdx); \
- else if (fdx < 0) \
- boundedEnd = qMin(boundedEnd, b + (min_fx - fx) / fdx); \
- boundedEnd -= 3;
+ while (b < end) {
+ int x1 = (fx >> 16);
+ int x2;
+ fetchTransformedBilinear_pixelBounds<blendType>(image.width, image.x1, image.x2 - 1, x1, x2);
+ uint tl = s1[x1];
+ uint tr = s1[x2];
+ uint bl = s2[x1];
+ uint br = s2[x2];
+ int distx = (fx & 0x0000ffff) >> 8;
+ *b = interpolate_4_pixels(tl, tr, bl, br, distx, disty);
+
+ fx += fdx;
+ ++b;
+ }
+}
+template<TextureBlendType blendType>
+static void QT_FASTCALL fetchTransformedBilinearARGB32PM_downscale_helper(uint *b, uint *end, const QTextureData &image,
+ int &fx, int &fy, int fdx, int /*fdy*/)
+{
+ int y1 = (fy >> 16);
+ int y2;
+ fetchTransformedBilinear_pixelBounds<blendType>(image.height, image.y1, image.y2 - 1, y1, y2);
+ const uint *s1 = (const uint *)image.scanLine(y1);
+ const uint *s2 = (const uint *)image.scanLine(y2);
+ const int disty8 = (fy & 0x0000ffff) >> 8;
+ const int disty4 = (disty8 + 0x08) >> 4;
+
+ if (blendType != BlendTransformedBilinearTiled) {
+ const qint64 min_fx = qint64(image.x1) * fixed_scale;
+ const qint64 max_fx = qint64(image.x2 - 1) * fixed_scale;
+ while (b < end) {
+ int x1 = (fx >> 16);
+ int x2;
+ fetchTransformedBilinear_pixelBounds<blendType>(image.width, image.x1, image.x2 - 1, x1, x2);
+ if (x1 != x2)
+ break;
+ uint top = s1[x1];
+ uint bot = s2[x1];
+ *b = INTERPOLATE_PIXEL_256(top, 256 - disty8, bot, disty8);
+ fx += fdx;
+ ++b;
+ }
+ uint *boundedEnd = end;
+ if (fdx > 0)
+ boundedEnd = qMin(boundedEnd, b + (max_fx - fx) / fdx);
+ else if (fdx < 0)
+ boundedEnd = qMin(boundedEnd, b + (min_fx - fx) / fdx);
+ // A fast middle part without boundary checks
#if defined(__SSE2__)
- BILINEAR_DOWNSCALE_BOUNDS_PROLOG
-
- const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
- const __m128i v_256 = _mm_set1_epi16(256);
- const __m128i v_disty = _mm_set1_epi16(disty4);
- const __m128i v_fdx = _mm_set1_epi32(fdx*4);
- const __m128i v_fx_r = _mm_set1_epi32(0x8);
- __m128i v_fx = _mm_setr_epi32(fx, fx + fdx, fx + fdx + fdx, fx + fdx + fdx + fdx);
-
- while (b < boundedEnd) {
- __m128i offset = _mm_srli_epi32(v_fx, 16);
- const int offset0 = _mm_cvtsi128_si32(offset); offset = _mm_srli_si128(offset, 4);
- const int offset1 = _mm_cvtsi128_si32(offset); offset = _mm_srli_si128(offset, 4);
- const int offset2 = _mm_cvtsi128_si32(offset); offset = _mm_srli_si128(offset, 4);
- const int offset3 = _mm_cvtsi128_si32(offset);
- const __m128i tl = _mm_setr_epi32(s1[offset0], s1[offset1], s1[offset2], s1[offset3]);
- const __m128i tr = _mm_setr_epi32(s1[offset0 + 1], s1[offset1 + 1], s1[offset2 + 1], s1[offset3 + 1]);
- const __m128i bl = _mm_setr_epi32(s2[offset0], s2[offset1], s2[offset2], s2[offset3]);
- const __m128i br = _mm_setr_epi32(s2[offset0 + 1], s2[offset1 + 1], s2[offset2 + 1], s2[offset3 + 1]);
-
- __m128i v_distx = _mm_srli_epi16(v_fx, 8);
- v_distx = _mm_srli_epi16(_mm_add_epi32(v_distx, v_fx_r), 4);
- v_distx = _mm_shufflehi_epi16(v_distx, _MM_SHUFFLE(2,2,0,0));
- v_distx = _mm_shufflelo_epi16(v_distx, _MM_SHUFFLE(2,2,0,0));
-
- interpolate_4_pixels_16_sse2(tl, tr, bl, br, v_distx, v_disty, colorMask, v_256, b);
- b += 4;
- v_fx = _mm_add_epi32(v_fx, v_fdx);
- }
- fx = _mm_cvtsi128_si32(v_fx);
+ const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
+ const __m128i v_256 = _mm_set1_epi16(256);
+ const __m128i v_disty = _mm_set1_epi16(disty4);
+ const __m128i v_fdx = _mm_set1_epi32(fdx*4);
+ const __m128i v_fx_r = _mm_set1_epi32(0x8);
+ __m128i v_fx = _mm_setr_epi32(fx, fx + fdx, fx + fdx + fdx, fx + fdx + fdx + fdx);
+
+ while (b < boundedEnd - 3) {
+ __m128i offset = _mm_srli_epi32(v_fx, 16);
+ const int offset0 = _mm_cvtsi128_si32(offset); offset = _mm_srli_si128(offset, 4);
+ const int offset1 = _mm_cvtsi128_si32(offset); offset = _mm_srli_si128(offset, 4);
+ const int offset2 = _mm_cvtsi128_si32(offset); offset = _mm_srli_si128(offset, 4);
+ const int offset3 = _mm_cvtsi128_si32(offset);
+ const __m128i tl = _mm_setr_epi32(s1[offset0], s1[offset1], s1[offset2], s1[offset3]);
+ const __m128i tr = _mm_setr_epi32(s1[offset0 + 1], s1[offset1 + 1], s1[offset2 + 1], s1[offset3 + 1]);
+ const __m128i bl = _mm_setr_epi32(s2[offset0], s2[offset1], s2[offset2], s2[offset3]);
+ const __m128i br = _mm_setr_epi32(s2[offset0 + 1], s2[offset1 + 1], s2[offset2 + 1], s2[offset3 + 1]);
+
+ __m128i v_distx = _mm_srli_epi16(v_fx, 8);
+ v_distx = _mm_srli_epi16(_mm_add_epi32(v_distx, v_fx_r), 4);
+ v_distx = _mm_shufflehi_epi16(v_distx, _MM_SHUFFLE(2,2,0,0));
+ v_distx = _mm_shufflelo_epi16(v_distx, _MM_SHUFFLE(2,2,0,0));
+
+ interpolate_4_pixels_16_sse2(tl, tr, bl, br, v_distx, v_disty, colorMask, v_256, b);
+ b += 4;
+ v_fx = _mm_add_epi32(v_fx, v_fdx);
+ }
+ fx = _mm_cvtsi128_si32(v_fx);
#elif defined(__ARM_NEON__)
- BILINEAR_DOWNSCALE_BOUNDS_PROLOG
+ const int16x8_t colorMask = vdupq_n_s16(0x00ff);
+ const int16x8_t invColorMask = vmvnq_s16(colorMask);
+ const int16x8_t v_256 = vdupq_n_s16(256);
+ const int16x8_t v_disty = vdupq_n_s16(disty4);
+ const int16x8_t v_disty_ = vshlq_n_s16(v_disty, 4);
+ int32x4_t v_fdx = vdupq_n_s32(fdx*4);
- const int16x8_t colorMask = vdupq_n_s16(0x00ff);
- const int16x8_t invColorMask = vmvnq_s16(colorMask);
- const int16x8_t v_256 = vdupq_n_s16(256);
- const int16x8_t v_disty = vdupq_n_s16(disty4);
- const int16x8_t v_disty_ = vshlq_n_s16(v_disty, 4);
- int32x4_t v_fdx = vdupq_n_s32(fdx*4);
+ int32x4_t v_fx = vmovq_n_s32(fx);
+ v_fx = vsetq_lane_s32(fx + fdx, v_fx, 1);
+ v_fx = vsetq_lane_s32(fx + fdx * 2, v_fx, 2);
+ v_fx = vsetq_lane_s32(fx + fdx * 3, v_fx, 3);
- int32x4_t v_fx = vmovq_n_s32(fx);
- v_fx = vsetq_lane_s32(fx + fdx, v_fx, 1);
- v_fx = vsetq_lane_s32(fx + fdx * 2, v_fx, 2);
- v_fx = vsetq_lane_s32(fx + fdx * 3, v_fx, 3);
+ const int32x4_t v_ffff_mask = vdupq_n_s32(0x0000ffff);
+ const int32x4_t v_fx_r = vdupq_n_s32(0x0800);
- const int32x4_t v_ffff_mask = vdupq_n_s32(0x0000ffff);
- const int32x4_t v_fx_r = vdupq_n_s32(0x0800);
+ while (b < boundedEnd - 3) {
+ uint32x4x2_t v_top, v_bot;
- while (b < boundedEnd) {
- uint32x4x2_t v_top, v_bot;
-
- int x1 = (fx >> 16);
- fx += fdx;
- v_top = vld2q_lane_u32(s1 + x1, v_top, 0);
- v_bot = vld2q_lane_u32(s2 + x1, v_bot, 0);
- x1 = (fx >> 16);
- fx += fdx;
- v_top = vld2q_lane_u32(s1 + x1, v_top, 1);
- v_bot = vld2q_lane_u32(s2 + x1, v_bot, 1);
- x1 = (fx >> 16);
- fx += fdx;
- v_top = vld2q_lane_u32(s1 + x1, v_top, 2);
- v_bot = vld2q_lane_u32(s2 + x1, v_bot, 2);
- x1 = (fx >> 16);
- fx += fdx;
- v_top = vld2q_lane_u32(s1 + x1, v_top, 3);
- v_bot = vld2q_lane_u32(s2 + x1, v_bot, 3);
-
- int32x4_t v_distx = vshrq_n_s32(vaddq_s32(vandq_s32(v_fx, v_ffff_mask), v_fx_r), 12);
- v_distx = vorrq_s32(v_distx, vshlq_n_s32(v_distx, 16));
-
- interpolate_4_pixels_16_neon(
- vreinterpretq_s16_u32(v_top.val[0]), vreinterpretq_s16_u32(v_top.val[1]),
- vreinterpretq_s16_u32(v_bot.val[0]), vreinterpretq_s16_u32(v_bot.val[1]),
- vreinterpretq_s16_s32(v_distx), v_disty, v_disty_,
- colorMask, invColorMask, v_256, b);
- b+=4;
- v_fx = vaddq_s32(v_fx, v_fdx);
- }
+ int x1 = (fx >> 16);
+ fx += fdx;
+ v_top = vld2q_lane_u32(s1 + x1, v_top, 0);
+ v_bot = vld2q_lane_u32(s2 + x1, v_bot, 0);
+ x1 = (fx >> 16);
+ fx += fdx;
+ v_top = vld2q_lane_u32(s1 + x1, v_top, 1);
+ v_bot = vld2q_lane_u32(s2 + x1, v_bot, 1);
+ x1 = (fx >> 16);
+ fx += fdx;
+ v_top = vld2q_lane_u32(s1 + x1, v_top, 2);
+ v_bot = vld2q_lane_u32(s2 + x1, v_bot, 2);
+ x1 = (fx >> 16);
+ fx += fdx;
+ v_top = vld2q_lane_u32(s1 + x1, v_top, 3);
+ v_bot = vld2q_lane_u32(s2 + x1, v_bot, 3);
+
+ int32x4_t v_distx = vshrq_n_s32(vaddq_s32(vandq_s32(v_fx, v_ffff_mask), v_fx_r), 12);
+ v_distx = vorrq_s32(v_distx, vshlq_n_s32(v_distx, 16));
+
+ interpolate_4_pixels_16_neon(
+ vreinterpretq_s16_u32(v_top.val[0]), vreinterpretq_s16_u32(v_top.val[1]),
+ vreinterpretq_s16_u32(v_bot.val[0]), vreinterpretq_s16_u32(v_bot.val[1]),
+ vreinterpretq_s16_s32(v_distx), v_disty, v_disty_,
+ colorMask, invColorMask, v_256, b);
+ b+=4;
+ v_fx = vaddq_s32(v_fx, v_fdx);
+ }
#endif
- }
+ while (b < boundedEnd) {
+ int x = (fx >> 16);
+#if defined(__SSE2__) || defined(__ARM_NEON__)
+ int distx8 = (fx & 0x0000ffff) >> 8;
+ *b = interpolate_4_pixels(s1 + x, s2 + x, distx8, disty8);
+#else
+ uint tl = s1[x];
+ uint tr = s1[x + 1];
+ uint bl = s2[x];
+ uint br = s2[x + 1];
+ int distx4 = ((fx & 0x0000ffff) + 0x0800) >> 12;
+ *b = interpolate_4_pixels_16(tl, tr, bl, br, distx4, disty4);
+#endif
+ fx += fdx;
+ ++b;
+ }
+ }
- while (b < end) {
- int x1 = (fx >> 16);
- int x2;
- fetchTransformedBilinear_pixelBounds<blendType>(image_width, image_x1, image_x2, x1, x2);
- uint tl = s1[x1];
- uint tr = s1[x2];
- uint bl = s2[x1];
- uint br = s2[x2];
+ while (b < end) {
+ int x1 = (fx >> 16);
+ int x2;
+ fetchTransformedBilinear_pixelBounds<blendType>(image.width, image.x1, image.x2 - 1, x1, x2);
+ uint tl = s1[x1];
+ uint tr = s1[x2];
+ uint bl = s2[x1];
+ uint br = s2[x2];
#if defined(__SSE2__) || defined(__ARM_NEON__)
- // The optimized interpolate_4_pixels are faster than interpolate_4_pixels_16.
- int distx8 = (fx & 0x0000ffff) >> 8;
- *b = interpolate_4_pixels(tl, tr, bl, br, distx8, disty8);
+ // The optimized interpolate_4_pixels are faster than interpolate_4_pixels_16.
+ int distx8 = (fx & 0x0000ffff) >> 8;
+ *b = interpolate_4_pixels(tl, tr, bl, br, distx8, disty8);
#else
- int distx4 = ((fx & 0x0000ffff) + 0x0800) >> 12;
- *b = interpolate_4_pixels_16(tl, tr, bl, br, distx4, disty4);
+ int distx4 = ((fx & 0x0000ffff) + 0x0800) >> 12;
+ *b = interpolate_4_pixels_16(tl, tr, bl, br, distx4, disty4);
#endif
- fx += fdx;
- ++b;
- }
- }
- } else { //rotation
- if (std::abs(data->m11) < (1./8.) || std::abs(data->m22) < (1./8.)) {
- //if we are zooming more than 8 times, we use 8bit precision for the position.
- while (b < end) {
- int x1 = (fx >> 16);
- int x2;
- int y1 = (fy >> 16);
- int y2;
+ fx += fdx;
+ ++b;
+ }
+}
- fetchTransformedBilinear_pixelBounds<blendType>(image_width, image_x1, image_x2, x1, x2);
- fetchTransformedBilinear_pixelBounds<blendType>(image_height, image_y1, image_y2, y1, y2);
+template<TextureBlendType blendType>
+static void QT_FASTCALL fetchTransformedBilinearARGB32PM_rotate_helper(uint *b, uint *end, const QTextureData &image,
+ int &fx, int &fy, int fdx, int fdy)
+{
+ // if we are zooming more than 8 times, we use 8bit precision for the position.
+ while (b < end) {
+ int x1 = (fx >> 16);
+ int x2;
+ int y1 = (fy >> 16);
+ int y2;
- const uint *s1 = (const uint *)data->texture.scanLine(y1);
- const uint *s2 = (const uint *)data->texture.scanLine(y2);
+ fetchTransformedBilinear_pixelBounds<blendType>(image.width, image.x1, image.x2 - 1, x1, x2);
+ fetchTransformedBilinear_pixelBounds<blendType>(image.height, image.y1, image.y2 - 1, y1, y2);
- uint tl = s1[x1];
- uint tr = s1[x2];
- uint bl = s2[x1];
- uint br = s2[x2];
+ const uint *s1 = (const uint *)image.scanLine(y1);
+ const uint *s2 = (const uint *)image.scanLine(y2);
- int distx = (fx & 0x0000ffff) >> 8;
- int disty = (fy & 0x0000ffff) >> 8;
+ uint tl = s1[x1];
+ uint tr = s1[x2];
+ uint bl = s2[x1];
+ uint br = s2[x2];
- *b = interpolate_4_pixels(tl, tr, bl, br, distx, disty);
+ int distx = (fx & 0x0000ffff) >> 8;
+ int disty = (fy & 0x0000ffff) >> 8;
- fx += fdx;
- fy += fdy;
- ++b;
- }
- } else {
- //we are zooming less than 8x, use 4bit precision
-
- if (blendType != BlendTransformedBilinearTiled) {
-#define BILINEAR_ROTATE_BOUNDS_PROLOG \
- const qint64 min_fx = qint64(image_x1) * fixed_scale; \
- const qint64 max_fx = qint64(image_x2) * fixed_scale; \
- const qint64 min_fy = qint64(image_y1) * fixed_scale; \
- const qint64 max_fy = qint64(image_y2) * fixed_scale; \
- while (b < end) { \
- int x1 = (fx >> 16); \
- int x2; \
- int y1 = (fy >> 16); \
- int y2; \
- fetchTransformedBilinear_pixelBounds<blendType>(image_width, image_x1, image_x2, x1, x2); \
- fetchTransformedBilinear_pixelBounds<blendType>(image_height, image_y1, image_y2, y1, y2); \
- if (x1 != x2 && y1 != y2) \
- break; \
- const uint *s1 = (const uint *)data->texture.scanLine(y1); \
- const uint *s2 = (const uint *)data->texture.scanLine(y2); \
- uint tl = s1[x1]; \
- uint tr = s1[x2]; \
- uint bl = s2[x1]; \
- uint br = s2[x2]; \
- int distx = (fx & 0x0000ffff) >> 8; \
- int disty = (fy & 0x0000ffff) >> 8; \
- *b = interpolate_4_pixels(tl, tr, bl, br, distx, disty); \
- fx += fdx; \
- fy += fdy; \
- ++b; \
- } \
- uint *boundedEnd = end; \
- if (fdx > 0) \
- boundedEnd = qMin(boundedEnd, b + (max_fx - fx) / fdx); \
- else if (fdx < 0) \
- boundedEnd = qMin(boundedEnd, b + (min_fx - fx) / fdx); \
- if (fdy > 0) \
- boundedEnd = qMin(boundedEnd, b + (max_fy - fy) / fdy); \
- else if (fdy < 0) \
- boundedEnd = qMin(boundedEnd, b + (min_fy - fy) / fdy); \
- boundedEnd -= 3;
+ *b = interpolate_4_pixels(tl, tr, bl, br, distx, disty);
-#if defined(__SSE2__)
- BILINEAR_ROTATE_BOUNDS_PROLOG
+ fx += fdx;
+ fy += fdy;
+ ++b;
+ }
+}
- const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
- const __m128i v_256 = _mm_set1_epi16(256);
- const __m128i v_fdx = _mm_set1_epi32(fdx*4);
- const __m128i v_fdy = _mm_set1_epi32(fdy*4);
- const __m128i v_fxy_r = _mm_set1_epi32(0x8);
- __m128i v_fx = _mm_setr_epi32(fx, fx + fdx, fx + fdx + fdx, fx + fdx + fdx + fdx);
- __m128i v_fy = _mm_setr_epi32(fy, fy + fdy, fy + fdy + fdy, fy + fdy + fdy + fdy);
+template<TextureBlendType blendType>
+static void QT_FASTCALL fetchTransformedBilinearARGB32PM_fast_rotate_helper(uint *b, uint *end, const QTextureData &image,
+ int &fx, int &fy, int fdx, int fdy)
+{
+ //we are zooming less than 8x, use 4bit precision
+ if (blendType != BlendTransformedBilinearTiled) {
+ const qint64 min_fx = qint64(image.x1) * fixed_scale;
+ const qint64 max_fx = qint64(image.x2 - 1) * fixed_scale;
+ const qint64 min_fy = qint64(image.y1) * fixed_scale;
+ const qint64 max_fy = qint64(image.y2 - 1) * fixed_scale;
+ // first handle the possibly bounded part in the beginning
+ while (b < end) {
+ int x1 = (fx >> 16);
+ int x2;
+ int y1 = (fy >> 16);
+ int y2;
+ fetchTransformedBilinear_pixelBounds<blendType>(image.width, image.x1, image.x2 - 1, x1, x2);
+ fetchTransformedBilinear_pixelBounds<blendType>(image.height, image.y1, image.y2 - 1, y1, y2);
+ if (x1 != x2 && y1 != y2)
+ break;
+ const uint *s1 = (const uint *)image.scanLine(y1);
+ const uint *s2 = (const uint *)image.scanLine(y2);
+ uint tl = s1[x1];
+ uint tr = s1[x2];
+ uint bl = s2[x1];
+ uint br = s2[x2];
+#if defined(__SSE2__) || defined(__ARM_NEON__)
+ int distx = (fx & 0x0000ffff) >> 8;
+ int disty = (fy & 0x0000ffff) >> 8;
+ *b = interpolate_4_pixels(tl, tr, bl, br, distx, disty);
+#else
+ int distx = ((fx & 0x0000ffff) + 0x0800) >> 12;
+ int disty = ((fy & 0x0000ffff) + 0x0800) >> 12;
+ *b = interpolate_4_pixels_16(tl, tr, bl, br, distx, disty);
+#endif
+ fx += fdx;
+ fy += fdy;
+ ++b;
+ }
+ uint *boundedEnd = end;
+ if (fdx > 0)
+ boundedEnd = qMin(boundedEnd, b + (max_fx - fx) / fdx);
+ else if (fdx < 0)
+ boundedEnd = qMin(boundedEnd, b + (min_fx - fx) / fdx);
+ if (fdy > 0)
+ boundedEnd = qMin(boundedEnd, b + (max_fy - fy) / fdy);
+ else if (fdy < 0)
+ boundedEnd = qMin(boundedEnd, b + (min_fy - fy) / fdy);
+
+ // until boundedEnd we can now have a fast middle part without boundary checks
+#if defined(__SSE2__)
+ const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
+ const __m128i v_256 = _mm_set1_epi16(256);
+ const __m128i v_fdx = _mm_set1_epi32(fdx*4);
+ const __m128i v_fdy = _mm_set1_epi32(fdy*4);
+ const __m128i v_fxy_r = _mm_set1_epi32(0x8);
+ __m128i v_fx = _mm_setr_epi32(fx, fx + fdx, fx + fdx + fdx, fx + fdx + fdx + fdx);
+ __m128i v_fy = _mm_setr_epi32(fy, fy + fdy, fy + fdy + fdy, fy + fdy + fdy + fdy);
+
+ const uchar *textureData = image.imageData;
+ const int bytesPerLine = image.bytesPerLine;
+ const __m128i vbpl = _mm_shufflelo_epi16(_mm_cvtsi32_si128(bytesPerLine/4), _MM_SHUFFLE(0, 0, 0, 0));
+
+ while (b < boundedEnd - 3) {
+ const __m128i vy = _mm_packs_epi32(_mm_srli_epi32(v_fy, 16), _mm_setzero_si128());
+ // 4x16bit * 4x16bit -> 4x32bit
+ __m128i offset = _mm_unpacklo_epi16(_mm_mullo_epi16(vy, vbpl), _mm_mulhi_epi16(vy, vbpl));
+ offset = _mm_add_epi32(offset, _mm_srli_epi32(v_fx, 16));
+ const int offset0 = _mm_cvtsi128_si32(offset); offset = _mm_srli_si128(offset, 4);
+ const int offset1 = _mm_cvtsi128_si32(offset); offset = _mm_srli_si128(offset, 4);
+ const int offset2 = _mm_cvtsi128_si32(offset); offset = _mm_srli_si128(offset, 4);
+ const int offset3 = _mm_cvtsi128_si32(offset);
+ const uint *topData = (const uint *)(textureData);
+ const __m128i tl = _mm_setr_epi32(topData[offset0], topData[offset1], topData[offset2], topData[offset3]);
+ const __m128i tr = _mm_setr_epi32(topData[offset0 + 1], topData[offset1 + 1], topData[offset2 + 1], topData[offset3 + 1]);
+ const uint *bottomData = (const uint *)(textureData + bytesPerLine);
+ const __m128i bl = _mm_setr_epi32(bottomData[offset0], bottomData[offset1], bottomData[offset2], bottomData[offset3]);
+ const __m128i br = _mm_setr_epi32(bottomData[offset0 + 1], bottomData[offset1 + 1], bottomData[offset2 + 1], bottomData[offset3 + 1]);
+
+ __m128i v_distx = _mm_srli_epi16(v_fx, 8);
+ __m128i v_disty = _mm_srli_epi16(v_fy, 8);
+ v_distx = _mm_srli_epi16(_mm_add_epi32(v_distx, v_fxy_r), 4);
+ v_disty = _mm_srli_epi16(_mm_add_epi32(v_disty, v_fxy_r), 4);
+ v_distx = _mm_shufflehi_epi16(v_distx, _MM_SHUFFLE(2,2,0,0));
+ v_distx = _mm_shufflelo_epi16(v_distx, _MM_SHUFFLE(2,2,0,0));
+ v_disty = _mm_shufflehi_epi16(v_disty, _MM_SHUFFLE(2,2,0,0));
+ v_disty = _mm_shufflelo_epi16(v_disty, _MM_SHUFFLE(2,2,0,0));
+
+ interpolate_4_pixels_16_sse2(tl, tr, bl, br, v_distx, v_disty, colorMask, v_256, b);
+ b += 4;
+ v_fx = _mm_add_epi32(v_fx, v_fdx);
+ v_fy = _mm_add_epi32(v_fy, v_fdy);
+ }
+ fx = _mm_cvtsi128_si32(v_fx);
+ fy = _mm_cvtsi128_si32(v_fy);
+#elif defined(__ARM_NEON__)
+ const int16x8_t colorMask = vdupq_n_s16(0x00ff);
+ const int16x8_t invColorMask = vmvnq_s16(colorMask);
+ const int16x8_t v_256 = vdupq_n_s16(256);
+ int32x4_t v_fdx = vdupq_n_s32(fdx * 4);
+ int32x4_t v_fdy = vdupq_n_s32(fdy * 4);
+
+ const uchar *textureData = image.imageData;
+ const int bytesPerLine = image.bytesPerLine;
+
+ int32x4_t v_fx = vmovq_n_s32(fx);
+ int32x4_t v_fy = vmovq_n_s32(fy);
+ v_fx = vsetq_lane_s32(fx + fdx, v_fx, 1);
+ v_fy = vsetq_lane_s32(fy + fdy, v_fy, 1);
+ v_fx = vsetq_lane_s32(fx + fdx * 2, v_fx, 2);
+ v_fy = vsetq_lane_s32(fy + fdy * 2, v_fy, 2);
+ v_fx = vsetq_lane_s32(fx + fdx * 3, v_fx, 3);
+ v_fy = vsetq_lane_s32(fy + fdy * 3, v_fy, 3);
+
+ const int32x4_t v_ffff_mask = vdupq_n_s32(0x0000ffff);
+ const int32x4_t v_round = vdupq_n_s32(0x0800);
+
+ while (b < boundedEnd - 3) {
+ uint32x4x2_t v_top, v_bot;
+
+ int x1 = (fx >> 16);
+ int y1 = (fy >> 16);
+ fx += fdx; fy += fdy;
+ const uchar *sl = textureData + bytesPerLine * y1;
+ const uint *s1 = reinterpret_cast<const uint *>(sl);
+ const uint *s2 = reinterpret_cast<const uint *>(sl + bytesPerLine);
+ v_top = vld2q_lane_u32(s1 + x1, v_top, 0);
+ v_bot = vld2q_lane_u32(s2 + x1, v_bot, 0);
+ x1 = (fx >> 16);
+ y1 = (fy >> 16);
+ fx += fdx; fy += fdy;
+ sl = textureData + bytesPerLine * y1;
+ s1 = reinterpret_cast<const uint *>(sl);
+ s2 = reinterpret_cast<const uint *>(sl + bytesPerLine);
+ v_top = vld2q_lane_u32(s1 + x1, v_top, 1);
+ v_bot = vld2q_lane_u32(s2 + x1, v_bot, 1);
+ x1 = (fx >> 16);
+ y1 = (fy >> 16);
+ fx += fdx; fy += fdy;
+ sl = textureData + bytesPerLine * y1;
+ s1 = reinterpret_cast<const uint *>(sl);
+ s2 = reinterpret_cast<const uint *>(sl + bytesPerLine);
+ v_top = vld2q_lane_u32(s1 + x1, v_top, 2);
+ v_bot = vld2q_lane_u32(s2 + x1, v_bot, 2);
+ x1 = (fx >> 16);
+ y1 = (fy >> 16);
+ fx += fdx; fy += fdy;
+ sl = textureData + bytesPerLine * y1;
+ s1 = reinterpret_cast<const uint *>(sl);
+ s2 = reinterpret_cast<const uint *>(sl + bytesPerLine);
+ v_top = vld2q_lane_u32(s1 + x1, v_top, 3);
+ v_bot = vld2q_lane_u32(s2 + x1, v_bot, 3);
+
+ int32x4_t v_distx = vshrq_n_s32(vaddq_s32(vandq_s32(v_fx, v_ffff_mask), v_round), 12);
+ int32x4_t v_disty = vshrq_n_s32(vaddq_s32(vandq_s32(v_fy, v_ffff_mask), v_round), 12);
+ v_distx = vorrq_s32(v_distx, vshlq_n_s32(v_distx, 16));
+ v_disty = vorrq_s32(v_disty, vshlq_n_s32(v_disty, 16));
+ int16x8_t v_disty_ = vshlq_n_s16(vreinterpretq_s16_s32(v_disty), 4);
+
+ interpolate_4_pixels_16_neon(
+ vreinterpretq_s16_u32(v_top.val[0]), vreinterpretq_s16_u32(v_top.val[1]),
+ vreinterpretq_s16_u32(v_bot.val[0]), vreinterpretq_s16_u32(v_bot.val[1]),
+ vreinterpretq_s16_s32(v_distx), vreinterpretq_s16_s32(v_disty),
+ v_disty_, colorMask, invColorMask, v_256, b);
+ b += 4;
+ v_fx = vaddq_s32(v_fx, v_fdx);
+ v_fy = vaddq_s32(v_fy, v_fdy);
+ }
+#endif
+ while (b < boundedEnd) {
+ int x = (fx >> 16);
+ int y = (fy >> 16);
- const uchar *textureData = data->texture.imageData;
- const int bytesPerLine = data->texture.bytesPerLine;
- const __m128i vbpl = _mm_shufflelo_epi16(_mm_cvtsi32_si128(bytesPerLine/4), _MM_SHUFFLE(0, 0, 0, 0));
+ const uint *s1 = (const uint *)image.scanLine(y);
+ const uint *s2 = (const uint *)image.scanLine(y + 1);
- while (b < boundedEnd) {
- const __m128i vy = _mm_packs_epi32(_mm_srli_epi32(v_fy, 16), _mm_setzero_si128());
- // 4x16bit * 4x16bit -> 4x32bit
- __m128i offset = _mm_unpacklo_epi16(_mm_mullo_epi16(vy, vbpl), _mm_mulhi_epi16(vy, vbpl));
- offset = _mm_add_epi32(offset, _mm_srli_epi32(v_fx, 16));
- const int offset0 = _mm_cvtsi128_si32(offset); offset = _mm_srli_si128(offset, 4);
- const int offset1 = _mm_cvtsi128_si32(offset); offset = _mm_srli_si128(offset, 4);
- const int offset2 = _mm_cvtsi128_si32(offset); offset = _mm_srli_si128(offset, 4);
- const int offset3 = _mm_cvtsi128_si32(offset);
- const uint *topData = (const uint *)(textureData);
- const __m128i tl = _mm_setr_epi32(topData[offset0], topData[offset1], topData[offset2], topData[offset3]);
- const __m128i tr = _mm_setr_epi32(topData[offset0 + 1], topData[offset1 + 1], topData[offset2 + 1], topData[offset3 + 1]);
- const uint *bottomData = (const uint *)(textureData + bytesPerLine);
- const __m128i bl = _mm_setr_epi32(bottomData[offset0], bottomData[offset1], bottomData[offset2], bottomData[offset3]);
- const __m128i br = _mm_setr_epi32(bottomData[offset0 + 1], bottomData[offset1 + 1], bottomData[offset2 + 1], bottomData[offset3 + 1]);
-
- __m128i v_distx = _mm_srli_epi16(v_fx, 8);
- __m128i v_disty = _mm_srli_epi16(v_fy, 8);
- v_distx = _mm_srli_epi16(_mm_add_epi32(v_distx, v_fxy_r), 4);
- v_disty = _mm_srli_epi16(_mm_add_epi32(v_disty, v_fxy_r), 4);
- v_distx = _mm_shufflehi_epi16(v_distx, _MM_SHUFFLE(2,2,0,0));
- v_distx = _mm_shufflelo_epi16(v_distx, _MM_SHUFFLE(2,2,0,0));
- v_disty = _mm_shufflehi_epi16(v_disty, _MM_SHUFFLE(2,2,0,0));
- v_disty = _mm_shufflelo_epi16(v_disty, _MM_SHUFFLE(2,2,0,0));
-
- interpolate_4_pixels_16_sse2(tl, tr, bl, br, v_distx, v_disty, colorMask, v_256, b);
- b += 4;
- v_fx = _mm_add_epi32(v_fx, v_fdx);
- v_fy = _mm_add_epi32(v_fy, v_fdy);
- }
- fx = _mm_cvtsi128_si32(v_fx);
- fy = _mm_cvtsi128_si32(v_fy);
-#elif defined(__ARM_NEON__)
- BILINEAR_ROTATE_BOUNDS_PROLOG
+#if defined(__SSE2__) || defined(__ARM_NEON__)
+ int distx = (fx & 0x0000ffff) >> 8;
+ int disty = (fy & 0x0000ffff) >> 8;
+ *b = interpolate_4_pixels(s1 + x, s2 + x, distx, disty);
+#else
+ uint tl = s1[x];
+ uint tr = s1[x + 1];
+ uint bl = s2[x];
+ uint br = s2[x + 1];
+ int distx = ((fx & 0x0000ffff) + 0x0800) >> 12;
+ int disty = ((fy & 0x0000ffff) + 0x0800) >> 12;
+ *b = interpolate_4_pixels_16(tl, tr, bl, br, distx, disty);
+#endif
- const int16x8_t colorMask = vdupq_n_s16(0x00ff);
- const int16x8_t invColorMask = vmvnq_s16(colorMask);
- const int16x8_t v_256 = vdupq_n_s16(256);
- int32x4_t v_fdx = vdupq_n_s32(fdx * 4);
- int32x4_t v_fdy = vdupq_n_s32(fdy * 4);
+ fx += fdx;
+ fy += fdy;
+ ++b;
+ }
+ }
- const uchar *textureData = data->texture.imageData;
- const int bytesPerLine = data->texture.bytesPerLine;
+ while (b < end) {
+ int x1 = (fx >> 16);
+ int x2;
+ int y1 = (fy >> 16);
+ int y2;
- int32x4_t v_fx = vmovq_n_s32(fx);
- int32x4_t v_fy = vmovq_n_s32(fy);
- v_fx = vsetq_lane_s32(fx + fdx, v_fx, 1);
- v_fy = vsetq_lane_s32(fy + fdy, v_fy, 1);
- v_fx = vsetq_lane_s32(fx + fdx * 2, v_fx, 2);
- v_fy = vsetq_lane_s32(fy + fdy * 2, v_fy, 2);
- v_fx = vsetq_lane_s32(fx + fdx * 3, v_fx, 3);
- v_fy = vsetq_lane_s32(fy + fdy * 3, v_fy, 3);
+ fetchTransformedBilinear_pixelBounds<blendType>(image.width, image.x1, image.x2 - 1, x1, x2);
+ fetchTransformedBilinear_pixelBounds<blendType>(image.height, image.y1, image.y2 - 1, y1, y2);
- const int32x4_t v_ffff_mask = vdupq_n_s32(0x0000ffff);
- const int32x4_t v_round = vdupq_n_s32(0x0800);
+ const uint *s1 = (const uint *)image.scanLine(y1);
+ const uint *s2 = (const uint *)image.scanLine(y2);
- while (b < boundedEnd) {
- uint32x4x2_t v_top, v_bot;
+ uint tl = s1[x1];
+ uint tr = s1[x2];
+ uint bl = s2[x1];
+ uint br = s2[x2];
- int x1 = (fx >> 16);
- int y1 = (fy >> 16);
- fx += fdx; fy += fdy;
- const uchar *sl = textureData + bytesPerLine * y1;
- const uint *s1 = reinterpret_cast<const uint *>(sl);
- const uint *s2 = reinterpret_cast<const uint *>(sl + bytesPerLine);
- v_top = vld2q_lane_u32(s1 + x1, v_top, 0);
- v_bot = vld2q_lane_u32(s2 + x1, v_bot, 0);
- x1 = (fx >> 16);
- y1 = (fy >> 16);
- fx += fdx; fy += fdy;
- sl = textureData + bytesPerLine * y1;
- s1 = reinterpret_cast<const uint *>(sl);
- s2 = reinterpret_cast<const uint *>(sl + bytesPerLine);
- v_top = vld2q_lane_u32(s1 + x1, v_top, 1);
- v_bot = vld2q_lane_u32(s2 + x1, v_bot, 1);
- x1 = (fx >> 16);
- y1 = (fy >> 16);
- fx += fdx; fy += fdy;
- sl = textureData + bytesPerLine * y1;
- s1 = reinterpret_cast<const uint *>(sl);
- s2 = reinterpret_cast<const uint *>(sl + bytesPerLine);
- v_top = vld2q_lane_u32(s1 + x1, v_top, 2);
- v_bot = vld2q_lane_u32(s2 + x1, v_bot, 2);
- x1 = (fx >> 16);
- y1 = (fy >> 16);
- fx += fdx; fy += fdy;
- sl = textureData + bytesPerLine * y1;
- s1 = reinterpret_cast<const uint *>(sl);
- s2 = reinterpret_cast<const uint *>(sl + bytesPerLine);
- v_top = vld2q_lane_u32(s1 + x1, v_top, 3);
- v_bot = vld2q_lane_u32(s2 + x1, v_bot, 3);
-
- int32x4_t v_distx = vshrq_n_s32(vaddq_s32(vandq_s32(v_fx, v_ffff_mask), v_round), 12);
- int32x4_t v_disty = vshrq_n_s32(vaddq_s32(vandq_s32(v_fy, v_ffff_mask), v_round), 12);
- v_distx = vorrq_s32(v_distx, vshlq_n_s32(v_distx, 16));
- v_disty = vorrq_s32(v_disty, vshlq_n_s32(v_disty, 16));
- int16x8_t v_disty_ = vshlq_n_s16(vreinterpretq_s16_s32(v_disty), 4);
-
- interpolate_4_pixels_16_neon(
- vreinterpretq_s16_u32(v_top.val[0]), vreinterpretq_s16_u32(v_top.val[1]),
- vreinterpretq_s16_u32(v_bot.val[0]), vreinterpretq_s16_u32(v_bot.val[1]),
- vreinterpretq_s16_s32(v_distx), vreinterpretq_s16_s32(v_disty),
- v_disty_, colorMask, invColorMask, v_256, b);
- b += 4;
- v_fx = vaddq_s32(v_fx, v_fdx);
- v_fy = vaddq_s32(v_fy, v_fdy);
- }
+#if defined(__SSE2__) || defined(__ARM_NEON__)
+ // The optimized interpolate_4_pixels are faster than interpolate_4_pixels_16.
+ int distx = (fx & 0x0000ffff) >> 8;
+ int disty = (fy & 0x0000ffff) >> 8;
+ *b = interpolate_4_pixels(tl, tr, bl, br, distx, disty);
+#else
+ int distx = ((fx & 0x0000ffff) + 0x0800) >> 12;
+ int disty = ((fy & 0x0000ffff) + 0x0800) >> 12;
+ *b = interpolate_4_pixels_16(tl, tr, bl, br, distx, disty);
#endif
- }
- while (b < end) {
- int x1 = (fx >> 16);
- int x2;
- int y1 = (fy >> 16);
- int y2;
+ fx += fdx;
+ fy += fdy;
+ ++b;
+ }
+}
- fetchTransformedBilinear_pixelBounds<blendType>(image_width, image_x1, image_x2, x1, x2);
- fetchTransformedBilinear_pixelBounds<blendType>(image_height, image_y1, image_y2, y1, y2);
- const uint *s1 = (const uint *)data->texture.scanLine(y1);
- const uint *s2 = (const uint *)data->texture.scanLine(y2);
+static BilinearFastTransformHelper bilinearFastTransformHelperARGB32PM[2][NFastTransformTypes] = {
+ {
+ fetchTransformedBilinearARGB32PM_simple_upscale_helper<BlendTransformedBilinear>,
+ fetchTransformedBilinearARGB32PM_upscale_helper<BlendTransformedBilinear>,
+ fetchTransformedBilinearARGB32PM_downscale_helper<BlendTransformedBilinear>,
+ fetchTransformedBilinearARGB32PM_rotate_helper<BlendTransformedBilinear>,
+ fetchTransformedBilinearARGB32PM_fast_rotate_helper<BlendTransformedBilinear>
+ },
+ {
+ fetchTransformedBilinearARGB32PM_simple_upscale_helper<BlendTransformedBilinearTiled>,
+ fetchTransformedBilinearARGB32PM_upscale_helper<BlendTransformedBilinearTiled>,
+ fetchTransformedBilinearARGB32PM_downscale_helper<BlendTransformedBilinearTiled>,
+ fetchTransformedBilinearARGB32PM_rotate_helper<BlendTransformedBilinearTiled>,
+ fetchTransformedBilinearARGB32PM_fast_rotate_helper<BlendTransformedBilinearTiled>
+ }
+};
+
+template<TextureBlendType blendType> /* blendType = BlendTransformedBilinear or BlendTransformedBilinearTiled */
+static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, const Operator *,
+ const QSpanData *data, int y, int x,
+ int length)
+{
+ const qreal cx = x + qreal(0.5);
+ const qreal cy = y + qreal(0.5);
+ Q_CONSTEXPR int tiled = (blendType == BlendTransformedBilinearTiled) ? 1 : 0;
- uint tl = s1[x1];
- uint tr = s1[x2];
- uint bl = s2[x1];
- uint br = s2[x2];
+ uint *end = buffer + length;
+ uint *b = buffer;
+ if (data->fast_matrix) {
+ // The increment per x in the scanline
+ int fdx = (int)(data->m11 * fixed_scale);
+ int fdy = (int)(data->m12 * fixed_scale);
-#if defined(__SSE2__) || defined(__ARM_NEON__)
- // The optimized interpolate_4_pixels are faster than interpolate_4_pixels_16.
- int distx = (fx & 0x0000ffff) >> 8;
- int disty = (fy & 0x0000ffff) >> 8;
- *b = interpolate_4_pixels(tl, tr, bl, br, distx, disty);
-#else
- int distx = ((fx & 0x0000ffff) + 0x0800) >> 12;
- int disty = ((fy & 0x0000ffff) + 0x0800) >> 12;
- *b = interpolate_4_pixels_16(tl, tr, bl, br, distx, disty);
-#endif
+ int fx = int((data->m21 * cy
+ + data->m11 * cx + data->dx) * fixed_scale);
+ int fy = int((data->m22 * cy
+ + data->m12 * cx + data->dy) * fixed_scale);
- fx += fdx;
- fy += fdy;
- ++b;
- }
+ fx -= half_point;
+ fy -= half_point;
+
+ if (fdy == 0) { // simple scale, no rotation or shear
+ if (fdx <= fixed_scale && fdx > 0) {
+ // simple scale up on X without mirroring
+ bilinearFastTransformHelperARGB32PM[tiled][SimpleUpscaleTransform](b, end, data->texture, fx, fy, fdx, fdy);
+ } else if ((fdx < 0 && fdx > -(fixed_scale / 8)) || qAbs(data->m22) < qreal(1./8.)) {
+ // scale up more than 8x (on either Y or on X mirrored)
+ bilinearFastTransformHelperARGB32PM[tiled][UpscaleTransform](b, end, data->texture, fx, fy, fdx, fdy);
+ } else {
+ // scale down on X (or up on X mirrored less than 8x)
+ bilinearFastTransformHelperARGB32PM[tiled][DownscaleTransform](b, end, data->texture, fx, fy, fdx, fdy);
+ }
+ } else { // rotation or shear
+ if (qAbs(data->m11) < qreal(1./8.) || qAbs(data->m22) < qreal(1./8.) ) {
+ // if we are zooming more than 8 times, we use 8bit precision for the position.
+ bilinearFastTransformHelperARGB32PM[tiled][RotateTransform](b, end, data->texture, fx, fy, fdx, fdy);
+ } else {
+ // we are zooming less than 8x, use 4bit precision
+ bilinearFastTransformHelperARGB32PM[tiled][FastRotateTransform](b, end, data->texture, fx, fy, fdx, fdy);
}
}
} else {
+ const QTextureData &image = data->texture;
+
const qreal fdx = data->m11;
const qreal fdy = data->m12;
const qreal fdw = data->m13;
@@ -2496,8 +2629,8 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c
int distx = int((px - x1) * 256);
int disty = int((py - y1) * 256);
- fetchTransformedBilinear_pixelBounds<blendType>(image_width, image_x1, image_x2, x1, x2);
- fetchTransformedBilinear_pixelBounds<blendType>(image_height, image_y1, image_y2, y1, y2);
+ fetchTransformedBilinear_pixelBounds<blendType>(image.width, image.x1, image.x2 - 1, x1, x2);
+ fetchTransformedBilinear_pixelBounds<blendType>(image.height, image.y1, image.y2 - 1, y1, y2);
const uint *s1 = (const uint *)data->texture.scanLine(y1);
const uint *s2 = (const uint *)data->texture.scanLine(y2);
@@ -2679,7 +2812,7 @@ static const uint *QT_FASTCALL fetchTransformedBilinear(uint *buffer, const Oper
layout->convertToARGB32PM(buf1, buf1, len * 2, clut, 0);
layout->convertToARGB32PM(buf2, buf2, len * 2, clut, 0);
- if ((fdx < 0 && fdx > -(fixed_scale / 8)) || std::abs(data->m22) < (1./8.)) { // scale up more than 8x
+ if ((fdx < 0 && fdx > -(fixed_scale / 8)) || qAbs(data->m22) < qreal(1./8.)) { // scale up more than 8x
int disty = (fy & 0x0000ffff) >> 8;
for (int i = 0; i < len; ++i) {
int distx = (fracX & 0x0000ffff) >> 8;
@@ -2731,7 +2864,7 @@ static const uint *QT_FASTCALL fetchTransformedBilinear(uint *buffer, const Oper
layout->convertToARGB32PM(buf1, buf1, len * 2, clut, 0);
layout->convertToARGB32PM(buf2, buf2, len * 2, clut, 0);
- if (std::abs(data->m11) < (1./8.) || std::abs(data->m22) < (1./8.)) {
+ if (qAbs(data->m11) < qreal(1./8.) || qAbs(data->m22) < qreal(1./8.) ) {
//if we are zooming more than 8 times, we use 8bit precision for the position.
for (int i = 0; i < len; ++i) {
int distx = (fracX & 0x0000ffff) >> 8;
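
Every helper in the patch bottoms out in the same scalar blend: the four neighbouring premultiplied ARGB32 pixels are weighted by the 8-bit fractions distx and disty taken from the 16.16 fixed-point coordinates, as in `(fx & 0x0000ffff) >> 8` above. Below is a self-contained sketch of that INTERPOLATE_PIXEL_256-style arithmetic, which splits a pixel into the channel pairs 0x00RR00BB and 0x00AA00GG so all four channels are blended with two multiplies; the function names here are illustrative, not the Qt macros.

// Sketch of the scalar bilinear blend used throughout the diff.
#include <cstdint>
#include <cstdio>

// Blend two premultiplied ARGB32 pixels with weights a + b == 256,
// handling red/blue and alpha/green as two 16-bit-spaced pairs.
static inline uint32_t blend256(uint32_t x, unsigned a, uint32_t y, unsigned b)
{
    uint32_t rb = (((x & 0xff00ff) * a + (y & 0xff00ff) * b) >> 8) & 0xff00ff;
    uint32_t ag = (((x >> 8) & 0xff00ff) * a + ((y >> 8) & 0xff00ff) * b) & 0xff00ff00;
    return rb | ag;
}

// Bilinear: blend top and bottom rows along x, then the results along y.
static inline uint32_t bilinear4(uint32_t tl, uint32_t tr, uint32_t bl, uint32_t br,
                                 unsigned distx, unsigned disty)
{
    uint32_t top = blend256(tl, 256 - distx, tr, distx);
    uint32_t bot = blend256(bl, 256 - distx, br, distx);
    return blend256(top, 256 - disty, bot, disty);
}

int main()
{
    // Halfway (distx = 128) between opaque red and opaque blue:
    uint32_t c = bilinear4(0xffff0000, 0xff0000ff, 0xffff0000, 0xff0000ff, 128, 0);
    std::printf("0x%08x\n", (unsigned) c); // prints 0xff7f007f
}

The intermediate-buffer path in the simple-upscale helper is the same arithmetic split in two passes: the y blend runs once per texel column into the 0x00RR00BB / 0x00AA00GG planes, and only the x blend runs per output pixel.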