From e231581f1ffce1bb6d9ee0dcba32d0fe8d771e1d Mon Sep 17 00:00:00 2001 From: Allan Sandfeld Jensen Date: Tue, 16 Aug 2016 13:37:49 +0200 Subject: Remove type-punned unions Type punning even over a union is not legal C++, and also causes the compilers to produce poorly performing code. This has already been fixed for the SSE2 code in bilinear sampling but not for the NEON code. Change-Id: Id5e184051e0bd78db730d83ef0dda56ac3206e5b Reviewed-by: Erik Verbruggen --- src/gui/painting/qdrawhelper.cpp | 63 +++++++++++++++++++++------------------- 1 file changed, 33 insertions(+), 30 deletions(-) (limited to 'src') diff --git a/src/gui/painting/qdrawhelper.cpp b/src/gui/painting/qdrawhelper.cpp index f0d0ac0283..fa4470a486 100644 --- a/src/gui/painting/qdrawhelper.cpp +++ b/src/gui/painting/qdrawhelper.cpp @@ -2243,45 +2243,48 @@ static const uint * QT_FASTCALL fetchTransformedBilinearARGB32PM(uint *buffer, c const int16x8_t v_disty_ = vshlq_n_s16(v_disty, 4); int32x4_t v_fdx = vdupq_n_s32(fdx*4); - ptrdiff_t secondLine = reinterpret_cast(s2) - reinterpret_cast(s1); - - union Vect_buffer { int32x4_t vect; quint32 i[4]; }; - Vect_buffer v_fx; - - for (int i = 0; i < 4; i++) { - v_fx.i[i] = fx; - fx += fdx; - } + int32x4_t v_fx = vmovq_n_s32(fx); + fx += fdx; + v_fx = vsetq_lane_s32(fx, v_fx, 1); + fx += fdx; + v_fx = vsetq_lane_s32(fx, v_fx, 2); + fx += fdx; + v_fx = vsetq_lane_s32(fx, v_fx, 3); + fx += fdx; const int32x4_t v_ffff_mask = vdupq_n_s32(0x0000ffff); const int32x4_t v_fx_r = vdupq_n_s32(0x0800); while (b < boundedEnd) { - - Vect_buffer tl, tr, bl, br; - - Vect_buffer v_fx_shifted; - v_fx_shifted.vect = vshrq_n_s32(v_fx.vect, 16); - - int32x4_t v_distx = vshrq_n_s32(vaddq_s32(vandq_s32(v_fx.vect, v_ffff_mask), v_fx_r), 12); - - for (int i = 0; i < 4; i++) { - int x1 = v_fx_shifted.i[i]; - const uint *addr_tl = reinterpret_cast(s1) + x1; - const uint *addr_tr = addr_tl + 1; - tl.i[i] = *addr_tl; - tr.i[i] = *addr_tr; - bl.i[i] = *(addr_tl+secondLine); - br.i[i] = *(addr_tr+secondLine); - } - + uint32x4x2_t v_top, v_bot; + + int32x4_t v_fx_shifted = vshrq_n_s32(v_fx, 16); + + int x1 = vgetq_lane_s32(v_fx_shifted, 0); + v_top = vld2q_lane_u32(s1 + x1, v_top, 0); + v_bot = vld2q_lane_u32(s2 + x1, v_bot, 0); + x1 = vgetq_lane_s32(v_fx_shifted, 1); + v_top = vld2q_lane_u32(s1 + x1, v_top, 1); + v_bot = vld2q_lane_u32(s2 + x1, v_bot, 1); + x1 = vgetq_lane_s32(v_fx_shifted, 2); + v_top = vld2q_lane_u32(s1 + x1, v_top, 2); + v_bot = vld2q_lane_u32(s2 + x1, v_bot, 2); + x1 = vgetq_lane_s32(v_fx_shifted, 3); + v_top = vld2q_lane_u32(s1 + x1, v_top, 3); + v_bot = vld2q_lane_u32(s2 + x1, v_bot, 3); + + int32x4_t v_distx = vshrq_n_s32(vaddq_s32(vandq_s32(v_fx, v_ffff_mask), v_fx_r), 12); v_distx = vorrq_s32(v_distx, vshlq_n_s32(v_distx, 16)); - interpolate_4_pixels_16_neon(vreinterpretq_s16_s32(tl.vect), vreinterpretq_s16_s32(tr.vect), vreinterpretq_s16_s32(bl.vect), vreinterpretq_s16_s32(br.vect), vreinterpretq_s16_s32(v_distx), v_disty, v_disty_, colorMask, invColorMask, v_256, b); + interpolate_4_pixels_16_neon( + vreinterpretq_s16_u32(v_top.val[0]), vreinterpretq_s16_u32(v_top.val[1]), + vreinterpretq_s16_u32(v_bot.val[0]), vreinterpretq_s16_u32(v_bot.val[1]), + vreinterpretq_s16_s32(v_distx), v_disty, v_disty_, + colorMask, invColorMask, v_256, b); b+=4; - v_fx.vect = vaddq_s32(v_fx.vect, v_fdx); + v_fx = vaddq_s32(v_fx, v_fdx); } - fx = v_fx.i[0]; + fx = vgetq_lane_s32(v_fx, 0); #endif } -- cgit v1.2.3