diff options
Diffstat (limited to 'src/gui/painting/qdrawhelper_neon.cpp')
-rw-r--r-- | src/gui/painting/qdrawhelper_neon.cpp | 181 |
1 files changed, 150 insertions, 31 deletions
diff --git a/src/gui/painting/qdrawhelper_neon.cpp b/src/gui/painting/qdrawhelper_neon.cpp index e126f4b670..629dfe2358 100644 --- a/src/gui/painting/qdrawhelper_neon.cpp +++ b/src/gui/painting/qdrawhelper_neon.cpp @@ -1081,15 +1081,28 @@ const uint * QT_FASTCALL qt_fetchUntransformed_888_neon(uint *buffer, const Oper } #if Q_BYTE_ORDER == Q_LITTLE_ENDIAN -template<bool RGBA> -static inline void convertARGBToARGB32PM_neon(uint *buffer, const uint *src, int count) +static inline uint32x4_t vrgba2argb(uint32x4_t srcVector) { - int i = 0; #if defined(Q_PROCESSOR_ARM_64) const uint8x16_t rgbaMask = { 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15}; #else const uint8x8_t rgbaMask = { 2, 1, 0, 3, 6, 5, 4, 7 }; #endif +#if defined(Q_PROCESSOR_ARM_64) + srcVector = vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(srcVector), rgbaMask)); +#else + // no vqtbl1q_u8, so use two vtbl1_u8 + const uint8x8_t low = vtbl1_u8(vreinterpret_u8_u32(vget_low_u32(srcVector)), rgbaMask); + const uint8x8_t high = vtbl1_u8(vreinterpret_u8_u32(vget_high_u32(srcVector)), rgbaMask); + srcVector = vcombine_u32(vreinterpret_u32_u8(low), vreinterpret_u32_u8(high)); +#endif + return srcVector; +} + +template<bool RGBA> +static inline void convertARGBToARGB32PM_neon(uint *buffer, const uint *src, int count) +{ + int i = 0; const uint8x8_t shuffleMask = { 3, 3, 3, 3, 7, 7, 7, 7}; const uint32x4_t blendMask = vdupq_n_u32(0xff000000); @@ -1105,16 +1118,8 @@ static inline void convertARGBToARGB32PM_neon(uint *buffer, const uint *src, int #endif if (alphaSum) { if (alphaSum != 255 * 4) { - if (RGBA) { -#if defined(Q_PROCESSOR_ARM_64) - srcVector = vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(srcVector), rgbaMask)); -#else - // no vqtbl1q_u8 - const uint8x8_t low = vtbl1_u8(vreinterpret_u8_u32(vget_low_u32(srcVector)), rgbaMask); - const uint8x8_t high = vtbl1_u8(vreinterpret_u8_u32(vget_high_u32(srcVector)), rgbaMask); - srcVector = vcombine_u32(vreinterpret_u32_u8(low), vreinterpret_u32_u8(high)); -#endif - } + if (RGBA) + srcVector = vrgba2argb(srcVector); const uint8x8_t s1 = vreinterpret_u8_u32(vget_low_u32(srcVector)); const uint8x8_t s2 = vreinterpret_u8_u32(vget_high_u32(srcVector)); const uint8x8_t alpha1 = vtbl1_u8(s1, shuffleMask); @@ -1128,19 +1133,10 @@ static inline void convertARGBToARGB32PM_neon(uint *buffer, const uint *src, int const uint32x4_t d = vbslq_u32(blendMask, srcVector, vreinterpretq_u32_u8(vcombine_u8(d1, d2))); vst1q_u32(buffer + i, d); } else { - if (RGBA) { -#if defined(Q_PROCESSOR_ARM_64) - srcVector = vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(srcVector), rgbaMask)); -#else - // no vqtbl1q_u8 - const uint8x8_t low = vtbl1_u8(vreinterpret_u8_u32(vget_low_u32(srcVector)), rgbaMask); - const uint8x8_t high = vtbl1_u8(vreinterpret_u8_u32(vget_high_u32(srcVector)), rgbaMask); - srcVector = vcombine_u32(vreinterpret_u32_u8(low), vreinterpret_u32_u8(high)); -#endif - vst1q_u32(buffer + i, srcVector); - } else if (buffer != src) { + if (RGBA) + vst1q_u32(buffer + i, vrgba2argb(srcVector)); + else if (buffer != src) vst1q_u32(buffer + i, srcVector); - } } } else { vst1q_u32(buffer + i, vdupq_n_u32(0)); @@ -1153,20 +1149,143 @@ static inline void convertARGBToARGB32PM_neon(uint *buffer, const uint *src, int } } -const uint *QT_FASTCALL convertARGB32ToARGB32PM_neon(uint *buffer, const uint *src, int count, - const QVector<QRgb> *, QDitherInfo *) +static inline float32x4_t reciprocal_mul_ps(float32x4_t a, float mul) +{ + float32x4_t ia = vrecpeq_f32(a); // estimate 1/a + ia = vmulq_f32(vrecpsq_f32(a, ia), vmulq_n_f32(ia, mul)); // estimate improvement step * mul + return ia; +} + +template<bool RGBA, bool RGBx> +static inline void convertARGBFromARGB32PM_neon(uint *buffer, const uint *src, int count) +{ + int i = 0; + const uint32x4_t alphaMask = vdupq_n_u32(0xff000000); + + for (; i < count - 3; i += 4) { + uint32x4_t srcVector = vld1q_u32(src + i); + uint32x4_t alphaVector = vshrq_n_u32(srcVector, 24); +#if defined(Q_PROCESSOR_ARM_64) + uint32_t alphaSum = vaddvq_u32(alphaVector); +#else + // no vaddvq_u32 + uint32x2_t tmp = vpadd_u32(vget_low_u32(alphaVector), vget_high_u32(alphaVector)); + uint32_t alphaSum = vget_lane_u32(vpadd_u32(tmp, tmp), 0); +#endif + if (alphaSum) { + if (alphaSum != 255 * 4) { + if (RGBA) + srcVector = vrgba2argb(srcVector); + const float32x4_t a = vcvtq_f32_u32(alphaVector); + const float32x4_t ia = reciprocal_mul_ps(a, 255.0f); + // Convert 4x(4xU8) to 4x(4xF32) + uint16x8_t tmp1 = vmovl_u8(vget_low_u8(vreinterpretq_u8_u32(srcVector))); + uint16x8_t tmp3 = vmovl_u8(vget_high_u8(vreinterpretq_u8_u32(srcVector))); + float32x4_t src1 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(tmp1))); + float32x4_t src2 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(tmp1))); + float32x4_t src3 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(tmp3))); + float32x4_t src4 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(tmp3))); + src1 = vmulq_lane_f32(src1, vget_low_f32(ia), 0); + src2 = vmulq_lane_f32(src2, vget_low_f32(ia), 1); + src3 = vmulq_lane_f32(src3, vget_high_f32(ia), 0); + src4 = vmulq_lane_f32(src4, vget_high_f32(ia), 1); + // Convert 4x(4xF32) back to 4x(4xU8) (over a 8.1 fixed point format to get rounding) + tmp1 = vcombine_u16(vrshrn_n_u32(vcvtq_n_u32_f32(src1, 1), 1), + vrshrn_n_u32(vcvtq_n_u32_f32(src2, 1), 1)); + tmp3 = vcombine_u16(vrshrn_n_u32(vcvtq_n_u32_f32(src3, 1), 1), + vrshrn_n_u32(vcvtq_n_u32_f32(src4, 1), 1)); + uint32x4_t dstVector = vreinterpretq_u32_u8(vcombine_u8(vmovn_u16(tmp1), vmovn_u16(tmp3))); + // Overwrite any undefined results from alpha==0 with zeros: +#if defined(Q_PROCESSOR_ARM_64) + uint32x4_t srcVectorAlphaMask = vceqzq_u32(alphaVector); +#else + uint32x4_t srcVectorAlphaMask = vceqq_u32(alphaVector, vdupq_n_u32(0)); +#endif + dstVector = vbicq_u32(dstVector, srcVectorAlphaMask); + // Restore or mask alpha values: + if (RGBx) + dstVector = vorrq_u32(alphaMask, dstVector); + else + dstVector = vbslq_u32(alphaMask, srcVector, dstVector); + vst1q_u32(&buffer[i], dstVector); + } else { + // 4xAlpha==255, no change except if we are doing RGBA->ARGB: + if (RGBA) + vst1q_u32(&buffer[i], vrgba2argb(srcVector)); + else if (buffer != src) + vst1q_u32(&buffer[i], srcVector); + } + } else { + // 4xAlpha==0, always zero, except if output is RGBx: + if (RGBx) + vst1q_u32(&buffer[i], alphaMask); + else + vst1q_u32(&buffer[i], vdupq_n_u32(0)); + } + } + + SIMD_EPILOGUE(i, count, 3) { + uint v = qUnpremultiply(src[i]); + if (RGBx) + v = 0xff000000 | v; + if (RGBA) + v = ARGB2RGBA(v); + buffer[i] = v; + } +} + +void QT_FASTCALL convertARGB32ToARGB32PM_neon(uint *buffer, int count, const QVector<QRgb> *) +{ + convertARGBToARGB32PM_neon<false>(buffer, buffer, count); +} + +void QT_FASTCALL convertRGBA8888ToARGB32PM_neon(uint *buffer, int count, const QVector<QRgb> *) +{ + convertARGBToARGB32PM_neon<true>(buffer, buffer, count); +} + +const uint *QT_FASTCALL fetchARGB32ToARGB32PM_neon(uint *buffer, const uchar *src, int index, int count, + const QVector<QRgb> *, QDitherInfo *) { - convertARGBToARGB32PM_neon<false>(buffer, src, count); + convertARGBToARGB32PM_neon<false>(buffer, reinterpret_cast<const uint *>(src) + index, count); return buffer; } -const uint *QT_FASTCALL convertRGBA8888ToARGB32PM_neon(uint *buffer, const uint *src, int count, - const QVector<QRgb> *, QDitherInfo *) +const uint *QT_FASTCALL fetchRGBA8888ToARGB32PM_neon(uint *buffer, const uchar *src, int index, int count, + const QVector<QRgb> *, QDitherInfo *) { - convertARGBToARGB32PM_neon<true>(buffer, src, count); + convertARGBToARGB32PM_neon<true>(buffer, reinterpret_cast<const uint *>(src) + index, count); return buffer; } +void QT_FASTCALL storeRGB32FromARGB32PM_neon(uchar *dest, const uint *src, int index, int count, + const QVector<QRgb> *, QDitherInfo *) +{ + uint *d = reinterpret_cast<uint *>(dest) + index; + convertARGBFromARGB32PM_neon<false,true>(d, src, count); +} + +void QT_FASTCALL storeARGB32FromARGB32PM_neon(uchar *dest, const uint *src, int index, int count, + const QVector<QRgb> *, QDitherInfo *) +{ + uint *d = reinterpret_cast<uint *>(dest) + index; + convertARGBFromARGB32PM_neon<false,false>(d, src, count); +} + +void QT_FASTCALL storeRGBA8888FromARGB32PM_neon(uchar *dest, const uint *src, int index, int count, + const QVector<QRgb> *, QDitherInfo *) +{ + uint *d = reinterpret_cast<uint *>(dest) + index; + convertARGBFromARGB32PM_neon<true,false>(d, src, count); +} + +void QT_FASTCALL storeRGBXFromARGB32PM_neon(uchar *dest, const uint *src, int index, int count, + const QVector<QRgb> *, QDitherInfo *) +{ + uint *d = reinterpret_cast<uint *>(dest) + index; + convertARGBFromARGB32PM_neon<true,true>(d, src, count); +} + #endif // Q_BYTE_ORDER == Q_LITTLE_ENDIAN QT_END_NAMESPACE |