From 923d869b0febe62dab2baf67fc8f9da5327ff2e0 Mon Sep 17 00:00:00 2001 From: Erik Verbruggen Date: Thu, 3 Dec 2015 15:10:47 +0100 Subject: Aarch64: vectorize ascii de-/encoding. This works only on Aarch64, because the vaddv instruction is only available on 64bit ARM. Doing something equivalent on 32bit ARM has the high chance to run into micro-architecture differences: on an Cortex-a8, transferring a single vector element from NEON to the regular CPU registers takes 20 cycles(!). Change-Id: Iccbfe84da82abb9b10f3f3dc35c8b950df69e251 Reviewed-by: Allan Sandfeld Jensen --- src/corelib/codecs/qutfcodec.cpp | 73 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 72 insertions(+), 1 deletion(-) (limited to 'src/corelib/codecs') diff --git a/src/corelib/codecs/qutfcodec.cpp b/src/corelib/codecs/qutfcodec.cpp index b4bc1583e6..74a716db4a 100644 --- a/src/corelib/codecs/qutfcodec.cpp +++ b/src/corelib/codecs/qutfcodec.cpp @@ -52,7 +52,8 @@ enum { Endian = 0, Data = 1 }; static const uchar utf8bom[] = { 0xef, 0xbb, 0xbf }; -#if defined(__SSE2__) && defined(QT_COMPILER_SUPPORTS_SSE2) +#if (defined(__SSE2__) && defined(QT_COMPILER_SUPPORTS_SSE2)) \ + || (defined(__ARM_NEON__) && defined(Q_PROCESSOR_ARM_64)) static Q_ALWAYS_INLINE uint qBitScanReverse(unsigned v) Q_DECL_NOTHROW { uint result = qCountLeadingZeroBits(v); @@ -62,7 +63,9 @@ static Q_ALWAYS_INLINE uint qBitScanReverse(unsigned v) Q_DECL_NOTHROW result ^= sizeof(unsigned) * 8 - 1; return result; } +#endif +#if defined(__SSE2__) && defined(QT_COMPILER_SUPPORTS_SSE2) static inline bool simdEncodeAscii(uchar *&dst, const ushort *&nextAscii, const ushort *&src, const ushort *end) { // do sixteen characters at a time @@ -149,6 +152,74 @@ static inline bool simdDecodeAscii(ushort *&dst, const uchar *&nextAscii, const } return src == end; } +#elif defined(__ARM_NEON__) && defined(Q_PROCESSOR_ARM_64) // vaddv is only available on Aarch64 +static inline bool simdEncodeAscii(uchar *&dst, const ushort *&nextAscii, const ushort *&src, const ushort *end) +{ + uint16x8_t maxAscii = vdupq_n_u16(0x7f); + uint16x8_t mask1 = { 1, 1 << 2, 1 << 4, 1 << 6, 1 << 8, 1 << 10, 1 << 12, 1 << 14 }; + uint16x8_t mask2 = vshlq_n_u16(mask1, 1); + + // do sixteen characters at a time + for ( ; end - src >= 16; src += 16, dst += 16) { + // load 2 lanes (or: "load interleaved") + uint16x8x2_t in = vld2q_u16(src); + + // check if any of the elements > 0x7f, select 1 bit per element (element 0 -> bit 0, element 1 -> bit 1, etc), + // add those together into a scalar, and merge the scalars. + uint16_t nonAscii = vaddvq_u16(vandq_u16(vcgtq_u16(in.val[0], maxAscii), mask1)) + | vaddvq_u16(vandq_u16(vcgtq_u16(in.val[1], maxAscii), mask2)); + + // merge the two lanes by shifting the values of the second by 8 and inserting them + uint16x8_t out = vsliq_n_u16(in.val[0], in.val[1], 8); + + // store, even if there are non-ASCII characters here + vst1q_u8(dst, vreinterpretq_u8_u16(out)); + + if (nonAscii) { + // find the next probable ASCII character + // we don't want to load 32 bytes again in this loop if we know there are non-ASCII + // characters still coming + nextAscii = src + qBitScanReverse(nonAscii) + 1; + + nonAscii = qCountTrailingZeroBits(nonAscii); + dst += nonAscii; + src += nonAscii; + return false; + } + } + return src == end; +} + +static inline bool simdDecodeAscii(ushort *&dst, const uchar *&nextAscii, const uchar *&src, const uchar *end) +{ + // do eight characters at a time + uint8x8_t msb_mask = vdup_n_u8(0x80); + uint8x8_t add_mask = { 1, 1 << 1, 1 << 2, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7 }; + for ( ; end - src >= 8; src += 8, dst += 8) { + uint8x8_t c = vld1_u8(src); + uint8_t n = vaddv_u8(vand_u8(vcge_u8(c, msb_mask), add_mask)); + if (!n) { + // store + vst1q_u16(dst, vmovl_u8(c)); + continue; + } + + // copy the front part that is still ASCII + while (!(n & 1)) { + *dst++ = *src++; + n >>= 1; + } + + // find the next probable ASCII character + // we don't want to load 16 bytes again in this loop if we know there are non-ASCII + // characters still coming + n = qBitScanReverse(n); + nextAscii = src + n + 1; + return false; + + } + return src == end; +} #else static inline bool simdEncodeAscii(uchar *, const ushort *, const ushort *, const ushort *) { -- cgit v1.2.3