From 9d27aec869502c7e1689b44a528d9cbd8b1f0545 Mon Sep 17 00:00:00 2001
From: Allan Sandfeld Jensen
Date: Fri, 14 Sep 2018 15:29:59 +0200
Subject: Optimize gamma-table lookup on SSE2 and NEON

Speeds up gamma-corrected text rendering.

Change-Id: I38c12ff52f4601853c3f3524de2761a932111160
Reviewed-by: Erik Verbruggen
---
 src/gui/painting/qcolorprofile_p.h | 156 +++++++++++++++++++++++++++++--------
 1 file changed, 122 insertions(+), 34 deletions(-)

(limited to 'src/gui/painting')

diff --git a/src/gui/painting/qcolorprofile_p.h b/src/gui/painting/qcolorprofile_p.h
index ca1786ee6d..425e9abace 100644
--- a/src/gui/painting/qcolorprofile_p.h
+++ b/src/gui/painting/qcolorprofile_p.h
@@ -55,6 +55,11 @@
 #include
 #include
 
+#if defined(__SSE2__)
+#include <emmintrin.h>
+#elif defined(__ARM_NEON__) || defined(__ARM_NEON)
+#include <arm_neon.h>
+#endif
 QT_BEGIN_NAMESPACE
 
 class Q_GUI_EXPORT QColorProfile
@@ -67,82 +72,165 @@ public:
 
     QRgba64 toLinear64(QRgb rgb32) const
     {
-        ushort r = m_toLinear[qRed(rgb32) << 4];
-        ushort g = m_toLinear[qGreen(rgb32) << 4];
-        ushort b = m_toLinear[qBlue(rgb32) << 4];
+#if defined(__SSE2__)
+        __m128i v = _mm_cvtsi32_si128(rgb32);
+        v = _mm_unpacklo_epi8(v, _mm_setzero_si128());
+        const __m128i vidx = _mm_slli_epi16(v, 4);
+        const int ridx = _mm_extract_epi16(vidx, 2);
+        const int gidx = _mm_extract_epi16(vidx, 1);
+        const int bidx = _mm_extract_epi16(vidx, 0);
+        v = _mm_slli_epi16(v, 8); // a * 256
+        v = _mm_insert_epi16(v, m_toLinear[ridx], 0);
+        v = _mm_insert_epi16(v, m_toLinear[gidx], 1);
+        v = _mm_insert_epi16(v, m_toLinear[bidx], 2);
+        v = _mm_add_epi16(v, _mm_srli_epi16(v, 8));
+        QRgba64 rgba64;
+        _mm_storel_epi64(reinterpret_cast<__m128i *>(&rgba64), v);
+        return rgba64;
+#elif (defined(__ARM_NEON__) || defined(__ARM_NEON)) && Q_BYTE_ORDER == Q_LITTLE_ENDIAN
+        uint8x8_t v8 = vreinterpret_u8_u32(vmov_n_u32(rgb32));
+        uint16x4_t v16 = vget_low_u16(vmovl_u8(v8));
+        const uint16x4_t vidx = vshl_n_u16(v16, 4);
+        const int ridx = vget_lane_u16(vidx, 2);
+        const int gidx = vget_lane_u16(vidx, 1);
+        const int bidx = vget_lane_u16(vidx, 0);
+        v16 = vshl_n_u16(v16, 8); // a * 256
+        v16 = vset_lane_u16(m_toLinear[ridx], v16, 0);
+        v16 = vset_lane_u16(m_toLinear[gidx], v16, 1);
+        v16 = vset_lane_u16(m_toLinear[bidx], v16, 2);
+        v16 = vadd_u16(v16, vshr_n_u16(v16, 8));
+        return QRgba64::fromRgba64(vget_lane_u64(vreinterpret_u64_u16(v16), 0));
+#else
+        uint r = m_toLinear[qRed(rgb32) << 4];
+        uint g = m_toLinear[qGreen(rgb32) << 4];
+        uint b = m_toLinear[qBlue(rgb32) << 4];
         r = r + (r >> 8);
         g = g + (g >> 8);
         b = b + (b >> 8);
         return QRgba64::fromRgba64(r, g, b, qAlpha(rgb32) * 257);
+#endif
     }
 
     QRgb toLinear(QRgb rgb32) const
     {
-        uchar r = (m_toLinear[qRed(rgb32) << 4] + 0x80) >> 8;
-        uchar g = (m_toLinear[qGreen(rgb32) << 4] + 0x80) >> 8;
-        uchar b = (m_toLinear[qBlue(rgb32) << 4] + 0x80) >> 8;
-        return qRgba(r, g, b, qAlpha(rgb32));
+        return convertWithTable(rgb32, m_toLinear);
     }
 
     QRgba64 toLinear(QRgba64 rgb64) const
     {
-        ushort r = rgb64.red();
-        ushort g = rgb64.green();
-        ushort b = rgb64.blue();
-        r = r - (r >> 8);
-        g = g - (g >> 8);
-        b = b - (b >> 8);
-        r = m_toLinear[r >> 4];
-        g = m_toLinear[g >> 4];
-        b = m_toLinear[b >> 4];
-        r = r + (r >> 8);
-        g = g + (g >> 8);
-        b = b + (b >> 8);
-        return QRgba64::fromRgba64(r, g, b, rgb64.alpha());
+        return convertWithTable(rgb64, m_toLinear);
     }
 
     QRgb fromLinear64(QRgba64 rgb64) const
     {
-        ushort r = rgb64.red();
-        ushort g = rgb64.green();
-        ushort b = rgb64.blue();
+#if defined(__SSE2__)
+        __m128i v = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&rgb64));
+        v = _mm_sub_epi16(v, _mm_srli_epi16(v, 8));
+        const __m128i vidx = _mm_srli_epi16(v, 4);
+        const int ridx = _mm_extract_epi16(vidx, 0);
+        const int gidx = _mm_extract_epi16(vidx, 1);
+        const int bidx = _mm_extract_epi16(vidx, 2);
+        v = _mm_insert_epi16(v, m_fromLinear[ridx], 2);
+        v = _mm_insert_epi16(v, m_fromLinear[gidx], 1);
+        v = _mm_insert_epi16(v, m_fromLinear[bidx], 0);
+        v = _mm_add_epi16(v, _mm_set1_epi16(0x80));
+        v = _mm_srli_epi16(v, 8);
+        v = _mm_packus_epi16(v, v);
+        return _mm_cvtsi128_si32(v);
+#elif (defined(__ARM_NEON__) || defined(__ARM_NEON)) && Q_BYTE_ORDER == Q_LITTLE_ENDIAN
+        uint16x4_t v = vreinterpret_u16_u64(vmov_n_u64(rgb64));
+        v = vsub_u16(v, vshr_n_u16(v, 8));
+        const uint16x4_t vidx = vshr_n_u16(v, 4);
+        const int ridx = vget_lane_u16(vidx, 0);
+        const int gidx = vget_lane_u16(vidx, 1);
+        const int bidx = vget_lane_u16(vidx, 2);
+        v = vset_lane_u16(m_fromLinear[ridx], v, 2);
+        v = vset_lane_u16(m_fromLinear[gidx], v, 1);
+        v = vset_lane_u16(m_fromLinear[bidx], v, 0);
+        uint8x8_t v8 = vrshrn_n_u16(vcombine_u16(v, v), 8);
+        return vget_lane_u32(vreinterpret_u32_u8(v8), 0);
+#else
+        uint a = rgb64.alpha();
+        uint r = rgb64.red();
+        uint g = rgb64.green();
+        uint b = rgb64.blue();
+        a = a - (a >> 8);
         r = r - (r >> 8);
         g = g - (g >> 8);
         b = b - (b >> 8);
+        a = (a + 0x80) >> 8;
         r = (m_fromLinear[r >> 4] + 0x80) >> 8;
         g = (m_fromLinear[g >> 4] + 0x80) >> 8;
         b = (m_fromLinear[b >> 4] + 0x80) >> 8;
-        return qRgba(r, g, b, rgb64.alpha8());
+        return (a << 24) | (r << 16) | (g << 8) | b;
+#endif
     }
 
     QRgb fromLinear(QRgb rgb32) const
     {
-        uchar r = (m_fromLinear[qRed(rgb32) << 4] + 0x80) >> 8;
-        uchar g = (m_fromLinear[qGreen(rgb32) << 4] + 0x80) >> 8;
-        uchar b = (m_fromLinear[qBlue(rgb32) << 4] + 0x80) >> 8;
-        return qRgba(r, g, b, qAlpha(rgb32));
+        return convertWithTable(rgb32, m_fromLinear);
     }
 
     QRgba64 fromLinear(QRgba64 rgb64) const
     {
+        return convertWithTable(rgb64, m_fromLinear);
+    }
+
+private:
+    QColorProfile() { }
+
+    Q_ALWAYS_INLINE static QRgb convertWithTable(QRgb rgb32, const ushort *table)
+    {
+        const int r = (table[qRed(rgb32) << 4] + 0x80) >> 8;
+        const int g = (table[qGreen(rgb32) << 4] + 0x80) >> 8;
+        const int b = (table[qBlue(rgb32) << 4] + 0x80) >> 8;
+        return (rgb32 & 0xff000000) | (r << 16) | (g << 8) | b;
+    }
+    Q_ALWAYS_INLINE static QRgba64 convertWithTable(QRgba64 rgb64, const ushort *table)
+    {
+#if defined(__SSE2__)
+        __m128i v = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(&rgb64));
+        v = _mm_sub_epi16(v, _mm_srli_epi16(v, 8));
+        const __m128i vidx = _mm_srli_epi16(v, 4);
+        const int ridx = _mm_extract_epi16(vidx, 2);
+        const int gidx = _mm_extract_epi16(vidx, 1);
+        const int bidx = _mm_extract_epi16(vidx, 0);
+        v = _mm_insert_epi16(v, table[ridx], 2);
+        v = _mm_insert_epi16(v, table[gidx], 1);
+        v = _mm_insert_epi16(v, table[bidx], 0);
+        v = _mm_add_epi16(v, _mm_srli_epi16(v, 8));
+        QRgba64 rgba64;
+        _mm_storel_epi64(reinterpret_cast<__m128i *>(&rgba64), v);
+        return rgba64;
+#elif (defined(__ARM_NEON__) || defined(__ARM_NEON)) && Q_BYTE_ORDER == Q_LITTLE_ENDIAN
+        uint16x4_t v = vreinterpret_u16_u64(vmov_n_u64(rgb64));
+        v = vsub_u16(v, vshr_n_u16(v, 8));
+        const uint16x4_t vidx = vshr_n_u16(v, 4);
+        const int ridx = vget_lane_u16(vidx, 2);
+        const int gidx = vget_lane_u16(vidx, 1);
+        const int bidx = vget_lane_u16(vidx, 0);
+        v = vset_lane_u16(table[ridx], v, 2);
+        v = vset_lane_u16(table[gidx], v, 1);
+        v = vset_lane_u16(table[bidx], v, 0);
+        v = vadd_u16(v, vshr_n_u16(v, 8));
+        return QRgba64::fromRgba64(vget_lane_u64(vreinterpret_u64_u16(v), 0));
+#else
         ushort r = rgb64.red();
         ushort g = rgb64.green();
         ushort b = rgb64.blue();
         r = r - (r >> 8);
         g = g - (g >> 8);
         b = b - (b >> 8);
-        r = m_fromLinear[r >> 4];
-        g = m_fromLinear[g >> 4];
-        b = m_fromLinear[b >> 4];
+        r = table[r >> 4];
+        g = table[g >> 4];
+        b = table[b >> 4];
         r = r + (r >> 8);
         g = g + (g >> 8);
         b = b + (b >> 8);
         return QRgba64::fromRgba64(r, g, b, rgb64.alpha());
+#endif
     }
 
-private:
-    QColorProfile() { }
-
     // We translate to 0-65280 (255*256) instead to 0-65535 to make simple
     // shifting an accurate conversion.
     // We translate from 0-4080 (255*16) for the same speed up, and to keep
-- 
cgit v1.2.3