diff options
author | Allan Sandfeld Jensen <allan.jensen@qt.io> | 2021-05-04 13:03:50 +0200 |
---|---|---|
committer | Allan Sandfeld Jensen <allan.jensen@qt.io> | 2021-05-27 00:18:04 +0200 |
commit | da30e402f38a434f856fa8670a8813c3cffe6440 (patch) | |
tree | 80a50d2a5e232fdde6c20f8181558e00d3533aff | |
parent | f3e7073f938ead15d8651623a23fe402186cb709 (diff) |
Add SIMD optimizations for color-transform writes
Add NEON for RGB32 and RGBA64 writeback, and SSE2 for
RGBA64 writeback.
Change-Id: Id9ee803267a78f5bdff5beaa719e7a59c1dbb9fb
Reviewed-by: Qt CI Bot <qt_ci_bot@qt-project.org>
Reviewed-by: Thiago Macieira <thiago.macieira@intel.com>
Reviewed-by: Lars Knoll <lars.knoll@qt.io>
-rw-r--r-- | src/gui/painting/qcolortransform.cpp | 238 | ||||
-rw-r--r-- | tests/auto/gui/painting/qcolorspace/tst_qcolorspace.cpp | 72 |
2 files changed, 262 insertions, 48 deletions
diff --git a/src/gui/painting/qcolortransform.cpp b/src/gui/painting/qcolortransform.cpp index 116a6c0ec7..ff8261989e 100644 --- a/src/gui/painting/qcolortransform.cpp +++ b/src/gui/painting/qcolortransform.cpp @@ -299,6 +299,7 @@ static void applyMatrix(QColorVector *buffer, const qsizetype len, const QColorM #endif } +#if defined(__SSE2__) || defined(__ARM_NEON__) template<typename T> static constexpr inline bool isArgb(); template<> @@ -307,6 +308,16 @@ template<> constexpr inline bool isArgb<QRgba64>() { return false; } template<typename T> +static inline int getAlpha(const T &p); +template<> +inline int getAlpha<QRgb>(const QRgb &p) +{ return qAlpha(p); } +template<> +inline int getAlpha<QRgba64>(const QRgba64 &p) +{ return p.alpha(); } +#endif + +template<typename T> static void loadPremultiplied(QColorVector *buffer, const T *src, const qsizetype len, const QColorTransformPrivate *d_ptr); template<typename T> static void loadUnpremultiplied(QColorVector *buffer, const T *src, const qsizetype len, const QColorTransformPrivate *d_ptr); @@ -466,7 +477,7 @@ static void loadPremultiplied(QColorVector *buffer, const T *src, const qsizetyp vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vAlphaMask)); // LUT - v = vcvtq_u32_f32(vmulq_n_f32(vf, 4080.f)); + v = vcvtq_u32_f32(vaddq_f32(vmulq_n_f32(vf, 4080.f), vdupq_n_f32(0.5f))); const int ridx = isARGB ? vgetq_lane_u32(v, 2) : vgetq_lane_u32(v, 0); const int gidx = vgetq_lane_u32(v, 1); const int bidx = isARGB ? vgetq_lane_u32(v, 0) : vgetq_lane_u32(v, 2); @@ -581,82 +592,93 @@ void loadUnpremultiplied<QRgba64>(QColorVector *buffer, const QRgba64 *src, cons } #endif -static void storePremultiplied(QRgb *dst, const QRgb *src, const QColorVector *buffer, const qsizetype len, +#if defined(__SSE2__) +template<typename T> +static inline void storeP(T &p, const __m128i &v); +template<> +inline void storeP<QRgb>(QRgb &p, const __m128i &v) +{ + p = _mm_cvtsi128_si32(_mm_packus_epi16(v, v)); +} +template<> +inline void storeP<QRgba64>(QRgba64 &p, const __m128i &v) +{ + _mm_storel_epi64((__m128i *)&p, v); +} + +template<typename T> +static void storePremultiplied(T *dst, const T *src, const QColorVector *buffer, const qsizetype len, const QColorTransformPrivate *d_ptr) { -#if defined(__SSE2__) const __m128 v4080 = _mm_set1_ps(4080.f); const __m128 iFF00 = _mm_set1_ps(1.0f / (255 * 256)); + constexpr bool isARGB = isArgb<T>(); for (qsizetype i = 0; i < len; ++i) { - const int a = qAlpha(src[i]); + const int a = getAlpha<T>(src[i]); __m128 vf = _mm_loadu_ps(&buffer[i].x); __m128i v = _mm_cvtps_epi32(_mm_mul_ps(vf, v4080)); - __m128 va = _mm_set1_ps(a); - va = _mm_mul_ps(va, iFF00); + __m128 va = _mm_mul_ps(_mm_set1_ps(a), iFF00); const int ridx = _mm_extract_epi16(v, 0); const int gidx = _mm_extract_epi16(v, 2); const int bidx = _mm_extract_epi16(v, 4); - v = _mm_insert_epi16(v, d_ptr->colorSpaceOut->lut[0]->m_fromLinear[ridx], 4); + v = _mm_insert_epi16(v, d_ptr->colorSpaceOut->lut[0]->m_fromLinear[ridx], isARGB ? 4 : 0); v = _mm_insert_epi16(v, d_ptr->colorSpaceOut->lut[1]->m_fromLinear[gidx], 2); - v = _mm_insert_epi16(v, d_ptr->colorSpaceOut->lut[2]->m_fromLinear[bidx], 0); + v = _mm_insert_epi16(v, d_ptr->colorSpaceOut->lut[2]->m_fromLinear[bidx], isARGB ? 0 : 4); vf = _mm_cvtepi32_ps(v); vf = _mm_mul_ps(vf, va); v = _mm_cvtps_epi32(vf); v = _mm_packs_epi32(v, v); v = _mm_insert_epi16(v, a, 3); - v = _mm_packus_epi16(v, v); - dst[i] = _mm_cvtsi128_si32(v); - } -#else - for (qsizetype i = 0; i < len; ++i) { - const int a = qAlpha(src[i]); - const float fa = a / (255.0f * 256.0f); - const float r = d_ptr->colorSpaceOut->lut[0]->m_fromLinear[int(buffer[i].x * 4080.0f + 0.5f)]; - const float g = d_ptr->colorSpaceOut->lut[1]->m_fromLinear[int(buffer[i].y * 4080.0f + 0.5f)]; - const float b = d_ptr->colorSpaceOut->lut[2]->m_fromLinear[int(buffer[i].z * 4080.0f + 0.5f)]; - dst[i] = qRgba(r * fa + 0.5f, g * fa + 0.5f, b * fa + 0.5f, a); + storeP<T>(dst[i], v); } -#endif } -static void storeUnpremultiplied(QRgb *dst, const QRgb *src, const QColorVector *buffer, const qsizetype len, +template<typename T> +static inline void storePU(T &p, __m128i &v, int a); +template<> +inline void storePU<QRgb>(QRgb &p, __m128i &v, int a) +{ + v = _mm_add_epi16(v, _mm_set1_epi16(0x80)); + v = _mm_srli_epi16(v, 8); + v = _mm_insert_epi16(v, a, 3); + p = _mm_cvtsi128_si32(_mm_packus_epi16(v, v)); +} +template<> +inline void storePU<QRgba64>(QRgba64 &p, __m128i &v, int a) +{ + v = _mm_add_epi16(v, _mm_srli_epi16(v, 8)); + v = _mm_insert_epi16(v, a, 3); + _mm_storel_epi64((__m128i *)&p, v); +} + +template<typename T> +static void storeUnpremultiplied(T *dst, const T *src, const QColorVector *buffer, const qsizetype len, const QColorTransformPrivate *d_ptr) { -#if defined(__SSE2__) const __m128 v4080 = _mm_set1_ps(4080.f); + constexpr bool isARGB = isArgb<T>(); for (qsizetype i = 0; i < len; ++i) { - const int a = qAlpha(src[i]); + const int a = getAlpha<T>(src[i]); __m128 vf = _mm_loadu_ps(&buffer[i].x); __m128i v = _mm_cvtps_epi32(_mm_mul_ps(vf, v4080)); const int ridx = _mm_extract_epi16(v, 0); const int gidx = _mm_extract_epi16(v, 2); const int bidx = _mm_extract_epi16(v, 4); v = _mm_setzero_si128(); - v = _mm_insert_epi16(v, d_ptr->colorSpaceOut->lut[0]->m_fromLinear[ridx], 2); + v = _mm_insert_epi16(v, d_ptr->colorSpaceOut->lut[0]->m_fromLinear[ridx], isARGB ? 2 : 0); v = _mm_insert_epi16(v, d_ptr->colorSpaceOut->lut[1]->m_fromLinear[gidx], 1); - v = _mm_insert_epi16(v, d_ptr->colorSpaceOut->lut[2]->m_fromLinear[bidx], 0); - v = _mm_add_epi16(v, _mm_set1_epi16(0x80)); - v = _mm_srli_epi16(v, 8); - v = _mm_insert_epi16(v, a, 3); - v = _mm_packus_epi16(v, v); - dst[i] = _mm_cvtsi128_si32(v); - } -#else - for (qsizetype i = 0; i < len; ++i) { - const int r = d_ptr->colorSpaceOut->lut[0]->u8FromLinearF32(buffer[i].x); - const int g = d_ptr->colorSpaceOut->lut[1]->u8FromLinearF32(buffer[i].y); - const int b = d_ptr->colorSpaceOut->lut[2]->u8FromLinearF32(buffer[i].z); - dst[i] = (src[i] & 0xff000000) | (r << 16) | (g << 8) | (b << 0); + v = _mm_insert_epi16(v, d_ptr->colorSpaceOut->lut[2]->m_fromLinear[bidx], isARGB ? 0 : 2); + storePU<T>(dst[i], v, a); } -#endif } -static void storeOpaque(QRgb *dst, const QRgb *src, const QColorVector *buffer, const qsizetype len, +template<typename T> +static void storeOpaque(T *dst, const T *src, const QColorVector *buffer, const qsizetype len, const QColorTransformPrivate *d_ptr) { Q_UNUSED(src); -#if defined(__SSE2__) const __m128 v4080 = _mm_set1_ps(4080.f); + constexpr bool isARGB = isArgb<T>(); for (qsizetype i = 0; i < len; ++i) { __m128 vf = _mm_loadu_ps(&buffer[i].x); __m128i v = _mm_cvtps_epi32(_mm_mul_ps(vf, v4080)); @@ -664,23 +686,142 @@ static void storeOpaque(QRgb *dst, const QRgb *src, const QColorVector *buffer, const int gidx = _mm_extract_epi16(v, 2); const int bidx = _mm_extract_epi16(v, 4); v = _mm_setzero_si128(); - v = _mm_insert_epi16(v, d_ptr->colorSpaceOut->lut[0]->m_fromLinear[ridx], 2); + v = _mm_insert_epi16(v, d_ptr->colorSpaceOut->lut[0]->m_fromLinear[ridx], isARGB ? 2 : 0); v = _mm_insert_epi16(v, d_ptr->colorSpaceOut->lut[1]->m_fromLinear[gidx], 1); - v = _mm_insert_epi16(v, d_ptr->colorSpaceOut->lut[2]->m_fromLinear[bidx], 0); - v = _mm_add_epi16(v, _mm_set1_epi16(0x80)); - v = _mm_srli_epi16(v, 8); - v = _mm_insert_epi16(v, 255, 3); - v = _mm_packus_epi16(v, v); - dst[i] = _mm_cvtsi128_si32(v); + v = _mm_insert_epi16(v, d_ptr->colorSpaceOut->lut[2]->m_fromLinear[bidx], isARGB ? 0 : 2); + storePU<T>(dst[i], v, isARGB ? 255 : 0xffff); } +} +#elif defined(__ARM_NEON__) +template<typename T> +static inline void storeP(T &p, const uint16x4_t &v); +template<> +inline void storeP<QRgb>(QRgb &p, const uint16x4_t &v) +{ + p = vget_lane_u32(vreinterpret_u32_u8(vmovn_u16(vcombine_u16(v, v))), 0); +} +template<> +inline void storeP<QRgba64>(QRgba64 &p, const uint16x4_t &v) +{ + vst1_u16((uint16_t *)&p, v); +} + +template<typename T> +static void storePremultiplied(T *dst, const T *src, const QColorVector *buffer, const qsizetype len, + const QColorTransformPrivate *d_ptr) +{ + const float iFF00 = 1.0f / (255 * 256); + constexpr bool isARGB = isArgb<T>(); + for (qsizetype i = 0; i < len; ++i) { + const int a = getAlpha<T>(src[i]); + float32x4_t vf = vld1q_f32(&buffer[i].x); + uint32x4_t v = vcvtq_u32_f32(vaddq_f32(vmulq_n_f32(vf, 4080.f), vdupq_n_f32(0.5f))); + const int ridx = vgetq_lane_u32(v, 0); + const int gidx = vgetq_lane_u32(v, 1); + const int bidx = vgetq_lane_u32(v, 2); + v = vsetq_lane_u32(d_ptr->colorSpaceOut->lut[0]->m_fromLinear[ridx], v, isARGB ? 2 : 0); + v = vsetq_lane_u32(d_ptr->colorSpaceOut->lut[1]->m_fromLinear[gidx], v, 1); + v = vsetq_lane_u32(d_ptr->colorSpaceOut->lut[2]->m_fromLinear[bidx], v, isARGB ? 0 : 2); + vf = vcvtq_f32_u32(v); + vf = vmulq_n_f32(vf, a * iFF00); + vf = vaddq_f32(vf, vdupq_n_f32(0.5f)); + v = vcvtq_u32_f32(vf); + uint16x4_t v16 = vmovn_u32(v); + v16 = vset_lane_u16(a, v16, 3); + storeP<T>(dst[i], v16); + } +} + +template<typename T> +static inline void storePU(T &p, uint16x4_t &v, int a); +template<> +inline void storePU<QRgb>(QRgb &p, uint16x4_t &v, int a) +{ + v = vadd_u16(v, vdup_n_u16(0x80)); + v = vshr_n_u16(v, 8); + v = vset_lane_u16(a, v, 3); + p = vget_lane_u32(vreinterpret_u32_u8(vmovn_u16(vcombine_u16(v, v))), 0); +} +template<> +inline void storePU<QRgba64>(QRgba64 &p, uint16x4_t &v, int a) +{ + v = vadd_u16(v, vshr_n_u16(v, 8)); + v = vset_lane_u16(a, v, 3); + vst1_u16((uint16_t *)&p, v); +} + +template<typename T> +static void storeUnpremultiplied(T *dst, const T *src, const QColorVector *buffer, const qsizetype len, + const QColorTransformPrivate *d_ptr) +{ + constexpr bool isARGB = isArgb<T>(); + for (qsizetype i = 0; i < len; ++i) { + const int a = getAlpha<T>(src[i]); + float32x4_t vf = vld1q_f32(&buffer[i].x); + uint16x4_t v = vmovn_u32(vcvtq_u32_f32(vaddq_f32(vmulq_n_f32(vf, 4080.f), vdupq_n_f32(0.5f)))); + const int ridx = vget_lane_u16(v, 0); + const int gidx = vget_lane_u16(v, 1); + const int bidx = vget_lane_u16(v, 2); + v = vset_lane_u16(d_ptr->colorSpaceOut->lut[0]->m_fromLinear[ridx], v, isARGB ? 2 : 0); + v = vset_lane_u16(d_ptr->colorSpaceOut->lut[1]->m_fromLinear[gidx], v, 1); + v = vset_lane_u16(d_ptr->colorSpaceOut->lut[2]->m_fromLinear[bidx], v, isARGB ? 0 : 2); + storePU<T>(dst[i], v, a); + } +} + +template<typename T> +static void storeOpaque(T *dst, const T *src, const QColorVector *buffer, const qsizetype len, + const QColorTransformPrivate *d_ptr) +{ + Q_UNUSED(src); + constexpr bool isARGB = isArgb<T>(); + for (qsizetype i = 0; i < len; ++i) { + float32x4_t vf = vld1q_f32(&buffer[i].x); + uint16x4_t v = vmovn_u32(vcvtq_u32_f32(vaddq_f32(vmulq_n_f32(vf, 4080.f), vdupq_n_f32(0.5f)))); + const int ridx = vget_lane_u16(v, 0); + const int gidx = vget_lane_u16(v, 1); + const int bidx = vget_lane_u16(v, 2); + v = vset_lane_u16(d_ptr->colorSpaceOut->lut[0]->m_fromLinear[ridx], v, isARGB ? 2 : 0); + v = vset_lane_u16(d_ptr->colorSpaceOut->lut[1]->m_fromLinear[gidx], v, 1); + v = vset_lane_u16(d_ptr->colorSpaceOut->lut[2]->m_fromLinear[bidx], v, isARGB ? 0 : 2); + storePU<T>(dst[i], v, isARGB ? 255 : 0xffff); + } +} #else +static void storePremultiplied(QRgb *dst, const QRgb *src, const QColorVector *buffer, const qsizetype len, + const QColorTransformPrivate *d_ptr) +{ + for (qsizetype i = 0; i < len; ++i) { + const int a = qAlpha(src[i]); + const float fa = a / (255.0f * 256.0f); + const float r = d_ptr->colorSpaceOut->lut[0]->m_fromLinear[int(buffer[i].x * 4080.0f + 0.5f)]; + const float g = d_ptr->colorSpaceOut->lut[1]->m_fromLinear[int(buffer[i].y * 4080.0f + 0.5f)]; + const float b = d_ptr->colorSpaceOut->lut[2]->m_fromLinear[int(buffer[i].z * 4080.0f + 0.5f)]; + dst[i] = qRgba(r * fa + 0.5f, g * fa + 0.5f, b * fa + 0.5f, a); + } +} + +static void storeUnpremultiplied(QRgb *dst, const QRgb *src, const QColorVector *buffer, const qsizetype len, + const QColorTransformPrivate *d_ptr) +{ + for (qsizetype i = 0; i < len; ++i) { + const int r = d_ptr->colorSpaceOut->lut[0]->u8FromLinearF32(buffer[i].x); + const int g = d_ptr->colorSpaceOut->lut[1]->u8FromLinearF32(buffer[i].y); + const int b = d_ptr->colorSpaceOut->lut[2]->u8FromLinearF32(buffer[i].z); + dst[i] = (src[i] & 0xff000000) | (r << 16) | (g << 8) | (b << 0); + } +} + +static void storeOpaque(QRgb *dst, const QRgb *src, const QColorVector *buffer, const qsizetype len, + const QColorTransformPrivate *d_ptr) +{ + Q_UNUSED(src); for (qsizetype i = 0; i < len; ++i) { const int r = d_ptr->colorSpaceOut->lut[0]->u8FromLinearF32(buffer[i].x); const int g = d_ptr->colorSpaceOut->lut[1]->u8FromLinearF32(buffer[i].y); const int b = d_ptr->colorSpaceOut->lut[2]->u8FromLinearF32(buffer[i].z); dst[i] = 0xff000000 | (r << 16) | (g << 8) | (b << 0); } -#endif } static void storePremultiplied(QRgba64 *dst, const QRgba64 *src, const QColorVector *buffer, const qsizetype len, @@ -718,6 +859,7 @@ static void storeOpaque(QRgba64 *dst, const QRgba64 *src, const QColorVector *bu dst[i] = qRgba64(r, g, b, 0xFFFF); } } +#endif static void storeGray(quint8 *dst, const QRgb *src, const QColorVector *buffer, const qsizetype len, const QColorTransformPrivate *d_ptr) diff --git a/tests/auto/gui/painting/qcolorspace/tst_qcolorspace.cpp b/tests/auto/gui/painting/qcolorspace/tst_qcolorspace.cpp index 0c8536d385..913fe55364 100644 --- a/tests/auto/gui/painting/qcolorspace/tst_qcolorspace.cpp +++ b/tests/auto/gui/painting/qcolorspace/tst_qcolorspace.cpp @@ -61,6 +61,8 @@ private slots: void imageConversion(); void imageConversion64_data(); void imageConversion64(); + void imageConversion64PM_data(); + void imageConversion64PM(); void imageConversionOverLargerGamut_data(); void imageConversionOverLargerGamut(); @@ -353,6 +355,76 @@ void tst_QColorSpace::imageConversion64() } } +void tst_QColorSpace::imageConversion64PM_data() +{ + imageConversion64_data(); +} + +void tst_QColorSpace::imageConversion64PM() +{ + QFETCH(QColorSpace::NamedColorSpace, fromColorSpace); + QFETCH(QColorSpace::NamedColorSpace, toColorSpace); + + QImage testImage(256, 16, QImage::Format_RGBA64_Premultiplied); + + for (int j = 0; j < 16; ++j) { + int a = j * 15; + for (int i = 0; i < 256; ++i) + testImage.setPixel(i, j, qPremultiply(qRgba(i, i, i, a))); + } + + testImage.setColorSpace(fromColorSpace); + QCOMPARE(testImage.colorSpace(), QColorSpace(fromColorSpace)); + + testImage.convertToColorSpace(toColorSpace); + QCOMPARE(testImage.colorSpace(), QColorSpace(toColorSpace)); + + int lastRed = 0; + int lastGreen = 0; + int lastBlue = 0; + for (int j = 0; j < 16; ++j) { + for (int i = 0; i < 256; ++i) { + QRgb p = testImage.pixel(i, j); + QVERIFY(qRed(p) >= lastRed); + QVERIFY(qGreen(p) >= lastGreen); + QVERIFY(qBlue(p) >= lastBlue); + QCOMPARE(qAlpha(p), j * 15); + lastRed = qRed(p); + lastGreen = qGreen(p); + lastBlue = qBlue(p); + } + QVERIFY(lastRed <= j * 15); + QVERIFY(lastGreen <= j * 15); + QVERIFY(lastBlue <= j * 15); + lastRed = 0; + lastGreen = 0; + lastBlue = 0; + } + + testImage.convertToColorSpace(fromColorSpace); + QCOMPARE(testImage.colorSpace(), QColorSpace(fromColorSpace)); + for (int j = 0; j < 16; ++j) { + for (int i = 0; i < 256; ++i) { + QRgb p = testImage.pixel(i, j); + QCOMPARE(qRed(p), qGreen(p)); + QCOMPARE(qRed(p), qBlue(p)); + QCOMPARE(qAlpha(p), j * 15); + QVERIFY((lastRed - qRed(p)) <= 0); + QVERIFY((lastGreen - qGreen(p)) <= 0); + QVERIFY((lastBlue - qBlue(p)) <= 0); + lastRed = qRed(p); + lastGreen = qGreen(p); + lastBlue = qBlue(p); + } + QVERIFY(lastRed <= j * 15); + QVERIFY(lastGreen <= j * 15); + QVERIFY(lastBlue <= j * 15); + lastRed = 0; + lastGreen = 0; + lastBlue = 0; + } +} + void tst_QColorSpace::imageConversionOverLargerGamut_data() { QTest::addColumn<QColorSpace::NamedColorSpace>("fromColorSpace"); |