summaryrefslogtreecommitdiffstats
path: root/src/3rdparty/libwebp/src/dsp/enc_sse41.c
diff options
context:
space:
mode:
authorLiang Qi <liang.qi@qt.io>2016-12-13 06:35:54 +0100
committerJani Heikkinen <jani.heikkinen@qt.io>2016-12-13 11:05:49 +0000
commita4125f0c4e8988012fe2bf5b9f933ed63c3c97d0 (patch)
tree7b183f687bd0ba111ec50d406226283c980e03a5 /src/3rdparty/libwebp/src/dsp/enc_sse41.c
parent5976c46685b1335c86ce702e3af69262de97096c (diff)
Bundled libwebp updated to version 0.5.1
This commit imports libwebp 0.5.1, including AUTHORS, COPYING, ChangeLog, NEWS, PATENTS, README and src directories. In src, only includes header and source files. Upstream changes since 0.5.0 have been merged in. Also updated version in qt_attribution.json. Conflicts: src/3rdparty/libwebp.pri src/3rdparty/libwebp/qt_attribution.json src/3rdparty/libwebp/src/webp/config.h Change-Id: I7d0c15400154c3b4ee8ff37665303307c4b84f9f Reviewed-by: Oswald Buddenhagen <oswald.buddenhagen@qt.io> Reviewed-by: Eirik Aavitsland <eirik.aavitsland@qt.io>
Diffstat (limited to 'src/3rdparty/libwebp/src/dsp/enc_sse41.c')
-rw-r--r--src/3rdparty/libwebp/src/dsp/enc_sse41.c100
1 files changed, 33 insertions, 67 deletions
diff --git a/src/3rdparty/libwebp/src/dsp/enc_sse41.c b/src/3rdparty/libwebp/src/dsp/enc_sse41.c
index 65c01ae..a178390 100644
--- a/src/3rdparty/libwebp/src/dsp/enc_sse41.c
+++ b/src/3rdparty/libwebp/src/dsp/enc_sse41.c
@@ -17,6 +17,7 @@
#include <smmintrin.h>
#include <stdlib.h> // for abs()
+#include "./common_sse2.h"
#include "../enc/vp8enci.h"
//------------------------------------------------------------------------------
@@ -67,55 +68,45 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
// reconstructed samples.
// Hadamard transform
-// Returns the difference between the weighted sum of the absolute value of
-// transformed coefficients.
+// Returns the weighted sum of the absolute value of transformed coefficients.
+// w[] contains a row-major 4 by 4 symmetric matrix.
static int TTransform(const uint8_t* inA, const uint8_t* inB,
const uint16_t* const w) {
+ int32_t sum[4];
__m128i tmp_0, tmp_1, tmp_2, tmp_3;
- // Load, combine and transpose inputs.
+ // Load and combine inputs.
{
- const __m128i inA_0 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 0]);
- const __m128i inA_1 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 1]);
- const __m128i inA_2 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 2]);
+ const __m128i inA_0 = _mm_loadu_si128((const __m128i*)&inA[BPS * 0]);
+ const __m128i inA_1 = _mm_loadu_si128((const __m128i*)&inA[BPS * 1]);
+ const __m128i inA_2 = _mm_loadu_si128((const __m128i*)&inA[BPS * 2]);
+ // In SSE4.1, with gcc 4.8 at least (maybe other versions),
+ // _mm_loadu_si128 is faster than _mm_loadl_epi64. But for the last lump
+ // of inA and inB, _mm_loadl_epi64 is still used not to have an out of
+ // bound read.
const __m128i inA_3 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 3]);
- const __m128i inB_0 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 0]);
- const __m128i inB_1 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 1]);
- const __m128i inB_2 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 2]);
+ const __m128i inB_0 = _mm_loadu_si128((const __m128i*)&inB[BPS * 0]);
+ const __m128i inB_1 = _mm_loadu_si128((const __m128i*)&inB[BPS * 1]);
+ const __m128i inB_2 = _mm_loadu_si128((const __m128i*)&inB[BPS * 2]);
const __m128i inB_3 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 3]);
// Combine inA and inB (we'll do two transforms in parallel).
- const __m128i inAB_0 = _mm_unpacklo_epi8(inA_0, inB_0);
- const __m128i inAB_1 = _mm_unpacklo_epi8(inA_1, inB_1);
- const __m128i inAB_2 = _mm_unpacklo_epi8(inA_2, inB_2);
- const __m128i inAB_3 = _mm_unpacklo_epi8(inA_3, inB_3);
- // a00 b00 a01 b01 a02 b03 a03 b03 0 0 0 0 0 0 0 0
- // a10 b10 a11 b11 a12 b12 a13 b13 0 0 0 0 0 0 0 0
- // a20 b20 a21 b21 a22 b22 a23 b23 0 0 0 0 0 0 0 0
- // a30 b30 a31 b31 a32 b32 a33 b33 0 0 0 0 0 0 0 0
-
- // Transpose the two 4x4, discarding the filling zeroes.
- const __m128i transpose0_0 = _mm_unpacklo_epi8(inAB_0, inAB_2);
- const __m128i transpose0_1 = _mm_unpacklo_epi8(inAB_1, inAB_3);
- // a00 a20 b00 b20 a01 a21 b01 b21 a02 a22 b02 b22 a03 a23 b03 b23
- // a10 a30 b10 b30 a11 a31 b11 b31 a12 a32 b12 b32 a13 a33 b13 b33
- const __m128i transpose1_0 = _mm_unpacklo_epi8(transpose0_0, transpose0_1);
- const __m128i transpose1_1 = _mm_unpackhi_epi8(transpose0_0, transpose0_1);
- // a00 a10 a20 a30 b00 b10 b20 b30 a01 a11 a21 a31 b01 b11 b21 b31
- // a02 a12 a22 a32 b02 b12 b22 b32 a03 a13 a23 a33 b03 b13 b23 b33
-
- // Convert to 16b.
- tmp_0 = _mm_cvtepu8_epi16(transpose1_0);
- tmp_1 = _mm_cvtepu8_epi16(_mm_srli_si128(transpose1_0, 8));
- tmp_2 = _mm_cvtepu8_epi16(transpose1_1);
- tmp_3 = _mm_cvtepu8_epi16(_mm_srli_si128(transpose1_1, 8));
- // a00 a10 a20 a30 b00 b10 b20 b30
- // a01 a11 a21 a31 b01 b11 b21 b31
- // a02 a12 a22 a32 b02 b12 b22 b32
- // a03 a13 a23 a33 b03 b13 b23 b33
+ const __m128i inAB_0 = _mm_unpacklo_epi32(inA_0, inB_0);
+ const __m128i inAB_1 = _mm_unpacklo_epi32(inA_1, inB_1);
+ const __m128i inAB_2 = _mm_unpacklo_epi32(inA_2, inB_2);
+ const __m128i inAB_3 = _mm_unpacklo_epi32(inA_3, inB_3);
+ tmp_0 = _mm_cvtepu8_epi16(inAB_0);
+ tmp_1 = _mm_cvtepu8_epi16(inAB_1);
+ tmp_2 = _mm_cvtepu8_epi16(inAB_2);
+ tmp_3 = _mm_cvtepu8_epi16(inAB_3);
+ // a00 a01 a02 a03 b00 b01 b02 b03
+ // a10 a11 a12 a13 b10 b11 b12 b13
+ // a20 a21 a22 a23 b20 b21 b22 b23
+ // a30 a31 a32 a33 b30 b31 b32 b33
}
- // Horizontal pass and subsequent transpose.
+ // Vertical pass first to avoid a transpose (vertical and horizontal passes
+ // are commutative because w/kWeightY is symmetric) and subsequent transpose.
{
// Calculate a and b (two 4x4 at once).
const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
@@ -132,33 +123,10 @@ static int TTransform(const uint8_t* inA, const uint8_t* inB,
// a30 a31 a32 a33 b30 b31 b32 b33
// Transpose the two 4x4.
- const __m128i transpose0_0 = _mm_unpacklo_epi16(b0, b1);
- const __m128i transpose0_1 = _mm_unpacklo_epi16(b2, b3);
- const __m128i transpose0_2 = _mm_unpackhi_epi16(b0, b1);
- const __m128i transpose0_3 = _mm_unpackhi_epi16(b2, b3);
- // a00 a10 a01 a11 a02 a12 a03 a13
- // a20 a30 a21 a31 a22 a32 a23 a33
- // b00 b10 b01 b11 b02 b12 b03 b13
- // b20 b30 b21 b31 b22 b32 b23 b33
- const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
- const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);
- const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
- const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);
- // a00 a10 a20 a30 a01 a11 a21 a31
- // b00 b10 b20 b30 b01 b11 b21 b31
- // a02 a12 a22 a32 a03 a13 a23 a33
- // b02 b12 a22 b32 b03 b13 b23 b33
- tmp_0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);
- tmp_1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);
- tmp_2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);
- tmp_3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);
- // a00 a10 a20 a30 b00 b10 b20 b30
- // a01 a11 a21 a31 b01 b11 b21 b31
- // a02 a12 a22 a32 b02 b12 b22 b32
- // a03 a13 a23 a33 b03 b13 b23 b33
+ VP8Transpose_2_4x4_16b(&b0, &b1, &b2, &b3, &tmp_0, &tmp_1, &tmp_2, &tmp_3);
}
- // Vertical pass and difference of weighted sums.
+ // Horizontal pass and difference of weighted sums.
{
// Load all inputs.
const __m128i w_0 = _mm_loadu_si128((const __m128i*)&w[0]);
@@ -195,11 +163,9 @@ static int TTransform(const uint8_t* inA, const uint8_t* inB,
// difference of weighted sums
A_b2 = _mm_sub_epi32(A_b0, B_b0);
- // cascading summation of the differences
- B_b0 = _mm_hadd_epi32(A_b2, A_b2);
- B_b2 = _mm_hadd_epi32(B_b0, B_b0);
- return _mm_cvtsi128_si32(B_b2);
+ _mm_storeu_si128((__m128i*)&sum[0], A_b2);
}
+ return sum[0] + sum[1] + sum[2] + sum[3];
}
static int Disto4x4(const uint8_t* const a, const uint8_t* const b,