summaryrefslogtreecommitdiffstats
path: root/src/3rdparty/libwebp/src/dsp/enc_sse2.c
diff options
context:
space:
mode:
authorLiang Qi <liang.qi@theqtcompany.com>2015-04-08 10:10:02 +0200
committerLiang Qi <liang.qi@theqtcompany.com>2015-04-09 08:41:30 +0000
commit600a2d18d40ba87a273e1a258ce3189b0edc11f8 (patch)
treed80ede3880cb86f06eae50191e5dd3b102482189 /src/3rdparty/libwebp/src/dsp/enc_sse2.c
parent32e437b1244e7bf45011e4fd770fc0eef40d02de (diff)
libwebp: update to 0.4.3
This commit imports libwebp 0.4.3, including AUTHORS, COPYING, ChangeLog, NEWS, PATENTS, README and src directories. In src, only includes header and source files. The patches required to build it in Qt will follow in separate commit(s). Change-Id: I23ebfd69e47a468c91a9e9b109e9cb8ac63705d4 Reviewed-by: Lars Knoll <lars.knoll@digia.com> Reviewed-by: Konstantin Ritt <ritt.ks@gmail.com> Reviewed-by: aavit <eirik.aavitsland@theqtcompany.com>
Diffstat (limited to 'src/3rdparty/libwebp/src/dsp/enc_sse2.c')
-rw-r--r--src/3rdparty/libwebp/src/dsp/enc_sse2.c175
1 files changed, 100 insertions, 75 deletions
diff --git a/src/3rdparty/libwebp/src/dsp/enc_sse2.c b/src/3rdparty/libwebp/src/dsp/enc_sse2.c
index 540a3cb..9958d9f 100644
--- a/src/3rdparty/libwebp/src/dsp/enc_sse2.c
+++ b/src/3rdparty/libwebp/src/dsp/enc_sse2.c
@@ -17,7 +17,9 @@
#include <stdlib.h> // for abs()
#include <emmintrin.h>
+#include "../enc/cost.h"
#include "../enc/vp8enci.h"
+#include "../utils/utils.h"
//------------------------------------------------------------------------------
// Quite useful macro for debugging. Left here for convenience.
@@ -52,9 +54,9 @@ static void PrintReg(const __m128i r, const char* const name, int size) {
// Compute susceptibility based on DCT-coeff histograms:
// the higher, the "easier" the macroblock is to compress.
-static void CollectHistogramSSE2(const uint8_t* ref, const uint8_t* pred,
- int start_block, int end_block,
- VP8Histogram* const histo) {
+static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
+ int start_block, int end_block,
+ VP8Histogram* const histo) {
const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
int j;
for (j = start_block; j < end_block; ++j) {
@@ -98,8 +100,8 @@ static void CollectHistogramSSE2(const uint8_t* ref, const uint8_t* pred,
// Transforms (Paragraph 14.4)
// Does one or two inverse transforms.
-static void ITransformSSE2(const uint8_t* ref, const int16_t* in, uint8_t* dst,
- int do_two) {
+static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
+ int do_two) {
// This implementation makes use of 16-bit fixed point versions of two
// multiply constants:
// K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
@@ -318,8 +320,7 @@ static void ITransformSSE2(const uint8_t* ref, const int16_t* in, uint8_t* dst,
}
}
-static void FTransformSSE2(const uint8_t* src, const uint8_t* ref,
- int16_t* out) {
+static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
const __m128i zero = _mm_setzero_si128();
const __m128i seven = _mm_set1_epi16(7);
const __m128i k937 = _mm_set1_epi32(937);
@@ -444,14 +445,14 @@ static void FTransformSSE2(const uint8_t* src, const uint8_t* ref,
// -> f1 = f1 + 1 - (a3 == 0)
const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero));
- _mm_storel_epi64((__m128i*)&out[ 0], d0);
- _mm_storel_epi64((__m128i*)&out[ 4], g1);
- _mm_storel_epi64((__m128i*)&out[ 8], d2);
- _mm_storel_epi64((__m128i*)&out[12], f3);
+ const __m128i d0_g1 = _mm_unpacklo_epi64(d0, g1);
+ const __m128i d2_f3 = _mm_unpacklo_epi64(d2, f3);
+ _mm_storeu_si128((__m128i*)&out[0], d0_g1);
+ _mm_storeu_si128((__m128i*)&out[8], d2_f3);
}
}
-static void FTransformWHTSSE2(const int16_t* in, int16_t* out) {
+static void FTransformWHT(const int16_t* in, int16_t* out) {
int32_t tmp[16];
int i;
for (i = 0; i < 4; ++i, in += 64) {
@@ -487,8 +488,8 @@ static void FTransformWHTSSE2(const int16_t* in, int16_t* out) {
//------------------------------------------------------------------------------
// Metric
-static int SSE_Nx4SSE2(const uint8_t* a, const uint8_t* b,
- int num_quads, int do_16) {
+static int SSE_Nx4(const uint8_t* a, const uint8_t* b,
+ int num_quads, int do_16) {
const __m128i zero = _mm_setzero_si128();
__m128i sum1 = zero;
__m128i sum2 = zero;
@@ -565,19 +566,19 @@ static int SSE_Nx4SSE2(const uint8_t* a, const uint8_t* b,
}
}
-static int SSE16x16SSE2(const uint8_t* a, const uint8_t* b) {
- return SSE_Nx4SSE2(a, b, 4, 1);
+static int SSE16x16(const uint8_t* a, const uint8_t* b) {
+ return SSE_Nx4(a, b, 4, 1);
}
-static int SSE16x8SSE2(const uint8_t* a, const uint8_t* b) {
- return SSE_Nx4SSE2(a, b, 2, 1);
+static int SSE16x8(const uint8_t* a, const uint8_t* b) {
+ return SSE_Nx4(a, b, 2, 1);
}
-static int SSE8x8SSE2(const uint8_t* a, const uint8_t* b) {
- return SSE_Nx4SSE2(a, b, 2, 0);
+static int SSE8x8(const uint8_t* a, const uint8_t* b) {
+ return SSE_Nx4(a, b, 2, 0);
}
-static int SSE4x4SSE2(const uint8_t* a, const uint8_t* b) {
+static int SSE4x4(const uint8_t* a, const uint8_t* b) {
const __m128i zero = _mm_setzero_si128();
// Load values. Note that we read 8 pixels instead of 4,
@@ -634,8 +635,8 @@ static int SSE4x4SSE2(const uint8_t* a, const uint8_t* b) {
// Hadamard transform
// Returns the difference between the weighted sum of the absolute value of
// transformed coefficients.
-static int TTransformSSE2(const uint8_t* inA, const uint8_t* inB,
- const uint16_t* const w) {
+static int TTransform(const uint8_t* inA, const uint8_t* inB,
+ const uint16_t* const w) {
int32_t sum[4];
__m128i tmp_0, tmp_1, tmp_2, tmp_3;
const __m128i zero = _mm_setzero_si128();
@@ -782,19 +783,19 @@ static int TTransformSSE2(const uint8_t* inA, const uint8_t* inB,
return sum[0] + sum[1] + sum[2] + sum[3];
}
-static int Disto4x4SSE2(const uint8_t* const a, const uint8_t* const b,
- const uint16_t* const w) {
- const int diff_sum = TTransformSSE2(a, b, w);
+static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
+ const uint16_t* const w) {
+ const int diff_sum = TTransform(a, b, w);
return abs(diff_sum) >> 5;
}
-static int Disto16x16SSE2(const uint8_t* const a, const uint8_t* const b,
- const uint16_t* const w) {
+static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
+ const uint16_t* const w) {
int D = 0;
int x, y;
for (y = 0; y < 16 * BPS; y += 4 * BPS) {
for (x = 0; x < 16; x += 4) {
- D += Disto4x4SSE2(a + x + y, b + x + y, w);
+ D += Disto4x4(a + x + y, b + x + y, w);
}
}
return D;
@@ -804,9 +805,9 @@ static int Disto16x16SSE2(const uint8_t* const a, const uint8_t* const b,
// Quantization
//
-// Simple quantization
-static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
- int n, const VP8Matrix* const mtx) {
+static WEBP_INLINE int DoQuantizeBlock(int16_t in[16], int16_t out[16],
+ const uint16_t* const sharpen,
+ const VP8Matrix* const mtx) {
const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL);
const __m128i zero = _mm_setzero_si128();
__m128i coeff0, coeff8;
@@ -818,18 +819,14 @@ static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
// we can use _mm_load_si128 instead of _mm_loadu_si128.
__m128i in0 = _mm_loadu_si128((__m128i*)&in[0]);
__m128i in8 = _mm_loadu_si128((__m128i*)&in[8]);
- const __m128i sharpen0 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[0]);
- const __m128i sharpen8 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[8]);
const __m128i iq0 = _mm_loadu_si128((__m128i*)&mtx->iq_[0]);
const __m128i iq8 = _mm_loadu_si128((__m128i*)&mtx->iq_[8]);
- const __m128i bias0 = _mm_loadu_si128((__m128i*)&mtx->bias_[0]);
- const __m128i bias8 = _mm_loadu_si128((__m128i*)&mtx->bias_[8]);
const __m128i q0 = _mm_loadu_si128((__m128i*)&mtx->q_[0]);
const __m128i q8 = _mm_loadu_si128((__m128i*)&mtx->q_[8]);
- // sign(in) = in >> 15 (0x0000 if positive, 0xffff if negative)
- const __m128i sign0 = _mm_srai_epi16(in0, 15);
- const __m128i sign8 = _mm_srai_epi16(in8, 15);
+ // extract sign(in) (0x0000 if positive, 0xffff if negative)
+ const __m128i sign0 = _mm_cmpgt_epi16(zero, in0);
+ const __m128i sign8 = _mm_cmpgt_epi16(zero, in8);
// coeff = abs(in) = (in ^ sign) - sign
coeff0 = _mm_xor_si128(in0, sign0);
@@ -838,32 +835,35 @@ static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
coeff8 = _mm_sub_epi16(coeff8, sign8);
// coeff = abs(in) + sharpen
- coeff0 = _mm_add_epi16(coeff0, sharpen0);
- coeff8 = _mm_add_epi16(coeff8, sharpen8);
+ if (sharpen != NULL) {
+ const __m128i sharpen0 = _mm_loadu_si128((__m128i*)&sharpen[0]);
+ const __m128i sharpen8 = _mm_loadu_si128((__m128i*)&sharpen[8]);
+ coeff0 = _mm_add_epi16(coeff0, sharpen0);
+ coeff8 = _mm_add_epi16(coeff8, sharpen8);
+ }
- // out = (coeff * iQ + B) >> QFIX;
+ // out = (coeff * iQ + B) >> QFIX
{
// doing calculations with 32b precision (QFIX=17)
// out = (coeff * iQ)
- __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0);
- __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0);
- __m128i coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8);
- __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8);
+ const __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0);
+ const __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0);
+ const __m128i coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8);
+ const __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8);
__m128i out_00 = _mm_unpacklo_epi16(coeff_iQ0L, coeff_iQ0H);
__m128i out_04 = _mm_unpackhi_epi16(coeff_iQ0L, coeff_iQ0H);
__m128i out_08 = _mm_unpacklo_epi16(coeff_iQ8L, coeff_iQ8H);
__m128i out_12 = _mm_unpackhi_epi16(coeff_iQ8L, coeff_iQ8H);
- // expand bias from 16b to 32b
- __m128i bias_00 = _mm_unpacklo_epi16(bias0, zero);
- __m128i bias_04 = _mm_unpackhi_epi16(bias0, zero);
- __m128i bias_08 = _mm_unpacklo_epi16(bias8, zero);
- __m128i bias_12 = _mm_unpackhi_epi16(bias8, zero);
// out = (coeff * iQ + B)
+ const __m128i bias_00 = _mm_loadu_si128((__m128i*)&mtx->bias_[0]);
+ const __m128i bias_04 = _mm_loadu_si128((__m128i*)&mtx->bias_[4]);
+ const __m128i bias_08 = _mm_loadu_si128((__m128i*)&mtx->bias_[8]);
+ const __m128i bias_12 = _mm_loadu_si128((__m128i*)&mtx->bias_[12]);
out_00 = _mm_add_epi32(out_00, bias_00);
out_04 = _mm_add_epi32(out_04, bias_04);
out_08 = _mm_add_epi32(out_08, bias_08);
out_12 = _mm_add_epi32(out_12, bias_12);
- // out = (coeff * iQ + B) >> QFIX;
+ // out = QUANTDIV(coeff, iQ, B, QFIX)
out_00 = _mm_srai_epi32(out_00, QFIX);
out_04 = _mm_srai_epi32(out_04, QFIX);
out_08 = _mm_srai_epi32(out_08, QFIX);
@@ -916,19 +916,44 @@ static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
}
// detect if all 'out' values are zeroes or not
- {
- int32_t tmp[4];
- _mm_storeu_si128((__m128i*)tmp, packed_out);
- if (n) {
- tmp[0] &= ~0xff;
- }
- return (tmp[3] || tmp[2] || tmp[1] || tmp[0]);
- }
+ return (_mm_movemask_epi8(_mm_cmpeq_epi8(packed_out, zero)) != 0xffff);
}
-static int QuantizeBlockWHTSSE2(int16_t in[16], int16_t out[16],
- const VP8Matrix* const mtx) {
- return QuantizeBlockSSE2(in, out, 0, mtx);
+static int QuantizeBlock(int16_t in[16], int16_t out[16],
+ const VP8Matrix* const mtx) {
+ return DoQuantizeBlock(in, out, &mtx->sharpen_[0], mtx);
+}
+
+static int QuantizeBlockWHT(int16_t in[16], int16_t out[16],
+ const VP8Matrix* const mtx) {
+ return DoQuantizeBlock(in, out, NULL, mtx);
+}
+
+// Forward declaration.
+void VP8SetResidualCoeffsSSE2(const int16_t* const coeffs,
+ VP8Residual* const res);
+
+void VP8SetResidualCoeffsSSE2(const int16_t* const coeffs,
+ VP8Residual* const res) {
+ const __m128i c0 = _mm_loadu_si128((const __m128i*)coeffs);
+ const __m128i c1 = _mm_loadu_si128((const __m128i*)(coeffs + 8));
+ // Use SSE to compare 8 values with a single instruction.
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i m0 = _mm_cmpeq_epi16(c0, zero);
+ const __m128i m1 = _mm_cmpeq_epi16(c1, zero);
+ // Get the comparison results as a bitmask, consisting of two times 16 bits:
+ // two identical bits for each result. Concatenate both bitmasks to get a
+ // single 32 bit value. Negate the mask to get the position of entries that
+ // are not equal to zero. We don't need to mask out least significant bits
+ // according to res->first, since coeffs[0] is 0 if res->first > 0
+ const uint32_t mask =
+ ~(((uint32_t)_mm_movemask_epi8(m1) << 16) | _mm_movemask_epi8(m0));
+ // The position of the most significant non-zero bit indicates the position of
+ // the last non-zero value. Divide the result by two because __movemask_epi8
+ // operates on 8 bit values instead of 16 bit values.
+ assert(res->first == 0 || coeffs[0] == 0);
+ res->last = mask ? (BitsLog2Floor(mask) >> 1) : -1;
+ res->coeffs = coeffs;
}
#endif // WEBP_USE_SSE2
@@ -940,18 +965,18 @@ extern void VP8EncDspInitSSE2(void);
void VP8EncDspInitSSE2(void) {
#if defined(WEBP_USE_SSE2)
- VP8CollectHistogram = CollectHistogramSSE2;
- VP8EncQuantizeBlock = QuantizeBlockSSE2;
- VP8EncQuantizeBlockWHT = QuantizeBlockWHTSSE2;
- VP8ITransform = ITransformSSE2;
- VP8FTransform = FTransformSSE2;
- VP8FTransformWHT = FTransformWHTSSE2;
- VP8SSE16x16 = SSE16x16SSE2;
- VP8SSE16x8 = SSE16x8SSE2;
- VP8SSE8x8 = SSE8x8SSE2;
- VP8SSE4x4 = SSE4x4SSE2;
- VP8TDisto4x4 = Disto4x4SSE2;
- VP8TDisto16x16 = Disto16x16SSE2;
+ VP8CollectHistogram = CollectHistogram;
+ VP8EncQuantizeBlock = QuantizeBlock;
+ VP8EncQuantizeBlockWHT = QuantizeBlockWHT;
+ VP8ITransform = ITransform;
+ VP8FTransform = FTransform;
+ VP8FTransformWHT = FTransformWHT;
+ VP8SSE16x16 = SSE16x16;
+ VP8SSE16x8 = SSE16x8;
+ VP8SSE8x8 = SSE8x8;
+ VP8SSE4x4 = SSE4x4;
+ VP8TDisto4x4 = Disto4x4;
+ VP8TDisto16x16 = Disto16x16;
#endif // WEBP_USE_SSE2
}