diff options
Diffstat (limited to 'src/3rdparty/libwebp/src/dsp/enc_sse2.c')
-rw-r--r-- | src/3rdparty/libwebp/src/dsp/enc_sse2.c | 175 |
1 files changed, 100 insertions, 75 deletions
diff --git a/src/3rdparty/libwebp/src/dsp/enc_sse2.c b/src/3rdparty/libwebp/src/dsp/enc_sse2.c index 540a3cb..9958d9f 100644 --- a/src/3rdparty/libwebp/src/dsp/enc_sse2.c +++ b/src/3rdparty/libwebp/src/dsp/enc_sse2.c @@ -17,7 +17,9 @@ #include <stdlib.h> // for abs() #include <emmintrin.h> +#include "../enc/cost.h" #include "../enc/vp8enci.h" +#include "../utils/utils.h" //------------------------------------------------------------------------------ // Quite useful macro for debugging. Left here for convenience. @@ -52,9 +54,9 @@ static void PrintReg(const __m128i r, const char* const name, int size) { // Compute susceptibility based on DCT-coeff histograms: // the higher, the "easier" the macroblock is to compress. -static void CollectHistogramSSE2(const uint8_t* ref, const uint8_t* pred, - int start_block, int end_block, - VP8Histogram* const histo) { +static void CollectHistogram(const uint8_t* ref, const uint8_t* pred, + int start_block, int end_block, + VP8Histogram* const histo) { const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH); int j; for (j = start_block; j < end_block; ++j) { @@ -98,8 +100,8 @@ static void CollectHistogramSSE2(const uint8_t* ref, const uint8_t* pred, // Transforms (Paragraph 14.4) // Does one or two inverse transforms. -static void ITransformSSE2(const uint8_t* ref, const int16_t* in, uint8_t* dst, - int do_two) { +static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst, + int do_two) { // This implementation makes use of 16-bit fixed point versions of two // multiply constants: // K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16 @@ -318,8 +320,7 @@ static void ITransformSSE2(const uint8_t* ref, const int16_t* in, uint8_t* dst, } } -static void FTransformSSE2(const uint8_t* src, const uint8_t* ref, - int16_t* out) { +static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) { const __m128i zero = _mm_setzero_si128(); const __m128i seven = _mm_set1_epi16(7); const __m128i k937 = _mm_set1_epi32(937); @@ -444,14 +445,14 @@ static void FTransformSSE2(const uint8_t* src, const uint8_t* ref, // -> f1 = f1 + 1 - (a3 == 0) const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero)); - _mm_storel_epi64((__m128i*)&out[ 0], d0); - _mm_storel_epi64((__m128i*)&out[ 4], g1); - _mm_storel_epi64((__m128i*)&out[ 8], d2); - _mm_storel_epi64((__m128i*)&out[12], f3); + const __m128i d0_g1 = _mm_unpacklo_epi64(d0, g1); + const __m128i d2_f3 = _mm_unpacklo_epi64(d2, f3); + _mm_storeu_si128((__m128i*)&out[0], d0_g1); + _mm_storeu_si128((__m128i*)&out[8], d2_f3); } } -static void FTransformWHTSSE2(const int16_t* in, int16_t* out) { +static void FTransformWHT(const int16_t* in, int16_t* out) { int32_t tmp[16]; int i; for (i = 0; i < 4; ++i, in += 64) { @@ -487,8 +488,8 @@ static void FTransformWHTSSE2(const int16_t* in, int16_t* out) { //------------------------------------------------------------------------------ // Metric -static int SSE_Nx4SSE2(const uint8_t* a, const uint8_t* b, - int num_quads, int do_16) { +static int SSE_Nx4(const uint8_t* a, const uint8_t* b, + int num_quads, int do_16) { const __m128i zero = _mm_setzero_si128(); __m128i sum1 = zero; __m128i sum2 = zero; @@ -565,19 +566,19 @@ static int SSE_Nx4SSE2(const uint8_t* a, const uint8_t* b, } } -static int SSE16x16SSE2(const uint8_t* a, const uint8_t* b) { - return SSE_Nx4SSE2(a, b, 4, 1); +static int SSE16x16(const uint8_t* a, const uint8_t* b) { + return SSE_Nx4(a, b, 4, 1); } -static int SSE16x8SSE2(const uint8_t* a, const uint8_t* b) { - return SSE_Nx4SSE2(a, b, 2, 1); +static int SSE16x8(const uint8_t* a, const uint8_t* b) { + return SSE_Nx4(a, b, 2, 1); } -static int SSE8x8SSE2(const uint8_t* a, const uint8_t* b) { - return SSE_Nx4SSE2(a, b, 2, 0); +static int SSE8x8(const uint8_t* a, const uint8_t* b) { + return SSE_Nx4(a, b, 2, 0); } -static int SSE4x4SSE2(const uint8_t* a, const uint8_t* b) { +static int SSE4x4(const uint8_t* a, const uint8_t* b) { const __m128i zero = _mm_setzero_si128(); // Load values. Note that we read 8 pixels instead of 4, @@ -634,8 +635,8 @@ static int SSE4x4SSE2(const uint8_t* a, const uint8_t* b) { // Hadamard transform // Returns the difference between the weighted sum of the absolute value of // transformed coefficients. -static int TTransformSSE2(const uint8_t* inA, const uint8_t* inB, - const uint16_t* const w) { +static int TTransform(const uint8_t* inA, const uint8_t* inB, + const uint16_t* const w) { int32_t sum[4]; __m128i tmp_0, tmp_1, tmp_2, tmp_3; const __m128i zero = _mm_setzero_si128(); @@ -782,19 +783,19 @@ static int TTransformSSE2(const uint8_t* inA, const uint8_t* inB, return sum[0] + sum[1] + sum[2] + sum[3]; } -static int Disto4x4SSE2(const uint8_t* const a, const uint8_t* const b, - const uint16_t* const w) { - const int diff_sum = TTransformSSE2(a, b, w); +static int Disto4x4(const uint8_t* const a, const uint8_t* const b, + const uint16_t* const w) { + const int diff_sum = TTransform(a, b, w); return abs(diff_sum) >> 5; } -static int Disto16x16SSE2(const uint8_t* const a, const uint8_t* const b, - const uint16_t* const w) { +static int Disto16x16(const uint8_t* const a, const uint8_t* const b, + const uint16_t* const w) { int D = 0; int x, y; for (y = 0; y < 16 * BPS; y += 4 * BPS) { for (x = 0; x < 16; x += 4) { - D += Disto4x4SSE2(a + x + y, b + x + y, w); + D += Disto4x4(a + x + y, b + x + y, w); } } return D; @@ -804,9 +805,9 @@ static int Disto16x16SSE2(const uint8_t* const a, const uint8_t* const b, // Quantization // -// Simple quantization -static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16], - int n, const VP8Matrix* const mtx) { +static WEBP_INLINE int DoQuantizeBlock(int16_t in[16], int16_t out[16], + const uint16_t* const sharpen, + const VP8Matrix* const mtx) { const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL); const __m128i zero = _mm_setzero_si128(); __m128i coeff0, coeff8; @@ -818,18 +819,14 @@ static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16], // we can use _mm_load_si128 instead of _mm_loadu_si128. __m128i in0 = _mm_loadu_si128((__m128i*)&in[0]); __m128i in8 = _mm_loadu_si128((__m128i*)&in[8]); - const __m128i sharpen0 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[0]); - const __m128i sharpen8 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[8]); const __m128i iq0 = _mm_loadu_si128((__m128i*)&mtx->iq_[0]); const __m128i iq8 = _mm_loadu_si128((__m128i*)&mtx->iq_[8]); - const __m128i bias0 = _mm_loadu_si128((__m128i*)&mtx->bias_[0]); - const __m128i bias8 = _mm_loadu_si128((__m128i*)&mtx->bias_[8]); const __m128i q0 = _mm_loadu_si128((__m128i*)&mtx->q_[0]); const __m128i q8 = _mm_loadu_si128((__m128i*)&mtx->q_[8]); - // sign(in) = in >> 15 (0x0000 if positive, 0xffff if negative) - const __m128i sign0 = _mm_srai_epi16(in0, 15); - const __m128i sign8 = _mm_srai_epi16(in8, 15); + // extract sign(in) (0x0000 if positive, 0xffff if negative) + const __m128i sign0 = _mm_cmpgt_epi16(zero, in0); + const __m128i sign8 = _mm_cmpgt_epi16(zero, in8); // coeff = abs(in) = (in ^ sign) - sign coeff0 = _mm_xor_si128(in0, sign0); @@ -838,32 +835,35 @@ static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16], coeff8 = _mm_sub_epi16(coeff8, sign8); // coeff = abs(in) + sharpen - coeff0 = _mm_add_epi16(coeff0, sharpen0); - coeff8 = _mm_add_epi16(coeff8, sharpen8); + if (sharpen != NULL) { + const __m128i sharpen0 = _mm_loadu_si128((__m128i*)&sharpen[0]); + const __m128i sharpen8 = _mm_loadu_si128((__m128i*)&sharpen[8]); + coeff0 = _mm_add_epi16(coeff0, sharpen0); + coeff8 = _mm_add_epi16(coeff8, sharpen8); + } - // out = (coeff * iQ + B) >> QFIX; + // out = (coeff * iQ + B) >> QFIX { // doing calculations with 32b precision (QFIX=17) // out = (coeff * iQ) - __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0); - __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0); - __m128i coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8); - __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8); + const __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0); + const __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0); + const __m128i coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8); + const __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8); __m128i out_00 = _mm_unpacklo_epi16(coeff_iQ0L, coeff_iQ0H); __m128i out_04 = _mm_unpackhi_epi16(coeff_iQ0L, coeff_iQ0H); __m128i out_08 = _mm_unpacklo_epi16(coeff_iQ8L, coeff_iQ8H); __m128i out_12 = _mm_unpackhi_epi16(coeff_iQ8L, coeff_iQ8H); - // expand bias from 16b to 32b - __m128i bias_00 = _mm_unpacklo_epi16(bias0, zero); - __m128i bias_04 = _mm_unpackhi_epi16(bias0, zero); - __m128i bias_08 = _mm_unpacklo_epi16(bias8, zero); - __m128i bias_12 = _mm_unpackhi_epi16(bias8, zero); // out = (coeff * iQ + B) + const __m128i bias_00 = _mm_loadu_si128((__m128i*)&mtx->bias_[0]); + const __m128i bias_04 = _mm_loadu_si128((__m128i*)&mtx->bias_[4]); + const __m128i bias_08 = _mm_loadu_si128((__m128i*)&mtx->bias_[8]); + const __m128i bias_12 = _mm_loadu_si128((__m128i*)&mtx->bias_[12]); out_00 = _mm_add_epi32(out_00, bias_00); out_04 = _mm_add_epi32(out_04, bias_04); out_08 = _mm_add_epi32(out_08, bias_08); out_12 = _mm_add_epi32(out_12, bias_12); - // out = (coeff * iQ + B) >> QFIX; + // out = QUANTDIV(coeff, iQ, B, QFIX) out_00 = _mm_srai_epi32(out_00, QFIX); out_04 = _mm_srai_epi32(out_04, QFIX); out_08 = _mm_srai_epi32(out_08, QFIX); @@ -916,19 +916,44 @@ static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16], } // detect if all 'out' values are zeroes or not - { - int32_t tmp[4]; - _mm_storeu_si128((__m128i*)tmp, packed_out); - if (n) { - tmp[0] &= ~0xff; - } - return (tmp[3] || tmp[2] || tmp[1] || tmp[0]); - } + return (_mm_movemask_epi8(_mm_cmpeq_epi8(packed_out, zero)) != 0xffff); } -static int QuantizeBlockWHTSSE2(int16_t in[16], int16_t out[16], - const VP8Matrix* const mtx) { - return QuantizeBlockSSE2(in, out, 0, mtx); +static int QuantizeBlock(int16_t in[16], int16_t out[16], + const VP8Matrix* const mtx) { + return DoQuantizeBlock(in, out, &mtx->sharpen_[0], mtx); +} + +static int QuantizeBlockWHT(int16_t in[16], int16_t out[16], + const VP8Matrix* const mtx) { + return DoQuantizeBlock(in, out, NULL, mtx); +} + +// Forward declaration. +void VP8SetResidualCoeffsSSE2(const int16_t* const coeffs, + VP8Residual* const res); + +void VP8SetResidualCoeffsSSE2(const int16_t* const coeffs, + VP8Residual* const res) { + const __m128i c0 = _mm_loadu_si128((const __m128i*)coeffs); + const __m128i c1 = _mm_loadu_si128((const __m128i*)(coeffs + 8)); + // Use SSE to compare 8 values with a single instruction. + const __m128i zero = _mm_setzero_si128(); + const __m128i m0 = _mm_cmpeq_epi16(c0, zero); + const __m128i m1 = _mm_cmpeq_epi16(c1, zero); + // Get the comparison results as a bitmask, consisting of two times 16 bits: + // two identical bits for each result. Concatenate both bitmasks to get a + // single 32 bit value. Negate the mask to get the position of entries that + // are not equal to zero. We don't need to mask out least significant bits + // according to res->first, since coeffs[0] is 0 if res->first > 0 + const uint32_t mask = + ~(((uint32_t)_mm_movemask_epi8(m1) << 16) | _mm_movemask_epi8(m0)); + // The position of the most significant non-zero bit indicates the position of + // the last non-zero value. Divide the result by two because __movemask_epi8 + // operates on 8 bit values instead of 16 bit values. + assert(res->first == 0 || coeffs[0] == 0); + res->last = mask ? (BitsLog2Floor(mask) >> 1) : -1; + res->coeffs = coeffs; } #endif // WEBP_USE_SSE2 @@ -940,18 +965,18 @@ extern void VP8EncDspInitSSE2(void); void VP8EncDspInitSSE2(void) { #if defined(WEBP_USE_SSE2) - VP8CollectHistogram = CollectHistogramSSE2; - VP8EncQuantizeBlock = QuantizeBlockSSE2; - VP8EncQuantizeBlockWHT = QuantizeBlockWHTSSE2; - VP8ITransform = ITransformSSE2; - VP8FTransform = FTransformSSE2; - VP8FTransformWHT = FTransformWHTSSE2; - VP8SSE16x16 = SSE16x16SSE2; - VP8SSE16x8 = SSE16x8SSE2; - VP8SSE8x8 = SSE8x8SSE2; - VP8SSE4x4 = SSE4x4SSE2; - VP8TDisto4x4 = Disto4x4SSE2; - VP8TDisto16x16 = Disto16x16SSE2; + VP8CollectHistogram = CollectHistogram; + VP8EncQuantizeBlock = QuantizeBlock; + VP8EncQuantizeBlockWHT = QuantizeBlockWHT; + VP8ITransform = ITransform; + VP8FTransform = FTransform; + VP8FTransformWHT = FTransformWHT; + VP8SSE16x16 = SSE16x16; + VP8SSE16x8 = SSE16x8; + VP8SSE8x8 = SSE8x8; + VP8SSE4x4 = SSE4x4; + VP8TDisto4x4 = Disto4x4; + VP8TDisto16x16 = Disto16x16; #endif // WEBP_USE_SSE2 } |