Diffstat (limited to 'src/3rdparty/libwebp/src/dsp')
25 files changed, 1830 insertions, 748 deletions
diff --git a/src/3rdparty/libwebp/src/dsp/common_sse2.h b/src/3rdparty/libwebp/src/dsp/common_sse2.h
new file mode 100644
index 0000000..7cea13f
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/common_sse2.h
@@ -0,0 +1,109 @@
+// Copyright 2016 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// SSE2 code common to several files.
+//
+// Author: Vincent Rabaud (vrabaud@google.com)
+
+#ifndef WEBP_DSP_COMMON_SSE2_H_
+#define WEBP_DSP_COMMON_SSE2_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(WEBP_USE_SSE2)
+
+#include <emmintrin.h>
+
+//------------------------------------------------------------------------------
+// Quite useful macro for debugging. Left here for convenience.
+
+#if 0
+#include <stdio.h>
+static WEBP_INLINE void PrintReg(const __m128i r, const char* const name,
+                                 int size) {
+  int n;
+  union {
+    __m128i r;
+    uint8_t i8[16];
+    uint16_t i16[8];
+    uint32_t i32[4];
+    uint64_t i64[2];
+  } tmp;
+  tmp.r = r;
+  fprintf(stderr, "%s\t: ", name);
+  if (size == 8) {
+    for (n = 0; n < 16; ++n) fprintf(stderr, "%.2x ", tmp.i8[n]);
+  } else if (size == 16) {
+    for (n = 0; n < 8; ++n) fprintf(stderr, "%.4x ", tmp.i16[n]);
+  } else if (size == 32) {
+    for (n = 0; n < 4; ++n) fprintf(stderr, "%.8x ", tmp.i32[n]);
+  } else {
+    for (n = 0; n < 2; ++n) fprintf(stderr, "%.16lx ", tmp.i64[n]);
+  }
+  fprintf(stderr, "\n");
+}
+#endif
+
+//------------------------------------------------------------------------------
+// Math functions.
+
+// Return the sum of all the 8b in the register.
+static WEBP_INLINE int VP8HorizontalAdd8b(const __m128i* const a) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i sad8x2 = _mm_sad_epu8(*a, zero);
+  // sum the two sads: sad8x2[0:1] + sad8x2[8:9]
+  const __m128i sum = _mm_add_epi32(sad8x2, _mm_shuffle_epi32(sad8x2, 2));
+  return _mm_cvtsi128_si32(sum);
+}
+
+// Transpose two 4x4 16b matrices horizontally stored in registers.
+static WEBP_INLINE void VP8Transpose_2_4x4_16b(
+    const __m128i* const in0, const __m128i* const in1,
+    const __m128i* const in2, const __m128i* const in3, __m128i* const out0,
+    __m128i* const out1, __m128i* const out2, __m128i* const out3) {
+  // Transpose the two 4x4.
+  // a00 a01 a02 a03   b00 b01 b02 b03
+  // a10 a11 a12 a13   b10 b11 b12 b13
+  // a20 a21 a22 a23   b20 b21 b22 b23
+  // a30 a31 a32 a33   b30 b31 b32 b33
+  const __m128i transpose0_0 = _mm_unpacklo_epi16(*in0, *in1);
+  const __m128i transpose0_1 = _mm_unpacklo_epi16(*in2, *in3);
+  const __m128i transpose0_2 = _mm_unpackhi_epi16(*in0, *in1);
+  const __m128i transpose0_3 = _mm_unpackhi_epi16(*in2, *in3);
+  // a00 a10 a01 a11   a02 a12 a03 a13
+  // a20 a30 a21 a31   a22 a32 a23 a33
+  // b00 b10 b01 b11   b02 b12 b03 b13
+  // b20 b30 b21 b31   b22 b32 b23 b33
+  const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
+  const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);
+  const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
+  const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);
+  // a00 a10 a20 a30   a01 a11 a21 a31
+  // b00 b10 b20 b30   b01 b11 b21 b31
+  // a02 a12 a22 a32   a03 a13 a23 a33
+  // b02 b12 a22 b32   b03 b13 b23 b33
+  *out0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);
+  *out1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);
+  *out2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);
+  *out3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);
+  // a00 a10 a20 a30   b00 b10 b20 b30
+  // a01 a11 a21 a31   b01 b11 b21 b31
+  // a02 a12 a22 a32   b02 b12 b22 b32
+  // a03 a13 a23 a33   b03 b13 b23 b33
+}
+
+#endif  // WEBP_USE_SSE2
+
+#ifdef __cplusplus
+}    // extern "C"
+#endif
+
+#endif  // WEBP_DSP_COMMON_SSE2_H_
diff --git a/src/3rdparty/libwebp/src/dsp/cpu.c b/src/3rdparty/libwebp/src/dsp/cpu.c
index 8844cb4..cbb08db 100644
--- a/src/3rdparty/libwebp/src/dsp/cpu.c
+++ b/src/3rdparty/libwebp/src/dsp/cpu.c
@@ -13,6 +13,11 @@
 #include "./dsp.h"
 
+#if defined(WEBP_HAVE_NEON_RTCD)
+#include <stdio.h>
+#include <string.h>
+#endif
+
 #if defined(WEBP_ANDROID_NEON)
 #include <cpu-features.h>
 #endif
@@ -142,13 +147,33 @@ VP8CPUInfo VP8GetCPUInfo = AndroidCPUInfo;
 // define a dummy function to enable turning off NEON at runtime by setting
 // VP8DecGetCPUInfo = NULL
 static int armCPUInfo(CPUFeature feature) {
-  (void)feature;
+  if (feature != kNEON) return 0;
+#if defined(__linux__) && defined(WEBP_HAVE_NEON_RTCD)
+  {
+    int has_neon = 0;
+    char line[200];
+    FILE* const cpuinfo = fopen("/proc/cpuinfo", "r");
+    if (cpuinfo == NULL) return 0;
+    while (fgets(line, sizeof(line), cpuinfo)) {
+      if (!strncmp(line, "Features", 8)) {
+        if (strstr(line, " neon ") != NULL) {
+          has_neon = 1;
+          break;
+        }
+      }
+    }
+    fclose(cpuinfo);
+    return has_neon;
+  }
+#else
   return 1;
+#endif
 }
 VP8CPUInfo VP8GetCPUInfo = armCPUInfo;
-#elif defined(WEBP_USE_MIPS32) || defined(WEBP_USE_MIPS_DSP_R2)
+#elif defined(WEBP_USE_MIPS32) || defined(WEBP_USE_MIPS_DSP_R2) || \
+      defined(WEBP_USE_MSA)
 static int mipsCPUInfo(CPUFeature feature) {
-  if ((feature == kMIPS32) || (feature == kMIPSdspR2)) {
+  if ((feature == kMIPS32) || (feature == kMIPSdspR2) || (feature == kMSA)) {
     return 1;
   } else {
     return 0;
diff --git a/src/3rdparty/libwebp/src/dsp/dec.c b/src/3rdparty/libwebp/src/dsp/dec.c
index a787206..e92d693 100644
--- a/src/3rdparty/libwebp/src/dsp/dec.c
+++ b/src/3rdparty/libwebp/src/dsp/dec.c
@@ -13,6 +13,7 @@
 #include "./dsp.h"
 #include "../dec/vp8i.h"
+#include "../utils/utils.h"
 
 //------------------------------------------------------------------------------
 
@@ -654,6 +655,23 @@ static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
 
 //------------------------------------------------------------------------------
 
+static void DitherCombine8x8(const uint8_t*
dither, uint8_t* dst, + int dst_stride) { + int i, j; + for (j = 0; j < 8; ++j) { + for (i = 0; i < 8; ++i) { + const int delta0 = dither[i] - VP8_DITHER_AMP_CENTER; + const int delta1 = + (delta0 + VP8_DITHER_DESCALE_ROUNDER) >> VP8_DITHER_DESCALE; + dst[i] = clip_8b((int)dst[i] + delta1); + } + dst += dst_stride; + dither += 8; + } +} + +//------------------------------------------------------------------------------ + VP8DecIdct2 VP8Transform; VP8DecIdct VP8TransformAC3; VP8DecIdct VP8TransformUV; @@ -673,11 +691,15 @@ VP8SimpleFilterFunc VP8SimpleHFilter16; VP8SimpleFilterFunc VP8SimpleVFilter16i; VP8SimpleFilterFunc VP8SimpleHFilter16i; +void (*VP8DitherCombine8x8)(const uint8_t* dither, uint8_t* dst, + int dst_stride); + extern void VP8DspInitSSE2(void); extern void VP8DspInitSSE41(void); extern void VP8DspInitNEON(void); extern void VP8DspInitMIPS32(void); extern void VP8DspInitMIPSdspR2(void); +extern void VP8DspInitMSA(void); static volatile VP8CPUInfo dec_last_cpuinfo_used = (VP8CPUInfo)&dec_last_cpuinfo_used; @@ -734,6 +756,8 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8DspInit(void) { VP8PredChroma8[5] = DC8uvNoLeft; VP8PredChroma8[6] = DC8uvNoTopLeft; + VP8DitherCombine8x8 = DitherCombine8x8; + // If defined, use CPUInfo() to overwrite some pointers with faster versions. if (VP8GetCPUInfo != NULL) { #if defined(WEBP_USE_SSE2) @@ -761,6 +785,11 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8DspInit(void) { VP8DspInitMIPSdspR2(); } #endif +#if defined(WEBP_USE_MSA) + if (VP8GetCPUInfo(kMSA)) { + VP8DspInitMSA(); + } +#endif } dec_last_cpuinfo_used = VP8GetCPUInfo; } diff --git a/src/3rdparty/libwebp/src/dsp/dec_msa.c b/src/3rdparty/libwebp/src/dsp/dec_msa.c new file mode 100644 index 0000000..f76055c --- /dev/null +++ b/src/3rdparty/libwebp/src/dsp/dec_msa.c @@ -0,0 +1,172 @@ +// Copyright 2016 Google Inc. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the COPYING file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. 
+// ----------------------------------------------------------------------------- +// +// MSA version of dsp functions +// +// Author(s): Prashant Patil (prashant.patil@imgtec.com) + + +#include "./dsp.h" + +#if defined(WEBP_USE_MSA) + +#include "./msa_macro.h" + +//------------------------------------------------------------------------------ +// Transforms + +#define IDCT_1D_W(in0, in1, in2, in3, out0, out1, out2, out3) { \ + v4i32 a1_m, b1_m, c1_m, d1_m; \ + v4i32 c_tmp1_m, c_tmp2_m, d_tmp1_m, d_tmp2_m; \ + const v4i32 cospi8sqrt2minus1 = __msa_fill_w(20091); \ + const v4i32 sinpi8sqrt2 = __msa_fill_w(35468); \ + \ + a1_m = in0 + in2; \ + b1_m = in0 - in2; \ + c_tmp1_m = (in1 * sinpi8sqrt2) >> 16; \ + c_tmp2_m = in3 + ((in3 * cospi8sqrt2minus1) >> 16); \ + c1_m = c_tmp1_m - c_tmp2_m; \ + d_tmp1_m = in1 + ((in1 * cospi8sqrt2minus1) >> 16); \ + d_tmp2_m = (in3 * sinpi8sqrt2) >> 16; \ + d1_m = d_tmp1_m + d_tmp2_m; \ + BUTTERFLY_4(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3); \ +} +#define MULT1(a) ((((a) * 20091) >> 16) + (a)) +#define MULT2(a) (((a) * 35468) >> 16) + +static void TransformOne(const int16_t* in, uint8_t* dst) { + v8i16 input0, input1; + v4i32 in0, in1, in2, in3, hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3; + v4i32 res0, res1, res2, res3; + const v16i8 zero = { 0 }; + v16i8 dest0, dest1, dest2, dest3; + + LD_SH2(in, 8, input0, input1); + UNPCK_SH_SW(input0, in0, in1); + UNPCK_SH_SW(input1, in2, in3); + IDCT_1D_W(in0, in1, in2, in3, hz0, hz1, hz2, hz3); + TRANSPOSE4x4_SW_SW(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3); + IDCT_1D_W(hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3); + SRARI_W4_SW(vt0, vt1, vt2, vt3, 3); + TRANSPOSE4x4_SW_SW(vt0, vt1, vt2, vt3, vt0, vt1, vt2, vt3); + LD_SB4(dst, BPS, dest0, dest1, dest2, dest3); + ILVR_B4_SW(zero, dest0, zero, dest1, zero, dest2, zero, dest3, + res0, res1, res2, res3); + ILVR_H4_SW(zero, res0, zero, res1, zero, res2, zero, res3, + res0, res1, res2, res3); + ADD4(res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0, res1, res2, res3); + CLIP_SW4_0_255(res0, res1, res2, res3); + PCKEV_B2_SW(res0, res1, res2, res3, vt0, vt1); + res0 = (v4i32)__msa_pckev_b((v16i8)vt0, (v16i8)vt1); + ST4x4_UB(res0, res0, 3, 2, 1, 0, dst, BPS); +} + +static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) { + TransformOne(in, dst); + if (do_two) { + TransformOne(in + 16, dst + 4); + } +} + +static void TransformWHT(const int16_t* in, int16_t* out) { + v8i16 input0, input1; + const v8i16 mask0 = { 0, 1, 2, 3, 8, 9, 10, 11 }; + const v8i16 mask1 = { 4, 5, 6, 7, 12, 13, 14, 15 }; + const v8i16 mask2 = { 0, 4, 8, 12, 1, 5, 9, 13 }; + const v8i16 mask3 = { 3, 7, 11, 15, 2, 6, 10, 14 }; + v8i16 tmp0, tmp1, tmp2, tmp3; + v8i16 out0, out1; + + LD_SH2(in, 8, input0, input1); + input1 = SLDI_SH(input1, input1, 8); + tmp0 = input0 + input1; + tmp1 = input0 - input1; + VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3); + out0 = tmp2 + tmp3; + out1 = tmp2 - tmp3; + VSHF_H2_SH(out0, out1, out0, out1, mask2, mask3, input0, input1); + tmp0 = input0 + input1; + tmp1 = input0 - input1; + VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3); + tmp0 = tmp2 + tmp3; + tmp1 = tmp2 - tmp3; + ADDVI_H2_SH(tmp0, 3, tmp1, 3, out0, out1); + SRAI_H2_SH(out0, out1, 3); + out[0] = __msa_copy_s_h(out0, 0); + out[16] = __msa_copy_s_h(out0, 4); + out[32] = __msa_copy_s_h(out1, 0); + out[48] = __msa_copy_s_h(out1, 4); + out[64] = __msa_copy_s_h(out0, 1); + out[80] = __msa_copy_s_h(out0, 5); + out[96] = __msa_copy_s_h(out1, 1); + out[112] = __msa_copy_s_h(out1, 5); + 
out[128] = __msa_copy_s_h(out0, 2); + out[144] = __msa_copy_s_h(out0, 6); + out[160] = __msa_copy_s_h(out1, 2); + out[176] = __msa_copy_s_h(out1, 6); + out[192] = __msa_copy_s_h(out0, 3); + out[208] = __msa_copy_s_h(out0, 7); + out[224] = __msa_copy_s_h(out1, 3); + out[240] = __msa_copy_s_h(out1, 7); +} + +static void TransformDC(const int16_t* in, uint8_t* dst) { + const int DC = (in[0] + 4) >> 3; + const v8i16 tmp0 = __msa_fill_h(DC); + ADDBLK_ST4x4_UB(tmp0, tmp0, tmp0, tmp0, dst, BPS); +} + +static void TransformAC3(const int16_t* in, uint8_t* dst) { + const int a = in[0] + 4; + const int c4 = MULT2(in[4]); + const int d4 = MULT1(in[4]); + const int in2 = MULT2(in[1]); + const int in3 = MULT1(in[1]); + v4i32 tmp0 = { 0 }; + v4i32 out0 = __msa_fill_w(a + d4); + v4i32 out1 = __msa_fill_w(a + c4); + v4i32 out2 = __msa_fill_w(a - c4); + v4i32 out3 = __msa_fill_w(a - d4); + v4i32 res0, res1, res2, res3; + const v4i32 zero = { 0 }; + v16u8 dest0, dest1, dest2, dest3; + + INSERT_W4_SW(in3, in2, -in2, -in3, tmp0); + ADD4(out0, tmp0, out1, tmp0, out2, tmp0, out3, tmp0, + out0, out1, out2, out3); + SRAI_W4_SW(out0, out1, out2, out3, 3); + LD_UB4(dst, BPS, dest0, dest1, dest2, dest3); + ILVR_B4_SW(zero, dest0, zero, dest1, zero, dest2, zero, dest3, + res0, res1, res2, res3); + ILVR_H4_SW(zero, res0, zero, res1, zero, res2, zero, res3, + res0, res1, res2, res3); + ADD4(res0, out0, res1, out1, res2, out2, res3, out3, res0, res1, res2, res3); + CLIP_SW4_0_255(res0, res1, res2, res3); + PCKEV_B2_SW(res0, res1, res2, res3, out0, out1); + res0 = (v4i32)__msa_pckev_b((v16i8)out0, (v16i8)out1); + ST4x4_UB(res0, res0, 3, 2, 1, 0, dst, BPS); +} + +//------------------------------------------------------------------------------ +// Entry point + +extern void VP8DspInitMSA(void); + +WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitMSA(void) { + VP8TransformWHT = TransformWHT; + VP8Transform = TransformTwo; + VP8TransformDC = TransformDC; + VP8TransformAC3 = TransformAC3; +} + +#else // !WEBP_USE_MSA + +WEBP_DSP_INIT_STUB(VP8DspInitMSA) + +#endif // WEBP_USE_MSA diff --git a/src/3rdparty/libwebp/src/dsp/dec_sse2.c b/src/3rdparty/libwebp/src/dsp/dec_sse2.c index 935bf02..f0a8ddc 100644 --- a/src/3rdparty/libwebp/src/dsp/dec_sse2.c +++ b/src/3rdparty/libwebp/src/dsp/dec_sse2.c @@ -21,7 +21,9 @@ // #define USE_TRANSFORM_AC3 #include <emmintrin.h> +#include "./common_sse2.h" #include "../dec/vp8i.h" +#include "../utils/utils.h" //------------------------------------------------------------------------------ // Transforms (Paragraph 14.4) @@ -102,34 +104,7 @@ static void Transform(const int16_t* in, uint8_t* dst, int do_two) { const __m128i tmp3 = _mm_sub_epi16(a, d); // Transpose the two 4x4. 
- // a00 a01 a02 a03 b00 b01 b02 b03 - // a10 a11 a12 a13 b10 b11 b12 b13 - // a20 a21 a22 a23 b20 b21 b22 b23 - // a30 a31 a32 a33 b30 b31 b32 b33 - const __m128i transpose0_0 = _mm_unpacklo_epi16(tmp0, tmp1); - const __m128i transpose0_1 = _mm_unpacklo_epi16(tmp2, tmp3); - const __m128i transpose0_2 = _mm_unpackhi_epi16(tmp0, tmp1); - const __m128i transpose0_3 = _mm_unpackhi_epi16(tmp2, tmp3); - // a00 a10 a01 a11 a02 a12 a03 a13 - // a20 a30 a21 a31 a22 a32 a23 a33 - // b00 b10 b01 b11 b02 b12 b03 b13 - // b20 b30 b21 b31 b22 b32 b23 b33 - const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1); - const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3); - const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1); - const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3); - // a00 a10 a20 a30 a01 a11 a21 a31 - // b00 b10 b20 b30 b01 b11 b21 b31 - // a02 a12 a22 a32 a03 a13 a23 a33 - // b02 b12 a22 b32 b03 b13 b23 b33 - T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1); - T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1); - T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3); - T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3); - // a00 a10 a20 a30 b00 b10 b20 b30 - // a01 a11 a21 a31 b01 b11 b21 b31 - // a02 a12 a22 a32 b02 b12 b22 b32 - // a03 a13 a23 a33 b03 b13 b23 b33 + VP8Transpose_2_4x4_16b(&tmp0, &tmp1, &tmp2, &tmp3, &T0, &T1, &T2, &T3); } // Horizontal pass and subsequent transpose. @@ -164,34 +139,8 @@ static void Transform(const int16_t* in, uint8_t* dst, int do_two) { const __m128i shifted3 = _mm_srai_epi16(tmp3, 3); // Transpose the two 4x4. - // a00 a01 a02 a03 b00 b01 b02 b03 - // a10 a11 a12 a13 b10 b11 b12 b13 - // a20 a21 a22 a23 b20 b21 b22 b23 - // a30 a31 a32 a33 b30 b31 b32 b33 - const __m128i transpose0_0 = _mm_unpacklo_epi16(shifted0, shifted1); - const __m128i transpose0_1 = _mm_unpacklo_epi16(shifted2, shifted3); - const __m128i transpose0_2 = _mm_unpackhi_epi16(shifted0, shifted1); - const __m128i transpose0_3 = _mm_unpackhi_epi16(shifted2, shifted3); - // a00 a10 a01 a11 a02 a12 a03 a13 - // a20 a30 a21 a31 a22 a32 a23 a33 - // b00 b10 b01 b11 b02 b12 b03 b13 - // b20 b30 b21 b31 b22 b32 b23 b33 - const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1); - const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3); - const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1); - const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3); - // a00 a10 a20 a30 a01 a11 a21 a31 - // b00 b10 b20 b30 b01 b11 b21 b31 - // a02 a12 a22 a32 a03 a13 a23 a33 - // b02 b12 a22 b32 b03 b13 b23 b33 - T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1); - T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1); - T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3); - T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3); - // a00 a10 a20 a30 b00 b10 b20 b30 - // a01 a11 a21 a31 b01 b11 b21 b31 - // a02 a12 a22 a32 b02 b12 b22 b32 - // a03 a13 a23 a33 b03 b13 b23 b33 + VP8Transpose_2_4x4_16b(&shifted0, &shifted1, &shifted2, &shifted3, &T0, &T1, + &T2, &T3); } // Add inverse transform to 'dst' and store. 
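For reference, the lane layout documented in common_sse2.h corresponds to the following scalar sketch of what the new VP8Transpose_2_4x4_16b helper computes (illustrative only; the helper name is the only identifier taken from the patch). Each 128-bit row holds two 4x4 int16 matrices side by side, and both halves are transposed independently:

#include <stdint.h>

// Scalar equivalent (sketch, not part of the patch):
// in[i]  = { a[i][0..3], b[i][0..3] }
// out[j] = { a[0..3][j], b[0..3][j] }
static void Transpose_2_4x4_16b_C(const int16_t in[4][8], int16_t out[4][8]) {
  int i, j;
  for (j = 0; j < 4; ++j) {
    for (i = 0; i < 4; ++i) {
      out[j][i] = in[i][j];          // first ('a') matrix
      out[j][4 + i] = in[i][4 + j];  // second ('b') matrix
    }
  }
}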
diff --git a/src/3rdparty/libwebp/src/dsp/dec_sse41.c b/src/3rdparty/libwebp/src/dsp/dec_sse41.c index 224c6f8..8d6aed1 100644 --- a/src/3rdparty/libwebp/src/dsp/dec_sse41.c +++ b/src/3rdparty/libwebp/src/dsp/dec_sse41.c @@ -17,6 +17,7 @@ #include <smmintrin.h> #include "../dec/vp8i.h" +#include "../utils/utils.h" static void HE16(uint8_t* dst) { // horizontal int j; diff --git a/src/3rdparty/libwebp/src/dsp/dsp.h b/src/3rdparty/libwebp/src/dsp/dsp.h index 95f1ce0..1faac27 100644 --- a/src/3rdparty/libwebp/src/dsp/dsp.h +++ b/src/3rdparty/libwebp/src/dsp/dsp.h @@ -14,8 +14,11 @@ #ifndef WEBP_DSP_DSP_H_ #define WEBP_DSP_DSP_H_ +#ifdef HAVE_CONFIG_H +#include "../webp/config.h" +#endif + #include "../webp/types.h" -#include "../utils/utils.h" #ifdef __cplusplus extern "C" { @@ -72,7 +75,8 @@ extern "C" { // The intrinsics currently cause compiler errors with arm-nacl-gcc and the // inline assembly would need to be modified for use with Native Client. #if (defined(__ARM_NEON__) || defined(WEBP_ANDROID_NEON) || \ - defined(__aarch64__)) && !defined(__native_client__) + defined(__aarch64__) || defined(WEBP_HAVE_NEON)) && \ + !defined(__native_client__) #define WEBP_USE_NEON #endif @@ -92,6 +96,10 @@ extern "C" { #endif #endif +#if defined(__mips_msa) && defined(__mips_isa_rev) && (__mips_isa_rev >= 5) +#define WEBP_USE_MSA +#endif + // This macro prevents thread_sanitizer from reporting known concurrent writes. #define WEBP_TSAN_IGNORE_FUNCTION #if defined(__has_feature) @@ -101,6 +109,27 @@ extern "C" { #endif #endif +#define WEBP_UBSAN_IGNORE_UNDEF +#define WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW +#if !defined(WEBP_FORCE_ALIGNED) && defined(__clang__) && \ + defined(__has_attribute) +#if __has_attribute(no_sanitize) +// This macro prevents the undefined behavior sanitizer from reporting +// failures. This is only meant to silence unaligned loads on platforms that +// are known to support them. +#undef WEBP_UBSAN_IGNORE_UNDEF +#define WEBP_UBSAN_IGNORE_UNDEF \ + __attribute__((no_sanitize("undefined"))) + +// This macro prevents the undefined behavior sanitizer from reporting +// failures related to unsigned integer overflows. This is only meant to +// silence cases where this well defined behavior is expected. +#undef WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW +#define WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW \ + __attribute__((no_sanitize("unsigned-integer-overflow"))) +#endif +#endif + typedef enum { kSSE2, kSSE3, @@ -109,7 +138,8 @@ typedef enum { kAVX2, kNEON, kMIPS32, - kMIPSdspR2 + kMIPSdspR2, + kMSA } CPUFeature; // returns true if the CPU supports the feature. typedef int (*VP8CPUInfo)(CPUFeature feature); @@ -151,6 +181,8 @@ typedef int (*VP8Metric)(const uint8_t* pix, const uint8_t* ref); extern VP8Metric VP8SSE16x16, VP8SSE16x8, VP8SSE8x8, VP8SSE4x4; typedef int (*VP8WMetric)(const uint8_t* pix, const uint8_t* ref, const uint16_t* const weights); +// The weights for VP8TDisto4x4 and VP8TDisto16x16 contain a row-major +// 4 by 4 symmetric matrix. extern VP8WMetric VP8TDisto4x4, VP8TDisto16x16; typedef void (*VP8BlockCopy)(const uint8_t* src, uint8_t* dst); @@ -214,6 +246,35 @@ extern VP8GetResidualCostFunc VP8GetResidualCost; void VP8EncDspCostInit(void); //------------------------------------------------------------------------------ +// SSIM utils + +// struct for accumulating statistical moments +typedef struct { + double w; // sum(w_i) : sum of weights + double xm, ym; // sum(w_i * x_i), sum(w_i * y_i) + double xxm, xym, yym; // sum(w_i * x_i * x_i), etc. 
+} VP8DistoStats; + +#define VP8_SSIM_KERNEL 3 // total size of the kernel: 2 * VP8_SSIM_KERNEL + 1 +typedef void (*VP8SSIMAccumulateClippedFunc)(const uint8_t* src1, int stride1, + const uint8_t* src2, int stride2, + int xo, int yo, // center position + int W, int H, // plane dimension + VP8DistoStats* const stats); + +// This version is called with the guarantee that you can load 8 bytes and +// 8 rows at offset src1 and src2 +typedef void (*VP8SSIMAccumulateFunc)(const uint8_t* src1, int stride1, + const uint8_t* src2, int stride2, + VP8DistoStats* const stats); + +extern VP8SSIMAccumulateFunc VP8SSIMAccumulate; // unclipped / unchecked +extern VP8SSIMAccumulateClippedFunc VP8SSIMAccumulateClipped; // with clipping + +// must be called before using any of the above directly +void VP8SSIMDspInit(void); + +//------------------------------------------------------------------------------ // Decoding typedef void (*VP8DecIdct)(const int16_t* coeffs, uint8_t* dst); @@ -265,6 +326,15 @@ extern VP8LumaFilterFunc VP8HFilter16i; extern VP8ChromaFilterFunc VP8VFilter8i; // filtering u and v altogether extern VP8ChromaFilterFunc VP8HFilter8i; +// Dithering. Combines dithering values (centered around 128) with dst[], +// according to: dst[] = clip(dst[] + (((dither[]-128) + 8) >> 4) +#define VP8_DITHER_DESCALE 4 +#define VP8_DITHER_DESCALE_ROUNDER (1 << (VP8_DITHER_DESCALE - 1)) +#define VP8_DITHER_AMP_BITS 7 +#define VP8_DITHER_AMP_CENTER (1 << VP8_DITHER_AMP_BITS) +extern void (*VP8DitherCombine8x8)(const uint8_t* dither, uint8_t* dst, + int dst_stride); + // must be called before anything using the above void VP8DspInit(void); @@ -472,8 +542,10 @@ typedef enum { // Filter types. typedef void (*WebPFilterFunc)(const uint8_t* in, int width, int height, int stride, uint8_t* out); -typedef void (*WebPUnfilterFunc)(int width, int height, int stride, - int row, int num_rows, uint8_t* data); +// In-place un-filtering. +// Warning! 'prev_line' pointer can be equal to 'cur_line' or 'preds'. +typedef void (*WebPUnfilterFunc)(const uint8_t* prev_line, const uint8_t* preds, + uint8_t* cur_line, int width); // Filter the given data using the given predictor. // 'in' corresponds to a 2-dimensional pixel array of size (stride * height) diff --git a/src/3rdparty/libwebp/src/dsp/enc.c b/src/3rdparty/libwebp/src/dsp/enc.c index 8899d50..f639f55 100644 --- a/src/3rdparty/libwebp/src/dsp/enc.c +++ b/src/3rdparty/libwebp/src/dsp/enc.c @@ -69,7 +69,7 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred, // Convert coefficients to bin. for (k = 0; k < 16; ++k) { - const int v = abs(out[k]) >> 3; // TODO(skal): add rounding? + const int v = abs(out[k]) >> 3; const int clipped_value = clip_max(v, MAX_COEFF_THRESH); ++distribution[clipped_value]; } @@ -559,6 +559,7 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) { // Hadamard transform // Returns the weighted sum of the absolute value of transformed coefficients. +// w[] contains a row-major 4 by 4 symmetric matrix. 
static int TTransform(const uint8_t* in, const uint16_t* w) { int sum = 0; int tmp[16]; @@ -636,7 +637,7 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16], int level = QUANTDIV(coeff, iQ, B); if (level > MAX_LEVEL) level = MAX_LEVEL; if (sign) level = -level; - in[j] = level * Q; + in[j] = level * (int)Q; out[n] = level; if (level) last = n; } else { @@ -670,7 +671,7 @@ static int QuantizeBlockWHT(int16_t in[16], int16_t out[16], int level = QUANTDIV(coeff, iQ, B); if (level > MAX_LEVEL) level = MAX_LEVEL; if (sign) level = -level; - in[j] = level * Q; + in[j] = level * (int)Q; out[n] = level; if (level) last = n; } else { @@ -702,6 +703,68 @@ static void Copy16x8(const uint8_t* src, uint8_t* dst) { } //------------------------------------------------------------------------------ + +static void SSIMAccumulateClipped(const uint8_t* src1, int stride1, + const uint8_t* src2, int stride2, + int xo, int yo, int W, int H, + VP8DistoStats* const stats) { + const int ymin = (yo - VP8_SSIM_KERNEL < 0) ? 0 : yo - VP8_SSIM_KERNEL; + const int ymax = (yo + VP8_SSIM_KERNEL > H - 1) ? H - 1 + : yo + VP8_SSIM_KERNEL; + const int xmin = (xo - VP8_SSIM_KERNEL < 0) ? 0 : xo - VP8_SSIM_KERNEL; + const int xmax = (xo + VP8_SSIM_KERNEL > W - 1) ? W - 1 + : xo + VP8_SSIM_KERNEL; + int x, y; + src1 += ymin * stride1; + src2 += ymin * stride2; + for (y = ymin; y <= ymax; ++y, src1 += stride1, src2 += stride2) { + for (x = xmin; x <= xmax; ++x) { + const int s1 = src1[x]; + const int s2 = src2[x]; + stats->w += 1; + stats->xm += s1; + stats->ym += s2; + stats->xxm += s1 * s1; + stats->xym += s1 * s2; + stats->yym += s2 * s2; + } + } +} + +static void SSIMAccumulate(const uint8_t* src1, int stride1, + const uint8_t* src2, int stride2, + VP8DistoStats* const stats) { + int x, y; + for (y = 0; y <= 2 * VP8_SSIM_KERNEL; ++y, src1 += stride1, src2 += stride2) { + for (x = 0; x <= 2 * VP8_SSIM_KERNEL; ++x) { + const int s1 = src1[x]; + const int s2 = src2[x]; + stats->w += 1; + stats->xm += s1; + stats->ym += s2; + stats->xxm += s1 * s1; + stats->xym += s1 * s2; + stats->yym += s2 * s2; + } + } +} + +VP8SSIMAccumulateFunc VP8SSIMAccumulate; +VP8SSIMAccumulateClippedFunc VP8SSIMAccumulateClipped; + +static volatile VP8CPUInfo ssim_last_cpuinfo_used = + (VP8CPUInfo)&ssim_last_cpuinfo_used; + +WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInit(void) { + if (ssim_last_cpuinfo_used == VP8GetCPUInfo) return; + + VP8SSIMAccumulate = SSIMAccumulate; + VP8SSIMAccumulateClipped = SSIMAccumulateClipped; + + ssim_last_cpuinfo_used = VP8GetCPUInfo; +} + +//------------------------------------------------------------------------------ // Initialization // Speed-critical function pointers. We have to initialize them to the default diff --git a/src/3rdparty/libwebp/src/dsp/enc_mips_dsp_r2.c b/src/3rdparty/libwebp/src/dsp/enc_mips_dsp_r2.c index 7c814fa..7ab96f6 100644 --- a/src/3rdparty/libwebp/src/dsp/enc_mips_dsp_r2.c +++ b/src/3rdparty/libwebp/src/dsp/enc_mips_dsp_r2.c @@ -1393,8 +1393,6 @@ static void FTransformWHT(const int16_t* in, int16_t* out) { "absq_s.ph %[temp1], %[temp1] \n\t" \ "absq_s.ph %[temp2], %[temp2] \n\t" \ "absq_s.ph %[temp3], %[temp3] \n\t" \ - /* TODO(skal): add rounding ? 
shra_r.ph : shra.ph */ \ - /* for following 4 instructions */ \ "shra.ph %[temp0], %[temp0], 3 \n\t" \ "shra.ph %[temp1], %[temp1], 3 \n\t" \ "shra.ph %[temp2], %[temp2], 3 \n\t" \ diff --git a/src/3rdparty/libwebp/src/dsp/enc_neon.c b/src/3rdparty/libwebp/src/dsp/enc_neon.c index c2aef58..46f6bf9 100644 --- a/src/3rdparty/libwebp/src/dsp/enc_neon.c +++ b/src/3rdparty/libwebp/src/dsp/enc_neon.c @@ -560,21 +560,6 @@ static void FTransformWHT(const int16_t* src, int16_t* out) { // a 26ae, b 26ae // a 37bf, b 37bf // -static WEBP_INLINE uint8x8x4_t DistoTranspose4x4U8(uint8x8x4_t d4_in) { - const uint8x8x2_t d2_tmp0 = vtrn_u8(d4_in.val[0], d4_in.val[1]); - const uint8x8x2_t d2_tmp1 = vtrn_u8(d4_in.val[2], d4_in.val[3]); - const uint16x4x2_t d2_tmp2 = vtrn_u16(vreinterpret_u16_u8(d2_tmp0.val[0]), - vreinterpret_u16_u8(d2_tmp1.val[0])); - const uint16x4x2_t d2_tmp3 = vtrn_u16(vreinterpret_u16_u8(d2_tmp0.val[1]), - vreinterpret_u16_u8(d2_tmp1.val[1])); - - d4_in.val[0] = vreinterpret_u8_u16(d2_tmp2.val[0]); - d4_in.val[2] = vreinterpret_u8_u16(d2_tmp2.val[1]); - d4_in.val[1] = vreinterpret_u8_u16(d2_tmp3.val[0]); - d4_in.val[3] = vreinterpret_u8_u16(d2_tmp3.val[1]); - return d4_in; -} - static WEBP_INLINE int16x8x4_t DistoTranspose4x4S16(int16x8x4_t q4_in) { const int16x8x2_t q2_tmp0 = vtrnq_s16(q4_in.val[0], q4_in.val[1]); const int16x8x2_t q2_tmp1 = vtrnq_s16(q4_in.val[2], q4_in.val[3]); @@ -589,41 +574,40 @@ static WEBP_INLINE int16x8x4_t DistoTranspose4x4S16(int16x8x4_t q4_in) { return q4_in; } -static WEBP_INLINE int16x8x4_t DistoHorizontalPass(const uint8x8x4_t d4_in) { +static WEBP_INLINE int16x8x4_t DistoHorizontalPass(const int16x8x4_t q4_in) { // {a0, a1} = {in[0] + in[2], in[1] + in[3]} // {a3, a2} = {in[0] - in[2], in[1] - in[3]} - const int16x8_t q_a0 = vreinterpretq_s16_u16(vaddl_u8(d4_in.val[0], - d4_in.val[2])); - const int16x8_t q_a1 = vreinterpretq_s16_u16(vaddl_u8(d4_in.val[1], - d4_in.val[3])); - const int16x8_t q_a3 = vreinterpretq_s16_u16(vsubl_u8(d4_in.val[0], - d4_in.val[2])); - const int16x8_t q_a2 = vreinterpretq_s16_u16(vsubl_u8(d4_in.val[1], - d4_in.val[3])); + const int16x8_t q_a0 = vaddq_s16(q4_in.val[0], q4_in.val[2]); + const int16x8_t q_a1 = vaddq_s16(q4_in.val[1], q4_in.val[3]); + const int16x8_t q_a3 = vsubq_s16(q4_in.val[0], q4_in.val[2]); + const int16x8_t q_a2 = vsubq_s16(q4_in.val[1], q4_in.val[3]); int16x8x4_t q4_out; // tmp[0] = a0 + a1 // tmp[1] = a3 + a2 // tmp[2] = a3 - a2 // tmp[3] = a0 - a1 INIT_VECTOR4(q4_out, - vaddq_s16(q_a0, q_a1), vaddq_s16(q_a3, q_a2), - vsubq_s16(q_a3, q_a2), vsubq_s16(q_a0, q_a1)); + vabsq_s16(vaddq_s16(q_a0, q_a1)), + vabsq_s16(vaddq_s16(q_a3, q_a2)), + vabdq_s16(q_a3, q_a2), vabdq_s16(q_a0, q_a1)); return q4_out; } -static WEBP_INLINE int16x8x4_t DistoVerticalPass(int16x8x4_t q4_in) { - const int16x8_t q_a0 = vaddq_s16(q4_in.val[0], q4_in.val[2]); - const int16x8_t q_a1 = vaddq_s16(q4_in.val[1], q4_in.val[3]); - const int16x8_t q_a2 = vsubq_s16(q4_in.val[1], q4_in.val[3]); - const int16x8_t q_a3 = vsubq_s16(q4_in.val[0], q4_in.val[2]); +static WEBP_INLINE int16x8x4_t DistoVerticalPass(const uint8x8x4_t q4_in) { + const int16x8_t q_a0 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[0], + q4_in.val[2])); + const int16x8_t q_a1 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[1], + q4_in.val[3])); + const int16x8_t q_a2 = vreinterpretq_s16_u16(vsubl_u8(q4_in.val[1], + q4_in.val[3])); + const int16x8_t q_a3 = vreinterpretq_s16_u16(vsubl_u8(q4_in.val[0], + q4_in.val[2])); + int16x8x4_t q4_out; - q4_in.val[0] = vaddq_s16(q_a0, q_a1); - 
q4_in.val[1] = vaddq_s16(q_a3, q_a2); - q4_in.val[2] = vabdq_s16(q_a3, q_a2); - q4_in.val[3] = vabdq_s16(q_a0, q_a1); - q4_in.val[0] = vabsq_s16(q4_in.val[0]); - q4_in.val[1] = vabsq_s16(q4_in.val[1]); - return q4_in; + INIT_VECTOR4(q4_out, + vaddq_s16(q_a0, q_a1), vaddq_s16(q_a3, q_a2), + vsubq_s16(q_a3, q_a2), vsubq_s16(q_a0, q_a1)); + return q4_out; } static WEBP_INLINE int16x4x4_t DistoLoadW(const uint16_t* w) { @@ -667,6 +651,7 @@ static WEBP_INLINE int32x2_t DistoSum(const int16x8x4_t q4_in, // Hadamard transform // Returns the weighted sum of the absolute value of transformed coefficients. +// w[] contains a row-major 4 by 4 symmetric matrix. static int Disto4x4(const uint8_t* const a, const uint8_t* const b, const uint16_t* const w) { uint32x2_t d_in_ab_0123 = vdup_n_u32(0); @@ -691,18 +676,19 @@ static int Disto4x4(const uint8_t* const a, const uint8_t* const b, vreinterpret_u8_u32(d_in_ab_cdef)); { - // horizontal pass - const uint8x8x4_t d4_t = DistoTranspose4x4U8(d4_in); - const int16x8x4_t q4_h = DistoHorizontalPass(d4_t); + // Vertical pass first to avoid a transpose (vertical and horizontal passes + // are commutative because w/kWeightY is symmetric) and subsequent + // transpose. + const int16x8x4_t q4_v = DistoVerticalPass(d4_in); const int16x4x4_t d4_w = DistoLoadW(w); - // vertical pass - const int16x8x4_t q4_t = DistoTranspose4x4S16(q4_h); - const int16x8x4_t q4_v = DistoVerticalPass(q4_t); - int32x2_t d_sum = DistoSum(q4_v, d4_w); + // horizontal pass + const int16x8x4_t q4_t = DistoTranspose4x4S16(q4_v); + const int16x8x4_t q4_h = DistoHorizontalPass(q4_t); + int32x2_t d_sum = DistoSum(q4_h, d4_w); // abs(sum2 - sum1) >> 5 d_sum = vabs_s32(d_sum); - d_sum = vshr_n_s32(d_sum, 5); + d_sum = vshr_n_s32(d_sum, 5); return vget_lane_s32(d_sum, 0); } } diff --git a/src/3rdparty/libwebp/src/dsp/enc_sse2.c b/src/3rdparty/libwebp/src/dsp/enc_sse2.c index 2333d2b..4a2e3ce 100644 --- a/src/3rdparty/libwebp/src/dsp/enc_sse2.c +++ b/src/3rdparty/libwebp/src/dsp/enc_sse2.c @@ -17,39 +17,11 @@ #include <stdlib.h> // for abs() #include <emmintrin.h> +#include "./common_sse2.h" #include "../enc/cost.h" #include "../enc/vp8enci.h" //------------------------------------------------------------------------------ -// Quite useful macro for debugging. Left here for convenience. - -#if 0 -#include <stdio.h> -static void PrintReg(const __m128i r, const char* const name, int size) { - int n; - union { - __m128i r; - uint8_t i8[16]; - uint16_t i16[8]; - uint32_t i32[4]; - uint64_t i64[2]; - } tmp; - tmp.r = r; - fprintf(stderr, "%s\t: ", name); - if (size == 8) { - for (n = 0; n < 16; ++n) fprintf(stderr, "%.2x ", tmp.i8[n]); - } else if (size == 16) { - for (n = 0; n < 8; ++n) fprintf(stderr, "%.4x ", tmp.i16[n]); - } else if (size == 32) { - for (n = 0; n < 4; ++n) fprintf(stderr, "%.8x ", tmp.i32[n]); - } else { - for (n = 0; n < 2; ++n) fprintf(stderr, "%.16lx ", tmp.i64[n]); - } - fprintf(stderr, "\n"); -} -#endif - -//------------------------------------------------------------------------------ // Transforms (Paragraph 14.4) // Does one or two inverse transforms. @@ -131,34 +103,7 @@ static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst, const __m128i tmp3 = _mm_sub_epi16(a, d); // Transpose the two 4x4. 
- // a00 a01 a02 a03 b00 b01 b02 b03 - // a10 a11 a12 a13 b10 b11 b12 b13 - // a20 a21 a22 a23 b20 b21 b22 b23 - // a30 a31 a32 a33 b30 b31 b32 b33 - const __m128i transpose0_0 = _mm_unpacklo_epi16(tmp0, tmp1); - const __m128i transpose0_1 = _mm_unpacklo_epi16(tmp2, tmp3); - const __m128i transpose0_2 = _mm_unpackhi_epi16(tmp0, tmp1); - const __m128i transpose0_3 = _mm_unpackhi_epi16(tmp2, tmp3); - // a00 a10 a01 a11 a02 a12 a03 a13 - // a20 a30 a21 a31 a22 a32 a23 a33 - // b00 b10 b01 b11 b02 b12 b03 b13 - // b20 b30 b21 b31 b22 b32 b23 b33 - const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1); - const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3); - const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1); - const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3); - // a00 a10 a20 a30 a01 a11 a21 a31 - // b00 b10 b20 b30 b01 b11 b21 b31 - // a02 a12 a22 a32 a03 a13 a23 a33 - // b02 b12 a22 b32 b03 b13 b23 b33 - T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1); - T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1); - T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3); - T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3); - // a00 a10 a20 a30 b00 b10 b20 b30 - // a01 a11 a21 a31 b01 b11 b21 b31 - // a02 a12 a22 a32 b02 b12 b22 b32 - // a03 a13 a23 a33 b03 b13 b23 b33 + VP8Transpose_2_4x4_16b(&tmp0, &tmp1, &tmp2, &tmp3, &T0, &T1, &T2, &T3); } // Horizontal pass and subsequent transpose. @@ -193,34 +138,8 @@ static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst, const __m128i shifted3 = _mm_srai_epi16(tmp3, 3); // Transpose the two 4x4. - // a00 a01 a02 a03 b00 b01 b02 b03 - // a10 a11 a12 a13 b10 b11 b12 b13 - // a20 a21 a22 a23 b20 b21 b22 b23 - // a30 a31 a32 a33 b30 b31 b32 b33 - const __m128i transpose0_0 = _mm_unpacklo_epi16(shifted0, shifted1); - const __m128i transpose0_1 = _mm_unpacklo_epi16(shifted2, shifted3); - const __m128i transpose0_2 = _mm_unpackhi_epi16(shifted0, shifted1); - const __m128i transpose0_3 = _mm_unpackhi_epi16(shifted2, shifted3); - // a00 a10 a01 a11 a02 a12 a03 a13 - // a20 a30 a21 a31 a22 a32 a23 a33 - // b00 b10 b01 b11 b02 b12 b03 b13 - // b20 b30 b21 b31 b22 b32 b23 b33 - const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1); - const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3); - const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1); - const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3); - // a00 a10 a20 a30 a01 a11 a21 a31 - // b00 b10 b20 b30 b01 b11 b21 b31 - // a02 a12 a22 a32 a03 a13 a23 a33 - // b02 b12 a22 b32 b03 b13 b23 b33 - T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1); - T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1); - T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3); - T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3); - // a00 a10 a20 a30 b00 b10 b20 b30 - // a01 a11 a21 a31 b01 b11 b21 b31 - // a02 a12 a22 a32 b02 b12 b22 b32 - // a03 a13 a23 a33 b03 b13 b23 b33 + VP8Transpose_2_4x4_16b(&shifted0, &shifted1, &shifted2, &shifted3, &T0, &T1, + &T2, &T3); } // Add inverse transform to 'ref' and store. @@ -373,42 +292,42 @@ static void FTransformPass2(const __m128i* const v01, const __m128i* const v32, static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) { const __m128i zero = _mm_setzero_si128(); - - // Load src and convert to 16b. + // Load src. 
const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]); const __m128i src1 = _mm_loadl_epi64((const __m128i*)&src[1 * BPS]); const __m128i src2 = _mm_loadl_epi64((const __m128i*)&src[2 * BPS]); const __m128i src3 = _mm_loadl_epi64((const __m128i*)&src[3 * BPS]); - const __m128i src_0 = _mm_unpacklo_epi8(src0, zero); - const __m128i src_1 = _mm_unpacklo_epi8(src1, zero); - const __m128i src_2 = _mm_unpacklo_epi8(src2, zero); - const __m128i src_3 = _mm_unpacklo_epi8(src3, zero); - // Load ref and convert to 16b. + // 00 01 02 03 * + // 10 11 12 13 * + // 20 21 22 23 * + // 30 31 32 33 * + // Shuffle. + const __m128i src_0 = _mm_unpacklo_epi16(src0, src1); + const __m128i src_1 = _mm_unpacklo_epi16(src2, src3); + // 00 01 10 11 02 03 12 13 * * ... + // 20 21 30 31 22 22 32 33 * * ... + + // Load ref. const __m128i ref0 = _mm_loadl_epi64((const __m128i*)&ref[0 * BPS]); const __m128i ref1 = _mm_loadl_epi64((const __m128i*)&ref[1 * BPS]); const __m128i ref2 = _mm_loadl_epi64((const __m128i*)&ref[2 * BPS]); const __m128i ref3 = _mm_loadl_epi64((const __m128i*)&ref[3 * BPS]); - const __m128i ref_0 = _mm_unpacklo_epi8(ref0, zero); - const __m128i ref_1 = _mm_unpacklo_epi8(ref1, zero); - const __m128i ref_2 = _mm_unpacklo_epi8(ref2, zero); - const __m128i ref_3 = _mm_unpacklo_epi8(ref3, zero); - // Compute difference. -> 00 01 02 03 00 00 00 00 - const __m128i diff0 = _mm_sub_epi16(src_0, ref_0); - const __m128i diff1 = _mm_sub_epi16(src_1, ref_1); - const __m128i diff2 = _mm_sub_epi16(src_2, ref_2); - const __m128i diff3 = _mm_sub_epi16(src_3, ref_3); - - // Unpack and shuffle - // 00 01 02 03 0 0 0 0 - // 10 11 12 13 0 0 0 0 - // 20 21 22 23 0 0 0 0 - // 30 31 32 33 0 0 0 0 - const __m128i shuf01 = _mm_unpacklo_epi32(diff0, diff1); - const __m128i shuf23 = _mm_unpacklo_epi32(diff2, diff3); + const __m128i ref_0 = _mm_unpacklo_epi16(ref0, ref1); + const __m128i ref_1 = _mm_unpacklo_epi16(ref2, ref3); + + // Convert both to 16 bit. + const __m128i src_0_16b = _mm_unpacklo_epi8(src_0, zero); + const __m128i src_1_16b = _mm_unpacklo_epi8(src_1, zero); + const __m128i ref_0_16b = _mm_unpacklo_epi8(ref_0, zero); + const __m128i ref_1_16b = _mm_unpacklo_epi8(ref_1, zero); + + // Compute the difference. + const __m128i row01 = _mm_sub_epi16(src_0_16b, ref_0_16b); + const __m128i row23 = _mm_sub_epi16(src_1_16b, ref_1_16b); __m128i v01, v32; // First pass - FTransformPass1(&shuf01, &shuf23, &v01, &v32); + FTransformPass1(&row01, &row23, &v01, &v32); // Second pass FTransformPass2(&v01, &v32, out); @@ -463,8 +382,7 @@ static void FTransform2(const uint8_t* src, const uint8_t* ref, int16_t* out) { } static void FTransformWHTRow(const int16_t* const in, __m128i* const out) { - const __m128i kMult1 = _mm_set_epi16(0, 0, 0, 0, 1, 1, 1, 1); - const __m128i kMult2 = _mm_set_epi16(0, 0, 0, 0, -1, 1, -1, 1); + const __m128i kMult = _mm_set_epi16(-1, 1, -1, 1, 1, 1, 1, 1); const __m128i src0 = _mm_loadl_epi64((__m128i*)&in[0 * 16]); const __m128i src1 = _mm_loadl_epi64((__m128i*)&in[1 * 16]); const __m128i src2 = _mm_loadl_epi64((__m128i*)&in[2 * 16]); @@ -473,33 +391,38 @@ static void FTransformWHTRow(const int16_t* const in, __m128i* const out) { const __m128i A23 = _mm_unpacklo_epi16(src2, src3); // A2 A3 | ... const __m128i B0 = _mm_adds_epi16(A01, A23); // a0 | a1 | ... const __m128i B1 = _mm_subs_epi16(A01, A23); // a3 | a2 | ... 
- const __m128i C0 = _mm_unpacklo_epi32(B0, B1); // a0 | a1 | a3 | a2 - const __m128i C1 = _mm_unpacklo_epi32(B1, B0); // a3 | a2 | a0 | a1 - const __m128i D0 = _mm_madd_epi16(C0, kMult1); // out0, out1 - const __m128i D1 = _mm_madd_epi16(C1, kMult2); // out2, out3 - *out = _mm_unpacklo_epi64(D0, D1); + const __m128i C0 = _mm_unpacklo_epi32(B0, B1); // a0 | a1 | a3 | a2 | ... + const __m128i C1 = _mm_unpacklo_epi32(B1, B0); // a3 | a2 | a0 | a1 | ... + const __m128i D = _mm_unpacklo_epi64(C0, C1); // a0 a1 a3 a2 a3 a2 a0 a1 + *out = _mm_madd_epi16(D, kMult); } static void FTransformWHT(const int16_t* in, int16_t* out) { + // Input is 12b signed. __m128i row0, row1, row2, row3; + // Rows are 14b signed. FTransformWHTRow(in + 0 * 64, &row0); FTransformWHTRow(in + 1 * 64, &row1); FTransformWHTRow(in + 2 * 64, &row2); FTransformWHTRow(in + 3 * 64, &row3); { + // The a* are 15b signed. const __m128i a0 = _mm_add_epi32(row0, row2); const __m128i a1 = _mm_add_epi32(row1, row3); const __m128i a2 = _mm_sub_epi32(row1, row3); const __m128i a3 = _mm_sub_epi32(row0, row2); - const __m128i b0 = _mm_srai_epi32(_mm_add_epi32(a0, a1), 1); - const __m128i b1 = _mm_srai_epi32(_mm_add_epi32(a3, a2), 1); - const __m128i b2 = _mm_srai_epi32(_mm_sub_epi32(a3, a2), 1); - const __m128i b3 = _mm_srai_epi32(_mm_sub_epi32(a0, a1), 1); - const __m128i out0 = _mm_packs_epi32(b0, b1); - const __m128i out1 = _mm_packs_epi32(b2, b3); - _mm_storeu_si128((__m128i*)&out[0], out0); - _mm_storeu_si128((__m128i*)&out[8], out1); + const __m128i a0a3 = _mm_packs_epi32(a0, a3); + const __m128i a1a2 = _mm_packs_epi32(a1, a2); + + // The b* are 16b signed. + const __m128i b0b1 = _mm_add_epi16(a0a3, a1a2); + const __m128i b3b2 = _mm_sub_epi16(a0a3, a1a2); + const __m128i tmp_b2b3 = _mm_unpackhi_epi64(b3b2, b3b2); + const __m128i b2b3 = _mm_unpacklo_epi64(tmp_b2b3, b3b2); + + _mm_storeu_si128((__m128i*)&out[0], _mm_srai_epi16(b0b1, 1)); + _mm_storeu_si128((__m128i*)&out[8], _mm_srai_epi16(b2b3, 1)); } } @@ -692,12 +615,10 @@ static WEBP_INLINE void TrueMotion(uint8_t* dst, const uint8_t* left, static WEBP_INLINE void DC8uv(uint8_t* dst, const uint8_t* left, const uint8_t* top) { - const __m128i zero = _mm_setzero_si128(); const __m128i top_values = _mm_loadl_epi64((const __m128i*)top); const __m128i left_values = _mm_loadl_epi64((const __m128i*)left); - const __m128i sum_top = _mm_sad_epu8(top_values, zero); - const __m128i sum_left = _mm_sad_epu8(left_values, zero); - const int DC = _mm_cvtsi128_si32(sum_top) + _mm_cvtsi128_si32(sum_left) + 8; + const __m128i combined = _mm_unpacklo_epi64(top_values, left_values); + const int DC = VP8HorizontalAdd8b(&combined) + 8; Put8x8uv(DC >> 4, dst); } @@ -735,27 +656,16 @@ static WEBP_INLINE void DC8uvMode(uint8_t* dst, const uint8_t* left, static WEBP_INLINE void DC16(uint8_t* dst, const uint8_t* left, const uint8_t* top) { - const __m128i zero = _mm_setzero_si128(); const __m128i top_row = _mm_load_si128((const __m128i*)top); const __m128i left_row = _mm_load_si128((const __m128i*)left); - const __m128i sad8x2 = _mm_sad_epu8(top_row, zero); - // sum the two sads: sad8x2[0:1] + sad8x2[8:9] - const __m128i sum_top = _mm_add_epi16(sad8x2, _mm_shuffle_epi32(sad8x2, 2)); - const __m128i sad8x2_left = _mm_sad_epu8(left_row, zero); - // sum the two sads: sad8x2[0:1] + sad8x2[8:9] - const __m128i sum_left = - _mm_add_epi16(sad8x2_left, _mm_shuffle_epi32(sad8x2_left, 2)); - const int DC = _mm_cvtsi128_si32(sum_top) + _mm_cvtsi128_si32(sum_left) + 16; + const int DC = + VP8HorizontalAdd8b(&top_row) + 
VP8HorizontalAdd8b(&left_row) + 16; Put16(DC >> 5, dst); } static WEBP_INLINE void DC16NoLeft(uint8_t* dst, const uint8_t* top) { - const __m128i zero = _mm_setzero_si128(); const __m128i top_row = _mm_load_si128((const __m128i*)top); - const __m128i sad8x2 = _mm_sad_epu8(top_row, zero); - // sum the two sads: sad8x2[0:1] + sad8x2[8:9] - const __m128i sum = _mm_add_epi16(sad8x2, _mm_shuffle_epi32(sad8x2, 2)); - const int DC = _mm_cvtsi128_si32(sum) + 8; + const int DC = VP8HorizontalAdd8b(&top_row) + 8; Put16(DC >> 4, dst); } @@ -1142,15 +1052,15 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) { // reconstructed samples. // Hadamard transform -// Returns the difference between the weighted sum of the absolute value of -// transformed coefficients. +// Returns the weighted sum of the absolute value of transformed coefficients. +// w[] contains a row-major 4 by 4 symmetric matrix. static int TTransform(const uint8_t* inA, const uint8_t* inB, const uint16_t* const w) { int32_t sum[4]; __m128i tmp_0, tmp_1, tmp_2, tmp_3; const __m128i zero = _mm_setzero_si128(); - // Load, combine and transpose inputs. + // Load and combine inputs. { const __m128i inA_0 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 0]); const __m128i inA_1 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 1]); @@ -1162,37 +1072,22 @@ static int TTransform(const uint8_t* inA, const uint8_t* inB, const __m128i inB_3 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 3]); // Combine inA and inB (we'll do two transforms in parallel). - const __m128i inAB_0 = _mm_unpacklo_epi8(inA_0, inB_0); - const __m128i inAB_1 = _mm_unpacklo_epi8(inA_1, inB_1); - const __m128i inAB_2 = _mm_unpacklo_epi8(inA_2, inB_2); - const __m128i inAB_3 = _mm_unpacklo_epi8(inA_3, inB_3); - // a00 b00 a01 b01 a02 b03 a03 b03 0 0 0 0 0 0 0 0 - // a10 b10 a11 b11 a12 b12 a13 b13 0 0 0 0 0 0 0 0 - // a20 b20 a21 b21 a22 b22 a23 b23 0 0 0 0 0 0 0 0 - // a30 b30 a31 b31 a32 b32 a33 b33 0 0 0 0 0 0 0 0 - - // Transpose the two 4x4, discarding the filling zeroes. - const __m128i transpose0_0 = _mm_unpacklo_epi8(inAB_0, inAB_2); - const __m128i transpose0_1 = _mm_unpacklo_epi8(inAB_1, inAB_3); - // a00 a20 b00 b20 a01 a21 b01 b21 a02 a22 b02 b22 a03 a23 b03 b23 - // a10 a30 b10 b30 a11 a31 b11 b31 a12 a32 b12 b32 a13 a33 b13 b33 - const __m128i transpose1_0 = _mm_unpacklo_epi8(transpose0_0, transpose0_1); - const __m128i transpose1_1 = _mm_unpackhi_epi8(transpose0_0, transpose0_1); - // a00 a10 a20 a30 b00 b10 b20 b30 a01 a11 a21 a31 b01 b11 b21 b31 - // a02 a12 a22 a32 b02 b12 b22 b32 a03 a13 a23 a33 b03 b13 b23 b33 - - // Convert to 16b. 
- tmp_0 = _mm_unpacklo_epi8(transpose1_0, zero); - tmp_1 = _mm_unpackhi_epi8(transpose1_0, zero); - tmp_2 = _mm_unpacklo_epi8(transpose1_1, zero); - tmp_3 = _mm_unpackhi_epi8(transpose1_1, zero); - // a00 a10 a20 a30 b00 b10 b20 b30 - // a01 a11 a21 a31 b01 b11 b21 b31 - // a02 a12 a22 a32 b02 b12 b22 b32 - // a03 a13 a23 a33 b03 b13 b23 b33 + const __m128i inAB_0 = _mm_unpacklo_epi32(inA_0, inB_0); + const __m128i inAB_1 = _mm_unpacklo_epi32(inA_1, inB_1); + const __m128i inAB_2 = _mm_unpacklo_epi32(inA_2, inB_2); + const __m128i inAB_3 = _mm_unpacklo_epi32(inA_3, inB_3); + tmp_0 = _mm_unpacklo_epi8(inAB_0, zero); + tmp_1 = _mm_unpacklo_epi8(inAB_1, zero); + tmp_2 = _mm_unpacklo_epi8(inAB_2, zero); + tmp_3 = _mm_unpacklo_epi8(inAB_3, zero); + // a00 a01 a02 a03 b00 b01 b02 b03 + // a10 a11 a12 a13 b10 b11 b12 b13 + // a20 a21 a22 a23 b20 b21 b22 b23 + // a30 a31 a32 a33 b30 b31 b32 b33 } - // Horizontal pass and subsequent transpose. + // Vertical pass first to avoid a transpose (vertical and horizontal passes + // are commutative because w/kWeightY is symmetric) and subsequent transpose. { // Calculate a and b (two 4x4 at once). const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2); @@ -1209,33 +1104,10 @@ static int TTransform(const uint8_t* inA, const uint8_t* inB, // a30 a31 a32 a33 b30 b31 b32 b33 // Transpose the two 4x4. - const __m128i transpose0_0 = _mm_unpacklo_epi16(b0, b1); - const __m128i transpose0_1 = _mm_unpacklo_epi16(b2, b3); - const __m128i transpose0_2 = _mm_unpackhi_epi16(b0, b1); - const __m128i transpose0_3 = _mm_unpackhi_epi16(b2, b3); - // a00 a10 a01 a11 a02 a12 a03 a13 - // a20 a30 a21 a31 a22 a32 a23 a33 - // b00 b10 b01 b11 b02 b12 b03 b13 - // b20 b30 b21 b31 b22 b32 b23 b33 - const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1); - const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3); - const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1); - const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3); - // a00 a10 a20 a30 a01 a11 a21 a31 - // b00 b10 b20 b30 b01 b11 b21 b31 - // a02 a12 a22 a32 a03 a13 a23 a33 - // b02 b12 a22 b32 b03 b13 b23 b33 - tmp_0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1); - tmp_1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1); - tmp_2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3); - tmp_3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3); - // a00 a10 a20 a30 b00 b10 b20 b30 - // a01 a11 a21 a31 b01 b11 b21 b31 - // a02 a12 a22 a32 b02 b12 b22 b32 - // a03 a13 a23 a33 b03 b13 b23 b33 + VP8Transpose_2_4x4_16b(&b0, &b1, &b2, &b3, &tmp_0, &tmp_1, &tmp_2, &tmp_3); } - // Vertical pass and difference of weighted sums. + // Horizontal pass and difference of weighted sums. { // Load all inputs. const __m128i w_0 = _mm_loadu_si128((const __m128i*)&w[0]); diff --git a/src/3rdparty/libwebp/src/dsp/enc_sse41.c b/src/3rdparty/libwebp/src/dsp/enc_sse41.c index 65c01ae..a178390 100644 --- a/src/3rdparty/libwebp/src/dsp/enc_sse41.c +++ b/src/3rdparty/libwebp/src/dsp/enc_sse41.c @@ -17,6 +17,7 @@ #include <smmintrin.h> #include <stdlib.h> // for abs() +#include "./common_sse2.h" #include "../enc/vp8enci.h" //------------------------------------------------------------------------------ @@ -67,55 +68,45 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred, // reconstructed samples. // Hadamard transform -// Returns the difference between the weighted sum of the absolute value of -// transformed coefficients. 
+// Returns the weighted sum of the absolute value of transformed coefficients. +// w[] contains a row-major 4 by 4 symmetric matrix. static int TTransform(const uint8_t* inA, const uint8_t* inB, const uint16_t* const w) { + int32_t sum[4]; __m128i tmp_0, tmp_1, tmp_2, tmp_3; - // Load, combine and transpose inputs. + // Load and combine inputs. { - const __m128i inA_0 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 0]); - const __m128i inA_1 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 1]); - const __m128i inA_2 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 2]); + const __m128i inA_0 = _mm_loadu_si128((const __m128i*)&inA[BPS * 0]); + const __m128i inA_1 = _mm_loadu_si128((const __m128i*)&inA[BPS * 1]); + const __m128i inA_2 = _mm_loadu_si128((const __m128i*)&inA[BPS * 2]); + // In SSE4.1, with gcc 4.8 at least (maybe other versions), + // _mm_loadu_si128 is faster than _mm_loadl_epi64. But for the last lump + // of inA and inB, _mm_loadl_epi64 is still used not to have an out of + // bound read. const __m128i inA_3 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 3]); - const __m128i inB_0 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 0]); - const __m128i inB_1 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 1]); - const __m128i inB_2 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 2]); + const __m128i inB_0 = _mm_loadu_si128((const __m128i*)&inB[BPS * 0]); + const __m128i inB_1 = _mm_loadu_si128((const __m128i*)&inB[BPS * 1]); + const __m128i inB_2 = _mm_loadu_si128((const __m128i*)&inB[BPS * 2]); const __m128i inB_3 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 3]); // Combine inA and inB (we'll do two transforms in parallel). - const __m128i inAB_0 = _mm_unpacklo_epi8(inA_0, inB_0); - const __m128i inAB_1 = _mm_unpacklo_epi8(inA_1, inB_1); - const __m128i inAB_2 = _mm_unpacklo_epi8(inA_2, inB_2); - const __m128i inAB_3 = _mm_unpacklo_epi8(inA_3, inB_3); - // a00 b00 a01 b01 a02 b03 a03 b03 0 0 0 0 0 0 0 0 - // a10 b10 a11 b11 a12 b12 a13 b13 0 0 0 0 0 0 0 0 - // a20 b20 a21 b21 a22 b22 a23 b23 0 0 0 0 0 0 0 0 - // a30 b30 a31 b31 a32 b32 a33 b33 0 0 0 0 0 0 0 0 - - // Transpose the two 4x4, discarding the filling zeroes. - const __m128i transpose0_0 = _mm_unpacklo_epi8(inAB_0, inAB_2); - const __m128i transpose0_1 = _mm_unpacklo_epi8(inAB_1, inAB_3); - // a00 a20 b00 b20 a01 a21 b01 b21 a02 a22 b02 b22 a03 a23 b03 b23 - // a10 a30 b10 b30 a11 a31 b11 b31 a12 a32 b12 b32 a13 a33 b13 b33 - const __m128i transpose1_0 = _mm_unpacklo_epi8(transpose0_0, transpose0_1); - const __m128i transpose1_1 = _mm_unpackhi_epi8(transpose0_0, transpose0_1); - // a00 a10 a20 a30 b00 b10 b20 b30 a01 a11 a21 a31 b01 b11 b21 b31 - // a02 a12 a22 a32 b02 b12 b22 b32 a03 a13 a23 a33 b03 b13 b23 b33 - - // Convert to 16b. 
- tmp_0 = _mm_cvtepu8_epi16(transpose1_0); - tmp_1 = _mm_cvtepu8_epi16(_mm_srli_si128(transpose1_0, 8)); - tmp_2 = _mm_cvtepu8_epi16(transpose1_1); - tmp_3 = _mm_cvtepu8_epi16(_mm_srli_si128(transpose1_1, 8)); - // a00 a10 a20 a30 b00 b10 b20 b30 - // a01 a11 a21 a31 b01 b11 b21 b31 - // a02 a12 a22 a32 b02 b12 b22 b32 - // a03 a13 a23 a33 b03 b13 b23 b33 + const __m128i inAB_0 = _mm_unpacklo_epi32(inA_0, inB_0); + const __m128i inAB_1 = _mm_unpacklo_epi32(inA_1, inB_1); + const __m128i inAB_2 = _mm_unpacklo_epi32(inA_2, inB_2); + const __m128i inAB_3 = _mm_unpacklo_epi32(inA_3, inB_3); + tmp_0 = _mm_cvtepu8_epi16(inAB_0); + tmp_1 = _mm_cvtepu8_epi16(inAB_1); + tmp_2 = _mm_cvtepu8_epi16(inAB_2); + tmp_3 = _mm_cvtepu8_epi16(inAB_3); + // a00 a01 a02 a03 b00 b01 b02 b03 + // a10 a11 a12 a13 b10 b11 b12 b13 + // a20 a21 a22 a23 b20 b21 b22 b23 + // a30 a31 a32 a33 b30 b31 b32 b33 } - // Horizontal pass and subsequent transpose. + // Vertical pass first to avoid a transpose (vertical and horizontal passes + // are commutative because w/kWeightY is symmetric) and subsequent transpose. { // Calculate a and b (two 4x4 at once). const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2); @@ -132,33 +123,10 @@ static int TTransform(const uint8_t* inA, const uint8_t* inB, // a30 a31 a32 a33 b30 b31 b32 b33 // Transpose the two 4x4. - const __m128i transpose0_0 = _mm_unpacklo_epi16(b0, b1); - const __m128i transpose0_1 = _mm_unpacklo_epi16(b2, b3); - const __m128i transpose0_2 = _mm_unpackhi_epi16(b0, b1); - const __m128i transpose0_3 = _mm_unpackhi_epi16(b2, b3); - // a00 a10 a01 a11 a02 a12 a03 a13 - // a20 a30 a21 a31 a22 a32 a23 a33 - // b00 b10 b01 b11 b02 b12 b03 b13 - // b20 b30 b21 b31 b22 b32 b23 b33 - const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1); - const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3); - const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1); - const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3); - // a00 a10 a20 a30 a01 a11 a21 a31 - // b00 b10 b20 b30 b01 b11 b21 b31 - // a02 a12 a22 a32 a03 a13 a23 a33 - // b02 b12 a22 b32 b03 b13 b23 b33 - tmp_0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1); - tmp_1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1); - tmp_2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3); - tmp_3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3); - // a00 a10 a20 a30 b00 b10 b20 b30 - // a01 a11 a21 a31 b01 b11 b21 b31 - // a02 a12 a22 a32 b02 b12 b22 b32 - // a03 a13 a23 a33 b03 b13 b23 b33 + VP8Transpose_2_4x4_16b(&b0, &b1, &b2, &b3, &tmp_0, &tmp_1, &tmp_2, &tmp_3); } - // Vertical pass and difference of weighted sums. + // Horizontal pass and difference of weighted sums. { // Load all inputs. 
const __m128i w_0 = _mm_loadu_si128((const __m128i*)&w[0]); @@ -195,11 +163,9 @@ static int TTransform(const uint8_t* inA, const uint8_t* inB, // difference of weighted sums A_b2 = _mm_sub_epi32(A_b0, B_b0); - // cascading summation of the differences - B_b0 = _mm_hadd_epi32(A_b2, A_b2); - B_b2 = _mm_hadd_epi32(B_b0, B_b0); - return _mm_cvtsi128_si32(B_b2); + _mm_storeu_si128((__m128i*)&sum[0], A_b2); } + return sum[0] + sum[1] + sum[2] + sum[3]; } static int Disto4x4(const uint8_t* const a, const uint8_t* const b, diff --git a/src/3rdparty/libwebp/src/dsp/filters.c b/src/3rdparty/libwebp/src/dsp/filters.c index 5c30f2e..9f04faf 100644 --- a/src/3rdparty/libwebp/src/dsp/filters.c +++ b/src/3rdparty/libwebp/src/dsp/filters.c @@ -184,19 +184,40 @@ static void GradientFilter(const uint8_t* data, int width, int height, //------------------------------------------------------------------------------ -static void VerticalUnfilter(int width, int height, int stride, int row, - int num_rows, uint8_t* data) { - DoVerticalFilter(data, width, height, stride, row, num_rows, 1, data); +static void HorizontalUnfilter(const uint8_t* prev, const uint8_t* in, + uint8_t* out, int width) { + uint8_t pred = (prev == NULL) ? 0 : prev[0]; + int i; + for (i = 0; i < width; ++i) { + out[i] = pred + in[i]; + pred = out[i]; + } } -static void HorizontalUnfilter(int width, int height, int stride, int row, - int num_rows, uint8_t* data) { - DoHorizontalFilter(data, width, height, stride, row, num_rows, 1, data); +static void VerticalUnfilter(const uint8_t* prev, const uint8_t* in, + uint8_t* out, int width) { + if (prev == NULL) { + HorizontalUnfilter(NULL, in, out, width); + } else { + int i; + for (i = 0; i < width; ++i) out[i] = prev[i] + in[i]; + } } -static void GradientUnfilter(int width, int height, int stride, int row, - int num_rows, uint8_t* data) { - DoGradientFilter(data, width, height, stride, row, num_rows, 1, data); +static void GradientUnfilter(const uint8_t* prev, const uint8_t* in, + uint8_t* out, int width) { + if (prev == NULL) { + HorizontalUnfilter(NULL, in, out, width); + } else { + uint8_t top = prev[0], top_left = top, left = top; + int i; + for (i = 0; i < width; ++i) { + top = prev[i]; // need to read this first, in case prev==out + left = in[i] + GradientPredictor(left, top, top_left); + top_left = top; + out[i] = left; + } + } } //------------------------------------------------------------------------------ diff --git a/src/3rdparty/libwebp/src/dsp/filters_mips_dsp_r2.c b/src/3rdparty/libwebp/src/dsp/filters_mips_dsp_r2.c index 8134af5..1d82e3c 100644 --- a/src/3rdparty/libwebp/src/dsp/filters_mips_dsp_r2.c +++ b/src/3rdparty/libwebp/src/dsp/filters_mips_dsp_r2.c @@ -33,10 +33,6 @@ assert(row >= 0 && num_rows > 0 && row + num_rows <= height); \ (void)height; // Silence unused warning. 
-// if INVERSE -// preds == &dst[-1] == &src[-1] -// else -// preds == &src[-1] != &dst[-1] #define DO_PREDICT_LINE(SRC, DST, LENGTH, INVERSE) do { \ const uint8_t* psrc = (uint8_t*)(SRC); \ uint8_t* pdst = (uint8_t*)(DST); \ @@ -45,27 +41,28 @@ __asm__ volatile ( \ ".set push \n\t" \ ".set noreorder \n\t" \ - "srl %[temp0], %[length], 0x2 \n\t" \ + "srl %[temp0], %[length], 2 \n\t" \ "beqz %[temp0], 4f \n\t" \ - " andi %[temp6], %[length], 0x3 \n\t" \ + " andi %[temp6], %[length], 3 \n\t" \ ".if " #INVERSE " \n\t" \ - "lbu %[temp1], -1(%[src]) \n\t" \ "1: \n\t" \ + "lbu %[temp1], -1(%[dst]) \n\t" \ "lbu %[temp2], 0(%[src]) \n\t" \ "lbu %[temp3], 1(%[src]) \n\t" \ "lbu %[temp4], 2(%[src]) \n\t" \ "lbu %[temp5], 3(%[src]) \n\t" \ + "addu %[temp1], %[temp1], %[temp2] \n\t" \ + "addu %[temp2], %[temp1], %[temp3] \n\t" \ + "addu %[temp3], %[temp2], %[temp4] \n\t" \ + "addu %[temp4], %[temp3], %[temp5] \n\t" \ + "sb %[temp1], 0(%[dst]) \n\t" \ + "sb %[temp2], 1(%[dst]) \n\t" \ + "sb %[temp3], 2(%[dst]) \n\t" \ + "sb %[temp4], 3(%[dst]) \n\t" \ "addiu %[src], %[src], 4 \n\t" \ "addiu %[temp0], %[temp0], -1 \n\t" \ - "addu %[temp2], %[temp2], %[temp1] \n\t" \ - "addu %[temp3], %[temp3], %[temp2] \n\t" \ - "addu %[temp4], %[temp4], %[temp3] \n\t" \ - "addu %[temp1], %[temp5], %[temp4] \n\t" \ - "sb %[temp2], -4(%[src]) \n\t" \ - "sb %[temp3], -3(%[src]) \n\t" \ - "sb %[temp4], -2(%[src]) \n\t" \ "bnez %[temp0], 1b \n\t" \ - " sb %[temp1], -1(%[src]) \n\t" \ + " addiu %[dst], %[dst], 4 \n\t" \ ".else \n\t" \ "1: \n\t" \ "ulw %[temp1], -1(%[src]) \n\t" \ @@ -81,16 +78,16 @@ "beqz %[temp6], 3f \n\t" \ " nop \n\t" \ "2: \n\t" \ - "lbu %[temp1], -1(%[src]) \n\t" \ "lbu %[temp2], 0(%[src]) \n\t" \ - "addiu %[src], %[src], 1 \n\t" \ ".if " #INVERSE " \n\t" \ + "lbu %[temp1], -1(%[dst]) \n\t" \ "addu %[temp3], %[temp1], %[temp2] \n\t" \ - "sb %[temp3], -1(%[src]) \n\t" \ ".else \n\t" \ + "lbu %[temp1], -1(%[src]) \n\t" \ "subu %[temp3], %[temp1], %[temp2] \n\t" \ - "sb %[temp3], 0(%[dst]) \n\t" \ ".endif \n\t" \ + "addiu %[src], %[src], 1 \n\t" \ + "sb %[temp3], 0(%[dst]) \n\t" \ "addiu %[temp6], %[temp6], -1 \n\t" \ "bnez %[temp6], 2b \n\t" \ " addiu %[dst], %[dst], 1 \n\t" \ @@ -105,12 +102,8 @@ } while (0) static WEBP_INLINE void PredictLine(const uint8_t* src, uint8_t* dst, - int length, int inverse) { - if (inverse) { - DO_PREDICT_LINE(src, dst, length, 1); - } else { - DO_PREDICT_LINE(src, dst, length, 0); - } + int length) { + DO_PREDICT_LINE(src, dst, length, 0); } #define DO_PREDICT_LINE_VERTICAL(SRC, PRED, DST, LENGTH, INVERSE) do { \ @@ -172,16 +165,12 @@ static WEBP_INLINE void PredictLine(const uint8_t* src, uint8_t* dst, ); \ } while (0) -#define PREDICT_LINE_ONE_PASS(SRC, PRED, DST, INVERSE) do { \ +#define PREDICT_LINE_ONE_PASS(SRC, PRED, DST) do { \ int temp1, temp2, temp3; \ __asm__ volatile ( \ "lbu %[temp1], 0(%[src]) \n\t" \ "lbu %[temp2], 0(%[pred]) \n\t" \ - ".if " #INVERSE " \n\t" \ - "addu %[temp3], %[temp1], %[temp2] \n\t" \ - ".else \n\t" \ "subu %[temp3], %[temp1], %[temp2] \n\t" \ - ".endif \n\t" \ "sb %[temp3], 0(%[dst]) \n\t" \ : [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), [temp3]"=&r"(temp3) \ : [pred]"r"((PRED)), [dst]"r"((DST)), [src]"r"((SRC)) \ @@ -192,10 +181,10 @@ static WEBP_INLINE void PredictLine(const uint8_t* src, uint8_t* dst, //------------------------------------------------------------------------------ // Horizontal filter. 
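In the rewritten INVERSE branch above, the running predictor is now read back from dst (the already reconstructed output) rather than from src. Stripped of the 4-pixel unrolling, the recurrence the assembly implements is the plain left unfilter sketched below; dst[-1] is assumed to hold the seed pixel written by the caller:

static void PredictLineInverse_C(const uint8_t* src, uint8_t* dst, int length) {
  int i;
  for (i = 0; i < length; ++i) {
    dst[i] = (uint8_t)(dst[i - 1] + src[i]);   // dst[-1] is the seed pixel (assumed valid)
  }
}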
-#define FILTER_LINE_BY_LINE(INVERSE) do { \ +#define FILTER_LINE_BY_LINE do { \ while (row < last_row) { \ - PREDICT_LINE_ONE_PASS(in, preds - stride, out, INVERSE); \ - DO_PREDICT_LINE(in + 1, out + 1, width - 1, INVERSE); \ + PREDICT_LINE_ONE_PASS(in, preds - stride, out); \ + DO_PREDICT_LINE(in + 1, out + 1, width - 1, 0); \ ++row; \ preds += stride; \ in += stride; \ @@ -206,19 +195,19 @@ static WEBP_INLINE void PredictLine(const uint8_t* src, uint8_t* dst, static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in, int width, int height, int stride, int row, int num_rows, - int inverse, uint8_t* out) { + uint8_t* out) { const uint8_t* preds; const size_t start_offset = row * stride; const int last_row = row + num_rows; SANITY_CHECK(in, out); in += start_offset; out += start_offset; - preds = inverse ? out : in; + preds = in; if (row == 0) { // Leftmost pixel is the same as input for topmost scanline. out[0] = in[0]; - PredictLine(in + 1, out + 1, width - 1, inverse); + PredictLine(in + 1, out + 1, width - 1); row = 1; preds += stride; in += stride; @@ -226,31 +215,21 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in, } // Filter line-by-line. - if (inverse) { - FILTER_LINE_BY_LINE(1); - } else { - FILTER_LINE_BY_LINE(0); - } + FILTER_LINE_BY_LINE; } - #undef FILTER_LINE_BY_LINE static void HorizontalFilter(const uint8_t* data, int width, int height, int stride, uint8_t* filtered_data) { - DoHorizontalFilter(data, width, height, stride, 0, height, 0, filtered_data); -} - -static void HorizontalUnfilter(int width, int height, int stride, int row, - int num_rows, uint8_t* data) { - DoHorizontalFilter(data, width, height, stride, row, num_rows, 1, data); + DoHorizontalFilter(data, width, height, stride, 0, height, filtered_data); } //------------------------------------------------------------------------------ // Vertical filter. -#define FILTER_LINE_BY_LINE(INVERSE) do { \ +#define FILTER_LINE_BY_LINE do { \ while (row < last_row) { \ - DO_PREDICT_LINE_VERTICAL(in, preds, out, width, INVERSE); \ + DO_PREDICT_LINE_VERTICAL(in, preds, out, width, 0); \ ++row; \ preds += stride; \ in += stride; \ @@ -260,21 +239,20 @@ static void HorizontalUnfilter(int width, int height, int stride, int row, static WEBP_INLINE void DoVerticalFilter(const uint8_t* in, int width, int height, int stride, - int row, int num_rows, - int inverse, uint8_t* out) { + int row, int num_rows, uint8_t* out) { const uint8_t* preds; const size_t start_offset = row * stride; const int last_row = row + num_rows; SANITY_CHECK(in, out); in += start_offset; out += start_offset; - preds = inverse ? out : in; + preds = in; if (row == 0) { // Very first top-left pixel is copied. out[0] = in[0]; // Rest of top scan-line is left-predicted. - PredictLine(in + 1, out + 1, width - 1, inverse); + PredictLine(in + 1, out + 1, width - 1); row = 1; in += stride; out += stride; @@ -284,24 +262,13 @@ static WEBP_INLINE void DoVerticalFilter(const uint8_t* in, } // Filter line-by-line. 
- if (inverse) { - FILTER_LINE_BY_LINE(1); - } else { - FILTER_LINE_BY_LINE(0); - } + FILTER_LINE_BY_LINE; } - #undef FILTER_LINE_BY_LINE -#undef DO_PREDICT_LINE_VERTICAL static void VerticalFilter(const uint8_t* data, int width, int height, int stride, uint8_t* filtered_data) { - DoVerticalFilter(data, width, height, stride, 0, height, 0, filtered_data); -} - -static void VerticalUnfilter(int width, int height, int stride, int row, - int num_rows, uint8_t* data) { - DoVerticalFilter(data, width, height, stride, row, num_rows, 1, data); + DoVerticalFilter(data, width, height, stride, 0, height, filtered_data); } //------------------------------------------------------------------------------ @@ -321,10 +288,10 @@ static WEBP_INLINE int GradientPredictor(uint8_t a, uint8_t b, uint8_t c) { return temp0; } -#define FILTER_LINE_BY_LINE(INVERSE, PREDS, OPERATION) do { \ +#define FILTER_LINE_BY_LINE(PREDS, OPERATION) do { \ while (row < last_row) { \ int w; \ - PREDICT_LINE_ONE_PASS(in, PREDS - stride, out, INVERSE); \ + PREDICT_LINE_ONE_PASS(in, PREDS - stride, out); \ for (w = 1; w < width; ++w) { \ const int pred = GradientPredictor(PREDS[w - 1], \ PREDS[w - stride], \ @@ -339,20 +306,19 @@ static WEBP_INLINE int GradientPredictor(uint8_t a, uint8_t b, uint8_t c) { static WEBP_INLINE void DoGradientFilter(const uint8_t* in, int width, int height, int stride, - int row, int num_rows, - int inverse, uint8_t* out) { + int row, int num_rows, uint8_t* out) { const uint8_t* preds; const size_t start_offset = row * stride; const int last_row = row + num_rows; SANITY_CHECK(in, out); in += start_offset; out += start_offset; - preds = inverse ? out : in; + preds = in; // left prediction for top scan-line if (row == 0) { out[0] = in[0]; - PredictLine(in + 1, out + 1, width - 1, inverse); + PredictLine(in + 1, out + 1, width - 1); row = 1; preds += stride; in += stride; @@ -360,25 +326,49 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in, } // Filter line-by-line. - if (inverse) { - FILTER_LINE_BY_LINE(1, out, +); - } else { - FILTER_LINE_BY_LINE(0, in, -); - } + FILTER_LINE_BY_LINE(in, -); } - #undef FILTER_LINE_BY_LINE static void GradientFilter(const uint8_t* data, int width, int height, int stride, uint8_t* filtered_data) { - DoGradientFilter(data, width, height, stride, 0, height, 0, filtered_data); + DoGradientFilter(data, width, height, stride, 0, height, filtered_data); } -static void GradientUnfilter(int width, int height, int stride, int row, - int num_rows, uint8_t* data) { - DoGradientFilter(data, width, height, stride, row, num_rows, 1, data); +//------------------------------------------------------------------------------ + +static void HorizontalUnfilter(const uint8_t* prev, const uint8_t* in, + uint8_t* out, int width) { + out[0] = in[0] + (prev == NULL ? 
0 : prev[0]); + DO_PREDICT_LINE(in + 1, out + 1, width - 1, 1); } +static void VerticalUnfilter(const uint8_t* prev, const uint8_t* in, + uint8_t* out, int width) { + if (prev == NULL) { + HorizontalUnfilter(NULL, in, out, width); + } else { + DO_PREDICT_LINE_VERTICAL(in, prev, out, width, 1); + } +} + +static void GradientUnfilter(const uint8_t* prev, const uint8_t* in, + uint8_t* out, int width) { + if (prev == NULL) { + HorizontalUnfilter(NULL, in, out, width); + } else { + uint8_t top = prev[0], top_left = top, left = top; + int i; + for (i = 0; i < width; ++i) { + top = prev[i]; // need to read this first, in case prev==dst + left = in[i] + GradientPredictor(left, top, top_left); + top_left = top; + out[i] = left; + } + } +} + +#undef DO_PREDICT_LINE_VERTICAL #undef PREDICT_LINE_ONE_PASS #undef DO_PREDICT_LINE #undef SANITY_CHECK @@ -389,13 +379,13 @@ static void GradientUnfilter(int width, int height, int stride, int row, extern void VP8FiltersInitMIPSdspR2(void); WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitMIPSdspR2(void) { - WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter; - WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter; - WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter; - WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter; WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter; WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter; + + WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter; + WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter; + WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter; } #else // !WEBP_USE_MIPS_DSP_R2 diff --git a/src/3rdparty/libwebp/src/dsp/filters_sse2.c b/src/3rdparty/libwebp/src/dsp/filters_sse2.c index bf93342..67f7799 100644 --- a/src/3rdparty/libwebp/src/dsp/filters_sse2.c +++ b/src/3rdparty/libwebp/src/dsp/filters_sse2.c @@ -33,82 +33,39 @@ (void)height; // Silence unused warning. 
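With the unfilters now registered under the per-row prototype (prev, in, out, width), a caller is expected to walk the image one scanline at a time and pass NULL for the row above the topmost scanline. A rough driver sketch, assuming the WebPUnfilterFunc pointer type matches that prototype and that in-place operation (out == in) is allowed, as the comments above suggest:

static void ApplyUnfilter(WebPUnfilterFunc unfilter, uint8_t* data,
                          int width, int height, int stride) {
  const uint8_t* prev = NULL;                  // no row above the first scanline
  int y;
  for (y = 0; y < height; ++y) {
    uint8_t* const row = data + (size_t)y * stride;
    unfilter(prev, row, row, width);           // reconstruct the row in place
    prev = row;                                // reconstructed row predicts the next one
  }
}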
static void PredictLineTop(const uint8_t* src, const uint8_t* pred, - uint8_t* dst, int length, int inverse) { + uint8_t* dst, int length) { int i; const int max_pos = length & ~31; assert(length >= 0); - if (inverse) { - for (i = 0; i < max_pos; i += 32) { - const __m128i A0 = _mm_loadu_si128((const __m128i*)&src[i + 0]); - const __m128i A1 = _mm_loadu_si128((const __m128i*)&src[i + 16]); - const __m128i B0 = _mm_loadu_si128((const __m128i*)&pred[i + 0]); - const __m128i B1 = _mm_loadu_si128((const __m128i*)&pred[i + 16]); - const __m128i C0 = _mm_add_epi8(A0, B0); - const __m128i C1 = _mm_add_epi8(A1, B1); - _mm_storeu_si128((__m128i*)&dst[i + 0], C0); - _mm_storeu_si128((__m128i*)&dst[i + 16], C1); - } - for (; i < length; ++i) dst[i] = src[i] + pred[i]; - } else { - for (i = 0; i < max_pos; i += 32) { - const __m128i A0 = _mm_loadu_si128((const __m128i*)&src[i + 0]); - const __m128i A1 = _mm_loadu_si128((const __m128i*)&src[i + 16]); - const __m128i B0 = _mm_loadu_si128((const __m128i*)&pred[i + 0]); - const __m128i B1 = _mm_loadu_si128((const __m128i*)&pred[i + 16]); - const __m128i C0 = _mm_sub_epi8(A0, B0); - const __m128i C1 = _mm_sub_epi8(A1, B1); - _mm_storeu_si128((__m128i*)&dst[i + 0], C0); - _mm_storeu_si128((__m128i*)&dst[i + 16], C1); - } - for (; i < length; ++i) dst[i] = src[i] - pred[i]; + for (i = 0; i < max_pos; i += 32) { + const __m128i A0 = _mm_loadu_si128((const __m128i*)&src[i + 0]); + const __m128i A1 = _mm_loadu_si128((const __m128i*)&src[i + 16]); + const __m128i B0 = _mm_loadu_si128((const __m128i*)&pred[i + 0]); + const __m128i B1 = _mm_loadu_si128((const __m128i*)&pred[i + 16]); + const __m128i C0 = _mm_sub_epi8(A0, B0); + const __m128i C1 = _mm_sub_epi8(A1, B1); + _mm_storeu_si128((__m128i*)&dst[i + 0], C0); + _mm_storeu_si128((__m128i*)&dst[i + 16], C1); } + for (; i < length; ++i) dst[i] = src[i] - pred[i]; } // Special case for left-based prediction (when preds==dst-1 or preds==src-1). 
-static void PredictLineLeft(const uint8_t* src, uint8_t* dst, int length, - int inverse) { +static void PredictLineLeft(const uint8_t* src, uint8_t* dst, int length) { int i; - if (length <= 0) return; - if (inverse) { - const int max_pos = length & ~7; - __m128i last = _mm_set_epi32(0, 0, 0, dst[-1]); - for (i = 0; i < max_pos; i += 8) { - const __m128i A0 = _mm_loadl_epi64((const __m128i*)(src + i)); - const __m128i A1 = _mm_add_epi8(A0, last); - const __m128i A2 = _mm_slli_si128(A1, 1); - const __m128i A3 = _mm_add_epi8(A1, A2); - const __m128i A4 = _mm_slli_si128(A3, 2); - const __m128i A5 = _mm_add_epi8(A3, A4); - const __m128i A6 = _mm_slli_si128(A5, 4); - const __m128i A7 = _mm_add_epi8(A5, A6); - _mm_storel_epi64((__m128i*)(dst + i), A7); - last = _mm_srli_epi64(A7, 56); - } - for (; i < length; ++i) dst[i] = src[i] + dst[i - 1]; - } else { - const int max_pos = length & ~31; - for (i = 0; i < max_pos; i += 32) { - const __m128i A0 = _mm_loadu_si128((const __m128i*)(src + i + 0 )); - const __m128i B0 = _mm_loadu_si128((const __m128i*)(src + i + 0 - 1)); - const __m128i A1 = _mm_loadu_si128((const __m128i*)(src + i + 16 )); - const __m128i B1 = _mm_loadu_si128((const __m128i*)(src + i + 16 - 1)); - const __m128i C0 = _mm_sub_epi8(A0, B0); - const __m128i C1 = _mm_sub_epi8(A1, B1); - _mm_storeu_si128((__m128i*)(dst + i + 0), C0); - _mm_storeu_si128((__m128i*)(dst + i + 16), C1); - } - for (; i < length; ++i) dst[i] = src[i] - src[i - 1]; - } -} - -static void PredictLineC(const uint8_t* src, const uint8_t* pred, - uint8_t* dst, int length, int inverse) { - int i; - if (inverse) { - for (i = 0; i < length; ++i) dst[i] = src[i] + pred[i]; - } else { - for (i = 0; i < length; ++i) dst[i] = src[i] - pred[i]; + const int max_pos = length & ~31; + assert(length >= 0); + for (i = 0; i < max_pos; i += 32) { + const __m128i A0 = _mm_loadu_si128((const __m128i*)(src + i + 0 )); + const __m128i B0 = _mm_loadu_si128((const __m128i*)(src + i + 0 - 1)); + const __m128i A1 = _mm_loadu_si128((const __m128i*)(src + i + 16 )); + const __m128i B1 = _mm_loadu_si128((const __m128i*)(src + i + 16 - 1)); + const __m128i C0 = _mm_sub_epi8(A0, B0); + const __m128i C1 = _mm_sub_epi8(A1, B1); + _mm_storeu_si128((__m128i*)(dst + i + 0), C0); + _mm_storeu_si128((__m128i*)(dst + i + 16), C1); } + for (; i < length; ++i) dst[i] = src[i] - src[i - 1]; } //------------------------------------------------------------------------------ @@ -117,21 +74,18 @@ static void PredictLineC(const uint8_t* src, const uint8_t* pred, static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in, int width, int height, int stride, int row, int num_rows, - int inverse, uint8_t* out) { - const uint8_t* preds; + uint8_t* out) { const size_t start_offset = row * stride; const int last_row = row + num_rows; SANITY_CHECK(in, out); in += start_offset; out += start_offset; - preds = inverse ? out : in; if (row == 0) { // Leftmost pixel is the same as input for topmost scanline. out[0] = in[0]; - PredictLineLeft(in + 1, out + 1, width - 1, inverse); + PredictLineLeft(in + 1, out + 1, width - 1); row = 1; - preds += stride; in += stride; out += stride; } @@ -139,10 +93,9 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in, // Filter line-by-line. while (row < last_row) { // Leftmost pixel is predicted from above. 
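The 8-byte kernel removed above (and reinstated later in this file as HorizontalUnfilter) computes an inclusive prefix sum mod 256 with three shift-and-add rounds. A scalar model of one 8-byte block, where 'last' carries the previously reconstructed pixel into the block:

static void LeftPredictBlock8(const uint8_t in[8], uint8_t out[8],
                              uint8_t* const last) {
  uint8_t acc = *last;
  int i;
  for (i = 0; i < 8; ++i) {
    acc = (uint8_t)(acc + in[i]);   // lane-wise, this is what the three SIMD rounds produce
    out[i] = acc;
  }
  *last = acc;                      // seed for the next 8-byte block
}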
- PredictLineC(in, preds - stride, out, 1, inverse); - PredictLineLeft(in + 1, out + 1, width - 1, inverse); + out[0] = in[0] - in[-stride]; + PredictLineLeft(in + 1, out + 1, width - 1); ++row; - preds += stride; in += stride; out += stride; } @@ -153,34 +106,27 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in, static WEBP_INLINE void DoVerticalFilter(const uint8_t* in, int width, int height, int stride, - int row, int num_rows, - int inverse, uint8_t* out) { - const uint8_t* preds; + int row, int num_rows, uint8_t* out) { const size_t start_offset = row * stride; const int last_row = row + num_rows; SANITY_CHECK(in, out); in += start_offset; out += start_offset; - preds = inverse ? out : in; if (row == 0) { // Very first top-left pixel is copied. out[0] = in[0]; // Rest of top scan-line is left-predicted. - PredictLineLeft(in + 1, out + 1, width - 1, inverse); + PredictLineLeft(in + 1, out + 1, width - 1); row = 1; in += stride; out += stride; - } else { - // We are starting from in-between. Make sure 'preds' points to prev row. - preds -= stride; } // Filter line-by-line. while (row < last_row) { - PredictLineTop(in, preds, out, width, inverse); + PredictLineTop(in, in - stride, out, width); ++row; - preds += stride; in += stride; out += stride; } @@ -219,49 +165,10 @@ static void GradientPredictDirect(const uint8_t* const row, } } -static void GradientPredictInverse(const uint8_t* const in, - const uint8_t* const top, - uint8_t* const row, int length) { - if (length > 0) { - int i; - const int max_pos = length & ~7; - const __m128i zero = _mm_setzero_si128(); - __m128i A = _mm_set_epi32(0, 0, 0, row[-1]); // left sample - for (i = 0; i < max_pos; i += 8) { - const __m128i tmp0 = _mm_loadl_epi64((const __m128i*)&top[i]); - const __m128i tmp1 = _mm_loadl_epi64((const __m128i*)&top[i - 1]); - const __m128i B = _mm_unpacklo_epi8(tmp0, zero); - const __m128i C = _mm_unpacklo_epi8(tmp1, zero); - const __m128i tmp2 = _mm_loadl_epi64((const __m128i*)&in[i]); - const __m128i D = _mm_unpacklo_epi8(tmp2, zero); // base input - const __m128i E = _mm_sub_epi16(B, C); // unclipped gradient basis B - C - __m128i out = zero; // accumulator for output - __m128i mask_hi = _mm_set_epi32(0, 0, 0, 0xff); - int k = 8; - while (1) { - const __m128i tmp3 = _mm_add_epi16(A, E); // delta = A + B - C - const __m128i tmp4 = _mm_min_epi16(tmp3, mask_hi); - const __m128i tmp5 = _mm_max_epi16(tmp4, zero); // clipped delta - const __m128i tmp6 = _mm_add_epi16(tmp5, D); // add to in[] values - A = _mm_and_si128(tmp6, mask_hi); // 1-complement clip - out = _mm_or_si128(out, A); // accumulate output - if (--k == 0) break; - A = _mm_slli_si128(A, 2); // rotate left sample - mask_hi = _mm_slli_si128(mask_hi, 2); // rotate mask - } - A = _mm_srli_si128(A, 14); // prepare left sample for next iteration - _mm_storel_epi64((__m128i*)&row[i], _mm_packus_epi16(out, zero)); - } - for (; i < length; ++i) { - row[i] = in[i] + GradientPredictorC(row[i - 1], top[i], top[i - 1]); - } - } -} - static WEBP_INLINE void DoGradientFilter(const uint8_t* in, int width, int height, int stride, int row, int num_rows, - int inverse, uint8_t* out) { + uint8_t* out) { const size_t start_offset = row * stride; const int last_row = row + num_rows; SANITY_CHECK(in, out); @@ -271,7 +178,7 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in, // left prediction for top scan-line if (row == 0) { out[0] = in[0]; - PredictLineLeft(in + 1, out + 1, width - 1, inverse); + PredictLineLeft(in + 1, out + 1, width - 1); row = 1; in += 
stride; out += stride; @@ -279,13 +186,8 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in, // Filter line-by-line. while (row < last_row) { - if (inverse) { - PredictLineC(in, out - stride, out, 1, inverse); // predict from above - GradientPredictInverse(in + 1, out + 1 - stride, out + 1, width - 1); - } else { - PredictLineC(in, in - stride, out, 1, inverse); - GradientPredictDirect(in + 1, in + 1 - stride, out + 1, width - 1); - } + out[0] = in[0] - in[-stride]; + GradientPredictDirect(in + 1, in + 1 - stride, out + 1, width - 1); ++row; in += stride; out += stride; @@ -298,36 +200,112 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in, static void HorizontalFilter(const uint8_t* data, int width, int height, int stride, uint8_t* filtered_data) { - DoHorizontalFilter(data, width, height, stride, 0, height, 0, filtered_data); + DoHorizontalFilter(data, width, height, stride, 0, height, filtered_data); } static void VerticalFilter(const uint8_t* data, int width, int height, int stride, uint8_t* filtered_data) { - DoVerticalFilter(data, width, height, stride, 0, height, 0, filtered_data); + DoVerticalFilter(data, width, height, stride, 0, height, filtered_data); } - static void GradientFilter(const uint8_t* data, int width, int height, int stride, uint8_t* filtered_data) { - DoGradientFilter(data, width, height, stride, 0, height, 0, filtered_data); + DoGradientFilter(data, width, height, stride, 0, height, filtered_data); } - //------------------------------------------------------------------------------ +// Inverse transforms -static void VerticalUnfilter(int width, int height, int stride, int row, - int num_rows, uint8_t* data) { - DoVerticalFilter(data, width, height, stride, row, num_rows, 1, data); +static void HorizontalUnfilter(const uint8_t* prev, const uint8_t* in, + uint8_t* out, int width) { + int i; + __m128i last; + out[0] = in[0] + (prev == NULL ? 
0 : prev[0]); + if (width <= 1) return; + last = _mm_set_epi32(0, 0, 0, out[0]); + for (i = 1; i + 8 <= width; i += 8) { + const __m128i A0 = _mm_loadl_epi64((const __m128i*)(in + i)); + const __m128i A1 = _mm_add_epi8(A0, last); + const __m128i A2 = _mm_slli_si128(A1, 1); + const __m128i A3 = _mm_add_epi8(A1, A2); + const __m128i A4 = _mm_slli_si128(A3, 2); + const __m128i A5 = _mm_add_epi8(A3, A4); + const __m128i A6 = _mm_slli_si128(A5, 4); + const __m128i A7 = _mm_add_epi8(A5, A6); + _mm_storel_epi64((__m128i*)(out + i), A7); + last = _mm_srli_epi64(A7, 56); + } + for (; i < width; ++i) out[i] = in[i] + out[i - 1]; } -static void HorizontalUnfilter(int width, int height, int stride, int row, - int num_rows, uint8_t* data) { - DoHorizontalFilter(data, width, height, stride, row, num_rows, 1, data); +static void VerticalUnfilter(const uint8_t* prev, const uint8_t* in, + uint8_t* out, int width) { + if (prev == NULL) { + HorizontalUnfilter(NULL, in, out, width); + } else { + int i; + const int max_pos = width & ~31; + assert(width >= 0); + for (i = 0; i < max_pos; i += 32) { + const __m128i A0 = _mm_loadu_si128((const __m128i*)&in[i + 0]); + const __m128i A1 = _mm_loadu_si128((const __m128i*)&in[i + 16]); + const __m128i B0 = _mm_loadu_si128((const __m128i*)&prev[i + 0]); + const __m128i B1 = _mm_loadu_si128((const __m128i*)&prev[i + 16]); + const __m128i C0 = _mm_add_epi8(A0, B0); + const __m128i C1 = _mm_add_epi8(A1, B1); + _mm_storeu_si128((__m128i*)&out[i + 0], C0); + _mm_storeu_si128((__m128i*)&out[i + 16], C1); + } + for (; i < width; ++i) out[i] = in[i] + prev[i]; + } } -static void GradientUnfilter(int width, int height, int stride, int row, - int num_rows, uint8_t* data) { - DoGradientFilter(data, width, height, stride, row, num_rows, 1, data); +static void GradientPredictInverse(const uint8_t* const in, + const uint8_t* const top, + uint8_t* const row, int length) { + if (length > 0) { + int i; + const int max_pos = length & ~7; + const __m128i zero = _mm_setzero_si128(); + __m128i A = _mm_set_epi32(0, 0, 0, row[-1]); // left sample + for (i = 0; i < max_pos; i += 8) { + const __m128i tmp0 = _mm_loadl_epi64((const __m128i*)&top[i]); + const __m128i tmp1 = _mm_loadl_epi64((const __m128i*)&top[i - 1]); + const __m128i B = _mm_unpacklo_epi8(tmp0, zero); + const __m128i C = _mm_unpacklo_epi8(tmp1, zero); + const __m128i D = _mm_loadl_epi64((const __m128i*)&in[i]); // base input + const __m128i E = _mm_sub_epi16(B, C); // unclipped gradient basis B - C + __m128i out = zero; // accumulator for output + __m128i mask_hi = _mm_set_epi32(0, 0, 0, 0xff); + int k = 8; + while (1) { + const __m128i tmp3 = _mm_add_epi16(A, E); // delta = A + B - C + const __m128i tmp4 = _mm_packus_epi16(tmp3, zero); // saturate delta + const __m128i tmp5 = _mm_add_epi8(tmp4, D); // add to in[] + A = _mm_and_si128(tmp5, mask_hi); // 1-complement clip + out = _mm_or_si128(out, A); // accumulate output + if (--k == 0) break; + A = _mm_slli_si128(A, 1); // rotate left sample + mask_hi = _mm_slli_si128(mask_hi, 1); // rotate mask + A = _mm_unpacklo_epi8(A, zero); // convert 8b->16b + } + A = _mm_srli_si128(A, 7); // prepare left sample for next iteration + _mm_storel_epi64((__m128i*)&row[i], out); + } + for (; i < length; ++i) { + row[i] = in[i] + GradientPredictorC(row[i - 1], top[i], top[i - 1]); + } + } +} + +static void GradientUnfilter(const uint8_t* prev, const uint8_t* in, + uint8_t* out, int width) { + if (prev == NULL) { + HorizontalUnfilter(NULL, in, out, width); + } else { + out[0] = in[0] + prev[0]; // 
predict from above + GradientPredictInverse(in + 1, prev + 1, out + 1, width - 1); + } } //------------------------------------------------------------------------------ diff --git a/src/3rdparty/libwebp/src/dsp/lossless.c b/src/3rdparty/libwebp/src/dsp/lossless.c index 71ae9d4..af913ef 100644 --- a/src/3rdparty/libwebp/src/dsp/lossless.c +++ b/src/3rdparty/libwebp/src/dsp/lossless.c @@ -28,9 +28,7 @@ // In-place sum of each component with mod 256. static WEBP_INLINE void AddPixelsEq(uint32_t* a, uint32_t b) { - const uint32_t alpha_and_green = (*a & 0xff00ff00u) + (b & 0xff00ff00u); - const uint32_t red_and_blue = (*a & 0x00ff00ffu) + (b & 0x00ff00ffu); - *a = (alpha_and_green & 0xff00ff00u) | (red_and_blue & 0x00ff00ffu); + *a = VP8LAddPixels(*a, b); } static WEBP_INLINE uint32_t Average2(uint32_t a0, uint32_t a1) { diff --git a/src/3rdparty/libwebp/src/dsp/lossless.h b/src/3rdparty/libwebp/src/dsp/lossless.h index e063bdd..9f0d7a2 100644 --- a/src/3rdparty/libwebp/src/dsp/lossless.h +++ b/src/3rdparty/libwebp/src/dsp/lossless.h @@ -158,7 +158,8 @@ void VP8LCollectColorBlueTransforms_C(const uint32_t* argb, int stride, void VP8LResidualImage(int width, int height, int bits, int low_effort, uint32_t* const argb, uint32_t* const argb_scratch, - uint32_t* const image, int exact); + uint32_t* const image, int near_lossless, int exact, + int used_subtract_green); void VP8LColorSpaceTransform(int width, int height, int bits, int quality, uint32_t* const argb, uint32_t* image); @@ -172,6 +173,17 @@ static WEBP_INLINE uint32_t VP8LSubSampleSize(uint32_t size, return (size + (1 << sampling_bits) - 1) >> sampling_bits; } +// Converts near lossless quality into max number of bits shaved off. +static WEBP_INLINE int VP8LNearLosslessBits(int near_lossless_quality) { + // 100 -> 0 + // 80..99 -> 1 + // 60..79 -> 2 + // 40..59 -> 3 + // 20..39 -> 4 + // 0..19 -> 5 + return 5 - near_lossless_quality / 20; +} + // ----------------------------------------------------------------------------- // Faster logarithm for integers. Small values use a look-up table. @@ -262,6 +274,11 @@ extern VP8LHistogramAddFunc VP8LHistogramAdd; // ----------------------------------------------------------------------------- // PrefixEncode() +typedef int (*VP8LVectorMismatchFunc)(const uint32_t* const array1, + const uint32_t* const array2, int length); +// Returns the first index where array1 and array2 are different. +extern VP8LVectorMismatchFunc VP8LVectorMismatch; + static WEBP_INLINE int VP8LBitsLog2Ceiling(uint32_t n) { const int log_floor = BitsLog2Floor(n); if (n == (n & ~(n - 1))) // zero or a power of two. @@ -324,7 +341,14 @@ static WEBP_INLINE void VP8LPrefixEncode(int distance, int* const code, } } -// In-place difference of each component with mod 256. +// Sum of each component, mod 256. +static WEBP_INLINE uint32_t VP8LAddPixels(uint32_t a, uint32_t b) { + const uint32_t alpha_and_green = (a & 0xff00ff00u) + (b & 0xff00ff00u); + const uint32_t red_and_blue = (a & 0x00ff00ffu) + (b & 0x00ff00ffu); + return (alpha_and_green & 0xff00ff00u) | (red_and_blue & 0x00ff00ffu); +} + +// Difference of each component, mod 256. 
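VP8LAddPixels above (and its counterpart VP8LSubPixels, whose definition follows) performs four independent mod-256 byte additions with only two 32-bit adds: splitting the 0xAARRGGBB pixel into the alpha/green and red/blue planes leaves an empty byte above each channel, so a carry out of one channel lands in the hole instead of corrupting its neighbour. A self-contained check of the trick:

#include <assert.h>
#include <stdint.h>

static uint32_t AddPixelsMod256(uint32_t a, uint32_t b) {
  const uint32_t ag = (a & 0xff00ff00u) + (b & 0xff00ff00u);  // alpha and green, carries fall into the spare bytes
  const uint32_t rb = (a & 0x00ff00ffu) + (b & 0x00ff00ffu);  // red and blue likewise
  return (ag & 0xff00ff00u) | (rb & 0x00ff00ffu);             // drop the spilled carries
}

int main(void) {
  // Red wraps from 0xff to 0x00 and blue from 0xfe to 0x01 without disturbing alpha or green.
  assert(AddPixelsMod256(0x01ff02feu, 0x00010003u) == 0x01000201u);
  return 0;
}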
static WEBP_INLINE uint32_t VP8LSubPixels(uint32_t a, uint32_t b) { const uint32_t alpha_and_green = 0x00ff00ffu + (a & 0xff00ff00u) - (b & 0xff00ff00u); diff --git a/src/3rdparty/libwebp/src/dsp/lossless_enc.c b/src/3rdparty/libwebp/src/dsp/lossless_enc.c index 2eafa3d..256f6f5 100644 --- a/src/3rdparty/libwebp/src/dsp/lossless_enc.c +++ b/src/3rdparty/libwebp/src/dsp/lossless_enc.c @@ -382,6 +382,7 @@ static float FastLog2Slow(uint32_t v) { // Mostly used to reduce code size + readability static WEBP_INLINE int GetMin(int a, int b) { return (a > b) ? b : a; } +static WEBP_INLINE int GetMax(int a, int b) { return (a < b) ? b : a; } //------------------------------------------------------------------------------ // Methods to calculate Entropy (Shannon). @@ -551,18 +552,204 @@ static WEBP_INLINE uint32_t Predict(VP8LPredictorFunc pred_func, } } +static int MaxDiffBetweenPixels(uint32_t p1, uint32_t p2) { + const int diff_a = abs((int)(p1 >> 24) - (int)(p2 >> 24)); + const int diff_r = abs((int)((p1 >> 16) & 0xff) - (int)((p2 >> 16) & 0xff)); + const int diff_g = abs((int)((p1 >> 8) & 0xff) - (int)((p2 >> 8) & 0xff)); + const int diff_b = abs((int)(p1 & 0xff) - (int)(p2 & 0xff)); + return GetMax(GetMax(diff_a, diff_r), GetMax(diff_g, diff_b)); +} + +static int MaxDiffAroundPixel(uint32_t current, uint32_t up, uint32_t down, + uint32_t left, uint32_t right) { + const int diff_up = MaxDiffBetweenPixels(current, up); + const int diff_down = MaxDiffBetweenPixels(current, down); + const int diff_left = MaxDiffBetweenPixels(current, left); + const int diff_right = MaxDiffBetweenPixels(current, right); + return GetMax(GetMax(diff_up, diff_down), GetMax(diff_left, diff_right)); +} + +static uint32_t AddGreenToBlueAndRed(uint32_t argb) { + const uint32_t green = (argb >> 8) & 0xff; + uint32_t red_blue = argb & 0x00ff00ffu; + red_blue += (green << 16) | green; + red_blue &= 0x00ff00ffu; + return (argb & 0xff00ff00u) | red_blue; +} + +static void MaxDiffsForRow(int width, int stride, const uint32_t* const argb, + uint8_t* const max_diffs, int used_subtract_green) { + uint32_t current, up, down, left, right; + int x; + if (width <= 2) return; + current = argb[0]; + right = argb[1]; + if (used_subtract_green) { + current = AddGreenToBlueAndRed(current); + right = AddGreenToBlueAndRed(right); + } + // max_diffs[0] and max_diffs[width - 1] are never used. + for (x = 1; x < width - 1; ++x) { + up = argb[-stride + x]; + down = argb[stride + x]; + left = current; + current = right; + right = argb[x + 1]; + if (used_subtract_green) { + up = AddGreenToBlueAndRed(up); + down = AddGreenToBlueAndRed(down); + right = AddGreenToBlueAndRed(right); + } + max_diffs[x] = MaxDiffAroundPixel(current, up, down, left, right); + } +} + +// Quantize the difference between the actual component value and its prediction +// to a multiple of quantization, working modulo 256, taking care not to cross +// a boundary (inclusive upper limit). +static uint8_t NearLosslessComponent(uint8_t value, uint8_t predict, + uint8_t boundary, int quantization) { + const int residual = (value - predict) & 0xff; + const int boundary_residual = (boundary - predict) & 0xff; + const int lower = residual & ~(quantization - 1); + const int upper = lower + quantization; + // Resolve ties towards a value closer to the prediction (i.e. towards lower + // if value comes after prediction and towards upper otherwise). 
+ const int bias = ((boundary - value) & 0xff) < boundary_residual; + if (residual - lower < upper - residual + bias) { + // lower is closer to residual than upper. + if (residual > boundary_residual && lower <= boundary_residual) { + // Halve quantization step to avoid crossing boundary. This midpoint is + // on the same side of boundary as residual because midpoint >= residual + // (since lower is closer than upper) and residual is above the boundary. + return lower + (quantization >> 1); + } + return lower; + } else { + // upper is closer to residual than lower. + if (residual <= boundary_residual && upper > boundary_residual) { + // Halve quantization step to avoid crossing boundary. This midpoint is + // on the same side of boundary as residual because midpoint <= residual + // (since upper is closer than lower) and residual is below the boundary. + return lower + (quantization >> 1); + } + return upper & 0xff; + } +} + +// Quantize every component of the difference between the actual pixel value and +// its prediction to a multiple of a quantization (a power of 2, not larger than +// max_quantization which is a power of 2, smaller than max_diff). Take care if +// value and predict have undergone subtract green, which means that red and +// blue are represented as offsets from green. +static uint32_t NearLossless(uint32_t value, uint32_t predict, + int max_quantization, int max_diff, + int used_subtract_green) { + int quantization; + uint8_t new_green = 0; + uint8_t green_diff = 0; + uint8_t a, r, g, b; + if (max_diff <= 2) { + return VP8LSubPixels(value, predict); + } + quantization = max_quantization; + while (quantization >= max_diff) { + quantization >>= 1; + } + if ((value >> 24) == 0 || (value >> 24) == 0xff) { + // Preserve transparency of fully transparent or fully opaque pixels. + a = ((value >> 24) - (predict >> 24)) & 0xff; + } else { + a = NearLosslessComponent(value >> 24, predict >> 24, 0xff, quantization); + } + g = NearLosslessComponent((value >> 8) & 0xff, (predict >> 8) & 0xff, 0xff, + quantization); + if (used_subtract_green) { + // The green offset will be added to red and blue components during decoding + // to obtain the actual red and blue values. + new_green = ((predict >> 8) + g) & 0xff; + // The amount by which green has been adjusted during quantization. It is + // subtracted from red and blue for compensation, to avoid accumulating two + // quantization errors in them. + green_diff = (new_green - (value >> 8)) & 0xff; + } + r = NearLosslessComponent(((value >> 16) - green_diff) & 0xff, + (predict >> 16) & 0xff, 0xff - new_green, + quantization); + b = NearLosslessComponent((value - green_diff) & 0xff, predict & 0xff, + 0xff - new_green, quantization); + return ((uint32_t)a << 24) | ((uint32_t)r << 16) | ((uint32_t)g << 8) | b; +} + +// Returns the difference between the pixel and its prediction. In case of a +// lossy encoding, updates the source image to avoid propagating the deviation +// further to pixels which depend on the current pixel for their predictions. 
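As a worked example of NearLosslessComponent above, take quantization = 8, value = 200, predict = 190 and boundary = 0xff: residual = 10 and boundary_residual = 65, so lower = 8, upper = 16 and bias = 1; since residual - lower = 2 is smaller than upper - residual + bias = 7 and lower does not cross the boundary, the function returns 8. The decoder reconstructs 190 + 8 = 198 instead of 200, an error of 2, well within the quantization step of 8.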
+static WEBP_INLINE uint32_t GetResidual(int width, int height, + uint32_t* const upper_row, + uint32_t* const current_row, + const uint8_t* const max_diffs, + int mode, VP8LPredictorFunc pred_func, + int x, int y, int max_quantization, + int exact, int used_subtract_green) { + const uint32_t predict = Predict(pred_func, x, y, current_row, upper_row); + uint32_t residual; + if (max_quantization == 1 || mode == 0 || y == 0 || y == height - 1 || + x == 0 || x == width - 1) { + residual = VP8LSubPixels(current_row[x], predict); + } else { + residual = NearLossless(current_row[x], predict, max_quantization, + max_diffs[x], used_subtract_green); + // Update the source image. + current_row[x] = VP8LAddPixels(predict, residual); + // x is never 0 here so we do not need to update upper_row like below. + } + if (!exact && (current_row[x] & kMaskAlpha) == 0) { + // If alpha is 0, cleanup RGB. We can choose the RGB values of the residual + // for best compression. The prediction of alpha itself can be non-zero and + // must be kept though. We choose RGB of the residual to be 0. + residual &= kMaskAlpha; + // Update the source image. + current_row[x] = predict & ~kMaskAlpha; + // The prediction for the rightmost pixel in a row uses the leftmost pixel + // in that row as its top-right context pixel. Hence if we change the + // leftmost pixel of current_row, the corresponding change must be applied + // to upper_row as well where top-right context is being read from. + if (x == 0 && y != 0) upper_row[width] = current_row[0]; + } + return residual; +} + // Returns best predictor and updates the accumulated histogram. +// If max_quantization > 1, assumes that near lossless processing will be +// applied, quantizing residuals to multiples of quantization levels up to +// max_quantization (the actual quantization level depends on smoothness near +// the given pixel). static int GetBestPredictorForTile(int width, int height, int tile_x, int tile_y, int bits, int accumulated[4][256], - const uint32_t* const argb_scratch, - int exact) { + uint32_t* const argb_scratch, + const uint32_t* const argb, + int max_quantization, + int exact, int used_subtract_green) { const int kNumPredModes = 14; - const int col_start = tile_x << bits; - const int row_start = tile_y << bits; + const int start_x = tile_x << bits; + const int start_y = tile_y << bits; const int tile_size = 1 << bits; - const int max_y = GetMin(tile_size, height - row_start); - const int max_x = GetMin(tile_size, width - col_start); + const int max_y = GetMin(tile_size, height - start_y); + const int max_x = GetMin(tile_size, width - start_x); + // Whether there exist columns just outside the tile. + const int have_left = (start_x > 0); + const int have_right = (max_x < width - start_x); + // Position and size of the strip covering the tile and adjacent columns if + // they exist. + const int context_start_x = start_x - have_left; + const int context_width = max_x + have_left + have_right; + // The width of upper_row and current_row is one pixel larger than image width + // to allow the top right pixel to point to the leftmost pixel of the next row + // when at the right edge. + uint32_t* upper_row = argb_scratch; + uint32_t* current_row = upper_row + width + 1; + uint8_t* const max_diffs = (uint8_t*)(current_row + width + 1); float best_diff = MAX_DIFF_COST; int best_mode = 0; int mode; @@ -571,28 +758,46 @@ static int GetBestPredictorForTile(int width, int height, // Need pointers to be able to swap arrays. 
int (*histo_argb)[256] = histo_stack_1; int (*best_histo)[256] = histo_stack_2; - int i, j; + for (mode = 0; mode < kNumPredModes; ++mode) { - const uint32_t* current_row = argb_scratch; const VP8LPredictorFunc pred_func = VP8LPredictors[mode]; float cur_diff; - int y; + int relative_y; memset(histo_argb, 0, sizeof(histo_stack_1)); - for (y = 0; y < max_y; ++y) { - int x; - const int row = row_start + y; - const uint32_t* const upper_row = current_row; - current_row = upper_row + width; - for (x = 0; x < max_x; ++x) { - const int col = col_start + x; - const uint32_t predict = - Predict(pred_func, col, row, current_row, upper_row); - uint32_t residual = VP8LSubPixels(current_row[col], predict); - if (!exact && (current_row[col] & kMaskAlpha) == 0) { - residual &= kMaskAlpha; // See CopyTileWithPrediction. - } - UpdateHisto(histo_argb, residual); + if (start_y > 0) { + // Read the row above the tile which will become the first upper_row. + // Include a pixel to the left if it exists; include a pixel to the right + // in all cases (wrapping to the leftmost pixel of the next row if it does + // not exist). + memcpy(current_row + context_start_x, + argb + (start_y - 1) * width + context_start_x, + sizeof(*argb) * (max_x + have_left + 1)); + } + for (relative_y = 0; relative_y < max_y; ++relative_y) { + const int y = start_y + relative_y; + int relative_x; + uint32_t* tmp = upper_row; + upper_row = current_row; + current_row = tmp; + // Read current_row. Include a pixel to the left if it exists; include a + // pixel to the right in all cases except at the bottom right corner of + // the image (wrapping to the leftmost pixel of the next row if it does + // not exist in the current row). + memcpy(current_row + context_start_x, + argb + y * width + context_start_x, + sizeof(*argb) * (max_x + have_left + (y + 1 < height))); + if (max_quantization > 1 && y >= 1 && y + 1 < height) { + MaxDiffsForRow(context_width, width, argb + y * width + context_start_x, + max_diffs + context_start_x, used_subtract_green); + } + + for (relative_x = 0; relative_x < max_x; ++relative_x) { + const int x = start_x + relative_x; + UpdateHisto(histo_argb, + GetResidual(width, height, upper_row, current_row, + max_diffs, mode, pred_func, x, y, + max_quantization, exact, used_subtract_green)); } } cur_diff = PredictionCostSpatialHistogram( @@ -615,71 +820,82 @@ static int GetBestPredictorForTile(int width, int height, return best_mode; } +// Converts pixels of the image to residuals with respect to predictions. +// If max_quantization > 1, applies near lossless processing, quantizing +// residuals to multiples of quantization levels up to max_quantization +// (the actual quantization level depends on smoothness near the given pixel). static void CopyImageWithPrediction(int width, int height, int bits, uint32_t* const modes, uint32_t* const argb_scratch, uint32_t* const argb, - int low_effort, int exact) { + int low_effort, int max_quantization, + int exact, int used_subtract_green) { const int tiles_per_row = VP8LSubSampleSize(width, bits); const int mask = (1 << bits) - 1; - // The row size is one pixel longer to allow the top right pixel to point to - // the leftmost pixel of the next row when at the right edge. - uint32_t* current_row = argb_scratch; - uint32_t* upper_row = argb_scratch + width + 1; + // The width of upper_row and current_row is one pixel larger than image width + // to allow the top right pixel to point to the leftmost pixel of the next row + // when at the right edge. 
+ uint32_t* upper_row = argb_scratch; + uint32_t* current_row = upper_row + width + 1; + uint8_t* current_max_diffs = (uint8_t*)(current_row + width + 1); + uint8_t* lower_max_diffs = current_max_diffs + width; int y; - VP8LPredictorFunc pred_func = - low_effort ? VP8LPredictors[kPredLowEffort] : NULL; + int mode = 0; + VP8LPredictorFunc pred_func = NULL; for (y = 0; y < height; ++y) { int x; - uint32_t* tmp = upper_row; + uint32_t* const tmp32 = upper_row; upper_row = current_row; - current_row = tmp; - memcpy(current_row, argb + y * width, sizeof(*current_row) * width); - current_row[width] = (y + 1 < height) ? argb[(y + 1) * width] : ARGB_BLACK; + current_row = tmp32; + memcpy(current_row, argb + y * width, + sizeof(*argb) * (width + (y + 1 < height))); if (low_effort) { for (x = 0; x < width; ++x) { - const uint32_t predict = - Predict(pred_func, x, y, current_row, upper_row); + const uint32_t predict = Predict(VP8LPredictors[kPredLowEffort], x, y, + current_row, upper_row); argb[y * width + x] = VP8LSubPixels(current_row[x], predict); } } else { + if (max_quantization > 1) { + // Compute max_diffs for the lower row now, because that needs the + // contents of argb for the current row, which we will overwrite with + // residuals before proceeding with the next row. + uint8_t* const tmp8 = current_max_diffs; + current_max_diffs = lower_max_diffs; + lower_max_diffs = tmp8; + if (y + 2 < height) { + MaxDiffsForRow(width, width, argb + (y + 1) * width, lower_max_diffs, + used_subtract_green); + } + } for (x = 0; x < width; ++x) { - uint32_t predict, residual; if ((x & mask) == 0) { - const int mode = - (modes[(y >> bits) * tiles_per_row + (x >> bits)] >> 8) & 0xff; + mode = (modes[(y >> bits) * tiles_per_row + (x >> bits)] >> 8) & 0xff; pred_func = VP8LPredictors[mode]; } - predict = Predict(pred_func, x, y, current_row, upper_row); - residual = VP8LSubPixels(current_row[x], predict); - if (!exact && (current_row[x] & kMaskAlpha) == 0) { - // If alpha is 0, cleanup RGB. We can choose the RGB values of the - // residual for best compression. The prediction of alpha itself can - // be non-zero and must be kept though. We choose RGB of the residual - // to be 0. - residual &= kMaskAlpha; - // Update input image so that next predictions use correct RGB value. - current_row[x] = predict & ~kMaskAlpha; - if (x == 0 && y != 0) upper_row[width] = current_row[x]; - } - argb[y * width + x] = residual; + argb[y * width + x] = GetResidual( + width, height, upper_row, current_row, current_max_diffs, mode, + pred_func, x, y, max_quantization, exact, used_subtract_green); } } } } +// Finds the best predictor for each tile, and converts the image to residuals +// with respect to predictions. If near_lossless_quality < 100, applies +// near lossless processing, shaving off more bits of residuals for lower +// qualities. 
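The predictor chosen for each tile travels in the green channel of the prediction image: CopyImageWithPrediction above reads it back as (modes[...] >> 8) & 0xff, and VP8LResidualImage below stores it as ARGB_BLACK | (pred << 8). A minimal round trip, with ARGB_BLACK assumed to be opaque black (0xff000000):

#include <assert.h>
#include <stdint.h>

#define ARGB_BLACK_ASSUMED 0xff000000u   /* assumed value of ARGB_BLACK */

int main(void) {
  const uint32_t pred = 11;                                   // predictor mode picked for one tile
  const uint32_t packed = ARGB_BLACK_ASSUMED | (pred << 8);   // how the mode image stores it
  assert(((packed >> 8) & 0xff) == pred);                     // how it is read back per tile
  return 0;
}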
void VP8LResidualImage(int width, int height, int bits, int low_effort, uint32_t* const argb, uint32_t* const argb_scratch, - uint32_t* const image, int exact) { - const int max_tile_size = 1 << bits; + uint32_t* const image, int near_lossless_quality, + int exact, int used_subtract_green) { const int tiles_per_row = VP8LSubSampleSize(width, bits); const int tiles_per_col = VP8LSubSampleSize(height, bits); - uint32_t* const upper_row = argb_scratch; - uint32_t* const current_tile_rows = argb_scratch + width; int tile_y; int histo[4][256]; + const int max_quantization = 1 << VP8LNearLosslessBits(near_lossless_quality); if (low_effort) { int i; for (i = 0; i < tiles_per_row * tiles_per_col; ++i) { @@ -688,26 +904,19 @@ void VP8LResidualImage(int width, int height, int bits, int low_effort, } else { memset(histo, 0, sizeof(histo)); for (tile_y = 0; tile_y < tiles_per_col; ++tile_y) { - const int tile_y_offset = tile_y * max_tile_size; - const int this_tile_height = - (tile_y < tiles_per_col - 1) ? max_tile_size : height - tile_y_offset; int tile_x; - if (tile_y > 0) { - memcpy(upper_row, current_tile_rows + (max_tile_size - 1) * width, - width * sizeof(*upper_row)); - } - memcpy(current_tile_rows, &argb[tile_y_offset * width], - this_tile_height * width * sizeof(*current_tile_rows)); for (tile_x = 0; tile_x < tiles_per_row; ++tile_x) { const int pred = GetBestPredictorForTile(width, height, tile_x, tile_y, - bits, (int (*)[256])histo, argb_scratch, exact); + bits, histo, argb_scratch, argb, max_quantization, exact, + used_subtract_green); image[tile_y * tiles_per_row + tile_x] = ARGB_BLACK | (pred << 8); } } } - CopyImageWithPrediction(width, height, bits, - image, argb_scratch, argb, low_effort, exact); + CopyImageWithPrediction(width, height, bits, image, argb_scratch, argb, + low_effort, max_quantization, exact, + used_subtract_green); } void VP8LSubtractGreenFromBlueAndRed_C(uint32_t* argb_data, int num_pixels) { @@ -1053,6 +1262,17 @@ void VP8LColorSpaceTransform(int width, int height, int bits, int quality, } //------------------------------------------------------------------------------ + +static int VectorMismatch(const uint32_t* const array1, + const uint32_t* const array2, int length) { + int match_len = 0; + + while (match_len < length && array1[match_len] == array2[match_len]) { + ++match_len; + } + return match_len; +} + // Bundles multiple (1, 2, 4 or 8) pixels into a single pixel. void VP8LBundleColorMap(const uint8_t* const row, int width, int xbits, uint32_t* const dst) { @@ -1149,6 +1369,8 @@ GetEntropyUnrefinedHelperFunc VP8LGetEntropyUnrefinedHelper; VP8LHistogramAddFunc VP8LHistogramAdd; +VP8LVectorMismatchFunc VP8LVectorMismatch; + extern void VP8LEncDspInitSSE2(void); extern void VP8LEncDspInitSSE41(void); extern void VP8LEncDspInitNEON(void); @@ -1181,6 +1403,8 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInit(void) { VP8LHistogramAdd = HistogramAdd; + VP8LVectorMismatch = VectorMismatch; + // If defined, use CPUInfo() to overwrite some pointers with faster versions. 
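VP8LVectorMismatch, registered just above with the portable VectorMismatch (an SSE2 variant follows in lossless_enc_sse2.c), returns the length of the common prefix of two pixel runs. A small usage sketch, under the assumption that the encoder's match search would call it roughly like this:

// Hypothetical wrapper, not part of the patch: length of the match between the
// run starting at 'candidate' and the one starting at 'base', capped at max_len.
static int MatchLength(const uint32_t* const argb, int base, int candidate,
                       int max_len) {
  return VP8LVectorMismatch(&argb[candidate], &argb[base], max_len);
}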
if (VP8GetCPUInfo != NULL) { #if defined(WEBP_USE_SSE2) diff --git a/src/3rdparty/libwebp/src/dsp/lossless_enc_sse2.c b/src/3rdparty/libwebp/src/dsp/lossless_enc_sse2.c index e8c9834..7c894e7 100644 --- a/src/3rdparty/libwebp/src/dsp/lossless_enc_sse2.c +++ b/src/3rdparty/libwebp/src/dsp/lossless_enc_sse2.c @@ -325,6 +325,57 @@ static float CombinedShannonEntropy(const int X[256], const int Y[256]) { #undef ANALYZE_XY //------------------------------------------------------------------------------ + +static int VectorMismatch(const uint32_t* const array1, + const uint32_t* const array2, int length) { + int match_len; + + if (length >= 12) { + __m128i A0 = _mm_loadu_si128((const __m128i*)&array1[0]); + __m128i A1 = _mm_loadu_si128((const __m128i*)&array2[0]); + match_len = 0; + do { + // Loop unrolling and early load both provide a speedup of 10% for the + // current function. Also, max_limit can be MAX_LENGTH=4096 at most. + const __m128i cmpA = _mm_cmpeq_epi32(A0, A1); + const __m128i B0 = + _mm_loadu_si128((const __m128i*)&array1[match_len + 4]); + const __m128i B1 = + _mm_loadu_si128((const __m128i*)&array2[match_len + 4]); + if (_mm_movemask_epi8(cmpA) != 0xffff) break; + match_len += 4; + + { + const __m128i cmpB = _mm_cmpeq_epi32(B0, B1); + A0 = _mm_loadu_si128((const __m128i*)&array1[match_len + 4]); + A1 = _mm_loadu_si128((const __m128i*)&array2[match_len + 4]); + if (_mm_movemask_epi8(cmpB) != 0xffff) break; + match_len += 4; + } + } while (match_len + 12 < length); + } else { + match_len = 0; + // Unroll the potential first two loops. + if (length >= 4 && + _mm_movemask_epi8(_mm_cmpeq_epi32( + _mm_loadu_si128((const __m128i*)&array1[0]), + _mm_loadu_si128((const __m128i*)&array2[0]))) == 0xffff) { + match_len = 4; + if (length >= 8 && + _mm_movemask_epi8(_mm_cmpeq_epi32( + _mm_loadu_si128((const __m128i*)&array1[4]), + _mm_loadu_si128((const __m128i*)&array2[4]))) == 0xffff) + match_len = 8; + } + } + + while (match_len < length && array1[match_len] == array2[match_len]) { + ++match_len; + } + return match_len; +} + +//------------------------------------------------------------------------------ // Entry point extern void VP8LEncDspInitSSE2(void); @@ -336,6 +387,7 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE2(void) { VP8LCollectColorRedTransforms = CollectColorRedTransforms; VP8LHistogramAdd = HistogramAdd; VP8LCombinedShannonEntropy = CombinedShannonEntropy; + VP8LVectorMismatch = VectorMismatch; } #else // !WEBP_USE_SSE2 diff --git a/src/3rdparty/libwebp/src/dsp/msa_macro.h b/src/3rdparty/libwebp/src/dsp/msa_macro.h new file mode 100644 index 0000000..5c707f4 --- /dev/null +++ b/src/3rdparty/libwebp/src/dsp/msa_macro.h @@ -0,0 +1,555 @@ +// Copyright 2016 Google Inc. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the COPYING file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. 
+// ----------------------------------------------------------------------------- +// +// MSA common macros +// +// Author(s): Prashant Patil (prashant.patil@imgtec.com) + +#ifndef WEBP_DSP_MSA_MACRO_H_ +#define WEBP_DSP_MSA_MACRO_H_ + +#include <stdint.h> +#include <msa.h> + +#if defined(__clang__) + #define CLANG_BUILD +#endif + +#ifdef CLANG_BUILD + #define ADDVI_H(a, b) __msa_addvi_h((v8i16)a, b) + #define SRAI_H(a, b) __msa_srai_h((v8i16)a, b) + #define SRAI_W(a, b) __msa_srai_w((v4i32)a, b) +#else + #define ADDVI_H(a, b) (a + b) + #define SRAI_H(a, b) (a >> b) + #define SRAI_W(a, b) (a >> b) +#endif + +#define LD_B(RTYPE, psrc) *((RTYPE*)(psrc)) +#define LD_UB(...) LD_B(v16u8, __VA_ARGS__) +#define LD_SB(...) LD_B(v16i8, __VA_ARGS__) + +#define LD_H(RTYPE, psrc) *((RTYPE*)(psrc)) +#define LD_UH(...) LD_H(v8u16, __VA_ARGS__) +#define LD_SH(...) LD_H(v8i16, __VA_ARGS__) + +#define LD_W(RTYPE, psrc) *((RTYPE*)(psrc)) +#define LD_UW(...) LD_W(v4u32, __VA_ARGS__) +#define LD_SW(...) LD_W(v4i32, __VA_ARGS__) + +#define ST_B(RTYPE, in, pdst) *((RTYPE*)(pdst)) = in +#define ST_UB(...) ST_B(v16u8, __VA_ARGS__) +#define ST_SB(...) ST_B(v16i8, __VA_ARGS__) + +#define ST_H(RTYPE, in, pdst) *((RTYPE*)(pdst)) = in +#define ST_UH(...) ST_H(v8u16, __VA_ARGS__) +#define ST_SH(...) ST_H(v8i16, __VA_ARGS__) + +#define ST_W(RTYPE, in, pdst) *((RTYPE*)(pdst)) = in +#define ST_UW(...) ST_W(v4u32, __VA_ARGS__) +#define ST_SW(...) ST_W(v4i32, __VA_ARGS__) + +#define MSA_LOAD_FUNC(TYPE, INSTR, FUNC_NAME) \ + static inline TYPE FUNC_NAME(const void* const psrc) { \ + const uint8_t* const psrc_m = (const uint8_t*)psrc; \ + TYPE val_m; \ + asm volatile ( \ + "" #INSTR " %[val_m], %[psrc_m] \n\t" \ + : [val_m] "=r" (val_m) \ + : [psrc_m] "m" (*psrc_m)); \ + return val_m; \ + } + +#define MSA_LOAD(psrc, FUNC_NAME) FUNC_NAME(psrc) + +#define MSA_STORE_FUNC(TYPE, INSTR, FUNC_NAME) \ + static inline void FUNC_NAME(TYPE val, void* const pdst) { \ + uint8_t* const pdst_m = (uint8_t*)pdst; \ + TYPE val_m = val; \ + asm volatile ( \ + " " #INSTR " %[val_m], %[pdst_m] \n\t" \ + : [pdst_m] "=m" (*pdst_m) \ + : [val_m] "r" (val_m)); \ + } + +#define MSA_STORE(val, pdst, FUNC_NAME) FUNC_NAME(val, pdst) + +#if (__mips_isa_rev >= 6) + MSA_LOAD_FUNC(uint16_t, lh, msa_lh); + #define LH(psrc) MSA_LOAD(psrc, msa_lh) + MSA_LOAD_FUNC(uint32_t, lw, msa_lw); + #define LW(psrc) MSA_LOAD(psrc, msa_lw) + #if (__mips == 64) + MSA_LOAD_FUNC(uint64_t, ld, msa_ld); + #define LD(psrc) MSA_LOAD(psrc, msa_ld) + #else // !(__mips == 64) + #define LD(psrc) ((((uint64_t)MSA_LOAD(psrc + 4, msa_lw)) << 32) | \ + MSA_LOAD(psrc, msa_lw)) + #endif // (__mips == 64) + + MSA_STORE_FUNC(uint16_t, sh, msa_sh); + #define SH(val, pdst) MSA_STORE(val, pdst, msa_sh) + MSA_STORE_FUNC(uint32_t, sw, msa_sw); + #define SW(val, pdst) MSA_STORE(val, pdst, msa_sw) + MSA_STORE_FUNC(uint64_t, sd, msa_sd); + #define SD(val, pdst) MSA_STORE(val, pdst, msa_sd) +#else // !(__mips_isa_rev >= 6) + MSA_LOAD_FUNC(uint16_t, ulh, msa_ulh); + #define LH(psrc) MSA_LOAD(psrc, msa_ulh) + MSA_LOAD_FUNC(uint32_t, ulw, msa_ulw); + #define LW(psrc) MSA_LOAD(psrc, msa_ulw) + #if (__mips == 64) + MSA_LOAD_FUNC(uint64_t, uld, msa_uld); + #define LD(psrc) MSA_LOAD(psrc, msa_uld) + #else // !(__mips == 64) + #define LD(psrc) ((((uint64_t)MSA_LOAD(psrc + 4, msa_ulw)) << 32) | \ + MSA_LOAD(psrc, msa_ulw)) + #endif // (__mips == 64) + + MSA_STORE_FUNC(uint16_t, ush, msa_ush); + #define SH(val, pdst) MSA_STORE(val, pdst, msa_ush) + MSA_STORE_FUNC(uint32_t, usw, msa_usw); + #define SW(val, pdst) 
MSA_STORE(val, pdst, msa_usw) + #define SD(val, pdst) { \ + uint8_t* const pdst_sd_m = (uint8_t*)(pdst); \ + const uint32_t val0_m = (uint32_t)(val & 0x00000000FFFFFFFF); \ + const uint32_t val1_m = (uint32_t)((val >> 32) & 0x00000000FFFFFFFF); \ + SW(val0_m, pdst_sd_m); \ + SW(val1_m, pdst_sd_m + 4); \ + } +#endif // (__mips_isa_rev >= 6) + +/* Description : Load 4 words with stride + * Arguments : Inputs - psrc, stride + * Outputs - out0, out1, out2, out3 + * Details : Load word in 'out0' from (psrc) + * Load word in 'out1' from (psrc + stride) + * Load word in 'out2' from (psrc + 2 * stride) + * Load word in 'out3' from (psrc + 3 * stride) + */ +#define LW4(psrc, stride, out0, out1, out2, out3) { \ + const uint8_t* ptmp = (const uint8_t*)psrc; \ + out0 = LW(ptmp); \ + ptmp += stride; \ + out1 = LW(ptmp); \ + ptmp += stride; \ + out2 = LW(ptmp); \ + ptmp += stride; \ + out3 = LW(ptmp); \ +} + +/* Description : Store 4 words with stride + * Arguments : Inputs - in0, in1, in2, in3, pdst, stride + * Details : Store word from 'in0' to (pdst) + * Store word from 'in1' to (pdst + stride) + * Store word from 'in2' to (pdst + 2 * stride) + * Store word from 'in3' to (pdst + 3 * stride) + */ +#define SW4(in0, in1, in2, in3, pdst, stride) { \ + uint8_t* ptmp = (uint8_t*)pdst; \ + SW(in0, ptmp); \ + ptmp += stride; \ + SW(in1, ptmp); \ + ptmp += stride; \ + SW(in2, ptmp); \ + ptmp += stride; \ + SW(in3, ptmp); \ +} + +/* Description : Load vectors with 16 byte elements with stride + * Arguments : Inputs - psrc, stride + * Outputs - out0, out1 + * Return Type - as per RTYPE + * Details : Load 16 byte elements in 'out0' from (psrc) + * Load 16 byte elements in 'out1' from (psrc + stride) + */ +#define LD_B2(RTYPE, psrc, stride, out0, out1) { \ + out0 = LD_B(RTYPE, psrc); \ + out1 = LD_B(RTYPE, psrc + stride); \ +} +#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__) +#define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__) + +#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) { \ + LD_B2(RTYPE, psrc, stride, out0, out1); \ + LD_B2(RTYPE, psrc + 2 * stride , stride, out2, out3); \ +} +#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__) +#define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__) + +/* Description : Load vectors with 8 halfword elements with stride + * Arguments : Inputs - psrc, stride + * Outputs - out0, out1 + * Details : Load 8 halfword elements in 'out0' from (psrc) + * Load 8 halfword elements in 'out1' from (psrc + stride) + */ +#define LD_H2(RTYPE, psrc, stride, out0, out1) { \ + out0 = LD_H(RTYPE, psrc); \ + out1 = LD_H(RTYPE, psrc + stride); \ +} +#define LD_UH2(...) LD_H2(v8u16, __VA_ARGS__) +#define LD_SH2(...) 
LD_H2(v8i16, __VA_ARGS__) + +/* Description : Store 4x4 byte block to destination memory from input vector + * Arguments : Inputs - in0, in1, pdst, stride + * Details : 'Idx0' word element from input vector 'in0' is copied to the + * GP register and stored to (pdst) + * 'Idx1' word element from input vector 'in0' is copied to the + * GP register and stored to (pdst + stride) + * 'Idx2' word element from input vector 'in0' is copied to the + * GP register and stored to (pdst + 2 * stride) + * 'Idx3' word element from input vector 'in0' is copied to the + * GP register and stored to (pdst + 3 * stride) + */ +#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) { \ + uint8_t* const pblk_4x4_m = (uint8_t*)pdst; \ + const uint32_t out0_m = __msa_copy_s_w((v4i32)in0, idx0); \ + const uint32_t out1_m = __msa_copy_s_w((v4i32)in0, idx1); \ + const uint32_t out2_m = __msa_copy_s_w((v4i32)in1, idx2); \ + const uint32_t out3_m = __msa_copy_s_w((v4i32)in1, idx3); \ + SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride); \ +} + +/* Description : Immediate number of elements to slide + * Arguments : Inputs - in0, in1, slide_val + * Outputs - out + * Return Type - as per RTYPE + * Details : Byte elements from 'in1' vector are slid into 'in0' by + * value specified in the 'slide_val' + */ +#define SLDI_B(RTYPE, in0, in1, slide_val) \ + (RTYPE)__msa_sldi_b((v16i8)in0, (v16i8)in1, slide_val) \ + +#define SLDI_UB(...) SLDI_B(v16u8, __VA_ARGS__) +#define SLDI_SB(...) SLDI_B(v16i8, __VA_ARGS__) +#define SLDI_SH(...) SLDI_B(v8i16, __VA_ARGS__) + +/* Description : Shuffle halfword vector elements as per mask vector + * Arguments : Inputs - in0, in1, in2, in3, mask0, mask1 + * Outputs - out0, out1 + * Return Type - as per RTYPE + * Details : halfword elements from 'in0' & 'in1' are copied selectively to + * 'out0' as per control vector 'mask0' + */ +#define VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) { \ + out0 = (RTYPE)__msa_vshf_h((v8i16)mask0, (v8i16)in1, (v8i16)in0); \ + out1 = (RTYPE)__msa_vshf_h((v8i16)mask1, (v8i16)in3, (v8i16)in2); \ +} +#define VSHF_H2_UH(...) VSHF_H2(v8u16, __VA_ARGS__) +#define VSHF_H2_SH(...) VSHF_H2(v8i16, __VA_ARGS__) + +/* Description : Clips all signed halfword elements of input vector + * between 0 & 255 + * Arguments : Input/output - val + * Return Type - signed halfword + */ +#define CLIP_SH_0_255(val) { \ + const v8i16 max_m = __msa_ldi_h(255); \ + val = __msa_maxi_s_h((v8i16)val, 0); \ + val = __msa_min_s_h(max_m, (v8i16)val); \ +} +#define CLIP_SH2_0_255(in0, in1) { \ + CLIP_SH_0_255(in0); \ + CLIP_SH_0_255(in1); \ +} + +/* Description : Clips all signed word elements of input vector + * between 0 & 255 + * Arguments : Input/output - val + * Return Type - signed word + */ +#define CLIP_SW_0_255(val) { \ + const v4i32 max_m = __msa_ldi_w(255); \ + val = __msa_maxi_s_w((v4i32)val, 0); \ + val = __msa_min_s_w(max_m, (v4i32)val); \ +} +#define CLIP_SW4_0_255(in0, in1, in2, in3) { \ + CLIP_SW_0_255(in0); \ + CLIP_SW_0_255(in1); \ + CLIP_SW_0_255(in2); \ + CLIP_SW_0_255(in3); \ +} + +/* Description : Set element n input vector to GPR value + * Arguments : Inputs - in0, in1, in2, in3 + * Output - out + * Return Type - as per RTYPE + * Details : Set element 0 in vector 'out' to value specified in 'in0' + */ +#define INSERT_W2(RTYPE, in0, in1, out) { \ + out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \ + out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \ +} +#define INSERT_W2_UB(...) INSERT_W2(v16u8, __VA_ARGS__) +#define INSERT_W2_SB(...) 
INSERT_W2(v16i8, __VA_ARGS__) + +#define INSERT_W4(RTYPE, in0, in1, in2, in3, out) { \ + out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \ + out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \ + out = (RTYPE)__msa_insert_w((v4i32)out, 2, in2); \ + out = (RTYPE)__msa_insert_w((v4i32)out, 3, in3); \ +} +#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__) +#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__) +#define INSERT_W4_SW(...) INSERT_W4(v4i32, __VA_ARGS__) + +/* Description : Interleave right half of byte elements from vectors + * Arguments : Inputs - in0, in1, in2, in3 + * Outputs - out0, out1 + * Return Type - as per RTYPE + * Details : Right half of byte elements of 'in0' and 'in1' are interleaved + * and written to out0. + */ +#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \ + out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \ + out1 = (RTYPE)__msa_ilvr_b((v16i8)in2, (v16i8)in3); \ +} +#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__) +#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__) +#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__) +#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__) +#define ILVR_B2_SW(...) ILVR_B2(v4i32, __VA_ARGS__) + +#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3) { \ + ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ +} +#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__) +#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__) +#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__) +#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__) +#define ILVR_B4_SW(...) ILVR_B4(v4i32, __VA_ARGS__) + +/* Description : Interleave right half of halfword elements from vectors + * Arguments : Inputs - in0, in1, in2, in3 + * Outputs - out0, out1 + * Return Type - as per RTYPE + * Details : Right half of halfword elements of 'in0' and 'in1' are + * interleaved and written to 'out0'. + */ +#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \ + out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \ + out1 = (RTYPE)__msa_ilvr_h((v8i16)in2, (v8i16)in3); \ +} +#define ILVR_H2_UB(...) ILVR_H2(v16u8, __VA_ARGS__) +#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__) +#define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__) + +#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3) { \ + ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3); \ +} +#define ILVR_H4_UB(...) ILVR_H4(v16u8, __VA_ARGS__) +#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__) +#define ILVR_H4_SW(...) ILVR_H4(v4i32, __VA_ARGS__) + +/* Description : Interleave right half of double word elements from vectors + * Arguments : Inputs - in0, in1, in2, in3 + * Outputs - out0, out1 + * Return Type - as per RTYPE + * Details : Right half of double word elements of 'in0' and 'in1' are + * interleaved and written to 'out0'. + */ +#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) { \ + out0 = (RTYPE)__msa_ilvr_d((v2i64)in0, (v2i64)in1); \ + out1 = (RTYPE)__msa_ilvr_d((v2i64)in2, (v2i64)in3); \ +} +#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__) +#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__) +#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__) + +#define ILVRL_H2(RTYPE, in0, in1, out0, out1) { \ + out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \ + out1 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \ +} +#define ILVRL_H2_UB(...) ILVRL_H2(v16u8, __VA_ARGS__) +#define ILVRL_H2_SB(...) 
ILVRL_H2(v16i8, __VA_ARGS__) +#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__) +#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__) +#define ILVRL_H2_UW(...) ILVRL_H2(v4u32, __VA_ARGS__) + +#define ILVRL_W2(RTYPE, in0, in1, out0, out1) { \ + out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \ + out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \ +} +#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__) +#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__) +#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__) + +/* Description : Pack even byte elements of vector pairs + * Arguments : Inputs - in0, in1, in2, in3 + * Outputs - out0, out1 + * Return Type - as per RTYPE + * Details : Even byte elements of 'in0' are copied to the left half of + * 'out0' & even byte elements of 'in1' are copied to the right + * half of 'out0'. + */ +#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \ + out0 = (RTYPE)__msa_pckev_b((v16i8)in0, (v16i8)in1); \ + out1 = (RTYPE)__msa_pckev_b((v16i8)in2, (v16i8)in3); \ +} +#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__) +#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__) +#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__) +#define PCKEV_B2_SW(...) PCKEV_B2(v4i32, __VA_ARGS__) + +/* Description : Arithmetic immediate shift right all elements of word vector + * Arguments : Inputs - in0, in1, shift + * Outputs - in place operation + * Return Type - as per input vector RTYPE + * Details : Each element of vector 'in0' is right shifted by 'shift' and + * the result is written in-place. 'shift' is a GP variable. + */ +#define SRAI_W2(RTYPE, in0, in1, shift_val) { \ + in0 = (RTYPE)SRAI_W(in0, shift_val); \ + in1 = (RTYPE)SRAI_W(in1, shift_val); \ +} +#define SRAI_W2_SW(...) SRAI_W2(v4i32, __VA_ARGS__) +#define SRAI_W2_UW(...) SRAI_W2(v4u32, __VA_ARGS__) + +#define SRAI_W4(RTYPE, in0, in1, in2, in3, shift_val) { \ + SRAI_W2(RTYPE, in0, in1, shift_val); \ + SRAI_W2(RTYPE, in2, in3, shift_val); \ +} +#define SRAI_W4_SW(...) SRAI_W4(v4i32, __VA_ARGS__) +#define SRAI_W4_UW(...) SRAI_W4(v4u32, __VA_ARGS__) + +/* Description : Arithmetic shift right all elements of half-word vector + * Arguments : Inputs - in0, in1, shift + * Outputs - in place operation + * Return Type - as per input vector RTYPE + * Details : Each element of vector 'in0' is right shifted by 'shift' and + * the result is written in-place. 'shift' is a GP variable. + */ +#define SRAI_H2(RTYPE, in0, in1, shift_val) { \ + in0 = (RTYPE)SRAI_H(in0, shift_val); \ + in1 = (RTYPE)SRAI_H(in1, shift_val); \ +} +#define SRAI_H2_SH(...) SRAI_H2(v8i16, __VA_ARGS__) +#define SRAI_H2_UH(...) SRAI_H2(v8u16, __VA_ARGS__) + +/* Description : Arithmetic rounded shift right all elements of word vector + * Arguments : Inputs - in0, in1, shift + * Outputs - in place operation + * Return Type - as per input vector RTYPE + * Details : Each element of vector 'in0' is right shifted by 'shift' and + * the result is written in-place. 'shift' is a GP variable. + */ +#define SRARI_W2(RTYPE, in0, in1, shift) { \ + in0 = (RTYPE)__msa_srari_w((v4i32)in0, shift); \ + in1 = (RTYPE)__msa_srari_w((v4i32)in1, shift); \ +} +#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__) + +#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) { \ + SRARI_W2(RTYPE, in0, in1, shift); \ + SRARI_W2(RTYPE, in2, in3, shift); \ +} +#define SRARI_W4_SH(...) SRARI_W4(v8i16, __VA_ARGS__) +#define SRARI_W4_UW(...) SRARI_W4(v4u32, __VA_ARGS__) +#define SRARI_W4_SW(...) 
SRARI_W4(v4i32, __VA_ARGS__) + +/* Description : Addition of 2 pairs of half-word vectors + * Arguments : Inputs - in0, in1, in2, in3 + * Outputs - out0, out1 + * Details : Each element in 'in0' is added to 'in1' and result is written + * to 'out0'. + */ +#define ADDVI_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \ + out0 = (RTYPE)ADDVI_H(in0, in1); \ + out1 = (RTYPE)ADDVI_H(in2, in3); \ +} +#define ADDVI_H2_SH(...) ADDVI_H2(v8i16, __VA_ARGS__) +#define ADDVI_H2_UH(...) ADDVI_H2(v8u16, __VA_ARGS__) + +/* Description : Addition of 2 pairs of vectors + * Arguments : Inputs - in0, in1, in2, in3 + * Outputs - out0, out1 + * Details : Each element in 'in0' is added to 'in1' and result is written + * to 'out0'. + */ +#define ADD2(in0, in1, in2, in3, out0, out1) { \ + out0 = in0 + in1; \ + out1 = in2 + in3; \ +} +#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3) { \ + ADD2(in0, in1, in2, in3, out0, out1); \ + ADD2(in4, in5, in6, in7, out2, out3); \ +} + +/* Description : Sign extend halfword elements from input vector and return + * the result in pair of vectors + * Arguments : Input - in (halfword vector) + * Outputs - out0, out1 (sign extended word vectors) + * Return Type - signed word + * Details : Sign bit of halfword elements from input vector 'in' is + * extracted and interleaved right with same vector 'in0' to + * generate 4 signed word elements in 'out0' + * Then interleaved left with same vector 'in0' to + * generate 4 signed word elements in 'out1' + */ +#define UNPCK_SH_SW(in, out0, out1) { \ + const v8i16 tmp_m = __msa_clti_s_h((v8i16)in, 0); \ + ILVRL_H2_SW(tmp_m, in, out0, out1); \ +} + +/* Description : Butterfly of 4 input vectors + * Arguments : Inputs - in0, in1, in2, in3 + * Outputs - out0, out1, out2, out3 + * Details : Butterfly operation + */ +#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) { \ + out0 = in0 + in3; \ + out1 = in1 + in2; \ + out2 = in1 - in2; \ + out3 = in0 - in3; \ +} + +/* Description : Transpose 4x4 block with word elements in vectors + * Arguments : Inputs - in0, in1, in2, in3 + * Outputs - out0, out1, out2, out3 + * Return Type - as per RTYPE + */ +#define TRANSPOSE4x4_W(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) { \ + v4i32 s0_m, s1_m, s2_m, s3_m; \ + ILVRL_W2_SW(in1, in0, s0_m, s1_m); \ + ILVRL_W2_SW(in3, in2, s2_m, s3_m); \ + out0 = (RTYPE)__msa_ilvr_d((v2i64)s2_m, (v2i64)s0_m); \ + out1 = (RTYPE)__msa_ilvl_d((v2i64)s2_m, (v2i64)s0_m); \ + out2 = (RTYPE)__msa_ilvr_d((v2i64)s3_m, (v2i64)s1_m); \ + out3 = (RTYPE)__msa_ilvl_d((v2i64)s3_m, (v2i64)s1_m); \ +} +#define TRANSPOSE4x4_SW_SW(...) TRANSPOSE4x4_W(v4i32, __VA_ARGS__) + +/* Description : Add block 4x4 + * Arguments : Inputs - in0, in1, in2, in3, pdst, stride + * Details : Least significant 4 bytes from each input vector are added to + * the destination bytes, clipped between 0-255 and stored. 
+ */ +#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride) { \ + uint32_t src0_m, src1_m, src2_m, src3_m; \ + v8i16 inp0_m, inp1_m, res0_m, res1_m; \ + v16i8 dst0_m = { 0 }; \ + v16i8 dst1_m = { 0 }; \ + const v16i8 zero_m = { 0 }; \ + ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m); \ + LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m); \ + INSERT_W2_SB(src0_m, src1_m, dst0_m); \ + INSERT_W2_SB(src2_m, src3_m, dst1_m); \ + ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m); \ + ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m); \ + CLIP_SH2_0_255(res0_m, res1_m); \ + PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m); \ + ST4x4_UB(dst0_m, dst1_m, 0, 1, 0, 1, pdst, stride); \ +} + +#endif /* WEBP_DSP_MSA_MACRO_H_ */ diff --git a/src/3rdparty/libwebp/src/dsp/rescaler_sse2.c b/src/3rdparty/libwebp/src/dsp/rescaler_sse2.c index 5ea4ddb..5b97028 100644 --- a/src/3rdparty/libwebp/src/dsp/rescaler_sse2.c +++ b/src/3rdparty/libwebp/src/dsp/rescaler_sse2.c @@ -18,6 +18,7 @@ #include <assert.h> #include "../utils/rescaler.h" +#include "../utils/utils.h" //------------------------------------------------------------------------------ // Implementations of critical functions ImportRow / ExportRow diff --git a/src/3rdparty/libwebp/src/dsp/upsampling_mips_dsp_r2.c b/src/3rdparty/libwebp/src/dsp/upsampling_mips_dsp_r2.c index d4ccbe0..ed2eb74 100644 --- a/src/3rdparty/libwebp/src/dsp/upsampling_mips_dsp_r2.c +++ b/src/3rdparty/libwebp/src/dsp/upsampling_mips_dsp_r2.c @@ -14,9 +14,7 @@ #include "./dsp.h" -// Code is disabled for now, in favor of the plain-C version -// TODO(djordje.pesut): adapt the code to reflect the C-version. -#if 0 // defined(WEBP_USE_MIPS_DSP_R2) +#if defined(WEBP_USE_MIPS_DSP_R2) #include <assert.h> #include "./yuv.h" @@ -24,21 +22,21 @@ #if !defined(WEBP_YUV_USE_TABLE) #define YUV_TO_RGB(Y, U, V, R, G, B) do { \ - const int t1 = kYScale * Y; \ - const int t2 = kVToG * V; \ - R = kVToR * V; \ - G = kUToG * U; \ - B = kUToB * U; \ + const int t1 = MultHi(Y, 19077); \ + const int t2 = MultHi(V, 13320); \ + R = MultHi(V, 26149); \ + G = MultHi(U, 6419); \ + B = MultHi(U, 33050); \ R = t1 + R; \ G = t1 - G; \ B = t1 + B; \ - R = R + kRCst; \ - G = G - t2 + kGCst; \ - B = B + kBCst; \ + R = R - 14234; \ + G = G - t2 + 8708; \ + B = B - 17685; \ __asm__ volatile ( \ - "shll_s.w %[" #R "], %[" #R "], 9 \n\t" \ - "shll_s.w %[" #G "], %[" #G "], 9 \n\t" \ - "shll_s.w %[" #B "], %[" #B "], 9 \n\t" \ + "shll_s.w %[" #R "], %[" #R "], 17 \n\t" \ + "shll_s.w %[" #G "], %[" #G "], 17 \n\t" \ + "shll_s.w %[" #B "], %[" #B "], 17 \n\t" \ "precrqu_s.qb.ph %[" #R "], %[" #R "], $zero \n\t" \ "precrqu_s.qb.ph %[" #G "], %[" #G "], $zero \n\t" \ "precrqu_s.qb.ph %[" #B "], %[" #B "], $zero \n\t" \ @@ -279,6 +277,6 @@ WEBP_DSP_INIT_STUB(WebPInitYUV444ConvertersMIPSdspR2) #endif // WEBP_USE_MIPS_DSP_R2 -#if 1 // !(defined(FANCY_UPSAMPLING) && defined(WEBP_USE_MIPS_DSP_R2)) +#if !(defined(FANCY_UPSAMPLING) && defined(WEBP_USE_MIPS_DSP_R2)) WEBP_DSP_INIT_STUB(WebPInitUpsamplersMIPSdspR2) #endif diff --git a/src/3rdparty/libwebp/src/dsp/yuv_mips32.c b/src/3rdparty/libwebp/src/dsp/yuv_mips32.c index b8fe512..e61aac5 100644 --- a/src/3rdparty/libwebp/src/dsp/yuv_mips32.c +++ b/src/3rdparty/libwebp/src/dsp/yuv_mips32.c @@ -14,8 +14,7 @@ #include "./dsp.h" -// Code is disabled for now, in favor of the plain-C version -#if 0 // defined(WEBP_USE_MIPS32) +#if defined(WEBP_USE_MIPS32) #include "./yuv.h" @@ -29,19 +28,19 @@ static void FUNC_NAME(const uint8_t* y, \ int i, r, g, b; \ int 
temp0, temp1, temp2, temp3, temp4; \ for (i = 0; i < (len >> 1); i++) { \ - temp1 = kVToR * v[0]; \ - temp3 = kVToG * v[0]; \ - temp2 = kUToG * u[0]; \ - temp4 = kUToB * u[0]; \ - temp0 = kYScale * y[0]; \ - temp1 += kRCst; \ - temp3 -= kGCst; \ + temp1 = MultHi(v[0], 26149); \ + temp3 = MultHi(v[0], 13320); \ + temp2 = MultHi(u[0], 6419); \ + temp4 = MultHi(u[0], 33050); \ + temp0 = MultHi(y[0], 19077); \ + temp1 -= 14234; \ + temp3 -= 8708; \ temp2 += temp3; \ - temp4 += kBCst; \ + temp4 -= 17685; \ r = VP8Clip8(temp0 + temp1); \ g = VP8Clip8(temp0 - temp2); \ b = VP8Clip8(temp0 + temp4); \ - temp0 = kYScale * y[1]; \ + temp0 = MultHi(y[1], 19077); \ dst[R] = r; \ dst[G] = g; \ dst[B] = b; \ @@ -59,15 +58,15 @@ static void FUNC_NAME(const uint8_t* y, \ dst += 2 * XSTEP; \ } \ if (len & 1) { \ - temp1 = kVToR * v[0]; \ - temp3 = kVToG * v[0]; \ - temp2 = kUToG * u[0]; \ - temp4 = kUToB * u[0]; \ - temp0 = kYScale * y[0]; \ - temp1 += kRCst; \ - temp3 -= kGCst; \ + temp1 = MultHi(v[0], 26149); \ + temp3 = MultHi(v[0], 13320); \ + temp2 = MultHi(u[0], 6419); \ + temp4 = MultHi(u[0], 33050); \ + temp0 = MultHi(y[0], 19077); \ + temp1 -= 14234; \ + temp3 -= 8708; \ temp2 += temp3; \ - temp4 += kBCst; \ + temp4 -= 17685; \ r = VP8Clip8(temp0 + temp1); \ g = VP8Clip8(temp0 - temp2); \ b = VP8Clip8(temp0 + temp4); \ diff --git a/src/3rdparty/libwebp/src/dsp/yuv_mips_dsp_r2.c b/src/3rdparty/libwebp/src/dsp/yuv_mips_dsp_r2.c index dea0fdb..1720d41 100644 --- a/src/3rdparty/libwebp/src/dsp/yuv_mips_dsp_r2.c +++ b/src/3rdparty/libwebp/src/dsp/yuv_mips_dsp_r2.c @@ -14,8 +14,7 @@ #include "./dsp.h" -// Code is disabled for now, in favor of the plain-C version -#if 0 // defined(WEBP_USE_MIPS_DSP_R2) +#if defined(WEBP_USE_MIPS_DSP_R2) #include "./yuv.h" @@ -31,10 +30,10 @@ "mul %[temp2], %[t_con_3], %[temp4] \n\t" \ "mul %[temp4], %[t_con_4], %[temp4] \n\t" \ "mul %[temp0], %[t_con_5], %[temp0] \n\t" \ - "addu %[temp1], %[temp1], %[t_con_6] \n\t" \ + "subu %[temp1], %[temp1], %[t_con_6] \n\t" \ "subu %[temp3], %[temp3], %[t_con_7] \n\t" \ "addu %[temp2], %[temp2], %[temp3] \n\t" \ - "addu %[temp4], %[temp4], %[t_con_8] \n\t" \ + "subu %[temp4], %[temp4], %[t_con_8] \n\t" \ #define ROW_FUNC_PART_2(R, G, B, K) \ "addu %[temp5], %[temp0], %[temp1] \n\t" \ @@ -43,12 +42,12 @@ ".if " #K " \n\t" \ "lbu %[temp0], 1(%[y]) \n\t" \ ".endif \n\t" \ - "shll_s.w %[temp5], %[temp5], 9 \n\t" \ - "shll_s.w %[temp6], %[temp6], 9 \n\t" \ + "shll_s.w %[temp5], %[temp5], 17 \n\t" \ + "shll_s.w %[temp6], %[temp6], 17 \n\t" \ ".if " #K " \n\t" \ "mul %[temp0], %[t_con_5], %[temp0] \n\t" \ ".endif \n\t" \ - "shll_s.w %[temp7], %[temp7], 9 \n\t" \ + "shll_s.w %[temp7], %[temp7], 17 \n\t" \ "precrqu_s.qb.ph %[temp5], %[temp5], $zero \n\t" \ "precrqu_s.qb.ph %[temp6], %[temp6], $zero \n\t" \ "precrqu_s.qb.ph %[temp7], %[temp7], $zero \n\t" \ @@ -75,14 +74,14 @@ static void FUNC_NAME(const uint8_t* y, \ uint8_t* dst, int len) { \ int i; \ uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; \ - const int t_con_1 = kVToR; \ - const int t_con_2 = kVToG; \ - const int t_con_3 = kUToG; \ - const int t_con_4 = kUToB; \ - const int t_con_5 = kYScale; \ - const int t_con_6 = kRCst; \ - const int t_con_7 = kGCst; \ - const int t_con_8 = kBCst; \ + const int t_con_1 = 26149; \ + const int t_con_2 = 13320; \ + const int t_con_3 = 6419; \ + const int t_con_4 = 33050; \ + const int t_con_5 = 19077; \ + const int t_con_6 = 14234; \ + const int t_con_7 = 8708; \ + const int t_con_8 = 17685; \ for (i = 0; i < (len >> 1); i++) { \ 
__asm__ volatile ( \ ROW_FUNC_PART_1() \ diff --git a/src/3rdparty/libwebp/src/dsp/yuv_sse2.c b/src/3rdparty/libwebp/src/dsp/yuv_sse2.c index f72fe32..e19bddf 100644 --- a/src/3rdparty/libwebp/src/dsp/yuv_sse2.c +++ b/src/3rdparty/libwebp/src/dsp/yuv_sse2.c @@ -33,7 +33,8 @@ static void ConvertYUV444ToRGB(const __m128i* const Y0, const __m128i k19077 = _mm_set1_epi16(19077); const __m128i k26149 = _mm_set1_epi16(26149); const __m128i k14234 = _mm_set1_epi16(14234); - const __m128i k33050 = _mm_set1_epi16(33050); + // 33050 doesn't fit in a signed short: only use this with unsigned arithmetic + const __m128i k33050 = _mm_set1_epi16((short)33050); const __m128i k17685 = _mm_set1_epi16(17685); const __m128i k6419 = _mm_set1_epi16(6419); const __m128i k13320 = _mm_set1_epi16(13320);
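
Editor's reference note on the new YUV coefficients used throughout the MIPS32, MIPS DSP-R2 and SSE2 hunks above: the old kYScale/kVToR/kUToG-style constants are replaced by the 16-bit fixed-point coefficients 19077, 26149, 6419, 13320 and 33050 combined through MultHi(), with the offsets 14234, 8708 and 17685 applied directly. The plain-C sketch below mirrors that arithmetic for a single pixel. It is illustrative only: the helper names mult_hi, clip8 and yuv_to_rgb_sketch are hypothetical rather than libwebp API, and the assumption that the intermediate values carry 6 fractional bits (libwebp's YUV_FIX2) comes from yuv.h, not from this patch.

/* Illustrative sketch of the fixed-point YUV->RGB math used by the hunks
 * above. Only the coefficients and the MultHi()-style ">> 8" come from the
 * patch itself; everything else is an assumption for readability. */
#include <stdint.h>

static int mult_hi(int v, int coeff) {   /* same shape as MultHi(): (v * coeff) >> 8 */
  return (v * coeff) >> 8;
}

static uint8_t clip8(int v) {            /* drop the assumed 6 fractional bits, clamp to [0, 255] */
  v >>= 6;
  return (uint8_t)((v < 0) ? 0 : (v > 255) ? 255 : v);
}

static void yuv_to_rgb_sketch(int y, int u, int v,
                              uint8_t* r, uint8_t* g, uint8_t* b) {
  const int t1 = mult_hi(y, 19077);      /* luma term shared by R, G and B */
  *r = clip8(t1 + mult_hi(v, 26149) - 14234);
  *g = clip8(t1 - mult_hi(u, 6419) - mult_hi(v, 13320) + 8708);
  *b = clip8(t1 + mult_hi(u, 33050) - 17685);
}

For example, yuv_to_rgb_sketch(128, 128, 128, &r, &g, &b) yields (130, 130, 130): neutral chroma maps to gray, and Y = 128 expands to roughly 1.164 * (128 - 16). In the DSP-R2 hunks the saturating "shll_s.w ..., 17" / "precrqu_s.qb.ph" pairs play the role of clip8(); the shift grows from 9 to 17 because MultHi() has already discarded 8 fractional bits, and k33050 needs the (short) cast for the same reason the in-diff comment gives: 33050 only makes sense under unsigned 16-bit arithmetic.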