Diffstat (limited to 'src/3rdparty/libwebp/src/dsp')
25 files changed, 1830 insertions, 748 deletions
diff --git a/src/3rdparty/libwebp/src/dsp/common_sse2.h b/src/3rdparty/libwebp/src/dsp/common_sse2.h
new file mode 100644
index 0000000..7cea13f
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/common_sse2.h
@@ -0,0 +1,109 @@
+// Copyright 2016 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// SSE2 code common to several files.
+//
+// Author: Vincent Rabaud (vrabaud@google.com)
+
+#ifndef WEBP_DSP_COMMON_SSE2_H_
+#define WEBP_DSP_COMMON_SSE2_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(WEBP_USE_SSE2)
+
+#include <emmintrin.h>
+
+//------------------------------------------------------------------------------
+// Quite useful macro for debugging. Left here for convenience.
+
+#if 0
+#include <stdio.h>
+static WEBP_INLINE void PrintReg(const __m128i r, const char* const name,
+                                 int size) {
+  int n;
+  union {
+    __m128i r;
+    uint8_t i8[16];
+    uint16_t i16[8];
+    uint32_t i32[4];
+    uint64_t i64[2];
+  } tmp;
+  tmp.r = r;
+  fprintf(stderr, "%s\t: ", name);
+  if (size == 8) {
+    for (n = 0; n < 16; ++n) fprintf(stderr, "%.2x ", tmp.i8[n]);
+  } else if (size == 16) {
+    for (n = 0; n < 8; ++n) fprintf(stderr, "%.4x ", tmp.i16[n]);
+  } else if (size == 32) {
+    for (n = 0; n < 4; ++n) fprintf(stderr, "%.8x ", tmp.i32[n]);
+  } else {
+    for (n = 0; n < 2; ++n) fprintf(stderr, "%.16lx ", tmp.i64[n]);
+  }
+  fprintf(stderr, "\n");
+}
+#endif
+
+//------------------------------------------------------------------------------
+// Math functions.
+
+// Return the sum of all the 8b in the register.
+static WEBP_INLINE int VP8HorizontalAdd8b(const __m128i* const a) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i sad8x2 = _mm_sad_epu8(*a, zero);
+  // sum the two sads: sad8x2[0:1] + sad8x2[8:9]
+  const __m128i sum = _mm_add_epi32(sad8x2, _mm_shuffle_epi32(sad8x2, 2));
+  return _mm_cvtsi128_si32(sum);
+}
+
+// Transpose two 4x4 16b matrices horizontally stored in registers.
+static WEBP_INLINE void VP8Transpose_2_4x4_16b(
+    const __m128i* const in0, const __m128i* const in1,
+    const __m128i* const in2, const __m128i* const in3, __m128i* const out0,
+    __m128i* const out1, __m128i* const out2, __m128i* const out3) {
+  // Transpose the two 4x4.
+  // a00 a01 a02 a03   b00 b01 b02 b03
+  // a10 a11 a12 a13   b10 b11 b12 b13
+  // a20 a21 a22 a23   b20 b21 b22 b23
+  // a30 a31 a32 a33   b30 b31 b32 b33
+  const __m128i transpose0_0 = _mm_unpacklo_epi16(*in0, *in1);
+  const __m128i transpose0_1 = _mm_unpacklo_epi16(*in2, *in3);
+  const __m128i transpose0_2 = _mm_unpackhi_epi16(*in0, *in1);
+  const __m128i transpose0_3 = _mm_unpackhi_epi16(*in2, *in3);
+  // a00 a10 a01 a11   a02 a12 a03 a13
+  // a20 a30 a21 a31   a22 a32 a23 a33
+  // b00 b10 b01 b11   b02 b12 b03 b13
+  // b20 b30 b21 b31   b22 b32 b23 b33
+  const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
+  const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);
+  const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
+  const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);
+  // a00 a10 a20 a30   a01 a11 a21 a31
+  // b00 b10 b20 b30   b01 b11 b21 b31
+  // a02 a12 a22 a32   a03 a13 a23 a33
+  // b02 b12 a22 b32   b03 b13 b23 b33
+  *out0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);
+  *out1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);
+  *out2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);
+  *out3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);
+  // a00 a10 a20 a30   b00 b10 b20 b30
+  // a01 a11 a21 a31   b01 b11 b21 b31
+  // a02 a12 a22 a32   b02 b12 b22 b32
+  // a03 a13 a23 a33   b03 b13 b23 b33
+}
+
+#endif  // WEBP_USE_SSE2
+
+#ifdef __cplusplus
+}    // extern "C"
+#endif
+
+#endif  // WEBP_DSP_COMMON_SSE2_H_
diff --git a/src/3rdparty/libwebp/src/dsp/cpu.c b/src/3rdparty/libwebp/src/dsp/cpu.c
index 8844cb4..cbb08db 100644
--- a/src/3rdparty/libwebp/src/dsp/cpu.c
+++ b/src/3rdparty/libwebp/src/dsp/cpu.c
@@ -13,6 +13,11 @@
 #include "./dsp.h"
 
+#if defined(WEBP_HAVE_NEON_RTCD)
+#include <stdio.h>
+#include <string.h>
+#endif
+
 #if defined(WEBP_ANDROID_NEON)
 #include <cpu-features.h>
 #endif
@@ -142,13 +147,33 @@ VP8CPUInfo VP8GetCPUInfo = AndroidCPUInfo;
 // define a dummy function to enable turning off NEON at runtime by setting
 // VP8DecGetCPUInfo = NULL
 static int armCPUInfo(CPUFeature feature) {
-  (void)feature;
+  if (feature != kNEON) return 0;
+#if defined(__linux__) && defined(WEBP_HAVE_NEON_RTCD)
+  {
+    int has_neon = 0;
+    char line[200];
+    FILE* const cpuinfo = fopen("/proc/cpuinfo", "r");
+    if (cpuinfo == NULL) return 0;
+    while (fgets(line, sizeof(line), cpuinfo)) {
+      if (!strncmp(line, "Features", 8)) {
+        if (strstr(line, " neon ") != NULL) {
+          has_neon = 1;
+          break;
+        }
+      }
+    }
+    fclose(cpuinfo);
+    return has_neon;
+  }
+#else
   return 1;
+#endif
 }
 VP8CPUInfo VP8GetCPUInfo = armCPUInfo;
-#elif defined(WEBP_USE_MIPS32) || defined(WEBP_USE_MIPS_DSP_R2)
+#elif defined(WEBP_USE_MIPS32) || defined(WEBP_USE_MIPS_DSP_R2) || \
+      defined(WEBP_USE_MSA)
 static int mipsCPUInfo(CPUFeature feature) {
-  if ((feature == kMIPS32) || (feature == kMIPSdspR2)) {
+  if ((feature == kMIPS32) || (feature == kMIPSdspR2) || (feature == kMSA)) {
     return 1;
   } else {
     return 0;
diff --git a/src/3rdparty/libwebp/src/dsp/dec.c b/src/3rdparty/libwebp/src/dsp/dec.c
index a787206..e92d693 100644
--- a/src/3rdparty/libwebp/src/dsp/dec.c
+++ b/src/3rdparty/libwebp/src/dsp/dec.c
@@ -13,6 +13,7 @@
 #include "./dsp.h"
 #include "../dec/vp8i.h"
+#include "../utils/utils.h"
 
 //------------------------------------------------------------------------------
 
@@ -654,6 +655,23 @@ static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
 
 //------------------------------------------------------------------------------
 
+static void DitherCombine8x8(const uint8_t*
dither, uint8_t* dst, + int dst_stride) { + int i, j; + for (j = 0; j < 8; ++j) { + for (i = 0; i < 8; ++i) { + const int delta0 = dither[i] - VP8_DITHER_AMP_CENTER; + const int delta1 = + (delta0 + VP8_DITHER_DESCALE_ROUNDER) >> VP8_DITHER_DESCALE; + dst[i] = clip_8b((int)dst[i] + delta1); + } + dst += dst_stride; + dither += 8; + } +} + +//------------------------------------------------------------------------------ + VP8DecIdct2 VP8Transform; VP8DecIdct VP8TransformAC3; VP8DecIdct VP8TransformUV; @@ -673,11 +691,15 @@ VP8SimpleFilterFunc VP8SimpleHFilter16; VP8SimpleFilterFunc VP8SimpleVFilter16i; VP8SimpleFilterFunc VP8SimpleHFilter16i; +void (*VP8DitherCombine8x8)(const uint8_t* dither, uint8_t* dst, + int dst_stride); + extern void VP8DspInitSSE2(void); extern void VP8DspInitSSE41(void); extern void VP8DspInitNEON(void); extern void VP8DspInitMIPS32(void); extern void VP8DspInitMIPSdspR2(void); +extern void VP8DspInitMSA(void); static volatile VP8CPUInfo dec_last_cpuinfo_used = (VP8CPUInfo)&dec_last_cpuinfo_used; @@ -734,6 +756,8 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8DspInit(void) { VP8PredChroma8[5] = DC8uvNoLeft; VP8PredChroma8[6] = DC8uvNoTopLeft; + VP8DitherCombine8x8 = DitherCombine8x8; + // If defined, use CPUInfo() to overwrite some pointers with faster versions. if (VP8GetCPUInfo != NULL) { #if defined(WEBP_USE_SSE2) @@ -761,6 +785,11 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8DspInit(void) { VP8DspInitMIPSdspR2(); } #endif +#if defined(WEBP_USE_MSA) + if (VP8GetCPUInfo(kMSA)) { + VP8DspInitMSA(); + } +#endif } dec_last_cpuinfo_used = VP8GetCPUInfo; } diff --git a/src/3rdparty/libwebp/src/dsp/dec_msa.c b/src/3rdparty/libwebp/src/dsp/dec_msa.c new file mode 100644 index 0000000..f76055c --- /dev/null +++ b/src/3rdparty/libwebp/src/dsp/dec_msa.c @@ -0,0 +1,172 @@ +// Copyright 2016 Google Inc. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the COPYING file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. 
+// ----------------------------------------------------------------------------- +// +// MSA version of dsp functions +// +// Author(s): Prashant Patil (prashant.patil@imgtec.com) + + +#include "./dsp.h" + +#if defined(WEBP_USE_MSA) + +#include "./msa_macro.h" + +//------------------------------------------------------------------------------ +// Transforms + +#define IDCT_1D_W(in0, in1, in2, in3, out0, out1, out2, out3) { \ + v4i32 a1_m, b1_m, c1_m, d1_m; \ + v4i32 c_tmp1_m, c_tmp2_m, d_tmp1_m, d_tmp2_m; \ + const v4i32 cospi8sqrt2minus1 = __msa_fill_w(20091); \ + const v4i32 sinpi8sqrt2 = __msa_fill_w(35468); \ + \ + a1_m = in0 + in2; \ + b1_m = in0 - in2; \ + c_tmp1_m = (in1 * sinpi8sqrt2) >> 16; \ + c_tmp2_m = in3 + ((in3 * cospi8sqrt2minus1) >> 16); \ + c1_m = c_tmp1_m - c_tmp2_m; \ + d_tmp1_m = in1 + ((in1 * cospi8sqrt2minus1) >> 16); \ + d_tmp2_m = (in3 * sinpi8sqrt2) >> 16; \ + d1_m = d_tmp1_m + d_tmp2_m; \ + BUTTERFLY_4(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3); \ +} +#define MULT1(a) ((((a) * 20091) >> 16) + (a)) +#define MULT2(a) (((a) * 35468) >> 16) + +static void TransformOne(const int16_t* in, uint8_t* dst) { + v8i16 input0, input1; + v4i32 in0, in1, in2, in3, hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3; + v4i32 res0, res1, res2, res3; + const v16i8 zero = { 0 }; + v16i8 dest0, dest1, dest2, dest3; + + LD_SH2(in, 8, input0, input1); + UNPCK_SH_SW(input0, in0, in1); + UNPCK_SH_SW(input1, in2, in3); + IDCT_1D_W(in0, in1, in2, in3, hz0, hz1, hz2, hz3); + TRANSPOSE4x4_SW_SW(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3); + IDCT_1D_W(hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3); + SRARI_W4_SW(vt0, vt1, vt2, vt3, 3); + TRANSPOSE4x4_SW_SW(vt0, vt1, vt2, vt3, vt0, vt1, vt2, vt3); + LD_SB4(dst, BPS, dest0, dest1, dest2, dest3); + ILVR_B4_SW(zero, dest0, zero, dest1, zero, dest2, zero, dest3, + res0, res1, res2, res3); + ILVR_H4_SW(zero, res0, zero, res1, zero, res2, zero, res3, + res0, res1, res2, res3); + ADD4(res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0, res1, res2, res3); + CLIP_SW4_0_255(res0, res1, res2, res3); + PCKEV_B2_SW(res0, res1, res2, res3, vt0, vt1); + res0 = (v4i32)__msa_pckev_b((v16i8)vt0, (v16i8)vt1); + ST4x4_UB(res0, res0, 3, 2, 1, 0, dst, BPS); +} + +static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) { + TransformOne(in, dst); + if (do_two) { + TransformOne(in + 16, dst + 4); + } +} + +static void TransformWHT(const int16_t* in, int16_t* out) { + v8i16 input0, input1; + const v8i16 mask0 = { 0, 1, 2, 3, 8, 9, 10, 11 }; + const v8i16 mask1 = { 4, 5, 6, 7, 12, 13, 14, 15 }; + const v8i16 mask2 = { 0, 4, 8, 12, 1, 5, 9, 13 }; + const v8i16 mask3 = { 3, 7, 11, 15, 2, 6, 10, 14 }; + v8i16 tmp0, tmp1, tmp2, tmp3; + v8i16 out0, out1; + + LD_SH2(in, 8, input0, input1); + input1 = SLDI_SH(input1, input1, 8); + tmp0 = input0 + input1; + tmp1 = input0 - input1; + VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3); + out0 = tmp2 + tmp3; + out1 = tmp2 - tmp3; + VSHF_H2_SH(out0, out1, out0, out1, mask2, mask3, input0, input1); + tmp0 = input0 + input1; + tmp1 = input0 - input1; + VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3); + tmp0 = tmp2 + tmp3; + tmp1 = tmp2 - tmp3; + ADDVI_H2_SH(tmp0, 3, tmp1, 3, out0, out1); + SRAI_H2_SH(out0, out1, 3); + out[0] = __msa_copy_s_h(out0, 0); + out[16] = __msa_copy_s_h(out0, 4); + out[32] = __msa_copy_s_h(out1, 0); + out[48] = __msa_copy_s_h(out1, 4); + out[64] = __msa_copy_s_h(out0, 1); + out[80] = __msa_copy_s_h(out0, 5); + out[96] = __msa_copy_s_h(out1, 1); + out[112] = __msa_copy_s_h(out1, 5); + 
out[128] = __msa_copy_s_h(out0, 2); + out[144] = __msa_copy_s_h(out0, 6); + out[160] = __msa_copy_s_h(out1, 2); + out[176] = __msa_copy_s_h(out1, 6); + out[192] = __msa_copy_s_h(out0, 3); + out[208] = __msa_copy_s_h(out0, 7); + out[224] = __msa_copy_s_h(out1, 3); + out[240] = __msa_copy_s_h(out1, 7); +} + +static void TransformDC(const int16_t* in, uint8_t* dst) { + const int DC = (in[0] + 4) >> 3; + const v8i16 tmp0 = __msa_fill_h(DC); + ADDBLK_ST4x4_UB(tmp0, tmp0, tmp0, tmp0, dst, BPS); +} + +static void TransformAC3(const int16_t* in, uint8_t* dst) { + const int a = in[0] + 4; + const int c4 = MULT2(in[4]); + const int d4 = MULT1(in[4]); + const int in2 = MULT2(in[1]); + const int in3 = MULT1(in[1]); + v4i32 tmp0 = { 0 }; + v4i32 out0 = __msa_fill_w(a + d4); + v4i32 out1 = __msa_fill_w(a + c4); + v4i32 out2 = __msa_fill_w(a - c4); + v4i32 out3 = __msa_fill_w(a - d4); + v4i32 res0, res1, res2, res3; + const v4i32 zero = { 0 }; + v16u8 dest0, dest1, dest2, dest3; + + INSERT_W4_SW(in3, in2, -in2, -in3, tmp0); + ADD4(out0, tmp0, out1, tmp0, out2, tmp0, out3, tmp0, + out0, out1, out2, out3); + SRAI_W4_SW(out0, out1, out2, out3, 3); + LD_UB4(dst, BPS, dest0, dest1, dest2, dest3); + ILVR_B4_SW(zero, dest0, zero, dest1, zero, dest2, zero, dest3, + res0, res1, res2, res3); + ILVR_H4_SW(zero, res0, zero, res1, zero, res2, zero, res3, + res0, res1, res2, res3); + ADD4(res0, out0, res1, out1, res2, out2, res3, out3, res0, res1, res2, res3); + CLIP_SW4_0_255(res0, res1, res2, res3); + PCKEV_B2_SW(res0, res1, res2, res3, out0, out1); + res0 = (v4i32)__msa_pckev_b((v16i8)out0, (v16i8)out1); + ST4x4_UB(res0, res0, 3, 2, 1, 0, dst, BPS); +} + +//------------------------------------------------------------------------------ +// Entry point + +extern void VP8DspInitMSA(void); + +WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitMSA(void) { + VP8TransformWHT = TransformWHT; + VP8Transform = TransformTwo; + VP8TransformDC = TransformDC; + VP8TransformAC3 = TransformAC3; +} + +#else // !WEBP_USE_MSA + +WEBP_DSP_INIT_STUB(VP8DspInitMSA) + +#endif // WEBP_USE_MSA diff --git a/src/3rdparty/libwebp/src/dsp/dec_sse2.c b/src/3rdparty/libwebp/src/dsp/dec_sse2.c index 935bf02..f0a8ddc 100644 --- a/src/3rdparty/libwebp/src/dsp/dec_sse2.c +++ b/src/3rdparty/libwebp/src/dsp/dec_sse2.c @@ -21,7 +21,9 @@ // #define USE_TRANSFORM_AC3 #include <emmintrin.h> +#include "./common_sse2.h" #include "../dec/vp8i.h" +#include "../utils/utils.h" //------------------------------------------------------------------------------ // Transforms (Paragraph 14.4) @@ -102,34 +104,7 @@ static void Transform(const int16_t* in, uint8_t* dst, int do_two) { const __m128i tmp3 = _mm_sub_epi16(a, d); // Transpose the two 4x4. 
- // a00 a01 a02 a03 b00 b01 b02 b03 - // a10 a11 a12 a13 b10 b11 b12 b13 - // a20 a21 a22 a23 b20 b21 b22 b23 - // a30 a31 a32 a33 b30 b31 b32 b33 - const __m128i transpose0_0 = _mm_unpacklo_epi16(tmp0, tmp1); - const __m128i transpose0_1 = _mm_unpacklo_epi16(tmp2, tmp3); - const __m128i transpose0_2 = _mm_unpackhi_epi16(tmp0, tmp1); - const __m128i transpose0_3 = _mm_unpackhi_epi16(tmp2, tmp3); - // a00 a10 a01 a11 a02 a12 a03 a13 - // a20 a30 a21 a31 a22 a32 a23 a33 - // b00 b10 b01 b11 b02 b12 b03 b13 - // b20 b30 b21 b31 b22 b32 b23 b33 - const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1); - const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3); - const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1); - const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3); - // a00 a10 a20 a30 a01 a11 a21 a31 - // b00 b10 b20 b30 b01 b11 b21 b31 - // a02 a12 a22 a32 a03 a13 a23 a33 - // b02 b12 a22 b32 b03 b13 b23 b33 - T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1); - T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1); - T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3); - T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3); - // a00 a10 a20 a30 b00 b10 b20 b30 - // a01 a11 a21 a31 b01 b11 b21 b31 - // a02 a12 a22 a32 b02 b12 b22 b32 - // a03 a13 a23 a33 b03 b13 b23 b33 + VP8Transpose_2_4x4_16b(&tmp0, &tmp1, &tmp2, &tmp3, &T0, &T1, &T2, &T3); } // Horizontal pass and subsequent transpose. @@ -164,34 +139,8 @@ static void Transform(const int16_t* in, uint8_t* dst, int do_two) { const __m128i shifted3 = _mm_srai_epi16(tmp3, 3); // Transpose the two 4x4. - // a00 a01 a02 a03 b00 b01 b02 b03 - // a10 a11 a12 a13 b10 b11 b12 b13 - // a20 a21 a22 a23 b20 b21 b22 b23 - // a30 a31 a32 a33 b30 b31 b32 b33 - const __m128i transpose0_0 = _mm_unpacklo_epi16(shifted0, shifted1); - const __m128i transpose0_1 = _mm_unpacklo_epi16(shifted2, shifted3); - const __m128i transpose0_2 = _mm_unpackhi_epi16(shifted0, shifted1); - const __m128i transpose0_3 = _mm_unpackhi_epi16(shifted2, shifted3); - // a00 a10 a01 a11 a02 a12 a03 a13 - // a20 a30 a21 a31 a22 a32 a23 a33 - // b00 b10 b01 b11 b02 b12 b03 b13 - // b20 b30 b21 b31 b22 b32 b23 b33 - const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1); - const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3); - const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1); - const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3); - // a00 a10 a20 a30 a01 a11 a21 a31 - // b00 b10 b20 b30 b01 b11 b21 b31 - // a02 a12 a22 a32 a03 a13 a23 a33 - // b02 b12 a22 b32 b03 b13 b23 b33 - T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1); - T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1); - T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3); - T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3); - // a00 a10 a20 a30 b00 b10 b20 b30 - // a01 a11 a21 a31 b01 b11 b21 b31 - // a02 a12 a22 a32 b02 b12 b22 b32 - // a03 a13 a23 a33 b03 b13 b23 b33 + VP8Transpose_2_4x4_16b(&shifted0, &shifted1, &shifted2, &shifted3, &T0, &T1, + &T2, &T3); } // Add inverse transform to 'dst' and store. 
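For reference, the lane layout documented in common_sse2.h corresponds to the following scalar sketch of what the new VP8Transpose_2_4x4_16b helper computes (illustrative only; the helper name is the only identifier taken from the patch). Each 128-bit row holds two 4x4 int16 matrices side by side, and both halves are transposed independently:

#include <stdint.h>

// Scalar equivalent (sketch, not part of the patch):
// in[i]  = { a[i][0..3], b[i][0..3] }
// out[j] = { a[0..3][j], b[0..3][j] }
static void Transpose_2_4x4_16b_C(const int16_t in[4][8], int16_t out[4][8]) {
  int i, j;
  for (j = 0; j < 4; ++j) {
    for (i = 0; i < 4; ++i) {
      out[j][i] = in[i][j];          // first ('a') matrix
      out[j][4 + i] = in[i][4 + j];  // second ('b') matrix
    }
  }
}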
diff --git a/src/3rdparty/libwebp/src/dsp/dec_sse41.c b/src/3rdparty/libwebp/src/dsp/dec_sse41.c index 224c6f8..8d6aed1 100644 --- a/src/3rdparty/libwebp/src/dsp/dec_sse41.c +++ b/src/3rdparty/libwebp/src/dsp/dec_sse41.c @@ -17,6 +17,7 @@ #include <smmintrin.h> #include "../dec/vp8i.h" +#include "../utils/utils.h" static void HE16(uint8_t* dst) { // horizontal int j; diff --git a/src/3rdparty/libwebp/src/dsp/dsp.h b/src/3rdparty/libwebp/src/dsp/dsp.h index 95f1ce0..1faac27 100644 --- a/src/3rdparty/libwebp/src/dsp/dsp.h +++ b/src/3rdparty/libwebp/src/dsp/dsp.h @@ -14,8 +14,11 @@ #ifndef WEBP_DSP_DSP_H_ #define WEBP_DSP_DSP_H_ +#ifdef HAVE_CONFIG_H +#include "../webp/config.h" +#endif + #include "../webp/types.h" -#include "../utils/utils.h" #ifdef __cplusplus extern "C" { @@ -72,7 +75,8 @@ extern "C" { // The intrinsics currently cause compiler errors with arm-nacl-gcc and the // inline assembly would need to be modified for use with Native Client. #if (defined(__ARM_NEON__) || defined(WEBP_ANDROID_NEON) || \ - defined(__aarch64__)) && !defined(__native_client__) + defined(__aarch64__) || defined(WEBP_HAVE_NEON)) && \ + !defined(__native_client__) #define WEBP_USE_NEON #endif @@ -92,6 +96,10 @@ extern "C" { #endif #endif +#if defined(__mips_msa) && defined(__mips_isa_rev) && (__mips_isa_rev >= 5) +#define WEBP_USE_MSA +#endif + // This macro prevents thread_sanitizer from reporting known concurrent writes. #define WEBP_TSAN_IGNORE_FUNCTION #if defined(__has_feature) @@ -101,6 +109,27 @@ extern "C" { #endif #endif +#define WEBP_UBSAN_IGNORE_UNDEF +#define WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW +#if !defined(WEBP_FORCE_ALIGNED) && defined(__clang__) && \ + defined(__has_attribute) +#if __has_attribute(no_sanitize) +// This macro prevents the undefined behavior sanitizer from reporting +// failures. This is only meant to silence unaligned loads on platforms that +// are known to support them. +#undef WEBP_UBSAN_IGNORE_UNDEF +#define WEBP_UBSAN_IGNORE_UNDEF \ + __attribute__((no_sanitize("undefined"))) + +// This macro prevents the undefined behavior sanitizer from reporting +// failures related to unsigned integer overflows. This is only meant to +// silence cases where this well defined behavior is expected. +#undef WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW +#define WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW \ + __attribute__((no_sanitize("unsigned-integer-overflow"))) +#endif +#endif + typedef enum { kSSE2, kSSE3, @@ -109,7 +138,8 @@ typedef enum { kAVX2, kNEON, kMIPS32, - kMIPSdspR2 + kMIPSdspR2, + kMSA } CPUFeature; // returns true if the CPU supports the feature. typedef int (*VP8CPUInfo)(CPUFeature feature); @@ -151,6 +181,8 @@ typedef int (*VP8Metric)(const uint8_t* pix, const uint8_t* ref); extern VP8Metric VP8SSE16x16, VP8SSE16x8, VP8SSE8x8, VP8SSE4x4; typedef int (*VP8WMetric)(const uint8_t* pix, const uint8_t* ref, const uint16_t* const weights); +// The weights for VP8TDisto4x4 and VP8TDisto16x16 contain a row-major +// 4 by 4 symmetric matrix. extern VP8WMetric VP8TDisto4x4, VP8TDisto16x16; typedef void (*VP8BlockCopy)(const uint8_t* src, uint8_t* dst); @@ -214,6 +246,35 @@ extern VP8GetResidualCostFunc VP8GetResidualCost; void VP8EncDspCostInit(void); //------------------------------------------------------------------------------ +// SSIM utils + +// struct for accumulating statistical moments +typedef struct { + double w; // sum(w_i) : sum of weights + double xm, ym; // sum(w_i * x_i), sum(w_i * y_i) + double xxm, xym, yym; // sum(w_i * x_i * x_i), etc. 
+} VP8DistoStats; + +#define VP8_SSIM_KERNEL 3 // total size of the kernel: 2 * VP8_SSIM_KERNEL + 1 +typedef void (*VP8SSIMAccumulateClippedFunc)(const uint8_t* src1, int stride1, + const uint8_t* src2, int stride2, + int xo, int yo, // center position + int W, int H, // plane dimension + VP8DistoStats* const stats); + +// This version is called with the guarantee that you can load 8 bytes and +// 8 rows at offset src1 and src2 +typedef void (*VP8SSIMAccumulateFunc)(const uint8_t* src1, int stride1, + const uint8_t* src2, int stride2, + VP8DistoStats* const stats); + +extern VP8SSIMAccumulateFunc VP8SSIMAccumulate; // unclipped / unchecked +extern VP8SSIMAccumulateClippedFunc VP8SSIMAccumulateClipped; // with clipping + +// must be called before using any of the above directly +void VP8SSIMDspInit(void); + +//------------------------------------------------------------------------------ // Decoding typedef void (*VP8DecIdct)(const int16_t* coeffs, uint8_t* dst); @@ -265,6 +326,15 @@ extern VP8LumaFilterFunc VP8HFilter16i; extern VP8ChromaFilterFunc VP8VFilter8i; // filtering u and v altogether extern VP8ChromaFilterFunc VP8HFilter8i; +// Dithering. Combines dithering values (centered around 128) with dst[], +// according to: dst[] = clip(dst[] + (((dither[]-128) + 8) >> 4) +#define VP8_DITHER_DESCALE 4 +#define VP8_DITHER_DESCALE_ROUNDER (1 << (VP8_DITHER_DESCALE - 1)) +#define VP8_DITHER_AMP_BITS 7 +#define VP8_DITHER_AMP_CENTER (1 << VP8_DITHER_AMP_BITS) +extern void (*VP8DitherCombine8x8)(const uint8_t* dither, uint8_t* dst, + int dst_stride); + // must be called before anything using the above void VP8DspInit(void); @@ -472,8 +542,10 @@ typedef enum { // Filter types. typedef void (*WebPFilterFunc)(const uint8_t* in, int width, int height, int stride, uint8_t* out); -typedef void (*WebPUnfilterFunc)(int width, int height, int stride, - int row, int num_rows, uint8_t* data); +// In-place un-filtering. +// Warning! 'prev_line' pointer can be equal to 'cur_line' or 'preds'. +typedef void (*WebPUnfilterFunc)(const uint8_t* prev_line, const uint8_t* preds, + uint8_t* cur_line, int width); // Filter the given data using the given predictor. // 'in' corresponds to a 2-dimensional pixel array of size (stride * height) diff --git a/src/3rdparty/libwebp/src/dsp/enc.c b/src/3rdparty/libwebp/src/dsp/enc.c index 8899d50..f639f55 100644 --- a/src/3rdparty/libwebp/src/dsp/enc.c +++ b/src/3rdparty/libwebp/src/dsp/enc.c @@ -69,7 +69,7 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred, // Convert coefficients to bin. for (k = 0; k < 16; ++k) { - const int v = abs(out[k]) >> 3; // TODO(skal): add rounding? + const int v = abs(out[k]) >> 3; const int clipped_value = clip_max(v, MAX_COEFF_THRESH); ++distribution[clipped_value]; } @@ -559,6 +559,7 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) { // Hadamard transform // Returns the weighted sum of the absolute value of transformed coefficients. +// w[] contains a row-major 4 by 4 symmetric matrix. 
static int TTransform(const uint8_t* in, const uint16_t* w) { int sum = 0; int tmp[16]; @@ -636,7 +637,7 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16], int level = QUANTDIV(coeff, iQ, B); if (level > MAX_LEVEL) level = MAX_LEVEL; if (sign) level = -level; - in[j] = level * Q; + in[j] = level * (int)Q; out[n] = level; if (level) last = n; } else { @@ -670,7 +671,7 @@ static int QuantizeBlockWHT(int16_t in[16], int16_t out[16], int level = QUANTDIV(coeff, iQ, B); if (level > MAX_LEVEL) level = MAX_LEVEL; if (sign) level = -level; - in[j] = level * Q; + in[j] = level * (int)Q; out[n] = level; if (level) last = n; } else { @@ -702,6 +703,68 @@ static void Copy16x8(const uint8_t* src, uint8_t* dst) { } //------------------------------------------------------------------------------ + +static void SSIMAccumulateClipped(const uint8_t* src1, int stride1, + const uint8_t* src2, int stride2, + int xo, int yo, int W, int H, + VP8DistoStats* const stats) { + const int ymin = (yo - VP8_SSIM_KERNEL < 0) ? 0 : yo - VP8_SSIM_KERNEL; + const int ymax = (yo + VP8_SSIM_KERNEL > H - 1) ? H - 1 + : yo + VP8_SSIM_KERNEL; + const int xmin = (xo - VP8_SSIM_KERNEL < 0) ? 0 : xo - VP8_SSIM_KERNEL; + const int xmax = (xo + VP8_SSIM_KERNEL > W - 1) ? W - 1 + : xo + VP8_SSIM_KERNEL; + int x, y; + src1 += ymin * stride1; + src2 += ymin * stride2; + for (y = ymin; y <= ymax; ++y, src1 += stride1, src2 += stride2) { + for (x = xmin; x <= xmax; ++x) { + const int s1 = src1[x]; + const int s2 = src2[x]; + stats->w += 1; + stats->xm += s1; + stats->ym += s2; + stats->xxm += s1 * s1; + stats->xym += s1 * s2; + stats->yym += s2 * s2; + } + } +} + +static void SSIMAccumulate(const uint8_t* src1, int stride1, + const uint8_t* src2, int stride2, + VP8DistoStats* const stats) { + int x, y; + for (y = 0; y <= 2 * VP8_SSIM_KERNEL; ++y, src1 += stride1, src2 += stride2) { + for (x = 0; x <= 2 * VP8_SSIM_KERNEL; ++x) { + const int s1 = src1[x]; + const int s2 = src2[x]; + stats->w += 1; + stats->xm += s1; + stats->ym += s2; + stats->xxm += s1 * s1; + stats->xym += s1 * s2; + stats->yym += s2 * s2; + } + } +} + +VP8SSIMAccumulateFunc VP8SSIMAccumulate; +VP8SSIMAccumulateClippedFunc VP8SSIMAccumulateClipped; + +static volatile VP8CPUInfo ssim_last_cpuinfo_used = + (VP8CPUInfo)&ssim_last_cpuinfo_used; + +WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInit(void) { + if (ssim_last_cpuinfo_used == VP8GetCPUInfo) return; + + VP8SSIMAccumulate = SSIMAccumulate; + VP8SSIMAccumulateClipped = SSIMAccumulateClipped; + + ssim_last_cpuinfo_used = VP8GetCPUInfo; +} + +//------------------------------------------------------------------------------ // Initialization // Speed-critical function pointers. We have to initialize them to the default diff --git a/src/3rdparty/libwebp/src/dsp/enc_mips_dsp_r2.c b/src/3rdparty/libwebp/src/dsp/enc_mips_dsp_r2.c index 7c814fa..7ab96f6 100644 --- a/src/3rdparty/libwebp/src/dsp/enc_mips_dsp_r2.c +++ b/src/3rdparty/libwebp/src/dsp/enc_mips_dsp_r2.c @@ -1393,8 +1393,6 @@ static void FTransformWHT(const int16_t* in, int16_t* out) { "absq_s.ph %[temp1], %[temp1] \n\t" \ "absq_s.ph %[temp2], %[temp2] \n\t" \ "absq_s.ph %[temp3], %[temp3] \n\t" \ - /* TODO(skal): add rounding ? 
shra_r.ph : shra.ph */ \ - /* for following 4 instructions */ \ "shra.ph %[temp0], %[temp0], 3 \n\t" \ "shra.ph %[temp1], %[temp1], 3 \n\t" \ "shra.ph %[temp2], %[temp2], 3 \n\t" \ diff --git a/src/3rdparty/libwebp/src/dsp/enc_neon.c b/src/3rdparty/libwebp/src/dsp/enc_neon.c index c2aef58..46f6bf9 100644 --- a/src/3rdparty/libwebp/src/dsp/enc_neon.c +++ b/src/3rdparty/libwebp/src/dsp/enc_neon.c @@ -560,21 +560,6 @@ static void FTransformWHT(const int16_t* src, int16_t* out) { // a 26ae, b 26ae // a 37bf, b 37bf // -static WEBP_INLINE uint8x8x4_t DistoTranspose4x4U8(uint8x8x4_t d4_in) { - const uint8x8x2_t d2_tmp0 = vtrn_u8(d4_in.val[0], d4_in.val[1]); - const uint8x8x2_t d2_tmp1 = vtrn_u8(d4_in.val[2], d4_in.val[3]); - const uint16x4x2_t d2_tmp2 = vtrn_u16(vreinterpret_u16_u8(d2_tmp0.val[0]), - vreinterpret_u16_u8(d2_tmp1.val[0])); - const uint16x4x2_t d2_tmp3 = vtrn_u16(vreinterpret_u16_u8(d2_tmp0.val[1]), - vreinterpret_u16_u8(d2_tmp1.val[1])); - - d4_in.val[0] = vreinterpret_u8_u16(d2_tmp2.val[0]); - d4_in.val[2] = vreinterpret_u8_u16(d2_tmp2.val[1]); - d4_in.val[1] = vreinterpret_u8_u16(d2_tmp3.val[0]); - d4_in.val[3] = vreinterpret_u8_u16(d2_tmp3.val[1]); - return d4_in; -} - static WEBP_INLINE int16x8x4_t DistoTranspose4x4S16(int16x8x4_t q4_in) { const int16x8x2_t q2_tmp0 = vtrnq_s16(q4_in.val[0], q4_in.val[1]); const int16x8x2_t q2_tmp1 = vtrnq_s16(q4_in.val[2], q4_in.val[3]); @@ -589,41 +574,40 @@ static WEBP_INLINE int16x8x4_t DistoTranspose4x4S16(int16x8x4_t q4_in) { return q4_in; } -static WEBP_INLINE int16x8x4_t DistoHorizontalPass(const uint8x8x4_t d4_in) { +static WEBP_INLINE int16x8x4_t DistoHorizontalPass(const int16x8x4_t q4_in) { // {a0, a1} = {in[0] + in[2], in[1] + in[3]} // {a3, a2} = {in[0] - in[2], in[1] - in[3]} - const int16x8_t q_a0 = vreinterpretq_s16_u16(vaddl_u8(d4_in.val[0], - d4_in.val[2])); - const int16x8_t q_a1 = vreinterpretq_s16_u16(vaddl_u8(d4_in.val[1], - d4_in.val[3])); - const int16x8_t q_a3 = vreinterpretq_s16_u16(vsubl_u8(d4_in.val[0], - d4_in.val[2])); - const int16x8_t q_a2 = vreinterpretq_s16_u16(vsubl_u8(d4_in.val[1], - d4_in.val[3])); + const int16x8_t q_a0 = vaddq_s16(q4_in.val[0], q4_in.val[2]); + const int16x8_t q_a1 = vaddq_s16(q4_in.val[1], q4_in.val[3]); + const int16x8_t q_a3 = vsubq_s16(q4_in.val[0], q4_in.val[2]); + const int16x8_t q_a2 = vsubq_s16(q4_in.val[1], q4_in.val[3]); int16x8x4_t q4_out; // tmp[0] = a0 + a1 // tmp[1] = a3 + a2 // tmp[2] = a3 - a2 // tmp[3] = a0 - a1 INIT_VECTOR4(q4_out, - vaddq_s16(q_a0, q_a1), vaddq_s16(q_a3, q_a2), - vsubq_s16(q_a3, q_a2), vsubq_s16(q_a0, q_a1)); + vabsq_s16(vaddq_s16(q_a0, q_a1)), + vabsq_s16(vaddq_s16(q_a3, q_a2)), + vabdq_s16(q_a3, q_a2), vabdq_s16(q_a0, q_a1)); return q4_out; } -static WEBP_INLINE int16x8x4_t DistoVerticalPass(int16x8x4_t q4_in) { - const int16x8_t q_a0 = vaddq_s16(q4_in.val[0], q4_in.val[2]); - const int16x8_t q_a1 = vaddq_s16(q4_in.val[1], q4_in.val[3]); - const int16x8_t q_a2 = vsubq_s16(q4_in.val[1], q4_in.val[3]); - const int16x8_t q_a3 = vsubq_s16(q4_in.val[0], q4_in.val[2]); +static WEBP_INLINE int16x8x4_t DistoVerticalPass(const uint8x8x4_t q4_in) { + const int16x8_t q_a0 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[0], + q4_in.val[2])); + const int16x8_t q_a1 = vreinterpretq_s16_u16(vaddl_u8(q4_in.val[1], + q4_in.val[3])); + const int16x8_t q_a2 = vreinterpretq_s16_u16(vsubl_u8(q4_in.val[1], + q4_in.val[3])); + const int16x8_t q_a3 = vreinterpretq_s16_u16(vsubl_u8(q4_in.val[0], + q4_in.val[2])); + int16x8x4_t q4_out; - q4_in.val[0] = vaddq_s16(q_a0, q_a1); - 
q4_in.val[1] = vaddq_s16(q_a3, q_a2); - q4_in.val[2] = vabdq_s16(q_a3, q_a2); - q4_in.val[3] = vabdq_s16(q_a0, q_a1); - q4_in.val[0] = vabsq_s16(q4_in.val[0]); - q4_in.val[1] = vabsq_s16(q4_in.val[1]); - return q4_in; + INIT_VECTOR4(q4_out, + vaddq_s16(q_a0, q_a1), vaddq_s16(q_a3, q_a2), + vsubq_s16(q_a3, q_a2), vsubq_s16(q_a0, q_a1)); + return q4_out; } static WEBP_INLINE int16x4x4_t DistoLoadW(const uint16_t* w) { @@ -667,6 +651,7 @@ static WEBP_INLINE int32x2_t DistoSum(const int16x8x4_t q4_in, // Hadamard transform // Returns the weighted sum of the absolute value of transformed coefficients. +// w[] contains a row-major 4 by 4 symmetric matrix. static int Disto4x4(const uint8_t* const a, const uint8_t* const b, const uint16_t* const w) { uint32x2_t d_in_ab_0123 = vdup_n_u32(0); @@ -691,18 +676,19 @@ static int Disto4x4(const uint8_t* const a, const uint8_t* const b, vreinterpret_u8_u32(d_in_ab_cdef)); { - // horizontal pass - const uint8x8x4_t d4_t = DistoTranspose4x4U8(d4_in); - const int16x8x4_t q4_h = DistoHorizontalPass(d4_t); + // Vertical pass first to avoid a transpose (vertical and horizontal passes + // are commutative because w/kWeightY is symmetric) and subsequent + // transpose. + const int16x8x4_t q4_v = DistoVerticalPass(d4_in); const int16x4x4_t d4_w = DistoLoadW(w); - // vertical pass - const int16x8x4_t q4_t = DistoTranspose4x4S16(q4_h); - const int16x8x4_t q4_v = DistoVerticalPass(q4_t); - int32x2_t d_sum = DistoSum(q4_v, d4_w); + // horizontal pass + const int16x8x4_t q4_t = DistoTranspose4x4S16(q4_v); + const int16x8x4_t q4_h = DistoHorizontalPass(q4_t); + int32x2_t d_sum = DistoSum(q4_h, d4_w); // abs(sum2 - sum1) >> 5 d_sum = vabs_s32(d_sum); - d_sum = vshr_n_s32(d_sum, 5); + d_sum = vshr_n_s32(d_sum, 5); return vget_lane_s32(d_sum, 0); } } diff --git a/src/3rdparty/libwebp/src/dsp/enc_sse2.c b/src/3rdparty/libwebp/src/dsp/enc_sse2.c index 2333d2b..4a2e3ce 100644 --- a/src/3rdparty/libwebp/src/dsp/enc_sse2.c +++ b/src/3rdparty/libwebp/src/dsp/enc_sse2.c @@ -17,39 +17,11 @@ #include <stdlib.h> // for abs() #include <emmintrin.h> +#include "./common_sse2.h" #include "../enc/cost.h" #include "../enc/vp8enci.h" //------------------------------------------------------------------------------ -// Quite useful macro for debugging. Left here for convenience. - -#if 0 -#include <stdio.h> -static void PrintReg(const __m128i r, const char* const name, int size) { - int n; - union { - __m128i r; - uint8_t i8[16]; - uint16_t i16[8]; - uint32_t i32[4]; - uint64_t i64[2]; - } tmp; - tmp.r = r; - fprintf(stderr, "%s\t: ", name); - if (size == 8) { - for (n = 0; n < 16; ++n) fprintf(stderr, "%.2x ", tmp.i8[n]); - } else if (size == 16) { - for (n = 0; n < 8; ++n) fprintf(stderr, "%.4x ", tmp.i16[n]); - } else if (size == 32) { - for (n = 0; n < 4; ++n) fprintf(stderr, "%.8x ", tmp.i32[n]); - } else { - for (n = 0; n < 2; ++n) fprintf(stderr, "%.16lx ", tmp.i64[n]); - } - fprintf(stderr, "\n"); -} -#endif - -//------------------------------------------------------------------------------ // Transforms (Paragraph 14.4) // Does one or two inverse transforms. @@ -131,34 +103,7 @@ static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst, const __m128i tmp3 = _mm_sub_epi16(a, d); // Transpose the two 4x4. 
- // a00 a01 a02 a03 b00 b01 b02 b03 - // a10 a11 a12 a13 b10 b11 b12 b13 - // a20 a21 a22 a23 b20 b21 b22 b23 - // a30 a31 a32 a33 b30 b31 b32 b33 - const __m128i transpose0_0 = _mm_unpacklo_epi16(tmp0, tmp1); - const __m128i transpose0_1 = _mm_unpacklo_epi16(tmp2, tmp3); - const __m128i transpose0_2 = _mm_unpackhi_epi16(tmp0, tmp1); - const __m128i transpose0_3 = _mm_unpackhi_epi16(tmp2, tmp3); - // a00 a10 a01 a11 a02 a12 a03 a13 - // a20 a30 a21 a31 a22 a32 a23 a33 - // b00 b10 b01 b11 b02 b12 b03 b13 - // b20 b30 b21 b31 b22 b32 b23 b33 - const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1); - const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3); - const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1); - const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3); - // a00 a10 a20 a30 a01 a11 a21 a31 - // b00 b10 b20 b30 b01 b11 b21 b31 - // a02 a12 a22 a32 a03 a13 a23 a33 - // b02 b12 a22 b32 b03 b13 b23 b33 - T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1); - T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1); - T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3); - T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3); - // a00 a10 a20 a30 b00 b10 b20 b30 - // a01 a11 a21 a31 b01 b11 b21 b31 - // a02 a12 a22 a32 b02 b12 b22 b32 - // a03 a13 a23 a33 b03 b13 b23 b33 + VP8Transpose_2_4x4_16b(&tmp0, &tmp1, &tmp2, &tmp3, &T0, &T1, &T2, &T3); } // Horizontal pass and subsequent transpose. @@ -193,34 +138,8 @@ static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst, const __m128i shifted3 = _mm_srai_epi16(tmp3, 3); // Transpose the two 4x4. - // a00 a01 a02 a03 b00 b01 b02 b03 - // a10 a11 a12 a13 b10 b11 b12 b13 - // a20 a21 a22 a23 b20 b21 b22 b23 - // a30 a31 a32 a33 b30 b31 b32 b33 - const __m128i transpose0_0 = _mm_unpacklo_epi16(shifted0, shifted1); - const __m128i transpose0_1 = _mm_unpacklo_epi16(shifted2, shifted3); - const __m128i transpose0_2 = _mm_unpackhi_epi16(shifted0, shifted1); - const __m128i transpose0_3 = _mm_unpackhi_epi16(shifted2, shifted3); - // a00 a10 a01 a11 a02 a12 a03 a13 - // a20 a30 a21 a31 a22 a32 a23 a33 - // b00 b10 b01 b11 b02 b12 b03 b13 - // b20 b30 b21 b31 b22 b32 b23 b33 - const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1); - const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3); - const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1); - const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3); - // a00 a10 a20 a30 a01 a11 a21 a31 - // b00 b10 b20 b30 b01 b11 b21 b31 - // a02 a12 a22 a32 a03 a13 a23 a33 - // b02 b12 a22 b32 b03 b13 b23 b33 - T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1); - T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1); - T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3); - T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3); - // a00 a10 a20 a30 b00 b10 b20 b30 - // a01 a11 a21 a31 b01 b11 b21 b31 - // a02 a12 a22 a32 b02 b12 b22 b32 - // a03 a13 a23 a33 b03 b13 b23 b33 + VP8Transpose_2_4x4_16b(&shifted0, &shifted1, &shifted2, &shifted3, &T0, &T1, + &T2, &T3); } // Add inverse transform to 'ref' and store. @@ -373,42 +292,42 @@ static void FTransformPass2(const __m128i* const v01, const __m128i* const v32, static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) { const __m128i zero = _mm_setzero_si128(); - - // Load src and convert to 16b. + // Load src. 
const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]); const __m128i src1 = _mm_loadl_epi64((const __m128i*)&src[1 * BPS]); const __m128i src2 = _mm_loadl_epi64((const __m128i*)&src[2 * BPS]); const __m128i src3 = _mm_loadl_epi64((const __m128i*)&src[3 * BPS]); - const __m128i src_0 = _mm_unpacklo_epi8(src0, zero); - const __m128i src_1 = _mm_unpacklo_epi8(src1, zero); - const __m128i src_2 = _mm_unpacklo_epi8(src2, zero); - const __m128i src_3 = _mm_unpacklo_epi8(src3, zero); - // Load ref and convert to 16b. + // 00 01 02 03 * + // 10 11 12 13 * + // 20 21 22 23 * + // 30 31 32 33 * + // Shuffle. + const __m128i src_0 = _mm_unpacklo_epi16(src0, src1); + const __m128i src_1 = _mm_unpacklo_epi16(src2, src3); + // 00 01 10 11 02 03 12 13 * * ... + // 20 21 30 31 22 22 32 33 * * ... + + // Load ref. const __m128i ref0 = _mm_loadl_epi64((const __m128i*)&ref[0 * BPS]); const __m128i ref1 = _mm_loadl_epi64((const __m128i*)&ref[1 * BPS]); const __m128i ref2 = _mm_loadl_epi64((const __m128i*)&ref[2 * BPS]); const __m128i ref3 = _mm_loadl_epi64((const __m128i*)&ref[3 * BPS]); - const __m128i ref_0 = _mm_unpacklo_epi8(ref0, zero); - const __m128i ref_1 = _mm_unpacklo_epi8(ref1, zero); - const __m128i ref_2 = _mm_unpacklo_epi8(ref2, zero); - const __m128i ref_3 = _mm_unpacklo_epi8(ref3, zero); - // Compute difference. -> 00 01 02 03 00 00 00 00 - const __m128i diff0 = _mm_sub_epi16(src_0, ref_0); - const __m128i diff1 = _mm_sub_epi16(src_1, ref_1); - const __m128i diff2 = _mm_sub_epi16(src_2, ref_2); - const __m128i diff3 = _mm_sub_epi16(src_3, ref_3); - - // Unpack and shuffle - // 00 01 02 03 0 0 0 0 - // 10 11 12 13 0 0 0 0 - // 20 21 22 23 0 0 0 0 - // 30 31 32 33 0 0 0 0 - const __m128i shuf01 = _mm_unpacklo_epi32(diff0, diff1); - const __m128i shuf23 = _mm_unpacklo_epi32(diff2, diff3); + const __m128i ref_0 = _mm_unpacklo_epi16(ref0, ref1); + const __m128i ref_1 = _mm_unpacklo_epi16(ref2, ref3); + + // Convert both to 16 bit. + const __m128i src_0_16b = _mm_unpacklo_epi8(src_0, zero); + const __m128i src_1_16b = _mm_unpacklo_epi8(src_1, zero); + const __m128i ref_0_16b = _mm_unpacklo_epi8(ref_0, zero); + const __m128i ref_1_16b = _mm_unpacklo_epi8(ref_1, zero); + + // Compute the difference. + const __m128i row01 = _mm_sub_epi16(src_0_16b, ref_0_16b); + const __m128i row23 = _mm_sub_epi16(src_1_16b, ref_1_16b); __m128i v01, v32; // First pass - FTransformPass1(&shuf01, &shuf23, &v01, &v32); + FTransformPass1(&row01, &row23, &v01, &v32); // Second pass FTransformPass2(&v01, &v32, out); @@ -463,8 +382,7 @@ static void FTransform2(const uint8_t* src, const uint8_t* ref, int16_t* out) { } static void FTransformWHTRow(const int16_t* const in, __m128i* const out) { - const __m128i kMult1 = _mm_set_epi16(0, 0, 0, 0, 1, 1, 1, 1); - const __m128i kMult2 = _mm_set_epi16(0, 0, 0, 0, -1, 1, -1, 1); + const __m128i kMult = _mm_set_epi16(-1, 1, -1, 1, 1, 1, 1, 1); const __m128i src0 = _mm_loadl_epi64((__m128i*)&in[0 * 16]); const __m128i src1 = _mm_loadl_epi64((__m128i*)&in[1 * 16]); const __m128i src2 = _mm_loadl_epi64((__m128i*)&in[2 * 16]); @@ -473,33 +391,38 @@ static void FTransformWHTRow(const int16_t* const in, __m128i* const out) { const __m128i A23 = _mm_unpacklo_epi16(src2, src3); // A2 A3 | ... const __m128i B0 = _mm_adds_epi16(A01, A23); // a0 | a1 | ... const __m128i B1 = _mm_subs_epi16(A01, A23); // a3 | a2 | ... 
- const __m128i C0 = _mm_unpacklo_epi32(B0, B1); // a0 | a1 | a3 | a2 - const __m128i C1 = _mm_unpacklo_epi32(B1, B0); // a3 | a2 | a0 | a1 - const __m128i D0 = _mm_madd_epi16(C0, kMult1); // out0, out1 - const __m128i D1 = _mm_madd_epi16(C1, kMult2); // out2, out3 - *out = _mm_unpacklo_epi64(D0, D1); + const __m128i C0 = _mm_unpacklo_epi32(B0, B1); // a0 | a1 | a3 | a2 | ... + const __m128i C1 = _mm_unpacklo_epi32(B1, B0); // a3 | a2 | a0 | a1 | ... + const __m128i D = _mm_unpacklo_epi64(C0, C1); // a0 a1 a3 a2 a3 a2 a0 a1 + *out = _mm_madd_epi16(D, kMult); } static void FTransformWHT(const int16_t* in, int16_t* out) { + // Input is 12b signed. __m128i row0, row1, row2, row3; + // Rows are 14b signed. FTransformWHTRow(in + 0 * 64, &row0); FTransformWHTRow(in + 1 * 64, &row1); FTransformWHTRow(in + 2 * 64, &row2); FTransformWHTRow(in + 3 * 64, &row3); { + // The a* are 15b signed. const __m128i a0 = _mm_add_epi32(row0, row2); const __m128i a1 = _mm_add_epi32(row1, row3); const __m128i a2 = _mm_sub_epi32(row1, row3); const __m128i a3 = _mm_sub_epi32(row0, row2); - const __m128i b0 = _mm_srai_epi32(_mm_add_epi32(a0, a1), 1); - const __m128i b1 = _mm_srai_epi32(_mm_add_epi32(a3, a2), 1); - const __m128i b2 = _mm_srai_epi32(_mm_sub_epi32(a3, a2), 1); - const __m128i b3 = _mm_srai_epi32(_mm_sub_epi32(a0, a1), 1); - const __m128i out0 = _mm_packs_epi32(b0, b1); - const __m128i out1 = _mm_packs_epi32(b2, b3); - _mm_storeu_si128((__m128i*)&out[0], out0); - _mm_storeu_si128((__m128i*)&out[8], out1); + const __m128i a0a3 = _mm_packs_epi32(a0, a3); + const __m128i a1a2 = _mm_packs_epi32(a1, a2); + + // The b* are 16b signed. + const __m128i b0b1 = _mm_add_epi16(a0a3, a1a2); + const __m128i b3b2 = _mm_sub_epi16(a0a3, a1a2); + const __m128i tmp_b2b3 = _mm_unpackhi_epi64(b3b2, b3b2); + const __m128i b2b3 = _mm_unpacklo_epi64(tmp_b2b3, b3b2); + + _mm_storeu_si128((__m128i*)&out[0], _mm_srai_epi16(b0b1, 1)); + _mm_storeu_si128((__m128i*)&out[8], _mm_srai_epi16(b2b3, 1)); } } @@ -692,12 +615,10 @@ static WEBP_INLINE void TrueMotion(uint8_t* dst, const uint8_t* left, static WEBP_INLINE void DC8uv(uint8_t* dst, const uint8_t* left, const uint8_t* top) { - const __m128i zero = _mm_setzero_si128(); const __m128i top_values = _mm_loadl_epi64((const __m128i*)top); const __m128i left_values = _mm_loadl_epi64((const __m128i*)left); - const __m128i sum_top = _mm_sad_epu8(top_values, zero); - const __m128i sum_left = _mm_sad_epu8(left_values, zero); - const int DC = _mm_cvtsi128_si32(sum_top) + _mm_cvtsi128_si32(sum_left) + 8; + const __m128i combined = _mm_unpacklo_epi64(top_values, left_values); + const int DC = VP8HorizontalAdd8b(&combined) + 8; Put8x8uv(DC >> 4, dst); } @@ -735,27 +656,16 @@ static WEBP_INLINE void DC8uvMode(uint8_t* dst, const uint8_t* left, static WEBP_INLINE void DC16(uint8_t* dst, const uint8_t* left, const uint8_t* top) { - const __m128i zero = _mm_setzero_si128(); const __m128i top_row = _mm_load_si128((const __m128i*)top); const __m128i left_row = _mm_load_si128((const __m128i*)left); - const __m128i sad8x2 = _mm_sad_epu8(top_row, zero); - // sum the two sads: sad8x2[0:1] + sad8x2[8:9] - const __m128i sum_top = _mm_add_epi16(sad8x2, _mm_shuffle_epi32(sad8x2, 2)); - const __m128i sad8x2_left = _mm_sad_epu8(left_row, zero); - // sum the two sads: sad8x2[0:1] + sad8x2[8:9] - const __m128i sum_left = - _mm_add_epi16(sad8x2_left, _mm_shuffle_epi32(sad8x2_left, 2)); - const int DC = _mm_cvtsi128_si32(sum_top) + _mm_cvtsi128_si32(sum_left) + 16; + const int DC = + VP8HorizontalAdd8b(&top_row) + 
VP8HorizontalAdd8b(&left_row) + 16; Put16(DC >> 5, dst); } static WEBP_INLINE void DC16NoLeft(uint8_t* dst, const uint8_t* top) { - const __m128i zero = _mm_setzero_si128(); const __m128i top_row = _mm_load_si128((const __m128i*)top); - const __m128i sad8x2 = _mm_sad_epu8(top_row, zero); - // sum the two sads: sad8x2[0:1] + sad8x2[8:9] - const __m128i sum = _mm_add_epi16(sad8x2, _mm_shuffle_epi32(sad8x2, 2)); - const int DC = _mm_cvtsi128_si32(sum) + 8; + const int DC = VP8HorizontalAdd8b(&top_row) + 8; Put16(DC >> 4, dst); } @@ -1142,15 +1052,15 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) { // reconstructed samples. // Hadamard transform -// Returns the difference between the weighted sum of the absolute value of -// transformed coefficients. +// Returns the weighted sum of the absolute value of transformed coefficients. +// w[] contains a row-major 4 by 4 symmetric matrix. static int TTransform(const uint8_t* inA, const uint8_t* inB, const uint16_t* const w) { int32_t sum[4]; __m128i tmp_0, tmp_1, tmp_2, tmp_3; const __m128i zero = _mm_setzero_si128(); - // Load, combine and transpose inputs. + // Load and combine inputs. { const __m128i inA_0 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 0]); const __m128i inA_1 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 1]); @@ -1162,37 +1072,22 @@ static int TTransform(const uint8_t* inA, const uint8_t* inB, const __m128i inB_3 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 3]); // Combine inA and inB (we'll do two transforms in parallel). - const __m128i inAB_0 = _mm_unpacklo_epi8(inA_0, inB_0); - const __m128i inAB_1 = _mm_unpacklo_epi8(inA_1, inB_1); - const __m128i inAB_2 = _mm_unpacklo_epi8(inA_2, inB_2); - const __m128i inAB_3 = _mm_unpacklo_epi8(inA_3, inB_3); - // a00 b00 a01 b01 a02 b03 a03 b03 0 0 0 0 0 0 0 0 - // a10 b10 a11 b11 a12 b12 a13 b13 0 0 0 0 0 0 0 0 - // a20 b20 a21 b21 a22 b22 a23 b23 0 0 0 0 0 0 0 0 - // a30 b30 a31 b31 a32 b32 a33 b33 0 0 0 0 0 0 0 0 - - // Transpose the two 4x4, discarding the filling zeroes. - const __m128i transpose0_0 = _mm_unpacklo_epi8(inAB_0, inAB_2); - const __m128i transpose0_1 = _mm_unpacklo_epi8(inAB_1, inAB_3); - // a00 a20 b00 b20 a01 a21 b01 b21 a02 a22 b02 b22 a03 a23 b03 b23 - // a10 a30 b10 b30 a11 a31 b11 b31 a12 a32 b12 b32 a13 a33 b13 b33 - const __m128i transpose1_0 = _mm_unpacklo_epi8(transpose0_0, transpose0_1); - const __m128i transpose1_1 = _mm_unpackhi_epi8(transpose0_0, transpose0_1); - // a00 a10 a20 a30 b00 b10 b20 b30 a01 a11 a21 a31 b01 b11 b21 b31 - // a02 a12 a22 a32 b02 b12 b22 b32 a03 a13 a23 a33 b03 b13 b23 b33 - - // Convert to 16b. 
- tmp_0 = _mm_unpacklo_epi8(transpose1_0, zero); - tmp_1 = _mm_unpackhi_epi8(transpose1_0, zero); - tmp_2 = _mm_unpacklo_epi8(transpose1_1, zero); - tmp_3 = _mm_unpackhi_epi8(transpose1_1, zero); - // a00 a10 a20 a30 b00 b10 b20 b30 - // a01 a11 a21 a31 b01 b11 b21 b31 - // a02 a12 a22 a32 b02 b12 b22 b32 - // a03 a13 a23 a33 b03 b13 b23 b33 + const __m128i inAB_0 = _mm_unpacklo_epi32(inA_0, inB_0); + const __m128i inAB_1 = _mm_unpacklo_epi32(inA_1, inB_1); + const __m128i inAB_2 = _mm_unpacklo_epi32(inA_2, inB_2); + const __m128i inAB_3 = _mm_unpacklo_epi32(inA_3, inB_3); + tmp_0 = _mm_unpacklo_epi8(inAB_0, zero); + tmp_1 = _mm_unpacklo_epi8(inAB_1, zero); + tmp_2 = _mm_unpacklo_epi8(inAB_2, zero); + tmp_3 = _mm_unpacklo_epi8(inAB_3, zero); + // a00 a01 a02 a03 b00 b01 b02 b03 + // a10 a11 a12 a13 b10 b11 b12 b13 + // a20 a21 a22 a23 b20 b21 b22 b23 + // a30 a31 a32 a33 b30 b31 b32 b33 } - // Horizontal pass and subsequent transpose. + // Vertical pass first to avoid a transpose (vertical and horizontal passes + // are commutative because w/kWeightY is symmetric) and subsequent transpose. { // Calculate a and b (two 4x4 at once). const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2); @@ -1209,33 +1104,10 @@ static int TTransform(const uint8_t* inA, const uint8_t* inB, // a30 a31 a32 a33 b30 b31 b32 b33 // Transpose the two 4x4. - const __m128i transpose0_0 = _mm_unpacklo_epi16(b0, b1); - const __m128i transpose0_1 = _mm_unpacklo_epi16(b2, b3); - const __m128i transpose0_2 = _mm_unpackhi_epi16(b0, b1); - const __m128i transpose0_3 = _mm_unpackhi_epi16(b2, b3); - // a00 a10 a01 a11 a02 a12 a03 a13 - // a20 a30 a21 a31 a22 a32 a23 a33 - // b00 b10 b01 b11 b02 b12 b03 b13 - // b20 b30 b21 b31 b22 b32 b23 b33 - const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1); - const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3); - const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1); - const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3); - // a00 a10 a20 a30 a01 a11 a21 a31 - // b00 b10 b20 b30 b01 b11 b21 b31 - // a02 a12 a22 a32 a03 a13 a23 a33 - // b02 b12 a22 b32 b03 b13 b23 b33 - tmp_0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1); - tmp_1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1); - tmp_2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3); - tmp_3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3); - // a00 a10 a20 a30 b00 b10 b20 b30 - // a01 a11 a21 a31 b01 b11 b21 b31 - // a02 a12 a22 a32 b02 b12 b22 b32 - // a03 a13 a23 a33 b03 b13 b23 b33 + VP8Transpose_2_4x4_16b(&b0, &b1, &b2, &b3, &tmp_0, &tmp_1, &tmp_2, &tmp_3); } - // Vertical pass and difference of weighted sums. + // Horizontal pass and difference of weighted sums. { // Load all inputs. const __m128i w_0 = _mm_loadu_si128((const __m128i*)&w[0]); diff --git a/src/3rdparty/libwebp/src/dsp/enc_sse41.c b/src/3rdparty/libwebp/src/dsp/enc_sse41.c index 65c01ae..a178390 100644 --- a/src/3rdparty/libwebp/src/dsp/enc_sse41.c +++ b/src/3rdparty/libwebp/src/dsp/enc_sse41.c @@ -17,6 +17,7 @@ #include <smmintrin.h> #include <stdlib.h> // for abs() +#include "./common_sse2.h" #include "../enc/vp8enci.h" //------------------------------------------------------------------------------ @@ -67,55 +68,45 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred, // reconstructed samples. // Hadamard transform -// Returns the difference between the weighted sum of the absolute value of -// transformed coefficients. 
+// Returns the weighted sum of the absolute value of transformed coefficients. +// w[] contains a row-major 4 by 4 symmetric matrix. static int TTransform(const uint8_t* inA, const uint8_t* inB, const uint16_t* const w) { + int32_t sum[4]; __m128i tmp_0, tmp_1, tmp_2, tmp_3; - // Load, combine and transpose inputs. + // Load and combine inputs. { - const __m128i inA_0 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 0]); - const __m128i inA_1 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 1]); - const __m128i inA_2 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 2]); + const __m128i inA_0 = _mm_loadu_si128((const __m128i*)&inA[BPS * 0]); + const __m128i inA_1 = _mm_loadu_si128((const __m128i*)&inA[BPS * 1]); + const __m128i inA_2 = _mm_loadu_si128((const __m128i*)&inA[BPS * 2]); + // In SSE4.1, with gcc 4.8 at least (maybe other versions), + // _mm_loadu_si128 is faster than _mm_loadl_epi64. But for the last lump + // of inA and inB, _mm_loadl_epi64 is still used not to have an out of + // bound read. const __m128i inA_3 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 3]); - const __m128i inB_0 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 0]); - const __m128i inB_1 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 1]); - const __m128i inB_2 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 2]); + const __m128i inB_0 = _mm_loadu_si128((const __m128i*)&inB[BPS * 0]); + const __m128i inB_1 = _mm_loadu_si128((const __m128i*)&inB[BPS * 1]); + const __m128i inB_2 = _mm_loadu_si128((const __m128i*)&inB[BPS * 2]); const __m128i inB_3 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 3]); // Combine inA and inB (we'll do two transforms in parallel). - const __m128i inAB_0 = _mm_unpacklo_epi8(inA_0, inB_0); - const __m128i inAB_1 = _mm_unpacklo_epi8(inA_1, inB_1); - const __m128i inAB_2 = _mm_unpacklo_epi8(inA_2, inB_2); - const __m128i inAB_3 = _mm_unpacklo_epi8(inA_3, inB_3); - // a00 b00 a01 b01 a02 b03 a03 b03 0 0 0 0 0 0 0 0 - // a10 b10 a11 b11 a12 b12 a13 b13 0 0 0 0 0 0 0 0 - // a20 b20 a21 b21 a22 b22 a23 b23 0 0 0 0 0 0 0 0 - // a30 b30 a31 b31 a32 b32 a33 b33 0 0 0 0 0 0 0 0 - - // Transpose the two 4x4, discarding the filling zeroes. - const __m128i transpose0_0 = _mm_unpacklo_epi8(inAB_0, inAB_2); - const __m128i transpose0_1 = _mm_unpacklo_epi8(inAB_1, inAB_3); - // a00 a20 b00 b20 a01 a21 b01 b21 a02 a22 b02 b22 a03 a23 b03 b23 - // a10 a30 b10 b30 a11 a31 b11 b31 a12 a32 b12 b32 a13 a33 b13 b33 - const __m128i transpose1_0 = _mm_unpacklo_epi8(transpose0_0, transpose0_1); - const __m128i transpose1_1 = _mm_unpackhi_epi8(transpose0_0, transpose0_1); - // a00 a10 a20 a30 b00 b10 b20 b30 a01 a11 a21 a31 b01 b11 b21 b31 - // a02 a12 a22 a32 b02 b12 b22 b32 a03 a13 a23 a33 b03 b13 b23 b33 - - // Convert to 16b. 
- tmp_0 = _mm_cvtepu8_epi16(transpose1_0); - tmp_1 = _mm_cvtepu8_epi16(_mm_srli_si128(transpose1_0, 8)); - tmp_2 = _mm_cvtepu8_epi16(transpose1_1); - tmp_3 = _mm_cvtepu8_epi16(_mm_srli_si128(transpose1_1, 8)); - // a00 a10 a20 a30 b00 b10 b20 b30 - // a01 a11 a21 a31 b01 b11 b21 b31 - // a02 a12 a22 a32 b02 b12 b22 b32 - // a03 a13 a23 a33 b03 b13 b23 b33 + const __m128i inAB_0 = _mm_unpacklo_epi32(inA_0, inB_0); + const __m128i inAB_1 = _mm_unpacklo_epi32(inA_1, inB_1); + const __m128i inAB_2 = _mm_unpacklo_epi32(inA_2, inB_2); + const __m128i inAB_3 = _mm_unpacklo_epi32(inA_3, inB_3); + tmp_0 = _mm_cvtepu8_epi16(inAB_0); + tmp_1 = _mm_cvtepu8_epi16(inAB_1); + tmp_2 = _mm_cvtepu8_epi16(inAB_2); + tmp_3 = _mm_cvtepu8_epi16(inAB_3); + // a00 a01 a02 a03 b00 b01 b02 b03 + // a10 a11 a12 a13 b10 b11 b12 b13 + // a20 a21 a22 a23 b20 b21 b22 b23 + // a30 a31 a32 a33 b30 b31 b32 b33 } - // Horizontal pass and subsequent transpose. + // Vertical pass first to avoid a transpose (vertical and horizontal passes + // are commutative because w/kWeightY is symmetric) and subsequent transpose. { // Calculate a and b (two 4x4 at once). const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2); @@ -132,33 +123,10 @@ static int TTransform(const uint8_t* inA, const uint8_t* inB, // a30 a31 a32 a33 b30 b31 b32 b33 // Transpose the two 4x4. - const __m128i transpose0_0 = _mm_unpacklo_epi16(b0, b1); - const __m128i transpose0_1 = _mm_unpacklo_epi16(b2, b3); - const __m128i transpose0_2 = _mm_unpackhi_epi16(b0, b1); - const __m128i transpose0_3 = _mm_unpackhi_epi16(b2, b3); - // a00 a10 a01 a11 a02 a12 a03 a13 - // a20 a30 a21 a31 a22 a32 a23 a33 - // b00 b10 b01 b11 b02 b12 b03 b13 - // b20 b30 b21 b31 b22 b32 b23 b33 - const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1); - const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3); - const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1); - const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3); - // a00 a10 a20 a30 a01 a11 a21 a31 - // b00 b10 b20 b30 b01 b11 b21 b31 - // a02 a12 a22 a32 a03 a13 a23 a33 - // b02 b12 a22 b32 b03 b13 b23 b33 - tmp_0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1); - tmp_1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1); - tmp_2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3); - tmp_3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3); - // a00 a10 a20 a30 b00 b10 b20 b30 - // a01 a11 a21 a31 b01 b11 b21 b31 - // a02 a12 a22 a32 b02 b12 b22 b32 - // a03 a13 a23 a33 b03 b13 b23 b33 + VP8Transpose_2_4x4_16b(&b0, &b1, &b2, &b3, &tmp_0, &tmp_1, &tmp_2, &tmp_3); } - // Vertical pass and difference of weighted sums. + // Horizontal pass and difference of weighted sums. { // Load all inputs. 
const __m128i w_0 = _mm_loadu_si128((const __m128i*)&w[0]); @@ -195,11 +163,9 @@ static int TTransform(const uint8_t* inA, const uint8_t* inB, // difference of weighted sums A_b2 = _mm_sub_epi32(A_b0, B_b0); - // cascading summation of the differences - B_b0 = _mm_hadd_epi32(A_b2, A_b2); - B_b2 = _mm_hadd_epi32(B_b0, B_b0); - return _mm_cvtsi128_si32(B_b2); + _mm_storeu_si128((__m128i*)&sum[0], A_b2); } + return sum[0] + sum[1] + sum[2] + sum[3]; } static int Disto4x4(const uint8_t* const a, const uint8_t* const b, diff --git a/src/3rdparty/libwebp/src/dsp/filters.c b/src/3rdparty/libwebp/src/dsp/filters.c index 5c30f2e..9f04faf 100644 --- a/src/3rdparty/libwebp/src/dsp/filters.c +++ b/src/3rdparty/libwebp/src/dsp/filters.c @@ -184,19 +184,40 @@ static void GradientFilter(const uint8_t* data, int width, int height, //------------------------------------------------------------------------------ -static void VerticalUnfilter(int width, int height, int stride, int row, - int num_rows, uint8_t* data) { - DoVerticalFilter(data, width, height, stride, row, num_rows, 1, data); +static void HorizontalUnfilter(const uint8_t* prev, const uint8_t* in, + uint8_t* out, int width) { + uint8_t pred = (prev == NULL) ? 0 : prev[0]; + int i; + for (i = 0; i < width; ++i) { + out[i] = pred + in[i]; + pred = out[i]; + } } -static void HorizontalUnfilter(int width, int height, int stride, int row, - int num_rows, uint8_t* data) { - DoHorizontalFilter(data, width, height, stride, row, num_rows, 1, data); +static void VerticalUnfilter(const uint8_t* prev, const uint8_t* in, + uint8_t* out, int width) { + if (prev == NULL) { + HorizontalUnfilter(NULL, in, out, width); + } else { + int i; + for (i = 0; i < width; ++i) out[i] = prev[i] + in[i]; + } } -static void GradientUnfilter(int width, int height, int stride, int row, - int num_rows, uint8_t* data) { - DoGradientFilter(data, width, height, stride, row, num_rows, 1, data); +static void GradientUnfilter(const uint8_t* prev, const uint8_t* in, + uint8_t* out, int width) { + if (prev == NULL) { + HorizontalUnfilter(NULL, in, out, width); + } else { + uint8_t top = prev[0], top_left = top, left = top; + int i; + for (i = 0; i < width; ++i) { + top = prev[i]; // need to read this first, in case prev==out + left = in[i] + GradientPredictor(left, top, top_left); + top_left = top; + out[i] = left; + } + } } //------------------------------------------------------------------------------ diff --git a/src/3rdparty/libwebp/src/dsp/filters_mips_dsp_r2.c b/src/3rdparty/libwebp/src/dsp/filters_mips_dsp_r2.c index 8134af5..1d82e3c 100644 --- a/src/3rdparty/libwebp/src/dsp/filters_mips_dsp_r2.c +++ b/src/3rdparty/libwebp/src/dsp/filters_mips_dsp_r2.c @@ -33,10 +33,6 @@ assert(row >= 0 && num_rows > 0 && row + num_rows <= height); \ (void)height; // Silence unused warning. 
-// if INVERSE -// preds == &dst[-1] == &src[-1] -// else -// preds == &src[-1] != &dst[-1] #define DO_PREDICT_LINE(SRC, DST, LENGTH, INVERSE) do { \ const uint8_t* psrc = (uint8_t*)(SRC); \ uint8_t* pdst = (uint8_t*)(DST); \ @@ -45,27 +41,28 @@ __asm__ volatile ( \ ".set push \n\t" \ ".set noreorder \n\t" \ - "srl %[temp0], %[length], 0x2 \n\t" \ + "srl %[temp0], %[length], 2 \n\t" \ "beqz %[temp0], 4f \n\t" \ - " andi %[temp6], %[length], 0x3 \n\t" \ + " andi %[temp6], %[length], 3 \n\t" \ ".if " #INVERSE " \n\t" \ - "lbu %[temp1], -1(%[src]) \n\t" \ "1: \n\t" \ + "lbu %[temp1], -1(%[dst]) \n\t" \ "lbu %[temp2], 0(%[src]) \n\t" \ "lbu %[temp3], 1(%[src]) \n\t" \ "lbu %[temp4], 2(%[src]) \n\t" \ "lbu %[temp5], 3(%[src]) \n\t" \ + "addu %[temp1], %[temp1], %[temp2] \n\t" \ + "addu %[temp2], %[temp1], %[temp3] \n\t" \ + "addu %[temp3], %[temp2], %[temp4] \n\t" \ + "addu %[temp4], %[temp3], %[temp5] \n\t" \ + "sb %[temp1], 0(%[dst]) \n\t" \ + "sb %[temp2], 1(%[dst]) \n\t" \ + "sb %[temp3], 2(%[dst]) \n\t" \ + "sb %[temp4], 3(%[dst]) \n\t" \ "addiu %[src], %[src], 4 \n\t" \ "addiu %[temp0], %[temp0], -1 \n\t" \ - "addu %[temp2], %[temp2], %[temp1] \n\t" \ - "addu %[temp3], %[temp3], %[temp2] \n\t" \ - "addu %[temp4], %[temp4], %[temp3] \n\t" \ - "addu %[temp1], %[temp5], %[temp4] \n\t" \ - "sb %[temp2], -4(%[src]) \n\t" \ - "sb %[temp3], -3(%[src]) \n\t" \ - "sb %[temp4], -2(%[src]) \n\t" \ "bnez %[temp0], 1b \n\t" \ - " sb %[temp1], -1(%[src]) \n\t" \ + " addiu %[dst], %[dst], 4 \n\t" \ ".else \n\t" \ "1: \n\t" \ "ulw %[temp1], -1(%[src]) \n\t" \ @@ -81,16 +78,16 @@ "beqz %[temp6], 3f \n\t" \ " nop \n\t" \ "2: \n\t" \ - "lbu %[temp1], -1(%[src]) \n\t" \ "lbu %[temp2], 0(%[src]) \n\t" \ - "addiu %[src], %[src], 1 \n\t" \ ".if " #INVERSE " \n\t" \ + "lbu %[temp1], -1(%[dst]) \n\t" \ "addu %[temp3], %[temp1], %[temp2] \n\t" \ - "sb %[temp3], -1(%[src]) \n\t" \ ".else \n\t" \ + "lbu %[temp1], -1(%[src]) \n\t" \ "subu %[temp3], %[temp1], %[temp2] \n\t" \ - "sb %[temp3], 0(%[dst]) \n\t" \ ".endif \n\t" \ + "addiu %[src], %[src], 1 \n\t" \ + "sb %[temp3], 0(%[dst]) \n\t" \ "addiu %[temp6], %[temp6], -1 \n\t" \ "bnez %[temp6], 2b \n\t" \ " addiu %[dst], %[dst], 1 \n\t" \ @@ -105,12 +102,8 @@ } while (0) static WEBP_INLINE void PredictLine(const uint8_t* src, uint8_t* dst, - int length, int inverse) { - if (inverse) { - DO_PREDICT_LINE(src, dst, length, 1); - } else { - DO_PREDICT_LINE(src, dst, length, 0); - } + int length) { + DO_PREDICT_LINE(src, dst, length, 0); } #define DO_PREDICT_LINE_VERTICAL(SRC, PRED, DST, LENGTH, INVERSE) do { \ @@ -172,16 +165,12 @@ static WEBP_INLINE void PredictLine(const uint8_t* src, uint8_t* dst, ); \ } while (0) -#define PREDICT_LINE_ONE_PASS(SRC, PRED, DST, INVERSE) do { \ +#define PREDICT_LINE_ONE_PASS(SRC, PRED, DST) do { \ int temp1, temp2, temp3; \ __asm__ volatile ( \ "lbu %[temp1], 0(%[src]) \n\t" \ "lbu %[temp2], 0(%[pred]) \n\t" \ - ".if " #INVERSE " \n\t" \ - "addu %[temp3], %[temp1], %[temp2] \n\t" \ - ".else \n\t" \ "subu %[temp3], %[temp1], %[temp2] \n\t" \ - ".endif \n\t" \ "sb %[temp3], 0(%[dst]) \n\t" \ : [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), [temp3]"=&r"(temp3) \ : [pred]"r"((PRED)), [dst]"r"((DST)), [src]"r"((SRC)) \ @@ -192,10 +181,10 @@ static WEBP_INLINE void PredictLine(const uint8_t* src, uint8_t* dst, //------------------------------------------------------------------------------ // Horizontal filter. 
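In the rewritten INVERSE branch above, the running predictor is now read back from dst (the already reconstructed output) rather than from src. Stripped of the 4-pixel unrolling, the recurrence the assembly implements is the plain left unfilter sketched below; dst[-1] is assumed to hold the seed pixel written by the caller:

static void PredictLineInverse_C(const uint8_t* src, uint8_t* dst, int length) {
  int i;
  for (i = 0; i < length; ++i) {
    dst[i] = (uint8_t)(dst[i - 1] + src[i]);   // dst[-1] is the seed pixel (assumed valid)
  }
}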
-#define FILTER_LINE_BY_LINE(INVERSE) do { \ +#define FILTER_LINE_BY_LINE do { \ while (row < last_row) { \ - PREDICT_LINE_ONE_PASS(in, preds - stride, out, INVERSE); \ - DO_PREDICT_LINE(in + 1, out + 1, width - 1, INVERSE); \ + PREDICT_LINE_ONE_PASS(in, preds - stride, out); \ + DO_PREDICT_LINE(in + 1, out + 1, width - 1, 0); \ ++row; \ preds += stride; \ in += stride; \ @@ -206,19 +195,19 @@ static WEBP_INLINE void PredictLine(const uint8_t* src, uint8_t* dst, static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in, int width, int height, int stride, int row, int num_rows, - int inverse, uint8_t* out) { + uint8_t* out) { const uint8_t* preds; const size_t start_offset = row * stride; const int last_row = row + num_rows; SANITY_CHECK(in, out); in += start_offset; out += start_offset; - preds = inverse ? out : in; + preds = in; if (row == 0) { // Leftmost pixel is the same as input for topmost scanline. out[0] = in[0]; - PredictLine(in + 1, out + 1, width - 1, inverse); + PredictLine(in + 1, out + 1, width - 1); row = 1; preds += stride; in += stride; @@ -226,31 +215,21 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in, } // Filter line-by-line. - if (inverse) { - FILTER_LINE_BY_LINE(1); - } else { - FILTER_LINE_BY_LINE(0); - } + FILTER_LINE_BY_LINE; } - #undef FILTER_LINE_BY_LINE static void HorizontalFilter(const uint8_t* data, int width, int height, int stride, uint8_t* filtered_data) { - DoHorizontalFilter(data, width, height, stride, 0, height, 0, filtered_data); -} - -static void HorizontalUnfilter(int width, int height, int stride, int row, - int num_rows, uint8_t* data) { - DoHorizontalFilter(data, width, height, stride, row, num_rows, 1, data); + DoHorizontalFilter(data, width, height, stride, 0, height, filtered_data); } //------------------------------------------------------------------------------ // Vertical filter. -#define FILTER_LINE_BY_LINE(INVERSE) do { \ +#define FILTER_LINE_BY_LINE do { \ while (row < last_row) { \ - DO_PREDICT_LINE_VERTICAL(in, preds, out, width, INVERSE); \ + DO_PREDICT_LINE_VERTICAL(in, preds, out, width, 0); \ ++row; \ preds += stride; \ in += stride; \ @@ -260,21 +239,20 @@ static void HorizontalUnfilter(int width, int height, int stride, int row, static WEBP_INLINE void DoVerticalFilter(const uint8_t* in, int width, int height, int stride, - int row, int num_rows, - int inverse, uint8_t* out) { + int row, int num_rows, uint8_t* out) { const uint8_t* preds; const size_t start_offset = row * stride; const int last_row = row + num_rows; SANITY_CHECK(in, out); in += start_offset; out += start_offset; - preds = inverse ? out : in; + preds = in; if (row == 0) { // Very first top-left pixel is copied. out[0] = in[0]; // Rest of top scan-line is left-predicted. - PredictLine(in + 1, out + 1, width - 1, inverse); + PredictLine(in + 1, out + 1, width - 1); row = 1; in += stride; out += stride; @@ -284,24 +262,13 @@ static WEBP_INLINE void DoVerticalFilter(const uint8_t* in, } // Filter line-by-line. 
- if (inverse) { - FILTER_LINE_BY_LINE(1); - } else { - FILTER_LINE_BY_LINE(0); - } + FILTER_LINE_BY_LINE; } - #undef FILTER_LINE_BY_LINE -#undef DO_PREDICT_LINE_VERTICAL static void VerticalFilter(const uint8_t* data, int width, int height, int stride, uint8_t* filtered_data) { - DoVerticalFilter(data, width, height, stride, 0, height, 0, filtered_data); -} - -static void VerticalUnfilter(int width, int height, int stride, int row, - int num_rows, uint8_t* data) { - DoVerticalFilter(data, width, height, stride, row, num_rows, 1, data); + DoVerticalFilter(data, width, height, stride, 0, height, filtered_data); } //------------------------------------------------------------------------------ @@ -321,10 +288,10 @@ static WEBP_INLINE int GradientPredictor(uint8_t a, uint8_t b, uint8_t c) { return temp0; } -#define FILTER_LINE_BY_LINE(INVERSE, PREDS, OPERATION) do { \ +#define FILTER_LINE_BY_LINE(PREDS, OPERATION) do { \ while (row < last_row) { \ int w; \ - PREDICT_LINE_ONE_PASS(in, PREDS - stride, out, INVERSE); \ + PREDICT_LINE_ONE_PASS(in, PREDS - stride, out); \ for (w = 1; w < width; ++w) { \ const int pred = GradientPredictor(PREDS[w - 1], \ PREDS[w - stride], \ @@ -339,20 +306,19 @@ static WEBP_INLINE int GradientPredictor(uint8_t a, uint8_t b, uint8_t c) { static WEBP_INLINE void DoGradientFilter(const uint8_t* in, int width, int height, int stride, - int row, int num_rows, - int inverse, uint8_t* out) { + int row, int num_rows, uint8_t* out) { const uint8_t* preds; const size_t start_offset = row * stride; const int last_row = row + num_rows; SANITY_CHECK(in, out); in += start_offset; out += start_offset; - preds = inverse ? out : in; + preds = in; // left prediction for top scan-line if (row == 0) { out[0] = in[0]; - PredictLine(in + 1, out + 1, width - 1, inverse); + PredictLine(in + 1, out + 1, width - 1); row = 1; preds += stride; in += stride; @@ -360,25 +326,49 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in, } // Filter line-by-line. - if (inverse) { - FILTER_LINE_BY_LINE(1, out, +); - } else { - FILTER_LINE_BY_LINE(0, in, -); - } + FILTER_LINE_BY_LINE(in, -); } - #undef FILTER_LINE_BY_LINE static void GradientFilter(const uint8_t* data, int width, int height, int stride, uint8_t* filtered_data) { - DoGradientFilter(data, width, height, stride, 0, height, 0, filtered_data); + DoGradientFilter(data, width, height, stride, 0, height, filtered_data); } -static void GradientUnfilter(int width, int height, int stride, int row, - int num_rows, uint8_t* data) { - DoGradientFilter(data, width, height, stride, row, num_rows, 1, data); +//------------------------------------------------------------------------------ + +static void HorizontalUnfilter(const uint8_t* prev, const uint8_t* in, + uint8_t* out, int width) { + out[0] = in[0] + (prev == NULL ? 
0 : prev[0]); + DO_PREDICT_LINE(in + 1, out + 1, width - 1, 1); } +static void VerticalUnfilter(const uint8_t* prev, const uint8_t* in, + uint8_t* out, int width) { + if (prev == NULL) { + HorizontalUnfilter(NULL, in, out, width); + } else { + DO_PREDICT_LINE_VERTICAL(in, prev, out, width, 1); + } +} + +static void GradientUnfilter(const uint8_t* prev, const uint8_t* in, + uint8_t* out, int width) { + if (prev == NULL) { + HorizontalUnfilter(NULL, in, out, width); + } else { + uint8_t top = prev[0], top_left = top, left = top; + int i; + for (i = 0; i < width; ++i) { + top = prev[i]; // need to read this first, in case prev==dst + left = in[i] + GradientPredictor(left, top, top_left); + top_left = top; + out[i] = left; + } + } +} + +#undef DO_PREDICT_LINE_VERTICAL #undef PREDICT_LINE_ONE_PASS #undef DO_PREDICT_LINE #undef SANITY_CHECK @@ -389,13 +379,13 @@ static void GradientUnfilter(int width, int height, int stride, int row, extern void VP8FiltersInitMIPSdspR2(void); WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitMIPSdspR2(void) { - WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter; - WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter; - WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter; - WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter; WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter; WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter; + + WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter; + WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter; + WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter; } #else // !WEBP_USE_MIPS_DSP_R2 diff --git a/src/3rdparty/libwebp/src/dsp/filters_sse2.c b/src/3rdparty/libwebp/src/dsp/filters_sse2.c index bf93342..67f7799 100644 --- a/src/3rdparty/libwebp/src/dsp/filters_sse2.c +++ b/src/3rdparty/libwebp/src/dsp/filters_sse2.c @@ -33,82 +33,39 @@ (void)height; // Silence unused warning. 
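With the unfilters now registered under the per-row prototype (prev, in, out, width), a caller is expected to walk the image one scanline at a time and pass NULL for the row above the topmost scanline. A rough driver sketch, assuming the WebPUnfilterFunc pointer type matches that prototype and that in-place operation (out == in) is allowed, as the comments above suggest:

static void ApplyUnfilter(WebPUnfilterFunc unfilter, uint8_t* data,
                          int width, int height, int stride) {
  const uint8_t* prev = NULL;                  // no row above the first scanline
  int y;
  for (y = 0; y < height; ++y) {
    uint8_t* const row = data + (size_t)y * stride;
    unfilter(prev, row, row, width);           // reconstruct the row in place
    prev = row;                                // reconstructed row predicts the next one
  }
}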
static void PredictLineTop(const uint8_t* src, const uint8_t* pred, - uint8_t* dst, int length, int inverse) { + uint8_t* dst, int length) { int i; const int max_pos = length & ~31; assert(length >= 0); - if (inverse) { - for (i = 0; i < max_pos; i += 32) { - const __m128i A0 = _mm_loadu_si128((const __m128i*)&src[i + 0]); - const __m128i A1 = _mm_loadu_si128((const __m128i*)&src[i + 16]); - const __m128i B0 = _mm_loadu_si128((const __m128i*)&pred[i + 0]); - const __m128i B1 = _mm_loadu_si128((const __m128i*)&pred[i + 16]); - const __m128i C0 = _mm_add_epi8(A0, B0); - const __m128i C1 = _mm_add_epi8(A1, B1); - _mm_storeu_si128((__m128i*)&dst[i + 0], C0); - _mm_storeu_si128((__m128i*)&dst[i + 16], C1); - } - for (; i < length; ++i) dst[i] = src[i] + pred[i]; - } else { - for (i = 0; i < max_pos; i += 32) { - const __m128i A0 = _mm_loadu_si128((const __m128i*)&src[i + 0]); - const __m128i A1 = _mm_loadu_si128((const __m128i*)&src[i + 16]); - const __m128i B0 = _mm_loadu_si128((const __m128i*)&pred[i + 0]); - const __m128i B1 = _mm_loadu_si128((const __m128i*)&pred[i + 16]); - const __m128i C0 = _mm_sub_epi8(A0, B0); - const __m128i C1 = _mm_sub_epi8(A1, B1); - _mm_storeu_si128((__m128i*)&dst[i + 0], C0); - _mm_storeu_si128((__m128i*)&dst[i + 16], C1); - } - for (; i < length; ++i) dst[i] = src[i] - pred[i]; + for (i = 0; i < max_pos; i += 32) { + const __m128i A0 = _mm_loadu_si128((const __m128i*)&src[i + 0]); + const __m128i A1 = _mm_loadu_si128((const __m128i*)&src[i + 16]); + const __m128i B0 = _mm_loadu_si128((const __m128i*)&pred[i + 0]); + const __m128i B1 = _mm_loadu_si128((const __m128i*)&pred[i + 16]); + const __m128i C0 = _mm_sub_epi8(A0, B0); + const __m128i C1 = _mm_sub_epi8(A1, B1); + _mm_storeu_si128((__m128i*)&dst[i + 0], C0); + _mm_storeu_si128((__m128i*)&dst[i + 16], C1); } + for (; i < length; ++i) dst[i] = src[i] - pred[i]; } // Special case for left-based prediction (when preds==dst-1 or preds==src-1). 
-static void PredictLineLeft(const uint8_t* src, uint8_t* dst, int length, - int inverse) { +static void PredictLineLeft(const uint8_t* src, uint8_t* dst, int length) { int i; - if (length <= 0) return; - if (inverse) { - const int max_pos = length & ~7; - __m128i last = _mm_set_epi32(0, 0, 0, dst[-1]); - for (i = 0; i < max_pos; i += 8) { - const __m128i A0 = _mm_loadl_epi64((const __m128i*)(src + i)); - const __m128i A1 = _mm_add_epi8(A0, last); - const __m128i A2 = _mm_slli_si128(A1, 1); - const __m128i A3 = _mm_add_epi8(A1, A2); - const __m128i A4 = _mm_slli_si128(A3, 2); - const __m128i A5 = _mm_add_epi8(A3, A4); - const __m128i A6 = _mm_slli_si128(A5, 4); - const __m128i A7 = _mm_add_epi8(A5, A6); - _mm_storel_epi64((__m128i*)(dst + i), A7); - last = _mm_srli_epi64(A7, 56); - } - for (; i < length; ++i) dst[i] = src[i] + dst[i - 1]; - } else { - const int max_pos = length & ~31; - for (i = 0; i < max_pos; i += 32) { - const __m128i A0 = _mm_loadu_si128((const __m128i*)(src + i + 0 )); - const __m128i B0 = _mm_loadu_si128((const __m128i*)(src + i + 0 - 1)); - const __m128i A1 = _mm_loadu_si128((const __m128i*)(src + i + 16 )); - const __m128i B1 = _mm_loadu_si128((const __m128i*)(src + i + 16 - 1)); - const __m128i C0 = _mm_sub_epi8(A0, B0); - const __m128i C1 = _mm_sub_epi8(A1, B1); - _mm_storeu_si128((__m128i*)(dst + i + 0), C0); - _mm_storeu_si128((__m128i*)(dst + i + 16), C1); - } - for (; i < length; ++i) dst[i] = src[i] - src[i - 1]; - } -} - -static void PredictLineC(const uint8_t* src, const uint8_t* pred, - uint8_t* dst, int length, int inverse) { - int i; - if (inverse) { - for (i = 0; i < length; ++i) dst[i] = src[i] + pred[i]; - } else { - for (i = 0; i < length; ++i) dst[i] = src[i] - pred[i]; + const int max_pos = length & ~31; + assert(length >= 0); + for (i = 0; i < max_pos; i += 32) { + const __m128i A0 = _mm_loadu_si128((const __m128i*)(src + i + 0 )); + const __m128i B0 = _mm_loadu_si128((const __m128i*)(src + i + 0 - 1)); + const __m128i A1 = _mm_loadu_si128((const __m128i*)(src + i + 16 )); + const __m128i B1 = _mm_loadu_si128((const __m128i*)(src + i + 16 - 1)); + const __m128i C0 = _mm_sub_epi8(A0, B0); + const __m128i C1 = _mm_sub_epi8(A1, B1); + _mm_storeu_si128((__m128i*)(dst + i + 0), C0); + _mm_storeu_si128((__m128i*)(dst + i + 16), C1); } + for (; i < length; ++i) dst[i] = src[i] - src[i - 1]; } //------------------------------------------------------------------------------ @@ -117,21 +74,18 @@ static void PredictLineC(const uint8_t* src, const uint8_t* pred, static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in, int width, int height, int stride, int row, int num_rows, - int inverse, uint8_t* out) { - const uint8_t* preds; + uint8_t* out) { const size_t start_offset = row * stride; const int last_row = row + num_rows; SANITY_CHECK(in, out); in += start_offset; out += start_offset; - preds = inverse ? out : in; if (row == 0) { // Leftmost pixel is the same as input for topmost scanline. out[0] = in[0]; - PredictLineLeft(in + 1, out + 1, width - 1, inverse); + PredictLineLeft(in + 1, out + 1, width - 1); row = 1; - preds += stride; in += stride; out += stride; } @@ -139,10 +93,9 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in, // Filter line-by-line. while (row < last_row) { // Leftmost pixel is predicted from above. 
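The 8-byte kernel removed above (and reinstated later in this file as HorizontalUnfilter) computes an inclusive prefix sum mod 256 with three shift-and-add rounds. A scalar model of one 8-byte block, where 'last' carries the previously reconstructed pixel into the block:

static void LeftPredictBlock8(const uint8_t in[8], uint8_t out[8],
                              uint8_t* const last) {
  uint8_t acc = *last;
  int i;
  for (i = 0; i < 8; ++i) {
    acc = (uint8_t)(acc + in[i]);   // lane-wise, this is what the three SIMD rounds produce
    out[i] = acc;
  }
  *last = acc;                      // seed for the next 8-byte block
}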
- PredictLineC(in, preds - stride, out, 1, inverse); - PredictLineLeft(in + 1, out + 1, width - 1, inverse); + out[0] = in[0] - in[-stride]; + PredictLineLeft(in + 1, out + 1, width - 1); ++row; - preds += stride; in += stride; out += stride; } @@ -153,34 +106,27 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in, static WEBP_INLINE void DoVerticalFilter(const uint8_t* in, int width, int height, int stride, - int row, int num_rows, - int inverse, uint8_t* out) { - const uint8_t* preds; + int row, int num_rows, uint8_t* out) { const size_t start_offset = row * stride; const int last_row = row + num_rows; SANITY_CHECK(in, out); in += start_offset; out += start_offset; - preds = inverse ? out : in; if (row == 0) { // Very first top-left pixel is copied. out[0] = in[0]; // Rest of top scan-line is left-predicted. - PredictLineLeft(in + 1, out + 1, width - 1, inverse); + PredictLineLeft(in + 1, out + 1, width - 1); row = 1; in += stride; out += stride; - } else { - // We are starting from in-between. Make sure 'preds' points to prev row. - preds -= stride; } // Filter line-by-line. while (row < last_row) { - PredictLineTop(in, preds, out, width, inverse); + PredictLineTop(in, in - stride, out, width); ++row; - preds += stride; in += stride; out += stride; } @@ -219,49 +165,10 @@ static void GradientPredictDirect(const uint8_t* const row, } } -static void GradientPredictInverse(const uint8_t* const in, - const uint8_t* const top, - uint8_t* const row, int length) { - if (length > 0) { - int i; - const int max_pos = length & ~7; - const __m128i zero = _mm_setzero_si128(); - __m128i A = _mm_set_epi32(0, 0, 0, row[-1]); // left sample - for (i = 0; i < max_pos; i += 8) { - const __m128i tmp0 = _mm_loadl_epi64((const __m128i*)&top[i]); - const __m128i tmp1 = _mm_loadl_epi64((const __m128i*)&top[i - 1]); - const __m128i B = _mm_unpacklo_epi8(tmp0, zero); - const __m128i C = _mm_unpacklo_epi8(tmp1, zero); - const __m128i tmp2 = _mm_loadl_epi64((const __m128i*)&in[i]); - const __m128i D = _mm_unpacklo_epi8(tmp2, zero); // base input - const __m128i E = _mm_sub_epi16(B, C); // unclipped gradient basis B - C - __m128i out = zero; // accumulator for output - __m128i mask_hi = _mm_set_epi32(0, 0, 0, 0xff); - int k = 8; - while (1) { - const __m128i tmp3 = _mm_add_epi16(A, E); // delta = A + B - C - const __m128i tmp4 = _mm_min_epi16(tmp3, mask_hi); - const __m128i tmp5 = _mm_max_epi16(tmp4, zero); // clipped delta - const __m128i tmp6 = _mm_add_epi16(tmp5, D); // add to in[] values - A = _mm_and_si128(tmp6, mask_hi); // 1-complement clip - out = _mm_or_si128(out, A); // accumulate output - if (--k == 0) break; - A = _mm_slli_si128(A, 2); // rotate left sample - mask_hi = _mm_slli_si128(mask_hi, 2); // rotate mask - } - A = _mm_srli_si128(A, 14); // prepare left sample for next iteration - _mm_storel_epi64((__m128i*)&row[i], _mm_packus_epi16(out, zero)); - } - for (; i < length; ++i) { - row[i] = in[i] + GradientPredictorC(row[i - 1], top[i], top[i - 1]); - } - } -} - static WEBP_INLINE void DoGradientFilter(const uint8_t* in, int width, int height, int stride, int row, int num_rows, - int inverse, uint8_t* out) { + uint8_t* out) { const size_t start_offset = row * stride; const int last_row = row + num_rows; SANITY_CHECK(in, out); @@ -271,7 +178,7 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in, // left prediction for top scan-line if (row == 0) { out[0] = in[0]; - PredictLineLeft(in + 1, out + 1, width - 1, inverse); + PredictLineLeft(in + 1, out + 1, width - 1); row = 1; in += 
stride; out += stride; @@ -279,13 +186,8 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in, // Filter line-by-line. while (row < last_row) { - if (inverse) { - PredictLineC(in, out - stride, out, 1, inverse); // predict from above - GradientPredictInverse(in + 1, out + 1 - stride, out + 1, width - 1); - } else { - PredictLineC(in, in - stride, out, 1, inverse); - GradientPredictDirect(in + 1, in + 1 - stride, out + 1, width - 1); - } + out[0] = in[0] - in[-stride]; + GradientPredictDirect(in + 1, in + 1 - stride, out + 1, width - 1); ++row; in += stride; out += stride; @@ -298,36 +200,112 @@ static WEBP_INLINE void DoGradientFilter(const uint8_t* in, static void HorizontalFilter(const uint8_t* data, int width, int height, int stride, uint8_t* filtered_data) { - DoHorizontalFilter(data, width, height, stride, 0, height, 0, filtered_data); + DoHorizontalFilter(data, width, height, stride, 0, height, filtered_data); } static void VerticalFilter(const uint8_t* data, int width, int height, int stride, uint8_t* filtered_data) { - DoVerticalFilter(data, width, height, stride, 0, height, 0, filtered_data); + DoVerticalFilter(data, width, height, stride, 0, height, filtered_data); } - static void GradientFilter(const uint8_t* data, int width, int height, int stride, uint8_t* filtered_data) { - DoGradientFilter(data, width, height, stride, 0, height, 0, filtered_data); + DoGradientFilter(data, width, height, stride, 0, height, filtered_data); } - //------------------------------------------------------------------------------ +// Inverse transforms -static void VerticalUnfilter(int width, int height, int stride, int row, - int num_rows, uint8_t* data) { - DoVerticalFilter(data, width, height, stride, row, num_rows, 1, data); +static void HorizontalUnfilter(const uint8_t* prev, const uint8_t* in, + uint8_t* out, int width) { + int i; + __m128i last; + out[0] = in[0] + (prev == NULL ? 
0 : prev[0]); + if (width <= 1) return; + last = _mm_set_epi32(0, 0, 0, out[0]); + for (i = 1; i + 8 <= width; i += 8) { + const __m128i A0 = _mm_loadl_epi64((const __m128i*)(in + i)); + const __m128i A1 = _mm_add_epi8(A0, last); + const __m128i A2 = _mm_slli_si128(A1, 1); + const __m128i A3 = _mm_add_epi8(A1, A2); + const __m128i A4 = _mm_slli_si128(A3, 2); + const __m128i A5 = _mm_add_epi8(A3, A4); + const __m128i A6 = _mm_slli_si128(A5, 4); + const __m128i A7 = _mm_add_epi8(A5, A6); + _mm_storel_epi64((__m128i*)(out + i), A7); + last = _mm_srli_epi64(A7, 56); + } + for (; i < width; ++i) out[i] = in[i] + out[i - 1]; } -static void HorizontalUnfilter(int width, int height, int stride, int row, - int num_rows, uint8_t* data) { - DoHorizontalFilter(data, width, height, stride, row, num_rows, 1, data); +static void VerticalUnfilter(const uint8_t* prev, const uint8_t* in, + uint8_t* out, int width) { + if (prev == NULL) { + HorizontalUnfilter(NULL, in, out, width); + } else { + int i; + const int max_pos = width & ~31; + assert(width >= 0); + for (i = 0; i < max_pos; i += 32) { + const __m128i A0 = _mm_loadu_si128((const __m128i*)&in[i + 0]); + const __m128i A1 = _mm_loadu_si128((const __m128i*)&in[i + 16]); + const __m128i B0 = _mm_loadu_si128((const __m128i*)&prev[i + 0]); + const __m128i B1 = _mm_loadu_si128((const __m128i*)&prev[i + 16]); + const __m128i C0 = _mm_add_epi8(A0, B0); + const __m128i C1 = _mm_add_epi8(A1, B1); + _mm_storeu_si128((__m128i*)&out[i + 0], C0); + _mm_storeu_si128((__m128i*)&out[i + 16], C1); + } + for (; i < width; ++i) out[i] = in[i] + prev[i]; + } } -static void GradientUnfilter(int width, int height, int stride, int row, - int num_rows, uint8_t* data) { - DoGradientFilter(data, width, height, stride, row, num_rows, 1, data); +static void GradientPredictInverse(const uint8_t* const in, + const uint8_t* const top, + uint8_t* const row, int length) { + if (length > 0) { + int i; + const int max_pos = length & ~7; + const __m128i zero = _mm_setzero_si128(); + __m128i A = _mm_set_epi32(0, 0, 0, row[-1]); // left sample + for (i = 0; i < max_pos; i += 8) { + const __m128i tmp0 = _mm_loadl_epi64((const __m128i*)&top[i]); + const __m128i tmp1 = _mm_loadl_epi64((const __m128i*)&top[i - 1]); + const __m128i B = _mm_unpacklo_epi8(tmp0, zero); + const __m128i C = _mm_unpacklo_epi8(tmp1, zero); + const __m128i D = _mm_loadl_epi64((const __m128i*)&in[i]); // base input + const __m128i E = _mm_sub_epi16(B, C); // unclipped gradient basis B - C + __m128i out = zero; // accumulator for output + __m128i mask_hi = _mm_set_epi32(0, 0, 0, 0xff); + int k = 8; + while (1) { + const __m128i tmp3 = _mm_add_epi16(A, E); // delta = A + B - C + const __m128i tmp4 = _mm_packus_epi16(tmp3, zero); // saturate delta + const __m128i tmp5 = _mm_add_epi8(tmp4, D); // add to in[] + A = _mm_and_si128(tmp5, mask_hi); // 1-complement clip + out = _mm_or_si128(out, A); // accumulate output + if (--k == 0) break; + A = _mm_slli_si128(A, 1); // rotate left sample + mask_hi = _mm_slli_si128(mask_hi, 1); // rotate mask + A = _mm_unpacklo_epi8(A, zero); // convert 8b->16b + } + A = _mm_srli_si128(A, 7); // prepare left sample for next iteration + _mm_storel_epi64((__m128i*)&row[i], out); + } + for (; i < length; ++i) { + row[i] = in[i] + GradientPredictorC(row[i - 1], top[i], top[i - 1]); + } + } +} + +static void GradientUnfilter(const uint8_t* prev, const uint8_t* in, + uint8_t* out, int width) { + if (prev == NULL) { + HorizontalUnfilter(NULL, in, out, width); + } else { + out[0] = in[0] + prev[0]; // 
predict from above + GradientPredictInverse(in + 1, prev + 1, out + 1, width - 1); + } } //------------------------------------------------------------------------------ diff --git a/src/3rdparty/libwebp/src/dsp/lossless.c b/src/3rdparty/libwebp/src/dsp/lossless.c index 71ae9d4..af913ef 100644 --- a/src/3rdparty/libwebp/src/dsp/lossless.c +++ b/src/3rdparty/libwebp/src/dsp/lossless.c @@ -28,9 +28,7 @@ // In-place sum of each component with mod 256. static WEBP_INLINE void AddPixelsEq(uint32_t* a, uint32_t b) { - const uint32_t alpha_and_green = (*a & 0xff00ff00u) + (b & 0xff00ff00u); - const uint32_t red_and_blue = (*a & 0x00ff00ffu) + (b & 0x00ff00ffu); - *a = (alpha_and_green & 0xff00ff00u) | (red_and_blue & 0x00ff00ffu); + *a = VP8LAddPixels(*a, b); } static WEBP_INLINE uint32_t Average2(uint32_t a0, uint32_t a1) { diff --git a/src/3rdparty/libwebp/src/dsp/lossless.h b/src/3rdparty/libwebp/src/dsp/lossless.h index e063bdd..9f0d7a2 100644 --- a/src/3rdparty/libwebp/src/dsp/lossless.h +++ b/src/3rdparty/libwebp/src/dsp/lossless.h @@ -158,7 +158,8 @@ void VP8LCollectColorBlueTransforms_C(const uint32_t* argb, int stride, void VP8LResidualImage(int width, int height, int bits, int low_effort, uint32_t* const argb, uint32_t* const argb_scratch, - uint32_t* const image, int exact); + uint32_t* const image, int near_lossless, int exact, + int used_subtract_green); void VP8LColorSpaceTransform(int width, int height, int bits, int quality, uint32_t* const argb, uint32_t* image); @@ -172,6 +173,17 @@ static WEBP_INLINE uint32_t VP8LSubSampleSize(uint32_t size, return (size + (1 << sampling_bits) - 1) >> sampling_bits; } +// Converts near lossless quality into max number of bits shaved off. +static WEBP_INLINE int VP8LNearLosslessBits(int near_lossless_quality) { + // 100 -> 0 + // 80..99 -> 1 + // 60..79 -> 2 + // 40..59 -> 3 + // 20..39 -> 4 + // 0..19 -> 5 + return 5 - near_lossless_quality / 20; +} + // ----------------------------------------------------------------------------- // Faster logarithm for integers. Small values use a look-up table. @@ -262,6 +274,11 @@ extern VP8LHistogramAddFunc VP8LHistogramAdd; // ----------------------------------------------------------------------------- // PrefixEncode() +typedef int (*VP8LVectorMismatchFunc)(const uint32_t* const array1, + const uint32_t* const array2, int length); +// Returns the first index where array1 and array2 are different. +extern VP8LVectorMismatchFunc VP8LVectorMismatch; + static WEBP_INLINE int VP8LBitsLog2Ceiling(uint32_t n) { const int log_floor = BitsLog2Floor(n); if (n == (n & ~(n - 1))) // zero or a power of two. @@ -324,7 +341,14 @@ static WEBP_INLINE void VP8LPrefixEncode(int distance, int* const code, } } -// In-place difference of each component with mod 256. +// Sum of each component, mod 256. +static WEBP_INLINE uint32_t VP8LAddPixels(uint32_t a, uint32_t b) { + const uint32_t alpha_and_green = (a & 0xff00ff00u) + (b & 0xff00ff00u); + const uint32_t red_and_blue = (a & 0x00ff00ffu) + (b & 0x00ff00ffu); + return (alpha_and_green & 0xff00ff00u) | (red_and_blue & 0x00ff00ffu); +} + +// Difference of each component, mod 256. 
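VP8LAddPixels above (and its counterpart VP8LSubPixels, whose definition follows) performs four independent mod-256 byte additions with only two 32-bit adds: splitting the 0xAARRGGBB pixel into the alpha/green and red/blue planes leaves an empty byte above each channel, so a carry out of one channel lands in the hole instead of corrupting its neighbour. A self-contained check of the trick:

#include <assert.h>
#include <stdint.h>

static uint32_t AddPixelsMod256(uint32_t a, uint32_t b) {
  const uint32_t ag = (a & 0xff00ff00u) + (b & 0xff00ff00u);  // alpha and green, carries fall into the spare bytes
  const uint32_t rb = (a & 0x00ff00ffu) + (b & 0x00ff00ffu);  // red and blue likewise
  return (ag & 0xff00ff00u) | (rb & 0x00ff00ffu);             // drop the spilled carries
}

int main(void) {
  // Red wraps from 0xff to 0x00 and blue from 0xfe to 0x01 without disturbing alpha or green.
  assert(AddPixelsMod256(0x01ff02feu, 0x00010003u) == 0x01000201u);
  return 0;
}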
static WEBP_INLINE uint32_t VP8LSubPixels(uint32_t a, uint32_t b) { const uint32_t alpha_and_green = 0x00ff00ffu + (a & 0xff00ff00u) - (b & 0xff00ff00u); diff --git a/src/3rdparty/libwebp/src/dsp/lossless_enc.c b/src/3rdparty/libwebp/src/dsp/lossless_enc.c index 2eafa3d..256f6f5 100644 --- a/src/3rdparty/libwebp/src/dsp/lossless_enc.c +++ b/src/3rdparty/libwebp/src/dsp/lossless_enc.c @@ -382,6 +382,7 @@ static float FastLog2Slow(uint32_t v) { // Mostly used to reduce code size + readability static WEBP_INLINE int GetMin(int a, int b) { return (a > b) ? b : a; } +static WEBP_INLINE int GetMax(int a, int b) { return (a < b) ? b : a; } //------------------------------------------------------------------------------ // Methods to calculate Entropy (Shannon). @@ -551,18 +552,204 @@ static WEBP_INLINE uint32_t Predict(VP8LPredictorFunc pred_func, } } +static int MaxDiffBetweenPixels(uint32_t p1, uint32_t p2) { + const int diff_a = abs((int)(p1 >> 24) - (int)(p2 >> 24)); + const int diff_r = abs((int)((p1 >> 16) & 0xff) - (int)((p2 >> 16) & 0xff)); + const int diff_g = abs((int)((p1 >> 8) & 0xff) - (int)((p2 >> 8) & 0xff)); + const int diff_b = abs((int)(p1 & 0xff) - (int)(p2 & 0xff)); + return GetMax(GetMax(diff_a, diff_r), GetMax(diff_g, diff_b)); +} + +static int MaxDiffAroundPixel(uint32_t current, uint32_t up, uint32_t down, + uint32_t left, uint32_t right) { + const int diff_up = MaxDiffBetweenPixels(current, up); + const int diff_down = MaxDiffBetweenPixels(current, down); + const int diff_left = MaxDiffBetweenPixels(current, left); + const int diff_right = MaxDiffBetweenPixels(current, right); + return GetMax(GetMax(diff_up, diff_down), GetMax(diff_left, diff_right)); +} + +static uint32_t AddGreenToBlueAndRed(uint32_t argb) { + const uint32_t green = (argb >> 8) & 0xff; + uint32_t red_blue = argb & 0x00ff00ffu; + red_blue += (green << 16) | green; + red_blue &= 0x00ff00ffu; + return (argb & 0xff00ff00u) | red_blue; +} + +static void MaxDiffsForRow(int width, int stride, const uint32_t* const argb, + uint8_t* const max_diffs, int used_subtract_green) { + uint32_t current, up, down, left, right; + int x; + if (width <= 2) return; + current = argb[0]; + right = argb[1]; + if (used_subtract_green) { + current = AddGreenToBlueAndRed(current); + right = AddGreenToBlueAndRed(right); + } + // max_diffs[0] and max_diffs[width - 1] are never used. + for (x = 1; x < width - 1; ++x) { + up = argb[-stride + x]; + down = argb[stride + x]; + left = current; + current = right; + right = argb[x + 1]; + if (used_subtract_green) { + up = AddGreenToBlueAndRed(up); + down = AddGreenToBlueAndRed(down); + right = AddGreenToBlueAndRed(right); + } + max_diffs[x] = MaxDiffAroundPixel(current, up, down, left, right); + } +} + +// Quantize the difference between the actual component value and its prediction +// to a multiple of quantization, working modulo 256, taking care not to cross +// a boundary (inclusive upper limit). +static uint8_t NearLosslessComponent(uint8_t value, uint8_t predict, + uint8_t boundary, int quantization) { + const int residual = (value - predict) & 0xff; + const int boundary_residual = (boundary - predict) & 0xff; + const int lower = residual & ~(quantization - 1); + const int upper = lower + quantization; + // Resolve ties towards a value closer to the prediction (i.e. towards lower + // if value comes after prediction and towards upper otherwise). 
+ const int bias = ((boundary - value) & 0xff) < boundary_residual; + if (residual - lower < upper - residual + bias) { + // lower is closer to residual than upper. + if (residual > boundary_residual && lower <= boundary_residual) { + // Halve quantization step to avoid crossing boundary. This midpoint is + // on the same side of boundary as residual because midpoint >= residual + // (since lower is closer than upper) and residual is above the boundary. + return lower + (quantization >> 1); + } + return lower; + } else { + // upper is closer to residual than lower. + if (residual <= boundary_residual && upper > boundary_residual) { + // Halve quantization step to avoid crossing boundary. This midpoint is + // on the same side of boundary as residual because midpoint <= residual + // (since upper is closer than lower) and residual is below the boundary. + return lower + (quantization >> 1); + } + return upper & 0xff; + } +} + +// Quantize every component of the difference between the actual pixel value and +// its prediction to a multiple of a quantization (a power of 2, not larger than +// max_quantization which is a power of 2, smaller than max_diff). Take care if +// value and predict have undergone subtract green, which means that red and +// blue are represented as offsets from green. +static uint32_t NearLossless(uint32_t value, uint32_t predict, + int max_quantization, int max_diff, + int used_subtract_green) { + int quantization; + uint8_t new_green = 0; + uint8_t green_diff = 0; + uint8_t a, r, g, b; + if (max_diff <= 2) { + return VP8LSubPixels(value, predict); + } + quantization = max_quantization; + while (quantization >= max_diff) { + quantization >>= 1; + } + if ((value >> 24) == 0 || (value >> 24) == 0xff) { + // Preserve transparency of fully transparent or fully opaque pixels. + a = ((value >> 24) - (predict >> 24)) & 0xff; + } else { + a = NearLosslessComponent(value >> 24, predict >> 24, 0xff, quantization); + } + g = NearLosslessComponent((value >> 8) & 0xff, (predict >> 8) & 0xff, 0xff, + quantization); + if (used_subtract_green) { + // The green offset will be added to red and blue components during decoding + // to obtain the actual red and blue values. + new_green = ((predict >> 8) + g) & 0xff; + // The amount by which green has been adjusted during quantization. It is + // subtracted from red and blue for compensation, to avoid accumulating two + // quantization errors in them. + green_diff = (new_green - (value >> 8)) & 0xff; + } + r = NearLosslessComponent(((value >> 16) - green_diff) & 0xff, + (predict >> 16) & 0xff, 0xff - new_green, + quantization); + b = NearLosslessComponent((value - green_diff) & 0xff, predict & 0xff, + 0xff - new_green, quantization); + return ((uint32_t)a << 24) | ((uint32_t)r << 16) | ((uint32_t)g << 8) | b; +} + +// Returns the difference between the pixel and its prediction. In case of a +// lossy encoding, updates the source image to avoid propagating the deviation +// further to pixels which depend on the current pixel for their predictions. 
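As a worked example of NearLosslessComponent above, take quantization = 8, value = 200, predict = 190 and boundary = 0xff: residual = 10 and boundary_residual = 65, so lower = 8, upper = 16 and bias = 1; since residual - lower = 2 is smaller than upper - residual + bias = 7 and lower does not cross the boundary, the function returns 8. The decoder reconstructs 190 + 8 = 198 instead of 200, an error of 2, well within the quantization step of 8.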
+static WEBP_INLINE uint32_t GetResidual(int width, int height, + uint32_t* const upper_row, + uint32_t* const current_row, + const uint8_t* const max_diffs, + int mode, VP8LPredictorFunc pred_func, + int x, int y, int max_quantization, + int exact, int used_subtract_green) { + const uint32_t predict = Predict(pred_func, x, y, current_row, upper_row); + uint32_t residual; + if (max_quantization == 1 || mode == 0 || y == 0 || y == height - 1 || + x == 0 || x == width - 1) { + residual = VP8LSubPixels(current_row[x], predict); + } else { + residual = NearLossless(current_row[x], predict, max_quantization, + max_diffs[x], used_subtract_green); + // Update the source image. + current_row[x] = VP8LAddPixels(predict, residual); + // x is never 0 here so we do not need to update upper_row like below. + } + if (!exact && (current_row[x] & kMaskAlpha) == 0) { + // If alpha is 0, cleanup RGB. We can choose the RGB values of the residual + // for best compression. The prediction of alpha itself can be non-zero and + // must be kept though. We choose RGB of the residual to be 0. + residual &= kMaskAlpha; + // Update the source image. + current_row[x] = predict & ~kMaskAlpha; + // The prediction for the rightmost pixel in a row uses the leftmost pixel + // in that row as its top-right context pixel. Hence if we change the + // leftmost pixel of current_row, the corresponding change must be applied + // to upper_row as well where top-right context is being read from. + if (x == 0 && y != 0) upper_row[width] = current_row[0]; + } + return residual; +} + // Returns best predictor and updates the accumulated histogram. +// If max_quantization > 1, assumes that near lossless processing will be +// applied, quantizing residuals to multiples of quantization levels up to +// max_quantization (the actual quantization level depends on smoothness near +// the given pixel). static int GetBestPredictorForTile(int width, int height, int tile_x, int tile_y, int bits, int accumulated[4][256], - const uint32_t* const argb_scratch, - int exact) { + uint32_t* const argb_scratch, + const uint32_t* const argb, + int max_quantization, + int exact, int used_subtract_green) { const int kNumPredModes = 14; - const int col_start = tile_x << bits; - const int row_start = tile_y << bits; + const int start_x = tile_x << bits; + const int start_y = tile_y << bits; const int tile_size = 1 << bits; - const int max_y = GetMin(tile_size, height - row_start); - const int max_x = GetMin(tile_size, width - col_start); + const int max_y = GetMin(tile_size, height - start_y); + const int max_x = GetMin(tile_size, width - start_x); + // Whether there exist columns just outside the tile. + const int have_left = (start_x > 0); + const int have_right = (max_x < width - start_x); + // Position and size of the strip covering the tile and adjacent columns if + // they exist. + const int context_start_x = start_x - have_left; + const int context_width = max_x + have_left + have_right; + // The width of upper_row and current_row is one pixel larger than image width + // to allow the top right pixel to point to the leftmost pixel of the next row + // when at the right edge. + uint32_t* upper_row = argb_scratch; + uint32_t* current_row = upper_row + width + 1; + uint8_t* const max_diffs = (uint8_t*)(current_row + width + 1); float best_diff = MAX_DIFF_COST; int best_mode = 0; int mode; @@ -571,28 +758,46 @@ static int GetBestPredictorForTile(int width, int height, // Need pointers to be able to swap arrays. 
int (*histo_argb)[256] = histo_stack_1; int (*best_histo)[256] = histo_stack_2; - int i, j; + for (mode = 0; mode < kNumPredModes; ++mode) { - const uint32_t* current_row = argb_scratch; const VP8LPredictorFunc pred_func = VP8LPredictors[mode]; float cur_diff; - int y; + int relative_y; memset(histo_argb, 0, sizeof(histo_stack_1)); - for (y = 0; y < max_y; ++y) { - int x; - const int row = row_start + y; - const uint32_t* const upper_row = current_row; - current_row = upper_row + width; - for (x = 0; x < max_x; ++x) { - const int col = col_start + x; - const uint32_t predict = - Predict(pred_func, col, row, current_row, upper_row); - uint32_t residual = VP8LSubPixels(current_row[col], predict); - if (!exact && (current_row[col] & kMaskAlpha) == 0) { - residual &= kMaskAlpha; // See CopyTileWithPrediction. - } - UpdateHisto(histo_argb, residual); + if (start_y > 0) { + // Read the row above the tile which will become the first upper_row. + // Include a pixel to the left if it exists; include a pixel to the right + // in all cases (wrapping to the leftmost pixel of the next row if it does + // not exist). + memcpy(current_row + context_start_x, + argb + (start_y - 1) * width + context_start_x, + sizeof(*argb) * (max_x + have_left + 1)); + } + for (relative_y = 0; relative_y < max_y; ++relative_y) { + const int y = start_y + relative_y; + int relative_x; + uint32_t* tmp = upper_row; + upper_row = current_row; + current_row = tmp; + // Read current_row. Include a pixel to the left if it exists; include a + // pixel to the right in all cases except at the bottom right corner of + // the image (wrapping to the leftmost pixel of the next row if it does + // not exist in the current row). + memcpy(current_row + context_start_x, + argb + y * width + context_start_x, + sizeof(*argb) * (max_x + have_left + (y + 1 < height))); + if (max_quantization > 1 && y >= 1 && y + 1 < height) { + MaxDiffsForRow(context_width, width, argb + y * width + context_start_x, + max_diffs + context_start_x, used_subtract_green); + } + + for (relative_x = 0; relative_x < max_x; ++relative_x) { + const int x = start_x + relative_x; + UpdateHisto(histo_argb, + GetResidual(width, height, upper_row, current_row, + max_diffs, mode, pred_func, x, y, + max_quantization, exact, used_subtract_green)); } } cur_diff = PredictionCostSpatialHistogram( @@ -615,71 +820,82 @@ static int GetBestPredictorForTile(int width, int height, return best_mode; } +// Converts pixels of the image to residuals with respect to predictions. +// If max_quantization > 1, applies near lossless processing, quantizing +// residuals to multiples of quantization levels up to max_quantization +// (the actual quantization level depends on smoothness near the given pixel). static void CopyImageWithPrediction(int width, int height, int bits, uint32_t* const modes, uint32_t* const argb_scratch, uint32_t* const argb, - int low_effort, int exact) { + int low_effort, int max_quantization, + int exact, int used_subtract_green) { const int tiles_per_row = VP8LSubSampleSize(width, bits); const int mask = (1 << bits) - 1; - // The row size is one pixel longer to allow the top right pixel to point to - // the leftmost pixel of the next row when at the right edge. - uint32_t* current_row = argb_scratch; - uint32_t* upper_row = argb_scratch + width + 1; + // The width of upper_row and current_row is one pixel larger than image width + // to allow the top right pixel to point to the leftmost pixel of the next row + // when at the right edge. 
+ uint32_t* upper_row = argb_scratch; + uint32_t* current_row = upper_row + width + 1; + uint8_t* current_max_diffs = (uint8_t*)(current_row + width + 1); + uint8_t* lower_max_diffs = current_max_diffs + width; int y; - VP8LPredictorFunc pred_func = - low_effort ? VP8LPredictors[kPredLowEffort] : NULL; + int mode = 0; + VP8LPredictorFunc pred_func = NULL; for (y = 0; y < height; ++y) { int x; - uint32_t* tmp = upper_row; + uint32_t* const tmp32 = upper_row; upper_row = current_row; - current_row = tmp; - memcpy(current_row, argb + y * width, sizeof(*current_row) * width); - current_row[width] = (y + 1 < height) ? argb[(y + 1) * width] : ARGB_BLACK; + current_row = tmp32; + memcpy(current_row, argb + y * width, + sizeof(*argb) * (width + (y + 1 < height))); if (low_effort) { for (x = 0; x < width; ++x) { - const uint32_t predict = - Predict(pred_func, x, y, current_row, upper_row); + const uint32_t predict = Predict(VP8LPredictors[kPredLowEffort], x, y, + current_row, upper_row); argb[y * width + x] = VP8LSubPixels(current_row[x], predict); } } else { + if (max_quantization > 1) { + // Compute max_diffs for the lower row now, because that needs the + // contents of argb for the current row, which we will overwrite with + // residuals before proceeding with the next row. + uint8_t* const tmp8 = current_max_diffs; + current_max_diffs = lower_max_diffs; + lower_max_diffs = tmp8; + if (y + 2 < height) { + MaxDiffsForRow(width, width, argb + (y + 1) * width, lower_max_diffs, + used_subtract_green); + } + } for (x = 0; x < width; ++x) { - uint32_t predict, residual; if ((x & mask) == 0) { - const int mode = - (modes[(y >> bits) * tiles_per_row + (x >> bits)] >> 8) & 0xff; + mode = (modes[(y >> bits) * tiles_per_row + (x >> bits)] >> 8) & 0xff; pred_func = VP8LPredictors[mode]; } - predict = Predict(pred_func, x, y, current_row, upper_row); - residual = VP8LSubPixels(current_row[x], predict); - if (!exact && (current_row[x] & kMaskAlpha) == 0) { - // If alpha is 0, cleanup RGB. We can choose the RGB values of the - // residual for best compression. The prediction of alpha itself can - // be non-zero and must be kept though. We choose RGB of the residual - // to be 0. - residual &= kMaskAlpha; - // Update input image so that next predictions use correct RGB value. - current_row[x] = predict & ~kMaskAlpha; - if (x == 0 && y != 0) upper_row[width] = current_row[x]; - } - argb[y * width + x] = residual; + argb[y * width + x] = GetResidual( + width, height, upper_row, current_row, current_max_diffs, mode, + pred_func, x, y, max_quantization, exact, used_subtract_green); } } } } +// Finds the best predictor for each tile, and converts the image to residuals +// with respect to predictions. If near_lossless_quality < 100, applies +// near lossless processing, shaving off more bits of residuals for lower +// qualities. 
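The predictor chosen for each tile travels in the green channel of the prediction image: CopyImageWithPrediction above reads it back as (modes[...] >> 8) & 0xff, and VP8LResidualImage below stores it as ARGB_BLACK | (pred << 8). A minimal round trip, with ARGB_BLACK assumed to be opaque black (0xff000000):

#include <assert.h>
#include <stdint.h>

#define ARGB_BLACK_ASSUMED 0xff000000u   /* assumed value of ARGB_BLACK */

int main(void) {
  const uint32_t pred = 11;                                   // predictor mode picked for one tile
  const uint32_t packed = ARGB_BLACK_ASSUMED | (pred << 8);   // how the mode image stores it
  assert(((packed >> 8) & 0xff) == pred);                     // how it is read back per tile
  return 0;
}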
void VP8LResidualImage(int width, int height, int bits, int low_effort, uint32_t* const argb, uint32_t* const argb_scratch, - uint32_t* const image, int exact) { - const int max_tile_size = 1 << bits; + uint32_t* const image, int near_lossless_quality, + int exact, int used_subtract_green) { const int tiles_per_row = VP8LSubSampleSize(width, bits); const int tiles_per_col = VP8LSubSampleSize(height, bits); - uint32_t* const upper_row = argb_scratch; - uint32_t* const current_tile_rows = argb_scratch + width; int tile_y; int histo[4][256]; + const int max_quantization = 1 << VP8LNearLosslessBits(near_lossless_quality); if (low_effort) { int i; for (i = 0; i < tiles_per_row * tiles_per_col; ++i) { @@ -688,26 +904,19 @@ void VP8LResidualImage(int width, int height, int bits, int low_effort, } else { memset(histo, 0, sizeof(histo)); for (tile_y = 0; tile_y < tiles_per_col; ++tile_y) { - const int tile_y_offset = tile_y * max_tile_size; - const int this_tile_height = - (tile_y < tiles_per_col - 1) ? max_tile_size : height - tile_y_offset; int tile_x; - if (tile_y > 0) { - memcpy(upper_row, current_tile_rows + (max_tile_size - 1) * width, - width * sizeof(*upper_row)); - } - memcpy(current_tile_rows, &argb[tile_y_offset * width], - this_tile_height * width * sizeof(*current_tile_rows)); for (tile_x = 0; tile_x < tiles_per_row; ++tile_x) { const int pred = GetBestPredictorForTile(width, height, tile_x, tile_y, - bits, (int (*)[256])histo, argb_scratch, exact); + bits, histo, argb_scratch, argb, max_quantization, exact, + used_subtract_green); image[tile_y * tiles_per_row + tile_x] = ARGB_BLACK | (pred << 8); } } } - CopyImageWithPrediction(width, height, bits, - image, argb_scratch, argb, low_effort, exact); + CopyImageWithPrediction(width, height, bits, image, argb_scratch, argb, + low_effort, max_quantization, exact, + used_subtract_green); } void VP8LSubtractGreenFromBlueAndRed_C(uint32_t* argb_data, int num_pixels) { @@ -1053,6 +1262,17 @@ void VP8LColorSpaceTransform(int width, int height, int bits, int quality, } //------------------------------------------------------------------------------ + +static int VectorMismatch(const uint32_t* const array1, + const uint32_t* const array2, int length) { + int match_len = 0; + + while (match_len < length && array1[match_len] == array2[match_len]) { + ++match_len; + } + return match_len; +} + // Bundles multiple (1, 2, 4 or 8) pixels into a single pixel. void VP8LBundleColorMap(const uint8_t* const row, int width, int xbits, uint32_t* const dst) { @@ -1149,6 +1369,8 @@ GetEntropyUnrefinedHelperFunc VP8LGetEntropyUnrefinedHelper; VP8LHistogramAddFunc VP8LHistogramAdd; +VP8LVectorMismatchFunc VP8LVectorMismatch; + extern void VP8LEncDspInitSSE2(void); extern void VP8LEncDspInitSSE41(void); extern void VP8LEncDspInitNEON(void); @@ -1181,6 +1403,8 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInit(void) { VP8LHistogramAdd = HistogramAdd; + VP8LVectorMismatch = VectorMismatch; + // If defined, use CPUInfo() to overwrite some pointers with faster versions. 
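VP8LVectorMismatch, registered just above with the portable VectorMismatch (an SSE2 variant follows in lossless_enc_sse2.c), returns the length of the common prefix of two pixel runs. A small usage sketch, under the assumption that the encoder's match search would call it roughly like this:

// Hypothetical wrapper, not part of the patch: length of the match between the
// run starting at 'candidate' and the one starting at 'base', capped at max_len.
static int MatchLength(const uint32_t* const argb, int base, int candidate,
                       int max_len) {
  return VP8LVectorMismatch(&argb[candidate], &argb[base], max_len);
}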
if (VP8GetCPUInfo != NULL) { #if defined(WEBP_USE_SSE2) diff --git a/src/3rdparty/libwebp/src/dsp/lossless_enc_sse2.c b/src/3rdparty/libwebp/src/dsp/lossless_enc_sse2.c index e8c9834..7c894e7 100644 --- a/src/3rdparty/libwebp/src/dsp/lossless_enc_sse2.c +++ b/src/3rdparty/libwebp/src/dsp/lossless_enc_sse2.c @@ -325,6 +325,57 @@ static float CombinedShannonEntropy(const int X[256], const int Y[256]) { #undef ANALYZE_XY //------------------------------------------------------------------------------ + +static int VectorMismatch(const uint32_t* const array1, + const uint32_t* const array2, int length) { + int match_len; + + if (length >= 12) { + __m128i A0 = _mm_loadu_si128((const __m128i*)&array1[0]); + __m128i A1 = _mm_loadu_si128((const __m128i*)&array2[0]); + match_len = 0; + do { + // Loop unrolling and early load both provide a speedup of 10% for the + // current function. Also, max_limit can be MAX_LENGTH=4096 at most. + const __m128i cmpA = _mm_cmpeq_epi32(A0, A1); + const __m128i B0 = + _mm_loadu_si128((const __m128i*)&array1[match_len + 4]); + const __m128i B1 = + _mm_loadu_si128((const __m128i*)&array2[match_len + 4]); + if (_mm_movemask_epi8(cmpA) != 0xffff) break; + match_len += 4; + + { + const __m128i cmpB = _mm_cmpeq_epi32(B0, B1); + A0 = _mm_loadu_si128((const __m128i*)&array1[match_len + 4]); + A1 = _mm_loadu_si128((const __m128i*)&array2[match_len + 4]); + if (_mm_movemask_epi8(cmpB) != 0xffff) break; + match_len += 4; + } + } while (match_len + 12 < length); + } else { + match_len = 0; + // Unroll the potential first two loops. + if (length >= 4 && + _mm_movemask_epi8(_mm_cmpeq_epi32( + _mm_loadu_si128((const __m128i*)&array1[0]), + _mm_loadu_si128((const __m128i*)&array2[0]))) == 0xffff) { + match_len = 4; + if (length >= 8 && + _mm_movemask_epi8(_mm_cmpeq_epi32( + _mm_loadu_si128((const __m128i*)&array1[4]), + _mm_loadu_si128((const __m128i*)&array2[4]))) == 0xffff) + match_len = 8; + } + } + + while (match_len < length && array1[match_len] == array2[match_len]) { + ++match_len; + } + return match_len; +} + +//------------------------------------------------------------------------------ // Entry point extern void VP8LEncDspInitSSE2(void); @@ -336,6 +387,7 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE2(void) { VP8LCollectColorRedTransforms = CollectColorRedTransforms; VP8LHistogramAdd = HistogramAdd; VP8LCombinedShannonEntropy = CombinedShannonEntropy; + VP8LVectorMismatch = VectorMismatch; } #else // !WEBP_USE_SSE2 diff --git a/src/3rdparty/libwebp/src/dsp/msa_macro.h b/src/3rdparty/libwebp/src/dsp/msa_macro.h new file mode 100644 index 0000000..5c707f4 --- /dev/null +++ b/src/3rdparty/libwebp/src/dsp/msa_macro.h @@ -0,0 +1,555 @@ +// Copyright 2016 Google Inc. All Rights Reserved. +// +// Use of this source code is governed by a BSD-style license +// that can be found in the COPYING file in the root of the source +// tree. An additional intellectual property rights grant can be found +// in the file PATENTS. All contributing project authors may +// be found in the AUTHORS file in the root of the source tree. 
+// ----------------------------------------------------------------------------- +// +// MSA common macros +// +// Author(s): Prashant Patil (prashant.patil@imgtec.com) + +#ifndef WEBP_DSP_MSA_MACRO_H_ +#define WEBP_DSP_MSA_MACRO_H_ + +#include <stdint.h> +#include <msa.h> + +#if defined(__clang__) + #define CLANG_BUILD +#endif + +#ifdef CLANG_BUILD + #define ADDVI_H(a, b) __msa_addvi_h((v8i16)a, b) + #define SRAI_H(a, b) __msa_srai_h((v8i16)a, b) + #define SRAI_W(a, b) __msa_srai_w((v4i32)a, b) +#else + #define ADDVI_H(a, b) (a + b) + #define SRAI_H(a, b) (a >> b) + #define SRAI_W(a, b) (a >> b) +#endif + +#define LD_B(RTYPE, psrc) *((RTYPE*)(psrc)) +#define LD_UB(...) LD_B(v16u8, __VA_ARGS__) +#define LD_SB(...) LD_B(v16i8, __VA_ARGS__) + +#define LD_H(RTYPE, psrc) *((RTYPE*)(psrc)) +#define LD_UH(...) LD_H(v8u16, __VA_ARGS__) +#define LD_SH(...) LD_H(v8i16, __VA_ARGS__) + +#define LD_W(RTYPE, psrc) *((RTYPE*)(psrc)) +#define LD_UW(...) LD_W(v4u32, __VA_ARGS__) +#define LD_SW(...) LD_W(v4i32, __VA_ARGS__) + +#define ST_B(RTYPE, in, pdst) *((RTYPE*)(pdst)) = in +#define ST_UB(...) ST_B(v16u8, __VA_ARGS__) +#define ST_SB(...) ST_B(v16i8, __VA_ARGS__) + +#define ST_H(RTYPE, in, pdst) *((RTYPE*)(pdst)) = in +#define ST_UH(...) ST_H(v8u16, __VA_ARGS__) +#define ST_SH(...) ST_H(v8i16, __VA_ARGS__) + +#define ST_W(RTYPE, in, pdst) *((RTYPE*)(pdst)) = in +#define ST_UW(...) ST_W(v4u32, __VA_ARGS__) +#define ST_SW(...) ST_W(v4i32, __VA_ARGS__) + +#define MSA_LOAD_FUNC(TYPE, INSTR, FUNC_NAME) \ + static inline TYPE FUNC_NAME(const void* const psrc) { \ + const uint8_t* const psrc_m = (const uint8_t*)psrc; \ + TYPE val_m; \ + asm volatile ( \ + "" #INSTR " %[val_m], %[psrc_m] \n\t" \ + : [val_m] "=r" (val_m) \ + : [psrc_m] "m" (*psrc_m)); \ + return val_m; \ + } + +#define MSA_LOAD(psrc, FUNC_NAME) FUNC_NAME(psrc) + +#define MSA_STORE_FUNC(TYPE, INSTR, FUNC_NAME) \ + static inline void FUNC_NAME(TYPE val, void* const pdst) { \ + uint8_t* const pdst_m = (uint8_t*)pdst; \ + TYPE val_m = val; \ + asm volatile ( \ + " " #INSTR " %[val_m], %[pdst_m] \n\t" \ + : [pdst_m] "=m" (*pdst_m) \ + : [val_m] "r" (val_m)); \ + } + +#define MSA_STORE(val, pdst, FUNC_NAME) FUNC_NAME(val, pdst) + +#if (__mips_isa_rev >= 6) + MSA_LOAD_FUNC(uint16_t, lh, msa_lh); + #define LH(psrc) MSA_LOAD(psrc, msa_lh) + MSA_LOAD_FUNC(uint32_t, lw, msa_lw); + #define LW(psrc) MSA_LOAD(psrc, msa_lw) + #if (__mips == 64) + MSA_LOAD_FUNC(uint64_t, ld, msa_ld); + #define LD(psrc) MSA_LOAD(psrc, msa_ld) + #else // !(__mips == 64) + #define LD(psrc) ((((uint64_t)MSA_LOAD(psrc + 4, msa_lw)) << 32) | \ + MSA_LOAD(psrc, msa_lw)) + #endif // (__mips == 64) + + MSA_STORE_FUNC(uint16_t, sh, msa_sh); + #define SH(val, pdst) MSA_STORE(val, pdst, msa_sh) + MSA_STORE_FUNC(uint32_t, sw, msa_sw); + #define SW(val, pdst) MSA_STORE(val, pdst, msa_sw) + MSA_STORE_FUNC(uint64_t, sd, msa_sd); + #define SD(val, pdst) MSA_STORE(val, pdst, msa_sd) +#else // !(__mips_isa_rev >= 6) + MSA_LOAD_FUNC(uint16_t, ulh, msa_ulh); + #define LH(psrc) MSA_LOAD(psrc, msa_ulh) + MSA_LOAD_FUNC(uint32_t, ulw, msa_ulw); + #define LW(psrc) MSA_LOAD(psrc, msa_ulw) + #if (__mips == 64) + MSA_LOAD_FUNC(uint64_t, uld, msa_uld); + #define LD(psrc) MSA_LOAD(psrc, msa_uld) + #else // !(__mips == 64) + #define LD(psrc) ((((uint64_t)MSA_LOAD(psrc + 4, msa_ulw)) << 32) | \ + MSA_LOAD(psrc, msa_ulw)) + #endif // (__mips == 64) + + MSA_STORE_FUNC(uint16_t, ush, msa_ush); + #define SH(val, pdst) MSA_STORE(val, pdst, msa_ush) + MSA_STORE_FUNC(uint32_t, usw, msa_usw); + #define SW(val, pdst) 
MSA_STORE(val, pdst, msa_usw) + #define SD(val, pdst) { \ + uint8_t* const pdst_sd_m = (uint8_t*)(pdst); \ + const uint32_t val0_m = (uint32_t)(val & 0x00000000FFFFFFFF); \ + const uint32_t val1_m = (uint32_t)((val >> 32) & 0x00000000FFFFFFFF); \ + SW(val0_m, pdst_sd_m); \ + SW(val1_m, pdst_sd_m + 4); \ + } +#endif // (__mips_isa_rev >= 6) + +/* Description : Load 4 words with stride + * Arguments : Inputs - psrc, stride + * Outputs - out0, out1, out2, out3 + * Details : Load word in 'out0' from (psrc) + * Load word in 'out1' from (psrc + stride) + * Load word in 'out2' from (psrc + 2 * stride) + * Load word in 'out3' from (psrc + 3 * stride) + */ +#define LW4(psrc, stride, out0, out1, out2, out3) { \ + const uint8_t* ptmp = (const uint8_t*)psrc; \ + out0 = LW(ptmp); \ + ptmp += stride; \ + out1 = LW(ptmp); \ + ptmp += stride; \ + out2 = LW(ptmp); \ + ptmp += stride; \ + out3 = LW(ptmp); \ +} + +/* Description : Store 4 words with stride + * Arguments : Inputs - in0, in1, in2, in3, pdst, stride + * Details : Store word from 'in0' to (pdst) + * Store word from 'in1' to (pdst + stride) + * Store word from 'in2' to (pdst + 2 * stride) + * Store word from 'in3' to (pdst + 3 * stride) + */ +#define SW4(in0, in1, in2, in3, pdst, stride) { \ + uint8_t* ptmp = (uint8_t*)pdst; \ + SW(in0, ptmp); \ + ptmp += stride; \ + SW(in1, ptmp); \ + ptmp += stride; \ + SW(in2, ptmp); \ + ptmp += stride; \ + SW(in3, ptmp); \ +} + +/* Description : Load vectors with 16 byte elements with stride + * Arguments : Inputs - psrc, stride + * Outputs - out0, out1 + * Return Type - as per RTYPE + * Details : Load 16 byte elements in 'out0' from (psrc) + * Load 16 byte elements in 'out1' from (psrc + stride) + */ +#define LD_B2(RTYPE, psrc, stride, out0, out1) { \ + out0 = LD_B(RTYPE, psrc); \ + out1 = LD_B(RTYPE, psrc + stride); \ +} +#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__) +#define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__) + +#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) { \ + LD_B2(RTYPE, psrc, stride, out0, out1); \ + LD_B2(RTYPE, psrc + 2 * stride , stride, out2, out3); \ +} +#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__) +#define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__) + +/* Description : Load vectors with 8 halfword elements with stride + * Arguments : Inputs - psrc, stride + * Outputs - out0, out1 + * Details : Load 8 halfword elements in 'out0' from (psrc) + * Load 8 halfword elements in 'out1' from (psrc + stride) + */ +#define LD_H2(RTYPE, psrc, stride, out0, out1) { \ + out0 = LD_H(RTYPE, psrc); \ + out1 = LD_H(RTYPE, psrc + stride); \ +} +#define LD_UH2(...) LD_H2(v8u16, __VA_ARGS__) +#define LD_SH2(...) 
LD_H2(v8i16, __VA_ARGS__) + +/* Description : Store 4x4 byte block to destination memory from input vector + * Arguments : Inputs - in0, in1, pdst, stride + * Details : 'Idx0' word element from input vector 'in0' is copied to the + * GP register and stored to (pdst) + * 'Idx1' word element from input vector 'in0' is copied to the + * GP register and stored to (pdst + stride) + * 'Idx2' word element from input vector 'in0' is copied to the + * GP register and stored to (pdst + 2 * stride) + * 'Idx3' word element from input vector 'in0' is copied to the + * GP register and stored to (pdst + 3 * stride) + */ +#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) { \ + uint8_t* const pblk_4x4_m = (uint8_t*)pdst; \ + const uint32_t out0_m = __msa_copy_s_w((v4i32)in0, idx0); \ + const uint32_t out1_m = __msa_copy_s_w((v4i32)in0, idx1); \ + const uint32_t out2_m = __msa_copy_s_w((v4i32)in1, idx2); \ + const uint32_t out3_m = __msa_copy_s_w((v4i32)in1, idx3); \ + SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride); \ +} + +/* Description : Immediate number of elements to slide + * Arguments : Inputs - in0, in1, slide_val + * Outputs - out + * Return Type - as per RTYPE + * Details : Byte elements from 'in1' vector are slid into 'in0' by + * value specified in the 'slide_val' + */ +#define SLDI_B(RTYPE, in0, in1, slide_val) \ + (RTYPE)__msa_sldi_b((v16i8)in0, (v16i8)in1, slide_val) \ + +#define SLDI_UB(...) SLDI_B(v16u8, __VA_ARGS__) +#define SLDI_SB(...) SLDI_B(v16i8, __VA_ARGS__) +#define SLDI_SH(...) SLDI_B(v8i16, __VA_ARGS__) + +/* Description : Shuffle halfword vector elements as per mask vector + * Arguments : Inputs - in0, in1, in2, in3, mask0, mask1 + * Outputs - out0, out1 + * Return Type - as per RTYPE + * Details : halfword elements from 'in0' & 'in1' are copied selectively to + * 'out0' as per control vector 'mask0' + */ +#define VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) { \ + out0 = (RTYPE)__msa_vshf_h((v8i16)mask0, (v8i16)in1, (v8i16)in0); \ + out1 = (RTYPE)__msa_vshf_h((v8i16)mask1, (v8i16)in3, (v8i16)in2); \ +} +#define VSHF_H2_UH(...) VSHF_H2(v8u16, __VA_ARGS__) +#define VSHF_H2_SH(...) VSHF_H2(v8i16, __VA_ARGS__) + +/* Description : Clips all signed halfword elements of input vector + * between 0 & 255 + * Arguments : Input/output - val + * Return Type - signed halfword + */ +#define CLIP_SH_0_255(val) { \ + const v8i16 max_m = __msa_ldi_h(255); \ + val = __msa_maxi_s_h((v8i16)val, 0); \ + val = __msa_min_s_h(max_m, (v8i16)val); \ +} +#define CLIP_SH2_0_255(in0, in1) { \ + CLIP_SH_0_255(in0); \ + CLIP_SH_0_255(in1); \ +} + +/* Description : Clips all signed word elements of input vector + * between 0 & 255 + * Arguments : Input/output - val + * Return Type - signed word + */ +#define CLIP_SW_0_255(val) { \ + const v4i32 max_m = __msa_ldi_w(255); \ + val = __msa_maxi_s_w((v4i32)val, 0); \ + val = __msa_min_s_w(max_m, (v4i32)val); \ +} +#define CLIP_SW4_0_255(in0, in1, in2, in3) { \ + CLIP_SW_0_255(in0); \ + CLIP_SW_0_255(in1); \ + CLIP_SW_0_255(in2); \ + CLIP_SW_0_255(in3); \ +} + +/* Description : Set element n input vector to GPR value + * Arguments : Inputs - in0, in1, in2, in3 + * Output - out + * Return Type - as per RTYPE + * Details : Set element 0 in vector 'out' to value specified in 'in0' + */ +#define INSERT_W2(RTYPE, in0, in1, out) { \ + out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \ + out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \ +} +#define INSERT_W2_UB(...) INSERT_W2(v16u8, __VA_ARGS__) +#define INSERT_W2_SB(...) 
INSERT_W2(v16i8, __VA_ARGS__) + +#define INSERT_W4(RTYPE, in0, in1, in2, in3, out) { \ + out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \ + out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \ + out = (RTYPE)__msa_insert_w((v4i32)out, 2, in2); \ + out = (RTYPE)__msa_insert_w((v4i32)out, 3, in3); \ +} +#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__) +#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__) +#define INSERT_W4_SW(...) INSERT_W4(v4i32, __VA_ARGS__) + +/* Description : Interleave right half of byte elements from vectors + * Arguments : Inputs - in0, in1, in2, in3 + * Outputs - out0, out1 + * Return Type - as per RTYPE + * Details : Right half of byte elements of 'in0' and 'in1' are interleaved + * and written to out0. + */ +#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \ + out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \ + out1 = (RTYPE)__msa_ilvr_b((v16i8)in2, (v16i8)in3); \ +} +#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__) +#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__) +#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__) +#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__) +#define ILVR_B2_SW(...) ILVR_B2(v4i32, __VA_ARGS__) + +#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3) { \ + ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3); \ +} +#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__) +#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__) +#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__) +#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__) +#define ILVR_B4_SW(...) ILVR_B4(v4i32, __VA_ARGS__) + +/* Description : Interleave right half of halfword elements from vectors + * Arguments : Inputs - in0, in1, in2, in3 + * Outputs - out0, out1 + * Return Type - as per RTYPE + * Details : Right half of halfword elements of 'in0' and 'in1' are + * interleaved and written to 'out0'. + */ +#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \ + out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \ + out1 = (RTYPE)__msa_ilvr_h((v8i16)in2, (v8i16)in3); \ +} +#define ILVR_H2_UB(...) ILVR_H2(v16u8, __VA_ARGS__) +#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__) +#define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__) + +#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3) { \ + ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1); \ + ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3); \ +} +#define ILVR_H4_UB(...) ILVR_H4(v16u8, __VA_ARGS__) +#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__) +#define ILVR_H4_SW(...) ILVR_H4(v4i32, __VA_ARGS__) + +/* Description : Interleave right half of double word elements from vectors + * Arguments : Inputs - in0, in1, in2, in3 + * Outputs - out0, out1 + * Return Type - as per RTYPE + * Details : Right half of double word elements of 'in0' and 'in1' are + * interleaved and written to 'out0'. + */ +#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) { \ + out0 = (RTYPE)__msa_ilvr_d((v2i64)in0, (v2i64)in1); \ + out1 = (RTYPE)__msa_ilvr_d((v2i64)in2, (v2i64)in3); \ +} +#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__) +#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__) +#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__) + +#define ILVRL_H2(RTYPE, in0, in1, out0, out1) { \ + out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \ + out1 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \ +} +#define ILVRL_H2_UB(...) ILVRL_H2(v16u8, __VA_ARGS__) +#define ILVRL_H2_SB(...) 
ILVRL_H2(v16i8, __VA_ARGS__) +#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__) +#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__) +#define ILVRL_H2_UW(...) ILVRL_H2(v4u32, __VA_ARGS__) + +#define ILVRL_W2(RTYPE, in0, in1, out0, out1) { \ + out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \ + out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \ +} +#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__) +#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__) +#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__) + +/* Description : Pack even byte elements of vector pairs + * Arguments : Inputs - in0, in1, in2, in3 + * Outputs - out0, out1 + * Return Type - as per RTYPE + * Details : Even byte elements of 'in0' are copied to the left half of + * 'out0' & even byte elements of 'in1' are copied to the right + * half of 'out0'. + */ +#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \ + out0 = (RTYPE)__msa_pckev_b((v16i8)in0, (v16i8)in1); \ + out1 = (RTYPE)__msa_pckev_b((v16i8)in2, (v16i8)in3); \ +} +#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__) +#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__) +#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__) +#define PCKEV_B2_SW(...) PCKEV_B2(v4i32, __VA_ARGS__) + +/* Description : Arithmetic immediate shift right all elements of word vector + * Arguments : Inputs - in0, in1, shift + * Outputs - in place operation + * Return Type - as per input vector RTYPE + * Details : Each element of vector 'in0' is right shifted by 'shift' and + * the result is written in-place. 'shift' is a GP variable. + */ +#define SRAI_W2(RTYPE, in0, in1, shift_val) { \ + in0 = (RTYPE)SRAI_W(in0, shift_val); \ + in1 = (RTYPE)SRAI_W(in1, shift_val); \ +} +#define SRAI_W2_SW(...) SRAI_W2(v4i32, __VA_ARGS__) +#define SRAI_W2_UW(...) SRAI_W2(v4u32, __VA_ARGS__) + +#define SRAI_W4(RTYPE, in0, in1, in2, in3, shift_val) { \ + SRAI_W2(RTYPE, in0, in1, shift_val); \ + SRAI_W2(RTYPE, in2, in3, shift_val); \ +} +#define SRAI_W4_SW(...) SRAI_W4(v4i32, __VA_ARGS__) +#define SRAI_W4_UW(...) SRAI_W4(v4u32, __VA_ARGS__) + +/* Description : Arithmetic shift right all elements of half-word vector + * Arguments : Inputs - in0, in1, shift + * Outputs - in place operation + * Return Type - as per input vector RTYPE + * Details : Each element of vector 'in0' is right shifted by 'shift' and + * the result is written in-place. 'shift' is a GP variable. + */ +#define SRAI_H2(RTYPE, in0, in1, shift_val) { \ + in0 = (RTYPE)SRAI_H(in0, shift_val); \ + in1 = (RTYPE)SRAI_H(in1, shift_val); \ +} +#define SRAI_H2_SH(...) SRAI_H2(v8i16, __VA_ARGS__) +#define SRAI_H2_UH(...) SRAI_H2(v8u16, __VA_ARGS__) + +/* Description : Arithmetic rounded shift right all elements of word vector + * Arguments : Inputs - in0, in1, shift + * Outputs - in place operation + * Return Type - as per input vector RTYPE + * Details : Each element of vector 'in0' is right shifted by 'shift' and + * the result is written in-place. 'shift' is a GP variable. + */ +#define SRARI_W2(RTYPE, in0, in1, shift) { \ + in0 = (RTYPE)__msa_srari_w((v4i32)in0, shift); \ + in1 = (RTYPE)__msa_srari_w((v4i32)in1, shift); \ +} +#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__) + +#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) { \ + SRARI_W2(RTYPE, in0, in1, shift); \ + SRARI_W2(RTYPE, in2, in3, shift); \ +} +#define SRARI_W4_SH(...) SRARI_W4(v8i16, __VA_ARGS__) +#define SRARI_W4_UW(...) SRARI_W4(v4u32, __VA_ARGS__) +#define SRARI_W4_SW(...) 
SRARI_W4(v4i32, __VA_ARGS__) + +/* Description : Addition of 2 pairs of half-word vectors + * Arguments : Inputs - in0, in1, in2, in3 + * Outputs - out0, out1 + * Details : Each element in 'in0' is added to 'in1' and result is written + * to 'out0'. + */ +#define ADDVI_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \ + out0 = (RTYPE)ADDVI_H(in0, in1); \ + out1 = (RTYPE)ADDVI_H(in2, in3); \ +} +#define ADDVI_H2_SH(...) ADDVI_H2(v8i16, __VA_ARGS__) +#define ADDVI_H2_UH(...) ADDVI_H2(v8u16, __VA_ARGS__) + +/* Description : Addition of 2 pairs of vectors + * Arguments : Inputs - in0, in1, in2, in3 + * Outputs - out0, out1 + * Details : Each element in 'in0' is added to 'in1' and result is written + * to 'out0'. + */ +#define ADD2(in0, in1, in2, in3, out0, out1) { \ + out0 = in0 + in1; \ + out1 = in2 + in3; \ +} +#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, \ + out0, out1, out2, out3) { \ + ADD2(in0, in1, in2, in3, out0, out1); \ + ADD2(in4, in5, in6, in7, out2, out3); \ +} + +/* Description : Sign extend halfword elements from input vector and return + * the result in pair of vectors + * Arguments : Input - in (halfword vector) + * Outputs - out0, out1 (sign extended word vectors) + * Return Type - signed word + * Details : Sign bit of halfword elements from input vector 'in' is + * extracted and interleaved right with same vector 'in0' to + * generate 4 signed word elements in 'out0' + * Then interleaved left with same vector 'in0' to + * generate 4 signed word elements in 'out1' + */ +#define UNPCK_SH_SW(in, out0, out1) { \ + const v8i16 tmp_m = __msa_clti_s_h((v8i16)in, 0); \ + ILVRL_H2_SW(tmp_m, in, out0, out1); \ +} + +/* Description : Butterfly of 4 input vectors + * Arguments : Inputs - in0, in1, in2, in3 + * Outputs - out0, out1, out2, out3 + * Details : Butterfly operation + */ +#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) { \ + out0 = in0 + in3; \ + out1 = in1 + in2; \ + out2 = in1 - in2; \ + out3 = in0 - in3; \ +} + +/* Description : Transpose 4x4 block with word elements in vectors + * Arguments : Inputs - in0, in1, in2, in3 + * Outputs - out0, out1, out2, out3 + * Return Type - as per RTYPE + */ +#define TRANSPOSE4x4_W(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) { \ + v4i32 s0_m, s1_m, s2_m, s3_m; \ + ILVRL_W2_SW(in1, in0, s0_m, s1_m); \ + ILVRL_W2_SW(in3, in2, s2_m, s3_m); \ + out0 = (RTYPE)__msa_ilvr_d((v2i64)s2_m, (v2i64)s0_m); \ + out1 = (RTYPE)__msa_ilvl_d((v2i64)s2_m, (v2i64)s0_m); \ + out2 = (RTYPE)__msa_ilvr_d((v2i64)s3_m, (v2i64)s1_m); \ + out3 = (RTYPE)__msa_ilvl_d((v2i64)s3_m, (v2i64)s1_m); \ +} +#define TRANSPOSE4x4_SW_SW(...) TRANSPOSE4x4_W(v4i32, __VA_ARGS__) + +/* Description : Add block 4x4 + * Arguments : Inputs - in0, in1, in2, in3, pdst, stride + * Details : Least significant 4 bytes from each input vector are added to + * the destination bytes, clipped between 0-255 and stored. 
+ */ +#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride) { \ + uint32_t src0_m, src1_m, src2_m, src3_m; \ + v8i16 inp0_m, inp1_m, res0_m, res1_m; \ + v16i8 dst0_m = { 0 }; \ + v16i8 dst1_m = { 0 }; \ + const v16i8 zero_m = { 0 }; \ + ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m); \ + LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m); \ + INSERT_W2_SB(src0_m, src1_m, dst0_m); \ + INSERT_W2_SB(src2_m, src3_m, dst1_m); \ + ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m); \ + ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m); \ + CLIP_SH2_0_255(res0_m, res1_m); \ + PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m); \ + ST4x4_UB(dst0_m, dst1_m, 0, 1, 0, 1, pdst, stride); \ +} + +#endif /* WEBP_DSP_MSA_MACRO_H_ */ diff --git a/src/3rdparty/libwebp/src/dsp/rescaler_sse2.c b/src/3rdparty/libwebp/src/dsp/rescaler_sse2.c index 5ea4ddb..5b97028 100644 --- a/src/3rdparty/libwebp/src/dsp/rescaler_sse2.c +++ b/src/3rdparty/libwebp/src/dsp/rescaler_sse2.c @@ -18,6 +18,7 @@ #include <assert.h> #include "../utils/rescaler.h" +#include "../utils/utils.h" //------------------------------------------------------------------------------ // Implementations of critical functions ImportRow / ExportRow diff --git a/src/3rdparty/libwebp/src/dsp/upsampling_mips_dsp_r2.c b/src/3rdparty/libwebp/src/dsp/upsampling_mips_dsp_r2.c index d4ccbe0..ed2eb74 100644 --- a/src/3rdparty/libwebp/src/dsp/upsampling_mips_dsp_r2.c +++ b/src/3rdparty/libwebp/src/dsp/upsampling_mips_dsp_r2.c @@ -14,9 +14,7 @@ #include "./dsp.h" -// Code is disabled for now, in favor of the plain-C version -// TODO(djordje.pesut): adapt the code to reflect the C-version. -#if 0 // defined(WEBP_USE_MIPS_DSP_R2) +#if defined(WEBP_USE_MIPS_DSP_R2) #include <assert.h> #include "./yuv.h" @@ -24,21 +22,21 @@ #if !defined(WEBP_YUV_USE_TABLE) #define YUV_TO_RGB(Y, U, V, R, G, B) do { \ - const int t1 = kYScale * Y; \ - const int t2 = kVToG * V; \ - R = kVToR * V; \ - G = kUToG * U; \ - B = kUToB * U; \ + const int t1 = MultHi(Y, 19077); \ + const int t2 = MultHi(V, 13320); \ + R = MultHi(V, 26149); \ + G = MultHi(U, 6419); \ + B = MultHi(U, 33050); \ R = t1 + R; \ G = t1 - G; \ B = t1 + B; \ - R = R + kRCst; \ - G = G - t2 + kGCst; \ - B = B + kBCst; \ + R = R - 14234; \ + G = G - t2 + 8708; \ + B = B - 17685; \ __asm__ volatile ( \ - "shll_s.w %[" #R "], %[" #R "], 9 \n\t" \ - "shll_s.w %[" #G "], %[" #G "], 9 \n\t" \ - "shll_s.w %[" #B "], %[" #B "], 9 \n\t" \ + "shll_s.w %[" #R "], %[" #R "], 17 \n\t" \ + "shll_s.w %[" #G "], %[" #G "], 17 \n\t" \ + "shll_s.w %[" #B "], %[" #B "], 17 \n\t" \ "precrqu_s.qb.ph %[" #R "], %[" #R "], $zero \n\t" \ "precrqu_s.qb.ph %[" #G "], %[" #G "], $zero \n\t" \ "precrqu_s.qb.ph %[" #B "], %[" #B "], $zero \n\t" \ @@ -279,6 +277,6 @@ WEBP_DSP_INIT_STUB(WebPInitYUV444ConvertersMIPSdspR2) #endif // WEBP_USE_MIPS_DSP_R2 -#if 1 // !(defined(FANCY_UPSAMPLING) && defined(WEBP_USE_MIPS_DSP_R2)) +#if !(defined(FANCY_UPSAMPLING) && defined(WEBP_USE_MIPS_DSP_R2)) WEBP_DSP_INIT_STUB(WebPInitUpsamplersMIPSdspR2) #endif diff --git a/src/3rdparty/libwebp/src/dsp/yuv_mips32.c b/src/3rdparty/libwebp/src/dsp/yuv_mips32.c index b8fe512..e61aac5 100644 --- a/src/3rdparty/libwebp/src/dsp/yuv_mips32.c +++ b/src/3rdparty/libwebp/src/dsp/yuv_mips32.c @@ -14,8 +14,7 @@ #include "./dsp.h" -// Code is disabled for now, in favor of the plain-C version -#if 0 // defined(WEBP_USE_MIPS32) +#if defined(WEBP_USE_MIPS32) #include "./yuv.h" @@ -29,19 +28,19 @@ static void FUNC_NAME(const uint8_t* y, \ int i, r, g, b; \ int 
temp0, temp1, temp2, temp3, temp4; \ for (i = 0; i < (len >> 1); i++) { \ - temp1 = kVToR * v[0]; \ - temp3 = kVToG * v[0]; \ - temp2 = kUToG * u[0]; \ - temp4 = kUToB * u[0]; \ - temp0 = kYScale * y[0]; \ - temp1 += kRCst; \ - temp3 -= kGCst; \ + temp1 = MultHi(v[0], 26149); \ + temp3 = MultHi(v[0], 13320); \ + temp2 = MultHi(u[0], 6419); \ + temp4 = MultHi(u[0], 33050); \ + temp0 = MultHi(y[0], 19077); \ + temp1 -= 14234; \ + temp3 -= 8708; \ temp2 += temp3; \ - temp4 += kBCst; \ + temp4 -= 17685; \ r = VP8Clip8(temp0 + temp1); \ g = VP8Clip8(temp0 - temp2); \ b = VP8Clip8(temp0 + temp4); \ - temp0 = kYScale * y[1]; \ + temp0 = MultHi(y[1], 19077); \ dst[R] = r; \ dst[G] = g; \ dst[B] = b; \ @@ -59,15 +58,15 @@ static void FUNC_NAME(const uint8_t* y, \ dst += 2 * XSTEP; \ } \ if (len & 1) { \ - temp1 = kVToR * v[0]; \ - temp3 = kVToG * v[0]; \ - temp2 = kUToG * u[0]; \ - temp4 = kUToB * u[0]; \ - temp0 = kYScale * y[0]; \ - temp1 += kRCst; \ - temp3 -= kGCst; \ + temp1 = MultHi(v[0], 26149); \ + temp3 = MultHi(v[0], 13320); \ + temp2 = MultHi(u[0], 6419); \ + temp4 = MultHi(u[0], 33050); \ + temp0 = MultHi(y[0], 19077); \ + temp1 -= 14234; \ + temp3 -= 8708; \ temp2 += temp3; \ - temp4 += kBCst; \ + temp4 -= 17685; \ r = VP8Clip8(temp0 + temp1); \ g = VP8Clip8(temp0 - temp2); \ b = VP8Clip8(temp0 + temp4); \ diff --git a/src/3rdparty/libwebp/src/dsp/yuv_mips_dsp_r2.c b/src/3rdparty/libwebp/src/dsp/yuv_mips_dsp_r2.c index dea0fdb..1720d41 100644 --- a/src/3rdparty/libwebp/src/dsp/yuv_mips_dsp_r2.c +++ b/src/3rdparty/libwebp/src/dsp/yuv_mips_dsp_r2.c @@ -14,8 +14,7 @@ #include "./dsp.h" -// Code is disabled for now, in favor of the plain-C version -#if 0 // defined(WEBP_USE_MIPS_DSP_R2) +#if defined(WEBP_USE_MIPS_DSP_R2) #include "./yuv.h" @@ -31,10 +30,10 @@ "mul %[temp2], %[t_con_3], %[temp4] \n\t" \ "mul %[temp4], %[t_con_4], %[temp4] \n\t" \ "mul %[temp0], %[t_con_5], %[temp0] \n\t" \ - "addu %[temp1], %[temp1], %[t_con_6] \n\t" \ + "subu %[temp1], %[temp1], %[t_con_6] \n\t" \ "subu %[temp3], %[temp3], %[t_con_7] \n\t" \ "addu %[temp2], %[temp2], %[temp3] \n\t" \ - "addu %[temp4], %[temp4], %[t_con_8] \n\t" \ + "subu %[temp4], %[temp4], %[t_con_8] \n\t" \ #define ROW_FUNC_PART_2(R, G, B, K) \ "addu %[temp5], %[temp0], %[temp1] \n\t" \ @@ -43,12 +42,12 @@ ".if " #K " \n\t" \ "lbu %[temp0], 1(%[y]) \n\t" \ ".endif \n\t" \ - "shll_s.w %[temp5], %[temp5], 9 \n\t" \ - "shll_s.w %[temp6], %[temp6], 9 \n\t" \ + "shll_s.w %[temp5], %[temp5], 17 \n\t" \ + "shll_s.w %[temp6], %[temp6], 17 \n\t" \ ".if " #K " \n\t" \ "mul %[temp0], %[t_con_5], %[temp0] \n\t" \ ".endif \n\t" \ - "shll_s.w %[temp7], %[temp7], 9 \n\t" \ + "shll_s.w %[temp7], %[temp7], 17 \n\t" \ "precrqu_s.qb.ph %[temp5], %[temp5], $zero \n\t" \ "precrqu_s.qb.ph %[temp6], %[temp6], $zero \n\t" \ "precrqu_s.qb.ph %[temp7], %[temp7], $zero \n\t" \ @@ -75,14 +74,14 @@ static void FUNC_NAME(const uint8_t* y, \ uint8_t* dst, int len) { \ int i; \ uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; \ - const int t_con_1 = kVToR; \ - const int t_con_2 = kVToG; \ - const int t_con_3 = kUToG; \ - const int t_con_4 = kUToB; \ - const int t_con_5 = kYScale; \ - const int t_con_6 = kRCst; \ - const int t_con_7 = kGCst; \ - const int t_con_8 = kBCst; \ + const int t_con_1 = 26149; \ + const int t_con_2 = 13320; \ + const int t_con_3 = 6419; \ + const int t_con_4 = 33050; \ + const int t_con_5 = 19077; \ + const int t_con_6 = 14234; \ + const int t_con_7 = 8708; \ + const int t_con_8 = 17685; \ for (i = 0; i < (len >> 1); i++) { \ 
__asm__ volatile ( \ ROW_FUNC_PART_1() \ diff --git a/src/3rdparty/libwebp/src/dsp/yuv_sse2.c b/src/3rdparty/libwebp/src/dsp/yuv_sse2.c index f72fe32..e19bddf 100644 --- a/src/3rdparty/libwebp/src/dsp/yuv_sse2.c +++ b/src/3rdparty/libwebp/src/dsp/yuv_sse2.c @@ -33,7 +33,8 @@ static void ConvertYUV444ToRGB(const __m128i* const Y0, const __m128i k19077 = _mm_set1_epi16(19077); const __m128i k26149 = _mm_set1_epi16(26149); const __m128i k14234 = _mm_set1_epi16(14234); - const __m128i k33050 = _mm_set1_epi16(33050); + // 33050 doesn't fit in a signed short: only use this with unsigned arithmetic + const __m128i k33050 = _mm_set1_epi16((short)33050); const __m128i k17685 = _mm_set1_epi16(17685); const __m128i k6419 = _mm_set1_epi16(6419); const __m128i k13320 = _mm_set1_epi16(13320);
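
Editor's reference note on the new YUV coefficients used throughout the MIPS32, MIPS DSP-R2 and SSE2 hunks above: the old kYScale/kVToR/kUToG-style constants are replaced by the 16-bit fixed-point coefficients 19077, 26149, 6419, 13320 and 33050 combined through MultHi(), with the offsets 14234, 8708 and 17685 applied directly. The plain-C sketch below mirrors that arithmetic for a single pixel. It is illustrative only: the helper names mult_hi, clip8 and yuv_to_rgb_sketch are hypothetical rather than libwebp API, and the assumption that the intermediate values carry 6 fractional bits (libwebp's YUV_FIX2) comes from yuv.h, not from this patch.

/* Illustrative sketch of the fixed-point YUV->RGB math used by the hunks
 * above. Only the coefficients and the MultHi()-style ">> 8" come from the
 * patch itself; everything else is an assumption for readability. */
#include <stdint.h>

static int mult_hi(int v, int coeff) {   /* same shape as MultHi(): (v * coeff) >> 8 */
  return (v * coeff) >> 8;
}

static uint8_t clip8(int v) {            /* drop the assumed 6 fractional bits, clamp to [0, 255] */
  v >>= 6;
  return (uint8_t)((v < 0) ? 0 : (v > 255) ? 255 : v);
}

static void yuv_to_rgb_sketch(int y, int u, int v,
                              uint8_t* r, uint8_t* g, uint8_t* b) {
  const int t1 = mult_hi(y, 19077);      /* luma term shared by R, G and B */
  *r = clip8(t1 + mult_hi(v, 26149) - 14234);
  *g = clip8(t1 - mult_hi(u, 6419) - mult_hi(v, 13320) + 8708);
  *b = clip8(t1 + mult_hi(u, 33050) - 17685);
}

For example, yuv_to_rgb_sketch(128, 128, 128, &r, &g, &b) yields (130, 130, 130): neutral chroma maps to gray, and Y = 128 expands to roughly 1.164 * (128 - 16). In the DSP-R2 hunks the saturating "shll_s.w ..., 17" / "precrqu_s.qb.ph" pairs play the role of clip8(); the shift grows from 9 to 17 because MultHi() has already discarded 8 fractional bits, and k33050 needs the (short) cast for the same reason the in-diff comment gives: 33050 only makes sense under unsigned 16-bit arithmetic.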