author     Liang Qi <liang.qi@theqtcompany.com>    2016-03-09 10:22:13 +0100
committer  Liang Qi <liang.qi@theqtcompany.com>    2016-03-11 20:05:19 +0000
commit     b114e552211456fbde3ff6ca2da21cbc8d1e90e2 (patch)
tree       9c033ea7bcc9cc7314eaa8aff57356b2ae301257 /src/3rdparty/libwebp/src/dsp
parent     1d4f24820c0fff474d524e006d715e13e409a4b8 (diff)
libwebp: update to 0.5.0
This commit imports libwebp 0.5.0, including the AUTHORS, COPYING, ChangeLog, NEWS, PATENTS and README files and the src directory. From src, only header and source files are included. The patches required to build it in Qt will follow in separate commit(s).

Change-Id: I96b4961ba63c75cc7fbab158c36a0f403f254c14
Reviewed-by: aavit <eirik.aavitsland@theqtcompany.com>
Diffstat (limited to 'src/3rdparty/libwebp/src/dsp')
-rw-r--r--  src/3rdparty/libwebp/src/dsp/alpha_processing.c                60
-rw-r--r--  src/3rdparty/libwebp/src/dsp/alpha_processing_mips_dsp_r2.c   141
-rw-r--r--  src/3rdparty/libwebp/src/dsp/alpha_processing_sse2.c          231
-rw-r--r--  src/3rdparty/libwebp/src/dsp/alpha_processing_sse41.c          92
-rw-r--r--  src/3rdparty/libwebp/src/dsp/argb.c                            68
-rw-r--r--  src/3rdparty/libwebp/src/dsp/argb_mips_dsp_r2.c               110
-rw-r--r--  src/3rdparty/libwebp/src/dsp/argb_sse2.c                       67
-rw-r--r--  src/3rdparty/libwebp/src/dsp/cost.c                           412
-rw-r--r--  src/3rdparty/libwebp/src/dsp/cost_mips32.c                    154
-rw-r--r--  src/3rdparty/libwebp/src/dsp/cost_mips_dsp_r2.c               107
-rw-r--r--  src/3rdparty/libwebp/src/dsp/cost_sse2.c                      119
-rw-r--r--  src/3rdparty/libwebp/src/dsp/cpu.c                             38
-rw-r--r--  src/3rdparty/libwebp/src/dsp/dec.c                            174
-rw-r--r--  src/3rdparty/libwebp/src/dsp/dec_clip_tables.c                  2
-rw-r--r--  src/3rdparty/libwebp/src/dsp/dec_mips32.c                      97
-rw-r--r--  src/3rdparty/libwebp/src/dsp/dec_mips_dsp_r2.c                994
-rw-r--r--  src/3rdparty/libwebp/src/dsp/dec_neon.c                       397
-rw-r--r--  src/3rdparty/libwebp/src/dsp/dec_sse2.c                       486
-rw-r--r--  src/3rdparty/libwebp/src/dsp/dec_sse41.c                       45
-rw-r--r--  src/3rdparty/libwebp/src/dsp/dsp.h                            237
-rw-r--r--  src/3rdparty/libwebp/src/dsp/enc.c                            101
-rw-r--r--  src/3rdparty/libwebp/src/dsp/enc_avx2.c                         5
-rw-r--r--  src/3rdparty/libwebp/src/dsp/enc_mips32.c                     482
-rw-r--r--  src/3rdparty/libwebp/src/dsp/enc_mips_dsp_r2.c               1512
-rw-r--r--  src/3rdparty/libwebp/src/dsp/enc_neon.c                       483
-rw-r--r--  src/3rdparty/libwebp/src/dsp/enc_sse2.c                      1258
-rw-r--r--  src/3rdparty/libwebp/src/dsp/enc_sse41.c                      373
-rw-r--r--  src/3rdparty/libwebp/src/dsp/filters.c                        240
-rw-r--r--  src/3rdparty/libwebp/src/dsp/filters_mips_dsp_r2.c            405
-rw-r--r--  src/3rdparty/libwebp/src/dsp/filters_sse2.c                   352
-rw-r--r--  src/3rdparty/libwebp/src/dsp/lossless.c                      1106
-rw-r--r--  src/3rdparty/libwebp/src/dsp/lossless.h                       165
-rw-r--r--  src/3rdparty/libwebp/src/dsp/lossless_enc.c                  1215
-rw-r--r--  src/3rdparty/libwebp/src/dsp/lossless_enc_mips32.c (renamed from src/3rdparty/libwebp/src/dsp/lossless_mips32.c)  90
-rw-r--r--  src/3rdparty/libwebp/src/dsp/lossless_enc_mips_dsp_r2.c       275
-rw-r--r--  src/3rdparty/libwebp/src/dsp/lossless_enc_neon.c              143
-rw-r--r--  src/3rdparty/libwebp/src/dsp/lossless_enc_sse2.c              345
-rw-r--r--  src/3rdparty/libwebp/src/dsp/lossless_enc_sse41.c              51
-rw-r--r--  src/3rdparty/libwebp/src/dsp/lossless_mips_dsp_r2.c           680
-rw-r--r--  src/3rdparty/libwebp/src/dsp/lossless_neon.c                  216
-rw-r--r--  src/3rdparty/libwebp/src/dsp/lossless_sse2.c                  241
-rw-r--r--  src/3rdparty/libwebp/src/dsp/mips_macro.h                     200
-rw-r--r--  src/3rdparty/libwebp/src/dsp/neon.h                             2
-rw-r--r--  src/3rdparty/libwebp/src/dsp/rescaler.c                       238
-rw-r--r--  src/3rdparty/libwebp/src/dsp/rescaler_mips32.c                291
-rw-r--r--  src/3rdparty/libwebp/src/dsp/rescaler_mips_dsp_r2.c           314
-rw-r--r--  src/3rdparty/libwebp/src/dsp/rescaler_neon.c                  186
-rw-r--r--  src/3rdparty/libwebp/src/dsp/rescaler_sse2.c                  374
-rw-r--r--  src/3rdparty/libwebp/src/dsp/upsampling.c                      78
-rw-r--r--  src/3rdparty/libwebp/src/dsp/upsampling_mips_dsp_r2.c         284
-rw-r--r--  src/3rdparty/libwebp/src/dsp/upsampling_neon.c                133
-rw-r--r--  src/3rdparty/libwebp/src/dsp/upsampling_sse2.c                 73
-rw-r--r--  src/3rdparty/libwebp/src/dsp/yuv.c                            126
-rw-r--r--  src/3rdparty/libwebp/src/dsp/yuv.h                            131
-rw-r--r--  src/3rdparty/libwebp/src/dsp/yuv_mips32.c                      16
-rw-r--r--  src/3rdparty/libwebp/src/dsp/yuv_mips_dsp_r2.c                135
-rw-r--r--  src/3rdparty/libwebp/src/dsp/yuv_sse2.c                       894
-rw-r--r--  src/3rdparty/libwebp/src/dsp/yuv_tables_sse2.h                536
58 files changed, 14023 insertions, 3757 deletions
diff --git a/src/3rdparty/libwebp/src/dsp/alpha_processing.c b/src/3rdparty/libwebp/src/dsp/alpha_processing.c
index c8e0b4b..1716cac 100644
--- a/src/3rdparty/libwebp/src/dsp/alpha_processing.c
+++ b/src/3rdparty/libwebp/src/dsp/alpha_processing.c
@@ -134,7 +134,7 @@ static WEBP_INLINE uint32_t GetScale(uint32_t a, int inverse) {
#endif // USE_TABLES_FOR_ALPHA_MULT
-static void MultARGBRow(uint32_t* const ptr, int width, int inverse) {
+void WebPMultARGBRowC(uint32_t* const ptr, int width, int inverse) {
int x;
for (x = 0; x < width; ++x) {
const uint32_t argb = ptr[x];
@@ -154,8 +154,8 @@ static void MultARGBRow(uint32_t* const ptr, int width, int inverse) {
}
}
-static void MultRow(uint8_t* const ptr, const uint8_t* const alpha,
- int width, int inverse) {
+void WebPMultRowC(uint8_t* const ptr, const uint8_t* const alpha,
+ int width, int inverse) {
int x;
for (x = 0; x < width; ++x) {
const uint32_t a = alpha[x];
@@ -284,6 +284,38 @@ static void ApplyAlphaMultiply_16b(uint8_t* rgba4444,
#endif
}
+static int DispatchAlpha(const uint8_t* alpha, int alpha_stride,
+ int width, int height,
+ uint8_t* dst, int dst_stride) {
+ uint32_t alpha_mask = 0xff;
+ int i, j;
+
+ for (j = 0; j < height; ++j) {
+ for (i = 0; i < width; ++i) {
+ const uint32_t alpha_value = alpha[i];
+ dst[4 * i] = alpha_value;
+ alpha_mask &= alpha_value;
+ }
+ alpha += alpha_stride;
+ dst += dst_stride;
+ }
+
+ return (alpha_mask != 0xff);
+}
+
+static void DispatchAlphaToGreen(const uint8_t* alpha, int alpha_stride,
+ int width, int height,
+ uint32_t* dst, int dst_stride) {
+ int i, j;
+ for (j = 0; j < height; ++j) {
+ for (i = 0; i < width; ++i) {
+ dst[i] = alpha[i] << 8; // leave A/R/B channels zero'd.
+ }
+ alpha += alpha_stride;
+ dst += dst_stride;
+ }
+}
+
static int ExtractAlpha(const uint8_t* argb, int argb_stride,
int width, int height,
uint8_t* alpha, int alpha_stride) {
@@ -304,23 +336,29 @@ static int ExtractAlpha(const uint8_t* argb, int argb_stride,
void (*WebPApplyAlphaMultiply)(uint8_t*, int, int, int, int);
void (*WebPApplyAlphaMultiply4444)(uint8_t*, int, int, int);
+int (*WebPDispatchAlpha)(const uint8_t*, int, int, int, uint8_t*, int);
+void (*WebPDispatchAlphaToGreen)(const uint8_t*, int, int, int, uint32_t*, int);
int (*WebPExtractAlpha)(const uint8_t*, int, int, int, uint8_t*, int);
//------------------------------------------------------------------------------
// Init function
+extern void WebPInitAlphaProcessingMIPSdspR2(void);
extern void WebPInitAlphaProcessingSSE2(void);
+extern void WebPInitAlphaProcessingSSE41(void);
static volatile VP8CPUInfo alpha_processing_last_cpuinfo_used =
(VP8CPUInfo)&alpha_processing_last_cpuinfo_used;
-void WebPInitAlphaProcessing(void) {
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessing(void) {
if (alpha_processing_last_cpuinfo_used == VP8GetCPUInfo) return;
- WebPMultARGBRow = MultARGBRow;
- WebPMultRow = MultRow;
+ WebPMultARGBRow = WebPMultARGBRowC;
+ WebPMultRow = WebPMultRowC;
WebPApplyAlphaMultiply = ApplyAlphaMultiply;
WebPApplyAlphaMultiply4444 = ApplyAlphaMultiply_16b;
+ WebPDispatchAlpha = DispatchAlpha;
+ WebPDispatchAlphaToGreen = DispatchAlphaToGreen;
WebPExtractAlpha = ExtractAlpha;
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
@@ -328,6 +366,16 @@ void WebPInitAlphaProcessing(void) {
#if defined(WEBP_USE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
WebPInitAlphaProcessingSSE2();
+#if defined(WEBP_USE_SSE41)
+ if (VP8GetCPUInfo(kSSE4_1)) {
+ WebPInitAlphaProcessingSSE41();
+ }
+#endif
+ }
+#endif
+#if defined(WEBP_USE_MIPS_DSP_R2)
+ if (VP8GetCPUInfo(kMIPSdspR2)) {
+ WebPInitAlphaProcessingMIPSdspR2();
}
#endif
}
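
Note on the hunk above: it shows the dispatch pattern this import uses throughout src/dsp. Every kernel gets a portable C implementation, a global function pointer, and an init routine that binds the C version first and then lets VP8GetCPUInfo() overwrite it with an SSE2/SSE4.1/MIPS variant. A minimal sketch of the same idiom, with hypothetical names (MultRow_C, HaveSSE2, InitMultRow) standing in for the real ones:

#include <stdint.h>

typedef void (*MultRowFunc)(uint8_t* ptr, int width);

/* portable fallback, always available */
static void MultRow_C(uint8_t* ptr, int width) {
  int x;
  for (x = 0; x < width; ++x) ptr[x] = (uint8_t)(ptr[x] / 2);
}

/* the exported hook callers go through */
MultRowFunc MultRow;

/* stand-in for VP8GetCPUInfo(kSSE2); always 'no' in this sketch */
static int HaveSSE2(void) { return 0; }

void InitMultRow(void) {
  MultRow = MultRow_C;            /* bind the safe C version first... */
  if (HaveSSE2()) {
    /* MultRow = MultRow_SSE2; */ /* ...then overwrite with a faster one */
  }
}
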
diff --git a/src/3rdparty/libwebp/src/dsp/alpha_processing_mips_dsp_r2.c b/src/3rdparty/libwebp/src/dsp/alpha_processing_mips_dsp_r2.c
new file mode 100644
index 0000000..c631d78
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/alpha_processing_mips_dsp_r2.c
@@ -0,0 +1,141 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Utilities for processing transparent channel.
+//
+// Author(s): Branimir Vasic (branimir.vasic@imgtec.com)
+// Djordje Pesut (djordje.pesut@imgtec.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_MIPS_DSP_R2)
+
+static int DispatchAlpha(const uint8_t* alpha, int alpha_stride,
+ int width, int height,
+ uint8_t* dst, int dst_stride) {
+ uint32_t alpha_mask = 0xffffffff;
+ int i, j, temp0;
+
+ for (j = 0; j < height; ++j) {
+ uint8_t* pdst = dst;
+ const uint8_t* palpha = alpha;
+ for (i = 0; i < (width >> 2); ++i) {
+ int temp1, temp2, temp3;
+
+ __asm__ volatile (
+ "ulw %[temp0], 0(%[palpha]) \n\t"
+ "addiu %[palpha], %[palpha], 4 \n\t"
+ "addiu %[pdst], %[pdst], 16 \n\t"
+ "srl %[temp1], %[temp0], 8 \n\t"
+ "srl %[temp2], %[temp0], 16 \n\t"
+ "srl %[temp3], %[temp0], 24 \n\t"
+ "and %[alpha_mask], %[alpha_mask], %[temp0] \n\t"
+ "sb %[temp0], -16(%[pdst]) \n\t"
+ "sb %[temp1], -12(%[pdst]) \n\t"
+ "sb %[temp2], -8(%[pdst]) \n\t"
+ "sb %[temp3], -4(%[pdst]) \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [palpha]"+r"(palpha), [pdst]"+r"(pdst),
+ [alpha_mask]"+r"(alpha_mask)
+ :
+ : "memory"
+ );
+ }
+
+ for (i = 0; i < (width & 3); ++i) {
+ __asm__ volatile (
+ "lbu %[temp0], 0(%[palpha]) \n\t"
+ "addiu %[palpha], %[palpha], 1 \n\t"
+ "sb %[temp0], 0(%[pdst]) \n\t"
+ "and %[alpha_mask], %[alpha_mask], %[temp0] \n\t"
+ "addiu %[pdst], %[pdst], 4 \n\t"
+ : [temp0]"=&r"(temp0), [palpha]"+r"(palpha), [pdst]"+r"(pdst),
+ [alpha_mask]"+r"(alpha_mask)
+ :
+ : "memory"
+ );
+ }
+ alpha += alpha_stride;
+ dst += dst_stride;
+ }
+
+ __asm__ volatile (
+ "ext %[temp0], %[alpha_mask], 0, 16 \n\t"
+ "srl %[alpha_mask], %[alpha_mask], 16 \n\t"
+ "and %[alpha_mask], %[alpha_mask], %[temp0] \n\t"
+ "ext %[temp0], %[alpha_mask], 0, 8 \n\t"
+ "srl %[alpha_mask], %[alpha_mask], 8 \n\t"
+ "and %[alpha_mask], %[alpha_mask], %[temp0] \n\t"
+ : [temp0]"=&r"(temp0), [alpha_mask]"+r"(alpha_mask)
+ :
+ );
+
+ return (alpha_mask != 0xff);
+}
+
+static void MultARGBRow(uint32_t* const ptr, int width, int inverse) {
+ int x;
+ const uint32_t c_00ffffff = 0x00ffffffu;
+ const uint32_t c_ff000000 = 0xff000000u;
+ const uint32_t c_8000000 = 0x00800000u;
+ const uint32_t c_8000080 = 0x00800080u;
+ for (x = 0; x < width; ++x) {
+ const uint32_t argb = ptr[x];
+ if (argb < 0xff000000u) { // alpha < 255
+ if (argb <= 0x00ffffffu) { // alpha == 0
+ ptr[x] = 0;
+ } else {
+ int temp0, temp1, temp2, temp3, alpha;
+ __asm__ volatile (
+ "srl %[alpha], %[argb], 24 \n\t"
+ "replv.qb %[temp0], %[alpha] \n\t"
+ "and %[temp0], %[temp0], %[c_00ffffff] \n\t"
+ "beqz %[inverse], 0f \n\t"
+ "divu $zero, %[c_ff000000], %[alpha] \n\t"
+ "mflo %[temp0] \n\t"
+ "0: \n\t"
+ "andi %[temp1], %[argb], 0xff \n\t"
+ "ext %[temp2], %[argb], 8, 8 \n\t"
+ "ext %[temp3], %[argb], 16, 8 \n\t"
+ "mul %[temp1], %[temp1], %[temp0] \n\t"
+ "mul %[temp2], %[temp2], %[temp0] \n\t"
+ "mul %[temp3], %[temp3], %[temp0] \n\t"
+ "precrq.ph.w %[temp1], %[temp2], %[temp1] \n\t"
+ "addu %[temp3], %[temp3], %[c_8000000] \n\t"
+ "addu %[temp1], %[temp1], %[c_8000080] \n\t"
+ "precrq.ph.w %[temp3], %[argb], %[temp3] \n\t"
+ "precrq.qb.ph %[temp1], %[temp3], %[temp1] \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [alpha]"=&r"(alpha)
+ : [inverse]"r"(inverse), [c_00ffffff]"r"(c_00ffffff),
+ [c_8000000]"r"(c_8000000), [c_8000080]"r"(c_8000080),
+ [c_ff000000]"r"(c_ff000000), [argb]"r"(argb)
+ : "memory", "hi", "lo"
+ );
+ ptr[x] = temp1;
+ }
+ }
+ }
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void WebPInitAlphaProcessingMIPSdspR2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingMIPSdspR2(void) {
+ WebPDispatchAlpha = DispatchAlpha;
+ WebPMultARGBRow = MultARGBRow;
+}
+
+#else // !WEBP_USE_MIPS_DSP_R2
+
+WEBP_DSP_INIT_STUB(WebPInitAlphaProcessingMIPSdspR2)
+
+#endif // WEBP_USE_MIPS_DSP_R2
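
The ext/srl/and epilogue in the MIPS DispatchAlpha above folds the 32-bit running AND of four alpha bytes down to a single byte before the != 0xff test. A C equivalent of that three-step reduction (an illustration, not code from the patch):

#include <stdint.h>

static uint32_t FoldAlphaMask(uint32_t alpha_mask) {
  alpha_mask &= alpha_mask >> 16;  /* AND the two halfwords together */
  alpha_mask &= alpha_mask >> 8;   /* AND the two remaining bytes */
  return alpha_mask & 0xff;        /* 0xff iff every alpha byte was 0xff */
}
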
diff --git a/src/3rdparty/libwebp/src/dsp/alpha_processing_sse2.c b/src/3rdparty/libwebp/src/dsp/alpha_processing_sse2.c
index 3d0a9b5..5acb481 100644
--- a/src/3rdparty/libwebp/src/dsp/alpha_processing_sse2.c
+++ b/src/3rdparty/libwebp/src/dsp/alpha_processing_sse2.c
@@ -18,6 +18,86 @@
//------------------------------------------------------------------------------
+static int DispatchAlpha(const uint8_t* alpha, int alpha_stride,
+ int width, int height,
+ uint8_t* dst, int dst_stride) {
+ // alpha_and stores an 'and' operation of all the alpha[] values. The final
+ // value is not 0xff if any of the alpha[] is not equal to 0xff.
+ uint32_t alpha_and = 0xff;
+ int i, j;
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i rgb_mask = _mm_set1_epi32(0xffffff00u); // to preserve RGB
+ const __m128i all_0xff = _mm_set_epi32(0, 0, ~0u, ~0u);
+ __m128i all_alphas = all_0xff;
+
+ // We must be able to access 3 extra bytes after the last written byte
+ // 'dst[4 * width - 4]', because we don't know if alpha is the first or the
+ // last byte of the quadruplet.
+ const int limit = (width - 1) & ~7;
+
+ for (j = 0; j < height; ++j) {
+ __m128i* out = (__m128i*)dst;
+ for (i = 0; i < limit; i += 8) {
+ // load 8 alpha bytes
+ const __m128i a0 = _mm_loadl_epi64((const __m128i*)&alpha[i]);
+ const __m128i a1 = _mm_unpacklo_epi8(a0, zero);
+ const __m128i a2_lo = _mm_unpacklo_epi16(a1, zero);
+ const __m128i a2_hi = _mm_unpackhi_epi16(a1, zero);
+ // load 8 dst pixels (32 bytes)
+ const __m128i b0_lo = _mm_loadu_si128(out + 0);
+ const __m128i b0_hi = _mm_loadu_si128(out + 1);
+ // mask dst alpha values
+ const __m128i b1_lo = _mm_and_si128(b0_lo, rgb_mask);
+ const __m128i b1_hi = _mm_and_si128(b0_hi, rgb_mask);
+ // combine
+ const __m128i b2_lo = _mm_or_si128(b1_lo, a2_lo);
+ const __m128i b2_hi = _mm_or_si128(b1_hi, a2_hi);
+ // store
+ _mm_storeu_si128(out + 0, b2_lo);
+ _mm_storeu_si128(out + 1, b2_hi);
+ // accumulate eight alpha 'and' in parallel
+ all_alphas = _mm_and_si128(all_alphas, a0);
+ out += 2;
+ }
+ for (; i < width; ++i) {
+ const uint32_t alpha_value = alpha[i];
+ dst[4 * i] = alpha_value;
+ alpha_and &= alpha_value;
+ }
+ alpha += alpha_stride;
+ dst += dst_stride;
+ }
+ // Combine the eight alpha 'and' into an 8-bit mask.
+ alpha_and &= _mm_movemask_epi8(_mm_cmpeq_epi8(all_alphas, all_0xff));
+ return (alpha_and != 0xff);
+}
+
+static void DispatchAlphaToGreen(const uint8_t* alpha, int alpha_stride,
+ int width, int height,
+ uint32_t* dst, int dst_stride) {
+ int i, j;
+ const __m128i zero = _mm_setzero_si128();
+ const int limit = width & ~15;
+ for (j = 0; j < height; ++j) {
+ for (i = 0; i < limit; i += 16) { // process 16 alpha bytes
+ const __m128i a0 = _mm_loadu_si128((const __m128i*)&alpha[i]);
+ const __m128i a1 = _mm_unpacklo_epi8(zero, a0); // note the 'zero' first!
+ const __m128i b1 = _mm_unpackhi_epi8(zero, a0);
+ const __m128i a2_lo = _mm_unpacklo_epi16(a1, zero);
+ const __m128i b2_lo = _mm_unpacklo_epi16(b1, zero);
+ const __m128i a2_hi = _mm_unpackhi_epi16(a1, zero);
+ const __m128i b2_hi = _mm_unpackhi_epi16(b1, zero);
+ _mm_storeu_si128((__m128i*)&dst[i + 0], a2_lo);
+ _mm_storeu_si128((__m128i*)&dst[i + 4], a2_hi);
+ _mm_storeu_si128((__m128i*)&dst[i + 8], b2_lo);
+ _mm_storeu_si128((__m128i*)&dst[i + 12], b2_hi);
+ }
+ for (; i < width; ++i) dst[i] = alpha[i] << 8;
+ alpha += alpha_stride;
+ dst += dst_stride;
+ }
+}
+
static int ExtractAlpha(const uint8_t* argb, int argb_stride,
int width, int height,
uint8_t* alpha, int alpha_stride) {
@@ -63,15 +143,156 @@ static int ExtractAlpha(const uint8_t* argb, int argb_stride,
return (alpha_and == 0xff);
}
-#endif // WEBP_USE_SSE2
+//------------------------------------------------------------------------------
+// Non-dither premultiplied modes
+
+#define MULTIPLIER(a) ((a) * 0x8081)
+#define PREMULTIPLY(x, m) (((x) * (m)) >> 23)
+
+// We can't use a 'const int' for the SHUFFLE value, because it has to be an
+// immediate in the _mm_shufflexx_epi16() instruction. We really need a macro here.
+#define APPLY_ALPHA(RGBX, SHUFFLE, MASK, MULT) do { \
+ const __m128i argb0 = _mm_loadl_epi64((__m128i*)&(RGBX)); \
+ const __m128i argb1 = _mm_unpacklo_epi8(argb0, zero); \
+ const __m128i alpha0 = _mm_and_si128(argb1, MASK); \
+ const __m128i alpha1 = _mm_shufflelo_epi16(alpha0, SHUFFLE); \
+ const __m128i alpha2 = _mm_shufflehi_epi16(alpha1, SHUFFLE); \
+ /* alpha2 = [0 a0 a0 a0][0 a1 a1 a1] */ \
+ const __m128i scale0 = _mm_mullo_epi16(alpha2, MULT); \
+ const __m128i scale1 = _mm_mulhi_epu16(alpha2, MULT); \
+ const __m128i argb2 = _mm_mulhi_epu16(argb1, scale0); \
+ const __m128i argb3 = _mm_mullo_epi16(argb1, scale1); \
+ const __m128i argb4 = _mm_adds_epu16(argb2, argb3); \
+ const __m128i argb5 = _mm_srli_epi16(argb4, 7); \
+ const __m128i argb6 = _mm_or_si128(argb5, alpha0); \
+ const __m128i argb7 = _mm_packus_epi16(argb6, zero); \
+ _mm_storel_epi64((__m128i*)&(RGBX), argb7); \
+} while (0)
+
+static void ApplyAlphaMultiply(uint8_t* rgba, int alpha_first,
+ int w, int h, int stride) {
+ const __m128i zero = _mm_setzero_si128();
+ const int kSpan = 2;
+ const int w2 = w & ~(kSpan - 1);
+ while (h-- > 0) {
+ uint32_t* const rgbx = (uint32_t*)rgba;
+ int i;
+ if (!alpha_first) {
+ const __m128i kMask = _mm_set_epi16(0xff, 0, 0, 0, 0xff, 0, 0, 0);
+ const __m128i kMult =
+ _mm_set_epi16(0, 0x8081, 0x8081, 0x8081, 0, 0x8081, 0x8081, 0x8081);
+ for (i = 0; i < w2; i += kSpan) {
+ APPLY_ALPHA(rgbx[i], _MM_SHUFFLE(0, 3, 3, 3), kMask, kMult);
+ }
+ } else {
+ const __m128i kMask = _mm_set_epi16(0, 0, 0, 0xff, 0, 0, 0, 0xff);
+ const __m128i kMult =
+ _mm_set_epi16(0x8081, 0x8081, 0x8081, 0, 0x8081, 0x8081, 0x8081, 0);
+ for (i = 0; i < w2; i += kSpan) {
+ APPLY_ALPHA(rgbx[i], _MM_SHUFFLE(0, 0, 0, 3), kMask, kMult);
+ }
+ }
+ // Finish with left-overs.
+ for (; i < w; ++i) {
+ uint8_t* const rgb = rgba + (alpha_first ? 1 : 0);
+ const uint8_t* const alpha = rgba + (alpha_first ? 0 : 3);
+ const uint32_t a = alpha[4 * i];
+ if (a != 0xff) {
+ const uint32_t mult = MULTIPLIER(a);
+ rgb[4 * i + 0] = PREMULTIPLY(rgb[4 * i + 0], mult);
+ rgb[4 * i + 1] = PREMULTIPLY(rgb[4 * i + 1], mult);
+ rgb[4 * i + 2] = PREMULTIPLY(rgb[4 * i + 2], mult);
+ }
+ }
+ rgba += stride;
+ }
+}
+#undef MULTIPLIER
+#undef PREMULTIPLY
+
+// -----------------------------------------------------------------------------
+// Apply alpha value to rows
+
+// We use: kINV255 = (1 << 24) / 255 = 0x010101
+// So: a * kINV255 = (a << 16) | [(a << 8) | a]
+// -> _mm_mulhi_epu16() takes care of the (a<<16) part,
+// and _mm_mullo_epi16(a * 0x0101,...) takes care of the "(a << 8) | a" one.
+
+static void MultARGBRow(uint32_t* const ptr, int width, int inverse) {
+ int x = 0;
+ if (!inverse) {
+ const int kSpan = 2;
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i kRound =
+ _mm_set_epi16(0, 1 << 7, 1 << 7, 1 << 7, 0, 1 << 7, 1 << 7, 1 << 7);
+ const __m128i kMult =
+ _mm_set_epi16(0, 0x0101, 0x0101, 0x0101, 0, 0x0101, 0x0101, 0x0101);
+ const __m128i kOne64 = _mm_set_epi16(1u << 8, 0, 0, 0, 1u << 8, 0, 0, 0);
+ const int w2 = width & ~(kSpan - 1);
+ for (x = 0; x < w2; x += kSpan) {
+ const __m128i argb0 = _mm_loadl_epi64((__m128i*)&ptr[x]);
+ const __m128i argb1 = _mm_unpacklo_epi8(argb0, zero);
+ const __m128i tmp0 = _mm_shufflelo_epi16(argb1, _MM_SHUFFLE(3, 3, 3, 3));
+ const __m128i tmp1 = _mm_shufflehi_epi16(tmp0, _MM_SHUFFLE(3, 3, 3, 3));
+ const __m128i tmp2 = _mm_srli_epi64(tmp1, 16);
+ const __m128i scale0 = _mm_mullo_epi16(tmp1, kMult);
+ const __m128i scale1 = _mm_or_si128(tmp2, kOne64);
+ const __m128i argb2 = _mm_mulhi_epu16(argb1, scale0);
+ const __m128i argb3 = _mm_mullo_epi16(argb1, scale1);
+ const __m128i argb4 = _mm_adds_epu16(argb2, argb3);
+ const __m128i argb5 = _mm_adds_epu16(argb4, kRound);
+ const __m128i argb6 = _mm_srli_epi16(argb5, 8);
+ const __m128i argb7 = _mm_packus_epi16(argb6, zero);
+ _mm_storel_epi64((__m128i*)&ptr[x], argb7);
+ }
+ }
+ width -= x;
+ if (width > 0) WebPMultARGBRowC(ptr + x, width, inverse);
+}
+
+static void MultRow(uint8_t* const ptr, const uint8_t* const alpha,
+ int width, int inverse) {
+ int x = 0;
+ if (!inverse) {
+ const int kSpan = 8;
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i kRound = _mm_set1_epi16(1 << 7);
+ const int w2 = width & ~(kSpan - 1);
+ for (x = 0; x < w2; x += kSpan) {
+ const __m128i v0 = _mm_loadl_epi64((__m128i*)&ptr[x]);
+ const __m128i v1 = _mm_unpacklo_epi8(v0, zero);
+ const __m128i alpha0 = _mm_loadl_epi64((const __m128i*)&alpha[x]);
+ const __m128i alpha1 = _mm_unpacklo_epi8(alpha0, zero);
+ const __m128i alpha2 = _mm_unpacklo_epi8(alpha0, alpha0);
+ const __m128i v2 = _mm_mulhi_epu16(v1, alpha2);
+ const __m128i v3 = _mm_mullo_epi16(v1, alpha1);
+ const __m128i v4 = _mm_adds_epu16(v2, v3);
+ const __m128i v5 = _mm_adds_epu16(v4, kRound);
+ const __m128i v6 = _mm_srli_epi16(v5, 8);
+ const __m128i v7 = _mm_packus_epi16(v6, zero);
+ _mm_storel_epi64((__m128i*)&ptr[x], v7);
+ }
+ }
+ width -= x;
+ if (width > 0) WebPMultRowC(ptr + x, alpha + x, width, inverse);
+}
//------------------------------------------------------------------------------
-// Init function
+// Entry point
extern void WebPInitAlphaProcessingSSE2(void);
-void WebPInitAlphaProcessingSSE2(void) {
-#if defined(WEBP_USE_SSE2)
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingSSE2(void) {
+ WebPMultARGBRow = MultARGBRow;
+ WebPMultRow = MultRow;
+ WebPApplyAlphaMultiply = ApplyAlphaMultiply;
+ WebPDispatchAlpha = DispatchAlpha;
+ WebPDispatchAlphaToGreen = DispatchAlphaToGreen;
WebPExtractAlpha = ExtractAlpha;
-#endif
}
+
+#else // !WEBP_USE_SSE2
+
+WEBP_DSP_INIT_STUB(WebPInitAlphaProcessingSSE2)
+
+#endif // WEBP_USE_SSE2
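
The MULTIPLIER/PREMULTIPLY pair in this file is the classic fixed-point division by 255: since 0x8081 == ceil(2^23 / 255), (v * 0x8081) >> 23 equals v / 255 exactly for every product v = x * a of 8-bit operands. A standalone exhaustive check of that identity (plain C, independent of libwebp):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define MULTIPLIER(a) ((a) * 0x8081)
#define PREMULTIPLY(x, m) (((x) * (m)) >> 23)

int main(void) {
  uint32_t a, x;
  for (a = 0; a < 256; ++a) {
    const uint32_t m = MULTIPLIER(a);
    for (x = 0; x < 256; ++x) {
      /* the fixed-point form must match exact integer division */
      assert(PREMULTIPLY(x, m) == (x * a) / 255);
    }
  }
  printf("(x * a * 0x8081) >> 23 == x * a / 255 for all 8-bit x, a\n");
  return 0;
}
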
diff --git a/src/3rdparty/libwebp/src/dsp/alpha_processing_sse41.c b/src/3rdparty/libwebp/src/dsp/alpha_processing_sse41.c
new file mode 100644
index 0000000..986fde9
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/alpha_processing_sse41.c
@@ -0,0 +1,92 @@
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Utilities for processing transparent channel, SSE4.1 variant.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_SSE41)
+
+#include <smmintrin.h>
+
+//------------------------------------------------------------------------------
+
+static int ExtractAlpha(const uint8_t* argb, int argb_stride,
+ int width, int height,
+ uint8_t* alpha, int alpha_stride) {
+ // alpha_and stores an 'and' operation of all the alpha[] values. The final
+ // value is not 0xff if any of the alpha[] is not equal to 0xff.
+ uint32_t alpha_and = 0xff;
+ int i, j;
+ const __m128i all_0xff = _mm_set1_epi32(~0u);
+ __m128i all_alphas = all_0xff;
+
+ // We must be able to access 3 extra bytes after the last written byte
+ // 'src[4 * width - 4]', because we don't know if alpha is the first or the
+ // last byte of the quadruplet.
+ const int limit = (width - 1) & ~15;
+ const __m128i kCstAlpha0 = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1,
+ -1, -1, -1, -1, 12, 8, 4, 0);
+ const __m128i kCstAlpha1 = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1,
+ 12, 8, 4, 0, -1, -1, -1, -1);
+ const __m128i kCstAlpha2 = _mm_set_epi8(-1, -1, -1, -1, 12, 8, 4, 0,
+ -1, -1, -1, -1, -1, -1, -1, -1);
+ const __m128i kCstAlpha3 = _mm_set_epi8(12, 8, 4, 0, -1, -1, -1, -1,
+ -1, -1, -1, -1, -1, -1, -1, -1);
+ for (j = 0; j < height; ++j) {
+ const __m128i* src = (const __m128i*)argb;
+ for (i = 0; i < limit; i += 16) {
+ // load 64 argb bytes
+ const __m128i a0 = _mm_loadu_si128(src + 0);
+ const __m128i a1 = _mm_loadu_si128(src + 1);
+ const __m128i a2 = _mm_loadu_si128(src + 2);
+ const __m128i a3 = _mm_loadu_si128(src + 3);
+ const __m128i b0 = _mm_shuffle_epi8(a0, kCstAlpha0);
+ const __m128i b1 = _mm_shuffle_epi8(a1, kCstAlpha1);
+ const __m128i b2 = _mm_shuffle_epi8(a2, kCstAlpha2);
+ const __m128i b3 = _mm_shuffle_epi8(a3, kCstAlpha3);
+ const __m128i c0 = _mm_or_si128(b0, b1);
+ const __m128i c1 = _mm_or_si128(b2, b3);
+ const __m128i d0 = _mm_or_si128(c0, c1);
+ // store
+ _mm_storeu_si128((__m128i*)&alpha[i], d0);
+ // accumulate sixteen alpha 'and' in parallel
+ all_alphas = _mm_and_si128(all_alphas, d0);
+ src += 4;
+ }
+ for (; i < width; ++i) {
+ const uint32_t alpha_value = argb[4 * i];
+ alpha[i] = alpha_value;
+ alpha_and &= alpha_value;
+ }
+ argb += argb_stride;
+ alpha += alpha_stride;
+ }
+ // Combine the sixteen alpha 'and' into an 8-bit mask.
+ alpha_and |= 0xff00u; // pretend the upper bits [8..15] were tested ok.
+ alpha_and &= _mm_movemask_epi8(_mm_cmpeq_epi8(all_alphas, all_0xff));
+ return (alpha_and == 0xffffu);
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void WebPInitAlphaProcessingSSE41(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingSSE41(void) {
+ WebPExtractAlpha = ExtractAlpha;
+}
+
+#else // !WEBP_USE_SSE41
+
+WEBP_DSP_INIT_STUB(WebPInitAlphaProcessingSSE41)
+
+#endif // WEBP_USE_SSE41
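
The four kCstAlphaN shuffles above can simply be OR'ed together because _mm_shuffle_epi8 writes zero to any destination byte whose control byte has its top bit set — which is what the -1 entries do. A small standalone demonstration of that zeroing behaviour (SSSE3 and up; build with e.g. -mssse3):

#include <tmmintrin.h>  /* _mm_shuffle_epi8 */
#include <stdint.h>
#include <stdio.h>

int main(void) {
  const __m128i src = _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8,
                                   7, 6, 5, 4, 3, 2, 1, 0);
  /* same layout as kCstAlpha0: pick bytes 0, 4, 8, 12; zero the rest */
  const __m128i pick = _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1,
                                    -1, -1, -1, -1, 12, 8, 4, 0);
  uint8_t out[16];
  int i;
  _mm_storeu_si128((__m128i*)out, _mm_shuffle_epi8(src, pick));
  for (i = 0; i < 16; ++i) printf("%d ", out[i]);
  printf("\n");  /* prints: 0 4 8 12 0 0 0 0 0 0 0 0 0 0 0 0 */
  return 0;
}
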
diff --git a/src/3rdparty/libwebp/src/dsp/argb.c b/src/3rdparty/libwebp/src/dsp/argb.c
new file mode 100644
index 0000000..cc1f9a9
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/argb.c
@@ -0,0 +1,68 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// ARGB making functions.
+//
+// Author: Djordje Pesut (djordje.pesut@imgtec.com)
+
+#include "./dsp.h"
+
+static WEBP_INLINE uint32_t MakeARGB32(int a, int r, int g, int b) {
+ return (((uint32_t)a << 24) | (r << 16) | (g << 8) | b);
+}
+
+static void PackARGB(const uint8_t* a, const uint8_t* r, const uint8_t* g,
+ const uint8_t* b, int len, uint32_t* out) {
+ int i;
+ for (i = 0; i < len; ++i) {
+ out[i] = MakeARGB32(a[4 * i], r[4 * i], g[4 * i], b[4 * i]);
+ }
+}
+
+static void PackRGB(const uint8_t* r, const uint8_t* g, const uint8_t* b,
+ int len, int step, uint32_t* out) {
+ int i, offset = 0;
+ for (i = 0; i < len; ++i) {
+ out[i] = MakeARGB32(0xff, r[offset], g[offset], b[offset]);
+ offset += step;
+ }
+}
+
+void (*VP8PackARGB)(const uint8_t*, const uint8_t*, const uint8_t*,
+ const uint8_t*, int, uint32_t*);
+void (*VP8PackRGB)(const uint8_t*, const uint8_t*, const uint8_t*,
+ int, int, uint32_t*);
+
+extern void VP8EncDspARGBInitMIPSdspR2(void);
+extern void VP8EncDspARGBInitSSE2(void);
+
+static volatile VP8CPUInfo argb_last_cpuinfo_used =
+ (VP8CPUInfo)&argb_last_cpuinfo_used;
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspARGBInit(void) {
+ if (argb_last_cpuinfo_used == VP8GetCPUInfo) return;
+
+ VP8PackARGB = PackARGB;
+ VP8PackRGB = PackRGB;
+
+ // If defined, use CPUInfo() to overwrite some pointers with faster versions.
+ if (VP8GetCPUInfo != NULL) {
+#if defined(WEBP_USE_SSE2)
+ if (VP8GetCPUInfo(kSSE2)) {
+ VP8EncDspARGBInitSSE2();
+ }
+#endif
+#if defined(WEBP_USE_MIPS_DSP_R2)
+ if (VP8GetCPUInfo(kMIPSdspR2)) {
+ VP8EncDspARGBInitMIPSdspR2();
+ }
+#endif
+ }
+ argb_last_cpuinfo_used = VP8GetCPUInfo;
+}
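
MakeARGB32 above packs the four planes into a 0xAARRGGBB word, alpha in the top byte. A one-file sanity check of the packing (standalone; mirrors the inline helper, not taken from the patch):

#include <assert.h>
#include <stdint.h>

static uint32_t MakeARGB32(int a, int r, int g, int b) {
  return (((uint32_t)a << 24) | (r << 16) | (g << 8) | b);
}

int main(void) {
  assert(MakeARGB32(0xff, 0x12, 0x34, 0x56) == 0xff123456u);
  assert(MakeARGB32(0x00, 0x00, 0x00, 0x01) == 0x00000001u);
  return 0;
}
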
diff --git a/src/3rdparty/libwebp/src/dsp/argb_mips_dsp_r2.c b/src/3rdparty/libwebp/src/dsp/argb_mips_dsp_r2.c
new file mode 100644
index 0000000..af65acb
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/argb_mips_dsp_r2.c
@@ -0,0 +1,110 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// ARGB making functions (mips version).
+//
+// Author: Djordje Pesut (djordje.pesut@imgtec.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_MIPS_DSP_R2)
+
+static void PackARGB(const uint8_t* a, const uint8_t* r, const uint8_t* g,
+ const uint8_t* b, int len, uint32_t* out) {
+ int temp0, temp1, temp2, temp3, offset;
+ const int rest = len & 1;
+ const uint32_t* const loop_end = out + len - rest;
+ const int step = 4;
+ __asm__ volatile (
+ "xor %[offset], %[offset], %[offset] \n\t"
+ "beq %[loop_end], %[out], 0f \n\t"
+ "2: \n\t"
+ "lbux %[temp0], %[offset](%[a]) \n\t"
+ "lbux %[temp1], %[offset](%[r]) \n\t"
+ "lbux %[temp2], %[offset](%[g]) \n\t"
+ "lbux %[temp3], %[offset](%[b]) \n\t"
+ "ins %[temp1], %[temp0], 16, 16 \n\t"
+ "ins %[temp3], %[temp2], 16, 16 \n\t"
+ "addiu %[out], %[out], 4 \n\t"
+ "precr.qb.ph %[temp0], %[temp1], %[temp3] \n\t"
+ "sw %[temp0], -4(%[out]) \n\t"
+ "addu %[offset], %[offset], %[step] \n\t"
+ "bne %[loop_end], %[out], 2b \n\t"
+ "0: \n\t"
+ "beq %[rest], $zero, 1f \n\t"
+ "lbux %[temp0], %[offset](%[a]) \n\t"
+ "lbux %[temp1], %[offset](%[r]) \n\t"
+ "lbux %[temp2], %[offset](%[g]) \n\t"
+ "lbux %[temp3], %[offset](%[b]) \n\t"
+ "ins %[temp1], %[temp0], 16, 16 \n\t"
+ "ins %[temp3], %[temp2], 16, 16 \n\t"
+ "precr.qb.ph %[temp0], %[temp1], %[temp3] \n\t"
+ "sw %[temp0], 0(%[out]) \n\t"
+ "1: \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [offset]"=&r"(offset), [out]"+&r"(out)
+ : [a]"r"(a), [r]"r"(r), [g]"r"(g), [b]"r"(b), [step]"r"(step),
+ [loop_end]"r"(loop_end), [rest]"r"(rest)
+ : "memory"
+ );
+}
+
+static void PackRGB(const uint8_t* r, const uint8_t* g, const uint8_t* b,
+ int len, int step, uint32_t* out) {
+ int temp0, temp1, temp2, offset;
+ const int rest = len & 1;
+ const int a = 0xff;
+ const uint32_t* const loop_end = out + len - rest;
+ __asm__ volatile (
+ "xor %[offset], %[offset], %[offset] \n\t"
+ "beq %[loop_end], %[out], 0f \n\t"
+ "2: \n\t"
+ "lbux %[temp0], %[offset](%[r]) \n\t"
+ "lbux %[temp1], %[offset](%[g]) \n\t"
+ "lbux %[temp2], %[offset](%[b]) \n\t"
+ "ins %[temp0], %[a], 16, 16 \n\t"
+ "ins %[temp2], %[temp1], 16, 16 \n\t"
+ "addiu %[out], %[out], 4 \n\t"
+ "precr.qb.ph %[temp0], %[temp0], %[temp2] \n\t"
+ "sw %[temp0], -4(%[out]) \n\t"
+ "addu %[offset], %[offset], %[step] \n\t"
+ "bne %[loop_end], %[out], 2b \n\t"
+ "0: \n\t"
+ "beq %[rest], $zero, 1f \n\t"
+ "lbux %[temp0], %[offset](%[r]) \n\t"
+ "lbux %[temp1], %[offset](%[g]) \n\t"
+ "lbux %[temp2], %[offset](%[b]) \n\t"
+ "ins %[temp0], %[a], 16, 16 \n\t"
+ "ins %[temp2], %[temp1], 16, 16 \n\t"
+ "precr.qb.ph %[temp0], %[temp0], %[temp2] \n\t"
+ "sw %[temp0], 0(%[out]) \n\t"
+ "1: \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [offset]"=&r"(offset), [out]"+&r"(out)
+ : [a]"r"(a), [r]"r"(r), [g]"r"(g), [b]"r"(b), [step]"r"(step),
+ [loop_end]"r"(loop_end), [rest]"r"(rest)
+ : "memory"
+ );
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8EncDspARGBInitMIPSdspR2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspARGBInitMIPSdspR2(void) {
+ VP8PackARGB = PackARGB;
+ VP8PackRGB = PackRGB;
+}
+
+#else // !WEBP_USE_MIPS_DSP_R2
+
+WEBP_DSP_INIT_STUB(VP8EncDspARGBInitMIPSdspR2)
+
+#endif // WEBP_USE_MIPS_DSP_R2
diff --git a/src/3rdparty/libwebp/src/dsp/argb_sse2.c b/src/3rdparty/libwebp/src/dsp/argb_sse2.c
new file mode 100644
index 0000000..afcb195
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/argb_sse2.c
@@ -0,0 +1,67 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// ARGB making functions (SSE2 version).
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_SSE2)
+
+#include <assert.h>
+#include <emmintrin.h>
+#include <string.h>
+
+static WEBP_INLINE uint32_t MakeARGB32(int a, int r, int g, int b) {
+ return (((uint32_t)a << 24) | (r << 16) | (g << 8) | b);
+}
+
+static void PackARGB(const uint8_t* a, const uint8_t* r, const uint8_t* g,
+ const uint8_t* b, int len, uint32_t* out) {
+ if (g == r + 1) { // RGBA input order. Need to swap R and B.
+ int i = 0;
+ const int len_max = len & ~3; // max length processed in main loop
+ const __m128i red_blue_mask = _mm_set1_epi32(0x00ff00ffu);
+ assert(b == r + 2);
+ assert(a == r + 3);
+ for (; i < len_max; i += 4) {
+ const __m128i A = _mm_loadu_si128((const __m128i*)(r + 4 * i));
+ const __m128i B = _mm_and_si128(A, red_blue_mask); // R 0 B 0
+ const __m128i C = _mm_andnot_si128(red_blue_mask, A); // 0 G 0 A
+ const __m128i D = _mm_shufflelo_epi16(B, _MM_SHUFFLE(2, 3, 0, 1));
+ const __m128i E = _mm_shufflehi_epi16(D, _MM_SHUFFLE(2, 3, 0, 1));
+ const __m128i F = _mm_or_si128(E, C);
+ _mm_storeu_si128((__m128i*)(out + i), F);
+ }
+ for (; i < len; ++i) {
+ out[i] = MakeARGB32(a[4 * i], r[4 * i], g[4 * i], b[4 * i]);
+ }
+ } else {
+ assert(g == b + 1);
+ assert(r == b + 2);
+ assert(a == b + 3);
+ memcpy(out, b, len * 4);
+ }
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8EncDspARGBInitSSE2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspARGBInitSSE2(void) {
+ VP8PackARGB = PackARGB;
+}
+
+#else // !WEBP_USE_SSE2
+
+WEBP_DSP_INIT_STUB(VP8EncDspARGBInitSSE2)
+
+#endif // WEBP_USE_SSE2
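
The SSE2 path above converts RGBA memory order to ARGB words by masking the R/B pair with 0x00ff00ff and exchanging them with 16-bit lane shuffles while G/A stay put. A scalar equivalent of that swap, assuming a little-endian load of one RGBA pixel (illustrative, not from the patch):

#include <assert.h>
#include <stdint.h>

/* 'rgba' is the 32-bit word a little-endian load of R,G,B,A bytes
 * produces, i.e. (a << 24) | (b << 16) | (g << 8) | r. */
static uint32_t RgbaToArgb(uint32_t rgba) {
  const uint32_t rb = rgba & 0x00ff00ffu;  /* R and B lanes */
  const uint32_t ga = rgba & 0xff00ff00u;  /* G and A lanes, unchanged */
  return ga | (rb << 16) | (rb >> 16);     /* exchange R and B */
}

int main(void) {
  /* R=0x12 G=0x34 B=0x56 A=0xff -> 0xff123456 (MakeARGB32 layout) */
  assert(RgbaToArgb(0xff563412u) == 0xff123456u);
  return 0;
}
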
diff --git a/src/3rdparty/libwebp/src/dsp/cost.c b/src/3rdparty/libwebp/src/dsp/cost.c
new file mode 100644
index 0000000..fe72d26
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/cost.c
@@ -0,0 +1,412 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "./dsp.h"
+#include "../enc/cost.h"
+
+//------------------------------------------------------------------------------
+// Boolean-cost cost table
+
+const uint16_t VP8EntropyCost[256] = {
+ 1792, 1792, 1792, 1536, 1536, 1408, 1366, 1280, 1280, 1216,
+ 1178, 1152, 1110, 1076, 1061, 1024, 1024, 992, 968, 951,
+ 939, 911, 896, 878, 871, 854, 838, 820, 811, 794,
+ 786, 768, 768, 752, 740, 732, 720, 709, 704, 690,
+ 683, 672, 666, 655, 647, 640, 631, 622, 615, 607,
+ 598, 592, 586, 576, 572, 564, 559, 555, 547, 541,
+ 534, 528, 522, 512, 512, 504, 500, 494, 488, 483,
+ 477, 473, 467, 461, 458, 452, 448, 443, 438, 434,
+ 427, 424, 419, 415, 410, 406, 403, 399, 394, 390,
+ 384, 384, 377, 374, 370, 366, 362, 359, 355, 351,
+ 347, 342, 342, 336, 333, 330, 326, 323, 320, 316,
+ 312, 308, 305, 302, 299, 296, 293, 288, 287, 283,
+ 280, 277, 274, 272, 268, 266, 262, 256, 256, 256,
+ 251, 248, 245, 242, 240, 237, 234, 232, 228, 226,
+ 223, 221, 218, 216, 214, 211, 208, 205, 203, 201,
+ 198, 196, 192, 191, 188, 187, 183, 181, 179, 176,
+ 175, 171, 171, 168, 165, 163, 160, 159, 156, 154,
+ 152, 150, 148, 146, 144, 142, 139, 138, 135, 133,
+ 131, 128, 128, 125, 123, 121, 119, 117, 115, 113,
+ 111, 110, 107, 105, 103, 102, 100, 98, 96, 94,
+ 92, 91, 89, 86, 86, 83, 82, 80, 77, 76,
+ 74, 73, 71, 69, 67, 66, 64, 63, 61, 59,
+ 57, 55, 54, 52, 51, 49, 47, 46, 44, 43,
+ 41, 40, 38, 36, 35, 33, 32, 30, 29, 27,
+ 25, 24, 22, 21, 19, 18, 16, 15, 13, 12,
+ 10, 9, 7, 6, 4, 3
+};
+
+//------------------------------------------------------------------------------
+// Level cost tables
+
+// fixed costs for coding levels, deduced from the coding tree.
+// This is only the part that doesn't depend on the probability state.
+const uint16_t VP8LevelFixedCosts[MAX_LEVEL + 1] = {
+ 0, 256, 256, 256, 256, 432, 618, 630,
+ 731, 640, 640, 828, 901, 948, 1021, 1101,
+ 1174, 1221, 1294, 1042, 1085, 1115, 1158, 1202,
+ 1245, 1275, 1318, 1337, 1380, 1410, 1453, 1497,
+ 1540, 1570, 1613, 1280, 1295, 1317, 1332, 1358,
+ 1373, 1395, 1410, 1454, 1469, 1491, 1506, 1532,
+ 1547, 1569, 1584, 1601, 1616, 1638, 1653, 1679,
+ 1694, 1716, 1731, 1775, 1790, 1812, 1827, 1853,
+ 1868, 1890, 1905, 1727, 1733, 1742, 1748, 1759,
+ 1765, 1774, 1780, 1800, 1806, 1815, 1821, 1832,
+ 1838, 1847, 1853, 1878, 1884, 1893, 1899, 1910,
+ 1916, 1925, 1931, 1951, 1957, 1966, 1972, 1983,
+ 1989, 1998, 2004, 2027, 2033, 2042, 2048, 2059,
+ 2065, 2074, 2080, 2100, 2106, 2115, 2121, 2132,
+ 2138, 2147, 2153, 2178, 2184, 2193, 2199, 2210,
+ 2216, 2225, 2231, 2251, 2257, 2266, 2272, 2283,
+ 2289, 2298, 2304, 2168, 2174, 2183, 2189, 2200,
+ 2206, 2215, 2221, 2241, 2247, 2256, 2262, 2273,
+ 2279, 2288, 2294, 2319, 2325, 2334, 2340, 2351,
+ 2357, 2366, 2372, 2392, 2398, 2407, 2413, 2424,
+ 2430, 2439, 2445, 2468, 2474, 2483, 2489, 2500,
+ 2506, 2515, 2521, 2541, 2547, 2556, 2562, 2573,
+ 2579, 2588, 2594, 2619, 2625, 2634, 2640, 2651,
+ 2657, 2666, 2672, 2692, 2698, 2707, 2713, 2724,
+ 2730, 2739, 2745, 2540, 2546, 2555, 2561, 2572,
+ 2578, 2587, 2593, 2613, 2619, 2628, 2634, 2645,
+ 2651, 2660, 2666, 2691, 2697, 2706, 2712, 2723,
+ 2729, 2738, 2744, 2764, 2770, 2779, 2785, 2796,
+ 2802, 2811, 2817, 2840, 2846, 2855, 2861, 2872,
+ 2878, 2887, 2893, 2913, 2919, 2928, 2934, 2945,
+ 2951, 2960, 2966, 2991, 2997, 3006, 3012, 3023,
+ 3029, 3038, 3044, 3064, 3070, 3079, 3085, 3096,
+ 3102, 3111, 3117, 2981, 2987, 2996, 3002, 3013,
+ 3019, 3028, 3034, 3054, 3060, 3069, 3075, 3086,
+ 3092, 3101, 3107, 3132, 3138, 3147, 3153, 3164,
+ 3170, 3179, 3185, 3205, 3211, 3220, 3226, 3237,
+ 3243, 3252, 3258, 3281, 3287, 3296, 3302, 3313,
+ 3319, 3328, 3334, 3354, 3360, 3369, 3375, 3386,
+ 3392, 3401, 3407, 3432, 3438, 3447, 3453, 3464,
+ 3470, 3479, 3485, 3505, 3511, 3520, 3526, 3537,
+ 3543, 3552, 3558, 2816, 2822, 2831, 2837, 2848,
+ 2854, 2863, 2869, 2889, 2895, 2904, 2910, 2921,
+ 2927, 2936, 2942, 2967, 2973, 2982, 2988, 2999,
+ 3005, 3014, 3020, 3040, 3046, 3055, 3061, 3072,
+ 3078, 3087, 3093, 3116, 3122, 3131, 3137, 3148,
+ 3154, 3163, 3169, 3189, 3195, 3204, 3210, 3221,
+ 3227, 3236, 3242, 3267, 3273, 3282, 3288, 3299,
+ 3305, 3314, 3320, 3340, 3346, 3355, 3361, 3372,
+ 3378, 3387, 3393, 3257, 3263, 3272, 3278, 3289,
+ 3295, 3304, 3310, 3330, 3336, 3345, 3351, 3362,
+ 3368, 3377, 3383, 3408, 3414, 3423, 3429, 3440,
+ 3446, 3455, 3461, 3481, 3487, 3496, 3502, 3513,
+ 3519, 3528, 3534, 3557, 3563, 3572, 3578, 3589,
+ 3595, 3604, 3610, 3630, 3636, 3645, 3651, 3662,
+ 3668, 3677, 3683, 3708, 3714, 3723, 3729, 3740,
+ 3746, 3755, 3761, 3781, 3787, 3796, 3802, 3813,
+ 3819, 3828, 3834, 3629, 3635, 3644, 3650, 3661,
+ 3667, 3676, 3682, 3702, 3708, 3717, 3723, 3734,
+ 3740, 3749, 3755, 3780, 3786, 3795, 3801, 3812,
+ 3818, 3827, 3833, 3853, 3859, 3868, 3874, 3885,
+ 3891, 3900, 3906, 3929, 3935, 3944, 3950, 3961,
+ 3967, 3976, 3982, 4002, 4008, 4017, 4023, 4034,
+ 4040, 4049, 4055, 4080, 4086, 4095, 4101, 4112,
+ 4118, 4127, 4133, 4153, 4159, 4168, 4174, 4185,
+ 4191, 4200, 4206, 4070, 4076, 4085, 4091, 4102,
+ 4108, 4117, 4123, 4143, 4149, 4158, 4164, 4175,
+ 4181, 4190, 4196, 4221, 4227, 4236, 4242, 4253,
+ 4259, 4268, 4274, 4294, 4300, 4309, 4315, 4326,
+ 4332, 4341, 4347, 4370, 4376, 4385, 4391, 4402,
+ 4408, 4417, 4423, 4443, 4449, 4458, 4464, 4475,
+ 4481, 4490, 4496, 4521, 4527, 4536, 4542, 4553,
+ 4559, 4568, 4574, 4594, 4600, 4609, 4615, 4626,
+ 4632, 4641, 4647, 3515, 3521, 3530, 3536, 3547,
+ 3553, 3562, 3568, 3588, 3594, 3603, 3609, 3620,
+ 3626, 3635, 3641, 3666, 3672, 3681, 3687, 3698,
+ 3704, 3713, 3719, 3739, 3745, 3754, 3760, 3771,
+ 3777, 3786, 3792, 3815, 3821, 3830, 3836, 3847,
+ 3853, 3862, 3868, 3888, 3894, 3903, 3909, 3920,
+ 3926, 3935, 3941, 3966, 3972, 3981, 3987, 3998,
+ 4004, 4013, 4019, 4039, 4045, 4054, 4060, 4071,
+ 4077, 4086, 4092, 3956, 3962, 3971, 3977, 3988,
+ 3994, 4003, 4009, 4029, 4035, 4044, 4050, 4061,
+ 4067, 4076, 4082, 4107, 4113, 4122, 4128, 4139,
+ 4145, 4154, 4160, 4180, 4186, 4195, 4201, 4212,
+ 4218, 4227, 4233, 4256, 4262, 4271, 4277, 4288,
+ 4294, 4303, 4309, 4329, 4335, 4344, 4350, 4361,
+ 4367, 4376, 4382, 4407, 4413, 4422, 4428, 4439,
+ 4445, 4454, 4460, 4480, 4486, 4495, 4501, 4512,
+ 4518, 4527, 4533, 4328, 4334, 4343, 4349, 4360,
+ 4366, 4375, 4381, 4401, 4407, 4416, 4422, 4433,
+ 4439, 4448, 4454, 4479, 4485, 4494, 4500, 4511,
+ 4517, 4526, 4532, 4552, 4558, 4567, 4573, 4584,
+ 4590, 4599, 4605, 4628, 4634, 4643, 4649, 4660,
+ 4666, 4675, 4681, 4701, 4707, 4716, 4722, 4733,
+ 4739, 4748, 4754, 4779, 4785, 4794, 4800, 4811,
+ 4817, 4826, 4832, 4852, 4858, 4867, 4873, 4884,
+ 4890, 4899, 4905, 4769, 4775, 4784, 4790, 4801,
+ 4807, 4816, 4822, 4842, 4848, 4857, 4863, 4874,
+ 4880, 4889, 4895, 4920, 4926, 4935, 4941, 4952,
+ 4958, 4967, 4973, 4993, 4999, 5008, 5014, 5025,
+ 5031, 5040, 5046, 5069, 5075, 5084, 5090, 5101,
+ 5107, 5116, 5122, 5142, 5148, 5157, 5163, 5174,
+ 5180, 5189, 5195, 5220, 5226, 5235, 5241, 5252,
+ 5258, 5267, 5273, 5293, 5299, 5308, 5314, 5325,
+ 5331, 5340, 5346, 4604, 4610, 4619, 4625, 4636,
+ 4642, 4651, 4657, 4677, 4683, 4692, 4698, 4709,
+ 4715, 4724, 4730, 4755, 4761, 4770, 4776, 4787,
+ 4793, 4802, 4808, 4828, 4834, 4843, 4849, 4860,
+ 4866, 4875, 4881, 4904, 4910, 4919, 4925, 4936,
+ 4942, 4951, 4957, 4977, 4983, 4992, 4998, 5009,
+ 5015, 5024, 5030, 5055, 5061, 5070, 5076, 5087,
+ 5093, 5102, 5108, 5128, 5134, 5143, 5149, 5160,
+ 5166, 5175, 5181, 5045, 5051, 5060, 5066, 5077,
+ 5083, 5092, 5098, 5118, 5124, 5133, 5139, 5150,
+ 5156, 5165, 5171, 5196, 5202, 5211, 5217, 5228,
+ 5234, 5243, 5249, 5269, 5275, 5284, 5290, 5301,
+ 5307, 5316, 5322, 5345, 5351, 5360, 5366, 5377,
+ 5383, 5392, 5398, 5418, 5424, 5433, 5439, 5450,
+ 5456, 5465, 5471, 5496, 5502, 5511, 5517, 5528,
+ 5534, 5543, 5549, 5569, 5575, 5584, 5590, 5601,
+ 5607, 5616, 5622, 5417, 5423, 5432, 5438, 5449,
+ 5455, 5464, 5470, 5490, 5496, 5505, 5511, 5522,
+ 5528, 5537, 5543, 5568, 5574, 5583, 5589, 5600,
+ 5606, 5615, 5621, 5641, 5647, 5656, 5662, 5673,
+ 5679, 5688, 5694, 5717, 5723, 5732, 5738, 5749,
+ 5755, 5764, 5770, 5790, 5796, 5805, 5811, 5822,
+ 5828, 5837, 5843, 5868, 5874, 5883, 5889, 5900,
+ 5906, 5915, 5921, 5941, 5947, 5956, 5962, 5973,
+ 5979, 5988, 5994, 5858, 5864, 5873, 5879, 5890,
+ 5896, 5905, 5911, 5931, 5937, 5946, 5952, 5963,
+ 5969, 5978, 5984, 6009, 6015, 6024, 6030, 6041,
+ 6047, 6056, 6062, 6082, 6088, 6097, 6103, 6114,
+ 6120, 6129, 6135, 6158, 6164, 6173, 6179, 6190,
+ 6196, 6205, 6211, 6231, 6237, 6246, 6252, 6263,
+ 6269, 6278, 6284, 6309, 6315, 6324, 6330, 6341,
+ 6347, 6356, 6362, 6382, 6388, 6397, 6403, 6414,
+ 6420, 6429, 6435, 3515, 3521, 3530, 3536, 3547,
+ 3553, 3562, 3568, 3588, 3594, 3603, 3609, 3620,
+ 3626, 3635, 3641, 3666, 3672, 3681, 3687, 3698,
+ 3704, 3713, 3719, 3739, 3745, 3754, 3760, 3771,
+ 3777, 3786, 3792, 3815, 3821, 3830, 3836, 3847,
+ 3853, 3862, 3868, 3888, 3894, 3903, 3909, 3920,
+ 3926, 3935, 3941, 3966, 3972, 3981, 3987, 3998,
+ 4004, 4013, 4019, 4039, 4045, 4054, 4060, 4071,
+ 4077, 4086, 4092, 3956, 3962, 3971, 3977, 3988,
+ 3994, 4003, 4009, 4029, 4035, 4044, 4050, 4061,
+ 4067, 4076, 4082, 4107, 4113, 4122, 4128, 4139,
+ 4145, 4154, 4160, 4180, 4186, 4195, 4201, 4212,
+ 4218, 4227, 4233, 4256, 4262, 4271, 4277, 4288,
+ 4294, 4303, 4309, 4329, 4335, 4344, 4350, 4361,
+ 4367, 4376, 4382, 4407, 4413, 4422, 4428, 4439,
+ 4445, 4454, 4460, 4480, 4486, 4495, 4501, 4512,
+ 4518, 4527, 4533, 4328, 4334, 4343, 4349, 4360,
+ 4366, 4375, 4381, 4401, 4407, 4416, 4422, 4433,
+ 4439, 4448, 4454, 4479, 4485, 4494, 4500, 4511,
+ 4517, 4526, 4532, 4552, 4558, 4567, 4573, 4584,
+ 4590, 4599, 4605, 4628, 4634, 4643, 4649, 4660,
+ 4666, 4675, 4681, 4701, 4707, 4716, 4722, 4733,
+ 4739, 4748, 4754, 4779, 4785, 4794, 4800, 4811,
+ 4817, 4826, 4832, 4852, 4858, 4867, 4873, 4884,
+ 4890, 4899, 4905, 4769, 4775, 4784, 4790, 4801,
+ 4807, 4816, 4822, 4842, 4848, 4857, 4863, 4874,
+ 4880, 4889, 4895, 4920, 4926, 4935, 4941, 4952,
+ 4958, 4967, 4973, 4993, 4999, 5008, 5014, 5025,
+ 5031, 5040, 5046, 5069, 5075, 5084, 5090, 5101,
+ 5107, 5116, 5122, 5142, 5148, 5157, 5163, 5174,
+ 5180, 5189, 5195, 5220, 5226, 5235, 5241, 5252,
+ 5258, 5267, 5273, 5293, 5299, 5308, 5314, 5325,
+ 5331, 5340, 5346, 4604, 4610, 4619, 4625, 4636,
+ 4642, 4651, 4657, 4677, 4683, 4692, 4698, 4709,
+ 4715, 4724, 4730, 4755, 4761, 4770, 4776, 4787,
+ 4793, 4802, 4808, 4828, 4834, 4843, 4849, 4860,
+ 4866, 4875, 4881, 4904, 4910, 4919, 4925, 4936,
+ 4942, 4951, 4957, 4977, 4983, 4992, 4998, 5009,
+ 5015, 5024, 5030, 5055, 5061, 5070, 5076, 5087,
+ 5093, 5102, 5108, 5128, 5134, 5143, 5149, 5160,
+ 5166, 5175, 5181, 5045, 5051, 5060, 5066, 5077,
+ 5083, 5092, 5098, 5118, 5124, 5133, 5139, 5150,
+ 5156, 5165, 5171, 5196, 5202, 5211, 5217, 5228,
+ 5234, 5243, 5249, 5269, 5275, 5284, 5290, 5301,
+ 5307, 5316, 5322, 5345, 5351, 5360, 5366, 5377,
+ 5383, 5392, 5398, 5418, 5424, 5433, 5439, 5450,
+ 5456, 5465, 5471, 5496, 5502, 5511, 5517, 5528,
+ 5534, 5543, 5549, 5569, 5575, 5584, 5590, 5601,
+ 5607, 5616, 5622, 5417, 5423, 5432, 5438, 5449,
+ 5455, 5464, 5470, 5490, 5496, 5505, 5511, 5522,
+ 5528, 5537, 5543, 5568, 5574, 5583, 5589, 5600,
+ 5606, 5615, 5621, 5641, 5647, 5656, 5662, 5673,
+ 5679, 5688, 5694, 5717, 5723, 5732, 5738, 5749,
+ 5755, 5764, 5770, 5790, 5796, 5805, 5811, 5822,
+ 5828, 5837, 5843, 5868, 5874, 5883, 5889, 5900,
+ 5906, 5915, 5921, 5941, 5947, 5956, 5962, 5973,
+ 5979, 5988, 5994, 5858, 5864, 5873, 5879, 5890,
+ 5896, 5905, 5911, 5931, 5937, 5946, 5952, 5963,
+ 5969, 5978, 5984, 6009, 6015, 6024, 6030, 6041,
+ 6047, 6056, 6062, 6082, 6088, 6097, 6103, 6114,
+ 6120, 6129, 6135, 6158, 6164, 6173, 6179, 6190,
+ 6196, 6205, 6211, 6231, 6237, 6246, 6252, 6263,
+ 6269, 6278, 6284, 6309, 6315, 6324, 6330, 6341,
+ 6347, 6356, 6362, 6382, 6388, 6397, 6403, 6414,
+ 6420, 6429, 6435, 5303, 5309, 5318, 5324, 5335,
+ 5341, 5350, 5356, 5376, 5382, 5391, 5397, 5408,
+ 5414, 5423, 5429, 5454, 5460, 5469, 5475, 5486,
+ 5492, 5501, 5507, 5527, 5533, 5542, 5548, 5559,
+ 5565, 5574, 5580, 5603, 5609, 5618, 5624, 5635,
+ 5641, 5650, 5656, 5676, 5682, 5691, 5697, 5708,
+ 5714, 5723, 5729, 5754, 5760, 5769, 5775, 5786,
+ 5792, 5801, 5807, 5827, 5833, 5842, 5848, 5859,
+ 5865, 5874, 5880, 5744, 5750, 5759, 5765, 5776,
+ 5782, 5791, 5797, 5817, 5823, 5832, 5838, 5849,
+ 5855, 5864, 5870, 5895, 5901, 5910, 5916, 5927,
+ 5933, 5942, 5948, 5968, 5974, 5983, 5989, 6000,
+ 6006, 6015, 6021, 6044, 6050, 6059, 6065, 6076,
+ 6082, 6091, 6097, 6117, 6123, 6132, 6138, 6149,
+ 6155, 6164, 6170, 6195, 6201, 6210, 6216, 6227,
+ 6233, 6242, 6248, 6268, 6274, 6283, 6289, 6300,
+ 6306, 6315, 6321, 6116, 6122, 6131, 6137, 6148,
+ 6154, 6163, 6169, 6189, 6195, 6204, 6210, 6221,
+ 6227, 6236, 6242, 6267, 6273, 6282, 6288, 6299,
+ 6305, 6314, 6320, 6340, 6346, 6355, 6361, 6372,
+ 6378, 6387, 6393, 6416, 6422, 6431, 6437, 6448,
+ 6454, 6463, 6469, 6489, 6495, 6504, 6510, 6521,
+ 6527, 6536, 6542, 6567, 6573, 6582, 6588, 6599,
+ 6605, 6614, 6620, 6640, 6646, 6655, 6661, 6672,
+ 6678, 6687, 6693, 6557, 6563, 6572, 6578, 6589,
+ 6595, 6604, 6610, 6630, 6636, 6645, 6651, 6662,
+ 6668, 6677, 6683, 6708, 6714, 6723, 6729, 6740,
+ 6746, 6755, 6761, 6781, 6787, 6796, 6802, 6813,
+ 6819, 6828, 6834, 6857, 6863, 6872, 6878, 6889,
+ 6895, 6904, 6910, 6930, 6936, 6945, 6951, 6962,
+ 6968, 6977, 6983, 7008, 7014, 7023, 7029, 7040,
+ 7046, 7055, 7061, 7081, 7087, 7096, 7102, 7113,
+ 7119, 7128, 7134, 6392, 6398, 6407, 6413, 6424,
+ 6430, 6439, 6445, 6465, 6471, 6480, 6486, 6497,
+ 6503, 6512, 6518, 6543, 6549, 6558, 6564, 6575,
+ 6581, 6590, 6596, 6616, 6622, 6631, 6637, 6648,
+ 6654, 6663, 6669, 6692, 6698, 6707, 6713, 6724,
+ 6730, 6739, 6745, 6765, 6771, 6780, 6786, 6797,
+ 6803, 6812, 6818, 6843, 6849, 6858, 6864, 6875,
+ 6881, 6890, 6896, 6916, 6922, 6931, 6937, 6948,
+ 6954, 6963, 6969, 6833, 6839, 6848, 6854, 6865,
+ 6871, 6880, 6886, 6906, 6912, 6921, 6927, 6938,
+ 6944, 6953, 6959, 6984, 6990, 6999, 7005, 7016,
+ 7022, 7031, 7037, 7057, 7063, 7072, 7078, 7089,
+ 7095, 7104, 7110, 7133, 7139, 7148, 7154, 7165,
+ 7171, 7180, 7186, 7206, 7212, 7221, 7227, 7238,
+ 7244, 7253, 7259, 7284, 7290, 7299, 7305, 7316,
+ 7322, 7331, 7337, 7357, 7363, 7372, 7378, 7389,
+ 7395, 7404, 7410, 7205, 7211, 7220, 7226, 7237,
+ 7243, 7252, 7258, 7278, 7284, 7293, 7299, 7310,
+ 7316, 7325, 7331, 7356, 7362, 7371, 7377, 7388,
+ 7394, 7403, 7409, 7429, 7435, 7444, 7450, 7461,
+ 7467, 7476, 7482, 7505, 7511, 7520, 7526, 7537,
+ 7543, 7552, 7558, 7578, 7584, 7593, 7599, 7610,
+ 7616, 7625, 7631, 7656, 7662, 7671, 7677, 7688,
+ 7694, 7703, 7709, 7729, 7735, 7744, 7750, 7761
+};
+
+//------------------------------------------------------------------------------
+// Tables for level coding
+
+const uint8_t VP8EncBands[16 + 1] = {
+ 0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7,
+ 0 // sentinel
+};
+
+//------------------------------------------------------------------------------
+// Mode costs
+
+static int GetResidualCost(int ctx0, const VP8Residual* const res) {
+ int n = res->first;
+ // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
+ const int p0 = res->prob[n][ctx0][0];
+ CostArrayPtr const costs = res->costs;
+ const uint16_t* t = costs[n][ctx0];
+ // bit_cost(1, p0) is already incorporated in t[] tables, but only if ctx != 0
+ // (as required by the syntax). For ctx0 == 0, we need to add it here or it'll
+ // be missing during the loop.
+ int cost = (ctx0 == 0) ? VP8BitCost(1, p0) : 0;
+
+ if (res->last < 0) {
+ return VP8BitCost(0, p0);
+ }
+ for (; n < res->last; ++n) {
+ const int v = abs(res->coeffs[n]);
+ const int ctx = (v >= 2) ? 2 : v;
+ cost += VP8LevelCost(t, v);
+ t = costs[n + 1][ctx];
+ }
+ // Last coefficient is always non-zero
+ {
+ const int v = abs(res->coeffs[n]);
+ assert(v != 0);
+ cost += VP8LevelCost(t, v);
+ if (n < 15) {
+ const int b = VP8EncBands[n + 1];
+ const int ctx = (v == 1) ? 1 : 2;
+ const int last_p0 = res->prob[b][ctx][0];
+ cost += VP8BitCost(0, last_p0);
+ }
+ }
+ return cost;
+}
+
+static void SetResidualCoeffs(const int16_t* const coeffs,
+ VP8Residual* const res) {
+ int n;
+ res->last = -1;
+ assert(res->first == 0 || coeffs[0] == 0);
+ for (n = 15; n >= 0; --n) {
+ if (coeffs[n]) {
+ res->last = n;
+ break;
+ }
+ }
+ res->coeffs = coeffs;
+}
+
+//------------------------------------------------------------------------------
+// init function
+
+VP8GetResidualCostFunc VP8GetResidualCost;
+VP8SetResidualCoeffsFunc VP8SetResidualCoeffs;
+
+extern void VP8EncDspCostInitMIPS32(void);
+extern void VP8EncDspCostInitMIPSdspR2(void);
+extern void VP8EncDspCostInitSSE2(void);
+
+static volatile VP8CPUInfo cost_last_cpuinfo_used =
+ (VP8CPUInfo)&cost_last_cpuinfo_used;
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInit(void) {
+ if (cost_last_cpuinfo_used == VP8GetCPUInfo) return;
+
+ VP8GetResidualCost = GetResidualCost;
+ VP8SetResidualCoeffs = SetResidualCoeffs;
+
+ // If defined, use CPUInfo() to overwrite some pointers with faster versions.
+ if (VP8GetCPUInfo != NULL) {
+#if defined(WEBP_USE_MIPS32)
+ if (VP8GetCPUInfo(kMIPS32)) {
+ VP8EncDspCostInitMIPS32();
+ }
+#endif
+#if defined(WEBP_USE_MIPS_DSP_R2)
+ if (VP8GetCPUInfo(kMIPSdspR2)) {
+ VP8EncDspCostInitMIPSdspR2();
+ }
+#endif
+#if defined(WEBP_USE_SSE2)
+ if (VP8GetCPUInfo(kSSE2)) {
+ VP8EncDspCostInitSSE2();
+ }
+#endif
+ }
+
+ cost_last_cpuinfo_used = VP8GetCPUInfo;
+}
+
+//------------------------------------------------------------------------------
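
For context: the VP8BitCost() used throughout GetResidualCost() is not defined in this diff; it lives in enc/cost.h. A sketch of its presumed shape, assuming the upstream definition, shows how the VP8EntropyCost table above is indexed — 'proba' is the probability of a zero bit, and costs are fixed point at 256 units per bit, so VP8EntropyCost[128] == 256 makes a 50/50 bit cost exactly one bit:

#include <stdint.h>

extern const uint16_t VP8EntropyCost[256];  /* the table defined above */

/* Presumed helper behind the VP8BitCost() calls in GetResidualCost(). */
static int VP8BitCost(int bit, uint8_t proba) {
  return !bit ? VP8EntropyCost[proba] : VP8EntropyCost[255 - proba];
}
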
diff --git a/src/3rdparty/libwebp/src/dsp/cost_mips32.c b/src/3rdparty/libwebp/src/dsp/cost_mips32.c
new file mode 100644
index 0000000..d1e240e
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/cost_mips32.c
@@ -0,0 +1,154 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Author: Djordje Pesut (djordje.pesut@imgtec.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_MIPS32)
+
+#include "../enc/cost.h"
+
+static int GetResidualCost(int ctx0, const VP8Residual* const res) {
+ int temp0, temp1;
+ int v_reg, ctx_reg;
+ int n = res->first;
+ // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
+ int p0 = res->prob[n][ctx0][0];
+ CostArrayPtr const costs = res->costs;
+ const uint16_t* t = costs[n][ctx0];
+ // bit_cost(1, p0) is already incorporated in t[] tables, but only if ctx != 0
+ // (as required by the syntax). For ctx0 == 0, we need to add it here or it'll
+ // be missing during the loop.
+ int cost = (ctx0 == 0) ? VP8BitCost(1, p0) : 0;
+ const int16_t* res_coeffs = res->coeffs;
+ const int res_last = res->last;
+ const int const_max_level = MAX_VARIABLE_LEVEL;
+ const int const_2 = 2;
+ const uint16_t** p_costs = &costs[n][0];
+ const size_t inc_p_costs = NUM_CTX * sizeof(*p_costs);
+
+ if (res->last < 0) {
+ return VP8BitCost(0, p0);
+ }
+
+ __asm__ volatile (
+ ".set push \n\t"
+ ".set noreorder \n\t"
+ "subu %[temp1], %[res_last], %[n] \n\t"
+ "sll %[temp0], %[n], 1 \n\t"
+ "blez %[temp1], 2f \n\t"
+ " addu %[res_coeffs], %[res_coeffs], %[temp0] \n\t"
+ "1: \n\t"
+ "lh %[v_reg], 0(%[res_coeffs]) \n\t"
+ "addiu %[n], %[n], 1 \n\t"
+ "negu %[temp0], %[v_reg] \n\t"
+ "slti %[temp1], %[v_reg], 0 \n\t"
+ "movn %[v_reg], %[temp0], %[temp1] \n\t"
+ "sltiu %[temp0], %[v_reg], 2 \n\t"
+ "move %[ctx_reg], %[v_reg] \n\t"
+ "movz %[ctx_reg], %[const_2], %[temp0] \n\t"
+ "sll %[temp1], %[v_reg], 1 \n\t"
+ "addu %[temp1], %[temp1], %[VP8LevelFixedCosts] \n\t"
+ "lhu %[temp1], 0(%[temp1]) \n\t"
+ "slt %[temp0], %[v_reg], %[const_max_level] \n\t"
+ "movz %[v_reg], %[const_max_level], %[temp0] \n\t"
+ "addu %[cost], %[cost], %[temp1] \n\t"
+ "sll %[v_reg], %[v_reg], 1 \n\t"
+ "sll %[ctx_reg], %[ctx_reg], 2 \n\t"
+ "addu %[v_reg], %[v_reg], %[t] \n\t"
+ "lhu %[temp0], 0(%[v_reg]) \n\t"
+ "addu %[p_costs], %[p_costs], %[inc_p_costs] \n\t"
+ "addu %[t], %[p_costs], %[ctx_reg] \n\t"
+ "addu %[cost], %[cost], %[temp0] \n\t"
+ "addiu %[res_coeffs], %[res_coeffs], 2 \n\t"
+ "bne %[n], %[res_last], 1b \n\t"
+ " lw %[t], 0(%[t]) \n\t"
+ "2: \n\t"
+ ".set pop \n\t"
+ : [cost]"+&r"(cost), [t]"+&r"(t), [n]"+&r"(n), [v_reg]"=&r"(v_reg),
+ [ctx_reg]"=&r"(ctx_reg), [p_costs]"+&r"(p_costs), [temp0]"=&r"(temp0),
+ [temp1]"=&r"(temp1), [res_coeffs]"+&r"(res_coeffs)
+ : [const_2]"r"(const_2), [const_max_level]"r"(const_max_level),
+ [VP8LevelFixedCosts]"r"(VP8LevelFixedCosts), [res_last]"r"(res_last),
+ [inc_p_costs]"r"(inc_p_costs)
+ : "memory"
+ );
+
+ // Last coefficient is always non-zero
+ {
+ const int v = abs(res->coeffs[n]);
+ assert(v != 0);
+ cost += VP8LevelCost(t, v);
+ if (n < 15) {
+ const int b = VP8EncBands[n + 1];
+ const int ctx = (v == 1) ? 1 : 2;
+ const int last_p0 = res->prob[b][ctx][0];
+ cost += VP8BitCost(0, last_p0);
+ }
+ }
+ return cost;
+}
+
+static void SetResidualCoeffs(const int16_t* const coeffs,
+ VP8Residual* const res) {
+ const int16_t* p_coeffs = (int16_t*)coeffs;
+ int temp0, temp1, temp2, n, n1;
+ assert(res->first == 0 || coeffs[0] == 0);
+
+ __asm__ volatile (
+ ".set push \n\t"
+ ".set noreorder \n\t"
+ "addiu %[p_coeffs], %[p_coeffs], 28 \n\t"
+ "li %[n], 15 \n\t"
+ "li %[temp2], -1 \n\t"
+ "0: \n\t"
+ "ulw %[temp0], 0(%[p_coeffs]) \n\t"
+ "beqz %[temp0], 1f \n\t"
+#if defined(WORDS_BIGENDIAN)
+ " sll %[temp1], %[temp0], 16 \n\t"
+#else
+ " srl %[temp1], %[temp0], 16 \n\t"
+#endif
+ "addiu %[n1], %[n], -1 \n\t"
+ "movz %[temp0], %[n1], %[temp1] \n\t"
+ "movn %[temp0], %[n], %[temp1] \n\t"
+ "j 2f \n\t"
+ " addiu %[temp2], %[temp0], 0 \n\t"
+ "1: \n\t"
+ "addiu %[n], %[n], -2 \n\t"
+ "bgtz %[n], 0b \n\t"
+ " addiu %[p_coeffs], %[p_coeffs], -4 \n\t"
+ "2: \n\t"
+ ".set pop \n\t"
+ : [p_coeffs]"+&r"(p_coeffs), [temp0]"=&r"(temp0),
+ [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [n]"=&r"(n), [n1]"=&r"(n1)
+ :
+ : "memory"
+ );
+ res->last = temp2;
+ res->coeffs = coeffs;
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8EncDspCostInitMIPS32(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInitMIPS32(void) {
+ VP8GetResidualCost = GetResidualCost;
+ VP8SetResidualCoeffs = SetResidualCoeffs;
+}
+
+#else // !WEBP_USE_MIPS32
+
+WEBP_DSP_INIT_STUB(VP8EncDspCostInitMIPS32)
+
+#endif // WEBP_USE_MIPS32
diff --git a/src/3rdparty/libwebp/src/dsp/cost_mips_dsp_r2.c b/src/3rdparty/libwebp/src/dsp/cost_mips_dsp_r2.c
new file mode 100644
index 0000000..ce64067
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/cost_mips_dsp_r2.c
@@ -0,0 +1,107 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Author: Djordje Pesut (djordje.pesut@imgtec.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_MIPS_DSP_R2)
+
+#include "../enc/cost.h"
+
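+// Same control flow as the plain MIPS32 GetResidualCost; MIPS DSP r2 only
+// shortens the loop body, using indexed halfword loads ("lhx") and a
+// one-instruction absolute value ("absq_s.w") in place of the
+// negu/slti/movn sequence.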
+static int GetResidualCost(int ctx0, const VP8Residual* const res) {
+ int temp0, temp1;
+ int v_reg, ctx_reg;
+ int n = res->first;
+ // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
+ int p0 = res->prob[n][ctx0][0];
+ CostArrayPtr const costs = res->costs;
+ const uint16_t* t = costs[n][ctx0];
+ // bit_cost(1, p0) is already incorporated in t[] tables, but only if ctx != 0
+ // (as required by the syntax). For ctx0 == 0, we need to add it here or it'll
+ // be missing during the loop.
+ int cost = (ctx0 == 0) ? VP8BitCost(1, p0) : 0;
+ const int16_t* res_coeffs = res->coeffs;
+ const int res_last = res->last;
+ const int const_max_level = MAX_VARIABLE_LEVEL;
+ const int const_2 = 2;
+ const uint16_t** p_costs = &costs[n][0];
+ const size_t inc_p_costs = NUM_CTX * sizeof(*p_costs);
+
+ if (res->last < 0) {
+ return VP8BitCost(0, p0);
+ }
+
+ __asm__ volatile (
+ ".set push \n\t"
+ ".set noreorder \n\t"
+ "subu %[temp1], %[res_last], %[n] \n\t"
+ "blez %[temp1], 2f \n\t"
+ " nop \n\t"
+ "1: \n\t"
+ "sll %[temp0], %[n], 1 \n\t"
+ "lhx %[v_reg], %[temp0](%[res_coeffs]) \n\t"
+ "addiu %[n], %[n], 1 \n\t"
+ "absq_s.w %[v_reg], %[v_reg] \n\t"
+ "sltiu %[temp0], %[v_reg], 2 \n\t"
+ "move %[ctx_reg], %[v_reg] \n\t"
+ "movz %[ctx_reg], %[const_2], %[temp0] \n\t"
+ "sll %[temp1], %[v_reg], 1 \n\t"
+ "lhx %[temp1], %[temp1](%[VP8LevelFixedCosts]) \n\t"
+ "slt %[temp0], %[v_reg], %[const_max_level] \n\t"
+ "movz %[v_reg], %[const_max_level], %[temp0] \n\t"
+ "addu %[cost], %[cost], %[temp1] \n\t"
+ "sll %[v_reg], %[v_reg], 1 \n\t"
+ "sll %[ctx_reg], %[ctx_reg], 2 \n\t"
+ "lhx %[temp0], %[v_reg](%[t]) \n\t"
+ "addu %[p_costs], %[p_costs], %[inc_p_costs] \n\t"
+ "addu %[t], %[p_costs], %[ctx_reg] \n\t"
+ "addu %[cost], %[cost], %[temp0] \n\t"
+ "bne %[n], %[res_last], 1b \n\t"
+ " lw %[t], 0(%[t]) \n\t"
+ "2: \n\t"
+ ".set pop \n\t"
+ : [cost]"+&r"(cost), [t]"+&r"(t), [n]"+&r"(n), [v_reg]"=&r"(v_reg),
+ [ctx_reg]"=&r"(ctx_reg), [p_costs]"+&r"(p_costs), [temp0]"=&r"(temp0),
+ [temp1]"=&r"(temp1)
+ : [const_2]"r"(const_2), [const_max_level]"r"(const_max_level),
+ [VP8LevelFixedCosts]"r"(VP8LevelFixedCosts), [res_last]"r"(res_last),
+ [res_coeffs]"r"(res_coeffs), [inc_p_costs]"r"(inc_p_costs)
+ : "memory"
+ );
+
+ // Last coefficient is always non-zero
+ {
+ const int v = abs(res->coeffs[n]);
+ assert(v != 0);
+ cost += VP8LevelCost(t, v);
+ if (n < 15) {
+ const int b = VP8EncBands[n + 1];
+ const int ctx = (v == 1) ? 1 : 2;
+ const int last_p0 = res->prob[b][ctx][0];
+ cost += VP8BitCost(0, last_p0);
+ }
+ }
+ return cost;
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8EncDspCostInitMIPSdspR2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInitMIPSdspR2(void) {
+ VP8GetResidualCost = GetResidualCost;
+}
+
+#else // !WEBP_USE_MIPS_DSP_R2
+
+WEBP_DSP_INIT_STUB(VP8EncDspCostInitMIPSdspR2)
+
+#endif // WEBP_USE_MIPS_DSP_R2
diff --git a/src/3rdparty/libwebp/src/dsp/cost_sse2.c b/src/3rdparty/libwebp/src/dsp/cost_sse2.c
new file mode 100644
index 0000000..0cb1c1f
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/cost_sse2.c
@@ -0,0 +1,119 @@
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// SSE2 version of cost functions
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_SSE2)
+#include <emmintrin.h>
+
+#include "../enc/cost.h"
+#include "../enc/vp8enci.h"
+#include "../utils/utils.h"
+
+//------------------------------------------------------------------------------
+
+static void SetResidualCoeffsSSE2(const int16_t* const coeffs,
+ VP8Residual* const res) {
+ const __m128i c0 = _mm_loadu_si128((const __m128i*)(coeffs + 0));
+ const __m128i c1 = _mm_loadu_si128((const __m128i*)(coeffs + 8));
+ // Use SSE2 to compare 16 values with a single instruction.
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i m0 = _mm_packs_epi16(c0, c1);
+ const __m128i m1 = _mm_cmpeq_epi8(m0, zero);
+  // Get the comparison results as a bitmask into 16 bits. Invert the mask to get
+ // the position of entries that are not equal to zero. We don't need to mask
+ // out least significant bits according to res->first, since coeffs[0] is 0
+ // if res->first > 0.
+ const uint32_t mask = 0x0000ffffu ^ (uint32_t)_mm_movemask_epi8(m1);
+ // The position of the most significant non-zero bit indicates the position of
+ // the last non-zero value.
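+  // E.g. (illustrative): with only coeffs[0] and coeffs[5] non-zero,
+  // _mm_movemask_epi8(m1) == 0xffde, so mask == 0x0021 and
+  // BitsLog2Floor(0x0021) == 5, i.e. res->last == 5.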
+ assert(res->first == 0 || coeffs[0] == 0);
+ res->last = mask ? BitsLog2Floor(mask) : -1;
+ res->coeffs = coeffs;
+}
+
+static int GetResidualCostSSE2(int ctx0, const VP8Residual* const res) {
+ uint8_t levels[16], ctxs[16];
+ uint16_t abs_levels[16];
+ int n = res->first;
+ // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
+ const int p0 = res->prob[n][ctx0][0];
+ CostArrayPtr const costs = res->costs;
+ const uint16_t* t = costs[n][ctx0];
+ // bit_cost(1, p0) is already incorporated in t[] tables, but only if ctx != 0
+ // (as required by the syntax). For ctx0 == 0, we need to add it here or it'll
+ // be missing during the loop.
+ int cost = (ctx0 == 0) ? VP8BitCost(1, p0) : 0;
+
+ if (res->last < 0) {
+ return VP8BitCost(0, p0);
+ }
+
+ { // precompute clamped levels and contexts, packed to 8b.
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i kCst2 = _mm_set1_epi8(2);
+ const __m128i kCst67 = _mm_set1_epi8(MAX_VARIABLE_LEVEL);
+ const __m128i c0 = _mm_loadu_si128((const __m128i*)&res->coeffs[0]);
+ const __m128i c1 = _mm_loadu_si128((const __m128i*)&res->coeffs[8]);
+ const __m128i D0 = _mm_sub_epi16(zero, c0);
+ const __m128i D1 = _mm_sub_epi16(zero, c1);
+ const __m128i E0 = _mm_max_epi16(c0, D0); // abs(v), 16b
+ const __m128i E1 = _mm_max_epi16(c1, D1);
+ const __m128i F = _mm_packs_epi16(E0, E1);
+ const __m128i G = _mm_min_epu8(F, kCst2); // context = 0,1,2
+ const __m128i H = _mm_min_epu8(F, kCst67); // clamp_level in [0..67]
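+    // The 8-bit saturation in _mm_packs_epi16 (abs values above 127 become
+    // 127) is harmless here: F only feeds the min() against 2 and 67, and
+    // the unsaturated 16-bit values are kept separately in abs_levels[].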
+
+ _mm_storeu_si128((__m128i*)&ctxs[0], G);
+ _mm_storeu_si128((__m128i*)&levels[0], H);
+
+ _mm_storeu_si128((__m128i*)&abs_levels[0], E0);
+ _mm_storeu_si128((__m128i*)&abs_levels[8], E1);
+ }
+ for (; n < res->last; ++n) {
+ const int ctx = ctxs[n];
+ const int level = levels[n];
+ const int flevel = abs_levels[n]; // full level
+ cost += VP8LevelFixedCosts[flevel] + t[level]; // simplified VP8LevelCost()
+ t = costs[n + 1][ctx];
+ }
+ // Last coefficient is always non-zero
+ {
+ const int level = levels[n];
+ const int flevel = abs_levels[n];
+ assert(flevel != 0);
+ cost += VP8LevelFixedCosts[flevel] + t[level];
+ if (n < 15) {
+ const int b = VP8EncBands[n + 1];
+ const int ctx = ctxs[n];
+ const int last_p0 = res->prob[b][ctx][0];
+ cost += VP8BitCost(0, last_p0);
+ }
+ }
+ return cost;
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8EncDspCostInitSSE2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInitSSE2(void) {
+ VP8SetResidualCoeffs = SetResidualCoeffsSSE2;
+ VP8GetResidualCost = GetResidualCostSSE2;
+}
+
+#else // !WEBP_USE_SSE2
+
+WEBP_DSP_INIT_STUB(VP8EncDspCostInitSSE2)
+
+#endif // WEBP_USE_SSE2
diff --git a/src/3rdparty/libwebp/src/dsp/cpu.c b/src/3rdparty/libwebp/src/dsp/cpu.c
index ef04a75..8844cb4 100644
--- a/src/3rdparty/libwebp/src/dsp/cpu.c
+++ b/src/3rdparty/libwebp/src/dsp/cpu.c
@@ -13,7 +13,7 @@
#include "./dsp.h"
-#if defined(__ANDROID__)
+#if defined(WEBP_ANDROID_NEON)
#include <cpu-features.h>
#endif
@@ -31,6 +31,18 @@ static WEBP_INLINE void GetCPUInfo(int cpu_info[4], int info_type) {
: "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3])
: "a"(info_type), "c"(0));
}
+#elif defined(__x86_64__) && \
+ (defined(__code_model_medium__) || defined(__code_model_large__)) && \
+ defined(__PIC__)
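+// cpuid clobbers %rbx, which the compiler may reserve as the GOT/PIC base
+// register under the medium and large code models; the xchg pair below
+// saves and restores it while routing the EBX result through a scratch
+// register.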
+static WEBP_INLINE void GetCPUInfo(int cpu_info[4], int info_type) {
+ __asm__ volatile (
+ "xchg{q}\t{%%rbx}, %q1\n"
+ "cpuid\n"
+ "xchg{q}\t{%%rbx}, %q1\n"
+ : "=a"(cpu_info[0]), "=&r"(cpu_info[1]), "=c"(cpu_info[2]),
+ "=d"(cpu_info[3])
+ : "a"(info_type), "c"(0));
+}
#elif defined(__i386__) || defined(__x86_64__)
static WEBP_INLINE void GetCPUInfo(int cpu_info[4], int info_type) {
__asm__ volatile (
@@ -79,7 +91,16 @@ static WEBP_INLINE uint64_t xgetbv(void) {
#if defined(__i386__) || defined(__x86_64__) || defined(WEBP_MSC_SSE2)
static int x86CPUInfo(CPUFeature feature) {
+ int max_cpuid_value;
int cpu_info[4];
+
+ // get the highest feature value cpuid supports
+ GetCPUInfo(cpu_info, 0);
+ max_cpuid_value = cpu_info[0];
+ if (max_cpuid_value < 1) {
+ return 0;
+ }
+
GetCPUInfo(cpu_info, 1);
if (feature == kSSE2) {
return 0 != (cpu_info[3] & 0x04000000);
@@ -87,6 +108,9 @@ static int x86CPUInfo(CPUFeature feature) {
if (feature == kSSE3) {
return 0 != (cpu_info[2] & 0x00000001);
}
+ if (feature == kSSE4_1) {
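+    // ECX bit 19 (SSE4.1)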
+ return 0 != (cpu_info[2] & 0x00080000);
+ }
if (feature == kAVX) {
// bits 27 (OSXSAVE) & 28 (256-bit AVX)
if ((cpu_info[2] & 0x18000000) == 0x18000000) {
@@ -95,7 +119,7 @@ static int x86CPUInfo(CPUFeature feature) {
}
}
if (feature == kAVX2) {
- if (x86CPUInfo(kAVX)) {
+ if (x86CPUInfo(kAVX) && max_cpuid_value >= 7) {
GetCPUInfo(cpu_info, 7);
return ((cpu_info[1] & 0x00000020) == 0x00000020);
}
@@ -122,10 +146,14 @@ static int armCPUInfo(CPUFeature feature) {
return 1;
}
VP8CPUInfo VP8GetCPUInfo = armCPUInfo;
-#elif defined(WEBP_USE_MIPS32)
+#elif defined(WEBP_USE_MIPS32) || defined(WEBP_USE_MIPS_DSP_R2)
static int mipsCPUInfo(CPUFeature feature) {
- (void)feature;
- return 1;
+ if ((feature == kMIPS32) || (feature == kMIPSdspR2)) {
+ return 1;
+ } else {
+ return 0;
+ }
}
VP8CPUInfo VP8GetCPUInfo = mipsCPUInfo;
#else
diff --git a/src/3rdparty/libwebp/src/dsp/dec.c b/src/3rdparty/libwebp/src/dsp/dec.c
index 3a8dc81..a787206 100644
--- a/src/3rdparty/libwebp/src/dsp/dec.c
+++ b/src/3rdparty/libwebp/src/dsp/dec.c
@@ -7,7 +7,7 @@
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
-// Speed-critical decoding functions.
+// Speed-critical decoding functions, default plain-C implementations.
//
// Author: Skal (pascal.massimino@gmail.com)
@@ -34,9 +34,8 @@ static WEBP_INLINE uint8_t clip_8b(int v) {
STORE(3, y, DC - (d)); \
} while (0)
-static const int kC1 = 20091 + (1 << 16);
-static const int kC2 = 35468;
-#define MUL(a, b) (((a) * (b)) >> 16)
+#define MUL1(a) ((((a) * 20091) >> 16) + (a))
+#define MUL2(a) (((a) * 35468) >> 16)
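+// MUL1(a) equals the old MUL(a, kC1) with kC1 == 20091 + (1 << 16): the
+// "+ (a)" term is the factored-out (1 << 16) part, and it keeps the 32-bit
+// product (a) * 20091 from overflowing even for extreme coefficient values.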
static void TransformOne(const int16_t* in, uint8_t* dst) {
int C[4 * 4], *tmp;
@@ -45,8 +44,8 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
for (i = 0; i < 4; ++i) { // vertical pass
const int a = in[0] + in[8]; // [-4096, 4094]
const int b = in[0] - in[8]; // [-4095, 4095]
- const int c = MUL(in[4], kC2) - MUL(in[12], kC1); // [-3783, 3783]
- const int d = MUL(in[4], kC1) + MUL(in[12], kC2); // [-3785, 3781]
+ const int c = MUL2(in[4]) - MUL1(in[12]); // [-3783, 3783]
+ const int d = MUL1(in[4]) + MUL2(in[12]); // [-3785, 3781]
tmp[0] = a + d; // [-7881, 7875]
tmp[1] = b + c; // [-7878, 7878]
tmp[2] = b - c; // [-7878, 7878]
@@ -55,7 +54,7 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
in++;
}
// Each pass is expanding the dynamic range by ~3.85 (upper bound).
- // The exact value is (2. + (kC1 + kC2) / 65536).
+ // The exact value is (2. + (20091 + 35468) / 65536).
// After the second pass, maximum interval is [-3794, 3794], assuming
// an input in [-2048, 2047] interval. We then need to add a dst value
// in the [0, 255] range.
@@ -66,8 +65,8 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
const int dc = tmp[0] + 4;
const int a = dc + tmp[8];
const int b = dc - tmp[8];
- const int c = MUL(tmp[4], kC2) - MUL(tmp[12], kC1);
- const int d = MUL(tmp[4], kC1) + MUL(tmp[12], kC2);
+ const int c = MUL2(tmp[4]) - MUL1(tmp[12]);
+ const int d = MUL1(tmp[4]) + MUL2(tmp[12]);
STORE(0, 0, a + d);
STORE(1, 0, b + c);
STORE(2, 0, b - c);
@@ -80,16 +79,17 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
// Simplified transform when only in[0], in[1] and in[4] are non-zero
static void TransformAC3(const int16_t* in, uint8_t* dst) {
const int a = in[0] + 4;
- const int c4 = MUL(in[4], kC2);
- const int d4 = MUL(in[4], kC1);
- const int c1 = MUL(in[1], kC2);
- const int d1 = MUL(in[1], kC1);
+ const int c4 = MUL2(in[4]);
+ const int d4 = MUL1(in[4]);
+ const int c1 = MUL2(in[1]);
+ const int d1 = MUL1(in[1]);
STORE2(0, a + d4, d1, c1);
STORE2(1, a + c4, d1, c1);
STORE2(2, a - c4, d1, c1);
STORE2(3, a - d4, d1, c1);
}
-#undef MUL
+#undef MUL1
+#undef MUL2
#undef STORE2
static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
@@ -104,7 +104,7 @@ static void TransformUV(const int16_t* in, uint8_t* dst) {
VP8Transform(in + 2 * 16, dst + 4 * BPS, 1);
}
-static void TransformDC(const int16_t *in, uint8_t* dst) {
+static void TransformDC(const int16_t* in, uint8_t* dst) {
const int DC = in[0] + 4;
int i, j;
for (j = 0; j < 4; ++j) {
@@ -160,7 +160,7 @@ void (*VP8TransformWHT)(const int16_t* in, int16_t* out);
#define DST(x, y) dst[(x) + (y) * BPS]
-static WEBP_INLINE void TrueMotion(uint8_t *dst, int size) {
+static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) {
const uint8_t* top = dst - BPS;
const uint8_t* const clip0 = VP8kclip1 - top[-1];
int y;
@@ -173,21 +173,21 @@ static WEBP_INLINE void TrueMotion(uint8_t *dst, int size) {
dst += BPS;
}
}
-static void TM4(uint8_t *dst) { TrueMotion(dst, 4); }
-static void TM8uv(uint8_t *dst) { TrueMotion(dst, 8); }
-static void TM16(uint8_t *dst) { TrueMotion(dst, 16); }
+static void TM4(uint8_t* dst) { TrueMotion(dst, 4); }
+static void TM8uv(uint8_t* dst) { TrueMotion(dst, 8); }
+static void TM16(uint8_t* dst) { TrueMotion(dst, 16); }
//------------------------------------------------------------------------------
// 16x16
-static void VE16(uint8_t *dst) { // vertical
+static void VE16(uint8_t* dst) { // vertical
int j;
for (j = 0; j < 16; ++j) {
memcpy(dst + j * BPS, dst - BPS, 16);
}
}
-static void HE16(uint8_t *dst) { // horizontal
+static void HE16(uint8_t* dst) { // horizontal
int j;
for (j = 16; j > 0; --j) {
memset(dst, dst[-1], 16);
@@ -202,7 +202,7 @@ static WEBP_INLINE void Put16(int v, uint8_t* dst) {
}
}
-static void DC16(uint8_t *dst) { // DC
+static void DC16(uint8_t* dst) { // DC
int DC = 16;
int j;
for (j = 0; j < 16; ++j) {
@@ -211,7 +211,7 @@ static void DC16(uint8_t *dst) { // DC
Put16(DC >> 5, dst);
}
-static void DC16NoTop(uint8_t *dst) { // DC with top samples not available
+static void DC16NoTop(uint8_t* dst) { // DC with top samples not available
int DC = 8;
int j;
for (j = 0; j < 16; ++j) {
@@ -220,7 +220,7 @@ static void DC16NoTop(uint8_t *dst) { // DC with top samples not available
Put16(DC >> 4, dst);
}
-static void DC16NoLeft(uint8_t *dst) { // DC with left samples not available
+static void DC16NoLeft(uint8_t* dst) { // DC with left samples not available
int DC = 8;
int i;
for (i = 0; i < 16; ++i) {
@@ -229,17 +229,19 @@ static void DC16NoLeft(uint8_t *dst) { // DC with left samples not available
Put16(DC >> 4, dst);
}
-static void DC16NoTopLeft(uint8_t *dst) { // DC with no top and left samples
+static void DC16NoTopLeft(uint8_t* dst) { // DC with no top and left samples
Put16(0x80, dst);
}
+VP8PredFunc VP8PredLuma16[NUM_B_DC_MODES];
+
//------------------------------------------------------------------------------
// 4x4
#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
#define AVG2(a, b) (((a) + (b) + 1) >> 1)
-static void VE4(uint8_t *dst) { // vertical
+static void VE4(uint8_t* dst) { // vertical
const uint8_t* top = dst - BPS;
const uint8_t vals[4] = {
AVG3(top[-1], top[0], top[1]),
@@ -253,19 +255,19 @@ static void VE4(uint8_t *dst) { // vertical
}
}
-static void HE4(uint8_t *dst) { // horizontal
+static void HE4(uint8_t* dst) { // horizontal
const int A = dst[-1 - BPS];
const int B = dst[-1];
const int C = dst[-1 + BPS];
const int D = dst[-1 + 2 * BPS];
const int E = dst[-1 + 3 * BPS];
- *(uint32_t*)(dst + 0 * BPS) = 0x01010101U * AVG3(A, B, C);
- *(uint32_t*)(dst + 1 * BPS) = 0x01010101U * AVG3(B, C, D);
- *(uint32_t*)(dst + 2 * BPS) = 0x01010101U * AVG3(C, D, E);
- *(uint32_t*)(dst + 3 * BPS) = 0x01010101U * AVG3(D, E, E);
+ WebPUint32ToMem(dst + 0 * BPS, 0x01010101U * AVG3(A, B, C));
+ WebPUint32ToMem(dst + 1 * BPS, 0x01010101U * AVG3(B, C, D));
+ WebPUint32ToMem(dst + 2 * BPS, 0x01010101U * AVG3(C, D, E));
+ WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(D, E, E));
}
-static void DC4(uint8_t *dst) { // DC
+static void DC4(uint8_t* dst) { // DC
uint32_t dc = 4;
int i;
for (i = 0; i < 4; ++i) dc += dst[i - BPS] + dst[-1 + i * BPS];
@@ -273,7 +275,7 @@ static void DC4(uint8_t *dst) { // DC
for (i = 0; i < 4; ++i) memset(dst + i * BPS, dc, 4);
}
-static void RD4(uint8_t *dst) { // Down-right
+static void RD4(uint8_t* dst) { // Down-right
const int I = dst[-1 + 0 * BPS];
const int J = dst[-1 + 1 * BPS];
const int K = dst[-1 + 2 * BPS];
@@ -284,15 +286,15 @@ static void RD4(uint8_t *dst) { // Down-right
const int C = dst[2 - BPS];
const int D = dst[3 - BPS];
DST(0, 3) = AVG3(J, K, L);
- DST(0, 2) = DST(1, 3) = AVG3(I, J, K);
- DST(0, 1) = DST(1, 2) = DST(2, 3) = AVG3(X, I, J);
- DST(0, 0) = DST(1, 1) = DST(2, 2) = DST(3, 3) = AVG3(A, X, I);
- DST(1, 0) = DST(2, 1) = DST(3, 2) = AVG3(B, A, X);
- DST(2, 0) = DST(3, 1) = AVG3(C, B, A);
- DST(3, 0) = AVG3(D, C, B);
+ DST(1, 3) = DST(0, 2) = AVG3(I, J, K);
+ DST(2, 3) = DST(1, 2) = DST(0, 1) = AVG3(X, I, J);
+ DST(3, 3) = DST(2, 2) = DST(1, 1) = DST(0, 0) = AVG3(A, X, I);
+ DST(3, 2) = DST(2, 1) = DST(1, 0) = AVG3(B, A, X);
+ DST(3, 1) = DST(2, 0) = AVG3(C, B, A);
+ DST(3, 0) = AVG3(D, C, B);
}
-static void LD4(uint8_t *dst) { // Down-Left
+static void LD4(uint8_t* dst) { // Down-Left
const int A = dst[0 - BPS];
const int B = dst[1 - BPS];
const int C = dst[2 - BPS];
@@ -305,12 +307,12 @@ static void LD4(uint8_t *dst) { // Down-Left
DST(1, 0) = DST(0, 1) = AVG3(B, C, D);
DST(2, 0) = DST(1, 1) = DST(0, 2) = AVG3(C, D, E);
DST(3, 0) = DST(2, 1) = DST(1, 2) = DST(0, 3) = AVG3(D, E, F);
- DST(3, 1) = DST(2, 2) = DST(1, 3) = AVG3(E, F, G);
- DST(3, 2) = DST(2, 3) = AVG3(F, G, H);
- DST(3, 3) = AVG3(G, H, H);
+ DST(3, 1) = DST(2, 2) = DST(1, 3) = AVG3(E, F, G);
+ DST(3, 2) = DST(2, 3) = AVG3(F, G, H);
+ DST(3, 3) = AVG3(G, H, H);
}
-static void VR4(uint8_t *dst) { // Vertical-Right
+static void VR4(uint8_t* dst) { // Vertical-Right
const int I = dst[-1 + 0 * BPS];
const int J = dst[-1 + 1 * BPS];
const int K = dst[-1 + 2 * BPS];
@@ -332,7 +334,7 @@ static void VR4(uint8_t *dst) { // Vertical-Right
DST(3, 1) = AVG3(B, C, D);
}
-static void VL4(uint8_t *dst) { // Vertical-Left
+static void VL4(uint8_t* dst) { // Vertical-Left
const int A = dst[0 - BPS];
const int B = dst[1 - BPS];
const int C = dst[2 - BPS];
@@ -354,7 +356,7 @@ static void VL4(uint8_t *dst) { // Vertical-Left
DST(3, 3) = AVG3(F, G, H);
}
-static void HU4(uint8_t *dst) { // Horizontal-Up
+static void HU4(uint8_t* dst) { // Horizontal-Up
const int I = dst[-1 + 0 * BPS];
const int J = dst[-1 + 1 * BPS];
const int K = dst[-1 + 2 * BPS];
@@ -369,7 +371,7 @@ static void HU4(uint8_t *dst) { // Horizontal-Up
DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
}
-static void HD4(uint8_t *dst) { // Horizontal-Down
+static void HD4(uint8_t* dst) { // Horizontal-Down
const int I = dst[-1 + 0 * BPS];
const int J = dst[-1 + 1 * BPS];
const int K = dst[-1 + 2 * BPS];
@@ -396,17 +398,19 @@ static void HD4(uint8_t *dst) { // Horizontal-Down
#undef AVG3
#undef AVG2
+VP8PredFunc VP8PredLuma4[NUM_BMODES];
+
//------------------------------------------------------------------------------
// Chroma
-static void VE8uv(uint8_t *dst) { // vertical
+static void VE8uv(uint8_t* dst) { // vertical
int j;
for (j = 0; j < 8; ++j) {
memcpy(dst + j * BPS, dst - BPS, 8);
}
}
-static void HE8uv(uint8_t *dst) { // horizontal
+static void HE8uv(uint8_t* dst) { // horizontal
int j;
for (j = 0; j < 8; ++j) {
memset(dst, dst[-1], 8);
@@ -422,7 +426,7 @@ static WEBP_INLINE void Put8x8uv(uint8_t value, uint8_t* dst) {
}
}
-static void DC8uv(uint8_t *dst) { // DC
+static void DC8uv(uint8_t* dst) { // DC
int dc0 = 8;
int i;
for (i = 0; i < 8; ++i) {
@@ -431,7 +435,7 @@ static void DC8uv(uint8_t *dst) { // DC
Put8x8uv(dc0 >> 4, dst);
}
-static void DC8uvNoLeft(uint8_t *dst) { // DC with no left samples
+static void DC8uvNoLeft(uint8_t* dst) { // DC with no left samples
int dc0 = 4;
int i;
for (i = 0; i < 8; ++i) {
@@ -440,7 +444,7 @@ static void DC8uvNoLeft(uint8_t *dst) { // DC with no left samples
Put8x8uv(dc0 >> 3, dst);
}
-static void DC8uvNoTop(uint8_t *dst) { // DC with no top samples
+static void DC8uvNoTop(uint8_t* dst) { // DC with no top samples
int dc0 = 4;
int i;
for (i = 0; i < 8; ++i) {
@@ -449,26 +453,11 @@ static void DC8uvNoTop(uint8_t *dst) { // DC with no top samples
Put8x8uv(dc0 >> 3, dst);
}
-static void DC8uvNoTopLeft(uint8_t *dst) { // DC with nothing
+static void DC8uvNoTopLeft(uint8_t* dst) { // DC with nothing
Put8x8uv(0x80, dst);
}
-//------------------------------------------------------------------------------
-// default C implementations
-
-const VP8PredFunc VP8PredLuma4[NUM_BMODES] = {
- DC4, TM4, VE4, HE4, RD4, VR4, LD4, VL4, HD4, HU4
-};
-
-const VP8PredFunc VP8PredLuma16[NUM_B_DC_MODES] = {
- DC16, TM16, VE16, HE16,
- DC16NoTop, DC16NoLeft, DC16NoTopLeft
-};
-
-const VP8PredFunc VP8PredChroma8[NUM_B_DC_MODES] = {
- DC8uv, TM8uv, VE8uv, HE8uv,
- DC8uvNoTop, DC8uvNoLeft, DC8uvNoTopLeft
-};
+VP8PredFunc VP8PredChroma8[NUM_B_DC_MODES];
//------------------------------------------------------------------------------
// Edge filtering functions
@@ -685,13 +674,15 @@ VP8SimpleFilterFunc VP8SimpleVFilter16i;
VP8SimpleFilterFunc VP8SimpleHFilter16i;
extern void VP8DspInitSSE2(void);
+extern void VP8DspInitSSE41(void);
extern void VP8DspInitNEON(void);
extern void VP8DspInitMIPS32(void);
+extern void VP8DspInitMIPSdspR2(void);
static volatile VP8CPUInfo dec_last_cpuinfo_used =
(VP8CPUInfo)&dec_last_cpuinfo_used;
-void VP8DspInit(void) {
+WEBP_TSAN_IGNORE_FUNCTION void VP8DspInit(void) {
if (dec_last_cpuinfo_used == VP8GetCPUInfo) return;
VP8InitClipTables();
@@ -716,21 +707,60 @@ void VP8DspInit(void) {
VP8SimpleVFilter16i = SimpleVFilter16i;
VP8SimpleHFilter16i = SimpleHFilter16i;
+ VP8PredLuma4[0] = DC4;
+ VP8PredLuma4[1] = TM4;
+ VP8PredLuma4[2] = VE4;
+ VP8PredLuma4[3] = HE4;
+ VP8PredLuma4[4] = RD4;
+ VP8PredLuma4[5] = VR4;
+ VP8PredLuma4[6] = LD4;
+ VP8PredLuma4[7] = VL4;
+ VP8PredLuma4[8] = HD4;
+ VP8PredLuma4[9] = HU4;
+
+ VP8PredLuma16[0] = DC16;
+ VP8PredLuma16[1] = TM16;
+ VP8PredLuma16[2] = VE16;
+ VP8PredLuma16[3] = HE16;
+ VP8PredLuma16[4] = DC16NoTop;
+ VP8PredLuma16[5] = DC16NoLeft;
+ VP8PredLuma16[6] = DC16NoTopLeft;
+
+ VP8PredChroma8[0] = DC8uv;
+ VP8PredChroma8[1] = TM8uv;
+ VP8PredChroma8[2] = VE8uv;
+ VP8PredChroma8[3] = HE8uv;
+ VP8PredChroma8[4] = DC8uvNoTop;
+ VP8PredChroma8[5] = DC8uvNoLeft;
+ VP8PredChroma8[6] = DC8uvNoTopLeft;
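+
+  // The prediction tables are now plain (non-const) function-pointer arrays,
+  // filled here so that the per-CPU init calls below can override individual
+  // entries with faster versions.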
+
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
if (VP8GetCPUInfo != NULL) {
#if defined(WEBP_USE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
VP8DspInitSSE2();
+#if defined(WEBP_USE_SSE41)
+ if (VP8GetCPUInfo(kSSE4_1)) {
+ VP8DspInitSSE41();
+ }
+#endif
}
-#elif defined(WEBP_USE_NEON)
+#endif
+#if defined(WEBP_USE_NEON)
if (VP8GetCPUInfo(kNEON)) {
VP8DspInitNEON();
}
-#elif defined(WEBP_USE_MIPS32)
+#endif
+#if defined(WEBP_USE_MIPS32)
if (VP8GetCPUInfo(kMIPS32)) {
VP8DspInitMIPS32();
}
#endif
+#if defined(WEBP_USE_MIPS_DSP_R2)
+ if (VP8GetCPUInfo(kMIPSdspR2)) {
+ VP8DspInitMIPSdspR2();
+ }
+#endif
}
dec_last_cpuinfo_used = VP8GetCPUInfo;
}
diff --git a/src/3rdparty/libwebp/src/dsp/dec_clip_tables.c b/src/3rdparty/libwebp/src/dsp/dec_clip_tables.c
index eec5a6d..3b6dde8 100644
--- a/src/3rdparty/libwebp/src/dsp/dec_clip_tables.c
+++ b/src/3rdparty/libwebp/src/dsp/dec_clip_tables.c
@@ -344,7 +344,7 @@ const int8_t* const VP8ksclip2 = &sclip2[112];
const uint8_t* const VP8kclip1 = &clip1[255];
const uint8_t* const VP8kabs0 = &abs0[255];
-void VP8InitClipTables(void) {
+WEBP_TSAN_IGNORE_FUNCTION void VP8InitClipTables(void) {
#if !defined(USE_STATIC_TABLES)
int i;
if (!tables_ok) {
diff --git a/src/3rdparty/libwebp/src/dsp/dec_mips32.c b/src/3rdparty/libwebp/src/dsp/dec_mips32.c
index 3e89ed3..4e9ef42 100644
--- a/src/3rdparty/libwebp/src/dsp/dec_mips32.c
+++ b/src/3rdparty/libwebp/src/dsp/dec_mips32.c
@@ -16,6 +16,8 @@
#if defined(WEBP_USE_MIPS32)
+#include "./mips_macro.h"
+
static const int kC1 = 20091 + (1 << 16);
static const int kC2 = 35468;
@@ -52,6 +54,7 @@ static WEBP_INLINE void do_filter6(uint8_t* p, int step) {
const int p2 = p[-3 * step], p1 = p[-2 * step], p0 = p[-step];
const int q0 = p[0], q1 = p[step], q2 = p[2 * step];
const int a = VP8ksclip1[3 * (q0 - p0) + VP8ksclip1[p1 - q1]];
+ // a is in [-128,127], a1 in [-27,27], a2 in [-18,18] and a3 in [-9,9]
const int a1 = (27 * a + 63) >> 7; // eq. to ((3 * a + 7) * 9) >> 7
const int a2 = (18 * a + 63) >> 7; // eq. to ((2 * a + 7) * 9) >> 7
const int a3 = (9 * a + 63) >> 7; // eq. to ((1 * a + 7) * 9) >> 7
@@ -68,9 +71,9 @@ static WEBP_INLINE int hev(const uint8_t* p, int step, int thresh) {
return (abs_mips32(p1 - p0) > thresh) || (abs_mips32(q1 - q0) > thresh);
}
-static WEBP_INLINE int needs_filter(const uint8_t* p, int step, int thresh) {
+static WEBP_INLINE int needs_filter(const uint8_t* p, int step, int t) {
const int p1 = p[-2 * step], p0 = p[-step], q0 = p[0], q1 = p[step];
- return ((2 * abs_mips32(p0 - q0) + (abs_mips32(p1 - q1) >> 1)) <= thresh);
+ return ((4 * abs_mips32(p0 - q0) + abs_mips32(p1 - q1)) <= t);
}
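+// With t == 2 * thresh + 1 this accepts exactly the same inputs as the old
+// test "2 * |p0 - q0| + (|p1 - q1| >> 1) <= thresh": doubling both sides
+// yields an even left-hand side, and re-adding the bit truncated by ">> 1"
+// changes it by at most 1, which the "+ 1" in t absorbs.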
static WEBP_INLINE int needs_filter2(const uint8_t* p,
@@ -78,7 +81,7 @@ static WEBP_INLINE int needs_filter2(const uint8_t* p,
const int p3 = p[-4 * step], p2 = p[-3 * step];
const int p1 = p[-2 * step], p0 = p[-step];
const int q0 = p[0], q1 = p[step], q2 = p[2 * step], q3 = p[3 * step];
- if ((2 * abs_mips32(p0 - q0) + (abs_mips32(p1 - q1) >> 1)) > t) {
+ if ((4 * abs_mips32(p0 - q0) + abs_mips32(p1 - q1)) > t) {
return 0;
}
return abs_mips32(p3 - p2) <= it && abs_mips32(p2 - p1) <= it &&
@@ -89,8 +92,9 @@ static WEBP_INLINE int needs_filter2(const uint8_t* p,
static WEBP_INLINE void FilterLoop26(uint8_t* p,
int hstride, int vstride, int size,
int thresh, int ithresh, int hev_thresh) {
+ const int thresh2 = 2 * thresh + 1;
while (size-- > 0) {
- if (needs_filter2(p, hstride, thresh, ithresh)) {
+ if (needs_filter2(p, hstride, thresh2, ithresh)) {
if (hev(p, hstride, hev_thresh)) {
do_filter2(p, hstride);
} else {
@@ -104,8 +108,9 @@ static WEBP_INLINE void FilterLoop26(uint8_t* p,
static WEBP_INLINE void FilterLoop24(uint8_t* p,
int hstride, int vstride, int size,
int thresh, int ithresh, int hev_thresh) {
+ const int thresh2 = 2 * thresh + 1;
while (size-- > 0) {
- if (needs_filter2(p, hstride, thresh, ithresh)) {
+ if (needs_filter2(p, hstride, thresh2, ithresh)) {
if (hev(p, hstride, hev_thresh)) {
do_filter2(p, hstride);
} else {
@@ -176,8 +181,9 @@ static void HFilter16i(uint8_t* p, int stride,
static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
int i;
+ const int thresh2 = 2 * thresh + 1;
for (i = 0; i < 16; ++i) {
- if (needs_filter(p + i, stride, thresh)) {
+ if (needs_filter(p + i, stride, thresh2)) {
do_filter2(p + i, stride);
}
}
@@ -185,8 +191,9 @@ static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
int i;
+ const int thresh2 = 2 * thresh + 1;
for (i = 0; i < 16; ++i) {
- if (needs_filter(p + i * stride, 1, thresh)) {
+ if (needs_filter(p + i * stride, 1, thresh2)) {
do_filter2(p + i * stride, 1);
}
}
@@ -384,7 +391,7 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
"sra %[temp7], %[temp7], 3 \n\t"
"sra %[temp4], %[temp4], 3 \n\t"
"addiu %[temp6], $zero, 255 \n\t"
- "lbu %[temp1], 0(%[dst]) \n\t"
+ "lbu %[temp1], 0+0*" XSTR(BPS) "(%[dst]) \n\t"
"addu %[temp1], %[temp1], %[temp5] \n\t"
"sra %[temp5], %[temp1], 8 \n\t"
"sra %[temp18], %[temp1], 31 \n\t"
@@ -392,8 +399,8 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
"xor %[temp1], %[temp1], %[temp1] \n\t"
"movz %[temp1], %[temp6], %[temp18] \n\t"
"1: \n\t"
- "lbu %[temp18], 1(%[dst]) \n\t"
- "sb %[temp1], 0(%[dst]) \n\t"
+ "lbu %[temp18], 1+0*" XSTR(BPS) "(%[dst]) \n\t"
+ "sb %[temp1], 0+0*" XSTR(BPS) "(%[dst]) \n\t"
"addu %[temp18], %[temp18], %[temp11] \n\t"
"sra %[temp11], %[temp18], 8 \n\t"
"sra %[temp1], %[temp18], 31 \n\t"
@@ -401,8 +408,8 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
"xor %[temp18], %[temp18], %[temp18] \n\t"
"movz %[temp18], %[temp6], %[temp1] \n\t"
"2: \n\t"
- "lbu %[temp1], 2(%[dst]) \n\t"
- "sb %[temp18], 1(%[dst]) \n\t"
+ "lbu %[temp1], 2+0*" XSTR(BPS) "(%[dst]) \n\t"
+ "sb %[temp18], 1+0*" XSTR(BPS) "(%[dst]) \n\t"
"addu %[temp1], %[temp1], %[temp8] \n\t"
"sra %[temp8], %[temp1], 8 \n\t"
"sra %[temp18], %[temp1], 31 \n\t"
@@ -410,8 +417,8 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
"xor %[temp1], %[temp1], %[temp1] \n\t"
"movz %[temp1], %[temp6], %[temp18] \n\t"
"3: \n\t"
- "lbu %[temp18], 3(%[dst]) \n\t"
- "sb %[temp1], 2(%[dst]) \n\t"
+ "lbu %[temp18], 3+0*" XSTR(BPS) "(%[dst]) \n\t"
+ "sb %[temp1], 2+0*" XSTR(BPS) "(%[dst]) \n\t"
"addu %[temp18], %[temp18], %[temp16] \n\t"
"sra %[temp16], %[temp18], 8 \n\t"
"sra %[temp1], %[temp18], 31 \n\t"
@@ -419,11 +426,11 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
"xor %[temp18], %[temp18], %[temp18] \n\t"
"movz %[temp18], %[temp6], %[temp1] \n\t"
"4: \n\t"
- "sb %[temp18], 3(%[dst]) \n\t"
- "lbu %[temp5], 32(%[dst]) \n\t"
- "lbu %[temp8], 33(%[dst]) \n\t"
- "lbu %[temp11], 34(%[dst]) \n\t"
- "lbu %[temp16], 35(%[dst]) \n\t"
+ "sb %[temp18], 3+0*" XSTR(BPS) "(%[dst]) \n\t"
+ "lbu %[temp5], 0+1*" XSTR(BPS) "(%[dst]) \n\t"
+ "lbu %[temp8], 1+1*" XSTR(BPS) "(%[dst]) \n\t"
+ "lbu %[temp11], 2+1*" XSTR(BPS) "(%[dst]) \n\t"
+ "lbu %[temp16], 3+1*" XSTR(BPS) "(%[dst]) \n\t"
"addu %[temp5], %[temp5], %[temp17] \n\t"
"addu %[temp8], %[temp8], %[temp15] \n\t"
"addu %[temp11], %[temp11], %[temp12] \n\t"
@@ -452,14 +459,14 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
"xor %[temp16], %[temp16], %[temp16] \n\t"
"movz %[temp16], %[temp6], %[temp15] \n\t"
"8: \n\t"
- "sb %[temp5], 32(%[dst]) \n\t"
- "sb %[temp8], 33(%[dst]) \n\t"
- "sb %[temp11], 34(%[dst]) \n\t"
- "sb %[temp16], 35(%[dst]) \n\t"
- "lbu %[temp5], 64(%[dst]) \n\t"
- "lbu %[temp8], 65(%[dst]) \n\t"
- "lbu %[temp11], 66(%[dst]) \n\t"
- "lbu %[temp16], 67(%[dst]) \n\t"
+ "sb %[temp5], 0+1*" XSTR(BPS) "(%[dst]) \n\t"
+ "sb %[temp8], 1+1*" XSTR(BPS) "(%[dst]) \n\t"
+ "sb %[temp11], 2+1*" XSTR(BPS) "(%[dst]) \n\t"
+ "sb %[temp16], 3+1*" XSTR(BPS) "(%[dst]) \n\t"
+ "lbu %[temp5], 0+2*" XSTR(BPS) "(%[dst]) \n\t"
+ "lbu %[temp8], 1+2*" XSTR(BPS) "(%[dst]) \n\t"
+ "lbu %[temp11], 2+2*" XSTR(BPS) "(%[dst]) \n\t"
+ "lbu %[temp16], 3+2*" XSTR(BPS) "(%[dst]) \n\t"
"addu %[temp5], %[temp5], %[temp9] \n\t"
"addu %[temp8], %[temp8], %[temp3] \n\t"
"addu %[temp11], %[temp11], %[temp0] \n\t"
@@ -488,14 +495,14 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
"xor %[temp16], %[temp16], %[temp16] \n\t"
"movz %[temp16], %[temp6], %[temp3] \n\t"
"12: \n\t"
- "sb %[temp5], 64(%[dst]) \n\t"
- "sb %[temp8], 65(%[dst]) \n\t"
- "sb %[temp11], 66(%[dst]) \n\t"
- "sb %[temp16], 67(%[dst]) \n\t"
- "lbu %[temp5], 96(%[dst]) \n\t"
- "lbu %[temp8], 97(%[dst]) \n\t"
- "lbu %[temp11], 98(%[dst]) \n\t"
- "lbu %[temp16], 99(%[dst]) \n\t"
+ "sb %[temp5], 0+2*" XSTR(BPS) "(%[dst]) \n\t"
+ "sb %[temp8], 1+2*" XSTR(BPS) "(%[dst]) \n\t"
+ "sb %[temp11], 2+2*" XSTR(BPS) "(%[dst]) \n\t"
+ "sb %[temp16], 3+2*" XSTR(BPS) "(%[dst]) \n\t"
+ "lbu %[temp5], 0+3*" XSTR(BPS) "(%[dst]) \n\t"
+ "lbu %[temp8], 1+3*" XSTR(BPS) "(%[dst]) \n\t"
+ "lbu %[temp11], 2+3*" XSTR(BPS) "(%[dst]) \n\t"
+ "lbu %[temp16], 3+3*" XSTR(BPS) "(%[dst]) \n\t"
"addu %[temp5], %[temp5], %[temp13] \n\t"
"addu %[temp8], %[temp8], %[temp7] \n\t"
"addu %[temp11], %[temp11], %[temp4] \n\t"
@@ -524,10 +531,10 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
"xor %[temp16], %[temp16], %[temp16] \n\t"
"movz %[temp16], %[temp6], %[temp3] \n\t"
"16: \n\t"
- "sb %[temp5], 96(%[dst]) \n\t"
- "sb %[temp8], 97(%[dst]) \n\t"
- "sb %[temp11], 98(%[dst]) \n\t"
- "sb %[temp16], 99(%[dst]) \n\t"
+ "sb %[temp5], 0+3*" XSTR(BPS) "(%[dst]) \n\t"
+ "sb %[temp8], 1+3*" XSTR(BPS) "(%[dst]) \n\t"
+ "sb %[temp11], 2+3*" XSTR(BPS) "(%[dst]) \n\t"
+ "sb %[temp16], 3+3*" XSTR(BPS) "(%[dst]) \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
@@ -548,15 +555,12 @@ static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
}
}
-#endif // WEBP_USE_MIPS32
-
//------------------------------------------------------------------------------
// Entry point
extern void VP8DspInitMIPS32(void);
-void VP8DspInitMIPS32(void) {
-#if defined(WEBP_USE_MIPS32)
+WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitMIPS32(void) {
VP8InitClipTables();
VP8Transform = TransformTwo;
@@ -574,5 +578,10 @@ void VP8DspInitMIPS32(void) {
VP8SimpleHFilter16 = SimpleHFilter16;
VP8SimpleVFilter16i = SimpleVFilter16i;
VP8SimpleHFilter16i = SimpleHFilter16i;
-#endif // WEBP_USE_MIPS32
}
+
+#else // !WEBP_USE_MIPS32
+
+WEBP_DSP_INIT_STUB(VP8DspInitMIPS32)
+
+#endif // WEBP_USE_MIPS32
diff --git a/src/3rdparty/libwebp/src/dsp/dec_mips_dsp_r2.c b/src/3rdparty/libwebp/src/dsp/dec_mips_dsp_r2.c
new file mode 100644
index 0000000..db5c657
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/dec_mips_dsp_r2.c
@@ -0,0 +1,994 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MIPS DSP r2 version of dsp functions
+//
+// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
+// Jovan Zelincevic (jovan.zelincevic@imgtec.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_MIPS_DSP_R2)
+
+#include "./mips_macro.h"
+
+static const int kC1 = 20091 + (1 << 16);
+static const int kC2 = 35468;
+
+#define MUL(a, b) (((a) * (b)) >> 16)
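+// kC1 carries an extra (1 << 16) term, so that
+// MUL(a, kC1) == a + ((a * 20091) >> 16) (cf. MUL1 in dec.c).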
+
+static void TransformDC(const int16_t* in, uint8_t* dst) {
+ int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9, temp10;
+
+ __asm__ volatile (
+ LOAD_WITH_OFFSET_X4(temp1, temp2, temp3, temp4, dst,
+ 0, 0, 0, 0,
+ 0, 1, 2, 3,
+ BPS)
+ "lh %[temp5], 0(%[in]) \n\t"
+ "addiu %[temp5], %[temp5], 4 \n\t"
+ "ins %[temp5], %[temp5], 16, 16 \n\t"
+ "shra.ph %[temp5], %[temp5], 3 \n\t"
+ CONVERT_2_BYTES_TO_HALF(temp6, temp7, temp8, temp9, temp10, temp1, temp2,
+ temp3, temp1, temp2, temp3, temp4)
+ STORE_SAT_SUM_X2(temp6, temp7, temp8, temp9, temp10, temp1, temp2, temp3,
+ temp5, temp5, temp5, temp5, temp5, temp5, temp5, temp5,
+ dst, 0, 1, 2, 3, BPS)
+
+ OUTPUT_EARLY_CLOBBER_REGS_10()
+ : [in]"r"(in), [dst]"r"(dst)
+ : "memory"
+ );
+}
+
+static void TransformAC3(const int16_t* in, uint8_t* dst) {
+ const int a = in[0] + 4;
+ int c4 = MUL(in[4], kC2);
+ const int d4 = MUL(in[4], kC1);
+ const int c1 = MUL(in[1], kC2);
+ const int d1 = MUL(in[1], kC1);
+ int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
+ int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18;
+
+ __asm__ volatile (
+ "ins %[c4], %[d4], 16, 16 \n\t"
+ "replv.ph %[temp1], %[a] \n\t"
+ "replv.ph %[temp4], %[d1] \n\t"
+ ADD_SUB_HALVES(temp2, temp3, temp1, c4)
+ "replv.ph %[temp5], %[c1] \n\t"
+ SHIFT_R_SUM_X2(temp1, temp6, temp7, temp8, temp2, temp9, temp10, temp4,
+ temp2, temp2, temp3, temp3, temp4, temp5, temp4, temp5)
+ LOAD_WITH_OFFSET_X4(temp3, temp5, temp11, temp12, dst,
+ 0, 0, 0, 0,
+ 0, 1, 2, 3,
+ BPS)
+ CONVERT_2_BYTES_TO_HALF(temp13, temp14, temp3, temp15, temp5, temp16,
+ temp11, temp17, temp3, temp5, temp11, temp12)
+ PACK_2_HALVES_TO_WORD(temp12, temp18, temp7, temp6, temp1, temp8, temp2,
+ temp4, temp7, temp6, temp10, temp9)
+ STORE_SAT_SUM_X2(temp13, temp14, temp3, temp15, temp5, temp16, temp11,
+ temp17, temp12, temp18, temp1, temp8, temp2, temp4,
+ temp7, temp6, dst, 0, 1, 2, 3, BPS)
+
+ OUTPUT_EARLY_CLOBBER_REGS_18(),
+ [c4]"+&r"(c4)
+ : [dst]"r"(dst), [a]"r"(a), [d1]"r"(d1), [d4]"r"(d4), [c1]"r"(c1)
+ : "memory"
+ );
+}
+
+static void TransformOne(const int16_t* in, uint8_t* dst) {
+ int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
+ int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18;
+
+ __asm__ volatile (
+ "ulw %[temp1], 0(%[in]) \n\t"
+ "ulw %[temp2], 16(%[in]) \n\t"
+ LOAD_IN_X2(temp5, temp6, 24, 26)
+ ADD_SUB_HALVES(temp3, temp4, temp1, temp2)
+ LOAD_IN_X2(temp1, temp2, 8, 10)
+ MUL_SHIFT_SUM(temp7, temp8, temp9, temp10, temp11, temp12, temp13, temp14,
+ temp10, temp8, temp9, temp7, temp1, temp2, temp5, temp6,
+ temp13, temp11, temp14, temp12)
+ INSERT_HALF_X2(temp8, temp7, temp10, temp9)
+ "ulw %[temp17], 4(%[in]) \n\t"
+ "ulw %[temp18], 20(%[in]) \n\t"
+ ADD_SUB_HALVES(temp1, temp2, temp3, temp8)
+ ADD_SUB_HALVES(temp5, temp6, temp4, temp7)
+ ADD_SUB_HALVES(temp7, temp8, temp17, temp18)
+ LOAD_IN_X2(temp17, temp18, 12, 14)
+ LOAD_IN_X2(temp9, temp10, 28, 30)
+ MUL_SHIFT_SUM(temp11, temp12, temp13, temp14, temp15, temp16, temp4, temp17,
+ temp12, temp14, temp11, temp13, temp17, temp18, temp9, temp10,
+ temp15, temp4, temp16, temp17)
+ INSERT_HALF_X2(temp11, temp12, temp13, temp14)
+ ADD_SUB_HALVES(temp17, temp8, temp8, temp11)
+ ADD_SUB_HALVES(temp3, temp4, temp7, temp12)
+
+ // horizontal
+ SRA_16(temp9, temp10, temp11, temp12, temp1, temp2, temp5, temp6)
+ INSERT_HALF_X2(temp1, temp6, temp5, temp2)
+ SRA_16(temp13, temp14, temp15, temp16, temp3, temp4, temp17, temp8)
+ "repl.ph %[temp2], 0x4 \n\t"
+ INSERT_HALF_X2(temp3, temp8, temp17, temp4)
+ "addq.ph %[temp1], %[temp1], %[temp2] \n\t"
+ "addq.ph %[temp6], %[temp6], %[temp2] \n\t"
+ ADD_SUB_HALVES(temp2, temp4, temp1, temp3)
+ ADD_SUB_HALVES(temp5, temp7, temp6, temp8)
+ MUL_SHIFT_SUM(temp1, temp3, temp6, temp8, temp9, temp13, temp17, temp18,
+ temp3, temp13, temp1, temp9, temp9, temp13, temp11, temp15,
+ temp6, temp17, temp8, temp18)
+ MUL_SHIFT_SUM(temp6, temp8, temp18, temp17, temp11, temp15, temp12, temp16,
+ temp8, temp15, temp6, temp11, temp12, temp16, temp10, temp14,
+ temp18, temp12, temp17, temp16)
+ INSERT_HALF_X2(temp1, temp3, temp9, temp13)
+ INSERT_HALF_X2(temp6, temp8, temp11, temp15)
+ SHIFT_R_SUM_X2(temp9, temp10, temp11, temp12, temp13, temp14, temp15,
+ temp16, temp2, temp4, temp5, temp7, temp3, temp1, temp8,
+ temp6)
+ PACK_2_HALVES_TO_WORD(temp1, temp2, temp3, temp4, temp9, temp12, temp13,
+ temp16, temp11, temp10, temp15, temp14)
+ LOAD_WITH_OFFSET_X4(temp10, temp11, temp14, temp15, dst,
+ 0, 0, 0, 0,
+ 0, 1, 2, 3,
+ BPS)
+ CONVERT_2_BYTES_TO_HALF(temp5, temp6, temp7, temp8, temp17, temp18, temp10,
+ temp11, temp10, temp11, temp14, temp15)
+ STORE_SAT_SUM_X2(temp5, temp6, temp7, temp8, temp17, temp18, temp10, temp11,
+ temp9, temp12, temp1, temp2, temp13, temp16, temp3, temp4,
+ dst, 0, 1, 2, 3, BPS)
+
+ OUTPUT_EARLY_CLOBBER_REGS_18()
+ : [dst]"r"(dst), [in]"r"(in), [kC1]"r"(kC1), [kC2]"r"(kC2)
+ : "memory", "hi", "lo"
+ );
+}
+
+static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
+ TransformOne(in, dst);
+ if (do_two) {
+ TransformOne(in + 16, dst + 4);
+ }
+}
+
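+// Fused loop-filter: the chain of "bgtz ... 3f" early exits implements
+// needs_filter2() and the "slt ... %[hev_thresh]" flags implement hev(); the
+// fall-through of "beqz %[temp2], 4f" is then the high-edge-variance
+// (do_filter2) path updating only p0/q0, while label "4:" is the do_filter6
+// path rewriting p2..q2.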
+static WEBP_INLINE void FilterLoop26(uint8_t* p,
+ int hstride, int vstride, int size,
+ int thresh, int ithresh, int hev_thresh) {
+ const int thresh2 = 2 * thresh + 1;
+ int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
+ int temp10, temp11, temp12, temp13, temp14, temp15;
+
+ __asm__ volatile (
+ ".set push \n\t"
+ ".set noreorder \n\t"
+ "1: \n\t"
+ "negu %[temp1], %[hstride] \n\t"
+ "addiu %[size], %[size], -1 \n\t"
+ "sll %[temp2], %[hstride], 1 \n\t"
+ "sll %[temp3], %[temp1], 1 \n\t"
+ "addu %[temp4], %[temp2], %[hstride] \n\t"
+ "addu %[temp5], %[temp3], %[temp1] \n\t"
+ "lbu %[temp7], 0(%[p]) \n\t"
+ "sll %[temp6], %[temp3], 1 \n\t"
+ "lbux %[temp8], %[temp5](%[p]) \n\t"
+ "lbux %[temp9], %[temp3](%[p]) \n\t"
+ "lbux %[temp10], %[temp1](%[p]) \n\t"
+ "lbux %[temp11], %[temp6](%[p]) \n\t"
+ "lbux %[temp12], %[hstride](%[p]) \n\t"
+ "lbux %[temp13], %[temp2](%[p]) \n\t"
+ "lbux %[temp14], %[temp4](%[p]) \n\t"
+ "subu %[temp1], %[temp10], %[temp7] \n\t"
+ "subu %[temp2], %[temp9], %[temp12] \n\t"
+ "absq_s.w %[temp3], %[temp1] \n\t"
+ "absq_s.w %[temp4], %[temp2] \n\t"
+ "negu %[temp1], %[temp1] \n\t"
+ "sll %[temp3], %[temp3], 2 \n\t"
+ "addu %[temp15], %[temp3], %[temp4] \n\t"
+ "subu %[temp3], %[temp15], %[thresh2] \n\t"
+ "sll %[temp6], %[temp1], 1 \n\t"
+ "bgtz %[temp3], 3f \n\t"
+ " subu %[temp4], %[temp11], %[temp8] \n\t"
+ "absq_s.w %[temp4], %[temp4] \n\t"
+ "shll_s.w %[temp2], %[temp2], 24 \n\t"
+ "subu %[temp4], %[temp4], %[ithresh] \n\t"
+ "bgtz %[temp4], 3f \n\t"
+ " subu %[temp3], %[temp8], %[temp9] \n\t"
+ "absq_s.w %[temp3], %[temp3] \n\t"
+ "subu %[temp3], %[temp3], %[ithresh] \n\t"
+ "bgtz %[temp3], 3f \n\t"
+ " subu %[temp5], %[temp9], %[temp10] \n\t"
+ "absq_s.w %[temp3], %[temp5] \n\t"
+ "absq_s.w %[temp5], %[temp5] \n\t"
+ "subu %[temp3], %[temp3], %[ithresh] \n\t"
+ "bgtz %[temp3], 3f \n\t"
+ " subu %[temp3], %[temp14], %[temp13] \n\t"
+ "absq_s.w %[temp3], %[temp3] \n\t"
+ "slt %[temp5], %[hev_thresh], %[temp5] \n\t"
+ "subu %[temp3], %[temp3], %[ithresh] \n\t"
+ "bgtz %[temp3], 3f \n\t"
+ " subu %[temp3], %[temp13], %[temp12] \n\t"
+ "absq_s.w %[temp3], %[temp3] \n\t"
+ "sra %[temp4], %[temp2], 24 \n\t"
+ "subu %[temp3], %[temp3], %[ithresh] \n\t"
+ "bgtz %[temp3], 3f \n\t"
+ " subu %[temp15], %[temp12], %[temp7] \n\t"
+ "absq_s.w %[temp3], %[temp15] \n\t"
+ "absq_s.w %[temp15], %[temp15] \n\t"
+ "subu %[temp3], %[temp3], %[ithresh] \n\t"
+ "bgtz %[temp3], 3f \n\t"
+ " slt %[temp15], %[hev_thresh], %[temp15] \n\t"
+ "addu %[temp3], %[temp6], %[temp1] \n\t"
+ "or %[temp2], %[temp5], %[temp15] \n\t"
+ "addu %[temp5], %[temp4], %[temp3] \n\t"
+ "beqz %[temp2], 4f \n\t"
+ " shra_r.w %[temp1], %[temp5], 3 \n\t"
+ "addiu %[temp2], %[temp5], 3 \n\t"
+ "sra %[temp2], %[temp2], 3 \n\t"
+ "shll_s.w %[temp1], %[temp1], 27 \n\t"
+ "shll_s.w %[temp2], %[temp2], 27 \n\t"
+ "subu %[temp3], %[p], %[hstride] \n\t"
+ "sra %[temp1], %[temp1], 27 \n\t"
+ "sra %[temp2], %[temp2], 27 \n\t"
+ "subu %[temp1], %[temp7], %[temp1] \n\t"
+ "addu %[temp2], %[temp10], %[temp2] \n\t"
+ "lbux %[temp2], %[temp2](%[VP8kclip1]) \n\t"
+ "lbux %[temp1], %[temp1](%[VP8kclip1]) \n\t"
+ "sb %[temp2], 0(%[temp3]) \n\t"
+ "j 3f \n\t"
+ " sb %[temp1], 0(%[p]) \n\t"
+ "4: \n\t"
+ "shll_s.w %[temp5], %[temp5], 24 \n\t"
+ "subu %[temp14], %[p], %[hstride] \n\t"
+ "subu %[temp11], %[temp14], %[hstride] \n\t"
+ "sra %[temp6], %[temp5], 24 \n\t"
+ "sll %[temp1], %[temp6], 3 \n\t"
+ "subu %[temp15], %[temp11], %[hstride] \n\t"
+ "addu %[temp2], %[temp6], %[temp1] \n\t"
+ "sll %[temp3], %[temp2], 1 \n\t"
+ "addu %[temp4], %[temp3], %[temp2] \n\t"
+ "addiu %[temp2], %[temp2], 63 \n\t"
+ "addiu %[temp3], %[temp3], 63 \n\t"
+ "addiu %[temp4], %[temp4], 63 \n\t"
+ "sra %[temp2], %[temp2], 7 \n\t"
+ "sra %[temp3], %[temp3], 7 \n\t"
+ "sra %[temp4], %[temp4], 7 \n\t"
+ "addu %[temp1], %[temp8], %[temp2] \n\t"
+ "addu %[temp5], %[temp9], %[temp3] \n\t"
+ "addu %[temp6], %[temp10], %[temp4] \n\t"
+ "subu %[temp8], %[temp7], %[temp4] \n\t"
+ "subu %[temp7], %[temp12], %[temp3] \n\t"
+ "addu %[temp10], %[p], %[hstride] \n\t"
+ "subu %[temp9], %[temp13], %[temp2] \n\t"
+ "addu %[temp12], %[temp10], %[hstride] \n\t"
+ "lbux %[temp2], %[temp1](%[VP8kclip1]) \n\t"
+ "lbux %[temp3], %[temp5](%[VP8kclip1]) \n\t"
+ "lbux %[temp4], %[temp6](%[VP8kclip1]) \n\t"
+ "lbux %[temp5], %[temp8](%[VP8kclip1]) \n\t"
+ "lbux %[temp6], %[temp7](%[VP8kclip1]) \n\t"
+ "lbux %[temp8], %[temp9](%[VP8kclip1]) \n\t"
+ "sb %[temp2], 0(%[temp15]) \n\t"
+ "sb %[temp3], 0(%[temp11]) \n\t"
+ "sb %[temp4], 0(%[temp14]) \n\t"
+ "sb %[temp5], 0(%[p]) \n\t"
+ "sb %[temp6], 0(%[temp10]) \n\t"
+ "sb %[temp8], 0(%[temp12]) \n\t"
+ "3: \n\t"
+ "bgtz %[size], 1b \n\t"
+ " addu %[p], %[p], %[vstride] \n\t"
+ ".set pop \n\t"
+    : [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+      [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [temp6]"=&r"(temp6),
+      [temp7]"=&r"(temp7), [temp8]"=&r"(temp8), [temp9]"=&r"(temp9),
+      [temp10]"=&r"(temp10), [temp11]"=&r"(temp11), [temp12]"=&r"(temp12),
+      [temp13]"=&r"(temp13), [temp14]"=&r"(temp14), [temp15]"=&r"(temp15),
+      [size]"+&r"(size), [p]"+&r"(p)
+    : [hstride]"r"(hstride), [thresh2]"r"(thresh2), [ithresh]"r"(ithresh),
+      [vstride]"r"(vstride), [hev_thresh]"r"(hev_thresh),
+ [VP8kclip1]"r"(VP8kclip1)
+ : "memory"
+ );
+}
+
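+// As in FilterLoop26, the checks and filters are fused: label "1:" is the
+// high-edge-variance (do_filter2) path, and the fall-through of
+// "bgtz %[q3], 1f" is the do_filter4 path updating p1, p0, q0 and q1.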
+static WEBP_INLINE void FilterLoop24(uint8_t* p,
+ int hstride, int vstride, int size,
+ int thresh, int ithresh, int hev_thresh) {
+ int p0, q0, p1, q1, p2, q2, p3, q3;
+ int step1, step2, temp1, temp2, temp3, temp4;
+ uint8_t* pTemp0;
+ uint8_t* pTemp1;
+ const int thresh2 = 2 * thresh + 1;
+
+ __asm__ volatile (
+ ".set push \n\t"
+ ".set noreorder \n\t"
+ "bltz %[size], 3f \n\t"
+ " nop \n\t"
+ "2: \n\t"
+ "negu %[step1], %[hstride] \n\t"
+ "lbu %[q0], 0(%[p]) \n\t"
+ "lbux %[p0], %[step1](%[p]) \n\t"
+ "subu %[step1], %[step1], %[hstride] \n\t"
+ "lbux %[q1], %[hstride](%[p]) \n\t"
+ "subu %[temp1], %[p0], %[q0] \n\t"
+ "lbux %[p1], %[step1](%[p]) \n\t"
+ "addu %[step2], %[hstride], %[hstride] \n\t"
+ "absq_s.w %[temp2], %[temp1] \n\t"
+ "subu %[temp3], %[p1], %[q1] \n\t"
+ "absq_s.w %[temp4], %[temp3] \n\t"
+ "sll %[temp2], %[temp2], 2 \n\t"
+ "addu %[temp2], %[temp2], %[temp4] \n\t"
+ "subu %[temp4], %[temp2], %[thresh2] \n\t"
+ "subu %[step1], %[step1], %[hstride] \n\t"
+ "bgtz %[temp4], 0f \n\t"
+ " lbux %[p2], %[step1](%[p]) \n\t"
+ "subu %[step1], %[step1], %[hstride] \n\t"
+ "lbux %[q2], %[step2](%[p]) \n\t"
+ "lbux %[p3], %[step1](%[p]) \n\t"
+ "subu %[temp4], %[p2], %[p1] \n\t"
+ "addu %[step2], %[step2], %[hstride] \n\t"
+ "subu %[temp2], %[p3], %[p2] \n\t"
+ "absq_s.w %[temp4], %[temp4] \n\t"
+ "absq_s.w %[temp2], %[temp2] \n\t"
+ "lbux %[q3], %[step2](%[p]) \n\t"
+ "subu %[temp4], %[temp4], %[ithresh] \n\t"
+ "negu %[temp1], %[temp1] \n\t"
+ "bgtz %[temp4], 0f \n\t"
+ " subu %[temp2], %[temp2], %[ithresh] \n\t"
+ "subu %[p3], %[p1], %[p0] \n\t"
+ "bgtz %[temp2], 0f \n\t"
+ " absq_s.w %[p3], %[p3] \n\t"
+ "subu %[temp4], %[q3], %[q2] \n\t"
+ "subu %[pTemp0], %[p], %[hstride] \n\t"
+ "absq_s.w %[temp4], %[temp4] \n\t"
+ "subu %[temp2], %[p3], %[ithresh] \n\t"
+ "sll %[step1], %[temp1], 1 \n\t"
+ "bgtz %[temp2], 0f \n\t"
+ " subu %[temp4], %[temp4], %[ithresh] \n\t"
+ "subu %[temp2], %[q2], %[q1] \n\t"
+ "bgtz %[temp4], 0f \n\t"
+ " absq_s.w %[temp2], %[temp2] \n\t"
+ "subu %[q3], %[q1], %[q0] \n\t"
+ "absq_s.w %[q3], %[q3] \n\t"
+ "subu %[temp2], %[temp2], %[ithresh] \n\t"
+ "addu %[temp1], %[temp1], %[step1] \n\t"
+ "bgtz %[temp2], 0f \n\t"
+ " subu %[temp4], %[q3], %[ithresh] \n\t"
+ "slt %[p3], %[hev_thresh], %[p3] \n\t"
+ "bgtz %[temp4], 0f \n\t"
+ " slt %[q3], %[hev_thresh], %[q3] \n\t"
+ "or %[q3], %[q3], %[p3] \n\t"
+ "bgtz %[q3], 1f \n\t"
+ " shra_r.w %[temp2], %[temp1], 3 \n\t"
+ "addiu %[temp1], %[temp1], 3 \n\t"
+ "sra %[temp1], %[temp1], 3 \n\t"
+ "shll_s.w %[temp2], %[temp2], 27 \n\t"
+ "shll_s.w %[temp1], %[temp1], 27 \n\t"
+ "addu %[pTemp1], %[p], %[hstride] \n\t"
+ "sra %[temp2], %[temp2], 27 \n\t"
+ "sra %[temp1], %[temp1], 27 \n\t"
+ "addiu %[step1], %[temp2], 1 \n\t"
+ "sra %[step1], %[step1], 1 \n\t"
+ "addu %[p0], %[p0], %[temp1] \n\t"
+ "addu %[p1], %[p1], %[step1] \n\t"
+ "subu %[q0], %[q0], %[temp2] \n\t"
+ "subu %[q1], %[q1], %[step1] \n\t"
+ "lbux %[temp2], %[p0](%[VP8kclip1]) \n\t"
+ "lbux %[temp3], %[q0](%[VP8kclip1]) \n\t"
+ "lbux %[temp4], %[q1](%[VP8kclip1]) \n\t"
+ "sb %[temp2], 0(%[pTemp0]) \n\t"
+ "lbux %[temp1], %[p1](%[VP8kclip1]) \n\t"
+ "subu %[pTemp0], %[pTemp0], %[hstride] \n\t"
+ "sb %[temp3], 0(%[p]) \n\t"
+ "sb %[temp4], 0(%[pTemp1]) \n\t"
+ "j 0f \n\t"
+ " sb %[temp1], 0(%[pTemp0]) \n\t"
+ "1: \n\t"
+ "shll_s.w %[temp3], %[temp3], 24 \n\t"
+ "sra %[temp3], %[temp3], 24 \n\t"
+ "addu %[temp1], %[temp1], %[temp3] \n\t"
+ "shra_r.w %[temp2], %[temp1], 3 \n\t"
+ "addiu %[temp1], %[temp1], 3 \n\t"
+ "shll_s.w %[temp2], %[temp2], 27 \n\t"
+ "sra %[temp1], %[temp1], 3 \n\t"
+ "shll_s.w %[temp1], %[temp1], 27 \n\t"
+ "sra %[temp2], %[temp2], 27 \n\t"
+ "sra %[temp1], %[temp1], 27 \n\t"
+ "addu %[p0], %[p0], %[temp1] \n\t"
+ "subu %[q0], %[q0], %[temp2] \n\t"
+ "lbux %[temp1], %[p0](%[VP8kclip1]) \n\t"
+ "lbux %[temp2], %[q0](%[VP8kclip1]) \n\t"
+ "sb %[temp2], 0(%[p]) \n\t"
+ "sb %[temp1], 0(%[pTemp0]) \n\t"
+ "0: \n\t"
+ "subu %[size], %[size], 1 \n\t"
+ "bgtz %[size], 2b \n\t"
+ " addu %[p], %[p], %[vstride] \n\t"
+ "3: \n\t"
+ ".set pop \n\t"
+ : [p0]"=&r"(p0), [q0]"=&r"(q0), [p1]"=&r"(p1), [q1]"=&r"(q1),
+ [p2]"=&r"(p2), [q2]"=&r"(q2), [p3]"=&r"(p3), [q3]"=&r"(q3),
+ [step2]"=&r"(step2), [step1]"=&r"(step1), [temp1]"=&r"(temp1),
+ [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [temp4]"=&r"(temp4),
+ [pTemp0]"=&r"(pTemp0), [pTemp1]"=&r"(pTemp1), [p]"+&r"(p),
+ [size]"+&r"(size)
+ : [vstride]"r"(vstride), [ithresh]"r"(ithresh),
+ [hev_thresh]"r"(hev_thresh), [hstride]"r"(hstride),
+ [VP8kclip1]"r"(VP8kclip1), [thresh2]"r"(thresh2)
+ : "memory"
+ );
+}
+
+// on macroblock edges
+static void VFilter16(uint8_t* p, int stride,
+ int thresh, int ithresh, int hev_thresh) {
+ FilterLoop26(p, stride, 1, 16, thresh, ithresh, hev_thresh);
+}
+
+static void HFilter16(uint8_t* p, int stride,
+ int thresh, int ithresh, int hev_thresh) {
+ FilterLoop26(p, 1, stride, 16, thresh, ithresh, hev_thresh);
+}
+
+// 8-pixel-wide variant, for chroma filtering
+static void VFilter8(uint8_t* u, uint8_t* v, int stride,
+ int thresh, int ithresh, int hev_thresh) {
+ FilterLoop26(u, stride, 1, 8, thresh, ithresh, hev_thresh);
+ FilterLoop26(v, stride, 1, 8, thresh, ithresh, hev_thresh);
+}
+
+static void HFilter8(uint8_t* u, uint8_t* v, int stride,
+ int thresh, int ithresh, int hev_thresh) {
+ FilterLoop26(u, 1, stride, 8, thresh, ithresh, hev_thresh);
+ FilterLoop26(v, 1, stride, 8, thresh, ithresh, hev_thresh);
+}
+
+// on three inner edges
+static void VFilter16i(uint8_t* p, int stride,
+ int thresh, int ithresh, int hev_thresh) {
+ int k;
+ for (k = 3; k > 0; --k) {
+ p += 4 * stride;
+ FilterLoop24(p, stride, 1, 16, thresh, ithresh, hev_thresh);
+ }
+}
+
+static void HFilter16i(uint8_t* p, int stride,
+ int thresh, int ithresh, int hev_thresh) {
+ int k;
+ for (k = 3; k > 0; --k) {
+ p += 4;
+ FilterLoop24(p, 1, stride, 16, thresh, ithresh, hev_thresh);
+ }
+}
+
+static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
+ int thresh, int ithresh, int hev_thresh) {
+ FilterLoop24(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
+ FilterLoop24(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
+}
+
+static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
+ int thresh, int ithresh, int hev_thresh) {
+ FilterLoop24(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
+ FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
+}
+
+#undef MUL
+
+//------------------------------------------------------------------------------
+// Simple In-loop filtering (Paragraph 15.2)
+
+static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
+ int i;
+ const int thresh2 = 2 * thresh + 1;
+ int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
+ uint8_t* p1 = p - stride;
+ __asm__ volatile (
+ ".set push \n\t"
+ ".set noreorder \n\t"
+ "li %[i], 16 \n\t"
+ "0: \n\t"
+ "negu %[temp4], %[stride] \n\t"
+ "sll %[temp5], %[temp4], 1 \n\t"
+ "lbu %[temp2], 0(%[p]) \n\t"
+ "lbux %[temp3], %[stride](%[p]) \n\t"
+ "lbux %[temp1], %[temp4](%[p]) \n\t"
+ "lbux %[temp0], %[temp5](%[p]) \n\t"
+ "subu %[temp7], %[temp1], %[temp2] \n\t"
+ "subu %[temp6], %[temp0], %[temp3] \n\t"
+ "absq_s.w %[temp4], %[temp7] \n\t"
+ "absq_s.w %[temp5], %[temp6] \n\t"
+ "sll %[temp4], %[temp4], 2 \n\t"
+ "subu %[temp5], %[temp5], %[thresh2] \n\t"
+ "addu %[temp5], %[temp4], %[temp5] \n\t"
+ "negu %[temp8], %[temp7] \n\t"
+ "bgtz %[temp5], 1f \n\t"
+ " addiu %[i], %[i], -1 \n\t"
+ "sll %[temp4], %[temp8], 1 \n\t"
+ "shll_s.w %[temp5], %[temp6], 24 \n\t"
+ "addu %[temp3], %[temp4], %[temp8] \n\t"
+ "sra %[temp5], %[temp5], 24 \n\t"
+ "addu %[temp3], %[temp3], %[temp5] \n\t"
+ "addiu %[temp7], %[temp3], 3 \n\t"
+ "sra %[temp7], %[temp7], 3 \n\t"
+ "shra_r.w %[temp8], %[temp3], 3 \n\t"
+ "shll_s.w %[temp0], %[temp7], 27 \n\t"
+ "shll_s.w %[temp4], %[temp8], 27 \n\t"
+ "sra %[temp0], %[temp0], 27 \n\t"
+ "sra %[temp4], %[temp4], 27 \n\t"
+ "addu %[temp7], %[temp1], %[temp0] \n\t"
+ "subu %[temp2], %[temp2], %[temp4] \n\t"
+ "lbux %[temp3], %[temp7](%[VP8kclip1]) \n\t"
+ "lbux %[temp4], %[temp2](%[VP8kclip1]) \n\t"
+ "sb %[temp3], 0(%[p1]) \n\t"
+ "sb %[temp4], 0(%[p]) \n\t"
+ "1: \n\t"
+ "addiu %[p1], %[p1], 1 \n\t"
+ "bgtz %[i], 0b \n\t"
+ " addiu %[p], %[p], 1 \n\t"
+    ".set pop                                       \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
+ [p]"+&r"(p), [i]"=&r"(i), [p1]"+&r"(p1)
+ : [stride]"r"(stride), [VP8kclip1]"r"(VP8kclip1), [thresh2]"r"(thresh2)
+ : "memory"
+ );
+}
+
+// TEMP0 = SRC[A + A1 * BPS]
+// TEMP1 = SRC[B + B1 * BPS]
+// TEMP2 = SRC[C + C1 * BPS]
+// TEMP3 = SRC[D + D1 * BPS]
+#define LOAD_4_BYTES(TEMP0, TEMP1, TEMP2, TEMP3, \
+ A, A1, B, B1, C, C1, D, D1, SRC) \
+ "lbu %[" #TEMP0 "], " #A "+" #A1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t" \
+ "lbu %[" #TEMP1 "], " #B "+" #B1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t" \
+ "lbu %[" #TEMP2 "], " #C "+" #C1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t" \
+ "lbu %[" #TEMP3 "], " #D "+" #D1 "*" XSTR(BPS) "(%[" #SRC "]) \n\t" \
+
+static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
+ int i;
+ const int thresh2 = 2 * thresh + 1;
+ int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
+ __asm__ volatile (
+ ".set push \n\t"
+ ".set noreorder \n\t"
+ "li %[i], 16 \n\t"
+ "0: \n\t"
+ LOAD_4_BYTES(temp0, temp1, temp2, temp3, -2, 0, -1, 0, 0, 0, 1, 0, p)
+ "subu %[temp7], %[temp1], %[temp2] \n\t"
+ "subu %[temp6], %[temp0], %[temp3] \n\t"
+ "absq_s.w %[temp4], %[temp7] \n\t"
+ "absq_s.w %[temp5], %[temp6] \n\t"
+ "sll %[temp4], %[temp4], 2 \n\t"
+ "addu %[temp5], %[temp4], %[temp5] \n\t"
+ "subu %[temp5], %[temp5], %[thresh2] \n\t"
+ "negu %[temp8], %[temp7] \n\t"
+ "bgtz %[temp5], 1f \n\t"
+ " addiu %[i], %[i], -1 \n\t"
+ "sll %[temp4], %[temp8], 1 \n\t"
+ "shll_s.w %[temp5], %[temp6], 24 \n\t"
+ "addu %[temp3], %[temp4], %[temp8] \n\t"
+ "sra %[temp5], %[temp5], 24 \n\t"
+ "addu %[temp3], %[temp3], %[temp5] \n\t"
+ "addiu %[temp7], %[temp3], 3 \n\t"
+ "sra %[temp7], %[temp7], 3 \n\t"
+ "shra_r.w %[temp8], %[temp3], 3 \n\t"
+ "shll_s.w %[temp0], %[temp7], 27 \n\t"
+ "shll_s.w %[temp4], %[temp8], 27 \n\t"
+ "sra %[temp0], %[temp0], 27 \n\t"
+ "sra %[temp4], %[temp4], 27 \n\t"
+ "addu %[temp7], %[temp1], %[temp0] \n\t"
+ "subu %[temp2], %[temp2], %[temp4] \n\t"
+ "lbux %[temp3], %[temp7](%[VP8kclip1]) \n\t"
+ "lbux %[temp4], %[temp2](%[VP8kclip1]) \n\t"
+ "sb %[temp3], -1(%[p]) \n\t"
+ "sb %[temp4], 0(%[p]) \n\t"
+ "1: \n\t"
+ "bgtz %[i], 0b \n\t"
+ " addu %[p], %[p], %[stride] \n\t"
+ ".set pop \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
+ [p]"+&r"(p), [i]"=&r"(i)
+ : [stride]"r"(stride), [VP8kclip1]"r"(VP8kclip1), [thresh2]"r"(thresh2)
+ : "memory"
+ );
+}
+
+static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
+ int k;
+ for (k = 3; k > 0; --k) {
+ p += 4 * stride;
+ SimpleVFilter16(p, stride, thresh);
+ }
+}
+
+static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
+ int k;
+ for (k = 3; k > 0; --k) {
+ p += 4;
+ SimpleHFilter16(p, stride, thresh);
+ }
+}
+
+// DST[A * BPS] = TEMP0
+// DST[B + C * BPS] = TEMP1
+#define STORE_8_BYTES(TEMP0, TEMP1, A, B, C, DST) \
+ "usw %[" #TEMP0 "], " #A "*" XSTR(BPS) "(%[" #DST "]) \n\t" \
+ "usw %[" #TEMP1 "], " #B "+" #C "*" XSTR(BPS) "(%[" #DST "]) \n\t"
+
+static void VE4(uint8_t* dst) { // vertical
+ const uint8_t* top = dst - BPS;
+ int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
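+  // Each predicted byte is (top[i - 1] + 2 * top[i] + top[i + 1] + 2) >> 2;
+  // the same 4-byte result is written to all four rows.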
+ __asm__ volatile (
+ "ulw %[temp0], -1(%[top]) \n\t"
+ "ulh %[temp1], 3(%[top]) \n\t"
+ "preceu.ph.qbr %[temp2], %[temp0] \n\t"
+ "preceu.ph.qbl %[temp3], %[temp0] \n\t"
+ "preceu.ph.qbr %[temp4], %[temp1] \n\t"
+ "packrl.ph %[temp5], %[temp3], %[temp2] \n\t"
+ "packrl.ph %[temp6], %[temp4], %[temp3] \n\t"
+ "shll.ph %[temp5], %[temp5], 1 \n\t"
+ "shll.ph %[temp6], %[temp6], 1 \n\t"
+ "addq.ph %[temp2], %[temp5], %[temp2] \n\t"
+ "addq.ph %[temp6], %[temp6], %[temp4] \n\t"
+ "addq.ph %[temp2], %[temp2], %[temp3] \n\t"
+ "addq.ph %[temp6], %[temp6], %[temp3] \n\t"
+ "shra_r.ph %[temp2], %[temp2], 2 \n\t"
+ "shra_r.ph %[temp6], %[temp6], 2 \n\t"
+ "precr.qb.ph %[temp4], %[temp6], %[temp2] \n\t"
+ STORE_8_BYTES(temp4, temp4, 0, 0, 1, dst)
+ STORE_8_BYTES(temp4, temp4, 2, 0, 3, dst)
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6)
+ : [top]"r"(top), [dst]"r"(dst)
+ : "memory"
+ );
+}
+
+static void DC4(uint8_t* dst) { // DC
+ int temp0, temp1, temp2, temp3, temp4;
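+  // DC = (sum of the 4 top and 4 left pixels + 4) >> 3, replicated over the
+  // whole 4x4 block.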
+ __asm__ volatile (
+ "ulw %[temp0], -1*" XSTR(BPS) "(%[dst]) \n\t"
+ LOAD_4_BYTES(temp1, temp2, temp3, temp4, -1, 0, -1, 1, -1, 2, -1, 3, dst)
+ "ins %[temp1], %[temp2], 8, 8 \n\t"
+ "ins %[temp1], %[temp3], 16, 8 \n\t"
+ "ins %[temp1], %[temp4], 24, 8 \n\t"
+ "raddu.w.qb %[temp0], %[temp0] \n\t"
+ "raddu.w.qb %[temp1], %[temp1] \n\t"
+ "addu %[temp0], %[temp0], %[temp1] \n\t"
+ "shra_r.w %[temp0], %[temp0], 3 \n\t"
+ "replv.qb %[temp0], %[temp0] \n\t"
+ STORE_8_BYTES(temp0, temp0, 0, 0, 1, dst)
+ STORE_8_BYTES(temp0, temp0, 2, 0, 3, dst)
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4)
+ : [dst]"r"(dst)
+ : "memory"
+ );
+}
+
+static void RD4(uint8_t* dst) { // Down-right
+ int temp0, temp1, temp2, temp3, temp4;
+ int temp5, temp6, temp7, temp8;
+ __asm__ volatile (
+ LOAD_4_BYTES(temp0, temp1, temp2, temp3, -1, 0, -1, 1, -1, 2, -1, 3, dst)
+ "ulw %[temp7], -1-" XSTR(BPS) "(%[dst]) \n\t"
+ "ins %[temp1], %[temp0], 16, 16 \n\t"
+ "preceu.ph.qbr %[temp5], %[temp7] \n\t"
+ "ins %[temp2], %[temp1], 16, 16 \n\t"
+ "preceu.ph.qbl %[temp4], %[temp7] \n\t"
+ "ins %[temp3], %[temp2], 16, 16 \n\t"
+ "shll.ph %[temp2], %[temp2], 1 \n\t"
+ "addq.ph %[temp3], %[temp3], %[temp1] \n\t"
+ "packrl.ph %[temp6], %[temp5], %[temp1] \n\t"
+ "addq.ph %[temp3], %[temp3], %[temp2] \n\t"
+ "addq.ph %[temp1], %[temp1], %[temp5] \n\t"
+ "shll.ph %[temp6], %[temp6], 1 \n\t"
+ "addq.ph %[temp1], %[temp1], %[temp6] \n\t"
+ "packrl.ph %[temp0], %[temp4], %[temp5] \n\t"
+ "addq.ph %[temp8], %[temp5], %[temp4] \n\t"
+ "shra_r.ph %[temp3], %[temp3], 2 \n\t"
+ "shll.ph %[temp0], %[temp0], 1 \n\t"
+ "shra_r.ph %[temp1], %[temp1], 2 \n\t"
+ "addq.ph %[temp8], %[temp0], %[temp8] \n\t"
+ "lbu %[temp5], 3-" XSTR(BPS) "(%[dst]) \n\t"
+ "precrq.ph.w %[temp7], %[temp7], %[temp7] \n\t"
+ "shra_r.ph %[temp8], %[temp8], 2 \n\t"
+ "ins %[temp7], %[temp5], 0, 8 \n\t"
+ "precr.qb.ph %[temp2], %[temp1], %[temp3] \n\t"
+ "raddu.w.qb %[temp4], %[temp7] \n\t"
+ "precr.qb.ph %[temp6], %[temp8], %[temp1] \n\t"
+ "shra_r.w %[temp4], %[temp4], 2 \n\t"
+ STORE_8_BYTES(temp2, temp6, 3, 0, 1, dst)
+ "prepend %[temp2], %[temp8], 8 \n\t"
+ "prepend %[temp6], %[temp4], 8 \n\t"
+ STORE_8_BYTES(temp2, temp6, 2, 0, 0, dst)
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8)
+ : [dst]"r"(dst)
+ : "memory"
+ );
+}
+
+// TEMP0 = SRC[A * BPS]
+// TEMP1 = SRC[B + C * BPS]
+#define LOAD_8_BYTES(TEMP0, TEMP1, A, B, C, SRC) \
+ "ulw %[" #TEMP0 "], " #A "*" XSTR(BPS) "(%[" #SRC "]) \n\t" \
+ "ulw %[" #TEMP1 "], " #B "+" #C "*" XSTR(BPS) "(%[" #SRC "]) \n\t"
+
+static void LD4(uint8_t* dst) { // Down-Left
+ int temp0, temp1, temp2, temp3, temp4;
+ int temp5, temp6, temp7, temp8, temp9;
+ __asm__ volatile (
+ LOAD_8_BYTES(temp0, temp1, -1, 4, -1, dst)
+ "preceu.ph.qbl %[temp2], %[temp0] \n\t"
+ "preceu.ph.qbr %[temp3], %[temp0] \n\t"
+ "preceu.ph.qbr %[temp4], %[temp1] \n\t"
+ "preceu.ph.qbl %[temp5], %[temp1] \n\t"
+ "packrl.ph %[temp6], %[temp2], %[temp3] \n\t"
+ "packrl.ph %[temp7], %[temp4], %[temp2] \n\t"
+ "packrl.ph %[temp8], %[temp5], %[temp4] \n\t"
+ "shll.ph %[temp6], %[temp6], 1 \n\t"
+ "addq.ph %[temp9], %[temp2], %[temp6] \n\t"
+ "shll.ph %[temp7], %[temp7], 1 \n\t"
+ "addq.ph %[temp9], %[temp9], %[temp3] \n\t"
+ "shll.ph %[temp8], %[temp8], 1 \n\t"
+ "shra_r.ph %[temp9], %[temp9], 2 \n\t"
+ "addq.ph %[temp3], %[temp4], %[temp7] \n\t"
+ "addq.ph %[temp0], %[temp5], %[temp8] \n\t"
+ "addq.ph %[temp3], %[temp3], %[temp2] \n\t"
+ "addq.ph %[temp0], %[temp0], %[temp4] \n\t"
+ "shra_r.ph %[temp3], %[temp3], 2 \n\t"
+ "shra_r.ph %[temp0], %[temp0], 2 \n\t"
+ "srl %[temp1], %[temp1], 24 \n\t"
+ "sll %[temp1], %[temp1], 1 \n\t"
+ "raddu.w.qb %[temp5], %[temp5] \n\t"
+ "precr.qb.ph %[temp9], %[temp3], %[temp9] \n\t"
+ "precr.qb.ph %[temp3], %[temp0], %[temp3] \n\t"
+ "addu %[temp1], %[temp1], %[temp5] \n\t"
+ "shra_r.w %[temp1], %[temp1], 2 \n\t"
+ STORE_8_BYTES(temp9, temp3, 0, 0, 2, dst)
+ "prepend %[temp9], %[temp0], 8 \n\t"
+ "prepend %[temp3], %[temp1], 8 \n\t"
+ STORE_8_BYTES(temp9, temp3, 1, 0, 3, dst)
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
+ [temp9]"=&r"(temp9)
+ : [dst]"r"(dst)
+ : "memory"
+ );
+}
+
+//------------------------------------------------------------------------------
+// Chroma
+
+static void DC8uv(uint8_t* dst) { // DC
+ int temp0, temp1, temp2, temp3, temp4;
+ int temp5, temp6, temp7, temp8, temp9;
+ __asm__ volatile (
+ LOAD_8_BYTES(temp0, temp1, -1, 4, -1, dst)
+ LOAD_4_BYTES(temp2, temp3, temp4, temp5, -1, 0, -1, 1, -1, 2, -1, 3, dst)
+ LOAD_4_BYTES(temp6, temp7, temp8, temp9, -1, 4, -1, 5, -1, 6, -1, 7, dst)
+ "raddu.w.qb %[temp0], %[temp0] \n\t"
+ "raddu.w.qb %[temp1], %[temp1] \n\t"
+ "addu %[temp2], %[temp2], %[temp3] \n\t"
+ "addu %[temp4], %[temp4], %[temp5] \n\t"
+ "addu %[temp6], %[temp6], %[temp7] \n\t"
+ "addu %[temp8], %[temp8], %[temp9] \n\t"
+ "addu %[temp0], %[temp0], %[temp1] \n\t"
+ "addu %[temp2], %[temp2], %[temp4] \n\t"
+ "addu %[temp6], %[temp6], %[temp8] \n\t"
+ "addu %[temp0], %[temp0], %[temp2] \n\t"
+ "addu %[temp0], %[temp0], %[temp6] \n\t"
+ "shra_r.w %[temp0], %[temp0], 4 \n\t"
+ "replv.qb %[temp0], %[temp0] \n\t"
+ STORE_8_BYTES(temp0, temp0, 0, 4, 0, dst)
+ STORE_8_BYTES(temp0, temp0, 1, 4, 1, dst)
+ STORE_8_BYTES(temp0, temp0, 2, 4, 2, dst)
+ STORE_8_BYTES(temp0, temp0, 3, 4, 3, dst)
+ STORE_8_BYTES(temp0, temp0, 4, 4, 4, dst)
+ STORE_8_BYTES(temp0, temp0, 5, 4, 5, dst)
+ STORE_8_BYTES(temp0, temp0, 6, 4, 6, dst)
+ STORE_8_BYTES(temp0, temp0, 7, 4, 7, dst)
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
+ [temp9]"=&r"(temp9)
+ : [dst]"r"(dst)
+ : "memory"
+ );
+}
+
+static void DC8uvNoLeft(uint8_t* dst) { // DC with no left samples
+ int temp0, temp1;
+ __asm__ volatile (
+ LOAD_8_BYTES(temp0, temp1, -1, 4, -1, dst)
+ "raddu.w.qb %[temp0], %[temp0] \n\t"
+ "raddu.w.qb %[temp1], %[temp1] \n\t"
+ "addu %[temp0], %[temp0], %[temp1] \n\t"
+ "shra_r.w %[temp0], %[temp0], 3 \n\t"
+ "replv.qb %[temp0], %[temp0] \n\t"
+ STORE_8_BYTES(temp0, temp0, 0, 4, 0, dst)
+ STORE_8_BYTES(temp0, temp0, 1, 4, 1, dst)
+ STORE_8_BYTES(temp0, temp0, 2, 4, 2, dst)
+ STORE_8_BYTES(temp0, temp0, 3, 4, 3, dst)
+ STORE_8_BYTES(temp0, temp0, 4, 4, 4, dst)
+ STORE_8_BYTES(temp0, temp0, 5, 4, 5, dst)
+ STORE_8_BYTES(temp0, temp0, 6, 4, 6, dst)
+ STORE_8_BYTES(temp0, temp0, 7, 4, 7, dst)
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1)
+ : [dst]"r"(dst)
+ : "memory"
+ );
+}
+
+static void DC8uvNoTop(uint8_t* dst) { // DC with no top samples
+ int temp0, temp1, temp2, temp3, temp4;
+ int temp5, temp6, temp7, temp8;
+ __asm__ volatile (
+ LOAD_4_BYTES(temp2, temp3, temp4, temp5, -1, 0, -1, 1, -1, 2, -1, 3, dst)
+ LOAD_4_BYTES(temp6, temp7, temp8, temp1, -1, 4, -1, 5, -1, 6, -1, 7, dst)
+ "addu %[temp2], %[temp2], %[temp3] \n\t"
+ "addu %[temp4], %[temp4], %[temp5] \n\t"
+ "addu %[temp6], %[temp6], %[temp7] \n\t"
+ "addu %[temp8], %[temp8], %[temp1] \n\t"
+ "addu %[temp2], %[temp2], %[temp4] \n\t"
+ "addu %[temp6], %[temp6], %[temp8] \n\t"
+ "addu %[temp0], %[temp6], %[temp2] \n\t"
+ "shra_r.w %[temp0], %[temp0], 3 \n\t"
+ "replv.qb %[temp0], %[temp0] \n\t"
+ STORE_8_BYTES(temp0, temp0, 0, 4, 0, dst)
+ STORE_8_BYTES(temp0, temp0, 1, 4, 1, dst)
+ STORE_8_BYTES(temp0, temp0, 2, 4, 2, dst)
+ STORE_8_BYTES(temp0, temp0, 3, 4, 3, dst)
+ STORE_8_BYTES(temp0, temp0, 4, 4, 4, dst)
+ STORE_8_BYTES(temp0, temp0, 5, 4, 5, dst)
+ STORE_8_BYTES(temp0, temp0, 6, 4, 6, dst)
+ STORE_8_BYTES(temp0, temp0, 7, 4, 7, dst)
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8)
+ : [dst]"r"(dst)
+ : "memory"
+ );
+}
+
+#undef LOAD_8_BYTES
+#undef STORE_8_BYTES
+#undef LOAD_4_BYTES
+
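+// TrueMotion prediction: dst(x, y) = clip(left(y) + top(x) - top_left).
+// CLIPPING() adds the per-row constant (left - top_left) to the zero-extended
+// top pixels and clamps to [0, 255] with saturating shifts.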
+#define CLIPPING(SIZE) \
+ "preceu.ph.qbl %[temp2], %[temp0] \n\t" \
+ "preceu.ph.qbr %[temp0], %[temp0] \n\t" \
+".if " #SIZE " == 8 \n\t" \
+ "preceu.ph.qbl %[temp3], %[temp1] \n\t" \
+ "preceu.ph.qbr %[temp1], %[temp1] \n\t" \
+".endif \n\t" \
+ "addu.ph %[temp2], %[temp2], %[dst_1] \n\t" \
+ "addu.ph %[temp0], %[temp0], %[dst_1] \n\t" \
+".if " #SIZE " == 8 \n\t" \
+ "addu.ph %[temp3], %[temp3], %[dst_1] \n\t" \
+ "addu.ph %[temp1], %[temp1], %[dst_1] \n\t" \
+".endif \n\t" \
+ "shll_s.ph %[temp2], %[temp2], 7 \n\t" \
+ "shll_s.ph %[temp0], %[temp0], 7 \n\t" \
+".if " #SIZE " == 8 \n\t" \
+ "shll_s.ph %[temp3], %[temp3], 7 \n\t" \
+ "shll_s.ph %[temp1], %[temp1], 7 \n\t" \
+".endif \n\t" \
+ "precrqu_s.qb.ph %[temp0], %[temp2], %[temp0] \n\t" \
+".if " #SIZE " == 8 \n\t" \
+ "precrqu_s.qb.ph %[temp1], %[temp3], %[temp1] \n\t" \
+".endif \n\t"
+
+#define CLIP_8B_TO_DST(DST, TOP, SIZE) do { \
+ int dst_1 = ((int)(DST)[-1] << 16) + (DST)[-1]; \
+ int temp0, temp1, temp2, temp3; \
+ __asm__ volatile ( \
+ ".if " #SIZE " < 8 \n\t" \
+ "ulw %[temp0], 0(%[top]) \n\t" \
+ "subu.ph %[dst_1], %[dst_1], %[top_1] \n\t" \
+ CLIPPING(4) \
+ "usw %[temp0], 0(%[dst]) \n\t" \
+ ".else \n\t" \
+ "ulw %[temp0], 0(%[top]) \n\t" \
+ "ulw %[temp1], 4(%[top]) \n\t" \
+ "subu.ph %[dst_1], %[dst_1], %[top_1] \n\t" \
+ CLIPPING(8) \
+ "usw %[temp0], 0(%[dst]) \n\t" \
+ "usw %[temp1], 4(%[dst]) \n\t" \
+ ".if " #SIZE " == 16 \n\t" \
+ "ulw %[temp0], 8(%[top]) \n\t" \
+ "ulw %[temp1], 12(%[top]) \n\t" \
+ CLIPPING(8) \
+ "usw %[temp0], 8(%[dst]) \n\t" \
+ "usw %[temp1], 12(%[dst]) \n\t" \
+ ".endif \n\t" \
+ ".endif \n\t" \
+ : [dst_1]"+&r"(dst_1), [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), \
+ [temp2]"=&r"(temp2), [temp3]"=&r"(temp3) \
+ : [top_1]"r"(top_1), [top]"r"((TOP)), [dst]"r"((DST)) \
+ : "memory" \
+ ); \
+} while (0)
+
+#define CLIP_TO_DST(DST, SIZE) do { \
+ int y; \
+ const uint8_t* top = (DST) - BPS; \
+ const int top_1 = ((int)top[-1] << 16) + top[-1]; \
+ for (y = 0; y < (SIZE); ++y) { \
+ CLIP_8B_TO_DST((DST), top, (SIZE)); \
+ (DST) += BPS; \
+ } \
+} while (0)
+
+#define TRUE_MOTION(DST, SIZE) \
+static void TrueMotion##SIZE(uint8_t* (DST)) { \
+ CLIP_TO_DST((DST), (SIZE)); \
+}
+
+TRUE_MOTION(dst, 4)
+TRUE_MOTION(dst, 8)
+TRUE_MOTION(dst, 16)
+
+#undef TRUE_MOTION
+#undef CLIP_TO_DST
+#undef CLIP_8B_TO_DST
+#undef CLIPPING
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8DspInitMIPSdspR2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitMIPSdspR2(void) {
+ VP8TransformDC = TransformDC;
+ VP8TransformAC3 = TransformAC3;
+ VP8Transform = TransformTwo;
+
+ VP8VFilter16 = VFilter16;
+ VP8HFilter16 = HFilter16;
+ VP8VFilter8 = VFilter8;
+ VP8HFilter8 = HFilter8;
+ VP8VFilter16i = VFilter16i;
+ VP8HFilter16i = HFilter16i;
+ VP8VFilter8i = VFilter8i;
+ VP8HFilter8i = HFilter8i;
+ VP8SimpleVFilter16 = SimpleVFilter16;
+ VP8SimpleHFilter16 = SimpleHFilter16;
+ VP8SimpleVFilter16i = SimpleVFilter16i;
+ VP8SimpleHFilter16i = SimpleHFilter16i;
+
+ VP8PredLuma4[0] = DC4;
+ VP8PredLuma4[1] = TrueMotion4;
+ VP8PredLuma4[2] = VE4;
+ VP8PredLuma4[4] = RD4;
+ VP8PredLuma4[6] = LD4;
+
+ VP8PredChroma8[0] = DC8uv;
+ VP8PredChroma8[1] = TrueMotion8;
+ VP8PredChroma8[4] = DC8uvNoTop;
+ VP8PredChroma8[5] = DC8uvNoLeft;
+
+ VP8PredLuma16[1] = TrueMotion16;
+}
+
+#else // !WEBP_USE_MIPS_DSP_R2
+
+WEBP_DSP_INIT_STUB(VP8DspInitMIPSdspR2)
+
+#endif // WEBP_USE_MIPS_DSP_R2
diff --git a/src/3rdparty/libwebp/src/dsp/dec_neon.c b/src/3rdparty/libwebp/src/dsp/dec_neon.c
index 4afae07..a63f43f 100644
--- a/src/3rdparty/libwebp/src/dsp/dec_neon.c
+++ b/src/3rdparty/libwebp/src/dsp/dec_neon.c
@@ -389,9 +389,9 @@ static WEBP_INLINE void Store4x8x2(const uint8x16_t p1, const uint8x16_t p0,
#endif // !WORK_AROUND_GCC
-// Treats 'v' as an uint8x8_t and zero extends to an int16x8_t.
-static WEBP_INLINE int16x8_t ConvertU8ToS16(uint32x2_t v) {
- return vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_u32(v)));
+// Zero extend 'v' to an int16x8_t.
+static WEBP_INLINE int16x8_t ConvertU8ToS16(uint8x8_t v) {
+ return vreinterpretq_s16_u16(vmovl_u8(v));
}
// Performs unsigned 8b saturation on 'dst01' and 'dst23' storing the result
@@ -423,8 +423,8 @@ static WEBP_INLINE void Add4x4(const int16x8_t row01, const int16x8_t row23,
{
// Convert to 16b.
- const int16x8_t dst01_s16 = ConvertU8ToS16(dst01);
- const int16x8_t dst23_s16 = ConvertU8ToS16(dst23);
+ const int16x8_t dst01_s16 = ConvertU8ToS16(vreinterpret_u8_u32(dst01));
+ const int16x8_t dst23_s16 = ConvertU8ToS16(vreinterpret_u8_u32(dst23));
// Descale with rounding.
const int16x8_t out01 = vrsraq_n_s16(dst01_s16, row01, 3);
@@ -479,6 +479,21 @@ static int8x16_t GetBaseDelta0(const int8x16_t p0, const int8x16_t q0) {
//------------------------------------------------------------------------------
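+// Same as ApplyFilter2(), but keeps the pixels in the sign-flipped (int8)
+// domain: p0 += (delta + 3) >> 3, q0 -= (delta + 4) >> 3, with saturation.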
+static void ApplyFilter2NoFlip(const int8x16_t p0s, const int8x16_t q0s,
+ const int8x16_t delta,
+ int8x16_t* const op0, int8x16_t* const oq0) {
+ const int8x16_t kCst3 = vdupq_n_s8(0x03);
+ const int8x16_t kCst4 = vdupq_n_s8(0x04);
+ const int8x16_t delta_p3 = vqaddq_s8(delta, kCst3);
+ const int8x16_t delta_p4 = vqaddq_s8(delta, kCst4);
+ const int8x16_t delta3 = vshrq_n_s8(delta_p3, 3);
+ const int8x16_t delta4 = vshrq_n_s8(delta_p4, 3);
+ *op0 = vqaddq_s8(p0s, delta3);
+ *oq0 = vqsubq_s8(q0s, delta4);
+}
+
+#if defined(WEBP_USE_INTRINSICS)
+
static void ApplyFilter2(const int8x16_t p0s, const int8x16_t q0s,
const int8x16_t delta,
uint8x16_t* const op0, uint8x16_t* const oq0) {
@@ -494,8 +509,6 @@ static void ApplyFilter2(const int8x16_t p0s, const int8x16_t q0s,
*oq0 = FlipSignBack(sq0);
}
-#if defined(USE_INTRINSICS)
-
static void DoFilter2(const uint8x16_t p1, const uint8x16_t p0,
const uint8x16_t q0, const uint8x16_t q1,
const uint8x16_t mask,
@@ -626,7 +639,7 @@ static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
);
}
-#endif // USE_INTRINSICS
+#endif // WEBP_USE_INTRINSICS
static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
uint32_t k;
@@ -721,11 +734,7 @@ static void DoFilter4(
const int8x16_t delta = GetBaseDelta(p1s, p0s, q0s, q1s);
const int8x16_t simple_lf_delta =
vandq_s8(delta, vreinterpretq_s8_u8(simple_lf_mask));
- uint8x16_t tmp_p0, tmp_q0;
- ApplyFilter2(p0s, q0s, simple_lf_delta, &tmp_p0, &tmp_q0);
- // TODO(skal): avoid the double FlipSign() in ApplyFilter2() and here
- p0s = FlipSign(tmp_p0);
- q0s = FlipSign(tmp_q0);
+ ApplyFilter2NoFlip(p0s, q0s, simple_lf_delta, &p0s, &q0s);
}
// do_filter4 part (complex loopfilter on pixels without hev)
@@ -797,11 +806,7 @@ static void DoFilter6(
{
const int8x16_t simple_lf_delta =
vandq_s8(delta0, vreinterpretq_s8_u8(simple_lf_mask));
- uint8x16_t tmp_p0, tmp_q0;
- ApplyFilter2(p0s, q0s, simple_lf_delta, &tmp_p0, &tmp_q0);
- // TODO(skal): avoid the double FlipSign() in ApplyFilter2() and here
- p0s = FlipSign(tmp_p0);
- q0s = FlipSign(tmp_q0);
+ ApplyFilter2NoFlip(p0s, q0s, simple_lf_delta, &p0s, &q0s);
}
// do_filter6 part (complex loopfilter on pixels without hev)
@@ -986,7 +991,7 @@ static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
static const int16_t kC1 = 20091;
static const int16_t kC2 = 17734; // half of kC2, actually. See comment above.
-#if defined(USE_INTRINSICS)
+#if defined(WEBP_USE_INTRINSICS)
static WEBP_INLINE void Transpose8x2(const int16x8_t in0, const int16x8_t in1,
int16x8x2_t* const out) {
// a0 a1 a2 a3 | b0 b1 b2 b3 => a0 b0 c0 d0 | a1 b1 c1 d1
@@ -1163,7 +1168,7 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
);
}
-#endif // USE_INTRINSICS
+#endif // WEBP_USE_INTRINSICS
static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
TransformOne(in, dst);
@@ -1241,7 +1246,7 @@ static void TransformWHT(const int16_t* in, int16_t* out) {
static void TransformAC3(const int16_t* in, uint8_t* dst) {
static const int kC1_full = 20091 + (1 << 16);
static const int kC2_full = 35468;
- const int16x4_t A = vdup_n_s16(in[0]);
+ const int16x4_t A = vld1_dup_s16(in);
const int16x4_t c4 = vdup_n_s16(MUL(in[4], kC2_full));
const int16x4_t d4 = vdup_n_s16(MUL(in[4], kC1_full));
const int c1 = MUL(in[1], kC2_full);
@@ -1258,15 +1263,330 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
}
#undef MUL
-#endif // WEBP_USE_NEON
+//------------------------------------------------------------------------------
+// 4x4
+
+static void DC4(uint8_t* dst) { // DC
+ const uint8x8_t A = vld1_u8(dst - BPS); // top row
+ const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top
+ const uint16x4_t p1 = vpadd_u16(p0, p0);
+ const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + 0 * BPS - 1));
+ const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + 1 * BPS - 1));
+ const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + 2 * BPS - 1));
+ const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + 3 * BPS - 1));
+ const uint16x8_t s0 = vaddq_u16(L0, L1);
+ const uint16x8_t s1 = vaddq_u16(L2, L3);
+ const uint16x8_t s01 = vaddq_u16(s0, s1);
+ const uint16x8_t sum = vaddq_u16(s01, vcombine_u16(p1, p1));
+ const uint8x8_t dc0 = vrshrn_n_u16(sum, 3); // (sum + 4) >> 3
+ const uint8x8_t dc = vdup_lane_u8(dc0, 0);
+ int i;
+ for (i = 0; i < 4; ++i) {
+ vst1_lane_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(dc), 0);
+ }
+}
+
+// TrueMotion (4x4 + 8x8)
+static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) {
+ const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1); // top-left pixel 'A[-1]'
+ const uint8x8_t T = vld1_u8(dst - BPS); // top row 'A[0..3]'
+ const int16x8_t d = vreinterpretq_s16_u16(vsubl_u8(T, TL)); // A[c] - A[-1]
+ int y;
+ for (y = 0; y < size; y += 4) {
+ // left edge
+ const int16x8_t L0 = ConvertU8ToS16(vld1_dup_u8(dst + 0 * BPS - 1));
+ const int16x8_t L1 = ConvertU8ToS16(vld1_dup_u8(dst + 1 * BPS - 1));
+ const int16x8_t L2 = ConvertU8ToS16(vld1_dup_u8(dst + 2 * BPS - 1));
+ const int16x8_t L3 = ConvertU8ToS16(vld1_dup_u8(dst + 3 * BPS - 1));
+ const int16x8_t r0 = vaddq_s16(L0, d); // L[r] + A[c] - A[-1]
+ const int16x8_t r1 = vaddq_s16(L1, d);
+ const int16x8_t r2 = vaddq_s16(L2, d);
+ const int16x8_t r3 = vaddq_s16(L3, d);
+ // Saturate and store the result.
+ const uint32x2_t r0_u32 = vreinterpret_u32_u8(vqmovun_s16(r0));
+ const uint32x2_t r1_u32 = vreinterpret_u32_u8(vqmovun_s16(r1));
+ const uint32x2_t r2_u32 = vreinterpret_u32_u8(vqmovun_s16(r2));
+ const uint32x2_t r3_u32 = vreinterpret_u32_u8(vqmovun_s16(r3));
+ if (size == 4) {
+ vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0_u32, 0);
+ vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1_u32, 0);
+ vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2_u32, 0);
+ vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3_u32, 0);
+ } else {
+ vst1_u32((uint32_t*)(dst + 0 * BPS), r0_u32);
+ vst1_u32((uint32_t*)(dst + 1 * BPS), r1_u32);
+ vst1_u32((uint32_t*)(dst + 2 * BPS), r2_u32);
+ vst1_u32((uint32_t*)(dst + 3 * BPS), r3_u32);
+ }
+ dst += 4 * BPS;
+ }
+}
+
+static void TM4(uint8_t* dst) { TrueMotion(dst, 4); }
+
+static void VE4(uint8_t* dst) { // vertical
+ // NB: avoid vld1_u64 here as an alignment hint may be added -> SIGBUS.
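+  // avg[i] = (top[i - 1] + 2 * top[i] + top[i + 1] + 2) >> 2, computed as
+  // (((a + c) >> 1) + b + 1) >> 1 with vhadd (truncating) then vrhadd
+  // (rounding); the identity is exact, so no lsb correction is needed.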
+ const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(dst - BPS - 1)); // top row
+ const uint64x1_t A1 = vshr_n_u64(A0, 8);
+ const uint64x1_t A2 = vshr_n_u64(A0, 16);
+ const uint8x8_t ABCDEFGH = vreinterpret_u8_u64(A0);
+ const uint8x8_t BCDEFGH0 = vreinterpret_u8_u64(A1);
+ const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2);
+ const uint8x8_t b = vhadd_u8(ABCDEFGH, CDEFGH00);
+ const uint8x8_t avg = vrhadd_u8(b, BCDEFGH0);
+ int i;
+ for (i = 0; i < 4; ++i) {
+ vst1_lane_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(avg), 0);
+ }
+}
+
+static void RD4(uint8_t* dst) { // Down-right
+ const uint8x8_t XABCD_u8 = vld1_u8(dst - BPS - 1);
+ const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8);
+ const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32);
+ const uint32_t I = dst[-1 + 0 * BPS];
+ const uint32_t J = dst[-1 + 1 * BPS];
+ const uint32_t K = dst[-1 + 2 * BPS];
+ const uint32_t L = dst[-1 + 3 * BPS];
+ const uint64x1_t LKJI____ = vcreate_u64(L | (K << 8) | (J << 16) | (I << 24));
+ const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC);
+ const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8));
+ const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16));
+ const uint8_t D = vget_lane_u8(XABCD_u8, 4);
+ const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6);
+ const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC);
+ const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8);
+ const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_);
+ const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
+ const uint32x2_t r3 = vreinterpret_u32_u8(avg2);
+ const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
+ const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
+ const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
+ vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0, 0);
+ vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1, 0);
+ vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2, 0);
+ vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0);
+}
+
+static void LD4(uint8_t* dst) { // Down-left
+  // Note: using the same shift trick as in VE4() is slower here.
+ const uint8x8_t ABCDEFGH = vld1_u8(dst - BPS + 0);
+ const uint8x8_t BCDEFGH0 = vld1_u8(dst - BPS + 1);
+ const uint8x8_t CDEFGH00 = vld1_u8(dst - BPS + 2);
+ const uint8x8_t CDEFGHH0 = vset_lane_u8(dst[-BPS + 7], CDEFGH00, 6);
+ const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGHH0);
+ const uint8x8_t avg2 = vrhadd_u8(avg1, BCDEFGH0);
+ const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2);
+ const uint32x2_t r0 = vreinterpret_u32_u8(avg2);
+ const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8));
+ const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16));
+ const uint32x2_t r3 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24));
+ vst1_lane_u32((uint32_t*)(dst + 0 * BPS), r0, 0);
+ vst1_lane_u32((uint32_t*)(dst + 1 * BPS), r1, 0);
+ vst1_lane_u32((uint32_t*)(dst + 2 * BPS), r2, 0);
+ vst1_lane_u32((uint32_t*)(dst + 3 * BPS), r3, 0);
+}
+
+//------------------------------------------------------------------------------
+// Chroma
+
+static void VE8uv(uint8_t* dst) { // vertical
+ const uint8x8_t top = vld1_u8(dst - BPS);
+ int j;
+ for (j = 0; j < 8; ++j) {
+ vst1_u8(dst + j * BPS, top);
+ }
+}
+
+static void HE8uv(uint8_t* dst) { // horizontal
+ int j;
+ for (j = 0; j < 8; ++j) {
+ const uint8x8_t left = vld1_dup_u8(dst - 1);
+ vst1_u8(dst, left);
+ dst += BPS;
+ }
+}
+
+static WEBP_INLINE void DC8(uint8_t* dst, int do_top, int do_left) {
+ uint16x8_t sum_top;
+ uint16x8_t sum_left;
+ uint8x8_t dc0;
+
+ if (do_top) {
+ const uint8x8_t A = vld1_u8(dst - BPS); // top row
+ const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top
+ const uint16x4_t p1 = vpadd_u16(p0, p0);
+ const uint16x4_t p2 = vpadd_u16(p1, p1);
+ sum_top = vcombine_u16(p2, p2);
+ }
+
+ if (do_left) {
+ const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + 0 * BPS - 1));
+ const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + 1 * BPS - 1));
+ const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + 2 * BPS - 1));
+ const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + 3 * BPS - 1));
+ const uint16x8_t L4 = vmovl_u8(vld1_u8(dst + 4 * BPS - 1));
+ const uint16x8_t L5 = vmovl_u8(vld1_u8(dst + 5 * BPS - 1));
+ const uint16x8_t L6 = vmovl_u8(vld1_u8(dst + 6 * BPS - 1));
+ const uint16x8_t L7 = vmovl_u8(vld1_u8(dst + 7 * BPS - 1));
+ const uint16x8_t s0 = vaddq_u16(L0, L1);
+ const uint16x8_t s1 = vaddq_u16(L2, L3);
+ const uint16x8_t s2 = vaddq_u16(L4, L5);
+ const uint16x8_t s3 = vaddq_u16(L6, L7);
+ const uint16x8_t s01 = vaddq_u16(s0, s1);
+ const uint16x8_t s23 = vaddq_u16(s2, s3);
+ sum_left = vaddq_u16(s01, s23);
+ }
+
+ if (do_top && do_left) {
+ const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
+ dc0 = vrshrn_n_u16(sum, 4);
+ } else if (do_top) {
+ dc0 = vrshrn_n_u16(sum_top, 3);
+ } else if (do_left) {
+ dc0 = vrshrn_n_u16(sum_left, 3);
+ } else {
+ dc0 = vdup_n_u8(0x80);
+ }
+
+ {
+ const uint8x8_t dc = vdup_lane_u8(dc0, 0);
+ int i;
+ for (i = 0; i < 8; ++i) {
+ vst1_u32((uint32_t*)(dst + i * BPS), vreinterpret_u32_u8(dc));
+ }
+ }
+}
+
+static void DC8uv(uint8_t* dst) { DC8(dst, 1, 1); }
+static void DC8uvNoTop(uint8_t* dst) { DC8(dst, 0, 1); }
+static void DC8uvNoLeft(uint8_t* dst) { DC8(dst, 1, 0); }
+static void DC8uvNoTopLeft(uint8_t* dst) { DC8(dst, 0, 0); }
+
+static void TM8uv(uint8_t* dst) { TrueMotion(dst, 8); }
+
+//------------------------------------------------------------------------------
+// 16x16
+
+static void VE16(uint8_t* dst) { // vertical
+ const uint8x16_t top = vld1q_u8(dst - BPS);
+ int j;
+ for (j = 0; j < 16; ++j) {
+ vst1q_u8(dst + j * BPS, top);
+ }
+}
+
+static void HE16(uint8_t* dst) { // horizontal
+ int j;
+ for (j = 0; j < 16; ++j) {
+ const uint8x16_t left = vld1q_dup_u8(dst - 1);
+ vst1q_u8(dst, left);
+ dst += BPS;
+ }
+}
+
+static WEBP_INLINE void DC16(uint8_t* dst, int do_top, int do_left) {
+ uint16x8_t sum_top;
+ uint16x8_t sum_left;
+ uint8x8_t dc0;
+
+ if (do_top) {
+ const uint8x16_t A = vld1q_u8(dst - BPS); // top row
+ const uint16x8_t p0 = vpaddlq_u8(A); // cascading summation of the top
+ const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0));
+ const uint16x4_t p2 = vpadd_u16(p1, p1);
+ const uint16x4_t p3 = vpadd_u16(p2, p2);
+ sum_top = vcombine_u16(p3, p3);
+ }
+
+ if (do_left) {
+ int i;
+ sum_left = vdupq_n_u16(0);
+ for (i = 0; i < 16; i += 8) {
+ const uint16x8_t L0 = vmovl_u8(vld1_u8(dst + (i + 0) * BPS - 1));
+ const uint16x8_t L1 = vmovl_u8(vld1_u8(dst + (i + 1) * BPS - 1));
+ const uint16x8_t L2 = vmovl_u8(vld1_u8(dst + (i + 2) * BPS - 1));
+ const uint16x8_t L3 = vmovl_u8(vld1_u8(dst + (i + 3) * BPS - 1));
+ const uint16x8_t L4 = vmovl_u8(vld1_u8(dst + (i + 4) * BPS - 1));
+ const uint16x8_t L5 = vmovl_u8(vld1_u8(dst + (i + 5) * BPS - 1));
+ const uint16x8_t L6 = vmovl_u8(vld1_u8(dst + (i + 6) * BPS - 1));
+ const uint16x8_t L7 = vmovl_u8(vld1_u8(dst + (i + 7) * BPS - 1));
+ const uint16x8_t s0 = vaddq_u16(L0, L1);
+ const uint16x8_t s1 = vaddq_u16(L2, L3);
+ const uint16x8_t s2 = vaddq_u16(L4, L5);
+ const uint16x8_t s3 = vaddq_u16(L6, L7);
+ const uint16x8_t s01 = vaddq_u16(s0, s1);
+ const uint16x8_t s23 = vaddq_u16(s2, s3);
+ const uint16x8_t sum = vaddq_u16(s01, s23);
+ sum_left = vaddq_u16(sum_left, sum);
+ }
+ }
+
+ if (do_top && do_left) {
+ const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
+ dc0 = vrshrn_n_u16(sum, 5);
+ } else if (do_top) {
+ dc0 = vrshrn_n_u16(sum_top, 4);
+ } else if (do_left) {
+ dc0 = vrshrn_n_u16(sum_left, 4);
+ } else {
+ dc0 = vdup_n_u8(0x80);
+ }
+
+ {
+ const uint8x16_t dc = vdupq_lane_u8(dc0, 0);
+ int i;
+ for (i = 0; i < 16; ++i) {
+ vst1q_u8(dst + i * BPS, dc);
+ }
+ }
+}
+
+static void DC16TopLeft(uint8_t* dst) { DC16(dst, 1, 1); }
+static void DC16NoTop(uint8_t* dst) { DC16(dst, 0, 1); }
+static void DC16NoLeft(uint8_t* dst) { DC16(dst, 1, 0); }
+static void DC16NoTopLeft(uint8_t* dst) { DC16(dst, 0, 0); }
+
+static void TM16(uint8_t* dst) {
+ const uint8x8_t TL = vld1_dup_u8(dst - BPS - 1); // top-left pixel 'A[-1]'
+ const uint8x16_t T = vld1q_u8(dst - BPS); // top row 'A[0..15]'
+ // A[c] - A[-1]
+ const int16x8_t d_lo = vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(T), TL));
+ const int16x8_t d_hi = vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(T), TL));
+ int y;
+ for (y = 0; y < 16; y += 4) {
+ // left edge
+ const int16x8_t L0 = ConvertU8ToS16(vld1_dup_u8(dst + 0 * BPS - 1));
+ const int16x8_t L1 = ConvertU8ToS16(vld1_dup_u8(dst + 1 * BPS - 1));
+ const int16x8_t L2 = ConvertU8ToS16(vld1_dup_u8(dst + 2 * BPS - 1));
+ const int16x8_t L3 = ConvertU8ToS16(vld1_dup_u8(dst + 3 * BPS - 1));
+ const int16x8_t r0_lo = vaddq_s16(L0, d_lo); // L[r] + A[c] - A[-1]
+ const int16x8_t r1_lo = vaddq_s16(L1, d_lo);
+ const int16x8_t r2_lo = vaddq_s16(L2, d_lo);
+ const int16x8_t r3_lo = vaddq_s16(L3, d_lo);
+ const int16x8_t r0_hi = vaddq_s16(L0, d_hi);
+ const int16x8_t r1_hi = vaddq_s16(L1, d_hi);
+ const int16x8_t r2_hi = vaddq_s16(L2, d_hi);
+ const int16x8_t r3_hi = vaddq_s16(L3, d_hi);
+ // Saturate and store the result.
+ const uint8x16_t row0 = vcombine_u8(vqmovun_s16(r0_lo), vqmovun_s16(r0_hi));
+ const uint8x16_t row1 = vcombine_u8(vqmovun_s16(r1_lo), vqmovun_s16(r1_hi));
+ const uint8x16_t row2 = vcombine_u8(vqmovun_s16(r2_lo), vqmovun_s16(r2_hi));
+ const uint8x16_t row3 = vcombine_u8(vqmovun_s16(r3_lo), vqmovun_s16(r3_hi));
+ vst1q_u8(dst + 0 * BPS, row0);
+ vst1q_u8(dst + 1 * BPS, row1);
+ vst1q_u8(dst + 2 * BPS, row2);
+ vst1q_u8(dst + 3 * BPS, row3);
+ dst += 4 * BPS;
+ }
+}
//------------------------------------------------------------------------------
// Entry point
extern void VP8DspInitNEON(void);
-void VP8DspInitNEON(void) {
-#if defined(WEBP_USE_NEON)
+WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitNEON(void) {
VP8Transform = TransformTwo;
VP8TransformAC3 = TransformAC3;
VP8TransformDC = TransformDC;
@@ -1288,5 +1608,32 @@ void VP8DspInitNEON(void) {
VP8SimpleHFilter16 = SimpleHFilter16;
VP8SimpleVFilter16i = SimpleVFilter16i;
VP8SimpleHFilter16i = SimpleHFilter16i;
-#endif // WEBP_USE_NEON
+
+ VP8PredLuma4[0] = DC4;
+ VP8PredLuma4[1] = TM4;
+ VP8PredLuma4[2] = VE4;
+ VP8PredLuma4[4] = RD4;
+ VP8PredLuma4[6] = LD4;
+
+ VP8PredLuma16[0] = DC16TopLeft;
+ VP8PredLuma16[1] = TM16;
+ VP8PredLuma16[2] = VE16;
+ VP8PredLuma16[3] = HE16;
+ VP8PredLuma16[4] = DC16NoTop;
+ VP8PredLuma16[5] = DC16NoLeft;
+ VP8PredLuma16[6] = DC16NoTopLeft;
+
+ VP8PredChroma8[0] = DC8uv;
+ VP8PredChroma8[1] = TM8uv;
+ VP8PredChroma8[2] = VE8uv;
+ VP8PredChroma8[3] = HE8uv;
+ VP8PredChroma8[4] = DC8uvNoTop;
+ VP8PredChroma8[5] = DC8uvNoLeft;
+ VP8PredChroma8[6] = DC8uvNoTopLeft;
}
+
+#else // !WEBP_USE_NEON
+
+WEBP_DSP_INIT_STUB(VP8DspInitNEON)
+
+#endif // WEBP_USE_NEON
diff --git a/src/3rdparty/libwebp/src/dsp/dec_sse2.c b/src/3rdparty/libwebp/src/dsp/dec_sse2.c
index c37a637..935bf02 100644
--- a/src/3rdparty/libwebp/src/dsp/dec_sse2.c
+++ b/src/3rdparty/libwebp/src/dsp/dec_sse2.c
@@ -52,19 +52,19 @@ static void Transform(const int16_t* in, uint8_t* dst, int do_two) {
// vectors will just contain random values we'll never use nor store.
__m128i in0, in1, in2, in3;
{
- in0 = _mm_loadl_epi64((__m128i*)&in[0]);
- in1 = _mm_loadl_epi64((__m128i*)&in[4]);
- in2 = _mm_loadl_epi64((__m128i*)&in[8]);
- in3 = _mm_loadl_epi64((__m128i*)&in[12]);
+ in0 = _mm_loadl_epi64((const __m128i*)&in[0]);
+ in1 = _mm_loadl_epi64((const __m128i*)&in[4]);
+ in2 = _mm_loadl_epi64((const __m128i*)&in[8]);
+ in3 = _mm_loadl_epi64((const __m128i*)&in[12]);
// a00 a10 a20 a30 x x x x
// a01 a11 a21 a31 x x x x
// a02 a12 a22 a32 x x x x
// a03 a13 a23 a33 x x x x
if (do_two) {
- const __m128i inB0 = _mm_loadl_epi64((__m128i*)&in[16]);
- const __m128i inB1 = _mm_loadl_epi64((__m128i*)&in[20]);
- const __m128i inB2 = _mm_loadl_epi64((__m128i*)&in[24]);
- const __m128i inB3 = _mm_loadl_epi64((__m128i*)&in[28]);
+ const __m128i inB0 = _mm_loadl_epi64((const __m128i*)&in[16]);
+ const __m128i inB1 = _mm_loadl_epi64((const __m128i*)&in[20]);
+ const __m128i inB2 = _mm_loadl_epi64((const __m128i*)&in[24]);
+ const __m128i inB3 = _mm_loadl_epi64((const __m128i*)&in[28]);
in0 = _mm_unpacklo_epi64(in0, inB0);
in1 = _mm_unpacklo_epi64(in1, inB1);
in2 = _mm_unpacklo_epi64(in2, inB2);
@@ -207,10 +207,10 @@ static void Transform(const int16_t* in, uint8_t* dst, int do_two) {
dst3 = _mm_loadl_epi64((__m128i*)(dst + 3 * BPS));
} else {
// Load four bytes/pixels per line.
- dst0 = _mm_cvtsi32_si128(*(int*)(dst + 0 * BPS));
- dst1 = _mm_cvtsi32_si128(*(int*)(dst + 1 * BPS));
- dst2 = _mm_cvtsi32_si128(*(int*)(dst + 2 * BPS));
- dst3 = _mm_cvtsi32_si128(*(int*)(dst + 3 * BPS));
+ dst0 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 0 * BPS));
+ dst1 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 1 * BPS));
+ dst2 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 2 * BPS));
+ dst3 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 3 * BPS));
}
// Convert to 16b.
dst0 = _mm_unpacklo_epi8(dst0, zero);
@@ -236,10 +236,10 @@ static void Transform(const int16_t* in, uint8_t* dst, int do_two) {
_mm_storel_epi64((__m128i*)(dst + 3 * BPS), dst3);
} else {
// Store four bytes/pixels per line.
- *(int*)(dst + 0 * BPS) = _mm_cvtsi128_si32(dst0);
- *(int*)(dst + 1 * BPS) = _mm_cvtsi128_si32(dst1);
- *(int*)(dst + 2 * BPS) = _mm_cvtsi128_si32(dst2);
- *(int*)(dst + 3 * BPS) = _mm_cvtsi128_si32(dst3);
+ WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(dst0));
+ WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(dst1));
+ WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(dst2));
+ WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(dst3));
}
}
}
@@ -262,10 +262,10 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
const __m128i m3 = _mm_subs_epi16(B, d4);
const __m128i zero = _mm_setzero_si128();
// Load the source pixels.
- __m128i dst0 = _mm_cvtsi32_si128(*(int*)(dst + 0 * BPS));
- __m128i dst1 = _mm_cvtsi32_si128(*(int*)(dst + 1 * BPS));
- __m128i dst2 = _mm_cvtsi32_si128(*(int*)(dst + 2 * BPS));
- __m128i dst3 = _mm_cvtsi32_si128(*(int*)(dst + 3 * BPS));
+ __m128i dst0 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 0 * BPS));
+ __m128i dst1 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 1 * BPS));
+ __m128i dst2 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 2 * BPS));
+ __m128i dst3 = _mm_cvtsi32_si128(WebPMemToUint32(dst + 3 * BPS));
// Convert to 16b.
dst0 = _mm_unpacklo_epi8(dst0, zero);
dst1 = _mm_unpacklo_epi8(dst1, zero);
@@ -282,10 +282,10 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
dst2 = _mm_packus_epi16(dst2, dst2);
dst3 = _mm_packus_epi16(dst3, dst3);
// Store the results.
- *(int*)(dst + 0 * BPS) = _mm_cvtsi128_si32(dst0);
- *(int*)(dst + 1 * BPS) = _mm_cvtsi128_si32(dst1);
- *(int*)(dst + 2 * BPS) = _mm_cvtsi128_si32(dst2);
- *(int*)(dst + 3 * BPS) = _mm_cvtsi128_si32(dst3);
+ WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(dst0));
+ WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(dst1));
+ WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(dst2));
+ WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(dst3));
}
#undef MUL
#endif // USE_TRANSFORM_AC3
@@ -301,11 +301,10 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
// Shift each byte of "x" by 3 bits while preserving the sign bit.
static WEBP_INLINE void SignedShift8b(__m128i* const x) {
const __m128i zero = _mm_setzero_si128();
- const __m128i signs = _mm_cmpgt_epi8(zero, *x);
- const __m128i lo_0 = _mm_unpacklo_epi8(*x, signs); // s8 -> s16 sign extend
- const __m128i hi_0 = _mm_unpackhi_epi8(*x, signs);
- const __m128i lo_1 = _mm_srai_epi16(lo_0, 3);
- const __m128i hi_1 = _mm_srai_epi16(hi_0, 3);
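+  // Unpacking with 'zero' in the low byte places each value in the high byte
+  // of a 16-bit lane, so a single arithmetic shift by (3 + 8) both
+  // sign-extends and shifts, saving the explicit sign mask.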
+ const __m128i lo_0 = _mm_unpacklo_epi8(zero, *x);
+ const __m128i hi_0 = _mm_unpackhi_epi8(zero, *x);
+ const __m128i lo_1 = _mm_srai_epi16(lo_0, 3 + 8);
+ const __m128i hi_1 = _mm_srai_epi16(hi_0, 3 + 8);
*x = _mm_packs_epi16(lo_1, hi_1);
}
@@ -330,11 +329,10 @@ static WEBP_INLINE void GetNotHEV(const __m128i* const p1,
const __m128i t_2 = MM_ABS(*q1, *q0);
const __m128i h = _mm_set1_epi8(hev_thresh);
- const __m128i t_3 = _mm_subs_epu8(t_1, h); // abs(p1 - p0) - hev_tresh
- const __m128i t_4 = _mm_subs_epu8(t_2, h); // abs(q1 - q0) - hev_tresh
+ const __m128i t_max = _mm_max_epu8(t_1, t_2);
- *not_hev = _mm_or_si128(t_3, t_4);
- *not_hev = _mm_cmpeq_epi8(*not_hev, zero); // not_hev <= t1 && not_hev <= t2
+ const __m128i t_max_h = _mm_subs_epu8(t_max, h);
+ *not_hev = _mm_cmpeq_epi8(t_max_h, zero); // not_hev <= t1 && not_hev <= t2
}
// input pixels are int8_t
@@ -428,9 +426,11 @@ static WEBP_INLINE void DoFilter2(__m128i* const p1, __m128i* const p0,
static WEBP_INLINE void DoFilter4(__m128i* const p1, __m128i* const p0,
__m128i* const q0, __m128i* const q1,
const __m128i* const mask, int hev_thresh) {
- const __m128i sign_bit = _mm_set1_epi8(0x80);
- const __m128i k64 = _mm_set1_epi8(0x40);
const __m128i zero = _mm_setzero_si128();
+ const __m128i sign_bit = _mm_set1_epi8(0x80);
+ const __m128i k64 = _mm_set1_epi8(64);
+ const __m128i k3 = _mm_set1_epi8(3);
+ const __m128i k4 = _mm_set1_epi8(4);
__m128i not_hev;
__m128i t1, t2, t3;
@@ -448,10 +448,8 @@ static WEBP_INLINE void DoFilter4(__m128i* const p1, __m128i* const p0,
t1 = _mm_adds_epi8(t1, t2); // hev(p1 - q1) + 3 * (q0 - p0)
t1 = _mm_and_si128(t1, *mask); // mask filter values we don't care about
- t2 = _mm_set1_epi8(3);
- t3 = _mm_set1_epi8(4);
- t2 = _mm_adds_epi8(t1, t2); // 3 * (q0 - p0) + (p1 - q1) + 3
- t3 = _mm_adds_epi8(t1, t3); // 3 * (q0 - p0) + (p1 - q1) + 4
+ t2 = _mm_adds_epi8(t1, k3); // 3 * (q0 - p0) + hev(p1 - q1) + 3
+ t3 = _mm_adds_epi8(t1, k4); // 3 * (q0 - p0) + hev(p1 - q1) + 4
SignedShift8b(&t2); // (3 * (q0 - p0) + hev(p1 - q1) + 3) >> 3
SignedShift8b(&t3); // (3 * (q0 - p0) + hev(p1 - q1) + 4) >> 3
*p0 = _mm_adds_epi8(*p0, t2); // p0 += t2
@@ -520,47 +518,31 @@ static WEBP_INLINE void DoFilter6(__m128i* const p2, __m128i* const p1,
}
// reads 8 rows across a vertical edge.
-//
-// TODO(somnath): Investigate _mm_shuffle* also see if it can be broken into
-// two Load4x4() to avoid code duplication.
static WEBP_INLINE void Load8x4(const uint8_t* const b, int stride,
__m128i* const p, __m128i* const q) {
- __m128i t1, t2;
-
- // Load 0th, 1st, 4th and 5th rows
- __m128i r0 = _mm_cvtsi32_si128(*((int*)&b[0 * stride])); // 03 02 01 00
- __m128i r1 = _mm_cvtsi32_si128(*((int*)&b[1 * stride])); // 13 12 11 10
- __m128i r4 = _mm_cvtsi32_si128(*((int*)&b[4 * stride])); // 43 42 41 40
- __m128i r5 = _mm_cvtsi32_si128(*((int*)&b[5 * stride])); // 53 52 51 50
-
- r0 = _mm_unpacklo_epi32(r0, r4); // 43 42 41 40 03 02 01 00
- r1 = _mm_unpacklo_epi32(r1, r5); // 53 52 51 50 13 12 11 10
-
- // t1 = 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00
- t1 = _mm_unpacklo_epi8(r0, r1);
-
- // Load 2nd, 3rd, 6th and 7th rows
- r0 = _mm_cvtsi32_si128(*((int*)&b[2 * stride])); // 23 22 21 22
- r1 = _mm_cvtsi32_si128(*((int*)&b[3 * stride])); // 33 32 31 30
- r4 = _mm_cvtsi32_si128(*((int*)&b[6 * stride])); // 63 62 61 60
- r5 = _mm_cvtsi32_si128(*((int*)&b[7 * stride])); // 73 72 71 70
-
- r0 = _mm_unpacklo_epi32(r0, r4); // 63 62 61 60 23 22 21 20
- r1 = _mm_unpacklo_epi32(r1, r5); // 73 72 71 70 33 32 31 30
-
- // t2 = 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20
- t2 = _mm_unpacklo_epi8(r0, r1);
-
- // t1 = 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
- // t2 = 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
- r0 = t1;
- t1 = _mm_unpacklo_epi16(t1, t2);
- t2 = _mm_unpackhi_epi16(r0, t2);
+ // A0 = 63 62 61 60 23 22 21 20 43 42 41 40 03 02 01 00
+ // A1 = 73 72 71 70 33 32 31 30 53 52 51 50 13 12 11 10
+ const __m128i A0 = _mm_set_epi32(
+ WebPMemToUint32(&b[6 * stride]), WebPMemToUint32(&b[2 * stride]),
+ WebPMemToUint32(&b[4 * stride]), WebPMemToUint32(&b[0 * stride]));
+ const __m128i A1 = _mm_set_epi32(
+ WebPMemToUint32(&b[7 * stride]), WebPMemToUint32(&b[3 * stride]),
+ WebPMemToUint32(&b[5 * stride]), WebPMemToUint32(&b[1 * stride]));
+
+ // B0 = 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00
+ // B1 = 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20
+ const __m128i B0 = _mm_unpacklo_epi8(A0, A1);
+ const __m128i B1 = _mm_unpackhi_epi8(A0, A1);
+
+ // C0 = 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
+ // C1 = 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
+ const __m128i C0 = _mm_unpacklo_epi16(B0, B1);
+ const __m128i C1 = _mm_unpackhi_epi16(B0, B1);
// *p = 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
// *q = 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
- *p = _mm_unpacklo_epi32(t1, t2);
- *q = _mm_unpackhi_epi32(t1, t2);
+ *p = _mm_unpacklo_epi32(C0, C1);
+ *q = _mm_unpackhi_epi32(C0, C1);
}
static WEBP_INLINE void Load16x4(const uint8_t* const r0,
@@ -568,7 +550,6 @@ static WEBP_INLINE void Load16x4(const uint8_t* const r0,
int stride,
__m128i* const p1, __m128i* const p0,
__m128i* const q0, __m128i* const q1) {
- __m128i t1, t2;
// Assume the pixels around the edge (|) are numbered as follows
// 00 01 | 02 03
// 10 11 | 12 13
@@ -587,22 +568,24 @@ static WEBP_INLINE void Load16x4(const uint8_t* const r0,
Load8x4(r0, stride, p1, q0);
Load8x4(r8, stride, p0, q1);
- t1 = *p1;
- t2 = *q0;
- // p1 = f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
- // p0 = f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
- // q0 = f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
- // q1 = f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
- *p1 = _mm_unpacklo_epi64(t1, *p0);
- *p0 = _mm_unpackhi_epi64(t1, *p0);
- *q0 = _mm_unpacklo_epi64(t2, *q1);
- *q1 = _mm_unpackhi_epi64(t2, *q1);
+ {
+ // p1 = f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+ // p0 = f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
+ // q0 = f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+ // q1 = f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
+ const __m128i t1 = *p1;
+ const __m128i t2 = *q0;
+ *p1 = _mm_unpacklo_epi64(t1, *p0);
+ *p0 = _mm_unpackhi_epi64(t1, *p0);
+ *q0 = _mm_unpacklo_epi64(t2, *q1);
+ *q1 = _mm_unpackhi_epi64(t2, *q1);
+ }
}
static WEBP_INLINE void Store4x4(__m128i* const x, uint8_t* dst, int stride) {
int i;
for (i = 0; i < 4; ++i, dst += stride) {
- *((int32_t*)dst) = _mm_cvtsi128_si32(*x);
+ WebPUint32ToMem(dst, _mm_cvtsi128_si32(*x));
*x = _mm_srli_si128(*x, 4);
}
}
@@ -947,15 +930,308 @@ static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
Store16x4(&p1, &p0, &q0, &q1, u, v, stride);
}
-#endif // WEBP_USE_SSE2
+//------------------------------------------------------------------------------
+// 4x4 predictions
+
+#define DST(x, y) dst[(x) + (y) * BPS]
+#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
+
+// We use the following 8b-arithmetic tricks:
+// (a + 2 * b + c + 2) >> 2 = (AC + b + 1) >> 1
+// where: AC = (a + c) >> 1 = [(a + c + 1) >> 1] - [(a^c) & 1]
+// and:
+// (a + 2 * b + c + 2) >> 2 = (AB + BC + 1) >> 1 - (ab|bc)&lsb
+// where: AB = (a + b + 1) >> 1, BC = (b + c + 1) >> 1
+// and ab = a ^ b, bc = b ^ c, lsb = (AB^BC)&1
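+// For example, with a = 20, b = 21, c = 23: (20 + 42 + 23 + 2) >> 2 = 21;
+// AC = (20 + 23) >> 1 = ((20 + 23 + 1) >> 1) - ((20 ^ 23) & 1) = 22 - 1 = 21,
+// and (AC + b + 1) >> 1 = (21 + 21 + 1) >> 1 = 21, matching the exact value.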
+
+static void VE4(uint8_t* dst) { // vertical
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(dst - BPS - 1));
+ const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1);
+ const __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 2);
+ const __m128i a = _mm_avg_epu8(ABCDEFGH, CDEFGH00);
+ const __m128i lsb = _mm_and_si128(_mm_xor_si128(ABCDEFGH, CDEFGH00), one);
+ const __m128i b = _mm_subs_epu8(a, lsb);
+ const __m128i avg = _mm_avg_epu8(b, BCDEFGH0);
+ const uint32_t vals = _mm_cvtsi128_si32(avg);
+ int i;
+ for (i = 0; i < 4; ++i) {
+ WebPUint32ToMem(dst + i * BPS, vals);
+ }
+}
+
+static void LD4(uint8_t* dst) { // Down-Left
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(dst - BPS));
+ const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1);
+ const __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 2);
+ const __m128i CDEFGHH0 = _mm_insert_epi16(CDEFGH00, dst[-BPS + 7], 3);
+ const __m128i avg1 = _mm_avg_epu8(ABCDEFGH, CDEFGHH0);
+ const __m128i lsb = _mm_and_si128(_mm_xor_si128(ABCDEFGH, CDEFGHH0), one);
+ const __m128i avg2 = _mm_subs_epu8(avg1, lsb);
+ const __m128i abcdefg = _mm_avg_epu8(avg2, BCDEFGH0);
+ WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32( abcdefg ));
+ WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1)));
+ WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2)));
+ WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
+}
+
+static void VR4(uint8_t* dst) { // Vertical-Right
+ const __m128i one = _mm_set1_epi8(1);
+ const int I = dst[-1 + 0 * BPS];
+ const int J = dst[-1 + 1 * BPS];
+ const int K = dst[-1 + 2 * BPS];
+ const int X = dst[-1 - BPS];
+ const __m128i XABCD = _mm_loadl_epi64((__m128i*)(dst - BPS - 1));
+ const __m128i ABCD0 = _mm_srli_si128(XABCD, 1);
+ const __m128i abcd = _mm_avg_epu8(XABCD, ABCD0);
+ const __m128i _XABCD = _mm_slli_si128(XABCD, 1);
+ const __m128i IXABCD = _mm_insert_epi16(_XABCD, I | (X << 8), 0);
+ const __m128i avg1 = _mm_avg_epu8(IXABCD, ABCD0);
+ const __m128i lsb = _mm_and_si128(_mm_xor_si128(IXABCD, ABCD0), one);
+ const __m128i avg2 = _mm_subs_epu8(avg1, lsb);
+ const __m128i efgh = _mm_avg_epu8(avg2, XABCD);
+ WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32( abcd ));
+ WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32( efgh ));
+ WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_slli_si128(abcd, 1)));
+ WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_slli_si128(efgh, 1)));
+
+  // these two are hard to implement in SSE2, so we keep the C version:
+ DST(0, 2) = AVG3(J, I, X);
+ DST(0, 3) = AVG3(K, J, I);
+}
+
+static void VL4(uint8_t* dst) { // Vertical-Left
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(dst - BPS));
+ const __m128i BCDEFGH_ = _mm_srli_si128(ABCDEFGH, 1);
+ const __m128i CDEFGH__ = _mm_srli_si128(ABCDEFGH, 2);
+ const __m128i avg1 = _mm_avg_epu8(ABCDEFGH, BCDEFGH_);
+ const __m128i avg2 = _mm_avg_epu8(CDEFGH__, BCDEFGH_);
+ const __m128i avg3 = _mm_avg_epu8(avg1, avg2);
+ const __m128i lsb1 = _mm_and_si128(_mm_xor_si128(avg1, avg2), one);
+ const __m128i ab = _mm_xor_si128(ABCDEFGH, BCDEFGH_);
+ const __m128i bc = _mm_xor_si128(CDEFGH__, BCDEFGH_);
+ const __m128i abbc = _mm_or_si128(ab, bc);
+ const __m128i lsb2 = _mm_and_si128(abbc, lsb1);
+ const __m128i avg4 = _mm_subs_epu8(avg3, lsb2);
+ const uint32_t extra_out = _mm_cvtsi128_si32(_mm_srli_si128(avg4, 4));
+ WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32( avg1 ));
+ WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32( avg4 ));
+ WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(avg1, 1)));
+ WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(avg4, 1)));
+
+  // these two pixels are irregular and hard to compute vectorially; use the
+  // bytes extracted into extra_out above
+ DST(3, 2) = (extra_out >> 0) & 0xff;
+ DST(3, 3) = (extra_out >> 8) & 0xff;
+}
+
+static void RD4(uint8_t* dst) { // Down-right
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i XABCD = _mm_loadl_epi64((__m128i*)(dst - BPS - 1));
+ const __m128i ____XABCD = _mm_slli_si128(XABCD, 4);
+ const uint32_t I = dst[-1 + 0 * BPS];
+ const uint32_t J = dst[-1 + 1 * BPS];
+ const uint32_t K = dst[-1 + 2 * BPS];
+ const uint32_t L = dst[-1 + 3 * BPS];
+ const __m128i LKJI_____ =
+ _mm_cvtsi32_si128(L | (K << 8) | (J << 16) | (I << 24));
+ const __m128i LKJIXABCD = _mm_or_si128(LKJI_____, ____XABCD);
+ const __m128i KJIXABCD_ = _mm_srli_si128(LKJIXABCD, 1);
+ const __m128i JIXABCD__ = _mm_srli_si128(LKJIXABCD, 2);
+ const __m128i avg1 = _mm_avg_epu8(JIXABCD__, LKJIXABCD);
+ const __m128i lsb = _mm_and_si128(_mm_xor_si128(JIXABCD__, LKJIXABCD), one);
+ const __m128i avg2 = _mm_subs_epu8(avg1, lsb);
+ const __m128i abcdefg = _mm_avg_epu8(avg2, KJIXABCD_);
+ WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32( abcdefg ));
+ WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1)));
+ WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2)));
+ WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
+}
+
+#undef DST
+#undef AVG3
+
+//------------------------------------------------------------------------------
+// Luma 16x16
+
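+// TrueMotion: dst(x, y) = clip(left(y) + top(x) - top_left), computed row by
+// row with 16-bit intermediates before packing back to 8 bits.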
+static WEBP_INLINE void TrueMotion(uint8_t* dst, int size) {
+ const uint8_t* top = dst - BPS;
+ const __m128i zero = _mm_setzero_si128();
+ int y;
+ if (size == 4) {
+ const __m128i top_values = _mm_cvtsi32_si128(WebPMemToUint32(top));
+ const __m128i top_base = _mm_unpacklo_epi8(top_values, zero);
+ for (y = 0; y < 4; ++y, dst += BPS) {
+ const int val = dst[-1] - top[-1];
+ const __m128i base = _mm_set1_epi16(val);
+ const __m128i out = _mm_packus_epi16(_mm_add_epi16(base, top_base), zero);
+ WebPUint32ToMem(dst, _mm_cvtsi128_si32(out));
+ }
+ } else if (size == 8) {
+ const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
+ const __m128i top_base = _mm_unpacklo_epi8(top_values, zero);
+ for (y = 0; y < 8; ++y, dst += BPS) {
+ const int val = dst[-1] - top[-1];
+ const __m128i base = _mm_set1_epi16(val);
+ const __m128i out = _mm_packus_epi16(_mm_add_epi16(base, top_base), zero);
+ _mm_storel_epi64((__m128i*)dst, out);
+ }
+ } else {
+ const __m128i top_values = _mm_loadu_si128((const __m128i*)top);
+ const __m128i top_base_0 = _mm_unpacklo_epi8(top_values, zero);
+ const __m128i top_base_1 = _mm_unpackhi_epi8(top_values, zero);
+ for (y = 0; y < 16; ++y, dst += BPS) {
+ const int val = dst[-1] - top[-1];
+ const __m128i base = _mm_set1_epi16(val);
+ const __m128i out_0 = _mm_add_epi16(base, top_base_0);
+ const __m128i out_1 = _mm_add_epi16(base, top_base_1);
+ const __m128i out = _mm_packus_epi16(out_0, out_1);
+ _mm_storeu_si128((__m128i*)dst, out);
+ }
+ }
+}
+
+static void TM4(uint8_t* dst) { TrueMotion(dst, 4); }
+static void TM8uv(uint8_t* dst) { TrueMotion(dst, 8); }
+static void TM16(uint8_t* dst) { TrueMotion(dst, 16); }
+
+static void VE16(uint8_t* dst) {
+ const __m128i top = _mm_loadu_si128((const __m128i*)(dst - BPS));
+ int j;
+ for (j = 0; j < 16; ++j) {
+ _mm_storeu_si128((__m128i*)(dst + j * BPS), top);
+ }
+}
+
+static void HE16(uint8_t* dst) { // horizontal
+ int j;
+ for (j = 16; j > 0; --j) {
+ const __m128i values = _mm_set1_epi8(dst[-1]);
+ _mm_storeu_si128((__m128i*)dst, values);
+ dst += BPS;
+ }
+}
+
+static WEBP_INLINE void Put16(uint8_t v, uint8_t* dst) {
+ int j;
+ const __m128i values = _mm_set1_epi8(v);
+ for (j = 0; j < 16; ++j) {
+ _mm_storeu_si128((__m128i*)(dst + j * BPS), values);
+ }
+}
+
+static void DC16(uint8_t* dst) { // DC
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i top = _mm_loadu_si128((const __m128i*)(dst - BPS));
+ const __m128i sad8x2 = _mm_sad_epu8(top, zero);
+ // sum the two sads: sad8x2[0:1] + sad8x2[8:9]
+ const __m128i sum = _mm_add_epi16(sad8x2, _mm_shuffle_epi32(sad8x2, 2));
+ int left = 0;
+ int j;
+ for (j = 0; j < 16; ++j) {
+ left += dst[-1 + j * BPS];
+ }
+ {
+ const int DC = _mm_cvtsi128_si32(sum) + left + 16;
+ Put16(DC >> 5, dst);
+ }
+}
+
+static void DC16NoTop(uint8_t* dst) { // DC with top samples not available
+ int DC = 8;
+ int j;
+ for (j = 0; j < 16; ++j) {
+ DC += dst[-1 + j * BPS];
+ }
+ Put16(DC >> 4, dst);
+}
+
+static void DC16NoLeft(uint8_t* dst) { // DC with left samples not available
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i top = _mm_loadu_si128((const __m128i*)(dst - BPS));
+ const __m128i sad8x2 = _mm_sad_epu8(top, zero);
+ // sum the two sads: sad8x2[0:1] + sad8x2[8:9]
+ const __m128i sum = _mm_add_epi16(sad8x2, _mm_shuffle_epi32(sad8x2, 2));
+ const int DC = _mm_cvtsi128_si32(sum) + 8;
+ Put16(DC >> 4, dst);
+}
+
+static void DC16NoTopLeft(uint8_t* dst) { // DC with no top and left samples
+ Put16(0x80, dst);
+}
+
+//------------------------------------------------------------------------------
+// Chroma
+
+static void VE8uv(uint8_t* dst) { // vertical
+ int j;
+ const __m128i top = _mm_loadl_epi64((const __m128i*)(dst - BPS));
+ for (j = 0; j < 8; ++j) {
+ _mm_storel_epi64((__m128i*)(dst + j * BPS), top);
+ }
+}
+
+static void HE8uv(uint8_t* dst) { // horizontal
+ int j;
+ for (j = 0; j < 8; ++j) {
+ const __m128i values = _mm_set1_epi8(dst[-1]);
+ _mm_storel_epi64((__m128i*)dst, values);
+ dst += BPS;
+ }
+}
+
+// helper for chroma-DC predictions
+static WEBP_INLINE void Put8x8uv(uint8_t v, uint8_t* dst) {
+ int j;
+ const __m128i values = _mm_set1_epi8(v);
+ for (j = 0; j < 8; ++j) {
+ _mm_storel_epi64((__m128i*)(dst + j * BPS), values);
+ }
+}
+
+static void DC8uv(uint8_t* dst) { // DC
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i top = _mm_loadl_epi64((const __m128i*)(dst - BPS));
+ const __m128i sum = _mm_sad_epu8(top, zero);
+ int left = 0;
+ int j;
+ for (j = 0; j < 8; ++j) {
+ left += dst[-1 + j * BPS];
+ }
+ {
+ const int DC = _mm_cvtsi128_si32(sum) + left + 8;
+ Put8x8uv(DC >> 4, dst);
+ }
+}
+
+static void DC8uvNoLeft(uint8_t* dst) { // DC with no left samples
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i top = _mm_loadl_epi64((const __m128i*)(dst - BPS));
+ const __m128i sum = _mm_sad_epu8(top, zero);
+ const int DC = _mm_cvtsi128_si32(sum) + 4;
+ Put8x8uv(DC >> 3, dst);
+}
+
+static void DC8uvNoTop(uint8_t* dst) { // DC with no top samples
+ int dc0 = 4;
+ int i;
+ for (i = 0; i < 8; ++i) {
+ dc0 += dst[-1 + i * BPS];
+ }
+ Put8x8uv(dc0 >> 3, dst);
+}
+
+static void DC8uvNoTopLeft(uint8_t* dst) { // DC with nothing
+ Put8x8uv(0x80, dst);
+}
//------------------------------------------------------------------------------
// Entry point
extern void VP8DspInitSSE2(void);
-void VP8DspInitSSE2(void) {
-#if defined(WEBP_USE_SSE2)
+WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitSSE2(void) {
VP8Transform = Transform;
#if defined(USE_TRANSFORM_AC3)
VP8TransformAC3 = TransformAC3;
@@ -974,5 +1250,33 @@ void VP8DspInitSSE2(void) {
VP8SimpleHFilter16 = SimpleHFilter16;
VP8SimpleVFilter16i = SimpleVFilter16i;
VP8SimpleHFilter16i = SimpleHFilter16i;
-#endif // WEBP_USE_SSE2
+
+ VP8PredLuma4[1] = TM4;
+ VP8PredLuma4[2] = VE4;
+ VP8PredLuma4[4] = RD4;
+ VP8PredLuma4[5] = VR4;
+ VP8PredLuma4[6] = LD4;
+ VP8PredLuma4[7] = VL4;
+
+ VP8PredLuma16[0] = DC16;
+ VP8PredLuma16[1] = TM16;
+ VP8PredLuma16[2] = VE16;
+ VP8PredLuma16[3] = HE16;
+ VP8PredLuma16[4] = DC16NoTop;
+ VP8PredLuma16[5] = DC16NoLeft;
+ VP8PredLuma16[6] = DC16NoTopLeft;
+
+ VP8PredChroma8[0] = DC8uv;
+ VP8PredChroma8[1] = TM8uv;
+ VP8PredChroma8[2] = VE8uv;
+ VP8PredChroma8[3] = HE8uv;
+ VP8PredChroma8[4] = DC8uvNoTop;
+ VP8PredChroma8[5] = DC8uvNoLeft;
+ VP8PredChroma8[6] = DC8uvNoTopLeft;
}
+
+#else // !WEBP_USE_SSE2
+
+WEBP_DSP_INIT_STUB(VP8DspInitSSE2)
+
+#endif // WEBP_USE_SSE2
diff --git a/src/3rdparty/libwebp/src/dsp/dec_sse41.c b/src/3rdparty/libwebp/src/dsp/dec_sse41.c
new file mode 100644
index 0000000..224c6f8
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/dec_sse41.c
@@ -0,0 +1,45 @@
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// SSE4.1 version of some decoding functions.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_SSE41)
+
+#include <smmintrin.h>
+#include "../dec/vp8i.h"
+
+static void HE16(uint8_t* dst) { // horizontal
+ int j;
+ const __m128i kShuffle3 = _mm_set1_epi8(3);
+ for (j = 16; j > 0; --j) {
+ const __m128i in = _mm_cvtsi32_si128(WebPMemToUint32(dst - 4));
+ const __m128i values = _mm_shuffle_epi8(in, kShuffle3);
+ _mm_storeu_si128((__m128i*)dst, values);
+ dst += BPS;
+ }
+}
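+
+// The shuffle broadcasts byte 3 of the four bytes loaded from dst - 4 (the
+// left neighbour dst[-1]) across the row. A scalar sketch of the same
+// prediction, assuming the usual BPS stride (illustration only):
+//   for (j = 0; j < 16; ++j) {
+//     memset(dst, dst[-1], 16);
+//     dst += BPS;
+//   }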
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8DspInitSSE41(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitSSE41(void) {
+ VP8PredLuma16[3] = HE16;
+}
+
+#else // !WEBP_USE_SSE41
+
+WEBP_DSP_INIT_STUB(VP8DspInitSSE41)
+
+#endif // WEBP_USE_SSE41
diff --git a/src/3rdparty/libwebp/src/dsp/dsp.h b/src/3rdparty/libwebp/src/dsp/dsp.h
index a2c3951..95f1ce0 100644
--- a/src/3rdparty/libwebp/src/dsp/dsp.h
+++ b/src/3rdparty/libwebp/src/dsp/dsp.h
@@ -14,16 +14,15 @@
#ifndef WEBP_DSP_DSP_H_
#define WEBP_DSP_DSP_H_
-#ifdef HAVE_CONFIG_H
-#include "../webp/config.h"
-#endif
-
#include "../webp/types.h"
+#include "../utils/utils.h"
#ifdef __cplusplus
extern "C" {
#endif
+#define BPS 32 // this is the common stride for enc/dec
+
//------------------------------------------------------------------------------
// CPU detection
@@ -45,6 +44,11 @@ extern "C" {
#define WEBP_MSC_SSE2 // Visual C++ SSE2 targets
#endif
+#if defined(_MSC_VER) && _MSC_VER >= 1500 && \
+ (defined(_M_X64) || defined(_M_IX86))
+#define WEBP_MSC_SSE41 // Visual C++ SSE4.1 targets
+#endif
+
// WEBP_HAVE_* are used to indicate the presence of the instruction set in dsp
// files without intrinsics, allowing the corresponding Init() to be called.
// Files containing intrinsics will need to be built targeting the instruction
@@ -53,6 +57,10 @@ extern "C" {
#define WEBP_USE_SSE2
#endif
+#if defined(__SSE4_1__) || defined(WEBP_MSC_SSE41) || defined(WEBP_HAVE_SSE41)
+#define WEBP_USE_SSE41
+#endif
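+// (Enabling WEBP_USE_SSE41 only compiles the SSE4.1 paths; callers still
+// gate them behind a runtime VP8GetCPUInfo(kSSE4_1) check.)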
+
#if defined(__AVX2__) || defined(WEBP_HAVE_AVX2)
#define WEBP_USE_AVX2
#endif
@@ -68,25 +76,53 @@ extern "C" {
#define WEBP_USE_NEON
#endif
+#if defined(_MSC_VER) && _MSC_VER >= 1700 && defined(_M_ARM)
+#define WEBP_USE_NEON
+#define WEBP_USE_INTRINSICS
+#endif
+
#if defined(__mips__) && !defined(__mips64) && \
defined(__mips_isa_rev) && (__mips_isa_rev >= 1) && (__mips_isa_rev < 6)
#define WEBP_USE_MIPS32
#if (__mips_isa_rev >= 2)
#define WEBP_USE_MIPS32_R2
+#if defined(__mips_dspr2) || (__mips_dsp_rev >= 2)
+#define WEBP_USE_MIPS_DSP_R2
+#endif
+#endif
+#endif
+
+// This macro prevents thread_sanitizer from reporting known concurrent writes.
+#define WEBP_TSAN_IGNORE_FUNCTION
+#if defined(__has_feature)
+#if __has_feature(thread_sanitizer)
+#undef WEBP_TSAN_IGNORE_FUNCTION
+#define WEBP_TSAN_IGNORE_FUNCTION __attribute__((no_sanitize_thread))
#endif
#endif
typedef enum {
kSSE2,
kSSE3,
+ kSSE4_1,
kAVX,
kAVX2,
kNEON,
- kMIPS32
+ kMIPS32,
+ kMIPSdspR2
} CPUFeature;
// returns true if the CPU supports the feature.
typedef int (*VP8CPUInfo)(CPUFeature feature);
-extern VP8CPUInfo VP8GetCPUInfo;
+WEBP_EXTERN(VP8CPUInfo) VP8GetCPUInfo;
+
+//------------------------------------------------------------------------------
+// Init stub generator
+
+// Defines an init function stub to ensure each module exposes a symbol,
+// avoiding a compiler warning.
+#define WEBP_DSP_INIT_STUB(func) \
+ extern void func(void); \
+ WEBP_TSAN_IGNORE_FUNCTION void func(void) {}
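+
+// For instance, the WEBP_DSP_INIT_STUB(VP8EncDspInitAVX2) use in enc_avx2.c
+// expands to:
+//   extern void VP8EncDspInitAVX2(void);
+//   WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitAVX2(void) {}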
//------------------------------------------------------------------------------
// Encoding
@@ -100,6 +136,7 @@ typedef void (*VP8Fdct)(const uint8_t* src, const uint8_t* ref, int16_t* out);
typedef void (*VP8WHT)(const int16_t* in, int16_t* out);
extern VP8Idct VP8ITransform;
extern VP8Fdct VP8FTransform;
+extern VP8Fdct VP8FTransform2; // performs two transforms at a time
extern VP8WHT VP8FTransformWHT;
// Predictions
// *dst is the destination block. *top and *left can be NULL.
@@ -118,26 +155,63 @@ extern VP8WMetric VP8TDisto4x4, VP8TDisto16x16;
typedef void (*VP8BlockCopy)(const uint8_t* src, uint8_t* dst);
extern VP8BlockCopy VP8Copy4x4;
+extern VP8BlockCopy VP8Copy16x8;
// Quantization
struct VP8Matrix; // forward declaration
typedef int (*VP8QuantizeBlock)(int16_t in[16], int16_t out[16],
const struct VP8Matrix* const mtx);
+// Same as VP8QuantizeBlock, but quantizes two consecutive blocks.
+typedef int (*VP8Quantize2Blocks)(int16_t in[32], int16_t out[32],
+ const struct VP8Matrix* const mtx);
+
extern VP8QuantizeBlock VP8EncQuantizeBlock;
+extern VP8Quantize2Blocks VP8EncQuantize2Blocks;
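+
+// (As the plain-C Quantize2Blocks in enc.c shows, the return value packs
+// per-block non-zero flags: bit 0 for the first block, bit 1 for the second.)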
// specific to 2nd transform:
typedef int (*VP8QuantizeBlockWHT)(int16_t in[16], int16_t out[16],
const struct VP8Matrix* const mtx);
extern VP8QuantizeBlockWHT VP8EncQuantizeBlockWHT;
-// Collect histogram for susceptibility calculation and accumulate in histo[].
-struct VP8Histogram;
+extern const int VP8DspScan[16 + 4 + 4];
+
+// Collect histogram for susceptibility calculation.
+#define MAX_COEFF_THRESH 31 // size of histogram used by CollectHistogram.
+typedef struct {
+ // We only need to store max_value and last_non_zero, not the distribution.
+ int max_value;
+ int last_non_zero;
+} VP8Histogram;
typedef void (*VP8CHisto)(const uint8_t* ref, const uint8_t* pred,
int start_block, int end_block,
- struct VP8Histogram* const histo);
-extern const int VP8DspScan[16 + 4 + 4];
+ VP8Histogram* const histo);
extern VP8CHisto VP8CollectHistogram;
+// General-purpose util function to help VP8CollectHistogram().
+void VP8SetHistogramData(const int distribution[MAX_COEFF_THRESH + 1],
+ VP8Histogram* const histo);
-void VP8EncDspInit(void); // must be called before using any of the above
+// must be called before using any of the above
+void VP8EncDspInit(void);
+
+//------------------------------------------------------------------------------
+// cost functions (encoding)
+
+extern const uint16_t VP8EntropyCost[256]; // 8bit fixed-point log(p)
+// approximate cost per level:
+extern const uint16_t VP8LevelFixedCosts[2047 /*MAX_LEVEL*/ + 1];
+extern const uint8_t VP8EncBands[16 + 1];
+
+struct VP8Residual;
+typedef void (*VP8SetResidualCoeffsFunc)(const int16_t* const coeffs,
+ struct VP8Residual* const res);
+extern VP8SetResidualCoeffsFunc VP8SetResidualCoeffs;
+
+// Cost calculation function.
+typedef int (*VP8GetResidualCostFunc)(int ctx0,
+ const struct VP8Residual* const res);
+extern VP8GetResidualCostFunc VP8GetResidualCost;
+
+// must be called before anything using the above
+void VP8EncDspCostInit(void);
//------------------------------------------------------------------------------
// Decoding
@@ -155,16 +229,17 @@ extern VP8WHT VP8TransformWHT;
// *dst is the destination block, with stride BPS. Boundary samples are
// assumed accessible when needed.
typedef void (*VP8PredFunc)(uint8_t* dst);
-extern const VP8PredFunc VP8PredLuma16[/* NUM_B_DC_MODES */];
-extern const VP8PredFunc VP8PredChroma8[/* NUM_B_DC_MODES */];
-extern const VP8PredFunc VP8PredLuma4[/* NUM_BMODES */];
+extern VP8PredFunc VP8PredLuma16[/* NUM_B_DC_MODES */];
+extern VP8PredFunc VP8PredChroma8[/* NUM_B_DC_MODES */];
+extern VP8PredFunc VP8PredLuma4[/* NUM_BMODES */];
// clipping tables (for filtering)
extern const int8_t* const VP8ksclip1; // clips [-1020, 1020] to [-128, 127]
extern const int8_t* const VP8ksclip2; // clips [-112, 112] to [-16, 15]
extern const uint8_t* const VP8kclip1; // clips [-255,511] to [0,255]
extern const uint8_t* const VP8kabs0; // abs(x) for x in [-255,255]
-void VP8InitClipTables(void); // must be called first
+// must be called first
+void VP8InitClipTables(void);
// simple filter (only for luma)
typedef void (*VP8SimpleFilterFunc)(uint8_t* p, int stride, int thresh);
@@ -236,13 +311,81 @@ typedef void (*WebPYUV444Converter)(const uint8_t* y,
const uint8_t* u, const uint8_t* v,
uint8_t* dst, int len);
-extern const WebPYUV444Converter WebPYUV444Converters[/* MODE_LAST */];
+extern WebPYUV444Converter WebPYUV444Converters[/* MODE_LAST */];
// Must be called before using the WebPUpsamplers[] (and for premultiplied
// colorspaces like rgbA, rgbA4444, etc)
void WebPInitUpsamplers(void);
// Must be called before using WebPSamplers[]
void WebPInitSamplers(void);
+// Must be called before using WebPYUV444Converters[]
+void WebPInitYUV444Converters(void);
+
+//------------------------------------------------------------------------------
+// ARGB -> YUV converters
+
+// Convert ARGB samples to luma Y.
+extern void (*WebPConvertARGBToY)(const uint32_t* argb, uint8_t* y, int width);
+// Convert ARGB samples to U/V with downsampling. do_store should be '1' for
+// even lines and '0' for odd ones. 'src_width' is the original width, not
+// the U/V one.
+extern void (*WebPConvertARGBToUV)(const uint32_t* argb, uint8_t* u, uint8_t* v,
+ int src_width, int do_store);
+
+// Convert a row of accumulated (four-value) rgba32 samples to U/V.
+extern void (*WebPConvertRGBA32ToUV)(const uint16_t* rgb,
+ uint8_t* u, uint8_t* v, int width);
+
+// Convert RGB or BGR to Y
+extern void (*WebPConvertRGB24ToY)(const uint8_t* rgb, uint8_t* y, int width);
+extern void (*WebPConvertBGR24ToY)(const uint8_t* bgr, uint8_t* y, int width);
+
+// used for plain-C fallback.
+extern void WebPConvertARGBToUV_C(const uint32_t* argb, uint8_t* u, uint8_t* v,
+ int src_width, int do_store);
+extern void WebPConvertRGBA32ToUV_C(const uint16_t* rgb,
+ uint8_t* u, uint8_t* v, int width);
+
+// Must be called before using the above.
+void WebPInitConvertARGBToYUV(void);
+
+//------------------------------------------------------------------------------
+// Rescaler
+
+struct WebPRescaler;
+
+// Import a row of data and save its contribution in the rescaler.
+// 'Expand' corresponds to the wrk->x_expand case; otherwise, 'Shrink' is to
+// be used.
+typedef void (*WebPRescalerImportRowFunc)(struct WebPRescaler* const wrk,
+ const uint8_t* src);
+
+extern WebPRescalerImportRowFunc WebPRescalerImportRowExpand;
+extern WebPRescalerImportRowFunc WebPRescalerImportRowShrink;
+
+// Export one row (starting at x_out position) from rescaler.
+// 'Expand' corresponds to the wrk->y_expand case.
+// Otherwise, 'Shrink' is to be used.
+typedef void (*WebPRescalerExportRowFunc)(struct WebPRescaler* const wrk);
+extern WebPRescalerExportRowFunc WebPRescalerExportRowExpand;
+extern WebPRescalerExportRowFunc WebPRescalerExportRowShrink;
+
+// Plain-C implementations, used as fallback.
+extern void WebPRescalerImportRowExpandC(struct WebPRescaler* const wrk,
+ const uint8_t* src);
+extern void WebPRescalerImportRowShrinkC(struct WebPRescaler* const wrk,
+ const uint8_t* src);
+extern void WebPRescalerExportRowExpandC(struct WebPRescaler* const wrk);
+extern void WebPRescalerExportRowShrinkC(struct WebPRescaler* const wrk);
+
+// Main entry calls:
+extern void WebPRescalerImportRow(struct WebPRescaler* const wrk,
+ const uint8_t* src);
+// Export one row (starting at x_out position) from rescaler.
+extern void WebPRescalerExportRow(struct WebPRescaler* const wrk);
+
+// Must be called first before using the above.
+void WebPRescalerDspInit(void);
//------------------------------------------------------------------------------
// Utilities for processing transparent channel.
@@ -256,6 +399,18 @@ extern void (*WebPApplyAlphaMultiply)(
extern void (*WebPApplyAlphaMultiply4444)(
uint8_t* rgba4444, int w, int h, int stride);
+// Dispatch the values from alpha[] plane to the ARGB destination 'dst'.
+// Returns true if alpha[] plane has non-trivial values different from 0xff.
+extern int (*WebPDispatchAlpha)(const uint8_t* alpha, int alpha_stride,
+ int width, int height,
+ uint8_t* dst, int dst_stride);
+
+// Transfer packed 8b alpha[] values to the green channel in dst[], zeroing the
+// A/R/B values. 'dst_stride' is the stride for dst[] in uint32_t units.
+extern void (*WebPDispatchAlphaToGreen)(const uint8_t* alpha, int alpha_stride,
+ int width, int height,
+ uint32_t* dst, int dst_stride);
+
// Extract the alpha values from 32b values in argb[] and pack them into alpha[]
// (this is the opposite of WebPDispatchAlpha).
// Returns true if there's only trivial 0xff alpha values.
@@ -282,9 +437,59 @@ void WebPMultRows(uint8_t* ptr, int stride,
const uint8_t* alpha, int alpha_stride,
int width, int num_rows, int inverse);
+// Plain-C versions, used as fallback by some implementations.
+void WebPMultRowC(uint8_t* const ptr, const uint8_t* const alpha,
+ int width, int inverse);
+void WebPMultARGBRowC(uint32_t* const ptr, int width, int inverse);
+
// To be called first before using the above.
void WebPInitAlphaProcessing(void);
+// ARGB packing function: a/r/g/b input is rgba or bgra order.
+extern void (*VP8PackARGB)(const uint8_t* a, const uint8_t* r,
+ const uint8_t* g, const uint8_t* b, int len,
+ uint32_t* out);
+
+// RGB packing function. 'step' can be 3 or 4. r/g/b input is rgb or bgr order.
+extern void (*VP8PackRGB)(const uint8_t* r, const uint8_t* g, const uint8_t* b,
+ int len, int step, uint32_t* out);
+
+// To be called first before using the above.
+void VP8EncDspARGBInit(void);
+
+//------------------------------------------------------------------------------
+// Filter functions
+
+typedef enum { // Filter types.
+ WEBP_FILTER_NONE = 0,
+ WEBP_FILTER_HORIZONTAL,
+ WEBP_FILTER_VERTICAL,
+ WEBP_FILTER_GRADIENT,
+ WEBP_FILTER_LAST = WEBP_FILTER_GRADIENT + 1, // end marker
+ WEBP_FILTER_BEST, // meta-types
+ WEBP_FILTER_FAST
+} WEBP_FILTER_TYPE;
+
+typedef void (*WebPFilterFunc)(const uint8_t* in, int width, int height,
+ int stride, uint8_t* out);
+typedef void (*WebPUnfilterFunc)(int width, int height, int stride,
+ int row, int num_rows, uint8_t* data);
+
+// Filter the given data using the given predictor.
+// 'in' corresponds to a 2-dimensional pixel array of size (stride * height)
+// in raster order.
+// 'stride' is number of bytes per scan line (with possible padding).
+// 'out' should be pre-allocated.
+extern WebPFilterFunc WebPFilters[WEBP_FILTER_LAST];
+
+// In-place reconstruct the original data from the given filtered data.
+// The reconstruction will be done for 'num_rows' rows starting from 'row'
+// (assuming rows up to 'row - 1' are already reconstructed).
+extern WebPUnfilterFunc WebPUnfilters[WEBP_FILTER_LAST];
+
+// To be called first before using the above.
+void VP8FiltersInit(void);
+
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/src/3rdparty/libwebp/src/dsp/enc.c b/src/3rdparty/libwebp/src/dsp/enc.c
index f4e72d4..8899d50 100644
--- a/src/3rdparty/libwebp/src/dsp/enc.c
+++ b/src/3rdparty/libwebp/src/dsp/enc.c
@@ -40,10 +40,27 @@ const int VP8DspScan[16 + 4 + 4] = {
8 + 0 * BPS, 12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS // V
};
+// general-purpose util function
+void VP8SetHistogramData(const int distribution[MAX_COEFF_THRESH + 1],
+ VP8Histogram* const histo) {
+ int max_value = 0, last_non_zero = 1;
+ int k;
+ for (k = 0; k <= MAX_COEFF_THRESH; ++k) {
+ const int value = distribution[k];
+ if (value > 0) {
+ if (value > max_value) max_value = value;
+ last_non_zero = k;
+ }
+ }
+ histo->max_value = max_value;
+ histo->last_non_zero = last_non_zero;
+}
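+
+// Worked example (illustrative): distribution = {4, 0, 2, 0, ...} gives
+// histo->max_value == 4 and histo->last_non_zero == 2, the highest index with
+// a non-zero count.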
+
static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
int start_block, int end_block,
VP8Histogram* const histo) {
int j;
+ int distribution[MAX_COEFF_THRESH + 1] = { 0 };
for (j = start_block; j < end_block; ++j) {
int k;
int16_t out[16];
@@ -54,9 +71,10 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
for (k = 0; k < 16; ++k) {
const int v = abs(out[k]) >> 3; // TODO(skal): add rounding?
const int clipped_value = clip_max(v, MAX_COEFF_THRESH);
- histo->distribution[clipped_value]++;
+ ++distribution[clipped_value];
}
}
+ VP8SetHistogramData(distribution, histo);
}
//------------------------------------------------------------------------------
@@ -68,7 +86,7 @@ static uint8_t clip1[255 + 510 + 1]; // clips [-255,510] to [0,255]
// and make sure it's set to true _last_ (so as to be thread-safe)
static volatile int tables_ok = 0;
-static void InitTables(void) {
+static WEBP_TSAN_IGNORE_FUNCTION void InitTables(void) {
if (!tables_ok) {
int i;
for (i = -255; i <= 255 + 255; ++i) {
@@ -159,6 +177,11 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
}
}
+static void FTransform2(const uint8_t* src, const uint8_t* ref, int16_t* out) {
+ VP8FTransform(src, ref, out);
+ VP8FTransform(src + 4, ref + 4, out + 16);
+}
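+// (The second 4x4 block sits 4 pixels to the right in src/ref and writes to
+// out[16..31], matching the VP8FTransform2 declaration in dsp.h.)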
+
static void FTransformWHT(const int16_t* in, int16_t* out) {
// input is 12b signed
int32_t tmp[16];
@@ -195,8 +218,6 @@ static void FTransformWHT(const int16_t* in, int16_t* out) {
//------------------------------------------------------------------------------
// Intra predictions
-#define DST(x, y) dst[(x) + (y) * BPS]
-
static WEBP_INLINE void Fill(uint8_t* dst, int value, int size) {
int j;
for (j = 0; j < size; ++j) {
@@ -207,7 +228,7 @@ static WEBP_INLINE void Fill(uint8_t* dst, int value, int size) {
static WEBP_INLINE void VerticalPred(uint8_t* dst,
const uint8_t* top, int size) {
int j;
- if (top) {
+ if (top != NULL) {
for (j = 0; j < size; ++j) memcpy(dst + j * BPS, top, size);
} else {
Fill(dst, 127, size);
@@ -216,7 +237,7 @@ static WEBP_INLINE void VerticalPred(uint8_t* dst,
static WEBP_INLINE void HorizontalPred(uint8_t* dst,
const uint8_t* left, int size) {
- if (left) {
+ if (left != NULL) {
int j;
for (j = 0; j < size; ++j) {
memset(dst + j * BPS, left[j], size);
@@ -229,8 +250,8 @@ static WEBP_INLINE void HorizontalPred(uint8_t* dst,
static WEBP_INLINE void TrueMotion(uint8_t* dst, const uint8_t* left,
const uint8_t* top, int size) {
int y;
- if (left) {
- if (top) {
+ if (left != NULL) {
+ if (top != NULL) {
const uint8_t* const clip = clip1 + 255 - left[-1];
for (y = 0; y < size; ++y) {
const uint8_t* const clip_table = clip + left[y];
@@ -248,7 +269,7 @@ static WEBP_INLINE void TrueMotion(uint8_t* dst, const uint8_t* left,
// is equivalent to VE prediction where you just copy the top samples.
// Note that if top samples are not available, the default value is
// then 129, and not 127 as in the VerticalPred case.
- if (top) {
+ if (top != NULL) {
VerticalPred(dst, top, size);
} else {
Fill(dst, 129, size);
@@ -261,15 +282,15 @@ static WEBP_INLINE void DCMode(uint8_t* dst, const uint8_t* left,
int size, int round, int shift) {
int DC = 0;
int j;
- if (top) {
+ if (top != NULL) {
for (j = 0; j < size; ++j) DC += top[j];
- if (left) { // top and left present
+ if (left != NULL) { // top and left present
for (j = 0; j < size; ++j) DC += left[j];
} else { // top, but no left
DC += DC;
}
DC = (DC + round) >> shift;
- } else if (left) { // left but no top
+ } else if (left != NULL) { // left but no top
for (j = 0; j < size; ++j) DC += left[j];
DC += DC;
DC = (DC + round) >> shift;
@@ -291,8 +312,8 @@ static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
TrueMotion(C8TM8 + dst, left, top, 8);
// V block
dst += 8;
- if (top) top += 8;
- if (left) left += 16;
+ if (top != NULL) top += 8;
+ if (left != NULL) left += 16;
DCMode(C8DC8 + dst, left, top, 8, 8, 4);
VerticalPred(C8VE8 + dst, top, 8);
HorizontalPred(C8HE8 + dst, left, 8);
@@ -313,6 +334,7 @@ static void Intra16Preds(uint8_t* dst,
//------------------------------------------------------------------------------
// luma 4x4 prediction
+#define DST(x, y) dst[(x) + (y) * BPS]
#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
#define AVG2(a, b) (((a) + (b) + 1) >> 1)
@@ -335,10 +357,10 @@ static void HE4(uint8_t* dst, const uint8_t* top) { // horizontal
const int J = top[-3];
const int K = top[-4];
const int L = top[-5];
- *(uint32_t*)(dst + 0 * BPS) = 0x01010101U * AVG3(X, I, J);
- *(uint32_t*)(dst + 1 * BPS) = 0x01010101U * AVG3(I, J, K);
- *(uint32_t*)(dst + 2 * BPS) = 0x01010101U * AVG3(J, K, L);
- *(uint32_t*)(dst + 3 * BPS) = 0x01010101U * AVG3(K, L, L);
+ WebPUint32ToMem(dst + 0 * BPS, 0x01010101U * AVG3(X, I, J));
+ WebPUint32ToMem(dst + 1 * BPS, 0x01010101U * AVG3(I, J, K));
+ WebPUint32ToMem(dst + 2 * BPS, 0x01010101U * AVG3(J, K, L));
+ WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(K, L, L));
}
static void DC4(uint8_t* dst, const uint8_t* top) {
@@ -625,6 +647,14 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
return (last >= 0);
}
+static int Quantize2Blocks(int16_t in[32], int16_t out[32],
+ const VP8Matrix* const mtx) {
+ int nz;
+ nz = VP8EncQuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
+ nz |= VP8EncQuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
+ return nz;
+}
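+
+// Hypothetical caller sketch ('in', 'out' and 'mtx' stand for the usual
+// coefficient buffers and quantization matrix):
+//   const int nz = VP8EncQuantize2Blocks(in, out, &mtx);
+//   if (nz & 1) { /* first block has non-zero levels */ }
+//   if (nz & 2) { /* second block has non-zero levels */ }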
+
static int QuantizeBlockWHT(int16_t in[16], int16_t out[16],
const VP8Matrix* const mtx) {
int n, last = -1;
@@ -654,16 +684,22 @@ static int QuantizeBlockWHT(int16_t in[16], int16_t out[16],
//------------------------------------------------------------------------------
// Block copy
-static WEBP_INLINE void Copy(const uint8_t* src, uint8_t* dst, int size) {
+static WEBP_INLINE void Copy(const uint8_t* src, uint8_t* dst, int w, int h) {
int y;
- for (y = 0; y < size; ++y) {
- memcpy(dst, src, size);
+ for (y = 0; y < h; ++y) {
+ memcpy(dst, src, w);
src += BPS;
dst += BPS;
}
}
-static void Copy4x4(const uint8_t* src, uint8_t* dst) { Copy(src, dst, 4); }
+static void Copy4x4(const uint8_t* src, uint8_t* dst) {
+ Copy(src, dst, 4, 4);
+}
+
+static void Copy16x8(const uint8_t* src, uint8_t* dst) {
+ Copy(src, dst, 16, 8);
+}
//------------------------------------------------------------------------------
// Initialization
@@ -673,6 +709,7 @@ static void Copy4x4(const uint8_t* src, uint8_t* dst) { Copy(src, dst, 4); }
VP8CHisto VP8CollectHistogram;
VP8Idct VP8ITransform;
VP8Fdct VP8FTransform;
+VP8Fdct VP8FTransform2;
VP8WHT VP8FTransformWHT;
VP8Intra4Preds VP8EncPredLuma4;
VP8IntraPreds VP8EncPredLuma16;
@@ -684,18 +721,22 @@ VP8Metric VP8SSE4x4;
VP8WMetric VP8TDisto4x4;
VP8WMetric VP8TDisto16x16;
VP8QuantizeBlock VP8EncQuantizeBlock;
+VP8Quantize2Blocks VP8EncQuantize2Blocks;
VP8QuantizeBlockWHT VP8EncQuantizeBlockWHT;
VP8BlockCopy VP8Copy4x4;
+VP8BlockCopy VP8Copy16x8;
extern void VP8EncDspInitSSE2(void);
+extern void VP8EncDspInitSSE41(void);
extern void VP8EncDspInitAVX2(void);
extern void VP8EncDspInitNEON(void);
extern void VP8EncDspInitMIPS32(void);
+extern void VP8EncDspInitMIPSdspR2(void);
static volatile VP8CPUInfo enc_last_cpuinfo_used =
(VP8CPUInfo)&enc_last_cpuinfo_used;
-void VP8EncDspInit(void) {
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInit(void) {
if (enc_last_cpuinfo_used == VP8GetCPUInfo) return;
VP8DspInit(); // common inverse transforms
@@ -705,6 +746,7 @@ void VP8EncDspInit(void) {
VP8CollectHistogram = CollectHistogram;
VP8ITransform = ITransform;
VP8FTransform = FTransform;
+ VP8FTransform2 = FTransform2;
VP8FTransformWHT = FTransformWHT;
VP8EncPredLuma4 = Intra4Preds;
VP8EncPredLuma16 = Intra16Preds;
@@ -716,14 +758,21 @@ void VP8EncDspInit(void) {
VP8TDisto4x4 = Disto4x4;
VP8TDisto16x16 = Disto16x16;
VP8EncQuantizeBlock = QuantizeBlock;
+ VP8EncQuantize2Blocks = Quantize2Blocks;
VP8EncQuantizeBlockWHT = QuantizeBlockWHT;
VP8Copy4x4 = Copy4x4;
+ VP8Copy16x8 = Copy16x8;
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
if (VP8GetCPUInfo != NULL) {
#if defined(WEBP_USE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
VP8EncDspInitSSE2();
+#if defined(WEBP_USE_SSE41)
+ if (VP8GetCPUInfo(kSSE4_1)) {
+ VP8EncDspInitSSE41();
+ }
+#endif
}
#endif
#if defined(WEBP_USE_AVX2)
@@ -741,7 +790,11 @@ void VP8EncDspInit(void) {
VP8EncDspInitMIPS32();
}
#endif
+#if defined(WEBP_USE_MIPS_DSP_R2)
+ if (VP8GetCPUInfo(kMIPSdspR2)) {
+ VP8EncDspInitMIPSdspR2();
+ }
+#endif
}
enc_last_cpuinfo_used = VP8GetCPUInfo;
}
-
diff --git a/src/3rdparty/libwebp/src/dsp/enc_avx2.c b/src/3rdparty/libwebp/src/dsp/enc_avx2.c
index 372e616..93efb30 100644
--- a/src/3rdparty/libwebp/src/dsp/enc_avx2.c
+++ b/src/3rdparty/libwebp/src/dsp/enc_avx2.c
@@ -18,7 +18,4 @@
//------------------------------------------------------------------------------
// Entry point
-extern void VP8EncDspInitAVX2(void);
-
-void VP8EncDspInitAVX2(void) {
-}
+WEBP_DSP_INIT_STUB(VP8EncDspInitAVX2)
diff --git a/src/3rdparty/libwebp/src/dsp/enc_mips32.c b/src/3rdparty/libwebp/src/dsp/enc_mips32.c
index 6cede18..fd10143 100644
--- a/src/3rdparty/libwebp/src/dsp/enc_mips32.c
+++ b/src/3rdparty/libwebp/src/dsp/enc_mips32.c
@@ -17,13 +17,10 @@
#if defined(WEBP_USE_MIPS32)
+#include "./mips_macro.h"
#include "../enc/vp8enci.h"
#include "../enc/cost.h"
-#if defined(__GNUC__) && defined(__ANDROID__) && LOCAL_GCC_VERSION == 0x409
-#define WORK_AROUND_GCC
-#endif
-
static const int kC1 = 20091 + (1 << 16);
static const int kC2 = 35468;
@@ -59,61 +56,61 @@ static const int kC2 = 35468;
// MUL and STORE macros inlined
// a = clip_8b(a) is replaced with: a = max(a, 0); a = min(a, 255)
// temp0..temp15 holds tmp[0]..tmp[15]
-// A..D - offsets in bytes to load from ref and store to dst buffer
+// A - offset in bytes to load from ref and store to dst buffer
// TEMP0, TEMP4, TEMP8 and TEMP12 - registers for corresponding tmp elements
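+// (BPS is now 32, defined in dsp.h, so the row offsets below are emitted as
+// "BPS*row" strings via the XSTR() macro from mips_macro.h rather than as
+// hard-coded byte literals.)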
-#define HORIZONTAL_PASS(A, B, C, D, TEMP0, TEMP4, TEMP8, TEMP12) \
- "addiu %[" #TEMP0 "], %[" #TEMP0 "], 4 \n\t" \
- "addu %[temp16], %[" #TEMP0 "], %[" #TEMP8 "] \n\t" \
- "subu %[temp17], %[" #TEMP0 "], %[" #TEMP8 "] \n\t" \
- "mul %[" #TEMP0 "], %[" #TEMP4 "], %[kC2] \n\t" \
- "mul %[" #TEMP8 "], %[" #TEMP12 "], %[kC1] \n\t" \
- "mul %[" #TEMP4 "], %[" #TEMP4 "], %[kC1] \n\t" \
- "mul %[" #TEMP12 "], %[" #TEMP12 "], %[kC2] \n\t" \
- "sra %[" #TEMP0 "], %[" #TEMP0 "], 16 \n\t" \
- "sra %[" #TEMP8 "], %[" #TEMP8 "], 16 \n\t" \
- "sra %[" #TEMP4 "], %[" #TEMP4 "], 16 \n\t" \
- "sra %[" #TEMP12 "], %[" #TEMP12 "], 16 \n\t" \
- "subu %[temp18], %[" #TEMP0 "], %[" #TEMP8 "] \n\t" \
- "addu %[temp19], %[" #TEMP4 "], %[" #TEMP12 "] \n\t" \
- "addu %[" #TEMP0 "], %[temp16], %[temp19] \n\t" \
- "addu %[" #TEMP4 "], %[temp17], %[temp18] \n\t" \
- "subu %[" #TEMP8 "], %[temp17], %[temp18] \n\t" \
- "subu %[" #TEMP12 "], %[temp16], %[temp19] \n\t" \
- "lw %[temp20], 0(%[args]) \n\t" \
- "sra %[" #TEMP0 "], %[" #TEMP0 "], 3 \n\t" \
- "sra %[" #TEMP4 "], %[" #TEMP4 "], 3 \n\t" \
- "sra %[" #TEMP8 "], %[" #TEMP8 "], 3 \n\t" \
- "sra %[" #TEMP12 "], %[" #TEMP12 "], 3 \n\t" \
- "lbu %[temp16], " #A "(%[temp20]) \n\t" \
- "lbu %[temp17], " #B "(%[temp20]) \n\t" \
- "lbu %[temp18], " #C "(%[temp20]) \n\t" \
- "lbu %[temp19], " #D "(%[temp20]) \n\t" \
- "addu %[" #TEMP0 "], %[temp16], %[" #TEMP0 "] \n\t" \
- "addu %[" #TEMP4 "], %[temp17], %[" #TEMP4 "] \n\t" \
- "addu %[" #TEMP8 "], %[temp18], %[" #TEMP8 "] \n\t" \
- "addu %[" #TEMP12 "], %[temp19], %[" #TEMP12 "] \n\t" \
- "slt %[temp16], %[" #TEMP0 "], $zero \n\t" \
- "slt %[temp17], %[" #TEMP4 "], $zero \n\t" \
- "slt %[temp18], %[" #TEMP8 "], $zero \n\t" \
- "slt %[temp19], %[" #TEMP12 "], $zero \n\t" \
- "movn %[" #TEMP0 "], $zero, %[temp16] \n\t" \
- "movn %[" #TEMP4 "], $zero, %[temp17] \n\t" \
- "movn %[" #TEMP8 "], $zero, %[temp18] \n\t" \
- "movn %[" #TEMP12 "], $zero, %[temp19] \n\t" \
- "addiu %[temp20], $zero, 255 \n\t" \
- "slt %[temp16], %[" #TEMP0 "], %[temp20] \n\t" \
- "slt %[temp17], %[" #TEMP4 "], %[temp20] \n\t" \
- "slt %[temp18], %[" #TEMP8 "], %[temp20] \n\t" \
- "slt %[temp19], %[" #TEMP12 "], %[temp20] \n\t" \
- "movz %[" #TEMP0 "], %[temp20], %[temp16] \n\t" \
- "movz %[" #TEMP4 "], %[temp20], %[temp17] \n\t" \
- "lw %[temp16], 8(%[args]) \n\t" \
- "movz %[" #TEMP8 "], %[temp20], %[temp18] \n\t" \
- "movz %[" #TEMP12 "], %[temp20], %[temp19] \n\t" \
- "sb %[" #TEMP0 "], " #A "(%[temp16]) \n\t" \
- "sb %[" #TEMP4 "], " #B "(%[temp16]) \n\t" \
- "sb %[" #TEMP8 "], " #C "(%[temp16]) \n\t" \
- "sb %[" #TEMP12 "], " #D "(%[temp16]) \n\t"
+#define HORIZONTAL_PASS(A, TEMP0, TEMP4, TEMP8, TEMP12) \
+ "addiu %[" #TEMP0 "], %[" #TEMP0 "], 4 \n\t" \
+ "addu %[temp16], %[" #TEMP0 "], %[" #TEMP8 "] \n\t" \
+ "subu %[temp17], %[" #TEMP0 "], %[" #TEMP8 "] \n\t" \
+ "mul %[" #TEMP0 "], %[" #TEMP4 "], %[kC2] \n\t" \
+ "mul %[" #TEMP8 "], %[" #TEMP12 "], %[kC1] \n\t" \
+ "mul %[" #TEMP4 "], %[" #TEMP4 "], %[kC1] \n\t" \
+ "mul %[" #TEMP12 "], %[" #TEMP12 "], %[kC2] \n\t" \
+ "sra %[" #TEMP0 "], %[" #TEMP0 "], 16 \n\t" \
+ "sra %[" #TEMP8 "], %[" #TEMP8 "], 16 \n\t" \
+ "sra %[" #TEMP4 "], %[" #TEMP4 "], 16 \n\t" \
+ "sra %[" #TEMP12 "], %[" #TEMP12 "], 16 \n\t" \
+ "subu %[temp18], %[" #TEMP0 "], %[" #TEMP8 "] \n\t" \
+ "addu %[temp19], %[" #TEMP4 "], %[" #TEMP12 "] \n\t" \
+ "addu %[" #TEMP0 "], %[temp16], %[temp19] \n\t" \
+ "addu %[" #TEMP4 "], %[temp17], %[temp18] \n\t" \
+ "subu %[" #TEMP8 "], %[temp17], %[temp18] \n\t" \
+ "subu %[" #TEMP12 "], %[temp16], %[temp19] \n\t" \
+ "lw %[temp20], 0(%[args]) \n\t" \
+ "sra %[" #TEMP0 "], %[" #TEMP0 "], 3 \n\t" \
+ "sra %[" #TEMP4 "], %[" #TEMP4 "], 3 \n\t" \
+ "sra %[" #TEMP8 "], %[" #TEMP8 "], 3 \n\t" \
+ "sra %[" #TEMP12 "], %[" #TEMP12 "], 3 \n\t" \
+ "lbu %[temp16], 0+" XSTR(BPS) "*" #A "(%[temp20]) \n\t" \
+ "lbu %[temp17], 1+" XSTR(BPS) "*" #A "(%[temp20]) \n\t" \
+ "lbu %[temp18], 2+" XSTR(BPS) "*" #A "(%[temp20]) \n\t" \
+ "lbu %[temp19], 3+" XSTR(BPS) "*" #A "(%[temp20]) \n\t" \
+ "addu %[" #TEMP0 "], %[temp16], %[" #TEMP0 "] \n\t" \
+ "addu %[" #TEMP4 "], %[temp17], %[" #TEMP4 "] \n\t" \
+ "addu %[" #TEMP8 "], %[temp18], %[" #TEMP8 "] \n\t" \
+ "addu %[" #TEMP12 "], %[temp19], %[" #TEMP12 "] \n\t" \
+ "slt %[temp16], %[" #TEMP0 "], $zero \n\t" \
+ "slt %[temp17], %[" #TEMP4 "], $zero \n\t" \
+ "slt %[temp18], %[" #TEMP8 "], $zero \n\t" \
+ "slt %[temp19], %[" #TEMP12 "], $zero \n\t" \
+ "movn %[" #TEMP0 "], $zero, %[temp16] \n\t" \
+ "movn %[" #TEMP4 "], $zero, %[temp17] \n\t" \
+ "movn %[" #TEMP8 "], $zero, %[temp18] \n\t" \
+ "movn %[" #TEMP12 "], $zero, %[temp19] \n\t" \
+ "addiu %[temp20], $zero, 255 \n\t" \
+ "slt %[temp16], %[" #TEMP0 "], %[temp20] \n\t" \
+ "slt %[temp17], %[" #TEMP4 "], %[temp20] \n\t" \
+ "slt %[temp18], %[" #TEMP8 "], %[temp20] \n\t" \
+ "slt %[temp19], %[" #TEMP12 "], %[temp20] \n\t" \
+ "movz %[" #TEMP0 "], %[temp20], %[temp16] \n\t" \
+ "movz %[" #TEMP4 "], %[temp20], %[temp17] \n\t" \
+ "lw %[temp16], 8(%[args]) \n\t" \
+ "movz %[" #TEMP8 "], %[temp20], %[temp18] \n\t" \
+ "movz %[" #TEMP12 "], %[temp20], %[temp19] \n\t" \
+ "sb %[" #TEMP0 "], 0+" XSTR(BPS) "*" #A "(%[temp16]) \n\t" \
+ "sb %[" #TEMP4 "], 1+" XSTR(BPS) "*" #A "(%[temp16]) \n\t" \
+ "sb %[" #TEMP8 "], 2+" XSTR(BPS) "*" #A "(%[temp16]) \n\t" \
+ "sb %[" #TEMP12 "], 3+" XSTR(BPS) "*" #A "(%[temp16]) \n\t"
// Does one or two inverse transforms.
static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
@@ -130,10 +127,10 @@ static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
VERTICAL_PASS(4, 20, 12, 28, temp12, temp8, temp9, temp10, temp11)
VERTICAL_PASS(6, 22, 14, 30, temp20, temp12, temp13, temp14, temp15)
- HORIZONTAL_PASS( 0, 1, 2, 3, temp0, temp4, temp8, temp12)
- HORIZONTAL_PASS(16, 17, 18, 19, temp1, temp5, temp9, temp13)
- HORIZONTAL_PASS(32, 33, 34, 35, temp2, temp6, temp10, temp14)
- HORIZONTAL_PASS(48, 49, 50, 51, temp3, temp7, temp11, temp15)
+ HORIZONTAL_PASS(0, temp0, temp4, temp8, temp12)
+ HORIZONTAL_PASS(1, temp1, temp5, temp9, temp13)
+ HORIZONTAL_PASS(2, temp2, temp6, temp10, temp14)
+ HORIZONTAL_PASS(3, temp3, temp7, temp11, temp15)
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
@@ -241,46 +238,54 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
return 0;
}
+static int Quantize2Blocks(int16_t in[32], int16_t out[32],
+ const VP8Matrix* const mtx) {
+ int nz;
+ nz = QuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
+ nz |= QuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
+ return nz;
+}
+
#undef QUANTIZE_ONE
// macro for one horizontal pass in Disto4x4 (TTransform)
// two calls of function TTransform are merged into single one
-// A..D - offsets in bytes to load from a and b buffers
+// A - offset in bytes to load from a and b buffers
// E..H - offsets in bytes to store first results to tmp buffer
// E1..H1 - offsets in bytes to store second results to tmp buffer
-#define HORIZONTAL_PASS(A, B, C, D, E, F, G, H, E1, F1, G1, H1) \
- "lbu %[temp0], " #A "(%[a]) \n\t" \
- "lbu %[temp1], " #B "(%[a]) \n\t" \
- "lbu %[temp2], " #C "(%[a]) \n\t" \
- "lbu %[temp3], " #D "(%[a]) \n\t" \
- "lbu %[temp4], " #A "(%[b]) \n\t" \
- "lbu %[temp5], " #B "(%[b]) \n\t" \
- "lbu %[temp6], " #C "(%[b]) \n\t" \
- "lbu %[temp7], " #D "(%[b]) \n\t" \
- "addu %[temp8], %[temp0], %[temp2] \n\t" \
- "subu %[temp0], %[temp0], %[temp2] \n\t" \
- "addu %[temp2], %[temp1], %[temp3] \n\t" \
- "subu %[temp1], %[temp1], %[temp3] \n\t" \
- "addu %[temp3], %[temp4], %[temp6] \n\t" \
- "subu %[temp4], %[temp4], %[temp6] \n\t" \
- "addu %[temp6], %[temp5], %[temp7] \n\t" \
- "subu %[temp5], %[temp5], %[temp7] \n\t" \
- "addu %[temp7], %[temp8], %[temp2] \n\t" \
- "subu %[temp2], %[temp8], %[temp2] \n\t" \
- "addu %[temp8], %[temp0], %[temp1] \n\t" \
- "subu %[temp0], %[temp0], %[temp1] \n\t" \
- "addu %[temp1], %[temp3], %[temp6] \n\t" \
- "subu %[temp3], %[temp3], %[temp6] \n\t" \
- "addu %[temp6], %[temp4], %[temp5] \n\t" \
- "subu %[temp4], %[temp4], %[temp5] \n\t" \
- "sw %[temp7], " #E "(%[tmp]) \n\t" \
- "sw %[temp2], " #H "(%[tmp]) \n\t" \
- "sw %[temp8], " #F "(%[tmp]) \n\t" \
- "sw %[temp0], " #G "(%[tmp]) \n\t" \
- "sw %[temp1], " #E1 "(%[tmp]) \n\t" \
- "sw %[temp3], " #H1 "(%[tmp]) \n\t" \
- "sw %[temp6], " #F1 "(%[tmp]) \n\t" \
- "sw %[temp4], " #G1 "(%[tmp]) \n\t"
+#define HORIZONTAL_PASS(A, E, F, G, H, E1, F1, G1, H1) \
+ "lbu %[temp0], 0+" XSTR(BPS) "*" #A "(%[a]) \n\t" \
+ "lbu %[temp1], 1+" XSTR(BPS) "*" #A "(%[a]) \n\t" \
+ "lbu %[temp2], 2+" XSTR(BPS) "*" #A "(%[a]) \n\t" \
+ "lbu %[temp3], 3+" XSTR(BPS) "*" #A "(%[a]) \n\t" \
+ "lbu %[temp4], 0+" XSTR(BPS) "*" #A "(%[b]) \n\t" \
+ "lbu %[temp5], 1+" XSTR(BPS) "*" #A "(%[b]) \n\t" \
+ "lbu %[temp6], 2+" XSTR(BPS) "*" #A "(%[b]) \n\t" \
+ "lbu %[temp7], 3+" XSTR(BPS) "*" #A "(%[b]) \n\t" \
+ "addu %[temp8], %[temp0], %[temp2] \n\t" \
+ "subu %[temp0], %[temp0], %[temp2] \n\t" \
+ "addu %[temp2], %[temp1], %[temp3] \n\t" \
+ "subu %[temp1], %[temp1], %[temp3] \n\t" \
+ "addu %[temp3], %[temp4], %[temp6] \n\t" \
+ "subu %[temp4], %[temp4], %[temp6] \n\t" \
+ "addu %[temp6], %[temp5], %[temp7] \n\t" \
+ "subu %[temp5], %[temp5], %[temp7] \n\t" \
+ "addu %[temp7], %[temp8], %[temp2] \n\t" \
+ "subu %[temp2], %[temp8], %[temp2] \n\t" \
+ "addu %[temp8], %[temp0], %[temp1] \n\t" \
+ "subu %[temp0], %[temp0], %[temp1] \n\t" \
+ "addu %[temp1], %[temp3], %[temp6] \n\t" \
+ "subu %[temp3], %[temp3], %[temp6] \n\t" \
+ "addu %[temp6], %[temp4], %[temp5] \n\t" \
+ "subu %[temp4], %[temp4], %[temp5] \n\t" \
+ "sw %[temp7], " #E "(%[tmp]) \n\t" \
+ "sw %[temp2], " #H "(%[tmp]) \n\t" \
+ "sw %[temp8], " #F "(%[tmp]) \n\t" \
+ "sw %[temp0], " #G "(%[tmp]) \n\t" \
+ "sw %[temp1], " #E1 "(%[tmp]) \n\t" \
+ "sw %[temp3], " #H1 "(%[tmp]) \n\t" \
+ "sw %[temp6], " #F1 "(%[tmp]) \n\t" \
+ "sw %[temp4], " #G1 "(%[tmp]) \n\t"
// macro for one vertical pass in Disto4x4 (TTransform)
// two calls of function TTransform are merged into single one
@@ -362,10 +367,10 @@ static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
__asm__ volatile(
- HORIZONTAL_PASS( 0, 1, 2, 3, 0, 4, 8, 12, 64, 68, 72, 76)
- HORIZONTAL_PASS(16, 17, 18, 19, 16, 20, 24, 28, 80, 84, 88, 92)
- HORIZONTAL_PASS(32, 33, 34, 35, 32, 36, 40, 44, 96, 100, 104, 108)
- HORIZONTAL_PASS(48, 49, 50, 51, 48, 52, 56, 60, 112, 116, 120, 124)
+ HORIZONTAL_PASS(0, 0, 4, 8, 12, 64, 68, 72, 76)
+ HORIZONTAL_PASS(1, 16, 20, 24, 28, 80, 84, 88, 92)
+ HORIZONTAL_PASS(2, 32, 36, 40, 44, 96, 100, 104, 108)
+ HORIZONTAL_PASS(3, 48, 52, 56, 60, 112, 116, 120, 124)
"mthi $zero \n\t"
"mtlo $zero \n\t"
VERTICAL_PASS( 0, 16, 32, 48, 64, 80, 96, 112, 0, 8, 16, 24)
@@ -405,41 +410,41 @@ static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
// macro for one horizontal pass in FTransform
// temp0..temp15 holds tmp[0]..tmp[15]
-// A..D - offsets in bytes to load from src and ref buffers
+// A - offset in bytes to load from src and ref buffers
// TEMP0..TEMP3 - registers for corresponding tmp elements
-#define HORIZONTAL_PASS(A, B, C, D, TEMP0, TEMP1, TEMP2, TEMP3) \
- "lw %[" #TEMP1 "], 0(%[args]) \n\t" \
- "lw %[" #TEMP2 "], 4(%[args]) \n\t" \
- "lbu %[temp16], " #A "(%[" #TEMP1 "]) \n\t" \
- "lbu %[temp17], " #A "(%[" #TEMP2 "]) \n\t" \
- "lbu %[temp18], " #B "(%[" #TEMP1 "]) \n\t" \
- "lbu %[temp19], " #B "(%[" #TEMP2 "]) \n\t" \
- "subu %[temp20], %[temp16], %[temp17] \n\t" \
- "lbu %[temp16], " #C "(%[" #TEMP1 "]) \n\t" \
- "lbu %[temp17], " #C "(%[" #TEMP2 "]) \n\t" \
- "subu %[" #TEMP0 "], %[temp18], %[temp19] \n\t" \
- "lbu %[temp18], " #D "(%[" #TEMP1 "]) \n\t" \
- "lbu %[temp19], " #D "(%[" #TEMP2 "]) \n\t" \
- "subu %[" #TEMP1 "], %[temp16], %[temp17] \n\t" \
- "subu %[" #TEMP2 "], %[temp18], %[temp19] \n\t" \
- "addu %[" #TEMP3 "], %[temp20], %[" #TEMP2 "] \n\t" \
- "subu %[" #TEMP2 "], %[temp20], %[" #TEMP2 "] \n\t" \
- "addu %[temp20], %[" #TEMP0 "], %[" #TEMP1 "] \n\t" \
- "subu %[" #TEMP0 "], %[" #TEMP0 "], %[" #TEMP1 "] \n\t" \
- "mul %[temp16], %[" #TEMP2 "], %[c5352] \n\t" \
- "mul %[temp17], %[" #TEMP2 "], %[c2217] \n\t" \
- "mul %[temp18], %[" #TEMP0 "], %[c5352] \n\t" \
- "mul %[temp19], %[" #TEMP0 "], %[c2217] \n\t" \
- "addu %[" #TEMP1 "], %[" #TEMP3 "], %[temp20] \n\t" \
- "subu %[temp20], %[" #TEMP3 "], %[temp20] \n\t" \
- "sll %[" #TEMP0 "], %[" #TEMP1 "], 3 \n\t" \
- "sll %[" #TEMP2 "], %[temp20], 3 \n\t" \
- "addiu %[temp16], %[temp16], 1812 \n\t" \
- "addiu %[temp17], %[temp17], 937 \n\t" \
- "addu %[temp16], %[temp16], %[temp19] \n\t" \
- "subu %[temp17], %[temp17], %[temp18] \n\t" \
- "sra %[" #TEMP1 "], %[temp16], 9 \n\t" \
- "sra %[" #TEMP3 "], %[temp17], 9 \n\t"
+#define HORIZONTAL_PASS(A, TEMP0, TEMP1, TEMP2, TEMP3) \
+ "lw %[" #TEMP1 "], 0(%[args]) \n\t" \
+ "lw %[" #TEMP2 "], 4(%[args]) \n\t" \
+ "lbu %[temp16], 0+" XSTR(BPS) "*" #A "(%[" #TEMP1 "]) \n\t" \
+ "lbu %[temp17], 0+" XSTR(BPS) "*" #A "(%[" #TEMP2 "]) \n\t" \
+ "lbu %[temp18], 1+" XSTR(BPS) "*" #A "(%[" #TEMP1 "]) \n\t" \
+ "lbu %[temp19], 1+" XSTR(BPS) "*" #A "(%[" #TEMP2 "]) \n\t" \
+ "subu %[temp20], %[temp16], %[temp17] \n\t" \
+ "lbu %[temp16], 2+" XSTR(BPS) "*" #A "(%[" #TEMP1 "]) \n\t" \
+ "lbu %[temp17], 2+" XSTR(BPS) "*" #A "(%[" #TEMP2 "]) \n\t" \
+ "subu %[" #TEMP0 "], %[temp18], %[temp19] \n\t" \
+ "lbu %[temp18], 3+" XSTR(BPS) "*" #A "(%[" #TEMP1 "]) \n\t" \
+ "lbu %[temp19], 3+" XSTR(BPS) "*" #A "(%[" #TEMP2 "]) \n\t" \
+ "subu %[" #TEMP1 "], %[temp16], %[temp17] \n\t" \
+ "subu %[" #TEMP2 "], %[temp18], %[temp19] \n\t" \
+ "addu %[" #TEMP3 "], %[temp20], %[" #TEMP2 "] \n\t" \
+ "subu %[" #TEMP2 "], %[temp20], %[" #TEMP2 "] \n\t" \
+ "addu %[temp20], %[" #TEMP0 "], %[" #TEMP1 "] \n\t" \
+ "subu %[" #TEMP0 "], %[" #TEMP0 "], %[" #TEMP1 "] \n\t" \
+ "mul %[temp16], %[" #TEMP2 "], %[c5352] \n\t" \
+ "mul %[temp17], %[" #TEMP2 "], %[c2217] \n\t" \
+ "mul %[temp18], %[" #TEMP0 "], %[c5352] \n\t" \
+ "mul %[temp19], %[" #TEMP0 "], %[c2217] \n\t" \
+ "addu %[" #TEMP1 "], %[" #TEMP3 "], %[temp20] \n\t" \
+ "subu %[temp20], %[" #TEMP3 "], %[temp20] \n\t" \
+ "sll %[" #TEMP0 "], %[" #TEMP1 "], 3 \n\t" \
+ "sll %[" #TEMP2 "], %[temp20], 3 \n\t" \
+ "addiu %[temp16], %[temp16], 1812 \n\t" \
+ "addiu %[temp17], %[temp17], 937 \n\t" \
+ "addu %[temp16], %[temp16], %[temp19] \n\t" \
+ "subu %[temp17], %[temp17], %[temp18] \n\t" \
+ "sra %[" #TEMP1 "], %[temp16], 9 \n\t" \
+ "sra %[" #TEMP3 "], %[temp17], 9 \n\t"
// macro for one vertical pass in FTransform
// temp0..temp15 holds tmp[0]..tmp[15]
@@ -483,10 +488,10 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
{ (const int*)src, (const int*)ref, (const int*)out };
__asm__ volatile(
- HORIZONTAL_PASS( 0, 1, 2, 3, temp0, temp1, temp2, temp3)
- HORIZONTAL_PASS(16, 17, 18, 19, temp4, temp5, temp6, temp7)
- HORIZONTAL_PASS(32, 33, 34, 35, temp8, temp9, temp10, temp11)
- HORIZONTAL_PASS(48, 49, 50, 51, temp12, temp13, temp14, temp15)
+ HORIZONTAL_PASS(0, temp0, temp1, temp2, temp3)
+ HORIZONTAL_PASS(1, temp4, temp5, temp6, temp7)
+ HORIZONTAL_PASS(2, temp8, temp9, temp10, temp11)
+ HORIZONTAL_PASS(3, temp12, temp13, temp14, temp15)
"lw %[temp20], 8(%[args]) \n\t"
VERTICAL_PASS(0, 8, 16, 24, temp0, temp4, temp8, temp12)
VERTICAL_PASS(2, 10, 18, 26, temp1, temp5, temp9, temp13)
@@ -508,118 +513,7 @@ static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
#undef VERTICAL_PASS
#undef HORIZONTAL_PASS
-// Forward declaration.
-extern int VP8GetResidualCostMIPS32(int ctx0, const VP8Residual* const res);
-
-int VP8GetResidualCostMIPS32(int ctx0, const VP8Residual* const res) {
- int n = res->first;
- // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
- int p0 = res->prob[n][ctx0][0];
- const uint16_t* t = res->cost[n][ctx0];
- int cost;
- const int const_2 = 2;
- const int const_255 = 255;
- const int const_max_level = MAX_VARIABLE_LEVEL;
- int res_cost;
- int res_prob;
- int res_coeffs;
- int res_last;
- int v_reg;
- int b_reg;
- int ctx_reg;
- int cost_add, temp_1, temp_2, temp_3;
-
- if (res->last < 0) {
- return VP8BitCost(0, p0);
- }
-
- cost = (ctx0 == 0) ? VP8BitCost(1, p0) : 0;
-
- res_cost = (int)res->cost;
- res_prob = (int)res->prob;
- res_coeffs = (int)res->coeffs;
- res_last = (int)res->last;
-
- __asm__ volatile(
- ".set push \n\t"
- ".set noreorder \n\t"
-
- "sll %[temp_1], %[n], 1 \n\t"
- "addu %[res_coeffs], %[res_coeffs], %[temp_1] \n\t"
- "slt %[temp_2], %[n], %[res_last] \n\t"
- "bnez %[temp_2], 1f \n\t"
- " li %[cost_add], 0 \n\t"
- "b 2f \n\t"
- " nop \n\t"
- "1: \n\t"
- "lh %[v_reg], 0(%[res_coeffs]) \n\t"
- "addu %[b_reg], %[n], %[VP8EncBands] \n\t"
- "move %[temp_1], %[const_max_level] \n\t"
- "addu %[cost], %[cost], %[cost_add] \n\t"
- "negu %[temp_2], %[v_reg] \n\t"
- "slti %[temp_3], %[v_reg], 0 \n\t"
- "movn %[v_reg], %[temp_2], %[temp_3] \n\t"
- "lbu %[b_reg], 1(%[b_reg]) \n\t"
- "li %[cost_add], 0 \n\t"
-
- "sltiu %[temp_3], %[v_reg], 2 \n\t"
- "move %[ctx_reg], %[v_reg] \n\t"
- "movz %[ctx_reg], %[const_2], %[temp_3] \n\t"
- // cost += VP8LevelCost(t, v);
- "slt %[temp_3], %[v_reg], %[const_max_level] \n\t"
- "movn %[temp_1], %[v_reg], %[temp_3] \n\t"
- "sll %[temp_2], %[v_reg], 1 \n\t"
- "addu %[temp_2], %[temp_2], %[VP8LevelFixedCosts] \n\t"
- "lhu %[temp_2], 0(%[temp_2]) \n\t"
- "sll %[temp_1], %[temp_1], 1 \n\t"
- "addu %[temp_1], %[temp_1], %[t] \n\t"
- "lhu %[temp_3], 0(%[temp_1]) \n\t"
- "addu %[cost], %[cost], %[temp_2] \n\t"
-
- // t = res->cost[b][ctx];
- "sll %[temp_1], %[ctx_reg], 7 \n\t"
- "sll %[temp_2], %[ctx_reg], 3 \n\t"
- "addu %[cost], %[cost], %[temp_3] \n\t"
- "addu %[temp_1], %[temp_1], %[temp_2] \n\t"
- "sll %[temp_2], %[b_reg], 3 \n\t"
- "sll %[temp_3], %[b_reg], 5 \n\t"
- "sub %[temp_2], %[temp_3], %[temp_2] \n\t"
- "sll %[temp_3], %[temp_2], 4 \n\t"
- "addu %[temp_1], %[temp_1], %[temp_3] \n\t"
- "addu %[temp_2], %[temp_2], %[res_cost] \n\t"
- "addiu %[n], %[n], 1 \n\t"
- "addu %[t], %[temp_1], %[temp_2] \n\t"
- "slt %[temp_1], %[n], %[res_last] \n\t"
- "bnez %[temp_1], 1b \n\t"
- " addiu %[res_coeffs], %[res_coeffs], 2 \n\t"
- "2: \n\t"
-
- ".set pop \n\t"
- : [cost]"+r"(cost), [t]"+r"(t), [n]"+r"(n), [v_reg]"=&r"(v_reg),
- [ctx_reg]"=&r"(ctx_reg), [b_reg]"=&r"(b_reg), [cost_add]"=&r"(cost_add),
- [temp_1]"=&r"(temp_1), [temp_2]"=&r"(temp_2), [temp_3]"=&r"(temp_3)
- : [const_2]"r"(const_2), [const_255]"r"(const_255), [res_last]"r"(res_last),
- [VP8EntropyCost]"r"(VP8EntropyCost), [VP8EncBands]"r"(VP8EncBands),
- [const_max_level]"r"(const_max_level), [res_prob]"r"(res_prob),
- [VP8LevelFixedCosts]"r"(VP8LevelFixedCosts), [res_coeffs]"r"(res_coeffs),
- [res_cost]"r"(res_cost)
- : "memory"
- );
-
- // Last coefficient is always non-zero
- {
- const int v = abs(res->coeffs[n]);
- assert(v != 0);
- cost += VP8LevelCost(t, v);
- if (n < 15) {
- const int b = VP8EncBands[n + 1];
- const int ctx = (v == 1) ? 1 : 2;
- const int last_p0 = res->prob[b][ctx][0];
- cost += VP8BitCost(0, last_p0);
- }
- }
- return cost;
-}
+#if !defined(WORK_AROUND_GCC)
#define GET_SSE_INNER(A, B, C, D) \
"lbu %[temp0], " #A "(%[a]) \n\t" \
@@ -645,7 +539,6 @@ int VP8GetResidualCostMIPS32(int ctx0, const VP8Residual* const res) {
GET_SSE_INNER(C, C + 1, C + 2, C + 3) \
GET_SSE_INNER(D, D + 1, D + 2, D + 3)
-#if !defined(WORK_AROUND_GCC)
static int SSE16x16(const uint8_t* a, const uint8_t* b) {
int count;
int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
@@ -653,29 +546,29 @@ static int SSE16x16(const uint8_t* a, const uint8_t* b) {
__asm__ volatile(
"mult $zero, $zero \n\t"
- GET_SSE( 0, 4, 8, 12)
- GET_SSE( 16, 20, 24, 28)
- GET_SSE( 32, 36, 40, 44)
- GET_SSE( 48, 52, 56, 60)
- GET_SSE( 64, 68, 72, 76)
- GET_SSE( 80, 84, 88, 92)
- GET_SSE( 96, 100, 104, 108)
- GET_SSE(112, 116, 120, 124)
- GET_SSE(128, 132, 136, 140)
- GET_SSE(144, 148, 152, 156)
- GET_SSE(160, 164, 168, 172)
- GET_SSE(176, 180, 184, 188)
- GET_SSE(192, 196, 200, 204)
- GET_SSE(208, 212, 216, 220)
- GET_SSE(224, 228, 232, 236)
- GET_SSE(240, 244, 248, 252)
+ GET_SSE( 0 * BPS, 4 + 0 * BPS, 8 + 0 * BPS, 12 + 0 * BPS)
+ GET_SSE( 1 * BPS, 4 + 1 * BPS, 8 + 1 * BPS, 12 + 1 * BPS)
+ GET_SSE( 2 * BPS, 4 + 2 * BPS, 8 + 2 * BPS, 12 + 2 * BPS)
+ GET_SSE( 3 * BPS, 4 + 3 * BPS, 8 + 3 * BPS, 12 + 3 * BPS)
+ GET_SSE( 4 * BPS, 4 + 4 * BPS, 8 + 4 * BPS, 12 + 4 * BPS)
+ GET_SSE( 5 * BPS, 4 + 5 * BPS, 8 + 5 * BPS, 12 + 5 * BPS)
+ GET_SSE( 6 * BPS, 4 + 6 * BPS, 8 + 6 * BPS, 12 + 6 * BPS)
+ GET_SSE( 7 * BPS, 4 + 7 * BPS, 8 + 7 * BPS, 12 + 7 * BPS)
+ GET_SSE( 8 * BPS, 4 + 8 * BPS, 8 + 8 * BPS, 12 + 8 * BPS)
+ GET_SSE( 9 * BPS, 4 + 9 * BPS, 8 + 9 * BPS, 12 + 9 * BPS)
+ GET_SSE(10 * BPS, 4 + 10 * BPS, 8 + 10 * BPS, 12 + 10 * BPS)
+ GET_SSE(11 * BPS, 4 + 11 * BPS, 8 + 11 * BPS, 12 + 11 * BPS)
+ GET_SSE(12 * BPS, 4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS)
+ GET_SSE(13 * BPS, 4 + 13 * BPS, 8 + 13 * BPS, 12 + 13 * BPS)
+ GET_SSE(14 * BPS, 4 + 14 * BPS, 8 + 14 * BPS, 12 + 14 * BPS)
+ GET_SSE(15 * BPS, 4 + 15 * BPS, 8 + 15 * BPS, 12 + 15 * BPS)
"mflo %[count] \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [count]"=&r"(count)
: [a]"r"(a), [b]"r"(b)
- : "memory", "hi" , "lo"
+ : "memory", "hi", "lo"
);
return count;
}
@@ -687,21 +580,21 @@ static int SSE16x8(const uint8_t* a, const uint8_t* b) {
__asm__ volatile(
"mult $zero, $zero \n\t"
- GET_SSE( 0, 4, 8, 12)
- GET_SSE( 16, 20, 24, 28)
- GET_SSE( 32, 36, 40, 44)
- GET_SSE( 48, 52, 56, 60)
- GET_SSE( 64, 68, 72, 76)
- GET_SSE( 80, 84, 88, 92)
- GET_SSE( 96, 100, 104, 108)
- GET_SSE(112, 116, 120, 124)
+ GET_SSE( 0 * BPS, 4 + 0 * BPS, 8 + 0 * BPS, 12 + 0 * BPS)
+ GET_SSE( 1 * BPS, 4 + 1 * BPS, 8 + 1 * BPS, 12 + 1 * BPS)
+ GET_SSE( 2 * BPS, 4 + 2 * BPS, 8 + 2 * BPS, 12 + 2 * BPS)
+ GET_SSE( 3 * BPS, 4 + 3 * BPS, 8 + 3 * BPS, 12 + 3 * BPS)
+ GET_SSE( 4 * BPS, 4 + 4 * BPS, 8 + 4 * BPS, 12 + 4 * BPS)
+ GET_SSE( 5 * BPS, 4 + 5 * BPS, 8 + 5 * BPS, 12 + 5 * BPS)
+ GET_SSE( 6 * BPS, 4 + 6 * BPS, 8 + 6 * BPS, 12 + 6 * BPS)
+ GET_SSE( 7 * BPS, 4 + 7 * BPS, 8 + 7 * BPS, 12 + 7 * BPS)
"mflo %[count] \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [count]"=&r"(count)
: [a]"r"(a), [b]"r"(b)
- : "memory", "hi" , "lo"
+ : "memory", "hi", "lo"
);
return count;
}
@@ -713,17 +606,17 @@ static int SSE8x8(const uint8_t* a, const uint8_t* b) {
__asm__ volatile(
"mult $zero, $zero \n\t"
- GET_SSE( 0, 4, 16, 20)
- GET_SSE(32, 36, 48, 52)
- GET_SSE(64, 68, 80, 84)
- GET_SSE(96, 100, 112, 116)
+ GET_SSE(0 * BPS, 4 + 0 * BPS, 1 * BPS, 4 + 1 * BPS)
+ GET_SSE(2 * BPS, 4 + 2 * BPS, 3 * BPS, 4 + 3 * BPS)
+ GET_SSE(4 * BPS, 4 + 4 * BPS, 5 * BPS, 4 + 5 * BPS)
+ GET_SSE(6 * BPS, 4 + 6 * BPS, 7 * BPS, 4 + 7 * BPS)
"mflo %[count] \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [count]"=&r"(count)
: [a]"r"(a), [b]"r"(b)
- : "memory", "hi" , "lo"
+ : "memory", "hi", "lo"
);
return count;
}
@@ -735,42 +628,45 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) {
__asm__ volatile(
"mult $zero, $zero \n\t"
- GET_SSE(0, 16, 32, 48)
+ GET_SSE(0 * BPS, 1 * BPS, 2 * BPS, 3 * BPS)
"mflo %[count] \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [count]"=&r"(count)
: [a]"r"(a), [b]"r"(b)
- : "memory", "hi" , "lo"
+ : "memory", "hi", "lo"
);
return count;
}
-#endif // WORK_AROUND_GCC
+#undef GET_SSE
+#undef GET_SSE_INNER
-#undef GET_SSE_MIPS32
-#undef GET_SSE_MIPS32_INNER
-
-#endif // WEBP_USE_MIPS32
+#endif // !WORK_AROUND_GCC
//------------------------------------------------------------------------------
// Entry point
extern void VP8EncDspInitMIPS32(void);
-void VP8EncDspInitMIPS32(void) {
-#if defined(WEBP_USE_MIPS32)
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMIPS32(void) {
VP8ITransform = ITransform;
+ VP8FTransform = FTransform;
VP8EncQuantizeBlock = QuantizeBlock;
+ VP8EncQuantize2Blocks = Quantize2Blocks;
VP8TDisto4x4 = Disto4x4;
VP8TDisto16x16 = Disto16x16;
- VP8FTransform = FTransform;
#if !defined(WORK_AROUND_GCC)
VP8SSE16x16 = SSE16x16;
VP8SSE8x8 = SSE8x8;
VP8SSE16x8 = SSE16x8;
VP8SSE4x4 = SSE4x4;
#endif
-#endif // WEBP_USE_MIPS32
}
+
+#else // !WEBP_USE_MIPS32
+
+WEBP_DSP_INIT_STUB(VP8EncDspInitMIPS32)
+
+#endif // WEBP_USE_MIPS32
diff --git a/src/3rdparty/libwebp/src/dsp/enc_mips_dsp_r2.c b/src/3rdparty/libwebp/src/dsp/enc_mips_dsp_r2.c
new file mode 100644
index 0000000..7c814fa
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/enc_mips_dsp_r2.c
@@ -0,0 +1,1512 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MIPS DSP r2 version of speed-critical encoding functions.
+//
+// Author(s): Darko Laus (darko.laus@imgtec.com)
+// Mirko Raus (mirko.raus@imgtec.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_MIPS_DSP_R2)
+
+#include "./mips_macro.h"
+#include "../enc/cost.h"
+#include "../enc/vp8enci.h"
+
+static const int kC1 = 20091 + (1 << 16);
+static const int kC2 = 35468;
+
+// O - output
+// I - input (macro doesn't change it)
+#define ADD_SUB_HALVES_X4(O0, O1, O2, O3, O4, O5, O6, O7, \
+ I0, I1, I2, I3, I4, I5, I6, I7) \
+ "addq.ph %[" #O0 "], %[" #I0 "], %[" #I1 "] \n\t" \
+ "subq.ph %[" #O1 "], %[" #I0 "], %[" #I1 "] \n\t" \
+ "addq.ph %[" #O2 "], %[" #I2 "], %[" #I3 "] \n\t" \
+ "subq.ph %[" #O3 "], %[" #I2 "], %[" #I3 "] \n\t" \
+ "addq.ph %[" #O4 "], %[" #I4 "], %[" #I5 "] \n\t" \
+ "subq.ph %[" #O5 "], %[" #I4 "], %[" #I5 "] \n\t" \
+ "addq.ph %[" #O6 "], %[" #I6 "], %[" #I7 "] \n\t" \
+ "subq.ph %[" #O7 "], %[" #I6 "], %[" #I7 "] \n\t"
+
+// IO - input/output
+#define ABS_X8(IO0, IO1, IO2, IO3, IO4, IO5, IO6, IO7) \
+ "absq_s.ph %[" #IO0 "], %[" #IO0 "] \n\t" \
+ "absq_s.ph %[" #IO1 "], %[" #IO1 "] \n\t" \
+ "absq_s.ph %[" #IO2 "], %[" #IO2 "] \n\t" \
+ "absq_s.ph %[" #IO3 "], %[" #IO3 "] \n\t" \
+ "absq_s.ph %[" #IO4 "], %[" #IO4 "] \n\t" \
+ "absq_s.ph %[" #IO5 "], %[" #IO5 "] \n\t" \
+ "absq_s.ph %[" #IO6 "], %[" #IO6 "] \n\t" \
+ "absq_s.ph %[" #IO7 "], %[" #IO7 "] \n\t"
+
+// dpa.w.ph $ac0, temp0, temp1
+//   $ac += temp0[31..16] * temp1[31..16] + temp0[15..0] * temp1[15..0]
+// dpax.w.ph $ac0, temp0, temp1
+//   $ac += temp0[31..16] * temp1[15..0] + temp0[15..0] * temp1[31..16]
+// O - output
+// I - input (macro doesn't change it)
+#define MUL_HALF(O0, I0, I1, I2, I3, I4, I5, I6, I7, \
+ I8, I9, I10, I11, I12, I13, I14, I15) \
+ "mult $ac0, $zero, $zero \n\t" \
+ "dpa.w.ph $ac0, %[" #I2 "], %[" #I0 "] \n\t" \
+ "dpax.w.ph $ac0, %[" #I5 "], %[" #I6 "] \n\t" \
+ "dpa.w.ph $ac0, %[" #I8 "], %[" #I9 "] \n\t" \
+ "dpax.w.ph $ac0, %[" #I11 "], %[" #I4 "] \n\t" \
+ "dpa.w.ph $ac0, %[" #I12 "], %[" #I7 "] \n\t" \
+ "dpax.w.ph $ac0, %[" #I13 "], %[" #I1 "] \n\t" \
+ "dpa.w.ph $ac0, %[" #I14 "], %[" #I3 "] \n\t" \
+ "dpax.w.ph $ac0, %[" #I15 "], %[" #I10 "] \n\t" \
+ "mflo %[" #O0 "], $ac0 \n\t"
+
+#define OUTPUT_EARLY_CLOBBER_REGS_17() \
+ OUTPUT_EARLY_CLOBBER_REGS_10(), \
+ [temp11]"=&r"(temp11), [temp12]"=&r"(temp12), [temp13]"=&r"(temp13), \
+ [temp14]"=&r"(temp14), [temp15]"=&r"(temp15), [temp16]"=&r"(temp16), \
+ [temp17]"=&r"(temp17)
+
+// macro for one horizontal pass in FTransform
+// temp0..temp15 holds tmp[0]..tmp[15]
+// A - offset in bytes to load from src and ref buffers
+// TEMP0..TEMP3 - registers for corresponding tmp elements
+#define HORIZONTAL_PASS(A, TEMP0, TEMP1, TEMP2, TEMP3) \
+ "lw %[" #TEMP0 "], 0(%[args]) \n\t" \
+ "lw %[" #TEMP1 "], 4(%[args]) \n\t" \
+ "lw %[" #TEMP2 "], " XSTR(BPS) "*" #A "(%[" #TEMP0 "]) \n\t" \
+ "lw %[" #TEMP3 "], " XSTR(BPS) "*" #A "(%[" #TEMP1 "]) \n\t" \
+ "preceu.ph.qbl %[" #TEMP0 "], %[" #TEMP2 "] \n\t" \
+ "preceu.ph.qbl %[" #TEMP1 "], %[" #TEMP3 "] \n\t" \
+ "preceu.ph.qbr %[" #TEMP2 "], %[" #TEMP2 "] \n\t" \
+ "preceu.ph.qbr %[" #TEMP3 "], %[" #TEMP3 "] \n\t" \
+ "subq.ph %[" #TEMP0 "], %[" #TEMP0 "], %[" #TEMP1 "] \n\t" \
+ "subq.ph %[" #TEMP2 "], %[" #TEMP2 "], %[" #TEMP3 "] \n\t" \
+ "rotr %[" #TEMP0 "], %[" #TEMP0 "], 16 \n\t" \
+ "addq.ph %[" #TEMP1 "], %[" #TEMP2 "], %[" #TEMP0 "] \n\t" \
+ "subq.ph %[" #TEMP3 "], %[" #TEMP2 "], %[" #TEMP0 "] \n\t" \
+ "seh %[" #TEMP0 "], %[" #TEMP1 "] \n\t" \
+ "sra %[temp16], %[" #TEMP1 "], 16 \n\t" \
+ "seh %[temp19], %[" #TEMP3 "] \n\t" \
+ "sra %[" #TEMP3 "], %[" #TEMP3 "], 16 \n\t" \
+ "subu %[" #TEMP2 "], %[" #TEMP0 "], %[temp16] \n\t" \
+ "addu %[" #TEMP0 "], %[" #TEMP0 "], %[temp16] \n\t" \
+ "mul %[temp17], %[temp19], %[c2217] \n\t" \
+ "mul %[temp18], %[" #TEMP3 "], %[c5352] \n\t" \
+ "mul %[" #TEMP1 "], %[temp19], %[c5352] \n\t" \
+ "mul %[temp16], %[" #TEMP3 "], %[c2217] \n\t" \
+ "sll %[" #TEMP2 "], %[" #TEMP2 "], 3 \n\t" \
+ "sll %[" #TEMP0 "], %[" #TEMP0 "], 3 \n\t" \
+ "subu %[" #TEMP3 "], %[temp17], %[temp18] \n\t" \
+ "addu %[" #TEMP1 "], %[temp16], %[" #TEMP1 "] \n\t" \
+ "addiu %[" #TEMP3 "], %[" #TEMP3 "], 937 \n\t" \
+ "addiu %[" #TEMP1 "], %[" #TEMP1 "], 1812 \n\t" \
+ "sra %[" #TEMP3 "], %[" #TEMP3 "], 9 \n\t" \
+ "sra %[" #TEMP1 "], %[" #TEMP1 "], 9 \n\t"
+
+// macro for one vertical pass in FTransform
+// temp0..temp15 holds tmp[0]..tmp[15]
+// A..D - offsets in bytes to store to out buffer
+// TEMP0, TEMP4, TEMP8 and TEMP12 - registers for corresponding tmp elements
+#define VERTICAL_PASS(A, B, C, D, TEMP0, TEMP4, TEMP8, TEMP12) \
+ "addu %[temp16], %[" #TEMP0 "], %[" #TEMP12 "] \n\t" \
+ "subu %[temp19], %[" #TEMP0 "], %[" #TEMP12 "] \n\t" \
+ "addu %[temp17], %[" #TEMP4 "], %[" #TEMP8 "] \n\t" \
+ "subu %[temp18], %[" #TEMP4 "], %[" #TEMP8 "] \n\t" \
+ "mul %[" #TEMP8 "], %[temp19], %[c2217] \n\t" \
+ "mul %[" #TEMP12 "], %[temp18], %[c2217] \n\t" \
+ "mul %[" #TEMP4 "], %[temp19], %[c5352] \n\t" \
+ "mul %[temp18], %[temp18], %[c5352] \n\t" \
+ "addiu %[temp16], %[temp16], 7 \n\t" \
+ "addu %[" #TEMP0 "], %[temp16], %[temp17] \n\t" \
+ "sra %[" #TEMP0 "], %[" #TEMP0 "], 4 \n\t" \
+ "addu %[" #TEMP12 "], %[" #TEMP12 "], %[" #TEMP4 "] \n\t" \
+ "subu %[" #TEMP4 "], %[temp16], %[temp17] \n\t" \
+ "sra %[" #TEMP4 "], %[" #TEMP4 "], 4 \n\t" \
+ "addiu %[" #TEMP8 "], %[" #TEMP8 "], 30000 \n\t" \
+ "addiu %[" #TEMP12 "], %[" #TEMP12 "], 12000 \n\t" \
+ "addiu %[" #TEMP8 "], %[" #TEMP8 "], 21000 \n\t" \
+ "subu %[" #TEMP8 "], %[" #TEMP8 "], %[temp18] \n\t" \
+ "sra %[" #TEMP12 "], %[" #TEMP12 "], 16 \n\t" \
+ "sra %[" #TEMP8 "], %[" #TEMP8 "], 16 \n\t" \
+ "addiu %[temp16], %[" #TEMP12 "], 1 \n\t" \
+ "movn %[" #TEMP12 "], %[temp16], %[temp19] \n\t" \
+ "sh %[" #TEMP0 "], " #A "(%[temp20]) \n\t" \
+ "sh %[" #TEMP4 "], " #C "(%[temp20]) \n\t" \
+ "sh %[" #TEMP8 "], " #D "(%[temp20]) \n\t" \
+ "sh %[" #TEMP12 "], " #B "(%[temp20]) \n\t"
+
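+// A scalar sketch (not compiled) of what one VERTICAL_PASS computes for
+// column i, again mirroring the portable FTransform; the 51000 rounding
+// constant is split into 30000 + 21000 above because addiu immediates are
+// 16-bit signed:
+#if 0
+  {
+    const int a0 = tmp[0 + i] + tmp[12 + i];
+    const int a1 = tmp[4 + i] + tmp[8 + i];
+    const int a2 = tmp[4 + i] - tmp[8 + i];
+    const int a3 = tmp[0 + i] - tmp[12 + i];
+    out[0 + i]  = (a0 + a1 + 7) >> 4;                              // TEMP0
+    out[4 + i]  = ((a2 * 2217 + a3 * 5352 + 12000) >> 16) + (a3 != 0);
+    out[8 + i]  = (a0 - a1 + 7) >> 4;                              // TEMP4
+    out[12 + i] = (a3 * 2217 - a2 * 5352 + 51000) >> 16;           // TEMP8
+  }
+#endif
+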
+static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
+ const int c2217 = 2217;
+ const int c5352 = 5352;
+ int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
+ int temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16;
+ int temp17, temp18, temp19, temp20;
+ const int* const args[3] =
+ { (const int*)src, (const int*)ref, (const int*)out };
+
+ __asm__ volatile (
+ HORIZONTAL_PASS(0, temp0, temp1, temp2, temp3)
+ HORIZONTAL_PASS(1, temp4, temp5, temp6, temp7)
+ HORIZONTAL_PASS(2, temp8, temp9, temp10, temp11)
+ HORIZONTAL_PASS(3, temp12, temp13, temp14, temp15)
+ "lw %[temp20], 8(%[args]) \n\t"
+ VERTICAL_PASS(0, 8, 16, 24, temp0, temp4, temp8, temp12)
+ VERTICAL_PASS(2, 10, 18, 26, temp1, temp5, temp9, temp13)
+ VERTICAL_PASS(4, 12, 20, 28, temp2, temp6, temp10, temp14)
+ VERTICAL_PASS(6, 14, 22, 30, temp3, temp7, temp11, temp15)
+ OUTPUT_EARLY_CLOBBER_REGS_18(),
+ [temp0]"=&r"(temp0), [temp19]"=&r"(temp19), [temp20]"=&r"(temp20)
+ : [args]"r"(args), [c2217]"r"(c2217), [c5352]"r"(c5352)
+ : "memory", "hi", "lo"
+ );
+}
+
+#undef VERTICAL_PASS
+#undef HORIZONTAL_PASS
+
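+// Inverse transform of one 4x4 coefficient block 'in', added to the 4x4
+// prediction 'ref'; the sums are saturated to 8 bits and stored to 'dst'.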
+static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
+ uint8_t* dst) {
+ int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
+ int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17, temp18;
+
+ __asm__ volatile (
+ "ulw %[temp1], 0(%[in]) \n\t"
+ "ulw %[temp2], 16(%[in]) \n\t"
+ LOAD_IN_X2(temp5, temp6, 24, 26)
+ ADD_SUB_HALVES(temp3, temp4, temp1, temp2)
+ LOAD_IN_X2(temp1, temp2, 8, 10)
+ MUL_SHIFT_SUM(temp7, temp8, temp9, temp10, temp11, temp12, temp13, temp14,
+ temp10, temp8, temp9, temp7, temp1, temp2, temp5, temp6,
+ temp13, temp11, temp14, temp12)
+ INSERT_HALF_X2(temp8, temp7, temp10, temp9)
+ "ulw %[temp17], 4(%[in]) \n\t"
+ "ulw %[temp18], 20(%[in]) \n\t"
+ ADD_SUB_HALVES(temp1, temp2, temp3, temp8)
+ ADD_SUB_HALVES(temp5, temp6, temp4, temp7)
+ ADD_SUB_HALVES(temp7, temp8, temp17, temp18)
+ LOAD_IN_X2(temp17, temp18, 12, 14)
+ LOAD_IN_X2(temp9, temp10, 28, 30)
+ MUL_SHIFT_SUM(temp11, temp12, temp13, temp14, temp15, temp16, temp4, temp17,
+ temp12, temp14, temp11, temp13, temp17, temp18, temp9, temp10,
+ temp15, temp4, temp16, temp17)
+ INSERT_HALF_X2(temp11, temp12, temp13, temp14)
+ ADD_SUB_HALVES(temp17, temp8, temp8, temp11)
+ ADD_SUB_HALVES(temp3, temp4, temp7, temp12)
+
+ // horizontal
+ SRA_16(temp9, temp10, temp11, temp12, temp1, temp2, temp5, temp6)
+ INSERT_HALF_X2(temp1, temp6, temp5, temp2)
+ SRA_16(temp13, temp14, temp15, temp16, temp3, temp4, temp17, temp8)
+ "repl.ph %[temp2], 0x4 \n\t"
+ INSERT_HALF_X2(temp3, temp8, temp17, temp4)
+ "addq.ph %[temp1], %[temp1], %[temp2] \n\t"
+ "addq.ph %[temp6], %[temp6], %[temp2] \n\t"
+ ADD_SUB_HALVES(temp2, temp4, temp1, temp3)
+ ADD_SUB_HALVES(temp5, temp7, temp6, temp8)
+ MUL_SHIFT_SUM(temp1, temp3, temp6, temp8, temp9, temp13, temp17, temp18,
+ temp3, temp13, temp1, temp9, temp9, temp13, temp11, temp15,
+ temp6, temp17, temp8, temp18)
+ MUL_SHIFT_SUM(temp6, temp8, temp18, temp17, temp11, temp15, temp12, temp16,
+ temp8, temp15, temp6, temp11, temp12, temp16, temp10, temp14,
+ temp18, temp12, temp17, temp16)
+ INSERT_HALF_X2(temp1, temp3, temp9, temp13)
+ INSERT_HALF_X2(temp6, temp8, temp11, temp15)
+ SHIFT_R_SUM_X2(temp9, temp10, temp11, temp12, temp13, temp14, temp15,
+ temp16, temp2, temp4, temp5, temp7, temp3, temp1, temp8,
+ temp6)
+ PACK_2_HALVES_TO_WORD(temp1, temp2, temp3, temp4, temp9, temp12, temp13,
+ temp16, temp11, temp10, temp15, temp14)
+ LOAD_WITH_OFFSET_X4(temp10, temp11, temp14, temp15, ref,
+ 0, 0, 0, 0,
+ 0, 1, 2, 3,
+ BPS)
+ CONVERT_2_BYTES_TO_HALF(temp5, temp6, temp7, temp8, temp17, temp18, temp10,
+ temp11, temp10, temp11, temp14, temp15)
+ STORE_SAT_SUM_X2(temp5, temp6, temp7, temp8, temp17, temp18, temp10, temp11,
+ temp9, temp12, temp1, temp2, temp13, temp16, temp3, temp4,
+ dst, 0, 1, 2, 3, BPS)
+
+ OUTPUT_EARLY_CLOBBER_REGS_18()
+ : [dst]"r"(dst), [in]"r"(in), [kC1]"r"(kC1), [kC2]"r"(kC2), [ref]"r"(ref)
+ : "memory", "hi", "lo"
+ );
+}
+
+static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
+ int do_two) {
+ ITransformOne(ref, in, dst);
+ if (do_two) {
+ ITransformOne(ref + 4, in + 16, dst + 4);
+ }
+}
+
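+// Hadamard transform of the 4x4 blocks 'a' and 'b': returns the scaled
+// absolute difference of their weighted transformed coefficients,
+// abs(sum(w * |H(b)|) - sum(w * |H(a)|)) >> 5.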
+static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
+ const uint16_t* const w) {
+ int temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8, temp9;
+ int temp10, temp11, temp12, temp13, temp14, temp15, temp16, temp17;
+
+ __asm__ volatile (
+ LOAD_WITH_OFFSET_X4(temp1, temp2, temp3, temp4, a,
+ 0, 0, 0, 0,
+ 0, 1, 2, 3,
+ BPS)
+    CONVERT_2_BYTES_TO_HALF(temp5, temp6, temp7, temp8, temp9, temp10, temp11,
+ temp12, temp1, temp2, temp3, temp4)
+ ADD_SUB_HALVES_X4(temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8,
+ temp5, temp6, temp7, temp8, temp9, temp10, temp11, temp12)
+ PACK_2_HALVES_TO_WORD(temp9, temp10, temp11, temp12, temp1, temp3, temp5,
+ temp7, temp2, temp4, temp6, temp8)
+ ADD_SUB_HALVES_X4(temp2, temp4, temp6, temp8, temp9, temp1, temp3, temp10,
+ temp1, temp9, temp3, temp10, temp5, temp11, temp7, temp12)
+ ADD_SUB_HALVES_X4(temp5, temp11, temp7, temp2, temp9, temp3, temp6, temp12,
+ temp2, temp9, temp6, temp3, temp4, temp1, temp8, temp10)
+ ADD_SUB_HALVES_X4(temp1, temp4, temp10, temp8, temp7, temp11, temp5, temp2,
+ temp5, temp7, temp11, temp2, temp9, temp6, temp3, temp12)
+ ABS_X8(temp1, temp4, temp10, temp8, temp7, temp11, temp5, temp2)
+ LOAD_WITH_OFFSET_X4(temp3, temp6, temp9, temp12, w,
+ 0, 4, 8, 12,
+ 0, 0, 0, 0,
+ 0)
+ LOAD_WITH_OFFSET_X4(temp13, temp14, temp15, temp16, w,
+ 0, 4, 8, 12,
+ 1, 1, 1, 1,
+ 16)
+ MUL_HALF(temp17, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8,
+ temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16)
+ LOAD_WITH_OFFSET_X4(temp1, temp2, temp3, temp4, b,
+ 0, 0, 0, 0,
+ 0, 1, 2, 3,
+ BPS)
+    CONVERT_2_BYTES_TO_HALF(temp5, temp6, temp7, temp8, temp9, temp10, temp11,
+ temp12, temp1, temp2, temp3, temp4)
+ ADD_SUB_HALVES_X4(temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8,
+ temp5, temp6, temp7, temp8, temp9, temp10, temp11, temp12)
+ PACK_2_HALVES_TO_WORD(temp9, temp10, temp11, temp12, temp1, temp3, temp5,
+ temp7, temp2, temp4, temp6, temp8)
+ ADD_SUB_HALVES_X4(temp2, temp4, temp6, temp8, temp9, temp1, temp3, temp10,
+ temp1, temp9, temp3, temp10, temp5, temp11, temp7, temp12)
+ ADD_SUB_HALVES_X4(temp5, temp11, temp7, temp2, temp9, temp3, temp6, temp12,
+ temp2, temp9, temp6, temp3, temp4, temp1, temp8, temp10)
+ ADD_SUB_HALVES_X4(temp1, temp4, temp10, temp8, temp7, temp11, temp5, temp2,
+ temp5, temp7, temp11, temp2, temp9, temp6, temp3, temp12)
+ ABS_X8(temp1, temp4, temp10, temp8, temp7, temp11, temp5, temp2)
+ LOAD_WITH_OFFSET_X4(temp3, temp6, temp9, temp12, w,
+ 0, 4, 8, 12,
+ 0, 0, 0, 0,
+ 0)
+ LOAD_WITH_OFFSET_X4(temp13, temp14, temp15, temp16, w,
+ 0, 4, 8, 12,
+ 1, 1, 1, 1,
+ 16)
+ MUL_HALF(temp3, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8,
+ temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16)
+ OUTPUT_EARLY_CLOBBER_REGS_17()
+ : [a]"r"(a), [b]"r"(b), [w]"r"(w)
+ : "memory", "hi", "lo"
+ );
+ return abs(temp3 - temp17) >> 5;
+}
+
+static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
+ const uint16_t* const w) {
+ int D = 0;
+ int x, y;
+ for (y = 0; y < 16 * BPS; y += 4 * BPS) {
+ for (x = 0; x < 16; x += 4) {
+ D += Disto4x4(a + x + y, b + x + y, w);
+ }
+ }
+ return D;
+}
+
+//------------------------------------------------------------------------------
+// Intra predictions
+
+#define FILL_PART(J, SIZE) \
+ "usw %[value], 0+" #J "*" XSTR(BPS) "(%[dst]) \n\t" \
+ "usw %[value], 4+" #J "*" XSTR(BPS) "(%[dst]) \n\t" \
+ ".if " #SIZE " == 16 \n\t" \
+ "usw %[value], 8+" #J "*" XSTR(BPS) "(%[dst]) \n\t" \
+ "usw %[value], 12+" #J "*" XSTR(BPS) "(%[dst]) \n\t" \
+ ".endif \n\t"
+
+#define FILL_8_OR_16(DST, VALUE, SIZE) do { \
+ int value = (VALUE); \
+ __asm__ volatile ( \
+ "replv.qb %[value], %[value] \n\t" \
+ FILL_PART( 0, SIZE) \
+ FILL_PART( 1, SIZE) \
+ FILL_PART( 2, SIZE) \
+ FILL_PART( 3, SIZE) \
+ FILL_PART( 4, SIZE) \
+ FILL_PART( 5, SIZE) \
+ FILL_PART( 6, SIZE) \
+ FILL_PART( 7, SIZE) \
+ ".if " #SIZE " == 16 \n\t" \
+ FILL_PART( 8, 16) \
+ FILL_PART( 9, 16) \
+ FILL_PART(10, 16) \
+ FILL_PART(11, 16) \
+ FILL_PART(12, 16) \
+ FILL_PART(13, 16) \
+ FILL_PART(14, 16) \
+ FILL_PART(15, 16) \
+ ".endif \n\t" \
+ : [value]"+&r"(value) \
+ : [dst]"r"((DST)) \
+ : "memory" \
+ ); \
+} while (0)
+
+#define VERTICAL_PRED(DST, TOP, SIZE) \
+static WEBP_INLINE void VerticalPred##SIZE(uint8_t* (DST), \
+ const uint8_t* (TOP)) { \
+ int j; \
+ if ((TOP)) { \
+ for (j = 0; j < (SIZE); ++j) memcpy((DST) + j * BPS, (TOP), (SIZE)); \
+ } else { \
+ FILL_8_OR_16((DST), 127, (SIZE)); \
+ } \
+}
+
+VERTICAL_PRED(dst, top, 8)
+VERTICAL_PRED(dst, top, 16)
+
+#undef VERTICAL_PRED
+
+#define HORIZONTAL_PRED(DST, LEFT, SIZE) \
+static WEBP_INLINE void HorizontalPred##SIZE(uint8_t* (DST), \
+ const uint8_t* (LEFT)) { \
+ if (LEFT) { \
+ int j; \
+ for (j = 0; j < (SIZE); ++j) { \
+ memset((DST) + j * BPS, (LEFT)[j], (SIZE)); \
+ } \
+ } else { \
+ FILL_8_OR_16((DST), 129, (SIZE)); \
+ } \
+}
+
+HORIZONTAL_PRED(dst, left, 8)
+HORIZONTAL_PRED(dst, left, 16)
+
+#undef HORIZONTAL_PRED
+
+#define CLIPPING() \
+ "preceu.ph.qbl %[temp2], %[temp0] \n\t" \
+ "preceu.ph.qbr %[temp0], %[temp0] \n\t" \
+ "preceu.ph.qbl %[temp3], %[temp1] \n\t" \
+ "preceu.ph.qbr %[temp1], %[temp1] \n\t" \
+ "addu.ph %[temp2], %[temp2], %[leftY_1] \n\t" \
+ "addu.ph %[temp0], %[temp0], %[leftY_1] \n\t" \
+ "addu.ph %[temp3], %[temp3], %[leftY_1] \n\t" \
+ "addu.ph %[temp1], %[temp1], %[leftY_1] \n\t" \
+ "shll_s.ph %[temp2], %[temp2], 7 \n\t" \
+ "shll_s.ph %[temp0], %[temp0], 7 \n\t" \
+ "shll_s.ph %[temp3], %[temp3], 7 \n\t" \
+ "shll_s.ph %[temp1], %[temp1], 7 \n\t" \
+ "precrqu_s.qb.ph %[temp0], %[temp2], %[temp0] \n\t" \
+ "precrqu_s.qb.ph %[temp1], %[temp3], %[temp1] \n\t"
+
+#define CLIP_8B_TO_DST(DST, LEFT, TOP, SIZE) do { \
+ int leftY_1 = ((int)(LEFT)[y] << 16) + (LEFT)[y]; \
+ int temp0, temp1, temp2, temp3; \
+ __asm__ volatile ( \
+ "replv.ph %[leftY_1], %[leftY_1] \n\t" \
+ "ulw %[temp0], 0(%[top]) \n\t" \
+ "ulw %[temp1], 4(%[top]) \n\t" \
+ "subu.ph %[leftY_1], %[leftY_1], %[left_1] \n\t" \
+ CLIPPING() \
+ "usw %[temp0], 0(%[dst]) \n\t" \
+ "usw %[temp1], 4(%[dst]) \n\t" \
+ ".if " #SIZE " == 16 \n\t" \
+ "ulw %[temp0], 8(%[top]) \n\t" \
+ "ulw %[temp1], 12(%[top]) \n\t" \
+ CLIPPING() \
+ "usw %[temp0], 8(%[dst]) \n\t" \
+ "usw %[temp1], 12(%[dst]) \n\t" \
+ ".endif \n\t" \
+ : [leftY_1]"+&r"(leftY_1), [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), \
+ [temp2]"=&r"(temp2), [temp3]"=&r"(temp3) \
+ : [left_1]"r"(left_1), [top]"r"((TOP)), [dst]"r"((DST)) \
+ : "memory" \
+ ); \
+} while (0)
+
+#define CLIP_TO_DST(DST, LEFT, TOP, SIZE) do { \
+ int y; \
+ const int left_1 = ((int)(LEFT)[-1] << 16) + (LEFT)[-1]; \
+ for (y = 0; y < (SIZE); ++y) { \
+ CLIP_8B_TO_DST((DST), (LEFT), (TOP), (SIZE)); \
+ (DST) += BPS; \
+ } \
+} while (0)
+
+#define TRUE_MOTION(DST, LEFT, TOP, SIZE) \
+static WEBP_INLINE void TrueMotion##SIZE(uint8_t* (DST), const uint8_t* (LEFT),\
+ const uint8_t* (TOP)) { \
+ if ((LEFT) != NULL) { \
+ if ((TOP) != NULL) { \
+ CLIP_TO_DST((DST), (LEFT), (TOP), (SIZE)); \
+ } else { \
+ HorizontalPred##SIZE((DST), (LEFT)); \
+ } \
+ } else { \
+ /* true motion without left samples (hence: with default 129 value) */ \
+ /* is equivalent to VE prediction where you just copy the top samples. */ \
+ /* Note that if top samples are not available, the default value is */ \
+ /* then 129, and not 127 as in the VerticalPred case. */ \
+ if ((TOP) != NULL) { \
+ VerticalPred##SIZE((DST), (TOP)); \
+ } else { \
+ FILL_8_OR_16((DST), 129, (SIZE)); \
+ } \
+ } \
+}
+
+TRUE_MOTION(dst, left, top, 8)
+TRUE_MOTION(dst, left, top, 16)
+
+#undef TRUE_MOTION
+#undef CLIP_TO_DST
+#undef CLIP_8B_TO_DST
+#undef CLIPPING
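+
+// A scalar sketch (not compiled) of the true-motion rule that CLIPPING
+// vectorizes above, with left[-1] as the shared top-left corner sample:
+#if 0
+static void TrueMotionSketch(uint8_t* dst, const uint8_t* left,
+                             const uint8_t* top, int size) {
+  int x, y;
+  for (y = 0; y < size; ++y) {
+    for (x = 0; x < size; ++x) {
+      const int v = left[y] + top[x] - left[-1];
+      dst[x] = (v < 0) ? 0 : (v > 255) ? 255 : v;   // clip to [0, 255]
+    }
+    dst += BPS;
+  }
+}
+#endif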
+
+static WEBP_INLINE void DCMode16(uint8_t* dst, const uint8_t* left,
+ const uint8_t* top) {
+ int DC, DC1;
+ int temp0, temp1, temp2, temp3;
+
+ __asm__ volatile(
+ "beqz %[top], 2f \n\t"
+ LOAD_WITH_OFFSET_X4(temp0, temp1, temp2, temp3, top,
+ 0, 4, 8, 12,
+ 0, 0, 0, 0,
+ 0)
+ "raddu.w.qb %[temp0], %[temp0] \n\t"
+ "raddu.w.qb %[temp1], %[temp1] \n\t"
+ "raddu.w.qb %[temp2], %[temp2] \n\t"
+ "raddu.w.qb %[temp3], %[temp3] \n\t"
+ "addu %[temp0], %[temp0], %[temp1] \n\t"
+ "addu %[temp2], %[temp2], %[temp3] \n\t"
+ "addu %[DC], %[temp0], %[temp2] \n\t"
+ "move %[DC1], %[DC] \n\t"
+ "beqz %[left], 1f \n\t"
+ LOAD_WITH_OFFSET_X4(temp0, temp1, temp2, temp3, left,
+ 0, 4, 8, 12,
+ 0, 0, 0, 0,
+ 0)
+ "raddu.w.qb %[temp0], %[temp0] \n\t"
+ "raddu.w.qb %[temp1], %[temp1] \n\t"
+ "raddu.w.qb %[temp2], %[temp2] \n\t"
+ "raddu.w.qb %[temp3], %[temp3] \n\t"
+ "addu %[temp0], %[temp0], %[temp1] \n\t"
+ "addu %[temp2], %[temp2], %[temp3] \n\t"
+ "addu %[DC1], %[temp0], %[temp2] \n\t"
+ "1: \n\t"
+ "addu %[DC], %[DC], %[DC1] \n\t"
+ "j 3f \n\t"
+ "2: \n\t"
+ "beqz %[left], 4f \n\t"
+ LOAD_WITH_OFFSET_X4(temp0, temp1, temp2, temp3, left,
+ 0, 4, 8, 12,
+ 0, 0, 0, 0,
+ 0)
+ "raddu.w.qb %[temp0], %[temp0] \n\t"
+ "raddu.w.qb %[temp1], %[temp1] \n\t"
+ "raddu.w.qb %[temp2], %[temp2] \n\t"
+ "raddu.w.qb %[temp3], %[temp3] \n\t"
+ "addu %[temp0], %[temp0], %[temp1] \n\t"
+ "addu %[temp2], %[temp2], %[temp3] \n\t"
+ "addu %[DC], %[temp0], %[temp2] \n\t"
+ "addu %[DC], %[DC], %[DC] \n\t"
+ "3: \n\t"
+ "shra_r.w %[DC], %[DC], 5 \n\t"
+ "j 5f \n\t"
+ "4: \n\t"
+ "li %[DC], 0x80 \n\t"
+ "5: \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [DC]"=&r"(DC),
+ [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [DC1]"=&r"(DC1)
+ : [left]"r"(left), [top]"r"(top)
+ : "memory"
+ );
+
+ FILL_8_OR_16(dst, DC, 16);
+}
+
+static WEBP_INLINE void DCMode8(uint8_t* dst, const uint8_t* left,
+ const uint8_t* top) {
+ int DC, DC1;
+ int temp0, temp1, temp2, temp3;
+
+ __asm__ volatile(
+ "beqz %[top], 2f \n\t"
+ "ulw %[temp0], 0(%[top]) \n\t"
+ "ulw %[temp1], 4(%[top]) \n\t"
+ "raddu.w.qb %[temp0], %[temp0] \n\t"
+ "raddu.w.qb %[temp1], %[temp1] \n\t"
+ "addu %[DC], %[temp0], %[temp1] \n\t"
+ "move %[DC1], %[DC] \n\t"
+ "beqz %[left], 1f \n\t"
+ "ulw %[temp2], 0(%[left]) \n\t"
+ "ulw %[temp3], 4(%[left]) \n\t"
+ "raddu.w.qb %[temp2], %[temp2] \n\t"
+ "raddu.w.qb %[temp3], %[temp3] \n\t"
+ "addu %[DC1], %[temp2], %[temp3] \n\t"
+ "1: \n\t"
+ "addu %[DC], %[DC], %[DC1] \n\t"
+ "j 3f \n\t"
+ "2: \n\t"
+ "beqz %[left], 4f \n\t"
+ "ulw %[temp2], 0(%[left]) \n\t"
+ "ulw %[temp3], 4(%[left]) \n\t"
+ "raddu.w.qb %[temp2], %[temp2] \n\t"
+ "raddu.w.qb %[temp3], %[temp3] \n\t"
+ "addu %[DC], %[temp2], %[temp3] \n\t"
+ "addu %[DC], %[DC], %[DC] \n\t"
+ "3: \n\t"
+ "shra_r.w %[DC], %[DC], 4 \n\t"
+ "j 5f \n\t"
+ "4: \n\t"
+ "li %[DC], 0x80 \n\t"
+ "5: \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [DC]"=&r"(DC),
+ [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [DC1]"=&r"(DC1)
+ : [left]"r"(left), [top]"r"(top)
+ : "memory"
+ );
+
+ FILL_8_OR_16(dst, DC, 8);
+}
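+
+// What DCMode16 and DCMode8 above compute (size is 16 or 8):
+//   both edges available: DC = (sum(top) + sum(left) + size) >> log2(2 * size)
+//   one edge available:   DC = (2 * sum(edge) + size) >> log2(2 * size)
+//   no edges available:   DC = 0x80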
+
+static void DC4(uint8_t* dst, const uint8_t* top) {
+ int temp0, temp1;
+ __asm__ volatile(
+ "ulw %[temp0], 0(%[top]) \n\t"
+ "ulw %[temp1], -5(%[top]) \n\t"
+ "raddu.w.qb %[temp0], %[temp0] \n\t"
+ "raddu.w.qb %[temp1], %[temp1] \n\t"
+ "addu %[temp0], %[temp0], %[temp1] \n\t"
+ "addiu %[temp0], %[temp0], 4 \n\t"
+ "srl %[temp0], %[temp0], 3 \n\t"
+ "replv.qb %[temp0], %[temp0] \n\t"
+ "usw %[temp0], 0*" XSTR(BPS) "(%[dst]) \n\t"
+ "usw %[temp0], 1*" XSTR(BPS) "(%[dst]) \n\t"
+ "usw %[temp0], 2*" XSTR(BPS) "(%[dst]) \n\t"
+ "usw %[temp0], 3*" XSTR(BPS) "(%[dst]) \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1)
+ : [top]"r"(top), [dst]"r"(dst)
+ : "memory"
+ );
+}
+
+static void TM4(uint8_t* dst, const uint8_t* top) {
+ int a10, a32, temp0, temp1, temp2, temp3, temp4, temp5;
+ const int c35 = 0xff00ff;
+ __asm__ volatile (
+ "lbu %[temp1], 0(%[top]) \n\t"
+ "lbu %[a10], 1(%[top]) \n\t"
+ "lbu %[temp2], 2(%[top]) \n\t"
+ "lbu %[a32], 3(%[top]) \n\t"
+ "ulw %[temp0], -5(%[top]) \n\t"
+ "lbu %[temp4], -1(%[top]) \n\t"
+ "append %[a10], %[temp1], 16 \n\t"
+ "append %[a32], %[temp2], 16 \n\t"
+ "replv.ph %[temp4], %[temp4] \n\t"
+ "shrl.ph %[temp1], %[temp0], 8 \n\t"
+ "and %[temp0], %[temp0], %[c35] \n\t"
+ "subu.ph %[temp1], %[temp1], %[temp4] \n\t"
+ "subu.ph %[temp0], %[temp0], %[temp4] \n\t"
+ "srl %[temp2], %[temp1], 16 \n\t"
+ "srl %[temp3], %[temp0], 16 \n\t"
+ "replv.ph %[temp2], %[temp2] \n\t"
+ "replv.ph %[temp3], %[temp3] \n\t"
+ "replv.ph %[temp4], %[temp1] \n\t"
+ "replv.ph %[temp5], %[temp0] \n\t"
+ "addu.ph %[temp0], %[temp3], %[a10] \n\t"
+ "addu.ph %[temp1], %[temp3], %[a32] \n\t"
+ "addu.ph %[temp3], %[temp2], %[a10] \n\t"
+ "addu.ph %[temp2], %[temp2], %[a32] \n\t"
+ "shll_s.ph %[temp0], %[temp0], 7 \n\t"
+ "shll_s.ph %[temp1], %[temp1], 7 \n\t"
+ "shll_s.ph %[temp3], %[temp3], 7 \n\t"
+ "shll_s.ph %[temp2], %[temp2], 7 \n\t"
+ "precrqu_s.qb.ph %[temp0], %[temp1], %[temp0] \n\t"
+ "precrqu_s.qb.ph %[temp1], %[temp2], %[temp3] \n\t"
+ "addu.ph %[temp2], %[temp5], %[a10] \n\t"
+ "addu.ph %[temp3], %[temp5], %[a32] \n\t"
+ "addu.ph %[temp5], %[temp4], %[a10] \n\t"
+ "addu.ph %[temp4], %[temp4], %[a32] \n\t"
+ "shll_s.ph %[temp2], %[temp2], 7 \n\t"
+ "shll_s.ph %[temp3], %[temp3], 7 \n\t"
+ "shll_s.ph %[temp4], %[temp4], 7 \n\t"
+ "shll_s.ph %[temp5], %[temp5], 7 \n\t"
+ "precrqu_s.qb.ph %[temp2], %[temp3], %[temp2] \n\t"
+ "precrqu_s.qb.ph %[temp3], %[temp4], %[temp5] \n\t"
+ "usw %[temp1], 0*" XSTR(BPS) "(%[dst]) \n\t"
+ "usw %[temp0], 1*" XSTR(BPS) "(%[dst]) \n\t"
+ "usw %[temp3], 2*" XSTR(BPS) "(%[dst]) \n\t"
+ "usw %[temp2], 3*" XSTR(BPS) "(%[dst]) \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [a10]"=&r"(a10), [a32]"=&r"(a32)
+ : [c35]"r"(c35), [top]"r"(top), [dst]"r"(dst)
+ : "memory"
+ );
+}
+
+static void VE4(uint8_t* dst, const uint8_t* top) {
+ int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
+ __asm__ volatile(
+ "ulw %[temp0], -1(%[top]) \n\t"
+ "ulh %[temp1], 3(%[top]) \n\t"
+ "preceu.ph.qbr %[temp2], %[temp0] \n\t"
+ "preceu.ph.qbl %[temp3], %[temp0] \n\t"
+ "preceu.ph.qbr %[temp4], %[temp1] \n\t"
+ "packrl.ph %[temp5], %[temp3], %[temp2] \n\t"
+ "packrl.ph %[temp6], %[temp4], %[temp3] \n\t"
+ "shll.ph %[temp5], %[temp5], 1 \n\t"
+ "shll.ph %[temp6], %[temp6], 1 \n\t"
+ "addq.ph %[temp2], %[temp5], %[temp2] \n\t"
+ "addq.ph %[temp6], %[temp6], %[temp4] \n\t"
+ "addq.ph %[temp2], %[temp2], %[temp3] \n\t"
+ "addq.ph %[temp6], %[temp6], %[temp3] \n\t"
+ "shra_r.ph %[temp2], %[temp2], 2 \n\t"
+ "shra_r.ph %[temp6], %[temp6], 2 \n\t"
+ "precr.qb.ph %[temp4], %[temp6], %[temp2] \n\t"
+ "usw %[temp4], 0*" XSTR(BPS) "(%[dst]) \n\t"
+ "usw %[temp4], 1*" XSTR(BPS) "(%[dst]) \n\t"
+ "usw %[temp4], 2*" XSTR(BPS) "(%[dst]) \n\t"
+ "usw %[temp4], 3*" XSTR(BPS) "(%[dst]) \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6)
+ : [top]"r"(top), [dst]"r"(dst)
+ : "memory"
+ );
+}
+
+static void HE4(uint8_t* dst, const uint8_t* top) {
+ int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
+ __asm__ volatile(
+ "ulw %[temp0], -4(%[top]) \n\t"
+ "lbu %[temp1], -5(%[top]) \n\t"
+ "preceu.ph.qbr %[temp2], %[temp0] \n\t"
+ "preceu.ph.qbl %[temp3], %[temp0] \n\t"
+ "replv.ph %[temp4], %[temp1] \n\t"
+ "packrl.ph %[temp5], %[temp3], %[temp2] \n\t"
+ "packrl.ph %[temp6], %[temp2], %[temp4] \n\t"
+ "shll.ph %[temp5], %[temp5], 1 \n\t"
+ "shll.ph %[temp6], %[temp6], 1 \n\t"
+ "addq.ph %[temp3], %[temp3], %[temp5] \n\t"
+ "addq.ph %[temp3], %[temp3], %[temp2] \n\t"
+ "addq.ph %[temp2], %[temp2], %[temp6] \n\t"
+ "addq.ph %[temp2], %[temp2], %[temp4] \n\t"
+ "shra_r.ph %[temp3], %[temp3], 2 \n\t"
+ "shra_r.ph %[temp2], %[temp2], 2 \n\t"
+ "replv.qb %[temp0], %[temp3] \n\t"
+ "replv.qb %[temp1], %[temp2] \n\t"
+ "srl %[temp3], %[temp3], 16 \n\t"
+ "srl %[temp2], %[temp2], 16 \n\t"
+ "replv.qb %[temp3], %[temp3] \n\t"
+ "replv.qb %[temp2], %[temp2] \n\t"
+ "usw %[temp3], 0*" XSTR(BPS) "(%[dst]) \n\t"
+ "usw %[temp0], 1*" XSTR(BPS) "(%[dst]) \n\t"
+ "usw %[temp2], 2*" XSTR(BPS) "(%[dst]) \n\t"
+ "usw %[temp1], 3*" XSTR(BPS) "(%[dst]) \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6)
+ : [top]"r"(top), [dst]"r"(dst)
+ : "memory"
+ );
+}
+
+static void RD4(uint8_t* dst, const uint8_t* top) {
+ int temp0, temp1, temp2, temp3, temp4, temp5;
+ int temp6, temp7, temp8, temp9, temp10, temp11;
+ __asm__ volatile(
+ "ulw %[temp0], -5(%[top]) \n\t"
+ "ulw %[temp1], -1(%[top]) \n\t"
+ "preceu.ph.qbl %[temp2], %[temp0] \n\t"
+ "preceu.ph.qbr %[temp3], %[temp0] \n\t"
+ "preceu.ph.qbr %[temp4], %[temp1] \n\t"
+ "preceu.ph.qbl %[temp5], %[temp1] \n\t"
+ "packrl.ph %[temp6], %[temp2], %[temp3] \n\t"
+ "packrl.ph %[temp7], %[temp4], %[temp2] \n\t"
+ "packrl.ph %[temp8], %[temp5], %[temp4] \n\t"
+ "shll.ph %[temp6], %[temp6], 1 \n\t"
+ "addq.ph %[temp9], %[temp2], %[temp6] \n\t"
+ "shll.ph %[temp7], %[temp7], 1 \n\t"
+ "addq.ph %[temp9], %[temp9], %[temp3] \n\t"
+ "shll.ph %[temp8], %[temp8], 1 \n\t"
+ "shra_r.ph %[temp9], %[temp9], 2 \n\t"
+ "addq.ph %[temp10], %[temp4], %[temp7] \n\t"
+ "addq.ph %[temp11], %[temp5], %[temp8] \n\t"
+ "addq.ph %[temp10], %[temp10], %[temp2] \n\t"
+ "addq.ph %[temp11], %[temp11], %[temp4] \n\t"
+ "shra_r.ph %[temp10], %[temp10], 2 \n\t"
+ "shra_r.ph %[temp11], %[temp11], 2 \n\t"
+ "lbu %[temp0], 3(%[top]) \n\t"
+ "lbu %[temp1], 2(%[top]) \n\t"
+ "lbu %[temp2], 1(%[top]) \n\t"
+ "sll %[temp1], %[temp1], 1 \n\t"
+ "addu %[temp0], %[temp0], %[temp1] \n\t"
+ "addu %[temp0], %[temp0], %[temp2] \n\t"
+ "precr.qb.ph %[temp9], %[temp10], %[temp9] \n\t"
+ "shra_r.w %[temp0], %[temp0], 2 \n\t"
+ "precr.qb.ph %[temp10], %[temp11], %[temp10] \n\t"
+ "usw %[temp9], 3*" XSTR(BPS) "(%[dst]) \n\t"
+ "usw %[temp10], 1*" XSTR(BPS) "(%[dst]) \n\t"
+ "prepend %[temp9], %[temp11], 8 \n\t"
+ "prepend %[temp10], %[temp0], 8 \n\t"
+ "usw %[temp9], 2*" XSTR(BPS) "(%[dst]) \n\t"
+ "usw %[temp10], 0*" XSTR(BPS) "(%[dst]) \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
+ [temp9]"=&r"(temp9), [temp10]"=&r"(temp10), [temp11]"=&r"(temp11)
+ : [top]"r"(top), [dst]"r"(dst)
+ : "memory"
+ );
+}
+
+static void VR4(uint8_t* dst, const uint8_t* top) {
+ int temp0, temp1, temp2, temp3, temp4;
+ int temp5, temp6, temp7, temp8, temp9;
+ __asm__ volatile (
+ "ulw %[temp0], -4(%[top]) \n\t"
+ "ulw %[temp1], 0(%[top]) \n\t"
+ "preceu.ph.qbl %[temp2], %[temp0] \n\t"
+ "preceu.ph.qbr %[temp0], %[temp0] \n\t"
+ "preceu.ph.qbla %[temp3], %[temp1] \n\t"
+ "preceu.ph.qbra %[temp1], %[temp1] \n\t"
+ "packrl.ph %[temp7], %[temp3], %[temp2] \n\t"
+ "addqh_r.ph %[temp4], %[temp1], %[temp3] \n\t"
+ "move %[temp6], %[temp1] \n\t"
+ "append %[temp1], %[temp2], 16 \n\t"
+ "shll.ph %[temp9], %[temp6], 1 \n\t"
+ "addqh_r.ph %[temp5], %[temp7], %[temp6] \n\t"
+ "shll.ph %[temp8], %[temp7], 1 \n\t"
+ "addu.ph %[temp3], %[temp7], %[temp3] \n\t"
+ "addu.ph %[temp1], %[temp1], %[temp6] \n\t"
+ "packrl.ph %[temp7], %[temp2], %[temp0] \n\t"
+ "addu.ph %[temp6], %[temp0], %[temp2] \n\t"
+ "addu.ph %[temp3], %[temp3], %[temp9] \n\t"
+ "addu.ph %[temp1], %[temp1], %[temp8] \n\t"
+ "shll.ph %[temp7], %[temp7], 1 \n\t"
+ "shra_r.ph %[temp3], %[temp3], 2 \n\t"
+ "shra_r.ph %[temp1], %[temp1], 2 \n\t"
+ "addu.ph %[temp6], %[temp6], %[temp7] \n\t"
+ "shra_r.ph %[temp6], %[temp6], 2 \n\t"
+ "precrq.ph.w %[temp8], %[temp4], %[temp5] \n\t"
+ "append %[temp4], %[temp5], 16 \n\t"
+ "precrq.ph.w %[temp2], %[temp3], %[temp1] \n\t"
+ "append %[temp3], %[temp1], 16 \n\t"
+ "precr.qb.ph %[temp8], %[temp8], %[temp4] \n\t"
+ "precr.qb.ph %[temp3], %[temp2], %[temp3] \n\t"
+ "usw %[temp8], 0*" XSTR(BPS) "(%[dst]) \n\t"
+ "usw %[temp3], 1*" XSTR(BPS) "(%[dst]) \n\t"
+ "append %[temp3], %[temp6], 8 \n\t"
+ "srl %[temp6], %[temp6], 16 \n\t"
+ "append %[temp8], %[temp6], 8 \n\t"
+ "usw %[temp3], 3*" XSTR(BPS) "(%[dst]) \n\t"
+ "usw %[temp8], 2*" XSTR(BPS) "(%[dst]) \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
+ [temp9]"=&r"(temp9)
+ : [top]"r"(top), [dst]"r"(dst)
+ : "memory"
+ );
+}
+
+static void LD4(uint8_t* dst, const uint8_t* top) {
+ int temp0, temp1, temp2, temp3, temp4, temp5;
+ int temp6, temp7, temp8, temp9, temp10, temp11;
+ __asm__ volatile(
+ "ulw %[temp0], 0(%[top]) \n\t"
+ "ulw %[temp1], 4(%[top]) \n\t"
+ "preceu.ph.qbl %[temp2], %[temp0] \n\t"
+ "preceu.ph.qbr %[temp3], %[temp0] \n\t"
+ "preceu.ph.qbr %[temp4], %[temp1] \n\t"
+ "preceu.ph.qbl %[temp5], %[temp1] \n\t"
+ "packrl.ph %[temp6], %[temp2], %[temp3] \n\t"
+ "packrl.ph %[temp7], %[temp4], %[temp2] \n\t"
+ "packrl.ph %[temp8], %[temp5], %[temp4] \n\t"
+ "shll.ph %[temp6], %[temp6], 1 \n\t"
+ "addq.ph %[temp9], %[temp2], %[temp6] \n\t"
+ "shll.ph %[temp7], %[temp7], 1 \n\t"
+ "addq.ph %[temp9], %[temp9], %[temp3] \n\t"
+ "shll.ph %[temp8], %[temp8], 1 \n\t"
+ "shra_r.ph %[temp9], %[temp9], 2 \n\t"
+ "addq.ph %[temp10], %[temp4], %[temp7] \n\t"
+ "addq.ph %[temp11], %[temp5], %[temp8] \n\t"
+ "addq.ph %[temp10], %[temp10], %[temp2] \n\t"
+ "addq.ph %[temp11], %[temp11], %[temp4] \n\t"
+ "shra_r.ph %[temp10], %[temp10], 2 \n\t"
+ "shra_r.ph %[temp11], %[temp11], 2 \n\t"
+ "srl %[temp1], %[temp1], 24 \n\t"
+ "sll %[temp1], %[temp1], 1 \n\t"
+ "raddu.w.qb %[temp5], %[temp5] \n\t"
+ "precr.qb.ph %[temp9], %[temp10], %[temp9] \n\t"
+ "precr.qb.ph %[temp10], %[temp11], %[temp10] \n\t"
+ "addu %[temp1], %[temp1], %[temp5] \n\t"
+ "shra_r.w %[temp1], %[temp1], 2 \n\t"
+ "usw %[temp9], 0*" XSTR(BPS) "(%[dst]) \n\t"
+ "usw %[temp10], 2*" XSTR(BPS) "(%[dst]) \n\t"
+ "prepend %[temp9], %[temp11], 8 \n\t"
+ "prepend %[temp10], %[temp1], 8 \n\t"
+ "usw %[temp9], 1*" XSTR(BPS) "(%[dst]) \n\t"
+ "usw %[temp10], 3*" XSTR(BPS) "(%[dst]) \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
+ [temp9]"=&r"(temp9), [temp10]"=&r"(temp10), [temp11]"=&r"(temp11)
+ : [top]"r"(top), [dst]"r"(dst)
+ : "memory"
+ );
+}
+
+static void VL4(uint8_t* dst, const uint8_t* top) {
+ int temp0, temp1, temp2, temp3, temp4;
+ int temp5, temp6, temp7, temp8, temp9;
+ __asm__ volatile (
+ "ulw %[temp0], 0(%[top]) \n\t"
+ "ulw %[temp1], 4(%[top]) \n\t"
+ "preceu.ph.qbla %[temp2], %[temp0] \n\t"
+ "preceu.ph.qbra %[temp0], %[temp0] \n\t"
+ "preceu.ph.qbl %[temp3], %[temp1] \n\t"
+ "preceu.ph.qbr %[temp1], %[temp1] \n\t"
+ "addqh_r.ph %[temp4], %[temp0], %[temp2] \n\t"
+ "packrl.ph %[temp7], %[temp1], %[temp0] \n\t"
+ "precrq.ph.w %[temp6], %[temp1], %[temp2] \n\t"
+ "shll.ph %[temp9], %[temp2], 1 \n\t"
+ "addqh_r.ph %[temp5], %[temp7], %[temp2] \n\t"
+ "shll.ph %[temp8], %[temp7], 1 \n\t"
+ "addu.ph %[temp2], %[temp2], %[temp6] \n\t"
+ "addu.ph %[temp0], %[temp0], %[temp7] \n\t"
+ "packrl.ph %[temp7], %[temp3], %[temp1] \n\t"
+ "addu.ph %[temp6], %[temp1], %[temp3] \n\t"
+ "addu.ph %[temp2], %[temp2], %[temp8] \n\t"
+ "addu.ph %[temp0], %[temp0], %[temp9] \n\t"
+ "shll.ph %[temp7], %[temp7], 1 \n\t"
+ "shra_r.ph %[temp2], %[temp2], 2 \n\t"
+ "shra_r.ph %[temp0], %[temp0], 2 \n\t"
+ "addu.ph %[temp6], %[temp6], %[temp7] \n\t"
+ "shra_r.ph %[temp6], %[temp6], 2 \n\t"
+ "precrq.ph.w %[temp8], %[temp5], %[temp4] \n\t"
+ "append %[temp5], %[temp4], 16 \n\t"
+ "precrq.ph.w %[temp3], %[temp2], %[temp0] \n\t"
+ "append %[temp2], %[temp0], 16 \n\t"
+ "precr.qb.ph %[temp8], %[temp8], %[temp5] \n\t"
+ "precr.qb.ph %[temp3], %[temp3], %[temp2] \n\t"
+ "usw %[temp8], 0*" XSTR(BPS) "(%[dst]) \n\t"
+ "prepend %[temp8], %[temp6], 8 \n\t"
+ "usw %[temp3], 1*" XSTR(BPS) "(%[dst]) \n\t"
+ "srl %[temp6], %[temp6], 16 \n\t"
+ "prepend %[temp3], %[temp6], 8 \n\t"
+ "usw %[temp8], 2*" XSTR(BPS) "(%[dst]) \n\t"
+ "usw %[temp3], 3*" XSTR(BPS) "(%[dst]) \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
+ [temp9]"=&r"(temp9)
+ : [top]"r"(top), [dst]"r"(dst)
+ : "memory"
+ );
+}
+
+static void HD4(uint8_t* dst, const uint8_t* top) {
+ int temp0, temp1, temp2, temp3, temp4;
+ int temp5, temp6, temp7, temp8, temp9;
+ __asm__ volatile (
+ "ulw %[temp0], -5(%[top]) \n\t"
+ "ulw %[temp1], -1(%[top]) \n\t"
+ "preceu.ph.qbla %[temp2], %[temp0] \n\t"
+ "preceu.ph.qbra %[temp0], %[temp0] \n\t"
+ "preceu.ph.qbl %[temp3], %[temp1] \n\t"
+ "preceu.ph.qbr %[temp1], %[temp1] \n\t"
+ "addqh_r.ph %[temp4], %[temp0], %[temp2] \n\t"
+ "packrl.ph %[temp7], %[temp1], %[temp0] \n\t"
+ "precrq.ph.w %[temp6], %[temp1], %[temp2] \n\t"
+ "shll.ph %[temp9], %[temp2], 1 \n\t"
+ "addqh_r.ph %[temp5], %[temp7], %[temp2] \n\t"
+ "shll.ph %[temp8], %[temp7], 1 \n\t"
+ "addu.ph %[temp2], %[temp2], %[temp6] \n\t"
+ "addu.ph %[temp0], %[temp0], %[temp7] \n\t"
+ "packrl.ph %[temp7], %[temp3], %[temp1] \n\t"
+ "addu.ph %[temp6], %[temp1], %[temp3] \n\t"
+ "addu.ph %[temp2], %[temp2], %[temp8] \n\t"
+ "addu.ph %[temp0], %[temp0], %[temp9] \n\t"
+ "shll.ph %[temp7], %[temp7], 1 \n\t"
+ "shra_r.ph %[temp2], %[temp2], 2 \n\t"
+ "shra_r.ph %[temp0], %[temp0], 2 \n\t"
+ "addu.ph %[temp6], %[temp6], %[temp7] \n\t"
+ "shra_r.ph %[temp6], %[temp6], 2 \n\t"
+ "precrq.ph.w %[temp1], %[temp2], %[temp5] \n\t"
+ "precrq.ph.w %[temp3], %[temp0], %[temp4] \n\t"
+ "precr.qb.ph %[temp7], %[temp6], %[temp1] \n\t"
+ "precr.qb.ph %[temp6], %[temp1], %[temp3] \n\t"
+ "usw %[temp7], 0*" XSTR(BPS) "(%[dst]) \n\t"
+ "usw %[temp6], 1*" XSTR(BPS) "(%[dst]) \n\t"
+ "append %[temp2], %[temp5], 16 \n\t"
+ "append %[temp0], %[temp4], 16 \n\t"
+ "precr.qb.ph %[temp5], %[temp3], %[temp2] \n\t"
+ "precr.qb.ph %[temp4], %[temp2], %[temp0] \n\t"
+ "usw %[temp5], 2*" XSTR(BPS) "(%[dst]) \n\t"
+ "usw %[temp4], 3*" XSTR(BPS) "(%[dst]) \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
+ [temp9]"=&r"(temp9)
+ : [top]"r"(top), [dst]"r"(dst)
+ : "memory"
+ );
+}
+
+static void HU4(uint8_t* dst, const uint8_t* top) {
+ int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+ __asm__ volatile (
+ "ulw %[temp0], -5(%[top]) \n\t"
+ "preceu.ph.qbl %[temp1], %[temp0] \n\t"
+ "preceu.ph.qbr %[temp2], %[temp0] \n\t"
+ "packrl.ph %[temp3], %[temp1], %[temp2] \n\t"
+ "replv.qb %[temp7], %[temp2] \n\t"
+ "addqh_r.ph %[temp4], %[temp1], %[temp3] \n\t"
+ "addqh_r.ph %[temp5], %[temp3], %[temp2] \n\t"
+ "shll.ph %[temp6], %[temp3], 1 \n\t"
+ "addu.ph %[temp3], %[temp2], %[temp3] \n\t"
+ "addu.ph %[temp6], %[temp1], %[temp6] \n\t"
+ "shll.ph %[temp0], %[temp2], 1 \n\t"
+ "addu.ph %[temp6], %[temp6], %[temp2] \n\t"
+ "addu.ph %[temp0], %[temp3], %[temp0] \n\t"
+ "shra_r.ph %[temp6], %[temp6], 2 \n\t"
+ "shra_r.ph %[temp0], %[temp0], 2 \n\t"
+ "packrl.ph %[temp3], %[temp6], %[temp5] \n\t"
+ "precrq.ph.w %[temp2], %[temp6], %[temp4] \n\t"
+ "append %[temp0], %[temp5], 16 \n\t"
+ "precr.qb.ph %[temp3], %[temp3], %[temp2] \n\t"
+ "usw %[temp3], 0*" XSTR(BPS) "(%[dst]) \n\t"
+ "precr.qb.ph %[temp1], %[temp7], %[temp0] \n\t"
+ "usw %[temp7], 3*" XSTR(BPS) "(%[dst]) \n\t"
+ "packrl.ph %[temp2], %[temp1], %[temp3] \n\t"
+ "usw %[temp1], 2*" XSTR(BPS) "(%[dst]) \n\t"
+ "usw %[temp2], 1*" XSTR(BPS) "(%[dst]) \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7)
+ : [top]"r"(top), [dst]"r"(dst)
+ : "memory"
+ );
+}
+
+//------------------------------------------------------------------------------
+// Chroma 8x8 prediction (paragraph 12.2)
+
+static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
+ const uint8_t* top) {
+ // U block
+ DCMode8(C8DC8 + dst, left, top);
+ VerticalPred8(C8VE8 + dst, top);
+ HorizontalPred8(C8HE8 + dst, left);
+ TrueMotion8(C8TM8 + dst, left, top);
+ // V block
+ dst += 8;
+ if (top) top += 8;
+ if (left) left += 16;
+ DCMode8(C8DC8 + dst, left, top);
+ VerticalPred8(C8VE8 + dst, top);
+ HorizontalPred8(C8HE8 + dst, left);
+ TrueMotion8(C8TM8 + dst, left, top);
+}
+
+//------------------------------------------------------------------------------
+// Luma 16x16 prediction (paragraph 12.3)
+
+static void Intra16Preds(uint8_t* dst,
+ const uint8_t* left, const uint8_t* top) {
+ DCMode16(I16DC16 + dst, left, top);
+ VerticalPred16(I16VE16 + dst, top);
+ HorizontalPred16(I16HE16 + dst, left);
+ TrueMotion16(I16TM16 + dst, left, top);
+}
+
+// The left samples are at top[-5 .. -2], the top-left sample is top[-1],
+// the top samples are at top[0..3], and the top-right samples at top[4..7].
+static void Intra4Preds(uint8_t* dst, const uint8_t* top) {
+ DC4(I4DC4 + dst, top);
+ TM4(I4TM4 + dst, top);
+ VE4(I4VE4 + dst, top);
+ HE4(I4HE4 + dst, top);
+ RD4(I4RD4 + dst, top);
+ VR4(I4VR4 + dst, top);
+ LD4(I4LD4 + dst, top);
+ VL4(I4VL4 + dst, top);
+ HD4(I4HD4 + dst, top);
+ HU4(I4HU4 + dst, top);
+}
+
+//------------------------------------------------------------------------------
+// Metric
+
+#if !defined(WORK_AROUND_GCC)
+
+#define GET_SSE_INNER(A) \
+ "lw %[temp0], " #A "(%[a]) \n\t" \
+ "lw %[temp1], " #A "(%[b]) \n\t" \
+ "preceu.ph.qbr %[temp2], %[temp0] \n\t" \
+ "preceu.ph.qbl %[temp0], %[temp0] \n\t" \
+ "preceu.ph.qbr %[temp3], %[temp1] \n\t" \
+ "preceu.ph.qbl %[temp1], %[temp1] \n\t" \
+ "subq.ph %[temp2], %[temp2], %[temp3] \n\t" \
+ "subq.ph %[temp0], %[temp0], %[temp1] \n\t" \
+ "dpa.w.ph $ac0, %[temp2], %[temp2] \n\t" \
+ "dpa.w.ph $ac0, %[temp0], %[temp0] \n\t"
+
+#define GET_SSE(A, B, C, D) \
+ GET_SSE_INNER(A) \
+ GET_SSE_INNER(B) \
+ GET_SSE_INNER(C) \
+ GET_SSE_INNER(D)
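+
+// A scalar sketch (not compiled) of one GET_SSE_INNER step: the two
+// dpa.w.ph instructions accumulate the squared differences of four byte
+// pairs at offset A into the DSP accumulator, read back with mflo:
+#if 0
+  for (i = 0; i < 4; ++i) {
+    const int d = a[A + i] - b[A + i];
+    count += d * d;
+  }
+#endif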
+
+static int SSE16x16(const uint8_t* a, const uint8_t* b) {
+ int count;
+ int temp0, temp1, temp2, temp3;
+ __asm__ volatile (
+ "mult $zero, $zero \n\t"
+ GET_SSE( 0 * BPS, 4 + 0 * BPS, 8 + 0 * BPS, 12 + 0 * BPS)
+ GET_SSE( 1 * BPS, 4 + 1 * BPS, 8 + 1 * BPS, 12 + 1 * BPS)
+ GET_SSE( 2 * BPS, 4 + 2 * BPS, 8 + 2 * BPS, 12 + 2 * BPS)
+ GET_SSE( 3 * BPS, 4 + 3 * BPS, 8 + 3 * BPS, 12 + 3 * BPS)
+ GET_SSE( 4 * BPS, 4 + 4 * BPS, 8 + 4 * BPS, 12 + 4 * BPS)
+ GET_SSE( 5 * BPS, 4 + 5 * BPS, 8 + 5 * BPS, 12 + 5 * BPS)
+ GET_SSE( 6 * BPS, 4 + 6 * BPS, 8 + 6 * BPS, 12 + 6 * BPS)
+ GET_SSE( 7 * BPS, 4 + 7 * BPS, 8 + 7 * BPS, 12 + 7 * BPS)
+ GET_SSE( 8 * BPS, 4 + 8 * BPS, 8 + 8 * BPS, 12 + 8 * BPS)
+ GET_SSE( 9 * BPS, 4 + 9 * BPS, 8 + 9 * BPS, 12 + 9 * BPS)
+ GET_SSE(10 * BPS, 4 + 10 * BPS, 8 + 10 * BPS, 12 + 10 * BPS)
+ GET_SSE(11 * BPS, 4 + 11 * BPS, 8 + 11 * BPS, 12 + 11 * BPS)
+ GET_SSE(12 * BPS, 4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS)
+ GET_SSE(13 * BPS, 4 + 13 * BPS, 8 + 13 * BPS, 12 + 13 * BPS)
+ GET_SSE(14 * BPS, 4 + 14 * BPS, 8 + 14 * BPS, 12 + 14 * BPS)
+ GET_SSE(15 * BPS, 4 + 15 * BPS, 8 + 15 * BPS, 12 + 15 * BPS)
+ "mflo %[count] \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [count]"=&r"(count)
+ : [a]"r"(a), [b]"r"(b)
+ : "memory", "hi", "lo"
+ );
+ return count;
+}
+
+static int SSE16x8(const uint8_t* a, const uint8_t* b) {
+ int count;
+ int temp0, temp1, temp2, temp3;
+ __asm__ volatile (
+ "mult $zero, $zero \n\t"
+ GET_SSE( 0 * BPS, 4 + 0 * BPS, 8 + 0 * BPS, 12 + 0 * BPS)
+ GET_SSE( 1 * BPS, 4 + 1 * BPS, 8 + 1 * BPS, 12 + 1 * BPS)
+ GET_SSE( 2 * BPS, 4 + 2 * BPS, 8 + 2 * BPS, 12 + 2 * BPS)
+ GET_SSE( 3 * BPS, 4 + 3 * BPS, 8 + 3 * BPS, 12 + 3 * BPS)
+ GET_SSE( 4 * BPS, 4 + 4 * BPS, 8 + 4 * BPS, 12 + 4 * BPS)
+ GET_SSE( 5 * BPS, 4 + 5 * BPS, 8 + 5 * BPS, 12 + 5 * BPS)
+ GET_SSE( 6 * BPS, 4 + 6 * BPS, 8 + 6 * BPS, 12 + 6 * BPS)
+ GET_SSE( 7 * BPS, 4 + 7 * BPS, 8 + 7 * BPS, 12 + 7 * BPS)
+ "mflo %[count] \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [count]"=&r"(count)
+ : [a]"r"(a), [b]"r"(b)
+ : "memory", "hi", "lo"
+ );
+ return count;
+}
+
+static int SSE8x8(const uint8_t* a, const uint8_t* b) {
+ int count;
+ int temp0, temp1, temp2, temp3;
+ __asm__ volatile (
+ "mult $zero, $zero \n\t"
+ GET_SSE(0 * BPS, 4 + 0 * BPS, 1 * BPS, 4 + 1 * BPS)
+ GET_SSE(2 * BPS, 4 + 2 * BPS, 3 * BPS, 4 + 3 * BPS)
+ GET_SSE(4 * BPS, 4 + 4 * BPS, 5 * BPS, 4 + 5 * BPS)
+ GET_SSE(6 * BPS, 4 + 6 * BPS, 7 * BPS, 4 + 7 * BPS)
+ "mflo %[count] \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [count]"=&r"(count)
+ : [a]"r"(a), [b]"r"(b)
+ : "memory", "hi", "lo"
+ );
+ return count;
+}
+
+static int SSE4x4(const uint8_t* a, const uint8_t* b) {
+ int count;
+ int temp0, temp1, temp2, temp3;
+ __asm__ volatile (
+ "mult $zero, $zero \n\t"
+ GET_SSE(0 * BPS, 1 * BPS, 2 * BPS, 3 * BPS)
+ "mflo %[count] \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [count]"=&r"(count)
+ : [a]"r"(a), [b]"r"(b)
+ : "memory", "hi", "lo"
+ );
+ return count;
+}
+
+#undef GET_SSE
+#undef GET_SSE_INNER
+
+#endif // !WORK_AROUND_GCC
+
+#undef FILL_8_OR_16
+#undef FILL_PART
+#undef OUTPUT_EARLY_CLOBBER_REGS_17
+#undef MUL_HALF
+#undef ABS_X8
+#undef ADD_SUB_HALVES_X4
+
+//------------------------------------------------------------------------------
+// Quantization
+//
+
+// macro for one pass through the for loop in QuantizeBlock, reading two
+// values at a time; the QUANTDIV division is inlined (a scalar sketch
+// follows the macro)
+// J - offset in bytes (kZigzag[n] * 2)
+// K - offset in bytes (kZigzag[n] * 4)
+// N - offset in bytes (n * 2)
+// N1 - offset in bytes ((n + 1) * 2)
+#define QUANTIZE_ONE(J, K, N, N1) \
+ "ulw %[temp1], " #J "(%[ppin]) \n\t" \
+ "ulw %[temp2], " #J "(%[ppsharpen]) \n\t" \
+ "lhu %[temp3], " #K "(%[ppzthresh]) \n\t" \
+ "lhu %[temp6], " #K "+4(%[ppzthresh]) \n\t" \
+ "absq_s.ph %[temp4], %[temp1] \n\t" \
+ "ins %[temp3], %[temp6], 16, 16 \n\t" \
+ "addu.ph %[coeff], %[temp4], %[temp2] \n\t" \
+ "shra.ph %[sign], %[temp1], 15 \n\t" \
+ "li %[level], 0x10001 \n\t" \
+ "cmp.lt.ph %[temp3], %[coeff] \n\t" \
+ "lhu %[temp1], " #J "(%[ppiq]) \n\t" \
+ "pick.ph %[temp5], %[level], $0 \n\t" \
+ "lw %[temp2], " #K "(%[ppbias]) \n\t" \
+ "beqz %[temp5], 0f \n\t" \
+ "lhu %[temp3], " #J "(%[ppq]) \n\t" \
+ "beq %[temp5], %[level], 1f \n\t" \
+ "andi %[temp5], %[temp5], 0x1 \n\t" \
+ "andi %[temp4], %[coeff], 0xffff \n\t" \
+ "beqz %[temp5], 2f \n\t" \
+ "mul %[level], %[temp4], %[temp1] \n\t" \
+ "sh $0, " #J "+2(%[ppin]) \n\t" \
+ "sh $0, " #N1 "(%[pout]) \n\t" \
+ "addu %[level], %[level], %[temp2] \n\t" \
+ "sra %[level], %[level], 17 \n\t" \
+ "slt %[temp4], %[max_level], %[level] \n\t" \
+ "movn %[level], %[max_level], %[temp4] \n\t" \
+ "andi %[temp6], %[sign], 0xffff \n\t" \
+ "xor %[level], %[level], %[temp6] \n\t" \
+ "subu %[level], %[level], %[temp6] \n\t" \
+ "mul %[temp5], %[level], %[temp3] \n\t" \
+ "or %[ret], %[ret], %[level] \n\t" \
+ "sh %[level], " #N "(%[pout]) \n\t" \
+ "sh %[temp5], " #J "(%[ppin]) \n\t" \
+ "j 3f \n\t" \
+"2: \n\t" \
+ "lhu %[temp1], " #J "+2(%[ppiq]) \n\t" \
+ "srl %[temp5], %[coeff], 16 \n\t" \
+ "mul %[level], %[temp5], %[temp1] \n\t" \
+ "lw %[temp2], " #K "+4(%[ppbias]) \n\t" \
+ "lhu %[temp3], " #J "+2(%[ppq]) \n\t" \
+ "addu %[level], %[level], %[temp2] \n\t" \
+ "sra %[level], %[level], 17 \n\t" \
+ "srl %[temp6], %[sign], 16 \n\t" \
+ "slt %[temp4], %[max_level], %[level] \n\t" \
+ "movn %[level], %[max_level], %[temp4] \n\t" \
+ "xor %[level], %[level], %[temp6] \n\t" \
+ "subu %[level], %[level], %[temp6] \n\t" \
+ "mul %[temp5], %[level], %[temp3] \n\t" \
+ "sh $0, " #J "(%[ppin]) \n\t" \
+ "sh $0, " #N "(%[pout]) \n\t" \
+ "or %[ret], %[ret], %[level] \n\t" \
+ "sh %[temp5], " #J "+2(%[ppin]) \n\t" \
+ "sh %[level], " #N1 "(%[pout]) \n\t" \
+ "j 3f \n\t" \
+"1: \n\t" \
+ "lhu %[temp1], " #J "(%[ppiq]) \n\t" \
+ "lw %[temp2], " #K "(%[ppbias]) \n\t" \
+ "ulw %[temp3], " #J "(%[ppq]) \n\t" \
+ "andi %[temp5], %[coeff], 0xffff \n\t" \
+ "srl %[temp0], %[coeff], 16 \n\t" \
+ "lhu %[temp6], " #J "+2(%[ppiq]) \n\t" \
+ "lw %[coeff], " #K "+4(%[ppbias]) \n\t" \
+ "mul %[level], %[temp5], %[temp1] \n\t" \
+ "mul %[temp4], %[temp0], %[temp6] \n\t" \
+ "addu %[level], %[level], %[temp2] \n\t" \
+ "addu %[temp4], %[temp4], %[coeff] \n\t" \
+ "precrq.ph.w %[level], %[temp4], %[level] \n\t" \
+ "shra.ph %[level], %[level], 1 \n\t" \
+    "cmp.lt.ph        %[max_level1], %[level]            \n\t"                \
+ "pick.ph %[level], %[max_level], %[level] \n\t" \
+ "xor %[level], %[level], %[sign] \n\t" \
+ "subu.ph %[level], %[level], %[sign] \n\t" \
+ "mul.ph %[temp3], %[level], %[temp3] \n\t" \
+ "or %[ret], %[ret], %[level] \n\t" \
+ "sh %[level], " #N "(%[pout]) \n\t" \
+ "srl %[level], %[level], 16 \n\t" \
+ "sh %[level], " #N1 "(%[pout]) \n\t" \
+ "usw %[temp3], " #J "(%[ppin]) \n\t" \
+ "j 3f \n\t" \
+"0: \n\t" \
+ "sh $0, " #N "(%[pout]) \n\t" \
+ "sh $0, " #N1 "(%[pout]) \n\t" \
+ "usw $0, " #J "(%[ppin]) \n\t" \
+"3: \n\t"
+
+static int QuantizeBlock(int16_t in[16], int16_t out[16],
+ const VP8Matrix* const mtx) {
+  int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
+ int sign, coeff, level;
+ int max_level = MAX_LEVEL;
+ int max_level1 = max_level << 16 | max_level;
+ int ret = 0;
+
+ int16_t* ppin = &in[0];
+ int16_t* pout = &out[0];
+ const uint16_t* ppsharpen = &mtx->sharpen_[0];
+ const uint32_t* ppzthresh = &mtx->zthresh_[0];
+ const uint16_t* ppq = &mtx->q_[0];
+ const uint16_t* ppiq = &mtx->iq_[0];
+ const uint32_t* ppbias = &mtx->bias_[0];
+
+ __asm__ volatile (
+ QUANTIZE_ONE( 0, 0, 0, 2)
+ QUANTIZE_ONE( 4, 8, 10, 12)
+ QUANTIZE_ONE( 8, 16, 4, 8)
+ QUANTIZE_ONE(12, 24, 14, 24)
+ QUANTIZE_ONE(16, 32, 6, 16)
+ QUANTIZE_ONE(20, 40, 22, 26)
+ QUANTIZE_ONE(24, 48, 18, 20)
+ QUANTIZE_ONE(28, 56, 28, 30)
+
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+ [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+ [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [sign]"=&r"(sign), [coeff]"=&r"(coeff),
+ [level]"=&r"(level), [temp6]"=&r"(temp6), [ret]"+&r"(ret)
+ : [ppin]"r"(ppin), [pout]"r"(pout), [max_level1]"r"(max_level1),
+ [ppiq]"r"(ppiq), [max_level]"r"(max_level),
+ [ppbias]"r"(ppbias), [ppzthresh]"r"(ppzthresh),
+ [ppsharpen]"r"(ppsharpen), [ppq]"r"(ppq)
+ : "memory", "hi", "lo"
+ );
+
+ return (ret != 0);
+}
+
+static int Quantize2Blocks(int16_t in[32], int16_t out[32],
+ const VP8Matrix* const mtx) {
+ int nz;
+ nz = QuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
+ nz |= QuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
+ return nz;
+}
+
+#undef QUANTIZE_ONE
+
+// macro for one horizontal pass in FTransformWHT
+// temp0..temp7 hold tmp[0]..tmp[15], two 16-bit values packed per register
+// A, B, C, D - offsets in bytes to load from the in buffer
+// TEMP0, TEMP1 - registers for the corresponding tmp elements
+#define HORIZONTAL_PASS_WHT(A, B, C, D, TEMP0, TEMP1) \
+ "lh %[" #TEMP0 "], " #A "(%[in]) \n\t" \
+ "lh %[" #TEMP1 "], " #B "(%[in]) \n\t" \
+ "lh %[temp8], " #C "(%[in]) \n\t" \
+ "lh %[temp9], " #D "(%[in]) \n\t" \
+ "ins %[" #TEMP1 "], %[" #TEMP0 "], 16, 16 \n\t" \
+ "ins %[temp9], %[temp8], 16, 16 \n\t" \
+ "subq.ph %[temp8], %[" #TEMP1 "], %[temp9] \n\t" \
+ "addq.ph %[temp9], %[" #TEMP1 "], %[temp9] \n\t" \
+ "precrq.ph.w %[" #TEMP0 "], %[temp8], %[temp9] \n\t" \
+ "append %[temp8], %[temp9], 16 \n\t" \
+ "subq.ph %[" #TEMP1 "], %[" #TEMP0 "], %[temp8] \n\t" \
+ "addq.ph %[" #TEMP0 "], %[" #TEMP0 "], %[temp8] \n\t" \
+ "rotr %[" #TEMP1 "], %[" #TEMP1 "], 16 \n\t"
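+
+// A scalar sketch (not compiled) of one HORIZONTAL_PASS_WHT, matching the
+// portable FTransformWHT in enc.c; A..D address in[0], in[16], in[32] and
+// in[48] of one input row:
+#if 0
+  {
+    const int a0 = in[0 * 16] + in[2 * 16];
+    const int a1 = in[1 * 16] + in[3 * 16];
+    const int a2 = in[1 * 16] - in[3 * 16];
+    const int a3 = in[0 * 16] - in[2 * 16];
+    tmp[0 + i * 4] = a0 + a1;   // TEMP0/TEMP1 hold these packed in pairs
+    tmp[1 + i * 4] = a3 + a2;
+    tmp[2 + i * 4] = a3 - a2;
+    tmp[3 + i * 4] = a0 - a1;
+  }
+#endif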
+
+// macro for one vertical pass in FTransformWHT
+// temp0..temp7 hold tmp[0]..tmp[15], two 16-bit values packed per register
+// A, B, C, D - offsets in bytes to store to the out buffer
+// TEMP0, TEMP2, TEMP4 and TEMP6 - registers for the corresponding tmp elements
+#define VERTICAL_PASS_WHT(A, B, C, D, TEMP0, TEMP2, TEMP4, TEMP6) \
+ "addq.ph %[temp8], %[" #TEMP0 "], %[" #TEMP4 "] \n\t" \
+ "addq.ph %[temp9], %[" #TEMP2 "], %[" #TEMP6 "] \n\t" \
+ "subq.ph %[" #TEMP2 "], %[" #TEMP2 "], %[" #TEMP6 "] \n\t" \
+ "subq.ph %[" #TEMP6 "], %[" #TEMP0 "], %[" #TEMP4 "] \n\t" \
+ "addqh.ph %[" #TEMP0 "], %[temp8], %[temp9] \n\t" \
+ "subqh.ph %[" #TEMP4 "], %[" #TEMP6 "], %[" #TEMP2 "] \n\t" \
+ "addqh.ph %[" #TEMP2 "], %[" #TEMP2 "], %[" #TEMP6 "] \n\t" \
+ "subqh.ph %[" #TEMP6 "], %[temp8], %[temp9] \n\t" \
+ "usw %[" #TEMP0 "], " #A "(%[out]) \n\t" \
+ "usw %[" #TEMP2 "], " #B "(%[out]) \n\t" \
+ "usw %[" #TEMP4 "], " #C "(%[out]) \n\t" \
+ "usw %[" #TEMP6 "], " #D "(%[out]) \n\t"
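+
+// A scalar sketch (not compiled) of one VERTICAL_PASS_WHT; the halving
+// addqh.ph/subqh.ph instructions provide the final >> 1 for free:
+#if 0
+  {
+    const int a0 = tmp[0 + i] + tmp[8 + i];
+    const int a1 = tmp[4 + i] + tmp[12 + i];
+    const int a2 = tmp[4 + i] - tmp[12 + i];
+    const int a3 = tmp[0 + i] - tmp[8 + i];
+    out[0 + i]  = (a0 + a1) >> 1;
+    out[4 + i]  = (a3 + a2) >> 1;
+    out[8 + i]  = (a3 - a2) >> 1;
+    out[12 + i] = (a0 - a1) >> 1;
+  }
+#endif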
+
+static void FTransformWHT(const int16_t* in, int16_t* out) {
+ int temp0, temp1, temp2, temp3, temp4;
+ int temp5, temp6, temp7, temp8, temp9;
+
+ __asm__ volatile (
+ HORIZONTAL_PASS_WHT( 0, 32, 64, 96, temp0, temp1)
+ HORIZONTAL_PASS_WHT(128, 160, 192, 224, temp2, temp3)
+ HORIZONTAL_PASS_WHT(256, 288, 320, 352, temp4, temp5)
+ HORIZONTAL_PASS_WHT(384, 416, 448, 480, temp6, temp7)
+ VERTICAL_PASS_WHT(0, 8, 16, 24, temp0, temp2, temp4, temp6)
+ VERTICAL_PASS_WHT(4, 12, 20, 28, temp1, temp3, temp5, temp7)
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8),
+ [temp9]"=&r"(temp9)
+ : [in]"r"(in), [out]"r"(out)
+ : "memory"
+ );
+}
+
+#undef VERTICAL_PASS_WHT
+#undef HORIZONTAL_PASS_WHT
+
+// macro for converting coefficients to histogram bins
+// processes 8 coefficients at a time
+// A, B, C, D - offsets in bytes to load from the out buffer
+#define CONVERT_COEFFS_TO_BIN(A, B, C, D) \
+ "ulw %[temp0], " #A "(%[out]) \n\t" \
+ "ulw %[temp1], " #B "(%[out]) \n\t" \
+ "ulw %[temp2], " #C "(%[out]) \n\t" \
+ "ulw %[temp3], " #D "(%[out]) \n\t" \
+ "absq_s.ph %[temp0], %[temp0] \n\t" \
+ "absq_s.ph %[temp1], %[temp1] \n\t" \
+ "absq_s.ph %[temp2], %[temp2] \n\t" \
+ "absq_s.ph %[temp3], %[temp3] \n\t" \
+ /* TODO(skal): add rounding ? shra_r.ph : shra.ph */ \
+ /* for following 4 instructions */ \
+ "shra.ph %[temp0], %[temp0], 3 \n\t" \
+ "shra.ph %[temp1], %[temp1], 3 \n\t" \
+ "shra.ph %[temp2], %[temp2], 3 \n\t" \
+ "shra.ph %[temp3], %[temp3], 3 \n\t" \
+ "shll_s.ph %[temp0], %[temp0], 10 \n\t" \
+ "shll_s.ph %[temp1], %[temp1], 10 \n\t" \
+ "shll_s.ph %[temp2], %[temp2], 10 \n\t" \
+ "shll_s.ph %[temp3], %[temp3], 10 \n\t" \
+ "shrl.ph %[temp0], %[temp0], 10 \n\t" \
+ "shrl.ph %[temp1], %[temp1], 10 \n\t" \
+ "shrl.ph %[temp2], %[temp2], 10 \n\t" \
+ "shrl.ph %[temp3], %[temp3], 10 \n\t" \
+ "shll.ph %[temp0], %[temp0], 2 \n\t" \
+ "shll.ph %[temp1], %[temp1], 2 \n\t" \
+ "shll.ph %[temp2], %[temp2], 2 \n\t" \
+ "shll.ph %[temp3], %[temp3], 2 \n\t" \
+ "ext %[temp4], %[temp0], 0, 16 \n\t" \
+ "ext %[temp0], %[temp0], 16, 16 \n\t" \
+ "addu %[temp4], %[temp4], %[dist] \n\t" \
+ "addu %[temp0], %[temp0], %[dist] \n\t" \
+ "ext %[temp5], %[temp1], 0, 16 \n\t" \
+ "lw %[temp8], 0(%[temp4]) \n\t" \
+ "ext %[temp1], %[temp1], 16, 16 \n\t" \
+ "addu %[temp5], %[temp5], %[dist] \n\t" \
+ "addiu %[temp8], %[temp8], 1 \n\t" \
+ "sw %[temp8], 0(%[temp4]) \n\t" \
+ "lw %[temp8], 0(%[temp0]) \n\t" \
+ "addu %[temp1], %[temp1], %[dist] \n\t" \
+ "ext %[temp6], %[temp2], 0, 16 \n\t" \
+ "addiu %[temp8], %[temp8], 1 \n\t" \
+ "sw %[temp8], 0(%[temp0]) \n\t" \
+ "lw %[temp8], 0(%[temp5]) \n\t" \
+ "ext %[temp2], %[temp2], 16, 16 \n\t" \
+ "addu %[temp6], %[temp6], %[dist] \n\t" \
+ "addiu %[temp8], %[temp8], 1 \n\t" \
+ "sw %[temp8], 0(%[temp5]) \n\t" \
+ "lw %[temp8], 0(%[temp1]) \n\t" \
+ "addu %[temp2], %[temp2], %[dist] \n\t" \
+ "ext %[temp7], %[temp3], 0, 16 \n\t" \
+ "addiu %[temp8], %[temp8], 1 \n\t" \
+ "sw %[temp8], 0(%[temp1]) \n\t" \
+ "lw %[temp8], 0(%[temp6]) \n\t" \
+ "ext %[temp3], %[temp3], 16, 16 \n\t" \
+ "addu %[temp7], %[temp7], %[dist] \n\t" \
+ "addiu %[temp8], %[temp8], 1 \n\t" \
+ "sw %[temp8], 0(%[temp6]) \n\t" \
+ "lw %[temp8], 0(%[temp2]) \n\t" \
+ "addu %[temp3], %[temp3], %[dist] \n\t" \
+ "addiu %[temp8], %[temp8], 1 \n\t" \
+ "sw %[temp8], 0(%[temp2]) \n\t" \
+ "lw %[temp8], 0(%[temp7]) \n\t" \
+ "addiu %[temp8], %[temp8], 1 \n\t" \
+ "sw %[temp8], 0(%[temp7]) \n\t" \
+ "lw %[temp8], 0(%[temp3]) \n\t" \
+ "addiu %[temp8], %[temp8], 1 \n\t" \
+ "sw %[temp8], 0(%[temp3]) \n\t"
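+
+// A scalar sketch (not compiled) of the binning above: the saturating
+// shll_s.ph/shrl.ph pair clips each value to MAX_COEFF_THRESH, and the final
+// shll.ph by 2 turns the bin into a byte offset into 'distribution' (note
+// that the max_coeff operand below is not referenced by the assembly, since
+// saturation replaces the explicit compare):
+#if 0
+  for (k = 0; k < 16; ++k) {
+    const int v = abs(out[k]) >> 3;
+    const int bin = (v > MAX_COEFF_THRESH) ? MAX_COEFF_THRESH : v;
+    ++distribution[bin];
+  }
+#endif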
+
+static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
+ int start_block, int end_block,
+ VP8Histogram* const histo) {
+ int j;
+ int distribution[MAX_COEFF_THRESH + 1] = { 0 };
+ const int max_coeff = (MAX_COEFF_THRESH << 16) + MAX_COEFF_THRESH;
+ for (j = start_block; j < end_block; ++j) {
+ int16_t out[16];
+ int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;
+
+ VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
+
+    // Convert coefficients to histogram bins.
+ __asm__ volatile (
+ CONVERT_COEFFS_TO_BIN( 0, 4, 8, 12)
+ CONVERT_COEFFS_TO_BIN(16, 20, 24, 28)
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [temp8]"=&r"(temp8)
+ : [dist]"r"(distribution), [out]"r"(out), [max_coeff]"r"(max_coeff)
+ : "memory"
+ );
+ }
+ VP8SetHistogramData(distribution, histo);
+}
+
+#undef CONVERT_COEFFS_TO_BIN
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8EncDspInitMIPSdspR2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMIPSdspR2(void) {
+ VP8FTransform = FTransform;
+ VP8ITransform = ITransform;
+ VP8TDisto4x4 = Disto4x4;
+ VP8TDisto16x16 = Disto16x16;
+ VP8EncPredLuma16 = Intra16Preds;
+ VP8EncPredChroma8 = IntraChromaPreds;
+ VP8EncPredLuma4 = Intra4Preds;
+#if !defined(WORK_AROUND_GCC)
+ VP8SSE16x16 = SSE16x16;
+ VP8SSE8x8 = SSE8x8;
+ VP8SSE16x8 = SSE16x8;
+ VP8SSE4x4 = SSE4x4;
+#endif
+ VP8EncQuantizeBlock = QuantizeBlock;
+ VP8EncQuantize2Blocks = Quantize2Blocks;
+ VP8FTransformWHT = FTransformWHT;
+ VP8CollectHistogram = CollectHistogram;
+}
+
+#else // !WEBP_USE_MIPS_DSP_R2
+
+WEBP_DSP_INIT_STUB(VP8EncDspInitMIPSdspR2)
+
+#endif // WEBP_USE_MIPS_DSP_R2
diff --git a/src/3rdparty/libwebp/src/dsp/enc_neon.c b/src/3rdparty/libwebp/src/dsp/enc_neon.c
index 5814fac..c2aef58 100644
--- a/src/3rdparty/libwebp/src/dsp/enc_neon.c
+++ b/src/3rdparty/libwebp/src/dsp/enc_neon.c
@@ -32,9 +32,9 @@ static const int16_t kC2 = 17734; // half of kC2, actually. See comment above.
// This code works but is *slower* than the inlined-asm version below
// (with gcc-4.6). So we disable it for now. Later, it'll be conditional to
-// USE_INTRINSICS define.
+// WEBP_USE_INTRINSICS define.
// With gcc-4.8, it's a little faster than the inlined assembly.
-#if defined(USE_INTRINSICS)
+#if defined(WEBP_USE_INTRINSICS)
// Treats 'v' as an uint8x8_t and zero extends to an int16x8_t.
static WEBP_INLINE int16x8_t ConvertU8ToS16(uint32x2_t v) {
@@ -241,7 +241,7 @@ static void ITransformOne(const uint8_t* ref,
);
}
-#endif // USE_INTRINSICS
+#endif // WEBP_USE_INTRINSICS
static void ITransform(const uint8_t* ref,
const int16_t* in, uint8_t* dst, int do_two) {
@@ -263,7 +263,7 @@ static uint8x16_t Load4x4(const uint8_t* src) {
// Forward transform.
-#if defined(USE_INTRINSICS)
+#if defined(WEBP_USE_INTRINSICS)
static WEBP_INLINE void Transpose4x4_S16(const int16x4_t A, const int16x4_t B,
const int16x4_t C, const int16x4_t D,
@@ -548,323 +548,165 @@ static void FTransformWHT(const int16_t* src, int16_t* out) {
// We try to match the spectral content (weighted) between source and
// reconstructed samples.
-// This code works but is *slower* than the inlined-asm version below
-// (with gcc-4.6). So we disable it for now. Later, it'll be conditional to
-// USE_INTRINSICS define.
-// With gcc-4.8, it's only slightly slower than the inlined.
-#if defined(USE_INTRINSICS)
-
-// Zero extend an uint16x4_t 'v' to an int32x4_t.
-static WEBP_INLINE int32x4_t ConvertU16ToS32(uint16x4_t v) {
- return vreinterpretq_s32_u32(vmovl_u16(v));
-}
-
-// Does a regular 4x4 transpose followed by an adjustment of the upper columns
-// in the inner rows to restore the source order of differences,
-// i.e., a0 - a1 | a3 - a2.
-static WEBP_INLINE int32x4x4_t DistoTranspose4x4(const int32x4x4_t rows) {
- int32x4x4_t out = Transpose4x4(rows);
- // restore source order in the columns containing differences.
- const int32x2_t r1h = vget_high_s32(out.val[1]);
- const int32x2_t r2h = vget_high_s32(out.val[2]);
- out.val[1] = vcombine_s32(vget_low_s32(out.val[1]), r2h);
- out.val[2] = vcombine_s32(vget_low_s32(out.val[2]), r1h);
- return out;
-}
-
-static WEBP_INLINE int32x4x4_t DistoHorizontalPass(const uint8x8_t r0r1,
- const uint8x8_t r2r3) {
- // a0 = in[0] + in[2] | a1 = in[1] + in[3]
- const uint16x8_t a0a1 = vaddl_u8(r0r1, r2r3);
- // a3 = in[0] - in[2] | a2 = in[1] - in[3]
- const uint16x8_t a3a2 = vsubl_u8(r0r1, r2r3);
- const int32x4_t tmp0 = vpaddlq_s16(vreinterpretq_s16_u16(a0a1)); // a0 + a1
- const int32x4_t tmp1 = vpaddlq_s16(vreinterpretq_s16_u16(a3a2)); // a3 + a2
- // no pairwise subtraction; reorder to perform tmp[2]/tmp[3] calculations.
- // a0a0 a3a3 a0a0 a3a3 a0a0 a3a3 a0a0 a3a3
- // a1a1 a2a2 a1a1 a2a2 a1a1 a2a2 a1a1 a2a2
- const int16x8x2_t transpose =
- vtrnq_s16(vreinterpretq_s16_u16(a0a1), vreinterpretq_s16_u16(a3a2));
- // tmp[3] = a0 - a1 | tmp[2] = a3 - a2
- const int32x4_t tmp32_1 = vsubl_s16(vget_low_s16(transpose.val[0]),
- vget_low_s16(transpose.val[1]));
- const int32x4_t tmp32_2 = vsubl_s16(vget_high_s16(transpose.val[0]),
- vget_high_s16(transpose.val[1]));
- // [0]: tmp[3] [1]: tmp[2]
- const int32x4x2_t split = vtrnq_s32(tmp32_1, tmp32_2);
- const int32x4x4_t res = { { tmp0, tmp1, split.val[1], split.val[0] } };
- return res;
-}
-
-static WEBP_INLINE int32x4x4_t DistoVerticalPass(const int32x4x4_t rows) {
- // a0 = tmp[0 + i] + tmp[8 + i];
- const int32x4_t a0 = vaddq_s32(rows.val[0], rows.val[1]);
- // a1 = tmp[4 + i] + tmp[12+ i];
- const int32x4_t a1 = vaddq_s32(rows.val[2], rows.val[3]);
- // a2 = tmp[4 + i] - tmp[12+ i];
- const int32x4_t a2 = vsubq_s32(rows.val[2], rows.val[3]);
- // a3 = tmp[0 + i] - tmp[8 + i];
- const int32x4_t a3 = vsubq_s32(rows.val[0], rows.val[1]);
- const int32x4_t b0 = vqabsq_s32(vaddq_s32(a0, a1)); // abs(a0 + a1)
- const int32x4_t b1 = vqabsq_s32(vaddq_s32(a3, a2)); // abs(a3 + a2)
- const int32x4_t b2 = vabdq_s32(a3, a2); // abs(a3 - a2)
- const int32x4_t b3 = vabdq_s32(a0, a1); // abs(a0 - a1)
- const int32x4x4_t res = { { b0, b1, b2, b3 } };
- return res;
-}
-
-// Calculate the weighted sum of the rows in 'b'.
-static WEBP_INLINE int64x1_t DistoSum(const int32x4x4_t b,
- const int32x4_t w0, const int32x4_t w1,
- const int32x4_t w2, const int32x4_t w3) {
- const int32x4_t s0 = vmulq_s32(w0, b.val[0]);
- const int32x4_t s1 = vmlaq_s32(s0, w1, b.val[1]);
- const int32x4_t s2 = vmlaq_s32(s1, w2, b.val[2]);
- const int32x4_t s3 = vmlaq_s32(s2, w3, b.val[3]);
- const int64x2_t sum1 = vpaddlq_s32(s3);
- const int64x1_t sum2 = vadd_s64(vget_low_s64(sum1), vget_high_s64(sum1));
- return sum2;
+// a 0123, b 0123
+// a 4567, b 4567
+// a 89ab, b 89ab
+// a cdef, b cdef
+//
+// transpose
+//
+// a 048c, b 048c
+// a 159d, b 159d
+// a 26ae, b 26ae
+// a 37bf, b 37bf
+//
+static WEBP_INLINE uint8x8x4_t DistoTranspose4x4U8(uint8x8x4_t d4_in) {
+ const uint8x8x2_t d2_tmp0 = vtrn_u8(d4_in.val[0], d4_in.val[1]);
+ const uint8x8x2_t d2_tmp1 = vtrn_u8(d4_in.val[2], d4_in.val[3]);
+ const uint16x4x2_t d2_tmp2 = vtrn_u16(vreinterpret_u16_u8(d2_tmp0.val[0]),
+ vreinterpret_u16_u8(d2_tmp1.val[0]));
+ const uint16x4x2_t d2_tmp3 = vtrn_u16(vreinterpret_u16_u8(d2_tmp0.val[1]),
+ vreinterpret_u16_u8(d2_tmp1.val[1]));
+
+ d4_in.val[0] = vreinterpret_u8_u16(d2_tmp2.val[0]);
+ d4_in.val[2] = vreinterpret_u8_u16(d2_tmp2.val[1]);
+ d4_in.val[1] = vreinterpret_u8_u16(d2_tmp3.val[0]);
+ d4_in.val[3] = vreinterpret_u8_u16(d2_tmp3.val[1]);
+ return d4_in;
+}
+
+static WEBP_INLINE int16x8x4_t DistoTranspose4x4S16(int16x8x4_t q4_in) {
+ const int16x8x2_t q2_tmp0 = vtrnq_s16(q4_in.val[0], q4_in.val[1]);
+ const int16x8x2_t q2_tmp1 = vtrnq_s16(q4_in.val[2], q4_in.val[3]);
+ const int32x4x2_t q2_tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[0]),
+ vreinterpretq_s32_s16(q2_tmp1.val[0]));
+ const int32x4x2_t q2_tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q2_tmp0.val[1]),
+ vreinterpretq_s32_s16(q2_tmp1.val[1]));
+ q4_in.val[0] = vreinterpretq_s16_s32(q2_tmp2.val[0]);
+ q4_in.val[2] = vreinterpretq_s16_s32(q2_tmp2.val[1]);
+ q4_in.val[1] = vreinterpretq_s16_s32(q2_tmp3.val[0]);
+ q4_in.val[3] = vreinterpretq_s16_s32(q2_tmp3.val[1]);
+ return q4_in;
+}
+
+static WEBP_INLINE int16x8x4_t DistoHorizontalPass(const uint8x8x4_t d4_in) {
+ // {a0, a1} = {in[0] + in[2], in[1] + in[3]}
+ // {a3, a2} = {in[0] - in[2], in[1] - in[3]}
+ const int16x8_t q_a0 = vreinterpretq_s16_u16(vaddl_u8(d4_in.val[0],
+ d4_in.val[2]));
+ const int16x8_t q_a1 = vreinterpretq_s16_u16(vaddl_u8(d4_in.val[1],
+ d4_in.val[3]));
+ const int16x8_t q_a3 = vreinterpretq_s16_u16(vsubl_u8(d4_in.val[0],
+ d4_in.val[2]));
+ const int16x8_t q_a2 = vreinterpretq_s16_u16(vsubl_u8(d4_in.val[1],
+ d4_in.val[3]));
+ int16x8x4_t q4_out;
+ // tmp[0] = a0 + a1
+ // tmp[1] = a3 + a2
+ // tmp[2] = a3 - a2
+ // tmp[3] = a0 - a1
+ INIT_VECTOR4(q4_out,
+ vaddq_s16(q_a0, q_a1), vaddq_s16(q_a3, q_a2),
+ vsubq_s16(q_a3, q_a2), vsubq_s16(q_a0, q_a1));
+ return q4_out;
+}
+
+static WEBP_INLINE int16x8x4_t DistoVerticalPass(int16x8x4_t q4_in) {
+ const int16x8_t q_a0 = vaddq_s16(q4_in.val[0], q4_in.val[2]);
+ const int16x8_t q_a1 = vaddq_s16(q4_in.val[1], q4_in.val[3]);
+ const int16x8_t q_a2 = vsubq_s16(q4_in.val[1], q4_in.val[3]);
+ const int16x8_t q_a3 = vsubq_s16(q4_in.val[0], q4_in.val[2]);
+
+ q4_in.val[0] = vaddq_s16(q_a0, q_a1);
+ q4_in.val[1] = vaddq_s16(q_a3, q_a2);
+ q4_in.val[2] = vabdq_s16(q_a3, q_a2);
+ q4_in.val[3] = vabdq_s16(q_a0, q_a1);
+ q4_in.val[0] = vabsq_s16(q4_in.val[0]);
+ q4_in.val[1] = vabsq_s16(q4_in.val[1]);
+ return q4_in;
+}
+
+static WEBP_INLINE int16x4x4_t DistoLoadW(const uint16_t* w) {
+ const uint16x8_t q_w07 = vld1q_u16(&w[0]);
+ const uint16x8_t q_w8f = vld1q_u16(&w[8]);
+ int16x4x4_t d4_w;
+ INIT_VECTOR4(d4_w,
+ vget_low_s16(vreinterpretq_s16_u16(q_w07)),
+ vget_high_s16(vreinterpretq_s16_u16(q_w07)),
+ vget_low_s16(vreinterpretq_s16_u16(q_w8f)),
+ vget_high_s16(vreinterpretq_s16_u16(q_w8f)));
+ return d4_w;
+}
+
+static WEBP_INLINE int32x2_t DistoSum(const int16x8x4_t q4_in,
+ const int16x4x4_t d4_w) {
+ int32x2_t d_sum;
+ // sum += w[ 0] * abs(b0);
+ // sum += w[ 4] * abs(b1);
+ // sum += w[ 8] * abs(b2);
+ // sum += w[12] * abs(b3);
+ int32x4_t q_sum0 = vmull_s16(d4_w.val[0], vget_low_s16(q4_in.val[0]));
+ int32x4_t q_sum1 = vmull_s16(d4_w.val[1], vget_low_s16(q4_in.val[1]));
+ int32x4_t q_sum2 = vmull_s16(d4_w.val[2], vget_low_s16(q4_in.val[2]));
+ int32x4_t q_sum3 = vmull_s16(d4_w.val[3], vget_low_s16(q4_in.val[3]));
+ q_sum0 = vmlsl_s16(q_sum0, d4_w.val[0], vget_high_s16(q4_in.val[0]));
+ q_sum1 = vmlsl_s16(q_sum1, d4_w.val[1], vget_high_s16(q4_in.val[1]));
+ q_sum2 = vmlsl_s16(q_sum2, d4_w.val[2], vget_high_s16(q4_in.val[2]));
+ q_sum3 = vmlsl_s16(q_sum3, d4_w.val[3], vget_high_s16(q4_in.val[3]));
+
+ q_sum0 = vaddq_s32(q_sum0, q_sum1);
+ q_sum2 = vaddq_s32(q_sum2, q_sum3);
+ q_sum2 = vaddq_s32(q_sum0, q_sum2);
+ d_sum = vpadd_s32(vget_low_s32(q_sum2), vget_high_s32(q_sum2));
+ d_sum = vpadd_s32(d_sum, d_sum);
+ return d_sum;
}
#define LOAD_LANE_32b(src, VALUE, LANE) \
- (VALUE) = vld1q_lane_u32((const uint32_t*)(src), (VALUE), (LANE))
+ (VALUE) = vld1_lane_u32((const uint32_t*)(src), (VALUE), (LANE))
// Hadamard transform
// Returns the weighted sum of the absolute value of transformed coefficients.
static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
- uint32x4_t d0d1 = { 0, 0, 0, 0 };
- uint32x4_t d2d3 = { 0, 0, 0, 0 };
- LOAD_LANE_32b(a + 0 * BPS, d0d1, 0); // a00 a01 a02 a03
- LOAD_LANE_32b(a + 1 * BPS, d0d1, 1); // a10 a11 a12 a13
- LOAD_LANE_32b(b + 0 * BPS, d0d1, 2); // b00 b01 b02 b03
- LOAD_LANE_32b(b + 1 * BPS, d0d1, 3); // b10 b11 b12 b13
- LOAD_LANE_32b(a + 2 * BPS, d2d3, 0); // a20 a21 a22 a23
- LOAD_LANE_32b(a + 3 * BPS, d2d3, 1); // a30 a31 a32 a33
- LOAD_LANE_32b(b + 2 * BPS, d2d3, 2); // b20 b21 b22 b23
- LOAD_LANE_32b(b + 3 * BPS, d2d3, 3); // b30 b31 b32 b33
+ uint32x2_t d_in_ab_0123 = vdup_n_u32(0);
+ uint32x2_t d_in_ab_4567 = vdup_n_u32(0);
+ uint32x2_t d_in_ab_89ab = vdup_n_u32(0);
+ uint32x2_t d_in_ab_cdef = vdup_n_u32(0);
+ uint8x8x4_t d4_in;
+
+ // load data a, b
+ LOAD_LANE_32b(a + 0 * BPS, d_in_ab_0123, 0);
+ LOAD_LANE_32b(a + 1 * BPS, d_in_ab_4567, 0);
+ LOAD_LANE_32b(a + 2 * BPS, d_in_ab_89ab, 0);
+ LOAD_LANE_32b(a + 3 * BPS, d_in_ab_cdef, 0);
+ LOAD_LANE_32b(b + 0 * BPS, d_in_ab_0123, 1);
+ LOAD_LANE_32b(b + 1 * BPS, d_in_ab_4567, 1);
+ LOAD_LANE_32b(b + 2 * BPS, d_in_ab_89ab, 1);
+ LOAD_LANE_32b(b + 3 * BPS, d_in_ab_cdef, 1);
+ INIT_VECTOR4(d4_in,
+ vreinterpret_u8_u32(d_in_ab_0123),
+ vreinterpret_u8_u32(d_in_ab_4567),
+ vreinterpret_u8_u32(d_in_ab_89ab),
+ vreinterpret_u8_u32(d_in_ab_cdef));
{
- // a00 a01 a20 a21 a10 a11 a30 a31 b00 b01 b20 b21 b10 b11 b30 b31
- // a02 a03 a22 a23 a12 a13 a32 a33 b02 b03 b22 b23 b12 b13 b32 b33
- const uint16x8x2_t tmp =
- vtrnq_u16(vreinterpretq_u16_u32(d0d1), vreinterpretq_u16_u32(d2d3));
- const uint8x16_t d0d1u8 = vreinterpretq_u8_u16(tmp.val[0]);
- const uint8x16_t d2d3u8 = vreinterpretq_u8_u16(tmp.val[1]);
- const int32x4x4_t hpass_a = DistoHorizontalPass(vget_low_u8(d0d1u8),
- vget_low_u8(d2d3u8));
- const int32x4x4_t hpass_b = DistoHorizontalPass(vget_high_u8(d0d1u8),
- vget_high_u8(d2d3u8));
- const int32x4x4_t tmp_a = DistoTranspose4x4(hpass_a);
- const int32x4x4_t tmp_b = DistoTranspose4x4(hpass_b);
- const int32x4x4_t vpass_a = DistoVerticalPass(tmp_a);
- const int32x4x4_t vpass_b = DistoVerticalPass(tmp_b);
- const int32x4_t w0 = ConvertU16ToS32(vld1_u16(w + 0));
- const int32x4_t w1 = ConvertU16ToS32(vld1_u16(w + 4));
- const int32x4_t w2 = ConvertU16ToS32(vld1_u16(w + 8));
- const int32x4_t w3 = ConvertU16ToS32(vld1_u16(w + 12));
- const int64x1_t sum1 = DistoSum(vpass_a, w0, w1, w2, w3);
- const int64x1_t sum2 = DistoSum(vpass_b, w0, w1, w2, w3);
- const int32x2_t diff = vabd_s32(vreinterpret_s32_s64(sum1),
- vreinterpret_s32_s64(sum2));
- const int32x2_t res = vshr_n_s32(diff, 5);
- return vget_lane_s32(res, 0);
- }
-}
-
-#undef LOAD_LANE_32b
-
-#else
+ // horizontal pass
+ const uint8x8x4_t d4_t = DistoTranspose4x4U8(d4_in);
+ const int16x8x4_t q4_h = DistoHorizontalPass(d4_t);
+ const int16x4x4_t d4_w = DistoLoadW(w);
+ // vertical pass
+ const int16x8x4_t q4_t = DistoTranspose4x4S16(q4_h);
+ const int16x8x4_t q4_v = DistoVerticalPass(q4_t);
+ int32x2_t d_sum = DistoSum(q4_v, d4_w);
-// Hadamard transform
-// Returns the weighted sum of the absolute value of transformed coefficients.
-static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
- const uint16_t* const w) {
- const int kBPS = BPS;
- const uint8_t* A = a;
- const uint8_t* B = b;
- const uint16_t* W = w;
- int sum;
- __asm__ volatile (
- "vld1.32 d0[0], [%[a]], %[kBPS] \n"
- "vld1.32 d0[1], [%[a]], %[kBPS] \n"
- "vld1.32 d2[0], [%[a]], %[kBPS] \n"
- "vld1.32 d2[1], [%[a]] \n"
-
- "vld1.32 d1[0], [%[b]], %[kBPS] \n"
- "vld1.32 d1[1], [%[b]], %[kBPS] \n"
- "vld1.32 d3[0], [%[b]], %[kBPS] \n"
- "vld1.32 d3[1], [%[b]] \n"
-
- // a d0/d2, b d1/d3
- // d0/d1: 01 01 01 01
- // d2/d3: 23 23 23 23
- // But: it goes 01 45 23 67
- // Notice the middle values are transposed
- "vtrn.16 q0, q1 \n"
-
- // {a0, a1} = {in[0] + in[2], in[1] + in[3]}
- "vaddl.u8 q2, d0, d2 \n"
- "vaddl.u8 q10, d1, d3 \n"
- // {a3, a2} = {in[0] - in[2], in[1] - in[3]}
- "vsubl.u8 q3, d0, d2 \n"
- "vsubl.u8 q11, d1, d3 \n"
-
- // tmp[0] = a0 + a1
- "vpaddl.s16 q0, q2 \n"
- "vpaddl.s16 q8, q10 \n"
-
- // tmp[1] = a3 + a2
- "vpaddl.s16 q1, q3 \n"
- "vpaddl.s16 q9, q11 \n"
-
- // No pair subtract
- // q2 = {a0, a3}
- // q3 = {a1, a2}
- "vtrn.16 q2, q3 \n"
- "vtrn.16 q10, q11 \n"
-
- // {tmp[3], tmp[2]} = {a0 - a1, a3 - a2}
- "vsubl.s16 q12, d4, d6 \n"
- "vsubl.s16 q13, d5, d7 \n"
- "vsubl.s16 q14, d20, d22 \n"
- "vsubl.s16 q15, d21, d23 \n"
-
- // separate tmp[3] and tmp[2]
- // q12 = tmp[3]
- // q13 = tmp[2]
- "vtrn.32 q12, q13 \n"
- "vtrn.32 q14, q15 \n"
-
- // Transpose tmp for a
- "vswp d1, d26 \n" // vtrn.64
- "vswp d3, d24 \n" // vtrn.64
- "vtrn.32 q0, q1 \n"
- "vtrn.32 q13, q12 \n"
-
- // Transpose tmp for b
- "vswp d17, d30 \n" // vtrn.64
- "vswp d19, d28 \n" // vtrn.64
- "vtrn.32 q8, q9 \n"
- "vtrn.32 q15, q14 \n"
-
- // The first Q register is a, the second b.
- // q0/8 tmp[0-3]
- // q13/15 tmp[4-7]
- // q1/9 tmp[8-11]
- // q12/14 tmp[12-15]
-
- // These are still in 01 45 23 67 order. We fix it easily in the addition
- // case but the subtraction propagates them.
- "vswp d3, d27 \n"
- "vswp d19, d31 \n"
-
- // a0 = tmp[0] + tmp[8]
- "vadd.s32 q2, q0, q1 \n"
- "vadd.s32 q3, q8, q9 \n"
-
- // a1 = tmp[4] + tmp[12]
- "vadd.s32 q10, q13, q12 \n"
- "vadd.s32 q11, q15, q14 \n"
-
- // a2 = tmp[4] - tmp[12]
- "vsub.s32 q13, q13, q12 \n"
- "vsub.s32 q15, q15, q14 \n"
-
- // a3 = tmp[0] - tmp[8]
- "vsub.s32 q0, q0, q1 \n"
- "vsub.s32 q8, q8, q9 \n"
-
- // b0 = a0 + a1
- "vadd.s32 q1, q2, q10 \n"
- "vadd.s32 q9, q3, q11 \n"
-
- // b1 = a3 + a2
- "vadd.s32 q12, q0, q13 \n"
- "vadd.s32 q14, q8, q15 \n"
-
- // b2 = a3 - a2
- "vsub.s32 q0, q0, q13 \n"
- "vsub.s32 q8, q8, q15 \n"
-
- // b3 = a0 - a1
- "vsub.s32 q2, q2, q10 \n"
- "vsub.s32 q3, q3, q11 \n"
-
- "vld1.64 {q10, q11}, [%[w]] \n"
-
- // abs(b0)
- "vabs.s32 q1, q1 \n"
- "vabs.s32 q9, q9 \n"
- // abs(b1)
- "vabs.s32 q12, q12 \n"
- "vabs.s32 q14, q14 \n"
- // abs(b2)
- "vabs.s32 q0, q0 \n"
- "vabs.s32 q8, q8 \n"
- // abs(b3)
- "vabs.s32 q2, q2 \n"
- "vabs.s32 q3, q3 \n"
-
- // expand w before using.
- "vmovl.u16 q13, d20 \n"
- "vmovl.u16 q15, d21 \n"
-
- // w[0] * abs(b0)
- "vmul.u32 q1, q1, q13 \n"
- "vmul.u32 q9, q9, q13 \n"
-
- // w[4] * abs(b1)
- "vmla.u32 q1, q12, q15 \n"
- "vmla.u32 q9, q14, q15 \n"
-
- // expand w before using.
- "vmovl.u16 q13, d22 \n"
- "vmovl.u16 q15, d23 \n"
-
- // w[8] * abs(b2)
- "vmla.u32 q1, q0, q13 \n"
- "vmla.u32 q9, q8, q13 \n"
-
- // w[12] * abs(b3)
- "vmla.u32 q1, q2, q15 \n"
- "vmla.u32 q9, q3, q15 \n"
-
- // Sum the arrays
- "vpaddl.u32 q1, q1 \n"
- "vpaddl.u32 q9, q9 \n"
- "vadd.u64 d2, d3 \n"
- "vadd.u64 d18, d19 \n"
-
- // Hadamard transform needs 4 bits of extra precision (2 bits in each
- // direction) for dynamic range. Weights w[] are 16bits at max, so the maximum
- // precision for coeff is 8bit of input + 4bits of Hadamard transform +
- // 16bits for w[] + 2 bits of abs() summation.
- //
- // This uses a maximum of 31 bits (signed). Discarding the top 32 bits is
- // A-OK.
-
- // sum2 - sum1
- "vsub.u32 d0, d2, d18 \n"
- // abs(sum2 - sum1)
- "vabs.s32 d0, d0 \n"
// abs(sum2 - sum1) >> 5
- "vshr.u32 d0, #5 \n"
-
- // It would be better to move the value straight into r0 but I'm not
- // entirely sure how this works with inline assembly.
- "vmov.32 %[sum], d0[0] \n"
-
- : [sum] "=r"(sum), [a] "+r"(A), [b] "+r"(B), [w] "+r"(W)
- : [kBPS] "r"(kBPS)
- : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
- "q10", "q11", "q12", "q13", "q14", "q15" // clobbered
- ) ;
-
- return sum;
+ d_sum = vabs_s32(d_sum);
+ d_sum = vshr_n_s32(d_sum, 5);
+ return vget_lane_s32(d_sum, 0);
+ }
}
-
-#endif // USE_INTRINSICS
+#undef LOAD_LANE_32b
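For reference, a plain-C sketch of what the rewritten Disto4x4 path computes (the Scalar* names are illustrative only, not part of the patch). Note that the NEON version keeps block 'a' in the low half and block 'b' in the high half of every vector, which is why DistoSum's vmull/vmlsl pairs produce sum(a) - sum(b) in a single pass:

static int ScalarTTransform(const uint8_t* in, const uint16_t* w) {
  int sum = 0;
  int tmp[16];
  int i;
  // horizontal pass
  for (i = 0; i < 4; ++i, in += BPS) {
    const int a0 = in[0] + in[2];
    const int a1 = in[1] + in[3];
    const int a2 = in[1] - in[3];
    const int a3 = in[0] - in[2];
    tmp[0 + i * 4] = a0 + a1;
    tmp[1 + i * 4] = a3 + a2;
    tmp[2 + i * 4] = a3 - a2;
    tmp[3 + i * 4] = a0 - a1;
  }
  // vertical pass, weighted by w[]
  for (i = 0; i < 4; ++i, ++w) {
    const int a0 = tmp[0 + i] + tmp[8 + i];
    const int a1 = tmp[4 + i] + tmp[12 + i];
    const int a2 = tmp[4 + i] - tmp[12 + i];
    const int a3 = tmp[0 + i] - tmp[8 + i];
    sum += w[0] * abs(a0 + a1) + w[4] * abs(a3 + a2)    // abs() from <stdlib.h>
         + w[8] * abs(a3 - a2) + w[12] * abs(a0 - a1);
  }
  return sum;
}

static int ScalarDisto4x4(const uint8_t* const a, const uint8_t* const b,
                          const uint16_t* const w) {
  return abs(ScalarTTransform(b, w) - ScalarTTransform(a, w)) >> 5;
}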
static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
const uint16_t* const w) {
@@ -885,6 +727,7 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
VP8Histogram* const histo) {
const uint16x8_t max_coeff_thresh = vdupq_n_u16(MAX_COEFF_THRESH);
int j;
+ int distribution[MAX_COEFF_THRESH + 1] = { 0 };
for (j = start_block; j < end_block; ++j) {
int16_t out[16];
FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
@@ -902,10 +745,11 @@ static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
vst1q_s16(out + 8, vreinterpretq_s16_u16(b3));
// Convert coefficients to bin.
for (k = 0; k < 16; ++k) {
- histo->distribution[out[k]]++;
+ ++distribution[out[k]];
}
}
}
+ VP8SetHistogramData(distribution, histo);
}
//------------------------------------------------------------------------------
@@ -1049,17 +893,22 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
return 0;
}
-#endif // !WORK_AROUND_GCC
+static int Quantize2Blocks(int16_t in[32], int16_t out[32],
+ const VP8Matrix* const mtx) {
+ int nz;
+ nz = QuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
+ nz |= QuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
+ return nz;
+}
-#endif // WEBP_USE_NEON
+#endif // !WORK_AROUND_GCC
//------------------------------------------------------------------------------
// Entry point
extern void VP8EncDspInitNEON(void);
-void VP8EncDspInitNEON(void) {
-#if defined(WEBP_USE_NEON)
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) {
VP8ITransform = ITransform;
VP8FTransform = FTransform;
@@ -1074,6 +923,12 @@ void VP8EncDspInitNEON(void) {
VP8SSE4x4 = SSE4x4;
#if !defined(WORK_AROUND_GCC)
VP8EncQuantizeBlock = QuantizeBlock;
+ VP8EncQuantize2Blocks = Quantize2Blocks;
#endif
-#endif // WEBP_USE_NEON
}
+
+#else // !WEBP_USE_NEON
+
+WEBP_DSP_INIT_STUB(VP8EncDspInitNEON)
+
+#endif // WEBP_USE_NEON
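The WEBP_DSP_INIT_STUB pattern is new in 0.5.0; in dsp.h it expands (roughly) to an empty, TSan-exempt definition so callers can invoke the per-architecture init hook unconditionally:

// sketch of the macro's expansion, assuming the 0.5.0 dsp.h definition
#define WEBP_DSP_INIT_STUB(func) \
  extern void func(void); \
  WEBP_TSAN_IGNORE_FUNCTION void func(void) {}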
diff --git a/src/3rdparty/libwebp/src/dsp/enc_sse2.c b/src/3rdparty/libwebp/src/dsp/enc_sse2.c
index 9958d9f..2333d2b 100644
--- a/src/3rdparty/libwebp/src/dsp/enc_sse2.c
+++ b/src/3rdparty/libwebp/src/dsp/enc_sse2.c
@@ -19,7 +19,6 @@
#include "../enc/cost.h"
#include "../enc/vp8enci.h"
-#include "../utils/utils.h"
//------------------------------------------------------------------------------
// Quite useful macro for debugging. Left here for convenience.
@@ -36,67 +35,21 @@ static void PrintReg(const __m128i r, const char* const name, int size) {
uint64_t i64[2];
} tmp;
tmp.r = r;
- printf("%s\t: ", name);
+ fprintf(stderr, "%s\t: ", name);
if (size == 8) {
- for (n = 0; n < 16; ++n) printf("%.2x ", tmp.i8[n]);
+ for (n = 0; n < 16; ++n) fprintf(stderr, "%.2x ", tmp.i8[n]);
} else if (size == 16) {
- for (n = 0; n < 8; ++n) printf("%.4x ", tmp.i16[n]);
+ for (n = 0; n < 8; ++n) fprintf(stderr, "%.4x ", tmp.i16[n]);
} else if (size == 32) {
- for (n = 0; n < 4; ++n) printf("%.8x ", tmp.i32[n]);
+ for (n = 0; n < 4; ++n) fprintf(stderr, "%.8x ", tmp.i32[n]);
} else {
- for (n = 0; n < 2; ++n) printf("%.16lx ", tmp.i64[n]);
+ for (n = 0; n < 2; ++n) fprintf(stderr, "%.16lx ", tmp.i64[n]);
}
- printf("\n");
+ fprintf(stderr, "\n");
}
#endif
//------------------------------------------------------------------------------
-// Compute susceptibility based on DCT-coeff histograms:
-// the higher, the "easier" the macroblock is to compress.
-
-static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
- int start_block, int end_block,
- VP8Histogram* const histo) {
- const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
- int j;
- for (j = start_block; j < end_block; ++j) {
- int16_t out[16];
- int k;
-
- VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
-
- // Convert coefficients to bin (within out[]).
- {
- // Load.
- const __m128i out0 = _mm_loadu_si128((__m128i*)&out[0]);
- const __m128i out1 = _mm_loadu_si128((__m128i*)&out[8]);
- // sign(out) = out >> 15 (0x0000 if positive, 0xffff if negative)
- const __m128i sign0 = _mm_srai_epi16(out0, 15);
- const __m128i sign1 = _mm_srai_epi16(out1, 15);
- // abs(out) = (out ^ sign) - sign
- const __m128i xor0 = _mm_xor_si128(out0, sign0);
- const __m128i xor1 = _mm_xor_si128(out1, sign1);
- const __m128i abs0 = _mm_sub_epi16(xor0, sign0);
- const __m128i abs1 = _mm_sub_epi16(xor1, sign1);
- // v = abs(out) >> 3
- const __m128i v0 = _mm_srai_epi16(abs0, 3);
- const __m128i v1 = _mm_srai_epi16(abs1, 3);
- // bin = min(v, MAX_COEFF_THRESH)
- const __m128i bin0 = _mm_min_epi16(v0, max_coeff_thresh);
- const __m128i bin1 = _mm_min_epi16(v1, max_coeff_thresh);
- // Store.
- _mm_storeu_si128((__m128i*)&out[0], bin0);
- _mm_storeu_si128((__m128i*)&out[8], bin1);
- }
-
- // Convert coefficients to bin.
- for (k = 0; k < 16; ++k) {
- histo->distribution[out[k]]++;
- }
- }
-}
-
-//------------------------------------------------------------------------------
// Transforms (Paragraph 14.4)
// Does one or two inverse transforms.
@@ -128,19 +81,19 @@ static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
// use nor store.
__m128i in0, in1, in2, in3;
{
- in0 = _mm_loadl_epi64((__m128i*)&in[0]);
- in1 = _mm_loadl_epi64((__m128i*)&in[4]);
- in2 = _mm_loadl_epi64((__m128i*)&in[8]);
- in3 = _mm_loadl_epi64((__m128i*)&in[12]);
+ in0 = _mm_loadl_epi64((const __m128i*)&in[0]);
+ in1 = _mm_loadl_epi64((const __m128i*)&in[4]);
+ in2 = _mm_loadl_epi64((const __m128i*)&in[8]);
+ in3 = _mm_loadl_epi64((const __m128i*)&in[12]);
// a00 a10 a20 a30 x x x x
// a01 a11 a21 a31 x x x x
// a02 a12 a22 a32 x x x x
// a03 a13 a23 a33 x x x x
if (do_two) {
- const __m128i inB0 = _mm_loadl_epi64((__m128i*)&in[16]);
- const __m128i inB1 = _mm_loadl_epi64((__m128i*)&in[20]);
- const __m128i inB2 = _mm_loadl_epi64((__m128i*)&in[24]);
- const __m128i inB3 = _mm_loadl_epi64((__m128i*)&in[28]);
+ const __m128i inB0 = _mm_loadl_epi64((const __m128i*)&in[16]);
+ const __m128i inB1 = _mm_loadl_epi64((const __m128i*)&in[20]);
+ const __m128i inB2 = _mm_loadl_epi64((const __m128i*)&in[24]);
+ const __m128i inB3 = _mm_loadl_epi64((const __m128i*)&in[28]);
in0 = _mm_unpacklo_epi64(in0, inB0);
in1 = _mm_unpacklo_epi64(in1, inB1);
in2 = _mm_unpacklo_epi64(in2, inB2);
@@ -277,16 +230,16 @@ static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
__m128i ref0, ref1, ref2, ref3;
if (do_two) {
// Load eight bytes/pixels per line.
- ref0 = _mm_loadl_epi64((__m128i*)&ref[0 * BPS]);
- ref1 = _mm_loadl_epi64((__m128i*)&ref[1 * BPS]);
- ref2 = _mm_loadl_epi64((__m128i*)&ref[2 * BPS]);
- ref3 = _mm_loadl_epi64((__m128i*)&ref[3 * BPS]);
+ ref0 = _mm_loadl_epi64((const __m128i*)&ref[0 * BPS]);
+ ref1 = _mm_loadl_epi64((const __m128i*)&ref[1 * BPS]);
+ ref2 = _mm_loadl_epi64((const __m128i*)&ref[2 * BPS]);
+ ref3 = _mm_loadl_epi64((const __m128i*)&ref[3 * BPS]);
} else {
// Load four bytes/pixels per line.
- ref0 = _mm_cvtsi32_si128(*(int*)&ref[0 * BPS]);
- ref1 = _mm_cvtsi32_si128(*(int*)&ref[1 * BPS]);
- ref2 = _mm_cvtsi32_si128(*(int*)&ref[2 * BPS]);
- ref3 = _mm_cvtsi32_si128(*(int*)&ref[3 * BPS]);
+ ref0 = _mm_cvtsi32_si128(WebPMemToUint32(&ref[0 * BPS]));
+ ref1 = _mm_cvtsi32_si128(WebPMemToUint32(&ref[1 * BPS]));
+ ref2 = _mm_cvtsi32_si128(WebPMemToUint32(&ref[2 * BPS]));
+ ref3 = _mm_cvtsi32_si128(WebPMemToUint32(&ref[3 * BPS]));
}
// Convert to 16b.
ref0 = _mm_unpacklo_epi8(ref0, zero);
@@ -312,168 +265,233 @@ static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
_mm_storel_epi64((__m128i*)&dst[3 * BPS], ref3);
} else {
// Store four bytes/pixels per line.
- *((int32_t *)&dst[0 * BPS]) = _mm_cvtsi128_si32(ref0);
- *((int32_t *)&dst[1 * BPS]) = _mm_cvtsi128_si32(ref1);
- *((int32_t *)&dst[2 * BPS]) = _mm_cvtsi128_si32(ref2);
- *((int32_t *)&dst[3 * BPS]) = _mm_cvtsi128_si32(ref3);
+ WebPUint32ToMem(&dst[0 * BPS], _mm_cvtsi128_si32(ref0));
+ WebPUint32ToMem(&dst[1 * BPS], _mm_cvtsi128_si32(ref1));
+ WebPUint32ToMem(&dst[2 * BPS], _mm_cvtsi128_si32(ref2));
+ WebPUint32ToMem(&dst[3 * BPS], _mm_cvtsi128_si32(ref3));
}
}
}
-static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
- const __m128i zero = _mm_setzero_si128();
- const __m128i seven = _mm_set1_epi16(7);
+static void FTransformPass1(const __m128i* const in01,
+ const __m128i* const in23,
+ __m128i* const out01,
+ __m128i* const out32) {
const __m128i k937 = _mm_set1_epi32(937);
const __m128i k1812 = _mm_set1_epi32(1812);
- const __m128i k51000 = _mm_set1_epi32(51000);
- const __m128i k12000_plus_one = _mm_set1_epi32(12000 + (1 << 16));
- const __m128i k5352_2217 = _mm_set_epi16(5352, 2217, 5352, 2217,
- 5352, 2217, 5352, 2217);
- const __m128i k2217_5352 = _mm_set_epi16(2217, -5352, 2217, -5352,
- 2217, -5352, 2217, -5352);
+
const __m128i k88p = _mm_set_epi16(8, 8, 8, 8, 8, 8, 8, 8);
const __m128i k88m = _mm_set_epi16(-8, 8, -8, 8, -8, 8, -8, 8);
const __m128i k5352_2217p = _mm_set_epi16(2217, 5352, 2217, 5352,
2217, 5352, 2217, 5352);
const __m128i k5352_2217m = _mm_set_epi16(-5352, 2217, -5352, 2217,
-5352, 2217, -5352, 2217);
+
+ // *in01 = 00 01 10 11 02 03 12 13
+ // *in23 = 20 21 30 31 22 23 32 33
+ const __m128i shuf01_p = _mm_shufflehi_epi16(*in01, _MM_SHUFFLE(2, 3, 0, 1));
+ const __m128i shuf23_p = _mm_shufflehi_epi16(*in23, _MM_SHUFFLE(2, 3, 0, 1));
+ // 00 01 10 11 03 02 13 12
+ // 20 21 30 31 23 22 33 32
+ const __m128i s01 = _mm_unpacklo_epi64(shuf01_p, shuf23_p);
+ const __m128i s32 = _mm_unpackhi_epi64(shuf01_p, shuf23_p);
+ // 00 01 10 11 20 21 30 31
+ // 03 02 13 12 23 22 33 32
+ const __m128i a01 = _mm_add_epi16(s01, s32);
+ const __m128i a32 = _mm_sub_epi16(s01, s32);
+ // [d0 + d3 | d1 + d2 | ...] = [a0 a1 | a0' a1' | ... ]
+ // [d0 - d3 | d1 - d2 | ...] = [a3 a2 | a3' a2' | ... ]
+
+ const __m128i tmp0 = _mm_madd_epi16(a01, k88p); // [ (a0 + a1) << 3, ... ]
+ const __m128i tmp2 = _mm_madd_epi16(a01, k88m); // [ (a0 - a1) << 3, ... ]
+ const __m128i tmp1_1 = _mm_madd_epi16(a32, k5352_2217p);
+ const __m128i tmp3_1 = _mm_madd_epi16(a32, k5352_2217m);
+ const __m128i tmp1_2 = _mm_add_epi32(tmp1_1, k1812);
+ const __m128i tmp3_2 = _mm_add_epi32(tmp3_1, k937);
+ const __m128i tmp1 = _mm_srai_epi32(tmp1_2, 9);
+ const __m128i tmp3 = _mm_srai_epi32(tmp3_2, 9);
+ const __m128i s03 = _mm_packs_epi32(tmp0, tmp2);
+ const __m128i s12 = _mm_packs_epi32(tmp1, tmp3);
+ const __m128i s_lo = _mm_unpacklo_epi16(s03, s12); // 0 1 0 1 0 1...
+ const __m128i s_hi = _mm_unpackhi_epi16(s03, s12); // 2 3 2 3 2 3
+ const __m128i v23 = _mm_unpackhi_epi32(s_lo, s_hi);
+ *out01 = _mm_unpacklo_epi32(s_lo, s_hi);
+ *out32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2)); // 3 2 3 2 3 2..
+}
+
+static void FTransformPass2(const __m128i* const v01, const __m128i* const v32,
+ int16_t* out) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i seven = _mm_set1_epi16(7);
+ const __m128i k5352_2217 = _mm_set_epi16(5352, 2217, 5352, 2217,
+ 5352, 2217, 5352, 2217);
+ const __m128i k2217_5352 = _mm_set_epi16(2217, -5352, 2217, -5352,
+ 2217, -5352, 2217, -5352);
+ const __m128i k12000_plus_one = _mm_set1_epi32(12000 + (1 << 16));
+ const __m128i k51000 = _mm_set1_epi32(51000);
+
+ // Same operations are done on the (0,3) and (1,2) pairs.
+ // a0 = v0 + v3
+ // a1 = v1 + v2
+ // a3 = v0 - v3
+ // a2 = v1 - v2
+ const __m128i a01 = _mm_add_epi16(*v01, *v32);
+ const __m128i a32 = _mm_sub_epi16(*v01, *v32);
+ const __m128i a11 = _mm_unpackhi_epi64(a01, a01);
+ const __m128i a22 = _mm_unpackhi_epi64(a32, a32);
+ const __m128i a01_plus_7 = _mm_add_epi16(a01, seven);
+
+ // d0 = (a0 + a1 + 7) >> 4;
+ // d2 = (a0 - a1 + 7) >> 4;
+ const __m128i c0 = _mm_add_epi16(a01_plus_7, a11);
+ const __m128i c2 = _mm_sub_epi16(a01_plus_7, a11);
+ const __m128i d0 = _mm_srai_epi16(c0, 4);
+ const __m128i d2 = _mm_srai_epi16(c2, 4);
+
+ // f1 = ((b3 * 5352 + b2 * 2217 + 12000) >> 16)
+ // f3 = ((b3 * 2217 - b2 * 5352 + 51000) >> 16)
+ const __m128i b23 = _mm_unpacklo_epi16(a22, a32);
+ const __m128i c1 = _mm_madd_epi16(b23, k5352_2217);
+ const __m128i c3 = _mm_madd_epi16(b23, k2217_5352);
+ const __m128i d1 = _mm_add_epi32(c1, k12000_plus_one);
+ const __m128i d3 = _mm_add_epi32(c3, k51000);
+ const __m128i e1 = _mm_srai_epi32(d1, 16);
+ const __m128i e3 = _mm_srai_epi32(d3, 16);
+ const __m128i f1 = _mm_packs_epi32(e1, e1);
+ const __m128i f3 = _mm_packs_epi32(e3, e3);
+ // f1 = f1 + (a3 != 0);
+ // The compare will return (0xffff, 0) for (==0, !=0). To turn that into the
+ // desired (0, 1), we add one earlier through k12000_plus_one.
+ // -> f1 = f1 + 1 - (a3 == 0)
+ const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero));
+
+ const __m128i d0_g1 = _mm_unpacklo_epi64(d0, g1);
+ const __m128i d2_f3 = _mm_unpacklo_epi64(d2, f3);
+ _mm_storeu_si128((__m128i*)&out[0], d0_g1);
+ _mm_storeu_si128((__m128i*)&out[8], d2_f3);
+}
+
+static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
+ const __m128i zero = _mm_setzero_si128();
+
+ // Load src and convert to 16b.
+ const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]);
+ const __m128i src1 = _mm_loadl_epi64((const __m128i*)&src[1 * BPS]);
+ const __m128i src2 = _mm_loadl_epi64((const __m128i*)&src[2 * BPS]);
+ const __m128i src3 = _mm_loadl_epi64((const __m128i*)&src[3 * BPS]);
+ const __m128i src_0 = _mm_unpacklo_epi8(src0, zero);
+ const __m128i src_1 = _mm_unpacklo_epi8(src1, zero);
+ const __m128i src_2 = _mm_unpacklo_epi8(src2, zero);
+ const __m128i src_3 = _mm_unpacklo_epi8(src3, zero);
+ // Load ref and convert to 16b.
+ const __m128i ref0 = _mm_loadl_epi64((const __m128i*)&ref[0 * BPS]);
+ const __m128i ref1 = _mm_loadl_epi64((const __m128i*)&ref[1 * BPS]);
+ const __m128i ref2 = _mm_loadl_epi64((const __m128i*)&ref[2 * BPS]);
+ const __m128i ref3 = _mm_loadl_epi64((const __m128i*)&ref[3 * BPS]);
+ const __m128i ref_0 = _mm_unpacklo_epi8(ref0, zero);
+ const __m128i ref_1 = _mm_unpacklo_epi8(ref1, zero);
+ const __m128i ref_2 = _mm_unpacklo_epi8(ref2, zero);
+ const __m128i ref_3 = _mm_unpacklo_epi8(ref3, zero);
+ // Compute difference. -> 00 01 02 03 00 00 00 00
+ const __m128i diff0 = _mm_sub_epi16(src_0, ref_0);
+ const __m128i diff1 = _mm_sub_epi16(src_1, ref_1);
+ const __m128i diff2 = _mm_sub_epi16(src_2, ref_2);
+ const __m128i diff3 = _mm_sub_epi16(src_3, ref_3);
+
+ // Unpack and shuffle
+ // 00 01 02 03 0 0 0 0
+ // 10 11 12 13 0 0 0 0
+ // 20 21 22 23 0 0 0 0
+ // 30 31 32 33 0 0 0 0
+ const __m128i shuf01 = _mm_unpacklo_epi32(diff0, diff1);
+ const __m128i shuf23 = _mm_unpacklo_epi32(diff2, diff3);
__m128i v01, v32;
+ // First pass
+ FTransformPass1(&shuf01, &shuf23, &v01, &v32);
- // Difference between src and ref and initial transpose.
- {
- // Load src and convert to 16b.
- const __m128i src0 = _mm_loadl_epi64((__m128i*)&src[0 * BPS]);
- const __m128i src1 = _mm_loadl_epi64((__m128i*)&src[1 * BPS]);
- const __m128i src2 = _mm_loadl_epi64((__m128i*)&src[2 * BPS]);
- const __m128i src3 = _mm_loadl_epi64((__m128i*)&src[3 * BPS]);
- const __m128i src_0 = _mm_unpacklo_epi8(src0, zero);
- const __m128i src_1 = _mm_unpacklo_epi8(src1, zero);
- const __m128i src_2 = _mm_unpacklo_epi8(src2, zero);
- const __m128i src_3 = _mm_unpacklo_epi8(src3, zero);
- // Load ref and convert to 16b.
- const __m128i ref0 = _mm_loadl_epi64((__m128i*)&ref[0 * BPS]);
- const __m128i ref1 = _mm_loadl_epi64((__m128i*)&ref[1 * BPS]);
- const __m128i ref2 = _mm_loadl_epi64((__m128i*)&ref[2 * BPS]);
- const __m128i ref3 = _mm_loadl_epi64((__m128i*)&ref[3 * BPS]);
- const __m128i ref_0 = _mm_unpacklo_epi8(ref0, zero);
- const __m128i ref_1 = _mm_unpacklo_epi8(ref1, zero);
- const __m128i ref_2 = _mm_unpacklo_epi8(ref2, zero);
- const __m128i ref_3 = _mm_unpacklo_epi8(ref3, zero);
- // Compute difference. -> 00 01 02 03 00 00 00 00
- const __m128i diff0 = _mm_sub_epi16(src_0, ref_0);
- const __m128i diff1 = _mm_sub_epi16(src_1, ref_1);
- const __m128i diff2 = _mm_sub_epi16(src_2, ref_2);
- const __m128i diff3 = _mm_sub_epi16(src_3, ref_3);
-
-
- // Unpack and shuffle
- // 00 01 02 03 0 0 0 0
- // 10 11 12 13 0 0 0 0
- // 20 21 22 23 0 0 0 0
- // 30 31 32 33 0 0 0 0
- const __m128i shuf01 = _mm_unpacklo_epi32(diff0, diff1);
- const __m128i shuf23 = _mm_unpacklo_epi32(diff2, diff3);
- // 00 01 10 11 02 03 12 13
- // 20 21 30 31 22 23 32 33
- const __m128i shuf01_p =
- _mm_shufflehi_epi16(shuf01, _MM_SHUFFLE(2, 3, 0, 1));
- const __m128i shuf23_p =
- _mm_shufflehi_epi16(shuf23, _MM_SHUFFLE(2, 3, 0, 1));
- // 00 01 10 11 03 02 13 12
- // 20 21 30 31 23 22 33 32
- const __m128i s01 = _mm_unpacklo_epi64(shuf01_p, shuf23_p);
- const __m128i s32 = _mm_unpackhi_epi64(shuf01_p, shuf23_p);
- // 00 01 10 11 20 21 30 31
- // 03 02 13 12 23 22 33 32
- const __m128i a01 = _mm_add_epi16(s01, s32);
- const __m128i a32 = _mm_sub_epi16(s01, s32);
- // [d0 + d3 | d1 + d2 | ...] = [a0 a1 | a0' a1' | ... ]
- // [d0 - d3 | d1 - d2 | ...] = [a3 a2 | a3' a2' | ... ]
-
- const __m128i tmp0 = _mm_madd_epi16(a01, k88p); // [ (a0 + a1) << 3, ... ]
- const __m128i tmp2 = _mm_madd_epi16(a01, k88m); // [ (a0 - a1) << 3, ... ]
- const __m128i tmp1_1 = _mm_madd_epi16(a32, k5352_2217p);
- const __m128i tmp3_1 = _mm_madd_epi16(a32, k5352_2217m);
- const __m128i tmp1_2 = _mm_add_epi32(tmp1_1, k1812);
- const __m128i tmp3_2 = _mm_add_epi32(tmp3_1, k937);
- const __m128i tmp1 = _mm_srai_epi32(tmp1_2, 9);
- const __m128i tmp3 = _mm_srai_epi32(tmp3_2, 9);
- const __m128i s03 = _mm_packs_epi32(tmp0, tmp2);
- const __m128i s12 = _mm_packs_epi32(tmp1, tmp3);
- const __m128i s_lo = _mm_unpacklo_epi16(s03, s12); // 0 1 0 1 0 1...
- const __m128i s_hi = _mm_unpackhi_epi16(s03, s12); // 2 3 2 3 2 3
- const __m128i v23 = _mm_unpackhi_epi32(s_lo, s_hi);
- v01 = _mm_unpacklo_epi32(s_lo, s_hi);
- v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2)); // 3 2 3 2 3 2..
- }
+ // Second pass
+ FTransformPass2(&v01, &v32, out);
+}
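Taken together, the two passes vectorize libwebp's plain-C forward DCT; for reference, a scalar sketch of the same computation (constants as in the generic enc.c version; ScalarFTransform is an illustrative name):

static void ScalarFTransform(const uint8_t* src, const uint8_t* ref,
                             int16_t* out) {
  int i;
  int tmp[16];
  for (i = 0; i < 4; ++i, src += BPS, ref += BPS) {
    const int d0 = src[0] - ref[0];   // 9bit dynamic range ([-255,255])
    const int d1 = src[1] - ref[1];
    const int d2 = src[2] - ref[2];
    const int d3 = src[3] - ref[3];
    const int a0 = (d0 + d3);
    const int a1 = (d1 + d2);
    const int a2 = (d1 - d2);
    const int a3 = (d0 - d3);
    tmp[0 + i * 4] = (a0 + a1) * 8;                        // (a0 + a1) << 3
    tmp[1 + i * 4] = (a2 * 2217 + a3 * 5352 + 1812) >> 9;
    tmp[2 + i * 4] = (a0 - a1) * 8;                        // (a0 - a1) << 3
    tmp[3 + i * 4] = (a3 * 2217 - a2 * 5352 +  937) >> 9;
  }
  for (i = 0; i < 4; ++i) {
    const int a0 = (tmp[0 + i] + tmp[12 + i]);
    const int a1 = (tmp[4 + i] + tmp[ 8 + i]);
    const int a2 = (tmp[4 + i] - tmp[ 8 + i]);
    const int a3 = (tmp[0 + i] - tmp[12 + i]);
    out[ 0 + i] = (a0 + a1 + 7) >> 4;
    out[ 4 + i] = ((a2 * 2217 + a3 * 5352 + 12000) >> 16) + (a3 != 0);
    out[ 8 + i] = (a0 - a1 + 7) >> 4;
    out[12 + i] = ((a3 * 2217 - a2 * 5352 + 51000) >> 16);
  }
}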
+
+static void FTransform2(const uint8_t* src, const uint8_t* ref, int16_t* out) {
+ const __m128i zero = _mm_setzero_si128();
+
+ // Load src and convert to 16b.
+ const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]);
+ const __m128i src1 = _mm_loadl_epi64((const __m128i*)&src[1 * BPS]);
+ const __m128i src2 = _mm_loadl_epi64((const __m128i*)&src[2 * BPS]);
+ const __m128i src3 = _mm_loadl_epi64((const __m128i*)&src[3 * BPS]);
+ const __m128i src_0 = _mm_unpacklo_epi8(src0, zero);
+ const __m128i src_1 = _mm_unpacklo_epi8(src1, zero);
+ const __m128i src_2 = _mm_unpacklo_epi8(src2, zero);
+ const __m128i src_3 = _mm_unpacklo_epi8(src3, zero);
+ // Load ref and convert to 16b.
+ const __m128i ref0 = _mm_loadl_epi64((const __m128i*)&ref[0 * BPS]);
+ const __m128i ref1 = _mm_loadl_epi64((const __m128i*)&ref[1 * BPS]);
+ const __m128i ref2 = _mm_loadl_epi64((const __m128i*)&ref[2 * BPS]);
+ const __m128i ref3 = _mm_loadl_epi64((const __m128i*)&ref[3 * BPS]);
+ const __m128i ref_0 = _mm_unpacklo_epi8(ref0, zero);
+ const __m128i ref_1 = _mm_unpacklo_epi8(ref1, zero);
+ const __m128i ref_2 = _mm_unpacklo_epi8(ref2, zero);
+ const __m128i ref_3 = _mm_unpacklo_epi8(ref3, zero);
+ // Compute difference. -> 00 01 02 03 00' 01' 02' 03'
+ const __m128i diff0 = _mm_sub_epi16(src_0, ref_0);
+ const __m128i diff1 = _mm_sub_epi16(src_1, ref_1);
+ const __m128i diff2 = _mm_sub_epi16(src_2, ref_2);
+ const __m128i diff3 = _mm_sub_epi16(src_3, ref_3);
+
+ // Unpack and shuffle
+ // 00 01 02 03 0 0 0 0
+ // 10 11 12 13 0 0 0 0
+ // 20 21 22 23 0 0 0 0
+ // 30 31 32 33 0 0 0 0
+ const __m128i shuf01l = _mm_unpacklo_epi32(diff0, diff1);
+ const __m128i shuf23l = _mm_unpacklo_epi32(diff2, diff3);
+ const __m128i shuf01h = _mm_unpackhi_epi32(diff0, diff1);
+ const __m128i shuf23h = _mm_unpackhi_epi32(diff2, diff3);
+ __m128i v01l, v32l;
+ __m128i v01h, v32h;
+
+ // First pass
+ FTransformPass1(&shuf01l, &shuf23l, &v01l, &v32l);
+ FTransformPass1(&shuf01h, &shuf23h, &v01h, &v32h);
// Second pass
- {
- // Same operations are done on the (0,3) and (1,2) pairs.
- // a0 = v0 + v3
- // a1 = v1 + v2
- // a3 = v0 - v3
- // a2 = v1 - v2
- const __m128i a01 = _mm_add_epi16(v01, v32);
- const __m128i a32 = _mm_sub_epi16(v01, v32);
- const __m128i a11 = _mm_unpackhi_epi64(a01, a01);
- const __m128i a22 = _mm_unpackhi_epi64(a32, a32);
- const __m128i a01_plus_7 = _mm_add_epi16(a01, seven);
-
- // d0 = (a0 + a1 + 7) >> 4;
- // d2 = (a0 - a1 + 7) >> 4;
- const __m128i c0 = _mm_add_epi16(a01_plus_7, a11);
- const __m128i c2 = _mm_sub_epi16(a01_plus_7, a11);
- const __m128i d0 = _mm_srai_epi16(c0, 4);
- const __m128i d2 = _mm_srai_epi16(c2, 4);
-
- // f1 = ((b3 * 5352 + b2 * 2217 + 12000) >> 16)
- // f3 = ((b3 * 2217 - b2 * 5352 + 51000) >> 16)
- const __m128i b23 = _mm_unpacklo_epi16(a22, a32);
- const __m128i c1 = _mm_madd_epi16(b23, k5352_2217);
- const __m128i c3 = _mm_madd_epi16(b23, k2217_5352);
- const __m128i d1 = _mm_add_epi32(c1, k12000_plus_one);
- const __m128i d3 = _mm_add_epi32(c3, k51000);
- const __m128i e1 = _mm_srai_epi32(d1, 16);
- const __m128i e3 = _mm_srai_epi32(d3, 16);
- const __m128i f1 = _mm_packs_epi32(e1, e1);
- const __m128i f3 = _mm_packs_epi32(e3, e3);
- // f1 = f1 + (a3 != 0);
- // The compare will return (0xffff, 0) for (==0, !=0). To turn that into the
- // desired (0, 1), we add one earlier through k12000_plus_one.
- // -> f1 = f1 + 1 - (a3 == 0)
- const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero));
-
- const __m128i d0_g1 = _mm_unpacklo_epi64(d0, g1);
- const __m128i d2_f3 = _mm_unpacklo_epi64(d2, f3);
- _mm_storeu_si128((__m128i*)&out[0], d0_g1);
- _mm_storeu_si128((__m128i*)&out[8], d2_f3);
- }
+ FTransformPass2(&v01l, &v32l, out + 0);
+ FTransformPass2(&v01h, &v32h, out + 16);
+}
+
+static void FTransformWHTRow(const int16_t* const in, __m128i* const out) {
+ const __m128i kMult1 = _mm_set_epi16(0, 0, 0, 0, 1, 1, 1, 1);
+ const __m128i kMult2 = _mm_set_epi16(0, 0, 0, 0, -1, 1, -1, 1);
+ const __m128i src0 = _mm_loadl_epi64((__m128i*)&in[0 * 16]);
+ const __m128i src1 = _mm_loadl_epi64((__m128i*)&in[1 * 16]);
+ const __m128i src2 = _mm_loadl_epi64((__m128i*)&in[2 * 16]);
+ const __m128i src3 = _mm_loadl_epi64((__m128i*)&in[3 * 16]);
+ const __m128i A01 = _mm_unpacklo_epi16(src0, src1); // A0 A1 | ...
+ const __m128i A23 = _mm_unpacklo_epi16(src2, src3); // A2 A3 | ...
+ const __m128i B0 = _mm_adds_epi16(A01, A23); // a0 | a1 | ...
+ const __m128i B1 = _mm_subs_epi16(A01, A23); // a3 | a2 | ...
+ const __m128i C0 = _mm_unpacklo_epi32(B0, B1); // a0 | a1 | a3 | a2
+ const __m128i C1 = _mm_unpacklo_epi32(B1, B0); // a3 | a2 | a0 | a1
+ const __m128i D0 = _mm_madd_epi16(C0, kMult1); // out0, out1
+ const __m128i D1 = _mm_madd_epi16(C1, kMult2); // out2, out3
+ *out = _mm_unpacklo_epi64(D0, D1);
}
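Each FTransformWHTRow call performs, on four coefficients spaced 16 apart, the same butterfly that the removed scalar first pass used. As a sketch:

// per-row scalar equivalent (illustrative only):
//   a0 = in[0 * 16] + in[2 * 16];
//   a1 = in[1 * 16] + in[3 * 16];
//   a2 = in[1 * 16] - in[3 * 16];
//   a3 = in[0 * 16] - in[2 * 16];
//   row = { a0 + a1, a3 + a2, a3 - a2, a0 - a1 };
// (the new version does this in saturating 16-bit instead of 32-bit math)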
static void FTransformWHT(const int16_t* in, int16_t* out) {
- int32_t tmp[16];
- int i;
- for (i = 0; i < 4; ++i, in += 64) {
- const int a0 = (in[0 * 16] + in[2 * 16]);
- const int a1 = (in[1 * 16] + in[3 * 16]);
- const int a2 = (in[1 * 16] - in[3 * 16]);
- const int a3 = (in[0 * 16] - in[2 * 16]);
- tmp[0 + i * 4] = a0 + a1;
- tmp[1 + i * 4] = a3 + a2;
- tmp[2 + i * 4] = a3 - a2;
- tmp[3 + i * 4] = a0 - a1;
- }
+ __m128i row0, row1, row2, row3;
+ FTransformWHTRow(in + 0 * 64, &row0);
+ FTransformWHTRow(in + 1 * 64, &row1);
+ FTransformWHTRow(in + 2 * 64, &row2);
+ FTransformWHTRow(in + 3 * 64, &row3);
+
{
- const __m128i src0 = _mm_loadu_si128((__m128i*)&tmp[0]);
- const __m128i src1 = _mm_loadu_si128((__m128i*)&tmp[4]);
- const __m128i src2 = _mm_loadu_si128((__m128i*)&tmp[8]);
- const __m128i src3 = _mm_loadu_si128((__m128i*)&tmp[12]);
- const __m128i a0 = _mm_add_epi32(src0, src2);
- const __m128i a1 = _mm_add_epi32(src1, src3);
- const __m128i a2 = _mm_sub_epi32(src1, src3);
- const __m128i a3 = _mm_sub_epi32(src0, src2);
+ const __m128i a0 = _mm_add_epi32(row0, row2);
+ const __m128i a1 = _mm_add_epi32(row1, row3);
+ const __m128i a2 = _mm_sub_epi32(row1, row3);
+ const __m128i a3 = _mm_sub_epi32(row0, row2);
const __m128i b0 = _mm_srai_epi32(_mm_add_epi32(a0, a1), 1);
const __m128i b1 = _mm_srai_epi32(_mm_add_epi32(a3, a2), 1);
const __m128i b2 = _mm_srai_epi32(_mm_sub_epi32(a3, a2), 1);
@@ -486,143 +504,634 @@ static void FTransformWHT(const int16_t* in, int16_t* out) {
}
//------------------------------------------------------------------------------
-// Metric
+// Compute susceptibility based on DCT-coeff histograms:
+// the higher, the "easier" the macroblock is to compress.
-static int SSE_Nx4(const uint8_t* a, const uint8_t* b,
- int num_quads, int do_16) {
+static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
+ int start_block, int end_block,
+ VP8Histogram* const histo) {
const __m128i zero = _mm_setzero_si128();
- __m128i sum1 = zero;
- __m128i sum2 = zero;
-
- while (num_quads-- > 0) {
- // Note: for the !do_16 case, we read 16 pixels instead of 8 but that's ok,
- // thanks to buffer over-allocation to that effect.
- const __m128i a0 = _mm_loadu_si128((__m128i*)&a[BPS * 0]);
- const __m128i a1 = _mm_loadu_si128((__m128i*)&a[BPS * 1]);
- const __m128i a2 = _mm_loadu_si128((__m128i*)&a[BPS * 2]);
- const __m128i a3 = _mm_loadu_si128((__m128i*)&a[BPS * 3]);
- const __m128i b0 = _mm_loadu_si128((__m128i*)&b[BPS * 0]);
- const __m128i b1 = _mm_loadu_si128((__m128i*)&b[BPS * 1]);
- const __m128i b2 = _mm_loadu_si128((__m128i*)&b[BPS * 2]);
- const __m128i b3 = _mm_loadu_si128((__m128i*)&b[BPS * 3]);
-
- // compute clip0(a-b) and clip0(b-a)
- const __m128i a0p = _mm_subs_epu8(a0, b0);
- const __m128i a0m = _mm_subs_epu8(b0, a0);
- const __m128i a1p = _mm_subs_epu8(a1, b1);
- const __m128i a1m = _mm_subs_epu8(b1, a1);
- const __m128i a2p = _mm_subs_epu8(a2, b2);
- const __m128i a2m = _mm_subs_epu8(b2, a2);
- const __m128i a3p = _mm_subs_epu8(a3, b3);
- const __m128i a3m = _mm_subs_epu8(b3, a3);
-
- // compute |a-b| with 8b arithmetic as clip0(a-b) | clip0(b-a)
- const __m128i diff0 = _mm_or_si128(a0p, a0m);
- const __m128i diff1 = _mm_or_si128(a1p, a1m);
- const __m128i diff2 = _mm_or_si128(a2p, a2m);
- const __m128i diff3 = _mm_or_si128(a3p, a3m);
-
- // unpack (only four operations, instead of eight)
- const __m128i low0 = _mm_unpacklo_epi8(diff0, zero);
- const __m128i low1 = _mm_unpacklo_epi8(diff1, zero);
- const __m128i low2 = _mm_unpacklo_epi8(diff2, zero);
- const __m128i low3 = _mm_unpacklo_epi8(diff3, zero);
-
- // multiply with self
- const __m128i low_madd0 = _mm_madd_epi16(low0, low0);
- const __m128i low_madd1 = _mm_madd_epi16(low1, low1);
- const __m128i low_madd2 = _mm_madd_epi16(low2, low2);
- const __m128i low_madd3 = _mm_madd_epi16(low3, low3);
-
- // collect in a cascading way
- const __m128i low_sum0 = _mm_add_epi32(low_madd0, low_madd1);
- const __m128i low_sum1 = _mm_add_epi32(low_madd2, low_madd3);
- sum1 = _mm_add_epi32(sum1, low_sum0);
- sum2 = _mm_add_epi32(sum2, low_sum1);
-
- if (do_16) { // if necessary, process the higher 8 bytes similarly
- const __m128i hi0 = _mm_unpackhi_epi8(diff0, zero);
- const __m128i hi1 = _mm_unpackhi_epi8(diff1, zero);
- const __m128i hi2 = _mm_unpackhi_epi8(diff2, zero);
- const __m128i hi3 = _mm_unpackhi_epi8(diff3, zero);
-
- const __m128i hi_madd0 = _mm_madd_epi16(hi0, hi0);
- const __m128i hi_madd1 = _mm_madd_epi16(hi1, hi1);
- const __m128i hi_madd2 = _mm_madd_epi16(hi2, hi2);
- const __m128i hi_madd3 = _mm_madd_epi16(hi3, hi3);
- const __m128i hi_sum0 = _mm_add_epi32(hi_madd0, hi_madd1);
- const __m128i hi_sum1 = _mm_add_epi32(hi_madd2, hi_madd3);
- sum1 = _mm_add_epi32(sum1, hi_sum0);
- sum2 = _mm_add_epi32(sum2, hi_sum1);
+ const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
+ int j;
+ int distribution[MAX_COEFF_THRESH + 1] = { 0 };
+ for (j = start_block; j < end_block; ++j) {
+ int16_t out[16];
+ int k;
+
+ FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
+
+ // Convert coefficients to bin (within out[]).
+ {
+ // Load.
+ const __m128i out0 = _mm_loadu_si128((__m128i*)&out[0]);
+ const __m128i out1 = _mm_loadu_si128((__m128i*)&out[8]);
+ const __m128i d0 = _mm_sub_epi16(zero, out0);
+ const __m128i d1 = _mm_sub_epi16(zero, out1);
+ const __m128i abs0 = _mm_max_epi16(out0, d0); // abs(v), 16b
+ const __m128i abs1 = _mm_max_epi16(out1, d1);
+ // v = abs(out) >> 3
+ const __m128i v0 = _mm_srai_epi16(abs0, 3);
+ const __m128i v1 = _mm_srai_epi16(abs1, 3);
+ // bin = min(v, MAX_COEFF_THRESH)
+ const __m128i bin0 = _mm_min_epi16(v0, max_coeff_thresh);
+ const __m128i bin1 = _mm_min_epi16(v1, max_coeff_thresh);
+ // Store.
+ _mm_storeu_si128((__m128i*)&out[0], bin0);
+ _mm_storeu_si128((__m128i*)&out[8], bin1);
+ }
+
+ // Convert coefficients to bin.
+ for (k = 0; k < 16; ++k) {
+ ++distribution[out[k]];
}
- a += 4 * BPS;
- b += 4 * BPS;
}
- {
- int32_t tmp[4];
- const __m128i sum = _mm_add_epi32(sum1, sum2);
- _mm_storeu_si128((__m128i*)tmp, sum);
- return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
+ VP8SetHistogramData(distribution, histo);
+}
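Per coefficient, the binning above follows the scalar recipe spelled out in the comments (abs, shift, clamp). In 0.5.0 the counts are first gathered in a local array and then folded into the histogram via VP8SetHistogramData, which presumably derives the summary fields (e.g. max value / last non-zero bin) that VP8Histogram now stores instead of raw counts:

// scalar sketch of one coefficient's binning (illustrative only):
//   const int v = abs(out[k]) >> 3;
//   ++distribution[(v > MAX_COEFF_THRESH) ? MAX_COEFF_THRESH : v];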
+
+//------------------------------------------------------------------------------
+// Intra predictions
+
+// helper for chroma-DC predictions
+static WEBP_INLINE void Put8x8uv(uint8_t v, uint8_t* dst) {
+ int j;
+ const __m128i values = _mm_set1_epi8(v);
+ for (j = 0; j < 8; ++j) {
+ _mm_storel_epi64((__m128i*)(dst + j * BPS), values);
+ }
+}
+
+static WEBP_INLINE void Put16(uint8_t v, uint8_t* dst) {
+ int j;
+ const __m128i values = _mm_set1_epi8(v);
+ for (j = 0; j < 16; ++j) {
+ _mm_store_si128((__m128i*)(dst + j * BPS), values);
+ }
+}
+
+static WEBP_INLINE void Fill(uint8_t* dst, int value, int size) {
+ if (size == 4) {
+ int j;
+ for (j = 0; j < 4; ++j) {
+ memset(dst + j * BPS, value, 4);
+ }
+ } else if (size == 8) {
+ Put8x8uv(value, dst);
+ } else {
+ Put16(value, dst);
+ }
+}
+
+static WEBP_INLINE void VE8uv(uint8_t* dst, const uint8_t* top) {
+ int j;
+ const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
+ for (j = 0; j < 8; ++j) {
+ _mm_storel_epi64((__m128i*)(dst + j * BPS), top_values);
+ }
+}
+
+static WEBP_INLINE void VE16(uint8_t* dst, const uint8_t* top) {
+ const __m128i top_values = _mm_load_si128((const __m128i*)top);
+ int j;
+ for (j = 0; j < 16; ++j) {
+ _mm_store_si128((__m128i*)(dst + j * BPS), top_values);
+ }
+}
+
+static WEBP_INLINE void VerticalPred(uint8_t* dst,
+ const uint8_t* top, int size) {
+ if (top != NULL) {
+ if (size == 8) {
+ VE8uv(dst, top);
+ } else {
+ VE16(dst, top);
+ }
+ } else {
+ Fill(dst, 127, size);
+ }
+}
+
+static WEBP_INLINE void HE8uv(uint8_t* dst, const uint8_t* left) {
+ int j;
+ for (j = 0; j < 8; ++j) {
+ const __m128i values = _mm_set1_epi8(left[j]);
+ _mm_storel_epi64((__m128i*)dst, values);
+ dst += BPS;
+ }
+}
+
+static WEBP_INLINE void HE16(uint8_t* dst, const uint8_t* left) {
+ int j;
+ for (j = 0; j < 16; ++j) {
+ const __m128i values = _mm_set1_epi8(left[j]);
+ _mm_store_si128((__m128i*)dst, values);
+ dst += BPS;
+ }
+}
+
+static WEBP_INLINE void HorizontalPred(uint8_t* dst,
+ const uint8_t* left, int size) {
+ if (left != NULL) {
+ if (size == 8) {
+ HE8uv(dst, left);
+ } else {
+ HE16(dst, left);
+ }
+ } else {
+ Fill(dst, 129, size);
+ }
+}
+
+static WEBP_INLINE void TM(uint8_t* dst, const uint8_t* left,
+ const uint8_t* top, int size) {
+ const __m128i zero = _mm_setzero_si128();
+ int y;
+ if (size == 8) {
+ const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
+ const __m128i top_base = _mm_unpacklo_epi8(top_values, zero);
+ for (y = 0; y < 8; ++y, dst += BPS) {
+ const int val = left[y] - left[-1];
+ const __m128i base = _mm_set1_epi16(val);
+ const __m128i out = _mm_packus_epi16(_mm_add_epi16(base, top_base), zero);
+ _mm_storel_epi64((__m128i*)dst, out);
+ }
+ } else {
+ const __m128i top_values = _mm_load_si128((const __m128i*)top);
+ const __m128i top_base_0 = _mm_unpacklo_epi8(top_values, zero);
+ const __m128i top_base_1 = _mm_unpackhi_epi8(top_values, zero);
+ for (y = 0; y < 16; ++y, dst += BPS) {
+ const int val = left[y] - left[-1];
+ const __m128i base = _mm_set1_epi16(val);
+ const __m128i out_0 = _mm_add_epi16(base, top_base_0);
+ const __m128i out_1 = _mm_add_epi16(base, top_base_1);
+ const __m128i out = _mm_packus_epi16(out_0, out_1);
+ _mm_store_si128((__m128i*)dst, out);
+ }
+ }
+}
+
+static WEBP_INLINE void TrueMotion(uint8_t* dst, const uint8_t* left,
+ const uint8_t* top, int size) {
+ if (left != NULL) {
+ if (top != NULL) {
+ TM(dst, left, top, size);
+ } else {
+ HorizontalPred(dst, left, size);
+ }
+ } else {
+ // true motion without left samples (hence: with default 129 value)
+ // is equivalent to VE prediction where you just copy the top samples.
+ // Note that if top samples are not available, the default value is
+ // then 129, and not 127 as in the VerticalPred case.
+ if (top != NULL) {
+ VerticalPred(dst, top, size);
+ } else {
+ Fill(dst, 129, size);
+ }
+ }
+}
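The TM kernel above is the usual TrueMotion predictor: each output pixel is the top sample plus the difference between the row's left sample and the top-left corner, saturated to 8 bits. A scalar sketch (illustrative only):

// for (y = 0; y < size; ++y)
//   for (x = 0; x < size; ++x)
//     dst[x + y * BPS] = clip_8b(top[x] + left[y] - left[-1]);
// The SSE2 version hoists (left[y] - left[-1]) into a broadcast 16-bit base
// and lets _mm_packus_epi16 do the clipping.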
+
+static WEBP_INLINE void DC8uv(uint8_t* dst, const uint8_t* left,
+ const uint8_t* top) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
+ const __m128i left_values = _mm_loadl_epi64((const __m128i*)left);
+ const __m128i sum_top = _mm_sad_epu8(top_values, zero);
+ const __m128i sum_left = _mm_sad_epu8(left_values, zero);
+ const int DC = _mm_cvtsi128_si32(sum_top) + _mm_cvtsi128_si32(sum_left) + 8;
+ Put8x8uv(DC >> 4, dst);
+}
+
+static WEBP_INLINE void DC8uvNoLeft(uint8_t* dst, const uint8_t* top) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i top_values = _mm_loadl_epi64((const __m128i*)top);
+ const __m128i sum = _mm_sad_epu8(top_values, zero);
+ const int DC = _mm_cvtsi128_si32(sum) + 4;
+ Put8x8uv(DC >> 3, dst);
+}
+
+static WEBP_INLINE void DC8uvNoTop(uint8_t* dst, const uint8_t* left) {
+ // 'left' is contiguous so we can reuse the top summation.
+ DC8uvNoLeft(dst, left);
+}
+
+static WEBP_INLINE void DC8uvNoTopLeft(uint8_t* dst) {
+ Put8x8uv(0x80, dst);
+}
+
+static WEBP_INLINE void DC8uvMode(uint8_t* dst, const uint8_t* left,
+ const uint8_t* top) {
+ if (top != NULL) {
+ if (left != NULL) { // top and left present
+ DC8uv(dst, left, top);
+ } else { // top, but no left
+ DC8uvNoLeft(dst, top);
+ }
+ } else if (left != NULL) { // left but no top
+ DC8uvNoTop(dst, left);
+ } else { // no top, no left, nothing.
+ DC8uvNoTopLeft(dst);
+ }
+}
+
+static WEBP_INLINE void DC16(uint8_t* dst, const uint8_t* left,
+ const uint8_t* top) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i top_row = _mm_load_si128((const __m128i*)top);
+ const __m128i left_row = _mm_load_si128((const __m128i*)left);
+ const __m128i sad8x2 = _mm_sad_epu8(top_row, zero);
+ // sum the two sads: sad8x2[0:1] + sad8x2[8:9]
+ const __m128i sum_top = _mm_add_epi16(sad8x2, _mm_shuffle_epi32(sad8x2, 2));
+ const __m128i sad8x2_left = _mm_sad_epu8(left_row, zero);
+ // sum the two sads: sad8x2[0:1] + sad8x2[8:9]
+ const __m128i sum_left =
+ _mm_add_epi16(sad8x2_left, _mm_shuffle_epi32(sad8x2_left, 2));
+ const int DC = _mm_cvtsi128_si32(sum_top) + _mm_cvtsi128_si32(sum_left) + 16;
+ Put16(DC >> 5, dst);
+}
+
+static WEBP_INLINE void DC16NoLeft(uint8_t* dst, const uint8_t* top) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i top_row = _mm_load_si128((const __m128i*)top);
+ const __m128i sad8x2 = _mm_sad_epu8(top_row, zero);
+ // sum the two sads: sad8x2[0:1] + sad8x2[8:9]
+ const __m128i sum = _mm_add_epi16(sad8x2, _mm_shuffle_epi32(sad8x2, 2));
+ const int DC = _mm_cvtsi128_si32(sum) + 8;
+ Put16(DC >> 4, dst);
+}
+
+static WEBP_INLINE void DC16NoTop(uint8_t* dst, const uint8_t* left) {
+ // 'left' is contiguous so we can reuse the top summation.
+ DC16NoLeft(dst, left);
+}
+
+static WEBP_INLINE void DC16NoTopLeft(uint8_t* dst) {
+ Put16(0x80, dst);
+}
+
+static WEBP_INLINE void DC16Mode(uint8_t* dst, const uint8_t* left,
+ const uint8_t* top) {
+ if (top != NULL) {
+ if (left != NULL) { // top and left present
+ DC16(dst, left, top);
+ } else { // top, but no left
+ DC16NoLeft(dst, top);
+ }
+ } else if (left != NULL) { // left but no top
+ DC16NoTop(dst, left);
+ } else { // no top, no left, nothing.
+ DC16NoTopLeft(dst);
+ }
+}
+
+//------------------------------------------------------------------------------
+// 4x4 predictions
+
+#define DST(x, y) dst[(x) + (y) * BPS]
+#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
+#define AVG2(a, b) (((a) + (b) + 1) >> 1)
+
+// We use the following 8b-arithmetic tricks:
+// (a + 2 * b + c + 2) >> 2 = (AC + b + 1) >> 1
+// where: AC = (a + c) >> 1 = [(a + c + 1) >> 1] - [(a^c) & 1]
+// and:
+// (a + 2 * b + c + 2) >> 2 = (AB + BC + 1) >> 1 - (ab|bc)&lsb
+//   where: AB = (a + b + 1) >> 1, BC = (b + c + 1) >> 1
+//   and ab = a ^ b, bc = b ^ c, lsb = (AB^BC)&1
+
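The first identity is easy to sanity-check in scalar form; _mm_avg_epu8 computes (x + y + 1) >> 1 per byte, and the lsb term turns that rounding average into a truncating one (a sketch, not part of the patch):

static int Avg3ViaAvgEpu8(int a, int b, int c) {
  const int ac = ((a + c + 1) >> 1) - ((a ^ c) & 1);  // == (a + c) >> 1
  return (ac + b + 1) >> 1;        // == (a + 2 * b + c + 2) >> 2 for 8b input
}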
+static WEBP_INLINE void VE4(uint8_t* dst, const uint8_t* top) { // vertical
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i ABCDEFGH = _mm_loadl_epi64((__m128i*)(top - 1));
+ const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1);
+ const __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 2);
+ const __m128i a = _mm_avg_epu8(ABCDEFGH, CDEFGH00);
+ const __m128i lsb = _mm_and_si128(_mm_xor_si128(ABCDEFGH, CDEFGH00), one);
+ const __m128i b = _mm_subs_epu8(a, lsb);
+ const __m128i avg = _mm_avg_epu8(b, BCDEFGH0);
+ const uint32_t vals = _mm_cvtsi128_si32(avg);
+ int i;
+ for (i = 0; i < 4; ++i) {
+ WebPUint32ToMem(dst + i * BPS, vals);
+ }
+}
+
+static WEBP_INLINE void HE4(uint8_t* dst, const uint8_t* top) { // horizontal
+ const int X = top[-1];
+ const int I = top[-2];
+ const int J = top[-3];
+ const int K = top[-4];
+ const int L = top[-5];
+ WebPUint32ToMem(dst + 0 * BPS, 0x01010101U * AVG3(X, I, J));
+ WebPUint32ToMem(dst + 1 * BPS, 0x01010101U * AVG3(I, J, K));
+ WebPUint32ToMem(dst + 2 * BPS, 0x01010101U * AVG3(J, K, L));
+ WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(K, L, L));
+}
+
+static WEBP_INLINE void DC4(uint8_t* dst, const uint8_t* top) {
+ uint32_t dc = 4;
+ int i;
+ for (i = 0; i < 4; ++i) dc += top[i] + top[-5 + i];
+ Fill(dst, dc >> 3, 4);
+}
+
+static WEBP_INLINE void LD4(uint8_t* dst, const uint8_t* top) { // Down-Left
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i ABCDEFGH = _mm_loadl_epi64((const __m128i*)top);
+ const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 1);
+ const __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 2);
+ const __m128i CDEFGHH0 = _mm_insert_epi16(CDEFGH00, top[7], 3);
+ const __m128i avg1 = _mm_avg_epu8(ABCDEFGH, CDEFGHH0);
+ const __m128i lsb = _mm_and_si128(_mm_xor_si128(ABCDEFGH, CDEFGHH0), one);
+ const __m128i avg2 = _mm_subs_epu8(avg1, lsb);
+ const __m128i abcdefg = _mm_avg_epu8(avg2, BCDEFGH0);
+ WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32( abcdefg ));
+ WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1)));
+ WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2)));
+ WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
+}
+
+static WEBP_INLINE void VR4(uint8_t* dst,
+ const uint8_t* top) { // Vertical-Right
+ const __m128i one = _mm_set1_epi8(1);
+ const int I = top[-2];
+ const int J = top[-3];
+ const int K = top[-4];
+ const int X = top[-1];
+ const __m128i XABCD = _mm_loadl_epi64((const __m128i*)(top - 1));
+ const __m128i ABCD0 = _mm_srli_si128(XABCD, 1);
+ const __m128i abcd = _mm_avg_epu8(XABCD, ABCD0);
+ const __m128i _XABCD = _mm_slli_si128(XABCD, 1);
+ const __m128i IXABCD = _mm_insert_epi16(_XABCD, I | (X << 8), 0);
+ const __m128i avg1 = _mm_avg_epu8(IXABCD, ABCD0);
+ const __m128i lsb = _mm_and_si128(_mm_xor_si128(IXABCD, ABCD0), one);
+ const __m128i avg2 = _mm_subs_epu8(avg1, lsb);
+ const __m128i efgh = _mm_avg_epu8(avg2, XABCD);
+ WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32( abcd ));
+ WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32( efgh ));
+ WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_slli_si128(abcd, 1)));
+ WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_slli_si128(efgh, 1)));
+
+ // these two are hard to implement in SSE2, so we keep the C-version:
+ DST(0, 2) = AVG3(J, I, X);
+ DST(0, 3) = AVG3(K, J, I);
+}
+
+static WEBP_INLINE void VL4(uint8_t* dst,
+ const uint8_t* top) { // Vertical-Left
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i ABCDEFGH = _mm_loadl_epi64((const __m128i*)top);
+ const __m128i BCDEFGH_ = _mm_srli_si128(ABCDEFGH, 1);
+ const __m128i CDEFGH__ = _mm_srli_si128(ABCDEFGH, 2);
+ const __m128i avg1 = _mm_avg_epu8(ABCDEFGH, BCDEFGH_);
+ const __m128i avg2 = _mm_avg_epu8(CDEFGH__, BCDEFGH_);
+ const __m128i avg3 = _mm_avg_epu8(avg1, avg2);
+ const __m128i lsb1 = _mm_and_si128(_mm_xor_si128(avg1, avg2), one);
+ const __m128i ab = _mm_xor_si128(ABCDEFGH, BCDEFGH_);
+ const __m128i bc = _mm_xor_si128(CDEFGH__, BCDEFGH_);
+ const __m128i abbc = _mm_or_si128(ab, bc);
+ const __m128i lsb2 = _mm_and_si128(abbc, lsb1);
+ const __m128i avg4 = _mm_subs_epu8(avg3, lsb2);
+ const uint32_t extra_out = _mm_cvtsi128_si32(_mm_srli_si128(avg4, 4));
+ WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32( avg1 ));
+ WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32( avg4 ));
+ WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(avg1, 1)));
+ WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(avg4, 1)));
+
+ // these two are hard to get and irregular
+ DST(3, 2) = (extra_out >> 0) & 0xff;
+ DST(3, 3) = (extra_out >> 8) & 0xff;
+}
+
+static WEBP_INLINE void RD4(uint8_t* dst, const uint8_t* top) { // Down-right
+ const __m128i one = _mm_set1_epi8(1);
+ const __m128i LKJIXABC = _mm_loadl_epi64((const __m128i*)(top - 5));
+ const __m128i LKJIXABCD = _mm_insert_epi16(LKJIXABC, top[3], 4);
+ const __m128i KJIXABCD_ = _mm_srli_si128(LKJIXABCD, 1);
+ const __m128i JIXABCD__ = _mm_srli_si128(LKJIXABCD, 2);
+ const __m128i avg1 = _mm_avg_epu8(JIXABCD__, LKJIXABCD);
+ const __m128i lsb = _mm_and_si128(_mm_xor_si128(JIXABCD__, LKJIXABCD), one);
+ const __m128i avg2 = _mm_subs_epu8(avg1, lsb);
+ const __m128i abcdefg = _mm_avg_epu8(avg2, KJIXABCD_);
+ WebPUint32ToMem(dst + 3 * BPS, _mm_cvtsi128_si32( abcdefg ));
+ WebPUint32ToMem(dst + 2 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 1)));
+ WebPUint32ToMem(dst + 1 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 2)));
+ WebPUint32ToMem(dst + 0 * BPS, _mm_cvtsi128_si32(_mm_srli_si128(abcdefg, 3)));
+}
+
+static WEBP_INLINE void HU4(uint8_t* dst, const uint8_t* top) {
+ const int I = top[-2];
+ const int J = top[-3];
+ const int K = top[-4];
+ const int L = top[-5];
+ DST(0, 0) = AVG2(I, J);
+ DST(2, 0) = DST(0, 1) = AVG2(J, K);
+ DST(2, 1) = DST(0, 2) = AVG2(K, L);
+ DST(1, 0) = AVG3(I, J, K);
+ DST(3, 0) = DST(1, 1) = AVG3(J, K, L);
+ DST(3, 1) = DST(1, 2) = AVG3(K, L, L);
+ DST(3, 2) = DST(2, 2) =
+ DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
+}
+
+static WEBP_INLINE void HD4(uint8_t* dst, const uint8_t* top) {
+ const int X = top[-1];
+ const int I = top[-2];
+ const int J = top[-3];
+ const int K = top[-4];
+ const int L = top[-5];
+ const int A = top[0];
+ const int B = top[1];
+ const int C = top[2];
+
+ DST(0, 0) = DST(2, 1) = AVG2(I, X);
+ DST(0, 1) = DST(2, 2) = AVG2(J, I);
+ DST(0, 2) = DST(2, 3) = AVG2(K, J);
+ DST(0, 3) = AVG2(L, K);
+
+ DST(3, 0) = AVG3(A, B, C);
+ DST(2, 0) = AVG3(X, A, B);
+ DST(1, 0) = DST(3, 1) = AVG3(I, X, A);
+ DST(1, 1) = DST(3, 2) = AVG3(J, I, X);
+ DST(1, 2) = DST(3, 3) = AVG3(K, J, I);
+ DST(1, 3) = AVG3(L, K, J);
+}
+
+static WEBP_INLINE void TM4(uint8_t* dst, const uint8_t* top) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i top_values = _mm_cvtsi32_si128(WebPMemToUint32(top));
+ const __m128i top_base = _mm_unpacklo_epi8(top_values, zero);
+ int y;
+ for (y = 0; y < 4; ++y, dst += BPS) {
+ const int val = top[-2 - y] - top[-1];
+ const __m128i base = _mm_set1_epi16(val);
+ const __m128i out = _mm_packus_epi16(_mm_add_epi16(base, top_base), zero);
+ WebPUint32ToMem(dst, _mm_cvtsi128_si32(out));
+ }
+}
+
+#undef DST
+#undef AVG3
+#undef AVG2
+
+//------------------------------------------------------------------------------
+// luma 4x4 prediction
+
+// Left samples are at top[-5 .. -2], the top-left sample at top[-1], the top
+// samples at top[0..3], and the top-right samples at top[4..7].
+static void Intra4Preds(uint8_t* dst, const uint8_t* top) {
+ DC4(I4DC4 + dst, top);
+ TM4(I4TM4 + dst, top);
+ VE4(I4VE4 + dst, top);
+ HE4(I4HE4 + dst, top);
+ RD4(I4RD4 + dst, top);
+ VR4(I4VR4 + dst, top);
+ LD4(I4LD4 + dst, top);
+ VL4(I4VL4 + dst, top);
+ HD4(I4HD4 + dst, top);
+ HU4(I4HU4 + dst, top);
+}
+
+//------------------------------------------------------------------------------
+// Chroma 8x8 prediction (paragraph 12.2)
+
+static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
+ const uint8_t* top) {
+ // U block
+ DC8uvMode(C8DC8 + dst, left, top);
+ VerticalPred(C8VE8 + dst, top, 8);
+ HorizontalPred(C8HE8 + dst, left, 8);
+ TrueMotion(C8TM8 + dst, left, top, 8);
+ // V block
+ dst += 8;
+ if (top != NULL) top += 8;
+ if (left != NULL) left += 16;
+ DC8uvMode(C8DC8 + dst, left, top);
+ VerticalPred(C8VE8 + dst, top, 8);
+ HorizontalPred(C8HE8 + dst, left, 8);
+ TrueMotion(C8TM8 + dst, left, top, 8);
+}
+
+//------------------------------------------------------------------------------
+// luma 16x16 prediction (paragraph 12.3)
+
+static void Intra16Preds(uint8_t* dst,
+ const uint8_t* left, const uint8_t* top) {
+ DC16Mode(I16DC16 + dst, left, top);
+ VerticalPred(I16VE16 + dst, top, 16);
+ HorizontalPred(I16HE16 + dst, left, 16);
+ TrueMotion(I16TM16 + dst, left, top, 16);
+}
+
+//------------------------------------------------------------------------------
+// Metric
+
+static WEBP_INLINE void SubtractAndAccumulate(const __m128i a, const __m128i b,
+ __m128i* const sum) {
+ // take abs(a-b) in 8b
+ const __m128i a_b = _mm_subs_epu8(a, b);
+ const __m128i b_a = _mm_subs_epu8(b, a);
+ const __m128i abs_a_b = _mm_or_si128(a_b, b_a);
+ // zero-extend to 16b
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i C0 = _mm_unpacklo_epi8(abs_a_b, zero);
+ const __m128i C1 = _mm_unpackhi_epi8(abs_a_b, zero);
+ // multiply with self
+ const __m128i sum1 = _mm_madd_epi16(C0, C0);
+ const __m128i sum2 = _mm_madd_epi16(C1, C1);
+ *sum = _mm_add_epi32(sum1, sum2);
+}
+
+static WEBP_INLINE int SSE_16xN(const uint8_t* a, const uint8_t* b,
+ int num_pairs) {
+ __m128i sum = _mm_setzero_si128();
+ int32_t tmp[4];
+ int i;
+
+ for (i = 0; i < num_pairs; ++i) {
+ const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[BPS * 0]);
+ const __m128i b0 = _mm_loadu_si128((const __m128i*)&b[BPS * 0]);
+ const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[BPS * 1]);
+ const __m128i b1 = _mm_loadu_si128((const __m128i*)&b[BPS * 1]);
+ __m128i sum1, sum2;
+ SubtractAndAccumulate(a0, b0, &sum1);
+ SubtractAndAccumulate(a1, b1, &sum2);
+ sum = _mm_add_epi32(sum, _mm_add_epi32(sum1, sum2));
+ a += 2 * BPS;
+ b += 2 * BPS;
}
+ _mm_storeu_si128((__m128i*)tmp, sum);
+ return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
}
static int SSE16x16(const uint8_t* a, const uint8_t* b) {
- return SSE_Nx4(a, b, 4, 1);
+ return SSE_16xN(a, b, 8);
}
static int SSE16x8(const uint8_t* a, const uint8_t* b) {
- return SSE_Nx4(a, b, 2, 1);
+ return SSE_16xN(a, b, 4);
}
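+// Loads 8 bytes from 'ptr' and zero-extends them to eight 16-bit lanes.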
+#define LOAD_8x16b(ptr) \
+ _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(ptr)), zero)
+
static int SSE8x8(const uint8_t* a, const uint8_t* b) {
- return SSE_Nx4(a, b, 2, 0);
+ const __m128i zero = _mm_setzero_si128();
+ int num_pairs = 4;
+ __m128i sum = zero;
+ int32_t tmp[4];
+ while (num_pairs-- > 0) {
+ const __m128i a0 = LOAD_8x16b(&a[BPS * 0]);
+ const __m128i a1 = LOAD_8x16b(&a[BPS * 1]);
+ const __m128i b0 = LOAD_8x16b(&b[BPS * 0]);
+ const __m128i b1 = LOAD_8x16b(&b[BPS * 1]);
+ // subtract
+ const __m128i c0 = _mm_subs_epi16(a0, b0);
+ const __m128i c1 = _mm_subs_epi16(a1, b1);
+ // multiply/accumulate with self
+ const __m128i d0 = _mm_madd_epi16(c0, c0);
+ const __m128i d1 = _mm_madd_epi16(c1, c1);
+ // collect
+ const __m128i sum01 = _mm_add_epi32(d0, d1);
+ sum = _mm_add_epi32(sum, sum01);
+ a += 2 * BPS;
+ b += 2 * BPS;
+ }
+ _mm_storeu_si128((__m128i*)tmp, sum);
+ return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
}
+#undef LOAD_8x16b
static int SSE4x4(const uint8_t* a, const uint8_t* b) {
const __m128i zero = _mm_setzero_si128();
// Load values. Note that we read 8 pixels instead of 4,
// but the a/b buffers are over-allocated to that effect.
- const __m128i a0 = _mm_loadl_epi64((__m128i*)&a[BPS * 0]);
- const __m128i a1 = _mm_loadl_epi64((__m128i*)&a[BPS * 1]);
- const __m128i a2 = _mm_loadl_epi64((__m128i*)&a[BPS * 2]);
- const __m128i a3 = _mm_loadl_epi64((__m128i*)&a[BPS * 3]);
- const __m128i b0 = _mm_loadl_epi64((__m128i*)&b[BPS * 0]);
- const __m128i b1 = _mm_loadl_epi64((__m128i*)&b[BPS * 1]);
- const __m128i b2 = _mm_loadl_epi64((__m128i*)&b[BPS * 2]);
- const __m128i b3 = _mm_loadl_epi64((__m128i*)&b[BPS * 3]);
-
- // Combine pair of lines and convert to 16b.
+ const __m128i a0 = _mm_loadl_epi64((const __m128i*)&a[BPS * 0]);
+ const __m128i a1 = _mm_loadl_epi64((const __m128i*)&a[BPS * 1]);
+ const __m128i a2 = _mm_loadl_epi64((const __m128i*)&a[BPS * 2]);
+ const __m128i a3 = _mm_loadl_epi64((const __m128i*)&a[BPS * 3]);
+ const __m128i b0 = _mm_loadl_epi64((const __m128i*)&b[BPS * 0]);
+ const __m128i b1 = _mm_loadl_epi64((const __m128i*)&b[BPS * 1]);
+ const __m128i b2 = _mm_loadl_epi64((const __m128i*)&b[BPS * 2]);
+ const __m128i b3 = _mm_loadl_epi64((const __m128i*)&b[BPS * 3]);
+ // Combine pair of lines.
const __m128i a01 = _mm_unpacklo_epi32(a0, a1);
const __m128i a23 = _mm_unpacklo_epi32(a2, a3);
const __m128i b01 = _mm_unpacklo_epi32(b0, b1);
const __m128i b23 = _mm_unpacklo_epi32(b2, b3);
+ // Convert to 16b.
const __m128i a01s = _mm_unpacklo_epi8(a01, zero);
const __m128i a23s = _mm_unpacklo_epi8(a23, zero);
const __m128i b01s = _mm_unpacklo_epi8(b01, zero);
const __m128i b23s = _mm_unpacklo_epi8(b23, zero);
-
- // Compute differences; (a-b)^2 = (abs(a-b))^2 = (sat8(a-b) + sat8(b-a))^2
- // TODO(cduvivier): Dissassemble and figure out why this is fastest. We don't
- // need absolute values, there is no need to do calculation
- // in 8bit as we are already in 16bit, ... Yet this is what
- // benchmarks the fastest!
- const __m128i d0 = _mm_subs_epu8(a01s, b01s);
- const __m128i d1 = _mm_subs_epu8(b01s, a01s);
- const __m128i d2 = _mm_subs_epu8(a23s, b23s);
- const __m128i d3 = _mm_subs_epu8(b23s, a23s);
-
- // Square and add them all together.
- const __m128i madd0 = _mm_madd_epi16(d0, d0);
- const __m128i madd1 = _mm_madd_epi16(d1, d1);
- const __m128i madd2 = _mm_madd_epi16(d2, d2);
- const __m128i madd3 = _mm_madd_epi16(d3, d3);
- const __m128i sum0 = _mm_add_epi32(madd0, madd1);
- const __m128i sum1 = _mm_add_epi32(madd2, madd3);
- const __m128i sum2 = _mm_add_epi32(sum0, sum1);
+ // subtract, square and accumulate
+ const __m128i d0 = _mm_subs_epi16(a01s, b01s);
+ const __m128i d1 = _mm_subs_epi16(a23s, b23s);
+ const __m128i e0 = _mm_madd_epi16(d0, d0);
+ const __m128i e1 = _mm_madd_epi16(d1, d1);
+ const __m128i sum = _mm_add_epi32(e0, e1);
int32_t tmp[4];
- _mm_storeu_si128((__m128i*)tmp, sum2);
+ _mm_storeu_si128((__m128i*)tmp, sum);
return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
}
@@ -643,14 +1152,14 @@ static int TTransform(const uint8_t* inA, const uint8_t* inB,
// Load, combine and transpose inputs.
{
- const __m128i inA_0 = _mm_loadl_epi64((__m128i*)&inA[BPS * 0]);
- const __m128i inA_1 = _mm_loadl_epi64((__m128i*)&inA[BPS * 1]);
- const __m128i inA_2 = _mm_loadl_epi64((__m128i*)&inA[BPS * 2]);
- const __m128i inA_3 = _mm_loadl_epi64((__m128i*)&inA[BPS * 3]);
- const __m128i inB_0 = _mm_loadl_epi64((__m128i*)&inB[BPS * 0]);
- const __m128i inB_1 = _mm_loadl_epi64((__m128i*)&inB[BPS * 1]);
- const __m128i inB_2 = _mm_loadl_epi64((__m128i*)&inB[BPS * 2]);
- const __m128i inB_3 = _mm_loadl_epi64((__m128i*)&inB[BPS * 3]);
+ const __m128i inA_0 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 0]);
+ const __m128i inA_1 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 1]);
+ const __m128i inA_2 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 2]);
+ const __m128i inA_3 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 3]);
+ const __m128i inB_0 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 0]);
+ const __m128i inB_1 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 1]);
+ const __m128i inB_2 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 2]);
+ const __m128i inB_3 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 3]);
// Combine inA and inB (we'll do two transforms in parallel).
const __m128i inAB_0 = _mm_unpacklo_epi8(inA_0, inB_0);
@@ -729,10 +1238,8 @@ static int TTransform(const uint8_t* inA, const uint8_t* inB,
// Vertical pass and difference of weighted sums.
{
// Load all inputs.
- // TODO(cduvivier): Make variable declarations and allocations aligned so
- // we can use _mm_load_si128 instead of _mm_loadu_si128.
- const __m128i w_0 = _mm_loadu_si128((__m128i*)&w[0]);
- const __m128i w_8 = _mm_loadu_si128((__m128i*)&w[8]);
+ const __m128i w_0 = _mm_loadu_si128((const __m128i*)&w[0]);
+ const __m128i w_8 = _mm_loadu_si128((const __m128i*)&w[8]);
// Calculate a and b (two 4x4 at once).
const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
@@ -751,21 +1258,14 @@ static int TTransform(const uint8_t* inA, const uint8_t* inB,
__m128i B_b2 = _mm_unpackhi_epi64(b2, b3);
{
- // sign(b) = b >> 15 (0x0000 if positive, 0xffff if negative)
- const __m128i sign_A_b0 = _mm_srai_epi16(A_b0, 15);
- const __m128i sign_A_b2 = _mm_srai_epi16(A_b2, 15);
- const __m128i sign_B_b0 = _mm_srai_epi16(B_b0, 15);
- const __m128i sign_B_b2 = _mm_srai_epi16(B_b2, 15);
-
- // b = abs(b) = (b ^ sign) - sign
- A_b0 = _mm_xor_si128(A_b0, sign_A_b0);
- A_b2 = _mm_xor_si128(A_b2, sign_A_b2);
- B_b0 = _mm_xor_si128(B_b0, sign_B_b0);
- B_b2 = _mm_xor_si128(B_b2, sign_B_b2);
- A_b0 = _mm_sub_epi16(A_b0, sign_A_b0);
- A_b2 = _mm_sub_epi16(A_b2, sign_A_b2);
- B_b0 = _mm_sub_epi16(B_b0, sign_B_b0);
- B_b2 = _mm_sub_epi16(B_b2, sign_B_b2);
+ const __m128i d0 = _mm_sub_epi16(zero, A_b0);
+ const __m128i d1 = _mm_sub_epi16(zero, A_b2);
+ const __m128i d2 = _mm_sub_epi16(zero, B_b0);
+ const __m128i d3 = _mm_sub_epi16(zero, B_b2);
+ A_b0 = _mm_max_epi16(A_b0, d0); // abs(v), 16b
+ A_b2 = _mm_max_epi16(A_b2, d1);
+ B_b0 = _mm_max_epi16(B_b0, d2);
+ B_b2 = _mm_max_epi16(B_b2, d3);
}
// weighted sums
@@ -815,14 +1315,12 @@ static WEBP_INLINE int DoQuantizeBlock(int16_t in[16], int16_t out[16],
__m128i packed_out;
// Load all inputs.
- // TODO(cduvivier): Make variable declarations and allocations aligned so that
- // we can use _mm_load_si128 instead of _mm_loadu_si128.
__m128i in0 = _mm_loadu_si128((__m128i*)&in[0]);
__m128i in8 = _mm_loadu_si128((__m128i*)&in[8]);
- const __m128i iq0 = _mm_loadu_si128((__m128i*)&mtx->iq_[0]);
- const __m128i iq8 = _mm_loadu_si128((__m128i*)&mtx->iq_[8]);
- const __m128i q0 = _mm_loadu_si128((__m128i*)&mtx->q_[0]);
- const __m128i q8 = _mm_loadu_si128((__m128i*)&mtx->q_[8]);
+ const __m128i iq0 = _mm_loadu_si128((const __m128i*)&mtx->iq_[0]);
+ const __m128i iq8 = _mm_loadu_si128((const __m128i*)&mtx->iq_[8]);
+ const __m128i q0 = _mm_loadu_si128((const __m128i*)&mtx->q_[0]);
+ const __m128i q8 = _mm_loadu_si128((const __m128i*)&mtx->q_[8]);
// extract sign(in) (0x0000 if positive, 0xffff if negative)
const __m128i sign0 = _mm_cmpgt_epi16(zero, in0);
@@ -836,8 +1334,8 @@ static WEBP_INLINE int DoQuantizeBlock(int16_t in[16], int16_t out[16],
// coeff = abs(in) + sharpen
if (sharpen != NULL) {
- const __m128i sharpen0 = _mm_loadu_si128((__m128i*)&sharpen[0]);
- const __m128i sharpen8 = _mm_loadu_si128((__m128i*)&sharpen[8]);
+ const __m128i sharpen0 = _mm_loadu_si128((const __m128i*)&sharpen[0]);
+ const __m128i sharpen8 = _mm_loadu_si128((const __m128i*)&sharpen[8]);
coeff0 = _mm_add_epi16(coeff0, sharpen0);
coeff8 = _mm_add_epi16(coeff8, sharpen8);
}
@@ -855,10 +1353,10 @@ static WEBP_INLINE int DoQuantizeBlock(int16_t in[16], int16_t out[16],
__m128i out_08 = _mm_unpacklo_epi16(coeff_iQ8L, coeff_iQ8H);
__m128i out_12 = _mm_unpackhi_epi16(coeff_iQ8L, coeff_iQ8H);
// out = (coeff * iQ + B)
- const __m128i bias_00 = _mm_loadu_si128((__m128i*)&mtx->bias_[0]);
- const __m128i bias_04 = _mm_loadu_si128((__m128i*)&mtx->bias_[4]);
- const __m128i bias_08 = _mm_loadu_si128((__m128i*)&mtx->bias_[8]);
- const __m128i bias_12 = _mm_loadu_si128((__m128i*)&mtx->bias_[12]);
+ const __m128i bias_00 = _mm_loadu_si128((const __m128i*)&mtx->bias_[0]);
+ const __m128i bias_04 = _mm_loadu_si128((const __m128i*)&mtx->bias_[4]);
+ const __m128i bias_08 = _mm_loadu_si128((const __m128i*)&mtx->bias_[8]);
+ const __m128i bias_12 = _mm_loadu_si128((const __m128i*)&mtx->bias_[12]);
out_00 = _mm_add_epi32(out_00, bias_00);
out_04 = _mm_add_epi32(out_04, bias_04);
out_08 = _mm_add_epi32(out_08, bias_08);
@@ -929,47 +1427,31 @@ static int QuantizeBlockWHT(int16_t in[16], int16_t out[16],
return DoQuantizeBlock(in, out, NULL, mtx);
}
-// Forward declaration.
-void VP8SetResidualCoeffsSSE2(const int16_t* const coeffs,
- VP8Residual* const res);
-
-void VP8SetResidualCoeffsSSE2(const int16_t* const coeffs,
- VP8Residual* const res) {
- const __m128i c0 = _mm_loadu_si128((const __m128i*)coeffs);
- const __m128i c1 = _mm_loadu_si128((const __m128i*)(coeffs + 8));
- // Use SSE to compare 8 values with a single instruction.
- const __m128i zero = _mm_setzero_si128();
- const __m128i m0 = _mm_cmpeq_epi16(c0, zero);
- const __m128i m1 = _mm_cmpeq_epi16(c1, zero);
- // Get the comparison results as a bitmask, consisting of two times 16 bits:
- // two identical bits for each result. Concatenate both bitmasks to get a
- // single 32 bit value. Negate the mask to get the position of entries that
- // are not equal to zero. We don't need to mask out least significant bits
- // according to res->first, since coeffs[0] is 0 if res->first > 0
- const uint32_t mask =
- ~(((uint32_t)_mm_movemask_epi8(m1) << 16) | _mm_movemask_epi8(m0));
- // The position of the most significant non-zero bit indicates the position of
- // the last non-zero value. Divide the result by two because __movemask_epi8
- // operates on 8 bit values instead of 16 bit values.
- assert(res->first == 0 || coeffs[0] == 0);
- res->last = mask ? (BitsLog2Floor(mask) >> 1) : -1;
- res->coeffs = coeffs;
-}
-
-#endif // WEBP_USE_SSE2
+static int Quantize2Blocks(int16_t in[32], int16_t out[32],
+ const VP8Matrix* const mtx) {
+ int nz;
+ const uint16_t* const sharpen = &mtx->sharpen_[0];
+ nz = DoQuantizeBlock(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0;
+ nz |= DoQuantizeBlock(in + 1 * 16, out + 1 * 16, sharpen, mtx) << 1;
+ return nz;
+}
//------------------------------------------------------------------------------
// Entry point
extern void VP8EncDspInitSSE2(void);
-void VP8EncDspInitSSE2(void) {
-#if defined(WEBP_USE_SSE2)
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitSSE2(void) {
VP8CollectHistogram = CollectHistogram;
+ VP8EncPredLuma16 = Intra16Preds;
+ VP8EncPredChroma8 = IntraChromaPreds;
+ VP8EncPredLuma4 = Intra4Preds;
VP8EncQuantizeBlock = QuantizeBlock;
+ VP8EncQuantize2Blocks = Quantize2Blocks;
VP8EncQuantizeBlockWHT = QuantizeBlockWHT;
VP8ITransform = ITransform;
VP8FTransform = FTransform;
+ VP8FTransform2 = FTransform2;
VP8FTransformWHT = FTransformWHT;
VP8SSE16x16 = SSE16x16;
VP8SSE16x8 = SSE16x8;
@@ -977,6 +1459,10 @@ void VP8EncDspInitSSE2(void) {
VP8SSE4x4 = SSE4x4;
VP8TDisto4x4 = Disto4x4;
VP8TDisto16x16 = Disto16x16;
-#endif // WEBP_USE_SSE2
}
+#else // !WEBP_USE_SSE2
+
+WEBP_DSP_INIT_STUB(VP8EncDspInitSSE2)
+
+#endif // WEBP_USE_SSE2
diff --git a/src/3rdparty/libwebp/src/dsp/enc_sse41.c b/src/3rdparty/libwebp/src/dsp/enc_sse41.c
new file mode 100644
index 0000000..65c01ae
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/enc_sse41.c
@@ -0,0 +1,373 @@
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// SSE4 version of some encoding functions.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_SSE41)
+#include <smmintrin.h>
+#include <stdlib.h> // for abs()
+
+#include "../enc/vp8enci.h"
+
+//------------------------------------------------------------------------------
+// Compute susceptibility based on DCT-coeff histograms.
+
+static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
+ int start_block, int end_block,
+ VP8Histogram* const histo) {
+ const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH);
+ int j;
+ int distribution[MAX_COEFF_THRESH + 1] = { 0 };
+ for (j = start_block; j < end_block; ++j) {
+ int16_t out[16];
+ int k;
+
+ VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
+
+ // Convert coefficients to bin (within out[]).
+ {
+ // Load.
+ const __m128i out0 = _mm_loadu_si128((__m128i*)&out[0]);
+ const __m128i out1 = _mm_loadu_si128((__m128i*)&out[8]);
+ // v = abs(out) >> 3
+ const __m128i abs0 = _mm_abs_epi16(out0);
+ const __m128i abs1 = _mm_abs_epi16(out1);
+ const __m128i v0 = _mm_srai_epi16(abs0, 3);
+ const __m128i v1 = _mm_srai_epi16(abs1, 3);
+ // bin = min(v, MAX_COEFF_THRESH)
+ const __m128i bin0 = _mm_min_epi16(v0, max_coeff_thresh);
+ const __m128i bin1 = _mm_min_epi16(v1, max_coeff_thresh);
+ // Store.
+ _mm_storeu_si128((__m128i*)&out[0], bin0);
+ _mm_storeu_si128((__m128i*)&out[8], bin1);
+ }
+
+ // Accumulate the binned coefficients into the histogram.
+ for (k = 0; k < 16; ++k) {
+ ++distribution[out[k]];
+ }
+ }
+ VP8SetHistogramData(distribution, histo);
+}
+
+//------------------------------------------------------------------------------
+// Texture distortion
+//
+// We try to match the spectral content (weighted) between source and
+// reconstructed samples.
+
+// Hadamard transform
+// Returns the difference between the weighted sums of the absolute values of
+// the transformed coefficients of the two inputs.
+static int TTransform(const uint8_t* inA, const uint8_t* inB,
+ const uint16_t* const w) {
+ __m128i tmp_0, tmp_1, tmp_2, tmp_3;
+
+ // Load, combine and transpose inputs.
+ {
+ const __m128i inA_0 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 0]);
+ const __m128i inA_1 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 1]);
+ const __m128i inA_2 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 2]);
+ const __m128i inA_3 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 3]);
+ const __m128i inB_0 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 0]);
+ const __m128i inB_1 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 1]);
+ const __m128i inB_2 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 2]);
+ const __m128i inB_3 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 3]);
+
+ // Combine inA and inB (we'll do two transforms in parallel).
+ const __m128i inAB_0 = _mm_unpacklo_epi8(inA_0, inB_0);
+ const __m128i inAB_1 = _mm_unpacklo_epi8(inA_1, inB_1);
+ const __m128i inAB_2 = _mm_unpacklo_epi8(inA_2, inB_2);
+ const __m128i inAB_3 = _mm_unpacklo_epi8(inA_3, inB_3);
+ // a00 b00 a01 b01 a02 b02 a03 b03 0 0 0 0 0 0 0 0
+ // a10 b10 a11 b11 a12 b12 a13 b13 0 0 0 0 0 0 0 0
+ // a20 b20 a21 b21 a22 b22 a23 b23 0 0 0 0 0 0 0 0
+ // a30 b30 a31 b31 a32 b32 a33 b33 0 0 0 0 0 0 0 0
+
+ // Transpose the two 4x4, discarding the filling zeroes.
+ const __m128i transpose0_0 = _mm_unpacklo_epi8(inAB_0, inAB_2);
+ const __m128i transpose0_1 = _mm_unpacklo_epi8(inAB_1, inAB_3);
+ // a00 a20 b00 b20 a01 a21 b01 b21 a02 a22 b02 b22 a03 a23 b03 b23
+ // a10 a30 b10 b30 a11 a31 b11 b31 a12 a32 b12 b32 a13 a33 b13 b33
+ const __m128i transpose1_0 = _mm_unpacklo_epi8(transpose0_0, transpose0_1);
+ const __m128i transpose1_1 = _mm_unpackhi_epi8(transpose0_0, transpose0_1);
+ // a00 a10 a20 a30 b00 b10 b20 b30 a01 a11 a21 a31 b01 b11 b21 b31
+ // a02 a12 a22 a32 b02 b12 b22 b32 a03 a13 a23 a33 b03 b13 b23 b33
+
+ // Convert to 16b.
+ tmp_0 = _mm_cvtepu8_epi16(transpose1_0);
+ tmp_1 = _mm_cvtepu8_epi16(_mm_srli_si128(transpose1_0, 8));
+ tmp_2 = _mm_cvtepu8_epi16(transpose1_1);
+ tmp_3 = _mm_cvtepu8_epi16(_mm_srli_si128(transpose1_1, 8));
+ // a00 a10 a20 a30 b00 b10 b20 b30
+ // a01 a11 a21 a31 b01 b11 b21 b31
+ // a02 a12 a22 a32 b02 b12 b22 b32
+ // a03 a13 a23 a33 b03 b13 b23 b33
+ }
+
+ // Horizontal pass and subsequent transpose.
+ {
+ // Calculate a and b (two 4x4 at once).
+ const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
+ const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);
+ const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);
+ const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);
+ const __m128i b0 = _mm_add_epi16(a0, a1);
+ const __m128i b1 = _mm_add_epi16(a3, a2);
+ const __m128i b2 = _mm_sub_epi16(a3, a2);
+ const __m128i b3 = _mm_sub_epi16(a0, a1);
+ // a00 a01 a02 a03 b00 b01 b02 b03
+ // a10 a11 a12 a13 b10 b11 b12 b13
+ // a20 a21 a22 a23 b20 b21 b22 b23
+ // a30 a31 a32 a33 b30 b31 b32 b33
+
+ // Transpose the two 4x4.
+ const __m128i transpose0_0 = _mm_unpacklo_epi16(b0, b1);
+ const __m128i transpose0_1 = _mm_unpacklo_epi16(b2, b3);
+ const __m128i transpose0_2 = _mm_unpackhi_epi16(b0, b1);
+ const __m128i transpose0_3 = _mm_unpackhi_epi16(b2, b3);
+ // a00 a10 a01 a11 a02 a12 a03 a13
+ // a20 a30 a21 a31 a22 a32 a23 a33
+ // b00 b10 b01 b11 b02 b12 b03 b13
+ // b20 b30 b21 b31 b22 b32 b23 b33
+ const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
+ const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);
+ const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
+ const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);
+ // a00 a10 a20 a30 a01 a11 a21 a31
+ // b00 b10 b20 b30 b01 b11 b21 b31
+ // a02 a12 a22 a32 a03 a13 a23 a33
+ // b02 b12 b22 b32 b03 b13 b23 b33
+ tmp_0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);
+ tmp_1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);
+ tmp_2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);
+ tmp_3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);
+ // a00 a10 a20 a30 b00 b10 b20 b30
+ // a01 a11 a21 a31 b01 b11 b21 b31
+ // a02 a12 a22 a32 b02 b12 b22 b32
+ // a03 a13 a23 a33 b03 b13 b23 b33
+ }
+
+ // Vertical pass and difference of weighted sums.
+ {
+ // Load all inputs.
+ const __m128i w_0 = _mm_loadu_si128((const __m128i*)&w[0]);
+ const __m128i w_8 = _mm_loadu_si128((const __m128i*)&w[8]);
+
+ // Calculate a and b (two 4x4 at once).
+ const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
+ const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);
+ const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);
+ const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);
+ const __m128i b0 = _mm_add_epi16(a0, a1);
+ const __m128i b1 = _mm_add_epi16(a3, a2);
+ const __m128i b2 = _mm_sub_epi16(a3, a2);
+ const __m128i b3 = _mm_sub_epi16(a0, a1);
+
+ // Separate the transforms of inA and inB.
+ __m128i A_b0 = _mm_unpacklo_epi64(b0, b1);
+ __m128i A_b2 = _mm_unpacklo_epi64(b2, b3);
+ __m128i B_b0 = _mm_unpackhi_epi64(b0, b1);
+ __m128i B_b2 = _mm_unpackhi_epi64(b2, b3);
+
+ A_b0 = _mm_abs_epi16(A_b0);
+ A_b2 = _mm_abs_epi16(A_b2);
+ B_b0 = _mm_abs_epi16(B_b0);
+ B_b2 = _mm_abs_epi16(B_b2);
+
+ // weighted sums
+ A_b0 = _mm_madd_epi16(A_b0, w_0);
+ A_b2 = _mm_madd_epi16(A_b2, w_8);
+ B_b0 = _mm_madd_epi16(B_b0, w_0);
+ B_b2 = _mm_madd_epi16(B_b2, w_8);
+ A_b0 = _mm_add_epi32(A_b0, A_b2);
+ B_b0 = _mm_add_epi32(B_b0, B_b2);
+
+ // difference of weighted sums
+ A_b2 = _mm_sub_epi32(A_b0, B_b0);
+ // cascading summation of the differences
+ B_b0 = _mm_hadd_epi32(A_b2, A_b2);
+ B_b2 = _mm_hadd_epi32(B_b0, B_b0);
+ return _mm_cvtsi128_si32(B_b2);
+ }
+}
+
+static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
+ const uint16_t* const w) {
+ const int diff_sum = TTransform(a, b, w);
+ return abs(diff_sum) >> 5;
+}
+
+static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
+ const uint16_t* const w) {
+ int D = 0;
+ int x, y;
+ for (y = 0; y < 16 * BPS; y += 4 * BPS) {
+ for (x = 0; x < 16; x += 4) {
+ D += Disto4x4(a + x + y, b + x + y, w);
+ }
+ }
+ return D;
+}
+
+//------------------------------------------------------------------------------
+// Quantization
+//
+
+// Generates a pshufb constant for shuffling 16b words.
+#define PSHUFB_CST(A,B,C,D,E,F,G,H) \
+ _mm_set_epi8(2 * (H) + 1, 2 * (H) + 0, 2 * (G) + 1, 2 * (G) + 0, \
+ 2 * (F) + 1, 2 * (F) + 0, 2 * (E) + 1, 2 * (E) + 0, \
+ 2 * (D) + 1, 2 * (D) + 0, 2 * (C) + 1, 2 * (C) + 0, \
+ 2 * (B) + 1, 2 * (B) + 0, 2 * (A) + 1, 2 * (A) + 0)
+
+static WEBP_INLINE int DoQuantizeBlock(int16_t in[16], int16_t out[16],
+ const uint16_t* const sharpen,
+ const VP8Matrix* const mtx) {
+ const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL);
+ const __m128i zero = _mm_setzero_si128();
+ __m128i out0, out8;
+ __m128i packed_out;
+
+ // Load all inputs.
+ __m128i in0 = _mm_loadu_si128((__m128i*)&in[0]);
+ __m128i in8 = _mm_loadu_si128((__m128i*)&in[8]);
+ const __m128i iq0 = _mm_loadu_si128((const __m128i*)&mtx->iq_[0]);
+ const __m128i iq8 = _mm_loadu_si128((const __m128i*)&mtx->iq_[8]);
+ const __m128i q0 = _mm_loadu_si128((const __m128i*)&mtx->q_[0]);
+ const __m128i q8 = _mm_loadu_si128((const __m128i*)&mtx->q_[8]);
+
+ // coeff = abs(in)
+ __m128i coeff0 = _mm_abs_epi16(in0);
+ __m128i coeff8 = _mm_abs_epi16(in8);
+
+ // coeff = abs(in) + sharpen
+ if (sharpen != NULL) {
+ const __m128i sharpen0 = _mm_loadu_si128((const __m128i*)&sharpen[0]);
+ const __m128i sharpen8 = _mm_loadu_si128((const __m128i*)&sharpen[8]);
+ coeff0 = _mm_add_epi16(coeff0, sharpen0);
+ coeff8 = _mm_add_epi16(coeff8, sharpen8);
+ }
+
+ // out = (coeff * iQ + B) >> QFIX
+ {
+ // doing calculations with 32b precision (QFIX=17)
+ // out = (coeff * iQ)
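+ // (mullo/mulhi yield the low/high 16 bits of each 16x16 product; the
+ // unpacklo/unpackhi pairs below interleave them into full 32-bit values)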
+ const __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0);
+ const __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0);
+ const __m128i coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8);
+ const __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8);
+ __m128i out_00 = _mm_unpacklo_epi16(coeff_iQ0L, coeff_iQ0H);
+ __m128i out_04 = _mm_unpackhi_epi16(coeff_iQ0L, coeff_iQ0H);
+ __m128i out_08 = _mm_unpacklo_epi16(coeff_iQ8L, coeff_iQ8H);
+ __m128i out_12 = _mm_unpackhi_epi16(coeff_iQ8L, coeff_iQ8H);
+ // out = (coeff * iQ + B)
+ const __m128i bias_00 = _mm_loadu_si128((const __m128i*)&mtx->bias_[0]);
+ const __m128i bias_04 = _mm_loadu_si128((const __m128i*)&mtx->bias_[4]);
+ const __m128i bias_08 = _mm_loadu_si128((const __m128i*)&mtx->bias_[8]);
+ const __m128i bias_12 = _mm_loadu_si128((const __m128i*)&mtx->bias_[12]);
+ out_00 = _mm_add_epi32(out_00, bias_00);
+ out_04 = _mm_add_epi32(out_04, bias_04);
+ out_08 = _mm_add_epi32(out_08, bias_08);
+ out_12 = _mm_add_epi32(out_12, bias_12);
+ // out = QUANTDIV(coeff, iQ, B, QFIX)
+ out_00 = _mm_srai_epi32(out_00, QFIX);
+ out_04 = _mm_srai_epi32(out_04, QFIX);
+ out_08 = _mm_srai_epi32(out_08, QFIX);
+ out_12 = _mm_srai_epi32(out_12, QFIX);
+
+ // pack result as 16b
+ out0 = _mm_packs_epi32(out_00, out_04);
+ out8 = _mm_packs_epi32(out_08, out_12);
+
+ // if (coeff > 2047) coeff = 2047
+ out0 = _mm_min_epi16(out0, max_coeff_2047);
+ out8 = _mm_min_epi16(out8, max_coeff_2047);
+ }
+
+ // put sign back
+ out0 = _mm_sign_epi16(out0, in0);
+ out8 = _mm_sign_epi16(out8, in8);
+
+ // in = out * Q
+ in0 = _mm_mullo_epi16(out0, q0);
+ in8 = _mm_mullo_epi16(out8, q8);
+
+ _mm_storeu_si128((__m128i*)&in[0], in0);
+ _mm_storeu_si128((__m128i*)&in[8], in8);
+
+ // zigzag the output before storing it. The re-ordering is:
+ // 0 1 2 3 4 5 6 7 | 8 9 10 11 12 13 14 15
+ // -> 0 1 4[8]5 2 3 6 | 9 12 13 10 [7]11 14 15
+ // Only two misplaced entries ([8] and [7]) cross the register boundary.
+ // We use pshufb instead of pshuflo/pshufhi.
+ {
+ const __m128i kCst_lo = PSHUFB_CST(0, 1, 4, -1, 5, 2, 3, 6);
+ const __m128i kCst_7 = PSHUFB_CST(-1, -1, -1, -1, 7, -1, -1, -1);
+ const __m128i tmp_lo = _mm_shuffle_epi8(out0, kCst_lo);
+ const __m128i tmp_7 = _mm_shuffle_epi8(out0, kCst_7); // extract #7
+ const __m128i kCst_hi = PSHUFB_CST(1, 4, 5, 2, -1, 3, 6, 7);
+ const __m128i kCst_8 = PSHUFB_CST(-1, -1, -1, 0, -1, -1, -1, -1);
+ const __m128i tmp_hi = _mm_shuffle_epi8(out8, kCst_hi);
+ const __m128i tmp_8 = _mm_shuffle_epi8(out8, kCst_8); // extract #8
+ const __m128i out_z0 = _mm_or_si128(tmp_lo, tmp_8);
+ const __m128i out_z8 = _mm_or_si128(tmp_hi, tmp_7);
+ _mm_storeu_si128((__m128i*)&out[0], out_z0);
+ _mm_storeu_si128((__m128i*)&out[8], out_z8);
+ packed_out = _mm_packs_epi16(out_z0, out_z8);
+ }
+
+ // detect if all 'out' values are zeroes or not
+ return (_mm_movemask_epi8(_mm_cmpeq_epi8(packed_out, zero)) != 0xffff);
+}
+
+#undef PSHUFB_CST
+
+static int QuantizeBlock(int16_t in[16], int16_t out[16],
+ const VP8Matrix* const mtx) {
+ return DoQuantizeBlock(in, out, &mtx->sharpen_[0], mtx);
+}
+
+static int QuantizeBlockWHT(int16_t in[16], int16_t out[16],
+ const VP8Matrix* const mtx) {
+ return DoQuantizeBlock(in, out, NULL, mtx);
+}
+
+static int Quantize2Blocks(int16_t in[32], int16_t out[32],
+ const VP8Matrix* const mtx) {
+ int nz;
+ const uint16_t* const sharpen = &mtx->sharpen_[0];
+ nz = DoQuantizeBlock(in + 0 * 16, out + 0 * 16, sharpen, mtx) << 0;
+ nz |= DoQuantizeBlock(in + 1 * 16, out + 1 * 16, sharpen, mtx) << 1;
+ return nz;
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8EncDspInitSSE41(void);
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitSSE41(void) {
+ VP8CollectHistogram = CollectHistogram;
+ VP8EncQuantizeBlock = QuantizeBlock;
+ VP8EncQuantize2Blocks = Quantize2Blocks;
+ VP8EncQuantizeBlockWHT = QuantizeBlockWHT;
+ VP8TDisto4x4 = Disto4x4;
+ VP8TDisto16x16 = Disto16x16;
+}
+
+#else // !WEBP_USE_SSE41
+
+WEBP_DSP_INIT_STUB(VP8EncDspInitSSE41)
+
+#endif // WEBP_USE_SSE41
diff --git a/src/3rdparty/libwebp/src/dsp/filters.c b/src/3rdparty/libwebp/src/dsp/filters.c
new file mode 100644
index 0000000..5c30f2e
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/filters.c
@@ -0,0 +1,240 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Spatial prediction using various filters
+//
+// Author: Urvang (urvang@google.com)
+
+#include "./dsp.h"
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+//------------------------------------------------------------------------------
+// Helpful macro.
+
+# define SANITY_CHECK(in, out) \
+ assert(in != NULL); \
+ assert(out != NULL); \
+ assert(width > 0); \
+ assert(height > 0); \
+ assert(stride >= width); \
+ assert(row >= 0 && num_rows > 0 && row + num_rows <= height); \
+ (void)height; // Silence unused warning.
+
+static WEBP_INLINE void PredictLine(const uint8_t* src, const uint8_t* pred,
+ uint8_t* dst, int length, int inverse) {
+ int i;
+ if (inverse) {
+ for (i = 0; i < length; ++i) dst[i] = src[i] + pred[i];
+ } else {
+ for (i = 0; i < length; ++i) dst[i] = src[i] - pred[i];
+ }
+}
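+// Note: with inverse == 0 this emits residuals (src - pred); with
+// inverse == 1 it reconstructs samples (src + pred). Since uint8_t
+// arithmetic wraps mod 256, the two operations are exact inverses.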
+
+//------------------------------------------------------------------------------
+// Horizontal filter.
+
+static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
+ int width, int height, int stride,
+ int row, int num_rows,
+ int inverse, uint8_t* out) {
+ const uint8_t* preds;
+ const size_t start_offset = row * stride;
+ const int last_row = row + num_rows;
+ SANITY_CHECK(in, out);
+ in += start_offset;
+ out += start_offset;
+ preds = inverse ? out : in;
+
+ if (row == 0) {
+ // Leftmost pixel is the same as input for topmost scanline.
+ out[0] = in[0];
+ PredictLine(in + 1, preds, out + 1, width - 1, inverse);
+ row = 1;
+ preds += stride;
+ in += stride;
+ out += stride;
+ }
+
+ // Filter line-by-line.
+ while (row < last_row) {
+ // Leftmost pixel is predicted from above.
+ PredictLine(in, preds - stride, out, 1, inverse);
+ PredictLine(in + 1, preds, out + 1, width - 1, inverse);
+ ++row;
+ preds += stride;
+ in += stride;
+ out += stride;
+ }
+}
+
+//------------------------------------------------------------------------------
+// Vertical filter.
+
+static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
+ int width, int height, int stride,
+ int row, int num_rows,
+ int inverse, uint8_t* out) {
+ const uint8_t* preds;
+ const size_t start_offset = row * stride;
+ const int last_row = row + num_rows;
+ SANITY_CHECK(in, out);
+ in += start_offset;
+ out += start_offset;
+ preds = inverse ? out : in;
+
+ if (row == 0) {
+ // Very first top-left pixel is copied.
+ out[0] = in[0];
+ // Rest of top scan-line is left-predicted.
+ PredictLine(in + 1, preds, out + 1, width - 1, inverse);
+ row = 1;
+ in += stride;
+ out += stride;
+ } else {
+ // We are starting from in-between. Make sure 'preds' points to prev row.
+ preds -= stride;
+ }
+
+ // Filter line-by-line.
+ while (row < last_row) {
+ PredictLine(in, preds, out, width, inverse);
+ ++row;
+ preds += stride;
+ in += stride;
+ out += stride;
+ }
+}
+
+//------------------------------------------------------------------------------
+// Gradient filter.
+
+static WEBP_INLINE int GradientPredictor(uint8_t a, uint8_t b, uint8_t c) {
+ const int g = a + b - c;
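+ // (g & ~0xff) == 0 holds exactly when 0 <= g <= 255, i.e. when g needs
+ // no clamping.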
+ return ((g & ~0xff) == 0) ? g : (g < 0) ? 0 : 255; // clip to 8bit
+}
+
+static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
+ int width, int height, int stride,
+ int row, int num_rows,
+ int inverse, uint8_t* out) {
+ const uint8_t* preds;
+ const size_t start_offset = row * stride;
+ const int last_row = row + num_rows;
+ SANITY_CHECK(in, out);
+ in += start_offset;
+ out += start_offset;
+ preds = inverse ? out : in;
+
+ // left prediction for top scan-line
+ if (row == 0) {
+ out[0] = in[0];
+ PredictLine(in + 1, preds, out + 1, width - 1, inverse);
+ row = 1;
+ preds += stride;
+ in += stride;
+ out += stride;
+ }
+
+ // Filter line-by-line.
+ while (row < last_row) {
+ int w;
+ // leftmost pixel: predict from above.
+ PredictLine(in, preds - stride, out, 1, inverse);
+ for (w = 1; w < width; ++w) {
+ const int pred = GradientPredictor(preds[w - 1],
+ preds[w - stride],
+ preds[w - stride - 1]);
+ out[w] = in[w] + (inverse ? pred : -pred);
+ }
+ ++row;
+ preds += stride;
+ in += stride;
+ out += stride;
+ }
+}
+
+#undef SANITY_CHECK
+
+//------------------------------------------------------------------------------
+
+static void HorizontalFilter(const uint8_t* data, int width, int height,
+ int stride, uint8_t* filtered_data) {
+ DoHorizontalFilter(data, width, height, stride, 0, height, 0, filtered_data);
+}
+
+static void VerticalFilter(const uint8_t* data, int width, int height,
+ int stride, uint8_t* filtered_data) {
+ DoVerticalFilter(data, width, height, stride, 0, height, 0, filtered_data);
+}
+
+
+static void GradientFilter(const uint8_t* data, int width, int height,
+ int stride, uint8_t* filtered_data) {
+ DoGradientFilter(data, width, height, stride, 0, height, 0, filtered_data);
+}
+
+
+//------------------------------------------------------------------------------
+
+static void VerticalUnfilter(int width, int height, int stride, int row,
+ int num_rows, uint8_t* data) {
+ DoVerticalFilter(data, width, height, stride, row, num_rows, 1, data);
+}
+
+static void HorizontalUnfilter(int width, int height, int stride, int row,
+ int num_rows, uint8_t* data) {
+ DoHorizontalFilter(data, width, height, stride, row, num_rows, 1, data);
+}
+
+static void GradientUnfilter(int width, int height, int stride, int row,
+ int num_rows, uint8_t* data) {
+ DoGradientFilter(data, width, height, stride, row, num_rows, 1, data);
+}
+
+//------------------------------------------------------------------------------
+// Init function
+
+WebPFilterFunc WebPFilters[WEBP_FILTER_LAST];
+WebPUnfilterFunc WebPUnfilters[WEBP_FILTER_LAST];
+
+extern void VP8FiltersInitMIPSdspR2(void);
+extern void VP8FiltersInitSSE2(void);
+
+static volatile VP8CPUInfo filters_last_cpuinfo_used =
+ (VP8CPUInfo)&filters_last_cpuinfo_used;
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInit(void) {
+ if (filters_last_cpuinfo_used == VP8GetCPUInfo) return;
+
+ WebPUnfilters[WEBP_FILTER_NONE] = NULL;
+ WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter;
+ WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter;
+ WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter;
+
+ WebPFilters[WEBP_FILTER_NONE] = NULL;
+ WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter;
+ WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter;
+ WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter;
+
+ if (VP8GetCPUInfo != NULL) {
+#if defined(WEBP_USE_SSE2)
+ if (VP8GetCPUInfo(kSSE2)) {
+ VP8FiltersInitSSE2();
+ }
+#endif
+#if defined(WEBP_USE_MIPS_DSP_R2)
+ if (VP8GetCPUInfo(kMIPSdspR2)) {
+ VP8FiltersInitMIPSdspR2();
+ }
+#endif
+ }
+ filters_last_cpuinfo_used = VP8GetCPUInfo;
+}
diff --git a/src/3rdparty/libwebp/src/dsp/filters_mips_dsp_r2.c b/src/3rdparty/libwebp/src/dsp/filters_mips_dsp_r2.c
new file mode 100644
index 0000000..8134af5
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/filters_mips_dsp_r2.c
@@ -0,0 +1,405 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Spatial prediction using various filters
+//
+// Author(s): Branimir Vasic (branimir.vasic@imgtec.com)
+// Djordje Pesut (djordje.pesut@imgtec.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_MIPS_DSP_R2)
+
+#include "../dsp/dsp.h"
+#include <assert.h>
+#include <stdlib.h>
+#include <string.h>
+
+//------------------------------------------------------------------------------
+// Helpful macro.
+
+# define SANITY_CHECK(in, out) \
+ assert(in != NULL); \
+ assert(out != NULL); \
+ assert(width > 0); \
+ assert(height > 0); \
+ assert(stride >= width); \
+ assert(row >= 0 && num_rows > 0 && row + num_rows <= height); \
+ (void)height; // Silence unused warning.
+
+// if INVERSE
+// preds == &dst[-1] == &src[-1]
+// else
+// preds == &src[-1] != &dst[-1]
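+// The asm below processes length / 4 pixels per unrolled iteration; a
+// scalar byte-at-a-time tail then handles the remaining length % 4 pixels.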
+#define DO_PREDICT_LINE(SRC, DST, LENGTH, INVERSE) do { \
+ const uint8_t* psrc = (uint8_t*)(SRC); \
+ uint8_t* pdst = (uint8_t*)(DST); \
+ const int ilength = (int)(LENGTH); \
+ int temp0, temp1, temp2, temp3, temp4, temp5, temp6; \
+ __asm__ volatile ( \
+ ".set push \n\t" \
+ ".set noreorder \n\t" \
+ "srl %[temp0], %[length], 0x2 \n\t" \
+ "beqz %[temp0], 4f \n\t" \
+ " andi %[temp6], %[length], 0x3 \n\t" \
+ ".if " #INVERSE " \n\t" \
+ "lbu %[temp1], -1(%[src]) \n\t" \
+ "1: \n\t" \
+ "lbu %[temp2], 0(%[src]) \n\t" \
+ "lbu %[temp3], 1(%[src]) \n\t" \
+ "lbu %[temp4], 2(%[src]) \n\t" \
+ "lbu %[temp5], 3(%[src]) \n\t" \
+ "addiu %[src], %[src], 4 \n\t" \
+ "addiu %[temp0], %[temp0], -1 \n\t" \
+ "addu %[temp2], %[temp2], %[temp1] \n\t" \
+ "addu %[temp3], %[temp3], %[temp2] \n\t" \
+ "addu %[temp4], %[temp4], %[temp3] \n\t" \
+ "addu %[temp1], %[temp5], %[temp4] \n\t" \
+ "sb %[temp2], -4(%[src]) \n\t" \
+ "sb %[temp3], -3(%[src]) \n\t" \
+ "sb %[temp4], -2(%[src]) \n\t" \
+ "bnez %[temp0], 1b \n\t" \
+ " sb %[temp1], -1(%[src]) \n\t" \
+ ".else \n\t" \
+ "1: \n\t" \
+ "ulw %[temp1], -1(%[src]) \n\t" \
+ "ulw %[temp2], 0(%[src]) \n\t" \
+ "addiu %[src], %[src], 4 \n\t" \
+ "addiu %[temp0], %[temp0], -1 \n\t" \
+ "subu.qb %[temp3], %[temp2], %[temp1] \n\t" \
+ "usw %[temp3], 0(%[dst]) \n\t" \
+ "bnez %[temp0], 1b \n\t" \
+ " addiu %[dst], %[dst], 4 \n\t" \
+ ".endif \n\t" \
+ "4: \n\t" \
+ "beqz %[temp6], 3f \n\t" \
+ " nop \n\t" \
+ "2: \n\t" \
+ "lbu %[temp1], -1(%[src]) \n\t" \
+ "lbu %[temp2], 0(%[src]) \n\t" \
+ "addiu %[src], %[src], 1 \n\t" \
+ ".if " #INVERSE " \n\t" \
+ "addu %[temp3], %[temp1], %[temp2] \n\t" \
+ "sb %[temp3], -1(%[src]) \n\t" \
+ ".else \n\t" \
+ "subu %[temp3], %[temp1], %[temp2] \n\t" \
+ "sb %[temp3], 0(%[dst]) \n\t" \
+ ".endif \n\t" \
+ "addiu %[temp6], %[temp6], -1 \n\t" \
+ "bnez %[temp6], 2b \n\t" \
+ " addiu %[dst], %[dst], 1 \n\t" \
+ "3: \n\t" \
+ ".set pop \n\t" \
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), \
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), \
+ [temp6]"=&r"(temp6), [dst]"+&r"(pdst), [src]"+&r"(psrc) \
+ : [length]"r"(ilength) \
+ : "memory" \
+ ); \
+ } while (0)
+
+static WEBP_INLINE void PredictLine(const uint8_t* src, uint8_t* dst,
+ int length, int inverse) {
+ if (inverse) {
+ DO_PREDICT_LINE(src, dst, length, 1);
+ } else {
+ DO_PREDICT_LINE(src, dst, length, 0);
+ }
+}
+
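+// Vertical variant with an explicit predictor row: 8 bytes per iteration
+// via the DSPr2 quad-byte addu.qb/subu.qb, plus a scalar tail for the
+// length % 8 leftover.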
+#define DO_PREDICT_LINE_VERTICAL(SRC, PRED, DST, LENGTH, INVERSE) do { \
+ const uint8_t* psrc = (uint8_t*)(SRC); \
+ const uint8_t* ppred = (uint8_t*)(PRED); \
+ uint8_t* pdst = (uint8_t*)(DST); \
+ const int ilength = (int)(LENGTH); \
+ int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; \
+ __asm__ volatile ( \
+ ".set push \n\t" \
+ ".set noreorder \n\t" \
+ "srl %[temp0], %[length], 0x3 \n\t" \
+ "beqz %[temp0], 4f \n\t" \
+ " andi %[temp7], %[length], 0x7 \n\t" \
+ "1: \n\t" \
+ "ulw %[temp1], 0(%[src]) \n\t" \
+ "ulw %[temp2], 0(%[pred]) \n\t" \
+ "ulw %[temp3], 4(%[src]) \n\t" \
+ "ulw %[temp4], 4(%[pred]) \n\t" \
+ "addiu %[src], %[src], 8 \n\t" \
+ ".if " #INVERSE " \n\t" \
+ "addu.qb %[temp5], %[temp1], %[temp2] \n\t" \
+ "addu.qb %[temp6], %[temp3], %[temp4] \n\t" \
+ ".else \n\t" \
+ "subu.qb %[temp5], %[temp1], %[temp2] \n\t" \
+ "subu.qb %[temp6], %[temp3], %[temp4] \n\t" \
+ ".endif \n\t" \
+ "addiu %[pred], %[pred], 8 \n\t" \
+ "usw %[temp5], 0(%[dst]) \n\t" \
+ "usw %[temp6], 4(%[dst]) \n\t" \
+ "addiu %[temp0], %[temp0], -1 \n\t" \
+ "bnez %[temp0], 1b \n\t" \
+ " addiu %[dst], %[dst], 8 \n\t" \
+ "4: \n\t" \
+ "beqz %[temp7], 3f \n\t" \
+ " nop \n\t" \
+ "2: \n\t" \
+ "lbu %[temp1], 0(%[src]) \n\t" \
+ "lbu %[temp2], 0(%[pred]) \n\t" \
+ "addiu %[src], %[src], 1 \n\t" \
+ "addiu %[pred], %[pred], 1 \n\t" \
+ ".if " #INVERSE " \n\t" \
+ "addu %[temp3], %[temp1], %[temp2] \n\t" \
+ ".else \n\t" \
+ "subu %[temp3], %[temp1], %[temp2] \n\t" \
+ ".endif \n\t" \
+ "sb %[temp3], 0(%[dst]) \n\t" \
+ "addiu %[temp7], %[temp7], -1 \n\t" \
+ "bnez %[temp7], 2b \n\t" \
+ " addiu %[dst], %[dst], 1 \n\t" \
+ "3: \n\t" \
+ ".set pop \n\t" \
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), \
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), \
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), [pred]"+&r"(ppred), \
+ [dst]"+&r"(pdst), [src]"+&r"(psrc) \
+ : [length]"r"(ilength) \
+ : "memory" \
+ ); \
+ } while (0)
+
+#define PREDICT_LINE_ONE_PASS(SRC, PRED, DST, INVERSE) do { \
+ int temp1, temp2, temp3; \
+ __asm__ volatile ( \
+ "lbu %[temp1], 0(%[src]) \n\t" \
+ "lbu %[temp2], 0(%[pred]) \n\t" \
+ ".if " #INVERSE " \n\t" \
+ "addu %[temp3], %[temp1], %[temp2] \n\t" \
+ ".else \n\t" \
+ "subu %[temp3], %[temp1], %[temp2] \n\t" \
+ ".endif \n\t" \
+ "sb %[temp3], 0(%[dst]) \n\t" \
+ : [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), [temp3]"=&r"(temp3) \
+ : [pred]"r"((PRED)), [dst]"r"((DST)), [src]"r"((SRC)) \
+ : "memory" \
+ ); \
+ } while (0)
+
+//------------------------------------------------------------------------------
+// Horizontal filter.
+
+#define FILTER_LINE_BY_LINE(INVERSE) do { \
+ while (row < last_row) { \
+ PREDICT_LINE_ONE_PASS(in, preds - stride, out, INVERSE); \
+ DO_PREDICT_LINE(in + 1, out + 1, width - 1, INVERSE); \
+ ++row; \
+ preds += stride; \
+ in += stride; \
+ out += stride; \
+ } \
+ } while (0)
+
+static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
+ int width, int height, int stride,
+ int row, int num_rows,
+ int inverse, uint8_t* out) {
+ const uint8_t* preds;
+ const size_t start_offset = row * stride;
+ const int last_row = row + num_rows;
+ SANITY_CHECK(in, out);
+ in += start_offset;
+ out += start_offset;
+ preds = inverse ? out : in;
+
+ if (row == 0) {
+ // Leftmost pixel is the same as input for topmost scanline.
+ out[0] = in[0];
+ PredictLine(in + 1, out + 1, width - 1, inverse);
+ row = 1;
+ preds += stride;
+ in += stride;
+ out += stride;
+ }
+
+ // Filter line-by-line.
+ if (inverse) {
+ FILTER_LINE_BY_LINE(1);
+ } else {
+ FILTER_LINE_BY_LINE(0);
+ }
+}
+
+#undef FILTER_LINE_BY_LINE
+
+static void HorizontalFilter(const uint8_t* data, int width, int height,
+ int stride, uint8_t* filtered_data) {
+ DoHorizontalFilter(data, width, height, stride, 0, height, 0, filtered_data);
+}
+
+static void HorizontalUnfilter(int width, int height, int stride, int row,
+ int num_rows, uint8_t* data) {
+ DoHorizontalFilter(data, width, height, stride, row, num_rows, 1, data);
+}
+
+//------------------------------------------------------------------------------
+// Vertical filter.
+
+#define FILTER_LINE_BY_LINE(INVERSE) do { \
+ while (row < last_row) { \
+ DO_PREDICT_LINE_VERTICAL(in, preds, out, width, INVERSE); \
+ ++row; \
+ preds += stride; \
+ in += stride; \
+ out += stride; \
+ } \
+ } while (0)
+
+static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
+ int width, int height, int stride,
+ int row, int num_rows,
+ int inverse, uint8_t* out) {
+ const uint8_t* preds;
+ const size_t start_offset = row * stride;
+ const int last_row = row + num_rows;
+ SANITY_CHECK(in, out);
+ in += start_offset;
+ out += start_offset;
+ preds = inverse ? out : in;
+
+ if (row == 0) {
+ // Very first top-left pixel is copied.
+ out[0] = in[0];
+ // Rest of top scan-line is left-predicted.
+ PredictLine(in + 1, out + 1, width - 1, inverse);
+ row = 1;
+ in += stride;
+ out += stride;
+ } else {
+ // We are starting from in-between. Make sure 'preds' points to prev row.
+ preds -= stride;
+ }
+
+ // Filter line-by-line.
+ if (inverse) {
+ FILTER_LINE_BY_LINE(1);
+ } else {
+ FILTER_LINE_BY_LINE(0);
+ }
+}
+
+#undef FILTER_LINE_BY_LINE
+#undef DO_PREDICT_LINE_VERTICAL
+
+static void VerticalFilter(const uint8_t* data, int width, int height,
+ int stride, uint8_t* filtered_data) {
+ DoVerticalFilter(data, width, height, stride, 0, height, 0, filtered_data);
+}
+
+static void VerticalUnfilter(int width, int height, int stride, int row,
+ int num_rows, uint8_t* data) {
+ DoVerticalFilter(data, width, height, stride, row, num_rows, 1, data);
+}
+
+//------------------------------------------------------------------------------
+// Gradient filter.
+
+static WEBP_INLINE int GradientPredictor(uint8_t a, uint8_t b, uint8_t c) {
+ int temp0;
+ __asm__ volatile (
+ "addu %[temp0], %[a], %[b] \n\t"
+ "subu %[temp0], %[temp0], %[c] \n\t"
+ "shll_s.w %[temp0], %[temp0], 23 \n\t"
+ "precrqu_s.qb.ph %[temp0], %[temp0], $zero \n\t"
+ "srl %[temp0], %[temp0], 24 \n\t"
+ : [temp0]"=&r"(temp0)
+ : [a]"r"(a),[b]"r"(b),[c]"r"(c)
+ );
+ return temp0;
+}
+
+#define FILTER_LINE_BY_LINE(INVERSE, PREDS, OPERATION) do { \
+ while (row < last_row) { \
+ int w; \
+ PREDICT_LINE_ONE_PASS(in, PREDS - stride, out, INVERSE); \
+ for (w = 1; w < width; ++w) { \
+ const int pred = GradientPredictor(PREDS[w - 1], \
+ PREDS[w - stride], \
+ PREDS[w - stride - 1]); \
+ out[w] = in[w] OPERATION pred; \
+ } \
+ ++row; \
+ in += stride; \
+ out += stride; \
+ } \
+ } while (0)
+
+static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
+ int width, int height, int stride,
+ int row, int num_rows,
+ int inverse, uint8_t* out) {
+ const uint8_t* preds;
+ const size_t start_offset = row * stride;
+ const int last_row = row + num_rows;
+ SANITY_CHECK(in, out);
+ in += start_offset;
+ out += start_offset;
+ preds = inverse ? out : in;
+
+ // left prediction for top scan-line
+ if (row == 0) {
+ out[0] = in[0];
+ PredictLine(in + 1, out + 1, width - 1, inverse);
+ row = 1;
+ preds += stride;
+ in += stride;
+ out += stride;
+ }
+
+ // Filter line-by-line.
+ if (inverse) {
+ FILTER_LINE_BY_LINE(1, out, +);
+ } else {
+ FILTER_LINE_BY_LINE(0, in, -);
+ }
+}
+
+#undef FILTER_LINE_BY_LINE
+
+static void GradientFilter(const uint8_t* data, int width, int height,
+ int stride, uint8_t* filtered_data) {
+ DoGradientFilter(data, width, height, stride, 0, height, 0, filtered_data);
+}
+
+static void GradientUnfilter(int width, int height, int stride, int row,
+ int num_rows, uint8_t* data) {
+ DoGradientFilter(data, width, height, stride, row, num_rows, 1, data);
+}
+
+#undef PREDICT_LINE_ONE_PASS
+#undef DO_PREDICT_LINE
+#undef SANITY_CHECK
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8FiltersInitMIPSdspR2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitMIPSdspR2(void) {
+ WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter;
+ WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter;
+ WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter;
+
+ WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter;
+ WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter;
+ WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter;
+}
+
+#else // !WEBP_USE_MIPS_DSP_R2
+
+WEBP_DSP_INIT_STUB(VP8FiltersInitMIPSdspR2)
+
+#endif // WEBP_USE_MIPS_DSP_R2
diff --git a/src/3rdparty/libwebp/src/dsp/filters_sse2.c b/src/3rdparty/libwebp/src/dsp/filters_sse2.c
new file mode 100644
index 0000000..bf93342
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/filters_sse2.c
@@ -0,0 +1,352 @@
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// SSE2 variant of alpha filters
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_SSE2)
+
+#include <assert.h>
+#include <emmintrin.h>
+#include <stdlib.h>
+#include <string.h>
+
+//------------------------------------------------------------------------------
+// Helpful macro.
+
+# define SANITY_CHECK(in, out) \
+ assert(in != NULL); \
+ assert(out != NULL); \
+ assert(width > 0); \
+ assert(height > 0); \
+ assert(stride >= width); \
+ assert(row >= 0 && num_rows > 0 && row + num_rows <= height); \
+ (void)height; // Silence unused warning.
+
+static void PredictLineTop(const uint8_t* src, const uint8_t* pred,
+ uint8_t* dst, int length, int inverse) {
+ int i;
+ const int max_pos = length & ~31;
+ assert(length >= 0);
+ if (inverse) {
+ for (i = 0; i < max_pos; i += 32) {
+ const __m128i A0 = _mm_loadu_si128((const __m128i*)&src[i + 0]);
+ const __m128i A1 = _mm_loadu_si128((const __m128i*)&src[i + 16]);
+ const __m128i B0 = _mm_loadu_si128((const __m128i*)&pred[i + 0]);
+ const __m128i B1 = _mm_loadu_si128((const __m128i*)&pred[i + 16]);
+ const __m128i C0 = _mm_add_epi8(A0, B0);
+ const __m128i C1 = _mm_add_epi8(A1, B1);
+ _mm_storeu_si128((__m128i*)&dst[i + 0], C0);
+ _mm_storeu_si128((__m128i*)&dst[i + 16], C1);
+ }
+ for (; i < length; ++i) dst[i] = src[i] + pred[i];
+ } else {
+ for (i = 0; i < max_pos; i += 32) {
+ const __m128i A0 = _mm_loadu_si128((const __m128i*)&src[i + 0]);
+ const __m128i A1 = _mm_loadu_si128((const __m128i*)&src[i + 16]);
+ const __m128i B0 = _mm_loadu_si128((const __m128i*)&pred[i + 0]);
+ const __m128i B1 = _mm_loadu_si128((const __m128i*)&pred[i + 16]);
+ const __m128i C0 = _mm_sub_epi8(A0, B0);
+ const __m128i C1 = _mm_sub_epi8(A1, B1);
+ _mm_storeu_si128((__m128i*)&dst[i + 0], C0);
+ _mm_storeu_si128((__m128i*)&dst[i + 16], C1);
+ }
+ for (; i < length; ++i) dst[i] = src[i] - pred[i];
+ }
+}
+
+// Special case for left-based prediction (when preds==dst-1 or preds==src-1).
+static void PredictLineLeft(const uint8_t* src, uint8_t* dst, int length,
+ int inverse) {
+ int i;
+ if (length <= 0) return;
+ if (inverse) {
+ const int max_pos = length & ~7;
+ __m128i last = _mm_set_epi32(0, 0, 0, dst[-1]);
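+ // The shift-and-add ladder below is a 3-step (log2(8)) inclusive prefix
+ // sum: after folding in the carried 'last' sample, each step doubles how
+ // far the partial sums have propagated (1, then 2, then 4 bytes).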
+ for (i = 0; i < max_pos; i += 8) {
+ const __m128i A0 = _mm_loadl_epi64((const __m128i*)(src + i));
+ const __m128i A1 = _mm_add_epi8(A0, last);
+ const __m128i A2 = _mm_slli_si128(A1, 1);
+ const __m128i A3 = _mm_add_epi8(A1, A2);
+ const __m128i A4 = _mm_slli_si128(A3, 2);
+ const __m128i A5 = _mm_add_epi8(A3, A4);
+ const __m128i A6 = _mm_slli_si128(A5, 4);
+ const __m128i A7 = _mm_add_epi8(A5, A6);
+ _mm_storel_epi64((__m128i*)(dst + i), A7);
+ last = _mm_srli_epi64(A7, 56);
+ }
+ for (; i < length; ++i) dst[i] = src[i] + dst[i - 1];
+ } else {
+ const int max_pos = length & ~31;
+ for (i = 0; i < max_pos; i += 32) {
+ const __m128i A0 = _mm_loadu_si128((const __m128i*)(src + i + 0 ));
+ const __m128i B0 = _mm_loadu_si128((const __m128i*)(src + i + 0 - 1));
+ const __m128i A1 = _mm_loadu_si128((const __m128i*)(src + i + 16 ));
+ const __m128i B1 = _mm_loadu_si128((const __m128i*)(src + i + 16 - 1));
+ const __m128i C0 = _mm_sub_epi8(A0, B0);
+ const __m128i C1 = _mm_sub_epi8(A1, B1);
+ _mm_storeu_si128((__m128i*)(dst + i + 0), C0);
+ _mm_storeu_si128((__m128i*)(dst + i + 16), C1);
+ }
+ for (; i < length; ++i) dst[i] = src[i] - src[i - 1];
+ }
+}
+
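+// Scalar version, used for the single leftmost pixel of a row.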
+static void PredictLineC(const uint8_t* src, const uint8_t* pred,
+ uint8_t* dst, int length, int inverse) {
+ int i;
+ if (inverse) {
+ for (i = 0; i < length; ++i) dst[i] = src[i] + pred[i];
+ } else {
+ for (i = 0; i < length; ++i) dst[i] = src[i] - pred[i];
+ }
+}
+
+//------------------------------------------------------------------------------
+// Horizontal filter.
+
+static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
+ int width, int height, int stride,
+ int row, int num_rows,
+ int inverse, uint8_t* out) {
+ const uint8_t* preds;
+ const size_t start_offset = row * stride;
+ const int last_row = row + num_rows;
+ SANITY_CHECK(in, out);
+ in += start_offset;
+ out += start_offset;
+ preds = inverse ? out : in;
+
+ if (row == 0) {
+ // Leftmost pixel is the same as input for topmost scanline.
+ out[0] = in[0];
+ PredictLineLeft(in + 1, out + 1, width - 1, inverse);
+ row = 1;
+ preds += stride;
+ in += stride;
+ out += stride;
+ }
+
+ // Filter line-by-line.
+ while (row < last_row) {
+ // Leftmost pixel is predicted from above.
+ PredictLineC(in, preds - stride, out, 1, inverse);
+ PredictLineLeft(in + 1, out + 1, width - 1, inverse);
+ ++row;
+ preds += stride;
+ in += stride;
+ out += stride;
+ }
+}
+
+//------------------------------------------------------------------------------
+// Vertical filter.
+
+static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
+ int width, int height, int stride,
+ int row, int num_rows,
+ int inverse, uint8_t* out) {
+ const uint8_t* preds;
+ const size_t start_offset = row * stride;
+ const int last_row = row + num_rows;
+ SANITY_CHECK(in, out);
+ in += start_offset;
+ out += start_offset;
+ preds = inverse ? out : in;
+
+ if (row == 0) {
+ // Very first top-left pixel is copied.
+ out[0] = in[0];
+ // Rest of top scan-line is left-predicted.
+ PredictLineLeft(in + 1, out + 1, width - 1, inverse);
+ row = 1;
+ in += stride;
+ out += stride;
+ } else {
+ // We are starting from in-between. Make sure 'preds' points to prev row.
+ preds -= stride;
+ }
+
+ // Filter line-by-line.
+ while (row < last_row) {
+ PredictLineTop(in, preds, out, width, inverse);
+ ++row;
+ preds += stride;
+ in += stride;
+ out += stride;
+ }
+}
+
+//------------------------------------------------------------------------------
+// Gradient filter.
+
+static WEBP_INLINE int GradientPredictorC(uint8_t a, uint8_t b, uint8_t c) {
+ const int g = a + b - c;
+ return ((g & ~0xff) == 0) ? g : (g < 0) ? 0 : 255; // clip to 8bit
+}
+
+static void GradientPredictDirect(const uint8_t* const row,
+ const uint8_t* const top,
+ uint8_t* const out, int length) {
+ const int max_pos = length & ~7;
+ int i;
+ const __m128i zero = _mm_setzero_si128();
+ for (i = 0; i < max_pos; i += 8) {
+ const __m128i A0 = _mm_loadl_epi64((const __m128i*)&row[i - 1]);
+ const __m128i B0 = _mm_loadl_epi64((const __m128i*)&top[i]);
+ const __m128i C0 = _mm_loadl_epi64((const __m128i*)&top[i - 1]);
+ const __m128i D = _mm_loadl_epi64((const __m128i*)&row[i]);
+ const __m128i A1 = _mm_unpacklo_epi8(A0, zero);
+ const __m128i B1 = _mm_unpacklo_epi8(B0, zero);
+ const __m128i C1 = _mm_unpacklo_epi8(C0, zero);
+ const __m128i E = _mm_add_epi16(A1, B1);
+ const __m128i F = _mm_sub_epi16(E, C1);
+ const __m128i G = _mm_packus_epi16(F, zero);
+ const __m128i H = _mm_sub_epi8(D, G);
+ _mm_storel_epi64((__m128i*)(out + i), H);
+ }
+ for (; i < length; ++i) {
+ out[i] = row[i] - GradientPredictorC(row[i - 1], top[i], top[i - 1]);
+ }
+}
+
+static void GradientPredictInverse(const uint8_t* const in,
+ const uint8_t* const top,
+ uint8_t* const row, int length) {
+ if (length > 0) {
+ int i;
+ const int max_pos = length & ~7;
+ const __m128i zero = _mm_setzero_si128();
+ __m128i A = _mm_set_epi32(0, 0, 0, row[-1]); // left sample
+ for (i = 0; i < max_pos; i += 8) {
+ const __m128i tmp0 = _mm_loadl_epi64((const __m128i*)&top[i]);
+ const __m128i tmp1 = _mm_loadl_epi64((const __m128i*)&top[i - 1]);
+ const __m128i B = _mm_unpacklo_epi8(tmp0, zero);
+ const __m128i C = _mm_unpacklo_epi8(tmp1, zero);
+ const __m128i tmp2 = _mm_loadl_epi64((const __m128i*)&in[i]);
+ const __m128i D = _mm_unpacklo_epi8(tmp2, zero); // base input
+ const __m128i E = _mm_sub_epi16(B, C); // unclipped gradient basis B - C
+ __m128i out = zero; // accumulator for output
+ __m128i mask_hi = _mm_set_epi32(0, 0, 0, 0xff);
+ int k = 8;
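+ // Each reconstructed pixel is the predictor for the next one, so the 8
+ // lanes are resolved serially: the single-lane mask walks one 16-bit slot
+ // per iteration while finished pixels accumulate in 'out'.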
+ while (1) {
+ const __m128i tmp3 = _mm_add_epi16(A, E); // delta = A + B - C
+ const __m128i tmp4 = _mm_min_epi16(tmp3, mask_hi);
+ const __m128i tmp5 = _mm_max_epi16(tmp4, zero); // clipped delta
+ const __m128i tmp6 = _mm_add_epi16(tmp5, D); // add to in[] values
+ A = _mm_and_si128(tmp6, mask_hi); // mod-256 wrap, current lane only
+ out = _mm_or_si128(out, A); // accumulate output
+ if (--k == 0) break;
+ A = _mm_slli_si128(A, 2); // rotate left sample
+ mask_hi = _mm_slli_si128(mask_hi, 2); // rotate mask
+ }
+ A = _mm_srli_si128(A, 14); // prepare left sample for next iteration
+ _mm_storel_epi64((__m128i*)&row[i], _mm_packus_epi16(out, zero));
+ }
+ for (; i < length; ++i) {
+ row[i] = in[i] + GradientPredictorC(row[i - 1], top[i], top[i - 1]);
+ }
+ }
+}
+
+static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
+ int width, int height, int stride,
+ int row, int num_rows,
+ int inverse, uint8_t* out) {
+ const size_t start_offset = row * stride;
+ const int last_row = row + num_rows;
+ SANITY_CHECK(in, out);
+ in += start_offset;
+ out += start_offset;
+
+ // left prediction for top scan-line
+ if (row == 0) {
+ out[0] = in[0];
+ PredictLineLeft(in + 1, out + 1, width - 1, inverse);
+ row = 1;
+ in += stride;
+ out += stride;
+ }
+
+ // Filter line-by-line.
+ while (row < last_row) {
+ if (inverse) {
+ PredictLineC(in, out - stride, out, 1, inverse); // predict from above
+ GradientPredictInverse(in + 1, out + 1 - stride, out + 1, width - 1);
+ } else {
+ PredictLineC(in, in - stride, out, 1, inverse);
+ GradientPredictDirect(in + 1, in + 1 - stride, out + 1, width - 1);
+ }
+ ++row;
+ in += stride;
+ out += stride;
+ }
+}
+
+#undef SANITY_CHECK
+
+//------------------------------------------------------------------------------
+
+static void HorizontalFilter(const uint8_t* data, int width, int height,
+ int stride, uint8_t* filtered_data) {
+ DoHorizontalFilter(data, width, height, stride, 0, height, 0, filtered_data);
+}
+
+static void VerticalFilter(const uint8_t* data, int width, int height,
+ int stride, uint8_t* filtered_data) {
+ DoVerticalFilter(data, width, height, stride, 0, height, 0, filtered_data);
+}
+
+static void GradientFilter(const uint8_t* data, int width, int height,
+ int stride, uint8_t* filtered_data) {
+ DoGradientFilter(data, width, height, stride, 0, height, 0, filtered_data);
+}
+
+//------------------------------------------------------------------------------
+
+static void VerticalUnfilter(int width, int height, int stride, int row,
+ int num_rows, uint8_t* data) {
+ DoVerticalFilter(data, width, height, stride, row, num_rows, 1, data);
+}
+
+static void HorizontalUnfilter(int width, int height, int stride, int row,
+ int num_rows, uint8_t* data) {
+ DoHorizontalFilter(data, width, height, stride, row, num_rows, 1, data);
+}
+
+static void GradientUnfilter(int width, int height, int stride, int row,
+ int num_rows, uint8_t* data) {
+ DoGradientFilter(data, width, height, stride, row, num_rows, 1, data);
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8FiltersInitSSE2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitSSE2(void) {
+ WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter;
+ WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter;
+ WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter;
+
+ WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter;
+ WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter;
+ WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter;
+}
+
+#else // !WEBP_USE_SSE2
+
+WEBP_DSP_INIT_STUB(VP8FiltersInitSSE2)
+
+#endif // WEBP_USE_SSE2
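
When SSE2 is not compiled in, WEBP_DSP_INIT_STUB keeps the build uniform: the generic dispatcher can call VP8FiltersInitSSE2() unconditionally because the macro still emits a do-nothing definition. A reduced sketch of the pattern (the real macro lives in dsp.h and also carries the TSan annotation; this version is illustrative only):

    // DSP_INIT_STUB_SKETCH(VP8FiltersInitSSE2) would expand to an
    // empty function, so callers need no #ifdef of their own.
    #define DSP_INIT_STUB_SKETCH(func) \
      extern void func(void);          \
      void func(void) {}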
diff --git a/src/3rdparty/libwebp/src/dsp/lossless.c b/src/3rdparty/libwebp/src/dsp/lossless.c
index ee334bc..71ae9d4 100644
--- a/src/3rdparty/libwebp/src/dsp/lossless.c
+++ b/src/3rdparty/libwebp/src/dsp/lossless.c
@@ -20,376 +20,12 @@
#include "../dec/vp8li.h"
#include "../utils/endian_inl.h"
#include "./lossless.h"
-#include "./yuv.h"
#define MAX_DIFF_COST (1e30f)
-// lookup table for small values of log2(int)
-const float kLog2Table[LOG_LOOKUP_IDX_MAX] = {
- 0.0000000000000000f, 0.0000000000000000f,
- 1.0000000000000000f, 1.5849625007211560f,
- 2.0000000000000000f, 2.3219280948873621f,
- 2.5849625007211560f, 2.8073549220576041f,
- 3.0000000000000000f, 3.1699250014423121f,
- 3.3219280948873621f, 3.4594316186372973f,
- 3.5849625007211560f, 3.7004397181410921f,
- 3.8073549220576041f, 3.9068905956085187f,
- 4.0000000000000000f, 4.0874628412503390f,
- 4.1699250014423121f, 4.2479275134435852f,
- 4.3219280948873626f, 4.3923174227787606f,
- 4.4594316186372973f, 4.5235619560570130f,
- 4.5849625007211560f, 4.6438561897747243f,
- 4.7004397181410917f, 4.7548875021634682f,
- 4.8073549220576037f, 4.8579809951275718f,
- 4.9068905956085187f, 4.9541963103868749f,
- 5.0000000000000000f, 5.0443941193584533f,
- 5.0874628412503390f, 5.1292830169449663f,
- 5.1699250014423121f, 5.2094533656289501f,
- 5.2479275134435852f, 5.2854022188622487f,
- 5.3219280948873626f, 5.3575520046180837f,
- 5.3923174227787606f, 5.4262647547020979f,
- 5.4594316186372973f, 5.4918530963296747f,
- 5.5235619560570130f, 5.5545888516776376f,
- 5.5849625007211560f, 5.6147098441152083f,
- 5.6438561897747243f, 5.6724253419714951f,
- 5.7004397181410917f, 5.7279204545631987f,
- 5.7548875021634682f, 5.7813597135246599f,
- 5.8073549220576037f, 5.8328900141647412f,
- 5.8579809951275718f, 5.8826430493618415f,
- 5.9068905956085187f, 5.9307373375628866f,
- 5.9541963103868749f, 5.9772799234999167f,
- 6.0000000000000000f, 6.0223678130284543f,
- 6.0443941193584533f, 6.0660891904577720f,
- 6.0874628412503390f, 6.1085244567781691f,
- 6.1292830169449663f, 6.1497471195046822f,
- 6.1699250014423121f, 6.1898245588800175f,
- 6.2094533656289501f, 6.2288186904958804f,
- 6.2479275134435852f, 6.2667865406949010f,
- 6.2854022188622487f, 6.3037807481771030f,
- 6.3219280948873626f, 6.3398500028846243f,
- 6.3575520046180837f, 6.3750394313469245f,
- 6.3923174227787606f, 6.4093909361377017f,
- 6.4262647547020979f, 6.4429434958487279f,
- 6.4594316186372973f, 6.4757334309663976f,
- 6.4918530963296747f, 6.5077946401986963f,
- 6.5235619560570130f, 6.5391588111080309f,
- 6.5545888516776376f, 6.5698556083309478f,
- 6.5849625007211560f, 6.5999128421871278f,
- 6.6147098441152083f, 6.6293566200796094f,
- 6.6438561897747243f, 6.6582114827517946f,
- 6.6724253419714951f, 6.6865005271832185f,
- 6.7004397181410917f, 6.7142455176661224f,
- 6.7279204545631987f, 6.7414669864011464f,
- 6.7548875021634682f, 6.7681843247769259f,
- 6.7813597135246599f, 6.7944158663501061f,
- 6.8073549220576037f, 6.8201789624151878f,
- 6.8328900141647412f, 6.8454900509443747f,
- 6.8579809951275718f, 6.8703647195834047f,
- 6.8826430493618415f, 6.8948177633079437f,
- 6.9068905956085187f, 6.9188632372745946f,
- 6.9307373375628866f, 6.9425145053392398f,
- 6.9541963103868749f, 6.9657842846620869f,
- 6.9772799234999167f, 6.9886846867721654f,
- 7.0000000000000000f, 7.0112272554232539f,
- 7.0223678130284543f, 7.0334230015374501f,
- 7.0443941193584533f, 7.0552824355011898f,
- 7.0660891904577720f, 7.0768155970508308f,
- 7.0874628412503390f, 7.0980320829605263f,
- 7.1085244567781691f, 7.1189410727235076f,
- 7.1292830169449663f, 7.1395513523987936f,
- 7.1497471195046822f, 7.1598713367783890f,
- 7.1699250014423121f, 7.1799090900149344f,
- 7.1898245588800175f, 7.1996723448363644f,
- 7.2094533656289501f, 7.2191685204621611f,
- 7.2288186904958804f, 7.2384047393250785f,
- 7.2479275134435852f, 7.2573878426926521f,
- 7.2667865406949010f, 7.2761244052742375f,
- 7.2854022188622487f, 7.2946207488916270f,
- 7.3037807481771030f, 7.3128829552843557f,
- 7.3219280948873626f, 7.3309168781146167f,
- 7.3398500028846243f, 7.3487281542310771f,
- 7.3575520046180837f, 7.3663222142458160f,
- 7.3750394313469245f, 7.3837042924740519f,
- 7.3923174227787606f, 7.4008794362821843f,
- 7.4093909361377017f, 7.4178525148858982f,
- 7.4262647547020979f, 7.4346282276367245f,
- 7.4429434958487279f, 7.4512111118323289f,
- 7.4594316186372973f, 7.4676055500829976f,
- 7.4757334309663976f, 7.4838157772642563f,
- 7.4918530963296747f, 7.4998458870832056f,
- 7.5077946401986963f, 7.5156998382840427f,
- 7.5235619560570130f, 7.5313814605163118f,
- 7.5391588111080309f, 7.5468944598876364f,
- 7.5545888516776376f, 7.5622424242210728f,
- 7.5698556083309478f, 7.5774288280357486f,
- 7.5849625007211560f, 7.5924570372680806f,
- 7.5999128421871278f, 7.6073303137496104f,
- 7.6147098441152083f, 7.6220518194563764f,
- 7.6293566200796094f, 7.6366246205436487f,
- 7.6438561897747243f, 7.6510516911789281f,
- 7.6582114827517946f, 7.6653359171851764f,
- 7.6724253419714951f, 7.6794800995054464f,
- 7.6865005271832185f, 7.6934869574993252f,
- 7.7004397181410917f, 7.7073591320808825f,
- 7.7142455176661224f, 7.7210991887071855f,
- 7.7279204545631987f, 7.7347096202258383f,
- 7.7414669864011464f, 7.7481928495894605f,
- 7.7548875021634682f, 7.7615512324444795f,
- 7.7681843247769259f, 7.7747870596011736f,
- 7.7813597135246599f, 7.7879025593914317f,
- 7.7944158663501061f, 7.8008998999203047f,
- 7.8073549220576037f, 7.8137811912170374f,
- 7.8201789624151878f, 7.8265484872909150f,
- 7.8328900141647412f, 7.8392037880969436f,
- 7.8454900509443747f, 7.8517490414160571f,
- 7.8579809951275718f, 7.8641861446542797f,
- 7.8703647195834047f, 7.8765169465649993f,
- 7.8826430493618415f, 7.8887432488982591f,
- 7.8948177633079437f, 7.9008668079807486f,
- 7.9068905956085187f, 7.9128893362299619f,
- 7.9188632372745946f, 7.9248125036057812f,
- 7.9307373375628866f, 7.9366379390025709f,
- 7.9425145053392398f, 7.9483672315846778f,
- 7.9541963103868749f, 7.9600019320680805f,
- 7.9657842846620869f, 7.9715435539507719f,
- 7.9772799234999167f, 7.9829935746943103f,
- 7.9886846867721654f, 7.9943534368588577f
-};
-
-const float kSLog2Table[LOG_LOOKUP_IDX_MAX] = {
- 0.00000000f, 0.00000000f, 2.00000000f, 4.75488750f,
- 8.00000000f, 11.60964047f, 15.50977500f, 19.65148445f,
- 24.00000000f, 28.52932501f, 33.21928095f, 38.05374781f,
- 43.01955001f, 48.10571634f, 53.30296891f, 58.60335893f,
- 64.00000000f, 69.48686830f, 75.05865003f, 80.71062276f,
- 86.43856190f, 92.23866588f, 98.10749561f, 104.04192499f,
- 110.03910002f, 116.09640474f, 122.21143267f, 128.38196256f,
- 134.60593782f, 140.88144886f, 147.20671787f, 153.58008562f,
- 160.00000000f, 166.46500594f, 172.97373660f, 179.52490559f,
- 186.11730005f, 192.74977453f, 199.42124551f, 206.13068654f,
- 212.87712380f, 219.65963219f, 226.47733176f, 233.32938445f,
- 240.21499122f, 247.13338933f, 254.08384998f, 261.06567603f,
- 268.07820003f, 275.12078236f, 282.19280949f, 289.29369244f,
- 296.42286534f, 303.57978409f, 310.76392512f, 317.97478424f,
- 325.21187564f, 332.47473081f, 339.76289772f, 347.07593991f,
- 354.41343574f, 361.77497759f, 369.16017124f, 376.56863518f,
- 384.00000000f, 391.45390785f, 398.93001188f, 406.42797576f,
- 413.94747321f, 421.48818752f, 429.04981119f, 436.63204548f,
- 444.23460010f, 451.85719280f, 459.49954906f, 467.16140179f,
- 474.84249102f, 482.54256363f, 490.26137307f, 497.99867911f,
- 505.75424759f, 513.52785023f, 521.31926438f, 529.12827280f,
- 536.95466351f, 544.79822957f, 552.65876890f, 560.53608414f,
- 568.42998244f, 576.34027536f, 584.26677867f, 592.20931226f,
- 600.16769996f, 608.14176943f, 616.13135206f, 624.13628279f,
- 632.15640007f, 640.19154569f, 648.24156472f, 656.30630539f,
- 664.38561898f, 672.47935976f, 680.58738488f, 688.70955430f,
- 696.84573069f, 704.99577935f, 713.15956818f, 721.33696754f,
- 729.52785023f, 737.73209140f, 745.94956849f, 754.18016116f,
- 762.42375127f, 770.68022275f, 778.94946161f, 787.23135586f,
- 795.52579543f, 803.83267219f, 812.15187982f, 820.48331383f,
- 828.82687147f, 837.18245171f, 845.54995518f, 853.92928416f,
- 862.32034249f, 870.72303558f, 879.13727036f, 887.56295522f,
- 896.00000000f, 904.44831595f, 912.90781569f, 921.37841320f,
- 929.86002376f, 938.35256392f, 946.85595152f, 955.37010560f,
- 963.89494641f, 972.43039537f, 980.97637504f, 989.53280911f,
- 998.09962237f, 1006.67674069f, 1015.26409097f, 1023.86160116f,
- 1032.46920021f, 1041.08681805f, 1049.71438560f, 1058.35183469f,
- 1066.99909811f, 1075.65610955f, 1084.32280357f, 1092.99911564f,
- 1101.68498204f, 1110.38033993f, 1119.08512727f, 1127.79928282f,
- 1136.52274614f, 1145.25545758f, 1153.99735821f, 1162.74838989f,
- 1171.50849518f, 1180.27761738f, 1189.05570047f, 1197.84268914f,
- 1206.63852876f, 1215.44316535f, 1224.25654560f, 1233.07861684f,
- 1241.90932703f, 1250.74862473f, 1259.59645914f, 1268.45278005f,
- 1277.31753781f, 1286.19068338f, 1295.07216828f, 1303.96194457f,
- 1312.85996488f, 1321.76618236f, 1330.68055071f, 1339.60302413f,
- 1348.53355734f, 1357.47210556f, 1366.41862452f, 1375.37307041f,
- 1384.33539991f, 1393.30557020f, 1402.28353887f, 1411.26926400f,
- 1420.26270412f, 1429.26381818f, 1438.27256558f, 1447.28890615f,
- 1456.31280014f, 1465.34420819f, 1474.38309138f, 1483.42941118f,
- 1492.48312945f, 1501.54420843f, 1510.61261078f, 1519.68829949f,
- 1528.77123795f, 1537.86138993f, 1546.95871952f, 1556.06319119f,
- 1565.17476976f, 1574.29342040f, 1583.41910860f, 1592.55180020f,
- 1601.69146137f, 1610.83805860f, 1619.99155871f, 1629.15192882f,
- 1638.31913637f, 1647.49314911f, 1656.67393509f, 1665.86146266f,
- 1675.05570047f, 1684.25661744f, 1693.46418280f, 1702.67836605f,
- 1711.89913698f, 1721.12646563f, 1730.36032233f, 1739.60067768f,
- 1748.84750254f, 1758.10076802f, 1767.36044551f, 1776.62650662f,
- 1785.89892323f, 1795.17766747f, 1804.46271172f, 1813.75402857f,
- 1823.05159087f, 1832.35537170f, 1841.66534438f, 1850.98148244f,
- 1860.30375965f, 1869.63214999f, 1878.96662767f, 1888.30716711f,
- 1897.65374295f, 1907.00633003f, 1916.36490342f, 1925.72943838f,
- 1935.09991037f, 1944.47629506f, 1953.85856831f, 1963.24670620f,
- 1972.64068498f, 1982.04048108f, 1991.44607117f, 2000.85743204f,
- 2010.27454072f, 2019.69737440f, 2029.12591044f, 2038.56012640f
-};
-
-const VP8LPrefixCode kPrefixEncodeCode[PREFIX_LOOKUP_IDX_MAX] = {
- { 0, 0}, { 0, 0}, { 1, 0}, { 2, 0}, { 3, 0}, { 4, 1}, { 4, 1}, { 5, 1},
- { 5, 1}, { 6, 2}, { 6, 2}, { 6, 2}, { 6, 2}, { 7, 2}, { 7, 2}, { 7, 2},
- { 7, 2}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3},
- { 8, 3}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3},
- { 9, 3}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4},
- {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4},
- {10, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4},
- {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4},
- {11, 4}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5},
- {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5},
- {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5},
- {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5},
- {12, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5},
- {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5},
- {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5},
- {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5},
- {13, 5}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
- {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
- {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
- {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
- {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
- {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
- {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
- {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
- {14, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
- {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
- {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
- {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
- {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
- {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
- {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
- {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
- {15, 6}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
- {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
- {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
- {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
- {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
- {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
- {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
- {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
- {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
- {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
- {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
- {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
- {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
- {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
- {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
- {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
- {16, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
- {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
- {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
- {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
- {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
- {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
- {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
- {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
- {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
- {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
- {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
- {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
- {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
- {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
- {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
- {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
-};
-
-const uint8_t kPrefixEncodeExtraBitsValue[PREFIX_LOOKUP_IDX_MAX] = {
- 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3,
- 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
- 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
- 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
- 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
- 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
- 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
- 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
- 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
- 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
- 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
- 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
- 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
- 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
- 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
- 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
- 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126,
- 127,
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
- 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
- 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
- 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
- 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
- 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
- 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
- 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126
-};
-
-// The threshold below which the approximate version of log_2 can be used.
-// Practically, we can get rid of the call to log() as the two values match to
-// a very high degree (the ratio of these two is 0.99999x).
-// Keeping a high threshold for now.
-#define APPROX_LOG_WITH_CORRECTION_MAX 65536
-#define APPROX_LOG_MAX 4096
-#define LOG_2_RECIPROCAL 1.44269504088896338700465094007086
-static float FastSLog2Slow(uint32_t v) {
- assert(v >= LOG_LOOKUP_IDX_MAX);
- if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
- int log_cnt = 0;
- uint32_t y = 1;
- int correction = 0;
- const float v_f = (float)v;
- const uint32_t orig_v = v;
- do {
- ++log_cnt;
- v = v >> 1;
- y = y << 1;
- } while (v >= LOG_LOOKUP_IDX_MAX);
- // vf = (2^log_cnt) * Xf; where y = 2^log_cnt and Xf < 256
- // Xf = floor(Xf) * (1 + (v % y) / v)
- // log2(Xf) = log2(floor(Xf)) + log2(1 + (v % y) / v)
- // The correction factor: log(1 + d) ~ d; for very small d values, so
- // log2(1 + (v % y) / v) ~ LOG_2_RECIPROCAL * (v % y)/v
- // LOG_2_RECIPROCAL ~ 23/16
- correction = (23 * (orig_v & (y - 1))) >> 4;
- return v_f * (kLog2Table[v] + log_cnt) + correction;
- } else {
- return (float)(LOG_2_RECIPROCAL * v * log((double)v));
- }
-}
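
The correction term (23 * (orig_v & (y - 1))) >> 4 follows from log2(1 + d) ~ d * log2(e) with d = (orig_v mod y) / orig_v: multiplying back by orig_v leaves log2(e) * (orig_v mod y), and log2(e) ~ 1.4427 ~ 23/16. A standalone check of the approximation, substituting libm's log2() for the lookup table so the table itself is out of the picture (illustration only):

    #include <math.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
      uint32_t v;
      for (v = 256; v < 65536; v += 997) {  // arbitrary sample of the range
        uint32_t x = v, y = 1;
        int n = 0;
        double fast, exact;
        while (x >= 256) { x >>= 1; y <<= 1; ++n; }  // x == v >> n, y == 2^n
        // v * log2(v) ~ v * (log2(v >> n) + n) + (23 / 16) * (v mod 2^n)
        fast = (double)v * (log2((double)x) + n) + (23.0 * (v & (y - 1))) / 16.0;
        exact = (double)v * log2((double)v);
        printf("v=%5u fast=%12.2f exact=%12.2f rel.err=%+.2e\n",
               (unsigned)v, fast, exact, (fast - exact) / exact);
      }
      return 0;
    }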
-
-static float FastLog2Slow(uint32_t v) {
- assert(v >= LOG_LOOKUP_IDX_MAX);
- if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
- int log_cnt = 0;
- uint32_t y = 1;
- const uint32_t orig_v = v;
- double log_2;
- do {
- ++log_cnt;
- v = v >> 1;
- y = y << 1;
- } while (v >= LOG_LOOKUP_IDX_MAX);
- log_2 = kLog2Table[v] + log_cnt;
- if (orig_v >= APPROX_LOG_MAX) {
- // Since the division is still expensive, add this correction factor only
- // for large values of 'v'.
- const int correction = (23 * (orig_v & (y - 1))) >> 4;
- log_2 += (double)correction / orig_v;
- }
- return (float)log_2;
- } else {
- return (float)(LOG_2_RECIPROCAL * log((double)v));
- }
-}
-
//------------------------------------------------------------------------------
// Image transforms.
-// Mostly used to reduce code size and improve readability.
-static WEBP_INLINE int GetMin(int a, int b) { return (a > b) ? b : a; }
-
// In-place sum of each component with mod 256.
static WEBP_INLINE void AddPixelsEq(uint32_t* a, uint32_t b) {
const uint32_t alpha_and_green = (*a & 0xff00ff00u) + (b & 0xff00ff00u);
@@ -398,7 +34,7 @@ static WEBP_INLINE void AddPixelsEq(uint32_t* a, uint32_t b) {
}
static WEBP_INLINE uint32_t Average2(uint32_t a0, uint32_t a1) {
- return (((a0 ^ a1) & 0xfefefefeL) >> 1) + (a0 & a1);
+ return (((a0 ^ a1) & 0xfefefefeu) >> 1) + (a0 & a1);
}
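
Average2() is a SWAR (SIMD-within-a-register) byte-lane average: since a + b == (a ^ b) + 2 * (a & b), the per-channel floor average is ((a ^ b) >> 1) + (a & b), and masking the XOR with 0xfefefefe first clears bit 0 of every byte so the shift cannot leak a bit into the lane below. (The change from the L suffix to u just makes the constant unsigned everywhere.) A standalone spot-check against a byte-wise reference (both helpers are illustration only):

    #include <assert.h>
    #include <stdint.h>

    // Per-channel reference: average each byte lane separately.
    static uint32_t Average2Bytes(uint32_t a, uint32_t b) {
      uint32_t out = 0;
      int shift;
      for (shift = 0; shift < 32; shift += 8) {
        const uint32_t av = (a >> shift) & 0xff;
        const uint32_t bv = (b >> shift) & 0xff;
        out |= ((av + bv) >> 1) << shift;
      }
      return out;
    }

    int main(void) {
      static const uint32_t samples[] = {
        0x00000000u, 0xffffffffu, 0x80402010u, 0x0102fdfeu, 0xdeadbeefu
      };
      int i, j;
      for (i = 0; i < 5; ++i) {
        for (j = 0; j < 5; ++j) {
          const uint32_t a = samples[i], b = samples[j];
          const uint32_t swar = (((a ^ b) & 0xfefefefeu) >> 1) + (a & b);
          assert(swar == Average2Bytes(a, b));
        }
      }
      return 0;
    }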
static WEBP_INLINE uint32_t Average3(uint32_t a0, uint32_t a1, uint32_t a2) {
@@ -537,202 +173,7 @@ static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
return pred;
}
-static const VP8LPredictorFunc kPredictorsC[16] = {
- Predictor0, Predictor1, Predictor2, Predictor3,
- Predictor4, Predictor5, Predictor6, Predictor7,
- Predictor8, Predictor9, Predictor10, Predictor11,
- Predictor12, Predictor13,
- Predictor0, Predictor0 // <- padding security sentinels
-};
-
-static float PredictionCostSpatial(const int counts[256], int weight_0,
- double exp_val) {
- const int significant_symbols = 256 >> 4;
- const double exp_decay_factor = 0.6;
- double bits = weight_0 * counts[0];
- int i;
- for (i = 1; i < significant_symbols; ++i) {
- bits += exp_val * (counts[i] + counts[256 - i]);
- exp_val *= exp_decay_factor;
- }
- return (float)(-0.1 * bits);
-}
-
-// Compute the combined Shannon entropy for the distributions {X} and {X+Y}.
-static float CombinedShannonEntropy(const int X[256], const int Y[256]) {
- int i;
- double retval = 0.;
- int sumX = 0, sumXY = 0;
- for (i = 0; i < 256; ++i) {
- const int x = X[i];
- const int xy = x + Y[i];
- if (x != 0) {
- sumX += x;
- retval -= VP8LFastSLog2(x);
- sumXY += xy;
- retval -= VP8LFastSLog2(xy);
- } else if (xy != 0) {
- sumXY += xy;
- retval -= VP8LFastSLog2(xy);
- }
- }
- retval += VP8LFastSLog2(sumX) + VP8LFastSLog2(sumXY);
- return (float)retval;
-}
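
Reading the loop together with the final line: since VP8LFastSLog2(v) returns v * log2(v), the function computes the ideal total code length, in bits, of entropy-coding the histogram X plus the merged histogram X+Y; the encoder uses lower values to mean the candidate compresses better. In formula form,

    cost(X, Y) = \sum_i x_i \log_2 \frac{S_X}{x_i}
               + \sum_i (x_i + y_i) \log_2 \frac{S_{XY}}{x_i + y_i},

with S_X = \sum_i x_i and S_{XY} = \sum_i (x_i + y_i); terms with a zero count contribute nothing. The routine is removed here and re-exposed through the VP8LCombinedShannonEntropy pointer declared in lossless.h further down in this patch.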
-
-static float PredictionCostSpatialHistogram(const int accumulated[4][256],
- const int tile[4][256]) {
- int i;
- double retval = 0;
- for (i = 0; i < 4; ++i) {
- const double kExpValue = 0.94;
- retval += PredictionCostSpatial(tile[i], 1, kExpValue);
- retval += CombinedShannonEntropy(tile[i], accumulated[i]);
- }
- return (float)retval;
-}
-
-static WEBP_INLINE void UpdateHisto(int histo_argb[4][256], uint32_t argb) {
- ++histo_argb[0][argb >> 24];
- ++histo_argb[1][(argb >> 16) & 0xff];
- ++histo_argb[2][(argb >> 8) & 0xff];
- ++histo_argb[3][argb & 0xff];
-}
-
-static int GetBestPredictorForTile(int width, int height,
- int tile_x, int tile_y, int bits,
- const int accumulated[4][256],
- const uint32_t* const argb_scratch) {
- const int kNumPredModes = 14;
- const int col_start = tile_x << bits;
- const int row_start = tile_y << bits;
- const int tile_size = 1 << bits;
- const int max_y = GetMin(tile_size, height - row_start);
- const int max_x = GetMin(tile_size, width - col_start);
- float best_diff = MAX_DIFF_COST;
- int best_mode = 0;
- int mode;
- for (mode = 0; mode < kNumPredModes; ++mode) {
- const uint32_t* current_row = argb_scratch;
- const VP8LPredictorFunc pred_func = VP8LPredictors[mode];
- float cur_diff;
- int y;
- int histo_argb[4][256];
- memset(histo_argb, 0, sizeof(histo_argb));
- for (y = 0; y < max_y; ++y) {
- int x;
- const int row = row_start + y;
- const uint32_t* const upper_row = current_row;
- current_row = upper_row + width;
- for (x = 0; x < max_x; ++x) {
- const int col = col_start + x;
- uint32_t predict;
- if (row == 0) {
- predict = (col == 0) ? ARGB_BLACK : current_row[col - 1]; // Left.
- } else if (col == 0) {
- predict = upper_row[col]; // Top.
- } else {
- predict = pred_func(current_row[col - 1], upper_row + col);
- }
- UpdateHisto(histo_argb, VP8LSubPixels(current_row[col], predict));
- }
- }
- cur_diff = PredictionCostSpatialHistogram(
- accumulated, (const int (*)[256])histo_argb);
- if (cur_diff < best_diff) {
- best_diff = cur_diff;
- best_mode = mode;
- }
- }
-
- return best_mode;
-}
-
-static void CopyTileWithPrediction(int width, int height,
- int tile_x, int tile_y, int bits, int mode,
- const uint32_t* const argb_scratch,
- uint32_t* const argb) {
- const int col_start = tile_x << bits;
- const int row_start = tile_y << bits;
- const int tile_size = 1 << bits;
- const int max_y = GetMin(tile_size, height - row_start);
- const int max_x = GetMin(tile_size, width - col_start);
- const VP8LPredictorFunc pred_func = VP8LPredictors[mode];
- const uint32_t* current_row = argb_scratch;
-
- int y;
- for (y = 0; y < max_y; ++y) {
- int x;
- const int row = row_start + y;
- const uint32_t* const upper_row = current_row;
- current_row = upper_row + width;
- for (x = 0; x < max_x; ++x) {
- const int col = col_start + x;
- const int pix = row * width + col;
- uint32_t predict;
- if (row == 0) {
- predict = (col == 0) ? ARGB_BLACK : current_row[col - 1]; // Left.
- } else if (col == 0) {
- predict = upper_row[col]; // Top.
- } else {
- predict = pred_func(current_row[col - 1], upper_row + col);
- }
- argb[pix] = VP8LSubPixels(current_row[col], predict);
- }
- }
-}
-
-void VP8LResidualImage(int width, int height, int bits,
- uint32_t* const argb, uint32_t* const argb_scratch,
- uint32_t* const image) {
- const int max_tile_size = 1 << bits;
- const int tiles_per_row = VP8LSubSampleSize(width, bits);
- const int tiles_per_col = VP8LSubSampleSize(height, bits);
- uint32_t* const upper_row = argb_scratch;
- uint32_t* const current_tile_rows = argb_scratch + width;
- int tile_y;
- int histo[4][256];
- memset(histo, 0, sizeof(histo));
- for (tile_y = 0; tile_y < tiles_per_col; ++tile_y) {
- const int tile_y_offset = tile_y * max_tile_size;
- const int this_tile_height =
- (tile_y < tiles_per_col - 1) ? max_tile_size : height - tile_y_offset;
- int tile_x;
- if (tile_y > 0) {
- memcpy(upper_row, current_tile_rows + (max_tile_size - 1) * width,
- width * sizeof(*upper_row));
- }
- memcpy(current_tile_rows, &argb[tile_y_offset * width],
- this_tile_height * width * sizeof(*current_tile_rows));
- for (tile_x = 0; tile_x < tiles_per_row; ++tile_x) {
- int pred;
- int y;
- const int tile_x_offset = tile_x * max_tile_size;
- int all_x_max = tile_x_offset + max_tile_size;
- if (all_x_max > width) {
- all_x_max = width;
- }
- pred = GetBestPredictorForTile(width, height, tile_x, tile_y, bits,
- (const int (*)[256])histo,
- argb_scratch);
- image[tile_y * tiles_per_row + tile_x] = 0xff000000u | (pred << 8);
- CopyTileWithPrediction(width, height, tile_x, tile_y, bits, pred,
- argb_scratch, argb);
- for (y = 0; y < max_tile_size; ++y) {
- int ix;
- int all_x;
- int all_y = tile_y_offset + y;
- if (all_y >= height) {
- break;
- }
- ix = all_y * width + tile_x_offset;
- for (all_x = tile_x_offset; all_x < all_x_max; ++all_x, ++ix) {
- UpdateHisto(histo, argb[ix]);
- }
- }
- }
- }
-}
+//------------------------------------------------------------------------------
// Inverse prediction.
static void PredictorInverseTransform(const VP8LTransform* const transform,
@@ -792,17 +233,6 @@ static void PredictorInverseTransform(const VP8LTransform* const transform,
}
}
-void VP8LSubtractGreenFromBlueAndRed_C(uint32_t* argb_data, int num_pixels) {
- int i;
- for (i = 0; i < num_pixels; ++i) {
- const uint32_t argb = argb_data[i];
- const uint32_t green = (argb >> 8) & 0xff;
- const uint32_t new_r = (((argb >> 16) & 0xff) - green) & 0xff;
- const uint32_t new_b = ((argb & 0xff) - green) & 0xff;
- argb_data[i] = (argb & 0xff00ff00) | (new_r << 16) | new_b;
- }
-}
-
// Add green to blue and red channels (i.e. perform the inverse transform of
// 'subtract green').
void VP8LAddGreenToBlueAndRed_C(uint32_t* data, int num_pixels) {
@@ -817,12 +247,6 @@ void VP8LAddGreenToBlueAndRed_C(uint32_t* data, int num_pixels) {
}
}
-static WEBP_INLINE void MultipliersClear(VP8LMultipliers* const m) {
- m->green_to_red_ = 0;
- m->green_to_blue_ = 0;
- m->red_to_blue_ = 0;
-}
-
static WEBP_INLINE uint32_t ColorTransformDelta(int8_t color_pred,
int8_t color) {
return (uint32_t)((int)(color_pred) * color) >> 5;
@@ -835,32 +259,6 @@ static WEBP_INLINE void ColorCodeToMultipliers(uint32_t color_code,
m->red_to_blue_ = (color_code >> 16) & 0xff;
}
-static WEBP_INLINE uint32_t MultipliersToColorCode(
- const VP8LMultipliers* const m) {
- return 0xff000000u |
- ((uint32_t)(m->red_to_blue_) << 16) |
- ((uint32_t)(m->green_to_blue_) << 8) |
- m->green_to_red_;
-}
-
-void VP8LTransformColor_C(const VP8LMultipliers* const m, uint32_t* data,
- int num_pixels) {
- int i;
- for (i = 0; i < num_pixels; ++i) {
- const uint32_t argb = data[i];
- const uint32_t green = argb >> 8;
- const uint32_t red = argb >> 16;
- uint32_t new_red = red;
- uint32_t new_blue = argb;
- new_red -= ColorTransformDelta(m->green_to_red_, green);
- new_red &= 0xff;
- new_blue -= ColorTransformDelta(m->green_to_blue_, green);
- new_blue -= ColorTransformDelta(m->red_to_blue_, red);
- new_blue &= 0xff;
- data[i] = (argb & 0xff00ff00u) | (new_red << 16) | (new_blue);
- }
-}
-
void VP8LTransformColorInverse_C(const VP8LMultipliers* const m, uint32_t* data,
int num_pixels) {
int i;
@@ -879,276 +277,6 @@ void VP8LTransformColorInverse_C(const VP8LMultipliers* const m, uint32_t* data,
}
}
-static WEBP_INLINE uint8_t TransformColorRed(uint8_t green_to_red,
- uint32_t argb) {
- const uint32_t green = argb >> 8;
- uint32_t new_red = argb >> 16;
- new_red -= ColorTransformDelta(green_to_red, green);
- return (new_red & 0xff);
-}
-
-static WEBP_INLINE uint8_t TransformColorBlue(uint8_t green_to_blue,
- uint8_t red_to_blue,
- uint32_t argb) {
- const uint32_t green = argb >> 8;
- const uint32_t red = argb >> 16;
- uint8_t new_blue = argb;
- new_blue -= ColorTransformDelta(green_to_blue, green);
- new_blue -= ColorTransformDelta(red_to_blue, red);
- return (new_blue & 0xff);
-}
-
-static float PredictionCostCrossColor(const int accumulated[256],
- const int counts[256]) {
- // Favor low entropy, locally and globally.
- // Favor small absolute values for PredictionCostSpatial
- static const double kExpValue = 2.4;
- return CombinedShannonEntropy(counts, accumulated) +
- PredictionCostSpatial(counts, 3, kExpValue);
-}
-
-static float GetPredictionCostCrossColorRed(
- int tile_x_offset, int tile_y_offset, int all_x_max, int all_y_max,
- int xsize, VP8LMultipliers prev_x, VP8LMultipliers prev_y, int green_to_red,
- const int accumulated_red_histo[256], const uint32_t* const argb) {
- int all_y;
- int histo[256] = { 0 };
- float cur_diff;
- for (all_y = tile_y_offset; all_y < all_y_max; ++all_y) {
- int ix = all_y * xsize + tile_x_offset;
- int all_x;
- for (all_x = tile_x_offset; all_x < all_x_max; ++all_x, ++ix) {
- ++histo[TransformColorRed(green_to_red, argb[ix])]; // red.
- }
- }
- cur_diff = PredictionCostCrossColor(accumulated_red_histo, histo);
- if ((uint8_t)green_to_red == prev_x.green_to_red_) {
- cur_diff -= 3; // favor keeping the areas locally similar
- }
- if ((uint8_t)green_to_red == prev_y.green_to_red_) {
- cur_diff -= 3; // favor keeping the areas locally similar
- }
- if (green_to_red == 0) {
- cur_diff -= 3;
- }
- return cur_diff;
-}
-
-static void GetBestGreenToRed(
- int tile_x_offset, int tile_y_offset, int all_x_max, int all_y_max,
- int xsize, VP8LMultipliers prev_x, VP8LMultipliers prev_y,
- const int accumulated_red_histo[256], const uint32_t* const argb,
- VP8LMultipliers* const best_tx) {
- int min_green_to_red = -64;
- int max_green_to_red = 64;
- int green_to_red = 0;
- int eval_min = 1;
- int eval_max = 1;
- float cur_diff_min = MAX_DIFF_COST;
- float cur_diff_max = MAX_DIFF_COST;
- // Do a binary search to find the optimal green_to_red color transform.
- while (max_green_to_red - min_green_to_red > 2) {
- if (eval_min) {
- cur_diff_min = GetPredictionCostCrossColorRed(
- tile_x_offset, tile_y_offset, all_x_max, all_y_max, xsize,
- prev_x, prev_y, min_green_to_red, accumulated_red_histo, argb);
- eval_min = 0;
- }
- if (eval_max) {
- cur_diff_max = GetPredictionCostCrossColorRed(
- tile_x_offset, tile_y_offset, all_x_max, all_y_max, xsize,
- prev_x, prev_y, max_green_to_red, accumulated_red_histo, argb);
- eval_max = 0;
- }
- if (cur_diff_min < cur_diff_max) {
- green_to_red = min_green_to_red;
- max_green_to_red = (max_green_to_red + min_green_to_red) / 2;
- eval_max = 1;
- } else {
- green_to_red = max_green_to_red;
- min_green_to_red = (max_green_to_red + min_green_to_red) / 2;
- eval_min = 1;
- }
- }
- best_tx->green_to_red_ = green_to_red;
-}
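
The loop above narrows [-64, 64] by halving toward whichever endpoint currently costs less, caching the surviving endpoint's cost via the eval_min/eval_max flags; this finds the true optimum only if the cost is close to unimodal in green_to_red, which the encoder treats as a safe enough assumption. A toy, self-contained version of the same narrowing pattern on a hypothetical convex cost (ToyCost stands in for GetPredictionCostCrossColorRed and recomputes both endpoints for simplicity):

    #include <stdio.h>

    static float ToyCost(int g) {           // hypothetical stand-in cost
      return (float)((g - 17) * (g - 17));  // convex, minimum at g == 17
    }

    int main(void) {
      int lo = -64, hi = 64, best = 0;
      while (hi - lo > 2) {                 // same stopping rule as above
        if (ToyCost(lo) < ToyCost(hi)) {
          best = lo;
          hi = (hi + lo) / 2;
        } else {
          best = hi;
          lo = (hi + lo) / 2;
        }
      }
      printf("selected green_to_red: %d\n", best);  // lands near 17
      return 0;
    }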
-
-static float GetPredictionCostCrossColorBlue(
- int tile_x_offset, int tile_y_offset, int all_x_max, int all_y_max,
- int xsize, VP8LMultipliers prev_x, VP8LMultipliers prev_y,
- int green_to_blue, int red_to_blue, const int accumulated_blue_histo[256],
- const uint32_t* const argb) {
- int all_y;
- int histo[256] = { 0 };
- float cur_diff;
- for (all_y = tile_y_offset; all_y < all_y_max; ++all_y) {
- int all_x;
- int ix = all_y * xsize + tile_x_offset;
- for (all_x = tile_x_offset; all_x < all_x_max; ++all_x, ++ix) {
- ++histo[TransformColorBlue(green_to_blue, red_to_blue, argb[ix])];
- }
- }
- cur_diff = PredictionCostCrossColor(accumulated_blue_histo, histo);
- if ((uint8_t)green_to_blue == prev_x.green_to_blue_) {
- cur_diff -= 3; // favor keeping the areas locally similar
- }
- if ((uint8_t)green_to_blue == prev_y.green_to_blue_) {
- cur_diff -= 3; // favor keeping the areas locally similar
- }
- if ((uint8_t)red_to_blue == prev_x.red_to_blue_) {
- cur_diff -= 3; // favor keeping the areas locally similar
- }
- if ((uint8_t)red_to_blue == prev_y.red_to_blue_) {
- cur_diff -= 3; // favor keeping the areas locally similar
- }
- if (green_to_blue == 0) {
- cur_diff -= 3;
- }
- if (red_to_blue == 0) {
- cur_diff -= 3;
- }
- return cur_diff;
-}
-
-static void GetBestGreenRedToBlue(
- int tile_x_offset, int tile_y_offset, int all_x_max, int all_y_max,
- int xsize, VP8LMultipliers prev_x, VP8LMultipliers prev_y, int quality,
- const int accumulated_blue_histo[256], const uint32_t* const argb,
- VP8LMultipliers* const best_tx) {
- float best_diff = MAX_DIFF_COST;
- float cur_diff;
- const int step = (quality < 25) ? 32 : (quality > 50) ? 8 : 16;
- const int min_green_to_blue = -32;
- const int max_green_to_blue = 32;
- const int min_red_to_blue = -32;
- const int max_red_to_blue = 32;
- const int num_iters =
- (1 + (max_green_to_blue - min_green_to_blue) / step) *
- (1 + (max_red_to_blue - min_red_to_blue) / step);
- // Number of tries to get optimal green_to_blue & red_to_blue color
- // transforms after finding a local minimum.
- const int max_tries_after_min = 4 + (num_iters >> 2);
- int num_tries_after_min = 0;
- int green_to_blue;
- for (green_to_blue = min_green_to_blue;
- green_to_blue <= max_green_to_blue &&
- num_tries_after_min < max_tries_after_min;
- green_to_blue += step) {
- int red_to_blue;
- for (red_to_blue = min_red_to_blue;
- red_to_blue <= max_red_to_blue &&
- num_tries_after_min < max_tries_after_min;
- red_to_blue += step) {
- cur_diff = GetPredictionCostCrossColorBlue(
- tile_x_offset, tile_y_offset, all_x_max, all_y_max, xsize, prev_x,
- prev_y, green_to_blue, red_to_blue, accumulated_blue_histo, argb);
- if (cur_diff < best_diff) {
- best_diff = cur_diff;
- best_tx->green_to_blue_ = green_to_blue;
- best_tx->red_to_blue_ = red_to_blue;
- num_tries_after_min = 0;
- } else {
- ++num_tries_after_min;
- }
- }
- }
-}
-
-static VP8LMultipliers GetBestColorTransformForTile(
- int tile_x, int tile_y, int bits,
- VP8LMultipliers prev_x,
- VP8LMultipliers prev_y,
- int quality, int xsize, int ysize,
- const int accumulated_red_histo[256],
- const int accumulated_blue_histo[256],
- const uint32_t* const argb) {
- const int max_tile_size = 1 << bits;
- const int tile_y_offset = tile_y * max_tile_size;
- const int tile_x_offset = tile_x * max_tile_size;
- const int all_x_max = GetMin(tile_x_offset + max_tile_size, xsize);
- const int all_y_max = GetMin(tile_y_offset + max_tile_size, ysize);
- VP8LMultipliers best_tx;
- MultipliersClear(&best_tx);
-
- GetBestGreenToRed(tile_x_offset, tile_y_offset, all_x_max, all_y_max, xsize,
- prev_x, prev_y, accumulated_red_histo, argb, &best_tx);
- GetBestGreenRedToBlue(tile_x_offset, tile_y_offset, all_x_max, all_y_max,
- xsize, prev_x, prev_y, quality, accumulated_blue_histo,
- argb, &best_tx);
- return best_tx;
-}
-
-static void CopyTileWithColorTransform(int xsize, int ysize,
- int tile_x, int tile_y,
- int max_tile_size,
- VP8LMultipliers color_transform,
- uint32_t* argb) {
- const int xscan = GetMin(max_tile_size, xsize - tile_x);
- int yscan = GetMin(max_tile_size, ysize - tile_y);
- argb += tile_y * xsize + tile_x;
- while (yscan-- > 0) {
- VP8LTransformColor(&color_transform, argb, xscan);
- argb += xsize;
- }
-}
-
-void VP8LColorSpaceTransform(int width, int height, int bits, int quality,
- uint32_t* const argb, uint32_t* image) {
- const int max_tile_size = 1 << bits;
- const int tile_xsize = VP8LSubSampleSize(width, bits);
- const int tile_ysize = VP8LSubSampleSize(height, bits);
- int accumulated_red_histo[256] = { 0 };
- int accumulated_blue_histo[256] = { 0 };
- int tile_x, tile_y;
- VP8LMultipliers prev_x, prev_y;
- MultipliersClear(&prev_y);
- MultipliersClear(&prev_x);
- for (tile_y = 0; tile_y < tile_ysize; ++tile_y) {
- for (tile_x = 0; tile_x < tile_xsize; ++tile_x) {
- int y;
- const int tile_x_offset = tile_x * max_tile_size;
- const int tile_y_offset = tile_y * max_tile_size;
- const int all_x_max = GetMin(tile_x_offset + max_tile_size, width);
- const int all_y_max = GetMin(tile_y_offset + max_tile_size, height);
- const int offset = tile_y * tile_xsize + tile_x;
- if (tile_y != 0) {
- ColorCodeToMultipliers(image[offset - tile_xsize], &prev_y);
- }
- prev_x = GetBestColorTransformForTile(tile_x, tile_y, bits,
- prev_x, prev_y,
- quality, width, height,
- accumulated_red_histo,
- accumulated_blue_histo,
- argb);
- image[offset] = MultipliersToColorCode(&prev_x);
- CopyTileWithColorTransform(width, height, tile_x_offset, tile_y_offset,
- max_tile_size, prev_x, argb);
-
- // Gather accumulated histogram data.
- for (y = tile_y_offset; y < all_y_max; ++y) {
- int ix = y * width + tile_x_offset;
- const int ix_end = ix + all_x_max - tile_x_offset;
- for (; ix < ix_end; ++ix) {
- const uint32_t pix = argb[ix];
- if (ix >= 2 &&
- pix == argb[ix - 2] &&
- pix == argb[ix - 1]) {
- continue; // repeated pixels are handled by backward references
- }
- if (ix >= width + 2 &&
- argb[ix - 2] == argb[ix - width - 2] &&
- argb[ix - 1] == argb[ix - width - 1] &&
- pix == argb[ix - width]) {
- continue; // repeated pixels are handled by backward references
- }
- ++accumulated_red_histo[(pix >> 16) & 0xff];
- ++accumulated_blue_histo[(pix >> 0) & 0xff];
- }
- }
- }
- }
-}
-
// Color space inverse transform.
static void ColorSpaceInverseTransform(const VP8LTransform* const transform,
int y_start, int y_end, uint32_t* data) {
@@ -1184,9 +312,21 @@ static void ColorSpaceInverseTransform(const VP8LTransform* const transform,
// Separate out pixels packed together using pixel-bundling.
// We define two methods for ARGB data (uint32_t) and alpha-only data (uint8_t).
-#define COLOR_INDEX_INVERSE(FUNC_NAME, TYPE, GET_INDEX, GET_VALUE) \
-void FUNC_NAME(const VP8LTransform* const transform, \
- int y_start, int y_end, const TYPE* src, TYPE* dst) { \
+#define COLOR_INDEX_INVERSE(FUNC_NAME, F_NAME, STATIC_DECL, TYPE, BIT_SUFFIX, \
+ GET_INDEX, GET_VALUE) \
+static void F_NAME(const TYPE* src, const uint32_t* const color_map, \
+ TYPE* dst, int y_start, int y_end, int width) { \
+ int y; \
+ for (y = y_start; y < y_end; ++y) { \
+ int x; \
+ for (x = 0; x < width; ++x) { \
+ *dst++ = GET_VALUE(color_map[GET_INDEX(*src++)]); \
+ } \
+ } \
+} \
+STATIC_DECL void FUNC_NAME(const VP8LTransform* const transform, \
+ int y_start, int y_end, const TYPE* src, \
+ TYPE* dst) { \
int y; \
const int bits_per_pixel = 8 >> transform->bits_; \
const int width = transform->xsize_; \
@@ -1209,35 +349,14 @@ void FUNC_NAME(const VP8LTransform* const transform, \
} \
} \
} else { \
- for (y = y_start; y < y_end; ++y) { \
- int x; \
- for (x = 0; x < width; ++x) { \
- *dst++ = GET_VALUE(color_map[GET_INDEX(*src++)]); \
- } \
- } \
+ VP8LMapColor##BIT_SUFFIX(src, color_map, dst, y_start, y_end, width); \
} \
}
-static WEBP_INLINE uint32_t GetARGBIndex(uint32_t idx) {
- return (idx >> 8) & 0xff;
-}
-
-static WEBP_INLINE uint8_t GetAlphaIndex(uint8_t idx) {
- return idx;
-}
-
-static WEBP_INLINE uint32_t GetARGBValue(uint32_t val) {
- return val;
-}
-
-static WEBP_INLINE uint8_t GetAlphaValue(uint32_t val) {
- return (val >> 8) & 0xff;
-}
-
-static COLOR_INDEX_INVERSE(ColorIndexInverseTransform, uint32_t, GetARGBIndex,
- GetARGBValue)
-COLOR_INDEX_INVERSE(VP8LColorIndexInverseTransformAlpha, uint8_t, GetAlphaIndex,
- GetAlphaValue)
+COLOR_INDEX_INVERSE(ColorIndexInverseTransform, MapARGB, static, uint32_t, 32b,
+ VP8GetARGBIndex, VP8GetARGBValue)
+COLOR_INDEX_INVERSE(VP8LColorIndexInverseTransformAlpha, MapAlpha, , uint8_t,
+ 8b, VP8GetAlphaIndex, VP8GetAlphaValue)
#undef COLOR_INDEX_INVERSE
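
For small palettes the decoder stores several pixel indices bundled into one value (bits_per_pixel == 8 >> transform->bits_), and the functions generated above unpack them; only the unbundled one-index-per-pixel case is delegated to the pluggable VP8LMapColor32b/VP8LMapColor8b pointers. A standalone sketch of the bundled layout, assuming bits_ == 1 so each packed byte holds two 4-bit indices, low bits first (matching the order in which VP8LBundleColorMap, removed from lossless.c further down in this patch, writes them):

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
      const int bits = 1;                     // transform->bits_ (assumed)
      const int bits_per_pixel = 8 >> bits;   // 4 bits per palette index
      const int pixels_per_unit = 1 << bits;  // 2 indices per packed value
      const uint32_t bit_mask = (1u << bits_per_pixel) - 1;
      const uint8_t packed = 0x52;            // indices 2 then 5
      int x;
      for (x = 0; x < pixels_per_unit; ++x) {
        printf("pixel %d -> palette index %u\n",
               x, (unsigned)((packed >> (x * bits_per_pixel)) & bit_mask));
      }
      return 0;
    }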
@@ -1371,7 +490,7 @@ static void CopyOrSwap(const uint32_t* src, int num_pixels, uint8_t* dst,
#if !defined(WORDS_BIGENDIAN)
#if !defined(WEBP_REFERENCE_IMPLEMENTATION)
- *(uint32_t*)dst = BSwap32(argb);
+ WebPUint32ToMem(dst, BSwap32(argb));
#else // WEBP_REFERENCE_IMPLEMENTATION
dst[0] = (argb >> 24) & 0xff;
dst[1] = (argb >> 16) & 0xff;
@@ -1437,136 +556,10 @@ void VP8LConvertFromBGRA(const uint32_t* const in_data, int num_pixels,
}
//------------------------------------------------------------------------------
-// Bundles multiple (1, 2, 4 or 8) pixels into a single pixel.
-void VP8LBundleColorMap(const uint8_t* const row, int width,
- int xbits, uint32_t* const dst) {
- int x;
- if (xbits > 0) {
- const int bit_depth = 1 << (3 - xbits);
- const int mask = (1 << xbits) - 1;
- uint32_t code = 0xff000000;
- for (x = 0; x < width; ++x) {
- const int xsub = x & mask;
- if (xsub == 0) {
- code = 0xff000000;
- }
- code |= row[x] << (8 + bit_depth * xsub);
- dst[x >> xbits] = code;
- }
- } else {
- for (x = 0; x < width; ++x) dst[x] = 0xff000000 | (row[x] << 8);
- }
-}
-
-//------------------------------------------------------------------------------
-
-static double ExtraCost(const uint32_t* population, int length) {
- int i;
- double cost = 0.;
- for (i = 2; i < length - 2; ++i) cost += (i >> 1) * population[i + 2];
- return cost;
-}
-
-static double ExtraCostCombined(const uint32_t* X, const uint32_t* Y,
- int length) {
- int i;
- double cost = 0.;
- for (i = 2; i < length - 2; ++i) {
- const int xy = X[i + 2] + Y[i + 2];
- cost += (i >> 1) * xy;
- }
- return cost;
-}
-
-// Returns the various RLE counts
-static VP8LStreaks HuffmanCostCount(const uint32_t* population, int length) {
- int i;
- int streak = 0;
- VP8LStreaks stats;
- memset(&stats, 0, sizeof(stats));
- for (i = 0; i < length - 1; ++i) {
- ++streak;
- if (population[i] == population[i + 1]) {
- continue;
- }
- stats.counts[population[i] != 0] += (streak > 3);
- stats.streaks[population[i] != 0][(streak > 3)] += streak;
- streak = 0;
- }
- ++streak;
- stats.counts[population[i] != 0] += (streak > 3);
- stats.streaks[population[i] != 0][(streak > 3)] += streak;
- return stats;
-}
-static VP8LStreaks HuffmanCostCombinedCount(const uint32_t* X,
- const uint32_t* Y, int length) {
- int i;
- int streak = 0;
- VP8LStreaks stats;
- memset(&stats, 0, sizeof(stats));
- for (i = 0; i < length - 1; ++i) {
- const int xy = X[i] + Y[i];
- const int xy_next = X[i + 1] + Y[i + 1];
- ++streak;
- if (xy == xy_next) {
- continue;
- }
- stats.counts[xy != 0] += (streak > 3);
- stats.streaks[xy != 0][(streak > 3)] += streak;
- streak = 0;
- }
- {
- const int xy = X[i] + Y[i];
- ++streak;
- stats.counts[xy != 0] += (streak > 3);
- stats.streaks[xy != 0][(streak > 3)] += streak;
- }
- return stats;
-}
-
-//------------------------------------------------------------------------------
-
-static void HistogramAdd(const VP8LHistogram* const a,
- const VP8LHistogram* const b,
- VP8LHistogram* const out) {
- int i;
- const int literal_size = VP8LHistogramNumCodes(a->palette_code_bits_);
- assert(a->palette_code_bits_ == b->palette_code_bits_);
- if (b != out) {
- for (i = 0; i < literal_size; ++i) {
- out->literal_[i] = a->literal_[i] + b->literal_[i];
- }
- for (i = 0; i < NUM_DISTANCE_CODES; ++i) {
- out->distance_[i] = a->distance_[i] + b->distance_[i];
- }
- for (i = 0; i < NUM_LITERAL_CODES; ++i) {
- out->red_[i] = a->red_[i] + b->red_[i];
- out->blue_[i] = a->blue_[i] + b->blue_[i];
- out->alpha_[i] = a->alpha_[i] + b->alpha_[i];
- }
- } else {
- for (i = 0; i < literal_size; ++i) {
- out->literal_[i] += a->literal_[i];
- }
- for (i = 0; i < NUM_DISTANCE_CODES; ++i) {
- out->distance_[i] += a->distance_[i];
- }
- for (i = 0; i < NUM_LITERAL_CODES; ++i) {
- out->red_[i] += a->red_[i];
- out->blue_[i] += a->blue_[i];
- out->alpha_[i] += a->alpha_[i];
- }
- }
-}
-
-//------------------------------------------------------------------------------
-
-VP8LProcessBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed;
VP8LProcessBlueAndRedFunc VP8LAddGreenToBlueAndRed;
VP8LPredictorFunc VP8LPredictors[16];
-VP8LTransformColorFunc VP8LTransformColor;
VP8LTransformColorFunc VP8LTransformColorInverse;
VP8LConvertFunc VP8LConvertBGRAToRGB;
@@ -1575,33 +568,38 @@ VP8LConvertFunc VP8LConvertBGRAToRGBA4444;
VP8LConvertFunc VP8LConvertBGRAToRGB565;
VP8LConvertFunc VP8LConvertBGRAToBGR;
-VP8LFastLog2SlowFunc VP8LFastLog2Slow;
-VP8LFastLog2SlowFunc VP8LFastSLog2Slow;
-
-VP8LCostFunc VP8LExtraCost;
-VP8LCostCombinedFunc VP8LExtraCostCombined;
-
-VP8LCostCountFunc VP8LHuffmanCostCount;
-VP8LCostCombinedCountFunc VP8LHuffmanCostCombinedCount;
-
-VP8LHistogramAddFunc VP8LHistogramAdd;
+VP8LMapARGBFunc VP8LMapColor32b;
+VP8LMapAlphaFunc VP8LMapColor8b;
extern void VP8LDspInitSSE2(void);
extern void VP8LDspInitNEON(void);
-extern void VP8LDspInitMIPS32(void);
+extern void VP8LDspInitMIPSdspR2(void);
static volatile VP8CPUInfo lossless_last_cpuinfo_used =
(VP8CPUInfo)&lossless_last_cpuinfo_used;
-void VP8LDspInit(void) {
+WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInit(void) {
if (lossless_last_cpuinfo_used == VP8GetCPUInfo) return;
- memcpy(VP8LPredictors, kPredictorsC, sizeof(VP8LPredictors));
+ VP8LPredictors[0] = Predictor0;
+ VP8LPredictors[1] = Predictor1;
+ VP8LPredictors[2] = Predictor2;
+ VP8LPredictors[3] = Predictor3;
+ VP8LPredictors[4] = Predictor4;
+ VP8LPredictors[5] = Predictor5;
+ VP8LPredictors[6] = Predictor6;
+ VP8LPredictors[7] = Predictor7;
+ VP8LPredictors[8] = Predictor8;
+ VP8LPredictors[9] = Predictor9;
+ VP8LPredictors[10] = Predictor10;
+ VP8LPredictors[11] = Predictor11;
+ VP8LPredictors[12] = Predictor12;
+ VP8LPredictors[13] = Predictor13;
+ VP8LPredictors[14] = Predictor0; // <- padding security sentinels
+ VP8LPredictors[15] = Predictor0;
- VP8LSubtractGreenFromBlueAndRed = VP8LSubtractGreenFromBlueAndRed_C;
VP8LAddGreenToBlueAndRed = VP8LAddGreenToBlueAndRed_C;
- VP8LTransformColor = VP8LTransformColor_C;
VP8LTransformColorInverse = VP8LTransformColorInverse_C;
VP8LConvertBGRAToRGB = VP8LConvertBGRAToRGB_C;
@@ -1610,16 +608,8 @@ void VP8LDspInit(void) {
VP8LConvertBGRAToRGB565 = VP8LConvertBGRAToRGB565_C;
VP8LConvertBGRAToBGR = VP8LConvertBGRAToBGR_C;
- VP8LFastLog2Slow = FastLog2Slow;
- VP8LFastSLog2Slow = FastSLog2Slow;
-
- VP8LExtraCost = ExtraCost;
- VP8LExtraCostCombined = ExtraCostCombined;
-
- VP8LHuffmanCostCount = HuffmanCostCount;
- VP8LHuffmanCostCombinedCount = HuffmanCostCombinedCount;
-
- VP8LHistogramAdd = HistogramAdd;
+ VP8LMapColor32b = MapARGB;
+ VP8LMapColor8b = MapAlpha;
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
if (VP8GetCPUInfo != NULL) {
@@ -1633,9 +623,9 @@ void VP8LDspInit(void) {
VP8LDspInitNEON();
}
#endif
-#if defined(WEBP_USE_MIPS32)
- if (VP8GetCPUInfo(kMIPS32)) {
- VP8LDspInitMIPS32();
+#if defined(WEBP_USE_MIPS_DSP_R2)
+ if (VP8GetCPUInfo(kMIPSdspR2)) {
+ VP8LDspInitMIPSdspR2();
}
#endif
}
diff --git a/src/3rdparty/libwebp/src/dsp/lossless.h b/src/3rdparty/libwebp/src/dsp/lossless.h
index 8c7551c..e063bdd 100644
--- a/src/3rdparty/libwebp/src/dsp/lossless.h
+++ b/src/3rdparty/libwebp/src/dsp/lossless.h
@@ -25,14 +25,17 @@
extern "C" {
#endif
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+#include "../enc/delta_palettization.h"
+#endif // WEBP_EXPERIMENTAL_FEATURES
+
//------------------------------------------------------------------------------
-// Signatures and generic function-pointers
+// Decoding
typedef uint32_t (*VP8LPredictorFunc)(uint32_t left, const uint32_t* const top);
extern VP8LPredictorFunc VP8LPredictors[16];
typedef void (*VP8LProcessBlueAndRedFunc)(uint32_t* argb_data, int num_pixels);
-extern VP8LProcessBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed;
extern VP8LProcessBlueAndRedFunc VP8LAddGreenToBlueAndRed;
typedef struct {
@@ -44,9 +47,19 @@ typedef struct {
} VP8LMultipliers;
typedef void (*VP8LTransformColorFunc)(const VP8LMultipliers* const m,
uint32_t* argb_data, int num_pixels);
-extern VP8LTransformColorFunc VP8LTransformColor;
extern VP8LTransformColorFunc VP8LTransformColorInverse;
+struct VP8LTransform; // Defined in dec/vp8li.h.
+
+// Performs inverse transform of data given transform information, start and end
+// rows. Transform will be applied to rows [row_start, row_end[.
+// The *in and *out pointers refer to source and destination data respectively
+// corresponding to the intermediate row (row_start).
+void VP8LInverseTransform(const struct VP8LTransform* const transform,
+ int row_start, int row_end,
+ const uint32_t* const in, uint32_t* const out);
+
+// Color space conversion.
typedef void (*VP8LConvertFunc)(const uint32_t* src, int num_pixels,
uint8_t* dst);
extern VP8LConvertFunc VP8LConvertBGRAToRGB;
@@ -55,9 +68,47 @@ extern VP8LConvertFunc VP8LConvertBGRAToRGBA4444;
extern VP8LConvertFunc VP8LConvertBGRAToRGB565;
extern VP8LConvertFunc VP8LConvertBGRAToBGR;
+// Converts from BGRA to other color spaces.
+void VP8LConvertFromBGRA(const uint32_t* const in_data, int num_pixels,
+ WEBP_CSP_MODE out_colorspace, uint8_t* const rgba);
+
+// Color-mapping related functions.
+static WEBP_INLINE uint32_t VP8GetARGBIndex(uint32_t idx) {
+ return (idx >> 8) & 0xff;
+}
+
+static WEBP_INLINE uint8_t VP8GetAlphaIndex(uint8_t idx) {
+ return idx;
+}
+
+static WEBP_INLINE uint32_t VP8GetARGBValue(uint32_t val) {
+ return val;
+}
+
+static WEBP_INLINE uint8_t VP8GetAlphaValue(uint32_t val) {
+ return (val >> 8) & 0xff;
+}
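
These four accessors pin down the color-map convention: an ARGB pixel carries its palette index in the green channel, and when a 32-bit map entry is used for alpha decoding the sample is read back out of that entry's green channel. Minimal usage (a sketch only; the include path depends on the build setup):

    #include <assert.h>
    #include <stdint.h>
    #include "./lossless.h"  // adjust to the actual include path

    int main(void) {
      const uint32_t argb_pixel = 0xff003400u;  // green channel holds index 0x34
      const uint32_t map_entry = 0xff007b00u;   // alpha sample 0x7b in green
      assert(VP8GetARGBIndex(argb_pixel) == 0x34);
      assert(VP8GetAlphaValue(map_entry) == 0x7b);
      return 0;
    }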
+
+typedef void (*VP8LMapARGBFunc)(const uint32_t* src,
+ const uint32_t* const color_map,
+ uint32_t* dst, int y_start,
+ int y_end, int width);
+typedef void (*VP8LMapAlphaFunc)(const uint8_t* src,
+ const uint32_t* const color_map,
+ uint8_t* dst, int y_start,
+ int y_end, int width);
+
+extern VP8LMapARGBFunc VP8LMapColor32b;
+extern VP8LMapAlphaFunc VP8LMapColor8b;
+
+// Similar to the static method ColorIndexInverseTransform() that is part of
+// lossless.c, but used only for alpha decoding. It takes uint8_t (rather than
+// uint32_t) arguments for 'src' and 'dst'.
+void VP8LColorIndexInverseTransformAlpha(
+ const struct VP8LTransform* const transform, int y_start, int y_end,
+ const uint8_t* src, uint8_t* dst);
+
// Expose some C-only fallback functions
-void VP8LTransformColor_C(const VP8LMultipliers* const m,
- uint32_t* data, int num_pixels);
void VP8LTransformColorInverse_C(const VP8LMultipliers* const m,
uint32_t* data, int num_pixels);
@@ -68,47 +119,51 @@ void VP8LConvertBGRAToRGBA4444_C(const uint32_t* src,
void VP8LConvertBGRAToRGB565_C(const uint32_t* src,
int num_pixels, uint8_t* dst);
void VP8LConvertBGRAToBGR_C(const uint32_t* src, int num_pixels, uint8_t* dst);
-void VP8LSubtractGreenFromBlueAndRed_C(uint32_t* argb_data, int num_pixels);
void VP8LAddGreenToBlueAndRed_C(uint32_t* data, int num_pixels);
// Must be called before calling any of the above methods.
void VP8LDspInit(void);
//------------------------------------------------------------------------------
-// Image transforms.
+// Encoding
-struct VP8LTransform; // Defined in dec/vp8li.h.
+extern VP8LProcessBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed;
+extern VP8LTransformColorFunc VP8LTransformColor;
+typedef void (*VP8LCollectColorBlueTransformsFunc)(
+ const uint32_t* argb, int stride,
+ int tile_width, int tile_height,
+ int green_to_blue, int red_to_blue, int histo[]);
+extern VP8LCollectColorBlueTransformsFunc VP8LCollectColorBlueTransforms;
+
+typedef void (*VP8LCollectColorRedTransformsFunc)(
+ const uint32_t* argb, int stride,
+ int tile_width, int tile_height,
+ int green_to_red, int histo[]);
+extern VP8LCollectColorRedTransformsFunc VP8LCollectColorRedTransforms;
-// Performs inverse transform of data given transform information, start and end
-// rows. Transform will be applied to rows [row_start, row_end[.
-// The *in and *out pointers refer to source and destination data respectively
-// corresponding to the intermediate row (row_start).
-void VP8LInverseTransform(const struct VP8LTransform* const transform,
- int row_start, int row_end,
- const uint32_t* const in, uint32_t* const out);
+// Expose some C-only fallback functions
+void VP8LTransformColor_C(const VP8LMultipliers* const m,
+ uint32_t* data, int num_pixels);
+void VP8LSubtractGreenFromBlueAndRed_C(uint32_t* argb_data, int num_pixels);
+void VP8LCollectColorRedTransforms_C(const uint32_t* argb, int stride,
+ int tile_width, int tile_height,
+ int green_to_red, int histo[]);
+void VP8LCollectColorBlueTransforms_C(const uint32_t* argb, int stride,
+ int tile_width, int tile_height,
+ int green_to_blue, int red_to_blue,
+ int histo[]);
-// Similar to the static method ColorIndexInverseTransform() that is part of
-// lossless.c, but used only for alpha decoding. It takes uint8_t (rather than
-// uint32_t) arguments for 'src' and 'dst'.
-void VP8LColorIndexInverseTransformAlpha(
- const struct VP8LTransform* const transform, int y_start, int y_end,
- const uint8_t* src, uint8_t* dst);
+//------------------------------------------------------------------------------
+// Image transforms.
-void VP8LResidualImage(int width, int height, int bits,
+void VP8LResidualImage(int width, int height, int bits, int low_effort,
uint32_t* const argb, uint32_t* const argb_scratch,
- uint32_t* const image);
+ uint32_t* const image, int exact);
void VP8LColorSpaceTransform(int width, int height, int bits, int quality,
uint32_t* const argb, uint32_t* image);
//------------------------------------------------------------------------------
-// Color space conversion.
-
-// Converts from BGRA to other color spaces.
-void VP8LConvertFromBGRA(const uint32_t* const in_data, int num_pixels,
- WEBP_CSP_MODE out_colorspace, uint8_t* const rgba);
-
-//------------------------------------------------------------------------------
// Misc methods.
// Computes sampled size of 'size' when sampling using 'sampling bits'.
@@ -119,6 +174,14 @@ static WEBP_INLINE uint32_t VP8LSubSampleSize(uint32_t size,
// -----------------------------------------------------------------------------
// Faster logarithm for integers. Small values use a look-up table.
+
+// The threshold below which the approximate version of log_2 can be used.
+// Practically, we can get rid of the call to log() as the two values match to
+// a very high degree (the ratio of these two is 0.99999x).
+// Keeping a high threshold for now.
+#define APPROX_LOG_WITH_CORRECTION_MAX 65536
+#define APPROX_LOG_MAX 4096
+#define LOG_2_RECIPROCAL 1.44269504088896338700465094007086
#define LOG_LOOKUP_IDX_MAX 256
extern const float kLog2Table[LOG_LOOKUP_IDX_MAX];
extern const float kSLog2Table[LOG_LOOKUP_IDX_MAX];
@@ -141,23 +204,56 @@ static WEBP_INLINE float VP8LFastSLog2(uint32_t v) {
typedef double (*VP8LCostFunc)(const uint32_t* population, int length);
typedef double (*VP8LCostCombinedFunc)(const uint32_t* X, const uint32_t* Y,
int length);
+typedef float (*VP8LCombinedShannonEntropyFunc)(const int X[256],
+ const int Y[256]);
extern VP8LCostFunc VP8LExtraCost;
extern VP8LCostCombinedFunc VP8LExtraCostCombined;
+extern VP8LCombinedShannonEntropyFunc VP8LCombinedShannonEntropy;
typedef struct { // small struct to hold counters
+ int counts[2]; // index: 0=zero streak, 1=non-zero streak
int streaks[2][2]; // [zero/non-zero][streak<3 / streak>=3]
} VP8LStreaks;
-typedef VP8LStreaks (*VP8LCostCountFunc)(const uint32_t* population,
- int length);
typedef VP8LStreaks (*VP8LCostCombinedCountFunc)(const uint32_t* X,
const uint32_t* Y, int length);
-extern VP8LCostCountFunc VP8LHuffmanCostCount;
extern VP8LCostCombinedCountFunc VP8LHuffmanCostCombinedCount;
+typedef struct { // small struct to hold bit entropy results
+ double entropy; // entropy of the population
+ uint32_t sum; // sum of the population
+ int nonzeros; // number of non-zero elements in the population
+ uint32_t max_val; // maximum value in the population
+ uint32_t nonzero_code; // index of the last non-zero in the population
+} VP8LBitEntropy;
+
+void VP8LBitEntropyInit(VP8LBitEntropy* const entropy);
+
+// Get the combined symbol bit entropy and Huffman cost stats for the
+// distributions 'X' and 'Y'. Those results can then be refined according to
+// codec specific heuristics.
+void VP8LGetCombinedEntropyUnrefined(const uint32_t* const X,
+ const uint32_t* const Y, int length,
+ VP8LBitEntropy* const bit_entropy,
+ VP8LStreaks* const stats);
+// Get the entropy for the distribution 'X'.
+void VP8LGetEntropyUnrefined(const uint32_t* const X, int length,
+ VP8LBitEntropy* const bit_entropy,
+ VP8LStreaks* const stats);
+
+void VP8LBitsEntropyUnrefined(const uint32_t* const array, int n,
+ VP8LBitEntropy* const entropy);
+
+typedef void (*GetEntropyUnrefinedHelperFunc)(uint32_t val, int i,
+ uint32_t* const val_prev,
+ int* const i_prev,
+ VP8LBitEntropy* const bit_entropy,
+ VP8LStreaks* const stats);
+// Internal function used by VP8LGet*EntropyUnrefined.
+extern GetEntropyUnrefinedHelperFunc VP8LGetEntropyUnrefinedHelper;
+
typedef void (*VP8LHistogramAddFunc)(const VP8LHistogram* const a,
const VP8LHistogram* const b,
VP8LHistogram* const out);
@@ -240,6 +336,9 @@ static WEBP_INLINE uint32_t VP8LSubPixels(uint32_t a, uint32_t b) {
void VP8LBundleColorMap(const uint8_t* const row, int width,
int xbits, uint32_t* const dst);
+// Must be called before calling any of the above methods.
+void VP8LEncDspInit(void);
+
//------------------------------------------------------------------------------
#ifdef __cplusplus
diff --git a/src/3rdparty/libwebp/src/dsp/lossless_enc.c b/src/3rdparty/libwebp/src/dsp/lossless_enc.c
new file mode 100644
index 0000000..2eafa3d
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/lossless_enc.c
@@ -0,0 +1,1215 @@
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Image transform methods for lossless encoder.
+//
+// Authors: Vikas Arora (vikaas.arora@gmail.com)
+// Jyrki Alakuijala (jyrki@google.com)
+// Urvang Joshi (urvang@google.com)
+
+#include "./dsp.h"
+
+#include <math.h>
+#include <stdlib.h>
+#include "../dec/vp8li.h"
+#include "../utils/endian_inl.h"
+#include "./lossless.h"
+#include "./yuv.h"
+
+#define MAX_DIFF_COST (1e30f)
+
+static const int kPredLowEffort = 11;
+static const uint32_t kMaskAlpha = 0xff000000;
+
+// lookup table for small values of log2(int)
+const float kLog2Table[LOG_LOOKUP_IDX_MAX] = {
+ 0.0000000000000000f, 0.0000000000000000f,
+ 1.0000000000000000f, 1.5849625007211560f,
+ 2.0000000000000000f, 2.3219280948873621f,
+ 2.5849625007211560f, 2.8073549220576041f,
+ 3.0000000000000000f, 3.1699250014423121f,
+ 3.3219280948873621f, 3.4594316186372973f,
+ 3.5849625007211560f, 3.7004397181410921f,
+ 3.8073549220576041f, 3.9068905956085187f,
+ 4.0000000000000000f, 4.0874628412503390f,
+ 4.1699250014423121f, 4.2479275134435852f,
+ 4.3219280948873626f, 4.3923174227787606f,
+ 4.4594316186372973f, 4.5235619560570130f,
+ 4.5849625007211560f, 4.6438561897747243f,
+ 4.7004397181410917f, 4.7548875021634682f,
+ 4.8073549220576037f, 4.8579809951275718f,
+ 4.9068905956085187f, 4.9541963103868749f,
+ 5.0000000000000000f, 5.0443941193584533f,
+ 5.0874628412503390f, 5.1292830169449663f,
+ 5.1699250014423121f, 5.2094533656289501f,
+ 5.2479275134435852f, 5.2854022188622487f,
+ 5.3219280948873626f, 5.3575520046180837f,
+ 5.3923174227787606f, 5.4262647547020979f,
+ 5.4594316186372973f, 5.4918530963296747f,
+ 5.5235619560570130f, 5.5545888516776376f,
+ 5.5849625007211560f, 5.6147098441152083f,
+ 5.6438561897747243f, 5.6724253419714951f,
+ 5.7004397181410917f, 5.7279204545631987f,
+ 5.7548875021634682f, 5.7813597135246599f,
+ 5.8073549220576037f, 5.8328900141647412f,
+ 5.8579809951275718f, 5.8826430493618415f,
+ 5.9068905956085187f, 5.9307373375628866f,
+ 5.9541963103868749f, 5.9772799234999167f,
+ 6.0000000000000000f, 6.0223678130284543f,
+ 6.0443941193584533f, 6.0660891904577720f,
+ 6.0874628412503390f, 6.1085244567781691f,
+ 6.1292830169449663f, 6.1497471195046822f,
+ 6.1699250014423121f, 6.1898245588800175f,
+ 6.2094533656289501f, 6.2288186904958804f,
+ 6.2479275134435852f, 6.2667865406949010f,
+ 6.2854022188622487f, 6.3037807481771030f,
+ 6.3219280948873626f, 6.3398500028846243f,
+ 6.3575520046180837f, 6.3750394313469245f,
+ 6.3923174227787606f, 6.4093909361377017f,
+ 6.4262647547020979f, 6.4429434958487279f,
+ 6.4594316186372973f, 6.4757334309663976f,
+ 6.4918530963296747f, 6.5077946401986963f,
+ 6.5235619560570130f, 6.5391588111080309f,
+ 6.5545888516776376f, 6.5698556083309478f,
+ 6.5849625007211560f, 6.5999128421871278f,
+ 6.6147098441152083f, 6.6293566200796094f,
+ 6.6438561897747243f, 6.6582114827517946f,
+ 6.6724253419714951f, 6.6865005271832185f,
+ 6.7004397181410917f, 6.7142455176661224f,
+ 6.7279204545631987f, 6.7414669864011464f,
+ 6.7548875021634682f, 6.7681843247769259f,
+ 6.7813597135246599f, 6.7944158663501061f,
+ 6.8073549220576037f, 6.8201789624151878f,
+ 6.8328900141647412f, 6.8454900509443747f,
+ 6.8579809951275718f, 6.8703647195834047f,
+ 6.8826430493618415f, 6.8948177633079437f,
+ 6.9068905956085187f, 6.9188632372745946f,
+ 6.9307373375628866f, 6.9425145053392398f,
+ 6.9541963103868749f, 6.9657842846620869f,
+ 6.9772799234999167f, 6.9886846867721654f,
+ 7.0000000000000000f, 7.0112272554232539f,
+ 7.0223678130284543f, 7.0334230015374501f,
+ 7.0443941193584533f, 7.0552824355011898f,
+ 7.0660891904577720f, 7.0768155970508308f,
+ 7.0874628412503390f, 7.0980320829605263f,
+ 7.1085244567781691f, 7.1189410727235076f,
+ 7.1292830169449663f, 7.1395513523987936f,
+ 7.1497471195046822f, 7.1598713367783890f,
+ 7.1699250014423121f, 7.1799090900149344f,
+ 7.1898245588800175f, 7.1996723448363644f,
+ 7.2094533656289501f, 7.2191685204621611f,
+ 7.2288186904958804f, 7.2384047393250785f,
+ 7.2479275134435852f, 7.2573878426926521f,
+ 7.2667865406949010f, 7.2761244052742375f,
+ 7.2854022188622487f, 7.2946207488916270f,
+ 7.3037807481771030f, 7.3128829552843557f,
+ 7.3219280948873626f, 7.3309168781146167f,
+ 7.3398500028846243f, 7.3487281542310771f,
+ 7.3575520046180837f, 7.3663222142458160f,
+ 7.3750394313469245f, 7.3837042924740519f,
+ 7.3923174227787606f, 7.4008794362821843f,
+ 7.4093909361377017f, 7.4178525148858982f,
+ 7.4262647547020979f, 7.4346282276367245f,
+ 7.4429434958487279f, 7.4512111118323289f,
+ 7.4594316186372973f, 7.4676055500829976f,
+ 7.4757334309663976f, 7.4838157772642563f,
+ 7.4918530963296747f, 7.4998458870832056f,
+ 7.5077946401986963f, 7.5156998382840427f,
+ 7.5235619560570130f, 7.5313814605163118f,
+ 7.5391588111080309f, 7.5468944598876364f,
+ 7.5545888516776376f, 7.5622424242210728f,
+ 7.5698556083309478f, 7.5774288280357486f,
+ 7.5849625007211560f, 7.5924570372680806f,
+ 7.5999128421871278f, 7.6073303137496104f,
+ 7.6147098441152083f, 7.6220518194563764f,
+ 7.6293566200796094f, 7.6366246205436487f,
+ 7.6438561897747243f, 7.6510516911789281f,
+ 7.6582114827517946f, 7.6653359171851764f,
+ 7.6724253419714951f, 7.6794800995054464f,
+ 7.6865005271832185f, 7.6934869574993252f,
+ 7.7004397181410917f, 7.7073591320808825f,
+ 7.7142455176661224f, 7.7210991887071855f,
+ 7.7279204545631987f, 7.7347096202258383f,
+ 7.7414669864011464f, 7.7481928495894605f,
+ 7.7548875021634682f, 7.7615512324444795f,
+ 7.7681843247769259f, 7.7747870596011736f,
+ 7.7813597135246599f, 7.7879025593914317f,
+ 7.7944158663501061f, 7.8008998999203047f,
+ 7.8073549220576037f, 7.8137811912170374f,
+ 7.8201789624151878f, 7.8265484872909150f,
+ 7.8328900141647412f, 7.8392037880969436f,
+ 7.8454900509443747f, 7.8517490414160571f,
+ 7.8579809951275718f, 7.8641861446542797f,
+ 7.8703647195834047f, 7.8765169465649993f,
+ 7.8826430493618415f, 7.8887432488982591f,
+ 7.8948177633079437f, 7.9008668079807486f,
+ 7.9068905956085187f, 7.9128893362299619f,
+ 7.9188632372745946f, 7.9248125036057812f,
+ 7.9307373375628866f, 7.9366379390025709f,
+ 7.9425145053392398f, 7.9483672315846778f,
+ 7.9541963103868749f, 7.9600019320680805f,
+ 7.9657842846620869f, 7.9715435539507719f,
+ 7.9772799234999167f, 7.9829935746943103f,
+ 7.9886846867721654f, 7.9943534368588577f
+};
+
+const float kSLog2Table[LOG_LOOKUP_IDX_MAX] = {
+ 0.00000000f, 0.00000000f, 2.00000000f, 4.75488750f,
+ 8.00000000f, 11.60964047f, 15.50977500f, 19.65148445f,
+ 24.00000000f, 28.52932501f, 33.21928095f, 38.05374781f,
+ 43.01955001f, 48.10571634f, 53.30296891f, 58.60335893f,
+ 64.00000000f, 69.48686830f, 75.05865003f, 80.71062276f,
+ 86.43856190f, 92.23866588f, 98.10749561f, 104.04192499f,
+ 110.03910002f, 116.09640474f, 122.21143267f, 128.38196256f,
+ 134.60593782f, 140.88144886f, 147.20671787f, 153.58008562f,
+ 160.00000000f, 166.46500594f, 172.97373660f, 179.52490559f,
+ 186.11730005f, 192.74977453f, 199.42124551f, 206.13068654f,
+ 212.87712380f, 219.65963219f, 226.47733176f, 233.32938445f,
+ 240.21499122f, 247.13338933f, 254.08384998f, 261.06567603f,
+ 268.07820003f, 275.12078236f, 282.19280949f, 289.29369244f,
+ 296.42286534f, 303.57978409f, 310.76392512f, 317.97478424f,
+ 325.21187564f, 332.47473081f, 339.76289772f, 347.07593991f,
+ 354.41343574f, 361.77497759f, 369.16017124f, 376.56863518f,
+ 384.00000000f, 391.45390785f, 398.93001188f, 406.42797576f,
+ 413.94747321f, 421.48818752f, 429.04981119f, 436.63204548f,
+ 444.23460010f, 451.85719280f, 459.49954906f, 467.16140179f,
+ 474.84249102f, 482.54256363f, 490.26137307f, 497.99867911f,
+ 505.75424759f, 513.52785023f, 521.31926438f, 529.12827280f,
+ 536.95466351f, 544.79822957f, 552.65876890f, 560.53608414f,
+ 568.42998244f, 576.34027536f, 584.26677867f, 592.20931226f,
+ 600.16769996f, 608.14176943f, 616.13135206f, 624.13628279f,
+ 632.15640007f, 640.19154569f, 648.24156472f, 656.30630539f,
+ 664.38561898f, 672.47935976f, 680.58738488f, 688.70955430f,
+ 696.84573069f, 704.99577935f, 713.15956818f, 721.33696754f,
+ 729.52785023f, 737.73209140f, 745.94956849f, 754.18016116f,
+ 762.42375127f, 770.68022275f, 778.94946161f, 787.23135586f,
+ 795.52579543f, 803.83267219f, 812.15187982f, 820.48331383f,
+ 828.82687147f, 837.18245171f, 845.54995518f, 853.92928416f,
+ 862.32034249f, 870.72303558f, 879.13727036f, 887.56295522f,
+ 896.00000000f, 904.44831595f, 912.90781569f, 921.37841320f,
+ 929.86002376f, 938.35256392f, 946.85595152f, 955.37010560f,
+ 963.89494641f, 972.43039537f, 980.97637504f, 989.53280911f,
+ 998.09962237f, 1006.67674069f, 1015.26409097f, 1023.86160116f,
+ 1032.46920021f, 1041.08681805f, 1049.71438560f, 1058.35183469f,
+ 1066.99909811f, 1075.65610955f, 1084.32280357f, 1092.99911564f,
+ 1101.68498204f, 1110.38033993f, 1119.08512727f, 1127.79928282f,
+ 1136.52274614f, 1145.25545758f, 1153.99735821f, 1162.74838989f,
+ 1171.50849518f, 1180.27761738f, 1189.05570047f, 1197.84268914f,
+ 1206.63852876f, 1215.44316535f, 1224.25654560f, 1233.07861684f,
+ 1241.90932703f, 1250.74862473f, 1259.59645914f, 1268.45278005f,
+ 1277.31753781f, 1286.19068338f, 1295.07216828f, 1303.96194457f,
+ 1312.85996488f, 1321.76618236f, 1330.68055071f, 1339.60302413f,
+ 1348.53355734f, 1357.47210556f, 1366.41862452f, 1375.37307041f,
+ 1384.33539991f, 1393.30557020f, 1402.28353887f, 1411.26926400f,
+ 1420.26270412f, 1429.26381818f, 1438.27256558f, 1447.28890615f,
+ 1456.31280014f, 1465.34420819f, 1474.38309138f, 1483.42941118f,
+ 1492.48312945f, 1501.54420843f, 1510.61261078f, 1519.68829949f,
+ 1528.77123795f, 1537.86138993f, 1546.95871952f, 1556.06319119f,
+ 1565.17476976f, 1574.29342040f, 1583.41910860f, 1592.55180020f,
+ 1601.69146137f, 1610.83805860f, 1619.99155871f, 1629.15192882f,
+ 1638.31913637f, 1647.49314911f, 1656.67393509f, 1665.86146266f,
+ 1675.05570047f, 1684.25661744f, 1693.46418280f, 1702.67836605f,
+ 1711.89913698f, 1721.12646563f, 1730.36032233f, 1739.60067768f,
+ 1748.84750254f, 1758.10076802f, 1767.36044551f, 1776.62650662f,
+ 1785.89892323f, 1795.17766747f, 1804.46271172f, 1813.75402857f,
+ 1823.05159087f, 1832.35537170f, 1841.66534438f, 1850.98148244f,
+ 1860.30375965f, 1869.63214999f, 1878.96662767f, 1888.30716711f,
+ 1897.65374295f, 1907.00633003f, 1916.36490342f, 1925.72943838f,
+ 1935.09991037f, 1944.47629506f, 1953.85856831f, 1963.24670620f,
+ 1972.64068498f, 1982.04048108f, 1991.44607117f, 2000.85743204f,
+ 2010.27454072f, 2019.69737440f, 2029.12591044f, 2038.56012640f
+};
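
The two tables above are linked: kSLog2Table[i] is i * log2(i), i.e. i * kLog2Table[i]. A minimal standalone check (not libwebp code; it assumes the tables above are linked in and uses the C99 log2() from <math.h>) makes the relationship explicit:

#include <math.h>
#include <stdio.h>

extern const float kLog2Table[256];
extern const float kSLog2Table[256];

int main(void) {
  int i;
  for (i = 1; i < 256; ++i) {   /* index 0 is 0 by convention in both */
    const double expected = i * log2((double)i);
    if (fabs(kSLog2Table[i] - expected) > 1e-2) {
      printf("mismatch at %d: %f vs %f\n", i, kSLog2Table[i], expected);
      return 1;
    }
  }
  printf("tables consistent\n");
  return 0;
}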
+
+const VP8LPrefixCode kPrefixEncodeCode[PREFIX_LOOKUP_IDX_MAX] = {
+ { 0, 0}, { 0, 0}, { 1, 0}, { 2, 0}, { 3, 0}, { 4, 1}, { 4, 1}, { 5, 1},
+ { 5, 1}, { 6, 2}, { 6, 2}, { 6, 2}, { 6, 2}, { 7, 2}, { 7, 2}, { 7, 2},
+ { 7, 2}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3},
+ { 8, 3}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3},
+ { 9, 3}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4},
+ {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4},
+ {10, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4},
+ {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4},
+ {11, 4}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5},
+ {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5},
+ {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5},
+ {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5},
+ {12, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5},
+ {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5},
+ {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5},
+ {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5},
+ {13, 5}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
+ {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
+ {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
+ {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
+ {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
+ {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
+ {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
+ {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
+ {14, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
+ {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
+ {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
+ {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
+ {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
+ {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
+ {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
+ {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
+ {15, 6}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+ {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+ {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+ {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+ {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+ {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+ {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+ {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+ {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+ {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+ {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+ {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+ {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+ {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+ {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+ {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+ {16, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+ {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+ {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+ {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+ {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+ {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+ {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+ {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+ {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+ {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+ {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+ {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+ {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+ {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+ {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+ {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+};
+
+const uint8_t kPrefixEncodeExtraBitsValue[PREFIX_LOOKUP_IDX_MAX] = {
+ 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 2, 3, 0, 1, 2, 3,
+ 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+ 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
+ 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
+ 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
+ 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126,
+ 127,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+ 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
+ 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
+ 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
+ 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126
+};
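
These two tables follow the prefix-coding rule of the WebP lossless bitstream: a value v (with d = v - 1) maps to code d directly when d < 4, and otherwise to a code built from the two highest bits of d, carrying (code - 2) >> 1 extra bits; index 0 simply duplicates index 1 as a guard. A sketch that reproduces one entry (illustrative only, not the library's encoder):

/* Rebuilds (code, extra_bits, extra_bits_value) for a value v >= 1. */
static void PrefixEncode(int v, int* code, int* extra_bits, int* extra_val) {
  const int d = v - 1;
  if (d < 4) {                      /* small values are coded directly */
    *code = d;
    *extra_bits = 0;
    *extra_val = 0;
  } else {
    int highest_bit = 0;
    int x = d;
    while (x >>= 1) ++highest_bit;  /* floor(log2(d)) */
    *extra_bits = highest_bit - 1;
    *code = 2 * highest_bit + ((d >> (highest_bit - 1)) & 1);
    *extra_val = d & ((1 << *extra_bits) - 1);
  }
}

For example, v = 100 gives d = 99 and highest_bit = 6, so code 13 with 5 extra bits and extra value 99 & 31 = 3 -- matching kPrefixEncodeCode[100] = {13, 5} and kPrefixEncodeExtraBitsValue[100] = 3 above.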
+
+static float FastSLog2Slow(uint32_t v) {
+ assert(v >= LOG_LOOKUP_IDX_MAX);
+ if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
+ int log_cnt = 0;
+ uint32_t y = 1;
+ int correction = 0;
+ const float v_f = (float)v;
+ const uint32_t orig_v = v;
+ do {
+ ++log_cnt;
+ v = v >> 1;
+ y = y << 1;
+ } while (v >= LOG_LOOKUP_IDX_MAX);
+ // vf = (2^log_cnt) * Xf; where y = 2^log_cnt and Xf < 256
+ // Xf = floor(Xf) * (1 + (v % y) / v)
+ // log2(Xf) = log2(floor(Xf)) + log2(1 + (v % y) / v)
+ // The correction factor: log(1 + d) ~ d for very small d values, so
+ // log2(1 + (v % y) / v) ~ LOG_2_RECIPROCAL * (v % y)/v
+ // LOG_2_RECIPROCAL ~ 23/16
+ correction = (23 * (orig_v & (y - 1))) >> 4;
+ return v_f * (kLog2Table[v] + log_cnt) + correction;
+ } else {
+ return (float)(LOG_2_RECIPROCAL * v * log((double)v));
+ }
+}
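
A worked instance of the corrected branch may help (numbers are illustrative):

/* FastSLog2Slow(301):
 *   301 >> 1 = 150 < LOG_LOOKUP_IDX_MAX, so log_cnt = 1, y = 2
 *   kLog2Table[150] + 1 ~= 7.2288 + 1 = 8.2288
 *   correction = (23 * (301 & 1)) >> 4 = 23 >> 4 = 1
 *   result ~= 301 * 8.2288 + 1 ~= 2477.9
 * Exact value: 301 * log2(301) ~= 2478.3, so the approximation is
 * within about 0.02%.
 */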
+
+static float FastLog2Slow(uint32_t v) {
+ assert(v >= LOG_LOOKUP_IDX_MAX);
+ if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
+ int log_cnt = 0;
+ uint32_t y = 1;
+ const uint32_t orig_v = v;
+ double log_2;
+ do {
+ ++log_cnt;
+ v = v >> 1;
+ y = y << 1;
+ } while (v >= LOG_LOOKUP_IDX_MAX);
+ log_2 = kLog2Table[v] + log_cnt;
+ if (orig_v >= APPROX_LOG_MAX) {
+ // Since the division is still expensive, add this correction factor only
+ // for large values of 'v'.
+ const int correction = (23 * (orig_v & (y - 1))) >> 4;
+ log_2 += (double)correction / orig_v;
+ }
+ return (float)log_2;
+ } else {
+ return (float)(LOG_2_RECIPROCAL * log((double)v));
+ }
+}
+
+// Mostly used to reduce code size and improve readability
+static WEBP_INLINE int GetMin(int a, int b) { return (a > b) ? b : a; }
+
+//------------------------------------------------------------------------------
+// Methods to calculate Entropy (Shannon).
+
+static float PredictionCostSpatial(const int counts[256], int weight_0,
+ double exp_val) {
+ const int significant_symbols = 256 >> 4;
+ const double exp_decay_factor = 0.6;
+ double bits = weight_0 * counts[0];
+ int i;
+ for (i = 1; i < significant_symbols; ++i) {
+ bits += exp_val * (counts[i] + counts[256 - i]);
+ exp_val *= exp_decay_factor;
+ }
+ return (float)(-0.1 * bits);
+}
+
+// Compute the combined Shannon entropy for the distributions {X} and {X+Y}
+static float CombinedShannonEntropy(const int X[256], const int Y[256]) {
+ int i;
+ double retval = 0.;
+ int sumX = 0, sumXY = 0;
+ for (i = 0; i < 256; ++i) {
+ const int x = X[i];
+ if (x != 0) {
+ const int xy = x + Y[i];
+ sumX += x;
+ retval -= VP8LFastSLog2(x);
+ sumXY += xy;
+ retval -= VP8LFastSLog2(xy);
+ } else if (Y[i] != 0) {
+ sumXY += Y[i];
+ retval -= VP8LFastSLog2(Y[i]);
+ }
+ }
+ retval += VP8LFastSLog2(sumX) + VP8LFastSLog2(sumXY);
+ return (float)retval;
+}
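
The identity behind this computation: for a histogram P with total S = sum_i p_i, the Shannon bit cost is

  sum_i p_i * log2(S / p_i) = S * log2(S) - sum_i p_i * log2(p_i)
                            = VP8LFastSLog2(S) - sum_i VP8LFastSLog2(p_i)

The function evaluates this for X and X + Y simultaneously in one pass, which is why it subtracts VP8LFastSLog2 of each nonzero bin and then adds VP8LFastSLog2(sumX) + VP8LFastSLog2(sumXY) at the end.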
+
+static float PredictionCostSpatialHistogram(const int accumulated[4][256],
+ const int tile[4][256]) {
+ int i;
+ double retval = 0;
+ for (i = 0; i < 4; ++i) {
+ const double kExpValue = 0.94;
+ retval += PredictionCostSpatial(tile[i], 1, kExpValue);
+ retval += VP8LCombinedShannonEntropy(tile[i], accumulated[i]);
+ }
+ return (float)retval;
+}
+
+void VP8LBitEntropyInit(VP8LBitEntropy* const entropy) {
+ entropy->entropy = 0.;
+ entropy->sum = 0;
+ entropy->nonzeros = 0;
+ entropy->max_val = 0;
+ entropy->nonzero_code = VP8L_NON_TRIVIAL_SYM;
+}
+
+void VP8LBitsEntropyUnrefined(const uint32_t* const array, int n,
+ VP8LBitEntropy* const entropy) {
+ int i;
+
+ VP8LBitEntropyInit(entropy);
+
+ for (i = 0; i < n; ++i) {
+ if (array[i] != 0) {
+ entropy->sum += array[i];
+ entropy->nonzero_code = i;
+ ++entropy->nonzeros;
+ entropy->entropy -= VP8LFastSLog2(array[i]);
+ if (entropy->max_val < array[i]) {
+ entropy->max_val = array[i];
+ }
+ }
+ }
+ entropy->entropy += VP8LFastSLog2(entropy->sum);
+}
+
+static WEBP_INLINE void GetEntropyUnrefinedHelper(
+ uint32_t val, int i, uint32_t* const val_prev, int* const i_prev,
+ VP8LBitEntropy* const bit_entropy, VP8LStreaks* const stats) {
+ const int streak = i - *i_prev;
+
+ // Gather info for the bit entropy.
+ if (*val_prev != 0) {
+ bit_entropy->sum += (*val_prev) * streak;
+ bit_entropy->nonzeros += streak;
+ bit_entropy->nonzero_code = *i_prev;
+ bit_entropy->entropy -= VP8LFastSLog2(*val_prev) * streak;
+ if (bit_entropy->max_val < *val_prev) {
+ bit_entropy->max_val = *val_prev;
+ }
+ }
+
+ // Gather info for the Huffman cost.
+ stats->counts[*val_prev != 0] += (streak > 3);
+ stats->streaks[*val_prev != 0][(streak > 3)] += streak;
+
+ *val_prev = val;
+ *i_prev = i;
+}
+
+void VP8LGetEntropyUnrefined(const uint32_t* const X, int length,
+ VP8LBitEntropy* const bit_entropy,
+ VP8LStreaks* const stats) {
+ int i;
+ int i_prev = 0;
+ uint32_t x_prev = X[0];
+
+ memset(stats, 0, sizeof(*stats));
+ VP8LBitEntropyInit(bit_entropy);
+
+ for (i = 1; i < length; ++i) {
+ const uint32_t x = X[i];
+ if (x != x_prev) {
+ VP8LGetEntropyUnrefinedHelper(x, i, &x_prev, &i_prev, bit_entropy, stats);
+ }
+ }
+ VP8LGetEntropyUnrefinedHelper(0, i, &x_prev, &i_prev, bit_entropy, stats);
+
+ bit_entropy->entropy += VP8LFastSLog2(bit_entropy->sum);
+}
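
Because the helper is only invoked when the bin value changes, runs of equal bins collapse into a single streak update. A minimal usage sketch (hypothetical data; it assumes this file's declarations from ./lossless.h, and VP8LEncDspInit() must run first so the VP8LGetEntropyUnrefinedHelper pointer is set):

#include "./lossless.h"

void EntropyDemo(void) {
  /* three zeros, a run of four 5s, then a single 9 */
  static const uint32_t histo[8] = { 0, 0, 0, 5, 5, 5, 5, 9 };
  VP8LBitEntropy be;
  VP8LStreaks st;
  VP8LEncDspInit();                       /* installs the helper pointer */
  VP8LGetEntropyUnrefined(histo, 8, &be, &st);
  /* Expected: be.sum == 29, be.nonzeros == 5, be.max_val == 9,
   * be.nonzero_code == 7, and st.streaks[1][1] == 4 because the run of
   * 5s is a non-zero streak longer than 3. */
}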
+
+void VP8LGetCombinedEntropyUnrefined(const uint32_t* const X,
+ const uint32_t* const Y, int length,
+ VP8LBitEntropy* const bit_entropy,
+ VP8LStreaks* const stats) {
+ int i;
+ int i_prev = 0;
+ uint32_t xy_prev = X[0] + Y[0];
+
+ memset(stats, 0, sizeof(*stats));
+ VP8LBitEntropyInit(bit_entropy);
+
+ for (i = 1; i < length; ++i) {
+ const uint32_t xy = X[i] + Y[i];
+ if (xy != xy_prev) {
+ VP8LGetEntropyUnrefinedHelper(xy, i, &xy_prev, &i_prev, bit_entropy,
+ stats);
+ }
+ }
+ VP8LGetEntropyUnrefinedHelper(0, i, &xy_prev, &i_prev, bit_entropy, stats);
+
+ bit_entropy->entropy += VP8LFastSLog2(bit_entropy->sum);
+}
+
+static WEBP_INLINE void UpdateHisto(int histo_argb[4][256], uint32_t argb) {
+ ++histo_argb[0][argb >> 24];
+ ++histo_argb[1][(argb >> 16) & 0xff];
+ ++histo_argb[2][(argb >> 8) & 0xff];
+ ++histo_argb[3][argb & 0xff];
+}
+
+//------------------------------------------------------------------------------
+
+static WEBP_INLINE uint32_t Predict(VP8LPredictorFunc pred_func,
+ int x, int y,
+ const uint32_t* current_row,
+ const uint32_t* upper_row) {
+ if (y == 0) {
+ return (x == 0) ? ARGB_BLACK : current_row[x - 1]; // Left.
+ } else if (x == 0) {
+ return upper_row[x]; // Top.
+ } else {
+ return pred_func(current_row[x - 1], upper_row + x);
+ }
+}
+
+// Returns best predictor and updates the accumulated histogram.
+static int GetBestPredictorForTile(int width, int height,
+ int tile_x, int tile_y, int bits,
+ int accumulated[4][256],
+ const uint32_t* const argb_scratch,
+ int exact) {
+ const int kNumPredModes = 14;
+ const int col_start = tile_x << bits;
+ const int row_start = tile_y << bits;
+ const int tile_size = 1 << bits;
+ const int max_y = GetMin(tile_size, height - row_start);
+ const int max_x = GetMin(tile_size, width - col_start);
+ float best_diff = MAX_DIFF_COST;
+ int best_mode = 0;
+ int mode;
+ int histo_stack_1[4][256];
+ int histo_stack_2[4][256];
+ // Need pointers to be able to swap arrays.
+ int (*histo_argb)[256] = histo_stack_1;
+ int (*best_histo)[256] = histo_stack_2;
+
+ int i, j;
+ for (mode = 0; mode < kNumPredModes; ++mode) {
+ const uint32_t* current_row = argb_scratch;
+ const VP8LPredictorFunc pred_func = VP8LPredictors[mode];
+ float cur_diff;
+ int y;
+ memset(histo_argb, 0, sizeof(histo_stack_1));
+ for (y = 0; y < max_y; ++y) {
+ int x;
+ const int row = row_start + y;
+ const uint32_t* const upper_row = current_row;
+ current_row = upper_row + width;
+ for (x = 0; x < max_x; ++x) {
+ const int col = col_start + x;
+ const uint32_t predict =
+ Predict(pred_func, col, row, current_row, upper_row);
+ uint32_t residual = VP8LSubPixels(current_row[col], predict);
+ if (!exact && (current_row[col] & kMaskAlpha) == 0) {
+ residual &= kMaskAlpha; // See CopyTileWithPrediction.
+ }
+ UpdateHisto(histo_argb, residual);
+ }
+ }
+ cur_diff = PredictionCostSpatialHistogram(
+ (const int (*)[256])accumulated, (const int (*)[256])histo_argb);
+ if (cur_diff < best_diff) {
+ int (*tmp)[256] = histo_argb;
+ histo_argb = best_histo;
+ best_histo = tmp;
+ best_diff = cur_diff;
+ best_mode = mode;
+ }
+ }
+
+ for (i = 0; i < 4; i++) {
+ for (j = 0; j < 256; j++) {
+ accumulated[i][j] += best_histo[i][j];
+ }
+ }
+
+ return best_mode;
+}
+
+static void CopyImageWithPrediction(int width, int height,
+ int bits, uint32_t* const modes,
+ uint32_t* const argb_scratch,
+ uint32_t* const argb,
+ int low_effort, int exact) {
+ const int tiles_per_row = VP8LSubSampleSize(width, bits);
+ const int mask = (1 << bits) - 1;
+ // The row size is one pixel longer to allow the top right pixel to point to
+ // the leftmost pixel of the next row when at the right edge.
+ uint32_t* current_row = argb_scratch;
+ uint32_t* upper_row = argb_scratch + width + 1;
+ int y;
+ VP8LPredictorFunc pred_func =
+ low_effort ? VP8LPredictors[kPredLowEffort] : NULL;
+
+ for (y = 0; y < height; ++y) {
+ int x;
+ uint32_t* tmp = upper_row;
+ upper_row = current_row;
+ current_row = tmp;
+ memcpy(current_row, argb + y * width, sizeof(*current_row) * width);
+ current_row[width] = (y + 1 < height) ? argb[(y + 1) * width] : ARGB_BLACK;
+
+ if (low_effort) {
+ for (x = 0; x < width; ++x) {
+ const uint32_t predict =
+ Predict(pred_func, x, y, current_row, upper_row);
+ argb[y * width + x] = VP8LSubPixels(current_row[x], predict);
+ }
+ } else {
+ for (x = 0; x < width; ++x) {
+ uint32_t predict, residual;
+ if ((x & mask) == 0) {
+ const int mode =
+ (modes[(y >> bits) * tiles_per_row + (x >> bits)] >> 8) & 0xff;
+ pred_func = VP8LPredictors[mode];
+ }
+ predict = Predict(pred_func, x, y, current_row, upper_row);
+ residual = VP8LSubPixels(current_row[x], predict);
+ if (!exact && (current_row[x] & kMaskAlpha) == 0) {
+ // If alpha is 0, clean up RGB. We can choose the RGB values of the
+ // residual freely for best compression; the prediction of alpha itself
+ // can be non-zero and must be kept, though. We choose the RGB of the
+ // residual to be 0.
+ residual &= kMaskAlpha;
+ // Update input image so that next predictions use correct RGB value.
+ current_row[x] = predict & ~kMaskAlpha;
+ if (x == 0 && y != 0) upper_row[width] = current_row[x];
+ }
+ argb[y * width + x] = residual;
+ }
+ }
+ }
+}
+
+void VP8LResidualImage(int width, int height, int bits, int low_effort,
+ uint32_t* const argb, uint32_t* const argb_scratch,
+ uint32_t* const image, int exact) {
+ const int max_tile_size = 1 << bits;
+ const int tiles_per_row = VP8LSubSampleSize(width, bits);
+ const int tiles_per_col = VP8LSubSampleSize(height, bits);
+ uint32_t* const upper_row = argb_scratch;
+ uint32_t* const current_tile_rows = argb_scratch + width;
+ int tile_y;
+ int histo[4][256];
+ if (low_effort) {
+ int i;
+ for (i = 0; i < tiles_per_row * tiles_per_col; ++i) {
+ image[i] = ARGB_BLACK | (kPredLowEffort << 8);
+ }
+ } else {
+ memset(histo, 0, sizeof(histo));
+ for (tile_y = 0; tile_y < tiles_per_col; ++tile_y) {
+ const int tile_y_offset = tile_y * max_tile_size;
+ const int this_tile_height =
+ (tile_y < tiles_per_col - 1) ? max_tile_size : height - tile_y_offset;
+ int tile_x;
+ if (tile_y > 0) {
+ memcpy(upper_row, current_tile_rows + (max_tile_size - 1) * width,
+ width * sizeof(*upper_row));
+ }
+ memcpy(current_tile_rows, &argb[tile_y_offset * width],
+ this_tile_height * width * sizeof(*current_tile_rows));
+ for (tile_x = 0; tile_x < tiles_per_row; ++tile_x) {
+ const int pred = GetBestPredictorForTile(width, height, tile_x, tile_y,
+ bits, (int (*)[256])histo, argb_scratch, exact);
+ image[tile_y * tiles_per_row + tile_x] = ARGB_BLACK | (pred << 8);
+ }
+ }
+ }
+
+ CopyImageWithPrediction(width, height, bits,
+ image, argb_scratch, argb, low_effort, exact);
+}
+
+void VP8LSubtractGreenFromBlueAndRed_C(uint32_t* argb_data, int num_pixels) {
+ int i;
+ for (i = 0; i < num_pixels; ++i) {
+ const uint32_t argb = argb_data[i];
+ const uint32_t green = (argb >> 8) & 0xff;
+ const uint32_t new_r = (((argb >> 16) & 0xff) - green) & 0xff;
+ const uint32_t new_b = ((argb & 0xff) - green) & 0xff;
+ argb_data[i] = (argb & 0xff00ff00) | (new_r << 16) | new_b;
+ }
+}
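
A worked pixel (illustrative) shows that the transform is exactly invertible mod 256:

/* argb = 0xff8040c0  (a=0xff, r=0x80, g=0x40, b=0xc0)
 *   new_r = (0x80 - 0x40) & 0xff = 0x40
 *   new_b = (0xc0 - 0x40) & 0xff = 0x80
 *   result = 0xff404080
 * Alpha and green are untouched; the decoder adds green back to red and
 * blue, restoring 0xff8040c0 exactly. */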
+
+static WEBP_INLINE void MultipliersClear(VP8LMultipliers* const m) {
+ m->green_to_red_ = 0;
+ m->green_to_blue_ = 0;
+ m->red_to_blue_ = 0;
+}
+
+static WEBP_INLINE uint32_t ColorTransformDelta(int8_t color_pred,
+ int8_t color) {
+ return (uint32_t)((int)(color_pred) * color) >> 5;
+}
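
The multiplier is a signed 3.5-bit fixed-point value, so 32 represents 1.0; both arguments are deliberately int8_t, so channel values are reduced to 8 bits and interpreted as signed, as the format requires. A worked delta (illustrative):

/* green_to_red_ = 32 (+1.0 in 3.5 fixed point), green = 0x30 (48):
 *   delta = (32 * 48) >> 5 = 48, i.e. one full green value.
 * green_to_red_ = 16 (+0.5): delta = (16 * 48) >> 5 = 24, half of green. */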
+
+static WEBP_INLINE void ColorCodeToMultipliers(uint32_t color_code,
+ VP8LMultipliers* const m) {
+ m->green_to_red_ = (color_code >> 0) & 0xff;
+ m->green_to_blue_ = (color_code >> 8) & 0xff;
+ m->red_to_blue_ = (color_code >> 16) & 0xff;
+}
+
+static WEBP_INLINE uint32_t MultipliersToColorCode(
+ const VP8LMultipliers* const m) {
+ return 0xff000000u |
+ ((uint32_t)(m->red_to_blue_) << 16) |
+ ((uint32_t)(m->green_to_blue_) << 8) |
+ m->green_to_red_;
+}
+
+void VP8LTransformColor_C(const VP8LMultipliers* const m, uint32_t* data,
+ int num_pixels) {
+ int i;
+ for (i = 0; i < num_pixels; ++i) {
+ const uint32_t argb = data[i];
+ const uint32_t green = argb >> 8;
+ const uint32_t red = argb >> 16;
+ uint32_t new_red = red;
+ uint32_t new_blue = argb;
+ new_red -= ColorTransformDelta(m->green_to_red_, green);
+ new_red &= 0xff;
+ new_blue -= ColorTransformDelta(m->green_to_blue_, green);
+ new_blue -= ColorTransformDelta(m->red_to_blue_, red);
+ new_blue &= 0xff;
+ data[i] = (argb & 0xff00ff00u) | (new_red << 16) | (new_blue);
+ }
+}
+
+static WEBP_INLINE uint8_t TransformColorRed(uint8_t green_to_red,
+ uint32_t argb) {
+ const uint32_t green = argb >> 8;
+ uint32_t new_red = argb >> 16;
+ new_red -= ColorTransformDelta(green_to_red, green);
+ return (new_red & 0xff);
+}
+
+static WEBP_INLINE uint8_t TransformColorBlue(uint8_t green_to_blue,
+ uint8_t red_to_blue,
+ uint32_t argb) {
+ const uint32_t green = argb >> 8;
+ const uint32_t red = argb >> 16;
+ uint8_t new_blue = argb;
+ new_blue -= ColorTransformDelta(green_to_blue, green);
+ new_blue -= ColorTransformDelta(red_to_blue, red);
+ return (new_blue & 0xff);
+}
+
+static float PredictionCostCrossColor(const int accumulated[256],
+ const int counts[256]) {
+ // Favor low entropy, locally and globally.
+ // Favor small absolute values for PredictionCostSpatial
+ static const double kExpValue = 2.4;
+ return VP8LCombinedShannonEntropy(counts, accumulated) +
+ PredictionCostSpatial(counts, 3, kExpValue);
+}
+
+void VP8LCollectColorRedTransforms_C(const uint32_t* argb, int stride,
+ int tile_width, int tile_height,
+ int green_to_red, int histo[]) {
+ while (tile_height-- > 0) {
+ int x;
+ for (x = 0; x < tile_width; ++x) {
+ ++histo[TransformColorRed(green_to_red, argb[x])];
+ }
+ argb += stride;
+ }
+}
+
+static float GetPredictionCostCrossColorRed(
+ const uint32_t* argb, int stride, int tile_width, int tile_height,
+ VP8LMultipliers prev_x, VP8LMultipliers prev_y, int green_to_red,
+ const int accumulated_red_histo[256]) {
+ int histo[256] = { 0 };
+ float cur_diff;
+
+ VP8LCollectColorRedTransforms(argb, stride, tile_width, tile_height,
+ green_to_red, histo);
+
+ cur_diff = PredictionCostCrossColor(accumulated_red_histo, histo);
+ if ((uint8_t)green_to_red == prev_x.green_to_red_) {
+ cur_diff -= 3; // favor keeping the areas locally similar
+ }
+ if ((uint8_t)green_to_red == prev_y.green_to_red_) {
+ cur_diff -= 3; // favor keeping the areas locally similar
+ }
+ if (green_to_red == 0) {
+ cur_diff -= 3;
+ }
+ return cur_diff;
+}
+
+static void GetBestGreenToRed(
+ const uint32_t* argb, int stride, int tile_width, int tile_height,
+ VP8LMultipliers prev_x, VP8LMultipliers prev_y, int quality,
+ const int accumulated_red_histo[256], VP8LMultipliers* const best_tx) {
+ const int kMaxIters = 4 + ((7 * quality) >> 8); // in range [4..6]
+ int green_to_red_best = 0;
+ int iter, offset;
+ float best_diff = GetPredictionCostCrossColorRed(
+ argb, stride, tile_width, tile_height, prev_x, prev_y,
+ green_to_red_best, accumulated_red_histo);
+ for (iter = 0; iter < kMaxIters; ++iter) {
+ // ColorTransformDelta is a 3.5-bit fixed point, so 32 is equal to
+ // one in the color computation. Starting with a delta of 1.0 (i.e. 32)
+ // is sufficient to explore the range of (-2, 2).
+ const int delta = 32 >> iter;
+ // Try a negative and a positive delta from the best known value.
+ for (offset = -delta; offset <= delta; offset += 2 * delta) {
+ const int green_to_red_cur = offset + green_to_red_best;
+ const float cur_diff = GetPredictionCostCrossColorRed(
+ argb, stride, tile_width, tile_height, prev_x, prev_y,
+ green_to_red_cur, accumulated_red_histo);
+ if (cur_diff < best_diff) {
+ best_diff = cur_diff;
+ green_to_red_best = green_to_red_cur;
+ }
+ }
+ }
+ best_tx->green_to_red_ = green_to_red_best;
+}
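
The probing schedule, written out (illustrative):

/* With kMaxIters = 4, the offsets tried around the running best are:
 *   iter 0: +/- 32, iter 1: +/- 16, iter 2: +/- 8, iter 3: +/- 4
 * Farthest reachable offset: 32 + 16 + 8 + 4 = 60, i.e. 60/32 = 1.875 in
 * 3.5 fixed point -- hence the (-2, 2) range mentioned in the comment. */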
+
+void VP8LCollectColorBlueTransforms_C(const uint32_t* argb, int stride,
+ int tile_width, int tile_height,
+ int green_to_blue, int red_to_blue,
+ int histo[]) {
+ while (tile_height-- > 0) {
+ int x;
+ for (x = 0; x < tile_width; ++x) {
+ ++histo[TransformColorBlue(green_to_blue, red_to_blue, argb[x])];
+ }
+ argb += stride;
+ }
+}
+
+static float GetPredictionCostCrossColorBlue(
+ const uint32_t* argb, int stride, int tile_width, int tile_height,
+ VP8LMultipliers prev_x, VP8LMultipliers prev_y,
+ int green_to_blue, int red_to_blue, const int accumulated_blue_histo[256]) {
+ int histo[256] = { 0 };
+ float cur_diff;
+
+ VP8LCollectColorBlueTransforms(argb, stride, tile_width, tile_height,
+ green_to_blue, red_to_blue, histo);
+
+ cur_diff = PredictionCostCrossColor(accumulated_blue_histo, histo);
+ if ((uint8_t)green_to_blue == prev_x.green_to_blue_) {
+ cur_diff -= 3; // favor keeping the areas locally similar
+ }
+ if ((uint8_t)green_to_blue == prev_y.green_to_blue_) {
+ cur_diff -= 3; // favor keeping the areas locally similar
+ }
+ if ((uint8_t)red_to_blue == prev_x.red_to_blue_) {
+ cur_diff -= 3; // favor keeping the areas locally similar
+ }
+ if ((uint8_t)red_to_blue == prev_y.red_to_blue_) {
+ cur_diff -= 3; // favor keeping the areas locally similar
+ }
+ if (green_to_blue == 0) {
+ cur_diff -= 3;
+ }
+ if (red_to_blue == 0) {
+ cur_diff -= 3;
+ }
+ return cur_diff;
+}
+
+#define kGreenRedToBlueNumAxis 8
+#define kGreenRedToBlueMaxIters 7
+static void GetBestGreenRedToBlue(
+ const uint32_t* argb, int stride, int tile_width, int tile_height,
+ VP8LMultipliers prev_x, VP8LMultipliers prev_y, int quality,
+ const int accumulated_blue_histo[256],
+ VP8LMultipliers* const best_tx) {
+ const int8_t offset[kGreenRedToBlueNumAxis][2] =
+ {{0, -1}, {0, 1}, {-1, 0}, {1, 0}, {-1, -1}, {-1, 1}, {1, -1}, {1, 1}};
+ const int8_t delta_lut[kGreenRedToBlueMaxIters] = { 16, 16, 8, 4, 2, 2, 2 };
+ const int iters =
+ (quality < 25) ? 1 : (quality > 50) ? kGreenRedToBlueMaxIters : 4;
+ int green_to_blue_best = 0;
+ int red_to_blue_best = 0;
+ int iter;
+ // Initial value at origin:
+ float best_diff = GetPredictionCostCrossColorBlue(
+ argb, stride, tile_width, tile_height, prev_x, prev_y,
+ green_to_blue_best, red_to_blue_best, accumulated_blue_histo);
+ for (iter = 0; iter < iters; ++iter) {
+ const int delta = delta_lut[iter];
+ int axis;
+ for (axis = 0; axis < kGreenRedToBlueNumAxis; ++axis) {
+ const int green_to_blue_cur =
+ offset[axis][0] * delta + green_to_blue_best;
+ const int red_to_blue_cur = offset[axis][1] * delta + red_to_blue_best;
+ const float cur_diff = GetPredictionCostCrossColorBlue(
+ argb, stride, tile_width, tile_height, prev_x, prev_y,
+ green_to_blue_cur, red_to_blue_cur, accumulated_blue_histo);
+ if (cur_diff < best_diff) {
+ best_diff = cur_diff;
+ green_to_blue_best = green_to_blue_cur;
+ red_to_blue_best = red_to_blue_cur;
+ }
+ if (quality < 25 && iter == 4) {
+ // Only axis aligned diffs for lower quality.
+ break; // next iter.
+ }
+ }
+ if (delta == 2 && green_to_blue_best == 0 && red_to_blue_best == 0) {
+ // Further iterations would not help.
+ break; // out of iter-loop.
+ }
+ }
+ best_tx->green_to_blue_ = green_to_blue_best;
+ best_tx->red_to_blue_ = red_to_blue_best;
+}
+#undef kGreenRedToBlueMaxIters
+#undef kGreenRedToBlueNumAxis
+
+static VP8LMultipliers GetBestColorTransformForTile(
+ int tile_x, int tile_y, int bits,
+ VP8LMultipliers prev_x,
+ VP8LMultipliers prev_y,
+ int quality, int xsize, int ysize,
+ const int accumulated_red_histo[256],
+ const int accumulated_blue_histo[256],
+ const uint32_t* const argb) {
+ const int max_tile_size = 1 << bits;
+ const int tile_y_offset = tile_y * max_tile_size;
+ const int tile_x_offset = tile_x * max_tile_size;
+ const int all_x_max = GetMin(tile_x_offset + max_tile_size, xsize);
+ const int all_y_max = GetMin(tile_y_offset + max_tile_size, ysize);
+ const int tile_width = all_x_max - tile_x_offset;
+ const int tile_height = all_y_max - tile_y_offset;
+ const uint32_t* const tile_argb = argb + tile_y_offset * xsize
+ + tile_x_offset;
+ VP8LMultipliers best_tx;
+ MultipliersClear(&best_tx);
+
+ GetBestGreenToRed(tile_argb, xsize, tile_width, tile_height,
+ prev_x, prev_y, quality, accumulated_red_histo, &best_tx);
+ GetBestGreenRedToBlue(tile_argb, xsize, tile_width, tile_height,
+ prev_x, prev_y, quality, accumulated_blue_histo,
+ &best_tx);
+ return best_tx;
+}
+
+static void CopyTileWithColorTransform(int xsize, int ysize,
+ int tile_x, int tile_y,
+ int max_tile_size,
+ VP8LMultipliers color_transform,
+ uint32_t* argb) {
+ const int xscan = GetMin(max_tile_size, xsize - tile_x);
+ int yscan = GetMin(max_tile_size, ysize - tile_y);
+ argb += tile_y * xsize + tile_x;
+ while (yscan-- > 0) {
+ VP8LTransformColor(&color_transform, argb, xscan);
+ argb += xsize;
+ }
+}
+
+void VP8LColorSpaceTransform(int width, int height, int bits, int quality,
+ uint32_t* const argb, uint32_t* image) {
+ const int max_tile_size = 1 << bits;
+ const int tile_xsize = VP8LSubSampleSize(width, bits);
+ const int tile_ysize = VP8LSubSampleSize(height, bits);
+ int accumulated_red_histo[256] = { 0 };
+ int accumulated_blue_histo[256] = { 0 };
+ int tile_x, tile_y;
+ VP8LMultipliers prev_x, prev_y;
+ MultipliersClear(&prev_y);
+ MultipliersClear(&prev_x);
+ for (tile_y = 0; tile_y < tile_ysize; ++tile_y) {
+ for (tile_x = 0; tile_x < tile_xsize; ++tile_x) {
+ int y;
+ const int tile_x_offset = tile_x * max_tile_size;
+ const int tile_y_offset = tile_y * max_tile_size;
+ const int all_x_max = GetMin(tile_x_offset + max_tile_size, width);
+ const int all_y_max = GetMin(tile_y_offset + max_tile_size, height);
+ const int offset = tile_y * tile_xsize + tile_x;
+ if (tile_y != 0) {
+ ColorCodeToMultipliers(image[offset - tile_xsize], &prev_y);
+ }
+ prev_x = GetBestColorTransformForTile(tile_x, tile_y, bits,
+ prev_x, prev_y,
+ quality, width, height,
+ accumulated_red_histo,
+ accumulated_blue_histo,
+ argb);
+ image[offset] = MultipliersToColorCode(&prev_x);
+ CopyTileWithColorTransform(width, height, tile_x_offset, tile_y_offset,
+ max_tile_size, prev_x, argb);
+
+ // Gather accumulated histogram data.
+ for (y = tile_y_offset; y < all_y_max; ++y) {
+ int ix = y * width + tile_x_offset;
+ const int ix_end = ix + all_x_max - tile_x_offset;
+ for (; ix < ix_end; ++ix) {
+ const uint32_t pix = argb[ix];
+ if (ix >= 2 &&
+ pix == argb[ix - 2] &&
+ pix == argb[ix - 1]) {
+ continue; // repeated pixels are handled by backward references
+ }
+ if (ix >= width + 2 &&
+ argb[ix - 2] == argb[ix - width - 2] &&
+ argb[ix - 1] == argb[ix - width - 1] &&
+ pix == argb[ix - width]) {
+ continue; // repeated pixels are handled by backward references
+ }
+ ++accumulated_red_histo[(pix >> 16) & 0xff];
+ ++accumulated_blue_histo[(pix >> 0) & 0xff];
+ }
+ }
+ }
+ }
+}
+
+//------------------------------------------------------------------------------
+// Bundles multiple (1, 2, 4 or 8) pixels into a single pixel.
+void VP8LBundleColorMap(const uint8_t* const row, int width,
+ int xbits, uint32_t* const dst) {
+ int x;
+ if (xbits > 0) {
+ const int bit_depth = 1 << (3 - xbits);
+ const int mask = (1 << xbits) - 1;
+ uint32_t code = 0xff000000;
+ for (x = 0; x < width; ++x) {
+ const int xsub = x & mask;
+ if (xsub == 0) {
+ code = 0xff000000;
+ }
+ code |= row[x] << (8 + bit_depth * xsub);
+ dst[x >> xbits] = code;
+ }
+ } else {
+ for (x = 0; x < width; ++x) dst[x] = 0xff000000 | (row[x] << 8);
+ }
+}
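
A worked bundle (illustrative): with xbits = 2, each output pixel carries four palette indices of at most 2 bits each, packed into the green channel:

/* row = {1, 2, 3, 0}, xbits = 2 (bit_depth = 2, mask = 3):
 *   dst[0] = 0xff000000 | (1 << 8) | (2 << 10) | (3 << 12) | (0 << 14)
 *          = 0xff003900
 * With xbits = 0, each index simply occupies the green channel of its
 * own output pixel. */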
+
+//------------------------------------------------------------------------------
+
+static double ExtraCost(const uint32_t* population, int length) {
+ int i;
+ double cost = 0.;
+ for (i = 2; i < length - 2; ++i) cost += (i >> 1) * population[i + 2];
+ return cost;
+}
+
+static double ExtraCostCombined(const uint32_t* X, const uint32_t* Y,
+ int length) {
+ int i;
+ double cost = 0.;
+ for (i = 2; i < length - 2; ++i) {
+ const int xy = X[i + 2] + Y[i + 2];
+ cost += (i >> 1) * xy;
+ }
+ return cost;
+}
+
+//------------------------------------------------------------------------------
+
+static void HistogramAdd(const VP8LHistogram* const a,
+ const VP8LHistogram* const b,
+ VP8LHistogram* const out) {
+ int i;
+ const int literal_size = VP8LHistogramNumCodes(a->palette_code_bits_);
+ assert(a->palette_code_bits_ == b->palette_code_bits_);
+ if (b != out) {
+ for (i = 0; i < literal_size; ++i) {
+ out->literal_[i] = a->literal_[i] + b->literal_[i];
+ }
+ for (i = 0; i < NUM_DISTANCE_CODES; ++i) {
+ out->distance_[i] = a->distance_[i] + b->distance_[i];
+ }
+ for (i = 0; i < NUM_LITERAL_CODES; ++i) {
+ out->red_[i] = a->red_[i] + b->red_[i];
+ out->blue_[i] = a->blue_[i] + b->blue_[i];
+ out->alpha_[i] = a->alpha_[i] + b->alpha_[i];
+ }
+ } else {
+ for (i = 0; i < literal_size; ++i) {
+ out->literal_[i] += a->literal_[i];
+ }
+ for (i = 0; i < NUM_DISTANCE_CODES; ++i) {
+ out->distance_[i] += a->distance_[i];
+ }
+ for (i = 0; i < NUM_LITERAL_CODES; ++i) {
+ out->red_[i] += a->red_[i];
+ out->blue_[i] += a->blue_[i];
+ out->alpha_[i] += a->alpha_[i];
+ }
+ }
+}
+
+//------------------------------------------------------------------------------
+
+VP8LProcessBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed;
+
+VP8LTransformColorFunc VP8LTransformColor;
+
+VP8LCollectColorBlueTransformsFunc VP8LCollectColorBlueTransforms;
+VP8LCollectColorRedTransformsFunc VP8LCollectColorRedTransforms;
+
+VP8LFastLog2SlowFunc VP8LFastLog2Slow;
+VP8LFastLog2SlowFunc VP8LFastSLog2Slow;
+
+VP8LCostFunc VP8LExtraCost;
+VP8LCostCombinedFunc VP8LExtraCostCombined;
+VP8LCombinedShannonEntropyFunc VP8LCombinedShannonEntropy;
+
+GetEntropyUnrefinedHelperFunc VP8LGetEntropyUnrefinedHelper;
+
+VP8LHistogramAddFunc VP8LHistogramAdd;
+
+extern void VP8LEncDspInitSSE2(void);
+extern void VP8LEncDspInitSSE41(void);
+extern void VP8LEncDspInitNEON(void);
+extern void VP8LEncDspInitMIPS32(void);
+extern void VP8LEncDspInitMIPSdspR2(void);
+
+static volatile VP8CPUInfo lossless_enc_last_cpuinfo_used =
+ (VP8CPUInfo)&lossless_enc_last_cpuinfo_used;
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInit(void) {
+ if (lossless_enc_last_cpuinfo_used == VP8GetCPUInfo) return;
+
+ VP8LDspInit();
+
+ VP8LSubtractGreenFromBlueAndRed = VP8LSubtractGreenFromBlueAndRed_C;
+
+ VP8LTransformColor = VP8LTransformColor_C;
+
+ VP8LCollectColorBlueTransforms = VP8LCollectColorBlueTransforms_C;
+ VP8LCollectColorRedTransforms = VP8LCollectColorRedTransforms_C;
+
+ VP8LFastLog2Slow = FastLog2Slow;
+ VP8LFastSLog2Slow = FastSLog2Slow;
+
+ VP8LExtraCost = ExtraCost;
+ VP8LExtraCostCombined = ExtraCostCombined;
+ VP8LCombinedShannonEntropy = CombinedShannonEntropy;
+
+ VP8LGetEntropyUnrefinedHelper = GetEntropyUnrefinedHelper;
+
+ VP8LHistogramAdd = HistogramAdd;
+
+ // If defined, use CPUInfo() to overwrite some pointers with faster versions.
+ if (VP8GetCPUInfo != NULL) {
+#if defined(WEBP_USE_SSE2)
+ if (VP8GetCPUInfo(kSSE2)) {
+ VP8LEncDspInitSSE2();
+#if defined(WEBP_USE_SSE41)
+ if (VP8GetCPUInfo(kSSE4_1)) {
+ VP8LEncDspInitSSE41();
+ }
+#endif
+ }
+#endif
+#if defined(WEBP_USE_NEON)
+ if (VP8GetCPUInfo(kNEON)) {
+ VP8LEncDspInitNEON();
+ }
+#endif
+#if defined(WEBP_USE_MIPS32)
+ if (VP8GetCPUInfo(kMIPS32)) {
+ VP8LEncDspInitMIPS32();
+ }
+#endif
+#if defined(WEBP_USE_MIPS_DSP_R2)
+ if (VP8GetCPUInfo(kMIPSdspR2)) {
+ VP8LEncDspInitMIPSdspR2();
+ }
+#endif
+ }
+ lossless_enc_last_cpuinfo_used = VP8GetCPUInfo;
+}
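
Callers are expected to run the init once before touching any of the exported function pointers; repeated calls return early thanks to the cpuinfo guard. A usage sketch (hypothetical caller, not libwebp code):

#include "./lossless.h"

void EncoderSetup(uint32_t* argb, int num_pixels) {
  VP8LEncDspInit();  /* idempotent; picks SSE2/SSE4.1, NEON or MIPS paths */
  /* The pointers now target the fastest available implementation, with
   * the _C versions in this file as the fallback. */
  VP8LSubtractGreenFromBlueAndRed(argb, num_pixels);
}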
+
+//------------------------------------------------------------------------------
diff --git a/src/3rdparty/libwebp/src/dsp/lossless_mips32.c b/src/3rdparty/libwebp/src/dsp/lossless_enc_mips32.c
index 5562c41..49c666d 100644
--- a/src/3rdparty/libwebp/src/dsp/lossless_mips32.c
+++ b/src/3rdparty/libwebp/src/dsp/lossless_enc_mips32.c
@@ -1,4 +1,4 @@
-// Copyright 2014 Google Inc. All Rights Reserved.
+// Copyright 2015 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
@@ -22,10 +22,6 @@
#include <stdlib.h>
#include <string.h>
-#define APPROX_LOG_WITH_CORRECTION_MAX 65536
-#define APPROX_LOG_MAX 4096
-#define LOG_2_RECIPROCAL 1.44269504088896338700465094007086
-
static float FastSLog2Slow(uint32_t v) {
assert(v >= LOG_LOOKUP_IDX_MAX);
if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
@@ -217,58 +213,31 @@ static double ExtraCostCombined(const uint32_t* const X,
);
// Returns the various RLE counts
-static VP8LStreaks HuffmanCostCount(const uint32_t* population, int length) {
- int i;
- int streak = 0;
- VP8LStreaks stats;
- int* const pstreaks = &stats.streaks[0][0];
- int* const pcnts = &stats.counts[0];
+static WEBP_INLINE void GetEntropyUnrefinedHelper(
+ uint32_t val, int i, uint32_t* const val_prev, int* const i_prev,
+ VP8LBitEntropy* const bit_entropy, VP8LStreaks* const stats) {
+ int* const pstreaks = &stats->streaks[0][0];
+ int* const pcnts = &stats->counts[0];
int temp0, temp1, temp2, temp3;
- memset(&stats, 0, sizeof(stats));
- for (i = 0; i < length - 1; ++i) {
- ++streak;
- if (population[i] == population[i + 1]) {
- continue;
+ const int streak = i - *i_prev;
+
+ // Gather info for the bit entropy.
+ if (*val_prev != 0) {
+ bit_entropy->sum += (*val_prev) * streak;
+ bit_entropy->nonzeros += streak;
+ bit_entropy->nonzero_code = *i_prev;
+ bit_entropy->entropy -= VP8LFastSLog2(*val_prev) * streak;
+ if (bit_entropy->max_val < *val_prev) {
+ bit_entropy->max_val = *val_prev;
}
- temp0 = (population[i] != 0);
- HUFFMAN_COST_PASS
- streak = 0;
}
- ++streak;
- temp0 = (population[i] != 0);
- HUFFMAN_COST_PASS
- return stats;
-}
-
-static VP8LStreaks HuffmanCostCombinedCount(const uint32_t* X,
- const uint32_t* Y, int length) {
- int i;
- int streak = 0;
- VP8LStreaks stats;
- int* const pstreaks = &stats.streaks[0][0];
- int* const pcnts = &stats.counts[0];
- int temp0, temp1, temp2, temp3;
- memset(&stats, 0, sizeof(stats));
- for (i = 0; i < length - 1; ++i) {
- const uint32_t xy = X[i] + Y[i];
- const uint32_t xy_next = X[i + 1] + Y[i + 1];
- ++streak;
- if (xy == xy_next) {
- continue;
- }
- temp0 = (xy != 0);
- HUFFMAN_COST_PASS
- streak = 0;
- }
- {
- const uint32_t xy = X[i] + Y[i];
- ++streak;
- temp0 = (xy != 0);
- HUFFMAN_COST_PASS
- }
+ // Gather info for the Huffman cost.
+ temp0 = (*val_prev != 0);
+ HUFFMAN_COST_PASS
- return stats;
+ *val_prev = val;
+ *i_prev = i;
}
#define ASM_START \
@@ -396,21 +365,22 @@ static void HistogramAdd(const VP8LHistogram* const a,
#undef ADD_TO_OUT
#undef ASM_START
-#endif // WEBP_USE_MIPS32
-
//------------------------------------------------------------------------------
// Entry point
-extern void VP8LDspInitMIPS32(void);
+extern void VP8LEncDspInitMIPS32(void);
-void VP8LDspInitMIPS32(void) {
-#if defined(WEBP_USE_MIPS32)
+WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitMIPS32(void) {
VP8LFastSLog2Slow = FastSLog2Slow;
VP8LFastLog2Slow = FastLog2Slow;
VP8LExtraCost = ExtraCost;
VP8LExtraCostCombined = ExtraCostCombined;
- VP8LHuffmanCostCount = HuffmanCostCount;
- VP8LHuffmanCostCombinedCount = HuffmanCostCombinedCount;
+ VP8LGetEntropyUnrefinedHelper = GetEntropyUnrefinedHelper;
VP8LHistogramAdd = HistogramAdd;
-#endif // WEBP_USE_MIPS32
}
+
+#else // !WEBP_USE_MIPS32
+
+WEBP_DSP_INIT_STUB(VP8LEncDspInitMIPS32)
+
+#endif // WEBP_USE_MIPS32
diff --git a/src/3rdparty/libwebp/src/dsp/lossless_enc_mips_dsp_r2.c b/src/3rdparty/libwebp/src/dsp/lossless_enc_mips_dsp_r2.c
new file mode 100644
index 0000000..0abf3c4
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/lossless_enc_mips_dsp_r2.c
@@ -0,0 +1,275 @@
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Image transform methods for lossless encoder.
+//
+// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
+// Jovan Zelincevic (jovan.zelincevic@imgtec.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_MIPS_DSP_R2)
+
+#include "./lossless.h"
+
+static void SubtractGreenFromBlueAndRed(uint32_t* argb_data,
+ int num_pixels) {
+ uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+ uint32_t* const p_loop1_end = argb_data + (num_pixels & ~3);
+ uint32_t* const p_loop2_end = p_loop1_end + (num_pixels & 3);
+ __asm__ volatile (
+ ".set push \n\t"
+ ".set noreorder \n\t"
+ "beq %[argb_data], %[p_loop1_end], 3f \n\t"
+ " nop \n\t"
+ "0: \n\t"
+ "lw %[temp0], 0(%[argb_data]) \n\t"
+ "lw %[temp1], 4(%[argb_data]) \n\t"
+ "lw %[temp2], 8(%[argb_data]) \n\t"
+ "lw %[temp3], 12(%[argb_data]) \n\t"
+ "ext %[temp4], %[temp0], 8, 8 \n\t"
+ "ext %[temp5], %[temp1], 8, 8 \n\t"
+ "ext %[temp6], %[temp2], 8, 8 \n\t"
+ "ext %[temp7], %[temp3], 8, 8 \n\t"
+ "addiu %[argb_data], %[argb_data], 16 \n\t"
+ "replv.ph %[temp4], %[temp4] \n\t"
+ "replv.ph %[temp5], %[temp5] \n\t"
+ "replv.ph %[temp6], %[temp6] \n\t"
+ "replv.ph %[temp7], %[temp7] \n\t"
+ "subu.qb %[temp0], %[temp0], %[temp4] \n\t"
+ "subu.qb %[temp1], %[temp1], %[temp5] \n\t"
+ "subu.qb %[temp2], %[temp2], %[temp6] \n\t"
+ "subu.qb %[temp3], %[temp3], %[temp7] \n\t"
+ "sw %[temp0], -16(%[argb_data]) \n\t"
+ "sw %[temp1], -12(%[argb_data]) \n\t"
+ "sw %[temp2], -8(%[argb_data]) \n\t"
+ "bne %[argb_data], %[p_loop1_end], 0b \n\t"
+ " sw %[temp3], -4(%[argb_data]) \n\t"
+ "3: \n\t"
+ "beq %[argb_data], %[p_loop2_end], 2f \n\t"
+ " nop \n\t"
+ "1: \n\t"
+ "lw %[temp0], 0(%[argb_data]) \n\t"
+ "addiu %[argb_data], %[argb_data], 4 \n\t"
+ "ext %[temp4], %[temp0], 8, 8 \n\t"
+ "replv.ph %[temp4], %[temp4] \n\t"
+ "subu.qb %[temp0], %[temp0], %[temp4] \n\t"
+ "bne %[argb_data], %[p_loop2_end], 1b \n\t"
+ " sw %[temp0], -4(%[argb_data]) \n\t"
+ "2: \n\t"
+ ".set pop \n\t"
+ : [argb_data]"+&r"(argb_data), [temp0]"=&r"(temp0),
+ [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+ [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [temp6]"=&r"(temp6),
+ [temp7]"=&r"(temp7)
+ : [p_loop1_end]"r"(p_loop1_end), [p_loop2_end]"r"(p_loop2_end)
+ : "memory"
+ );
+}
+
+static WEBP_INLINE uint32_t ColorTransformDelta(int8_t color_pred,
+ int8_t color) {
+ return (uint32_t)((int)(color_pred) * color) >> 5;
+}
+
+static void TransformColor(const VP8LMultipliers* const m, uint32_t* data,
+ int num_pixels) {
+ int temp0, temp1, temp2, temp3, temp4, temp5;
+ uint32_t argb, argb1, new_red, new_red1;
+ const uint32_t G_to_R = m->green_to_red_;
+ const uint32_t G_to_B = m->green_to_blue_;
+ const uint32_t R_to_B = m->red_to_blue_;
+ uint32_t* const p_loop_end = data + (num_pixels & ~1);
+ __asm__ volatile (
+ ".set push \n\t"
+ ".set noreorder \n\t"
+ "beq %[data], %[p_loop_end], 1f \n\t"
+ " nop \n\t"
+ "replv.ph %[temp0], %[G_to_R] \n\t"
+ "replv.ph %[temp1], %[G_to_B] \n\t"
+ "replv.ph %[temp2], %[R_to_B] \n\t"
+ "shll.ph %[temp0], %[temp0], 8 \n\t"
+ "shll.ph %[temp1], %[temp1], 8 \n\t"
+ "shll.ph %[temp2], %[temp2], 8 \n\t"
+ "shra.ph %[temp0], %[temp0], 8 \n\t"
+ "shra.ph %[temp1], %[temp1], 8 \n\t"
+ "shra.ph %[temp2], %[temp2], 8 \n\t"
+ "0: \n\t"
+ "lw %[argb], 0(%[data]) \n\t"
+ "lw %[argb1], 4(%[data]) \n\t"
+ "lhu %[new_red], 2(%[data]) \n\t"
+ "lhu %[new_red1], 6(%[data]) \n\t"
+ "precrq.qb.ph %[temp3], %[argb], %[argb1] \n\t"
+ "precr.qb.ph %[temp4], %[argb], %[argb1] \n\t"
+ "preceu.ph.qbra %[temp3], %[temp3] \n\t"
+ "preceu.ph.qbla %[temp4], %[temp4] \n\t"
+ "shll.ph %[temp3], %[temp3], 8 \n\t"
+ "shll.ph %[temp4], %[temp4], 8 \n\t"
+ "shra.ph %[temp3], %[temp3], 8 \n\t"
+ "shra.ph %[temp4], %[temp4], 8 \n\t"
+ "mul.ph %[temp5], %[temp3], %[temp0] \n\t"
+ "mul.ph %[temp3], %[temp3], %[temp1] \n\t"
+ "mul.ph %[temp4], %[temp4], %[temp2] \n\t"
+ "addiu %[data], %[data], 8 \n\t"
+ "ins %[new_red1], %[new_red], 16, 16 \n\t"
+ "ins %[argb1], %[argb], 16, 16 \n\t"
+ "shra.ph %[temp5], %[temp5], 5 \n\t"
+ "shra.ph %[temp3], %[temp3], 5 \n\t"
+ "shra.ph %[temp4], %[temp4], 5 \n\t"
+ "subu.ph %[new_red1], %[new_red1], %[temp5] \n\t"
+ "subu.ph %[argb1], %[argb1], %[temp3] \n\t"
+ "preceu.ph.qbra %[temp5], %[new_red1] \n\t"
+ "subu.ph %[argb1], %[argb1], %[temp4] \n\t"
+ "preceu.ph.qbra %[temp3], %[argb1] \n\t"
+ "sb %[temp5], -2(%[data]) \n\t"
+ "sb %[temp3], -4(%[data]) \n\t"
+ "sra %[temp5], %[temp5], 16 \n\t"
+ "sra %[temp3], %[temp3], 16 \n\t"
+ "sb %[temp5], -6(%[data]) \n\t"
+ "bne %[data], %[p_loop_end], 0b \n\t"
+ " sb %[temp3], -8(%[data]) \n\t"
+ "1: \n\t"
+ ".set pop \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [new_red1]"=&r"(new_red1), [new_red]"=&r"(new_red),
+ [argb]"=&r"(argb), [argb1]"=&r"(argb1), [data]"+&r"(data)
+ : [G_to_R]"r"(G_to_R), [R_to_B]"r"(R_to_B),
+ [G_to_B]"r"(G_to_B), [p_loop_end]"r"(p_loop_end)
+ : "memory", "hi", "lo"
+ );
+
+ if (num_pixels & 1) {
+ const uint32_t argb_ = data[0];
+ const uint32_t green = argb_ >> 8;
+ const uint32_t red = argb_ >> 16;
+ uint32_t new_blue = argb_;
+ new_red = red;
+ new_red -= ColorTransformDelta(m->green_to_red_, green);
+ new_red &= 0xff;
+ new_blue -= ColorTransformDelta(m->green_to_blue_, green);
+ new_blue -= ColorTransformDelta(m->red_to_blue_, red);
+ new_blue &= 0xff;
+ data[0] = (argb_ & 0xff00ff00u) | (new_red << 16) | (new_blue);
+ }
+}
+
+static WEBP_INLINE uint8_t TransformColorBlue(uint8_t green_to_blue,
+ uint8_t red_to_blue,
+ uint32_t argb) {
+ const uint32_t green = argb >> 8;
+ const uint32_t red = argb >> 16;
+ uint8_t new_blue = argb;
+ new_blue -= ColorTransformDelta(green_to_blue, green);
+ new_blue -= ColorTransformDelta(red_to_blue, red);
+ return (new_blue & 0xff);
+}
+
+static void CollectColorBlueTransforms(const uint32_t* argb, int stride,
+ int tile_width, int tile_height,
+ int green_to_blue, int red_to_blue,
+ int histo[]) {
+ const int rtb = (red_to_blue << 16) | (red_to_blue & 0xffff);
+ const int gtb = (green_to_blue << 16) | (green_to_blue & 0xffff);
+ const uint32_t mask = 0xff00ffu;
+ while (tile_height-- > 0) {
+ int x;
+ const uint32_t* p_argb = argb;
+ argb += stride;
+ for (x = 0; x < (tile_width >> 1); ++x) {
+ int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
+ __asm__ volatile (
+ "lw %[temp0], 0(%[p_argb]) \n\t"
+ "lw %[temp1], 4(%[p_argb]) \n\t"
+ "precr.qb.ph %[temp2], %[temp0], %[temp1] \n\t"
+ "ins %[temp1], %[temp0], 16, 16 \n\t"
+ "shra.ph %[temp2], %[temp2], 8 \n\t"
+ "shra.ph %[temp3], %[temp1], 8 \n\t"
+ "mul.ph %[temp5], %[temp2], %[rtb] \n\t"
+ "mul.ph %[temp6], %[temp3], %[gtb] \n\t"
+ "and %[temp4], %[temp1], %[mask] \n\t"
+ "addiu %[p_argb], %[p_argb], 8 \n\t"
+ "shra.ph %[temp5], %[temp5], 5 \n\t"
+ "shra.ph %[temp6], %[temp6], 5 \n\t"
+ "subu.qb %[temp2], %[temp4], %[temp5] \n\t"
+ "subu.qb %[temp2], %[temp2], %[temp6] \n\t"
+ : [p_argb]"+&r"(p_argb), [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+ [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [temp4]"=&r"(temp4),
+ [temp5]"=&r"(temp5), [temp6]"=&r"(temp6)
+ : [rtb]"r"(rtb), [gtb]"r"(gtb), [mask]"r"(mask)
+ : "memory", "hi", "lo"
+ );
+ ++histo[(uint8_t)(temp2 >> 16)];
+ ++histo[(uint8_t)temp2];
+ }
+ if (tile_width & 1) {
+ ++histo[TransformColorBlue(green_to_blue, red_to_blue, *p_argb)];
+ }
+ }
+}
+
+static WEBP_INLINE uint8_t TransformColorRed(uint8_t green_to_red,
+ uint32_t argb) {
+ const uint32_t green = argb >> 8;
+ uint32_t new_red = argb >> 16;
+ new_red -= ColorTransformDelta(green_to_red, green);
+ return (new_red & 0xff);
+}
+
+static void CollectColorRedTransforms(const uint32_t* argb, int stride,
+ int tile_width, int tile_height,
+ int green_to_red, int histo[]) {
+ const int gtr = (green_to_red << 16) | (green_to_red & 0xffff);
+ while (tile_height-- > 0) {
+ int x;
+ const uint32_t* p_argb = argb;
+ argb += stride;
+ for (x = 0; x < (tile_width >> 1); ++x) {
+ int temp0, temp1, temp2, temp3, temp4;
+ __asm__ volatile (
+ "lw %[temp0], 0(%[p_argb]) \n\t"
+ "lw %[temp1], 4(%[p_argb]) \n\t"
+ "precrq.ph.w %[temp4], %[temp0], %[temp1] \n\t"
+ "ins %[temp1], %[temp0], 16, 16 \n\t"
+ "shra.ph %[temp3], %[temp1], 8 \n\t"
+ "mul.ph %[temp2], %[temp3], %[gtr] \n\t"
+ "addiu %[p_argb], %[p_argb], 8 \n\t"
+ "shra.ph %[temp2], %[temp2], 5 \n\t"
+ "subu.qb %[temp2], %[temp4], %[temp2] \n\t"
+ : [p_argb]"+&r"(p_argb), [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+ [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [temp4]"=&r"(temp4)
+ : [gtr]"r"(gtr)
+ : "memory", "hi", "lo"
+ );
+ ++histo[(uint8_t)(temp2 >> 16)];
+ ++histo[(uint8_t)temp2];
+ }
+ if (tile_width & 1) {
+ ++histo[TransformColorRed(green_to_red, *p_argb)];
+ }
+ }
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8LEncDspInitMIPSdspR2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitMIPSdspR2(void) {
+ VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
+ VP8LTransformColor = TransformColor;
+ VP8LCollectColorBlueTransforms = CollectColorBlueTransforms;
+ VP8LCollectColorRedTransforms = CollectColorRedTransforms;
+}
+
+#else // !WEBP_USE_MIPS_DSP_R2
+
+WEBP_DSP_INIT_STUB(VP8LEncDspInitMIPSdspR2)
+
+#endif // WEBP_USE_MIPS_DSP_R2
diff --git a/src/3rdparty/libwebp/src/dsp/lossless_enc_neon.c b/src/3rdparty/libwebp/src/dsp/lossless_enc_neon.c
new file mode 100644
index 0000000..4c56f25
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/lossless_enc_neon.c
@@ -0,0 +1,143 @@
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// NEON variant of methods for lossless encoder
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_NEON)
+
+#include <arm_neon.h>
+
+#include "./lossless.h"
+#include "./neon.h"
+
+//------------------------------------------------------------------------------
+// Subtract-Green Transform
+
+// vtbl?_u8 intrinsics are marked unavailable for iOS arm64 with Xcode < 6.3;
+// use the non-standard vtbl1q_u8 variant there.
+#if defined(__APPLE__) && defined(__aarch64__) && \
+    defined(__apple_build_version__) && (__apple_build_version__ < 6020037)
+#define USE_VTBLQ
+#endif
+
+#ifdef USE_VTBLQ
+// 255 = byte will be zeroed
+static const uint8_t kGreenShuffle[16] = {
+ 1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13, 255
+};
+
+static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb,
+ const uint8x16_t shuffle) {
+ return vcombine_u8(vtbl1q_u8(argb, vget_low_u8(shuffle)),
+ vtbl1q_u8(argb, vget_high_u8(shuffle)));
+}
+#else // !USE_VTBLQ
+// 255 = byte will be zeroed
+static const uint8_t kGreenShuffle[8] = { 1, 255, 1, 255, 5, 255, 5, 255 };
+
+static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb,
+ const uint8x8_t shuffle) {
+ return vcombine_u8(vtbl1_u8(vget_low_u8(argb), shuffle),
+ vtbl1_u8(vget_high_u8(argb), shuffle));
+}
+#endif // USE_VTBLQ
+
+static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
+ const uint32_t* const end = argb_data + (num_pixels & ~3);
+#ifdef USE_VTBLQ
+ const uint8x16_t shuffle = vld1q_u8(kGreenShuffle);
+#else
+ const uint8x8_t shuffle = vld1_u8(kGreenShuffle);
+#endif
+ for (; argb_data < end; argb_data += 4) {
+ const uint8x16_t argb = vld1q_u8((uint8_t*)argb_data);
+ const uint8x16_t greens = DoGreenShuffle(argb, shuffle);
+ vst1q_u8((uint8_t*)argb_data, vsubq_u8(argb, greens));
+ }
+ // fallthrough and finish off with plain-C
+ VP8LSubtractGreenFromBlueAndRed_C(argb_data, num_pixels & 3);
+}
+
+//------------------------------------------------------------------------------
+// Color Transform
+
+static void TransformColor(const VP8LMultipliers* const m,
+ uint32_t* argb_data, int num_pixels) {
+ // sign-extended multiplying constants, pre-shifted by 6.
+#define CST(X) (((int16_t)(m->X << 8)) >> 6)
+ const int16_t rb[8] = {
+ CST(green_to_blue_), CST(green_to_red_),
+ CST(green_to_blue_), CST(green_to_red_),
+ CST(green_to_blue_), CST(green_to_red_),
+ CST(green_to_blue_), CST(green_to_red_)
+ };
+ const int16x8_t mults_rb = vld1q_s16(rb);
+ const int16_t b2[8] = {
+ 0, CST(red_to_blue_), 0, CST(red_to_blue_),
+ 0, CST(red_to_blue_), 0, CST(red_to_blue_),
+ };
+ const int16x8_t mults_b2 = vld1q_s16(b2);
+#undef CST
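+// The pre-shift by 6 pairs with vqdmulhq_s16, which returns (2 * a * b) >> 16:
+// with the green byte in the high half of each lane (g << 8) and a constant
+// equal to m << 2, (2 * (g << 8) * (m << 2)) >> 16 == (g * m) >> 5, the same
+// fixed-point delta that the scalar ColorTransformDelta computes.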
+#ifdef USE_VTBLQ
+ static const uint8_t kg0g0[16] = {
+ 255, 1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13
+ };
+ const uint8x16_t shuffle = vld1q_u8(kg0g0);
+#else
+ static const uint8_t k0g0g[8] = { 255, 1, 255, 1, 255, 5, 255, 5 };
+ const uint8x8_t shuffle = vld1_u8(k0g0g);
+#endif
+ const uint32x4_t mask_rb = vdupq_n_u32(0x00ff00ffu); // red-blue masks
+ int i;
+ for (i = 0; i + 4 <= num_pixels; i += 4) {
+ const uint8x16_t in = vld1q_u8((uint8_t*)(argb_data + i));
+ // 0 g 0 g
+ const uint8x16_t greens = DoGreenShuffle(in, shuffle);
+ // x dr x db1
+ const int16x8_t A = vqdmulhq_s16(vreinterpretq_s16_u8(greens), mults_rb);
+ // r 0 b 0
+ const int16x8_t B = vshlq_n_s16(vreinterpretq_s16_u8(in), 8);
+ // x db2 0 0
+ const int16x8_t C = vqdmulhq_s16(B, mults_b2);
+ // 0 0 x db2
+ const uint32x4_t D = vshrq_n_u32(vreinterpretq_u32_s16(C), 16);
+ // x dr x db
+ const int8x16_t E = vaddq_s8(vreinterpretq_s8_u32(D),
+ vreinterpretq_s8_s16(A));
+ // 0 dr 0 db
+ const uint32x4_t F = vandq_u32(vreinterpretq_u32_s8(E), mask_rb);
+ const int8x16_t out = vsubq_s8(vreinterpretq_s8_u8(in),
+ vreinterpretq_s8_u32(F));
+ vst1q_s8((int8_t*)(argb_data + i), out);
+ }
+ // fallthrough and finish off with plain-C
+ VP8LTransformColor_C(m, argb_data + i, num_pixels - i);
+}
+
+#undef USE_VTBLQ
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8LEncDspInitNEON(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitNEON(void) {
+ VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
+ VP8LTransformColor = TransformColor;
+}
+
+#else // !WEBP_USE_NEON
+
+WEBP_DSP_INIT_STUB(VP8LEncDspInitNEON)
+
+#endif // WEBP_USE_NEON
diff --git a/src/3rdparty/libwebp/src/dsp/lossless_enc_sse2.c b/src/3rdparty/libwebp/src/dsp/lossless_enc_sse2.c
new file mode 100644
index 0000000..e8c9834
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/lossless_enc_sse2.c
@@ -0,0 +1,345 @@
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// SSE2 variant of methods for lossless encoder
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_SSE2)
+#include <assert.h>
+#include <emmintrin.h>
+#include "./lossless.h"
+
+// For sign-extended multiplying constants, pre-shifted by 5:
+#define CST_5b(X) (((int16_t)((uint16_t)X << 8)) >> 5)
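+// The pre-shift by 5 pairs with _mm_mulhi_epi16, which returns (a * b) >> 16:
+// with a channel byte in the high half of a 16-bit lane (v << 8) and a
+// constant equal to m << 3, ((v << 8) * (m << 3)) >> 16 == (v * m) >> 5,
+// i.e. the scalar ColorTransformDelta fixed-point product.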
+
+//------------------------------------------------------------------------------
+// Subtract-Green Transform
+
+static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
+ int i;
+ for (i = 0; i + 4 <= num_pixels; i += 4) {
+ const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]); // argb
+ const __m128i A = _mm_srli_epi16(in, 8); // 0 a 0 g
+ const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
+ const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0)); // 0g0g
+ const __m128i out = _mm_sub_epi8(in, C);
+ _mm_storeu_si128((__m128i*)&argb_data[i], out);
+ }
+ // fallthrough and finish off with plain-C
+ VP8LSubtractGreenFromBlueAndRed_C(argb_data + i, num_pixels - i);
+}
+
+//------------------------------------------------------------------------------
+// Color Transform
+
+static void TransformColor(const VP8LMultipliers* const m,
+ uint32_t* argb_data, int num_pixels) {
+ const __m128i mults_rb = _mm_set_epi16(
+ CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_),
+ CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_),
+ CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_),
+ CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_));
+ const __m128i mults_b2 = _mm_set_epi16(
+ CST_5b(m->red_to_blue_), 0, CST_5b(m->red_to_blue_), 0,
+ CST_5b(m->red_to_blue_), 0, CST_5b(m->red_to_blue_), 0);
+ const __m128i mask_ag = _mm_set1_epi32(0xff00ff00); // alpha-green masks
+ const __m128i mask_rb = _mm_set1_epi32(0x00ff00ff); // red-blue masks
+ int i;
+ for (i = 0; i + 4 <= num_pixels; i += 4) {
+ const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]); // argb
+ const __m128i A = _mm_and_si128(in, mask_ag); // a 0 g 0
+ const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
+ const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0)); // g0g0
+ const __m128i D = _mm_mulhi_epi16(C, mults_rb); // x dr x db1
+ const __m128i E = _mm_slli_epi16(in, 8); // r 0 b 0
+ const __m128i F = _mm_mulhi_epi16(E, mults_b2); // x db2 0 0
+ const __m128i G = _mm_srli_epi32(F, 16); // 0 0 x db2
+ const __m128i H = _mm_add_epi8(G, D); // x dr x db
+ const __m128i I = _mm_and_si128(H, mask_rb); // 0 dr 0 db
+ const __m128i out = _mm_sub_epi8(in, I);
+ _mm_storeu_si128((__m128i*)&argb_data[i], out);
+ }
+ // fallthrough and finish off with plain-C
+ VP8LTransformColor_C(m, argb_data + i, num_pixels - i);
+}
+
+//------------------------------------------------------------------------------
+#define SPAN 8
+static void CollectColorBlueTransforms(const uint32_t* argb, int stride,
+ int tile_width, int tile_height,
+ int green_to_blue, int red_to_blue,
+ int histo[]) {
+ const __m128i mults_r = _mm_set_epi16(
+ CST_5b(red_to_blue), 0, CST_5b(red_to_blue), 0,
+ CST_5b(red_to_blue), 0, CST_5b(red_to_blue), 0);
+ const __m128i mults_g = _mm_set_epi16(
+ 0, CST_5b(green_to_blue), 0, CST_5b(green_to_blue),
+ 0, CST_5b(green_to_blue), 0, CST_5b(green_to_blue));
+ const __m128i mask_g = _mm_set1_epi32(0x00ff00); // green mask
+ const __m128i mask_b = _mm_set1_epi32(0x0000ff); // blue mask
+ int y;
+ for (y = 0; y < tile_height; ++y) {
+ const uint32_t* const src = argb + y * stride;
+ int i, x;
+ for (x = 0; x + SPAN <= tile_width; x += SPAN) {
+ uint16_t values[SPAN];
+ const __m128i in0 = _mm_loadu_si128((__m128i*)&src[x + 0]);
+ const __m128i in1 = _mm_loadu_si128((__m128i*)&src[x + SPAN / 2]);
+ const __m128i A0 = _mm_slli_epi16(in0, 8); // r 0 | b 0
+ const __m128i A1 = _mm_slli_epi16(in1, 8);
+ const __m128i B0 = _mm_and_si128(in0, mask_g); // 0 0 | g 0
+ const __m128i B1 = _mm_and_si128(in1, mask_g);
+ const __m128i C0 = _mm_mulhi_epi16(A0, mults_r); // x db | 0 0
+ const __m128i C1 = _mm_mulhi_epi16(A1, mults_r);
+ const __m128i D0 = _mm_mulhi_epi16(B0, mults_g); // 0 0 | x db
+ const __m128i D1 = _mm_mulhi_epi16(B1, mults_g);
+ const __m128i E0 = _mm_sub_epi8(in0, D0); // x x | x b'
+ const __m128i E1 = _mm_sub_epi8(in1, D1);
+ const __m128i F0 = _mm_srli_epi32(C0, 16); // 0 0 | x db
+ const __m128i F1 = _mm_srli_epi32(C1, 16);
+ const __m128i G0 = _mm_sub_epi8(E0, F0); // 0 0 | x b'
+ const __m128i G1 = _mm_sub_epi8(E1, F1);
+ const __m128i H0 = _mm_and_si128(G0, mask_b); // 0 0 | 0 b
+ const __m128i H1 = _mm_and_si128(G1, mask_b);
+ const __m128i I = _mm_packs_epi32(H0, H1); // 0 b' | 0 b'
+ _mm_storeu_si128((__m128i*)values, I);
+ for (i = 0; i < SPAN; ++i) ++histo[values[i]];
+ }
+ }
+ {
+ const int left_over = tile_width & (SPAN - 1);
+ if (left_over > 0) {
+ VP8LCollectColorBlueTransforms_C(argb + tile_width - left_over, stride,
+ left_over, tile_height,
+ green_to_blue, red_to_blue, histo);
+ }
+ }
+}
+
+static void CollectColorRedTransforms(const uint32_t* argb, int stride,
+ int tile_width, int tile_height,
+ int green_to_red, int histo[]) {
+ const __m128i mults_g = _mm_set_epi16(
+ 0, CST_5b(green_to_red), 0, CST_5b(green_to_red),
+ 0, CST_5b(green_to_red), 0, CST_5b(green_to_red));
+ const __m128i mask_g = _mm_set1_epi32(0x00ff00); // green mask
+ const __m128i mask = _mm_set1_epi32(0xff);
+
+ int y;
+ for (y = 0; y < tile_height; ++y) {
+ const uint32_t* const src = argb + y * stride;
+ int i, x;
+ for (x = 0; x + SPAN <= tile_width; x += SPAN) {
+ uint16_t values[SPAN];
+ const __m128i in0 = _mm_loadu_si128((__m128i*)&src[x + 0]);
+ const __m128i in1 = _mm_loadu_si128((__m128i*)&src[x + SPAN / 2]);
+ const __m128i A0 = _mm_and_si128(in0, mask_g); // 0 0 | g 0
+ const __m128i A1 = _mm_and_si128(in1, mask_g);
+ const __m128i B0 = _mm_srli_epi32(in0, 16); // 0 0 | x r
+ const __m128i B1 = _mm_srli_epi32(in1, 16);
+ const __m128i C0 = _mm_mulhi_epi16(A0, mults_g); // 0 0 | x dr
+ const __m128i C1 = _mm_mulhi_epi16(A1, mults_g);
+ const __m128i E0 = _mm_sub_epi8(B0, C0); // x x | x r'
+ const __m128i E1 = _mm_sub_epi8(B1, C1);
+ const __m128i F0 = _mm_and_si128(E0, mask); // 0 0 | 0 r'
+ const __m128i F1 = _mm_and_si128(E1, mask);
+ const __m128i I = _mm_packs_epi32(F0, F1);
+ _mm_storeu_si128((__m128i*)values, I);
+ for (i = 0; i < SPAN; ++i) ++histo[values[i]];
+ }
+ }
+ {
+ const int left_over = tile_width & (SPAN - 1);
+ if (left_over > 0) {
+ VP8LCollectColorRedTransforms_C(argb + tile_width - left_over, stride,
+ left_over, tile_height,
+ green_to_red, histo);
+ }
+ }
+}
+#undef SPAN
+
+//------------------------------------------------------------------------------
+
+#define LINE_SIZE 16 // 8 or 16
+static void AddVector(const uint32_t* a, const uint32_t* b, uint32_t* out,
+ int size) {
+ int i;
+ assert(size % LINE_SIZE == 0);
+ for (i = 0; i < size; i += LINE_SIZE) {
+ const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i + 0]);
+ const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]);
+#if (LINE_SIZE == 16)
+ const __m128i a2 = _mm_loadu_si128((const __m128i*)&a[i + 8]);
+ const __m128i a3 = _mm_loadu_si128((const __m128i*)&a[i + 12]);
+#endif
+ const __m128i b0 = _mm_loadu_si128((const __m128i*)&b[i + 0]);
+ const __m128i b1 = _mm_loadu_si128((const __m128i*)&b[i + 4]);
+#if (LINE_SIZE == 16)
+ const __m128i b2 = _mm_loadu_si128((const __m128i*)&b[i + 8]);
+ const __m128i b3 = _mm_loadu_si128((const __m128i*)&b[i + 12]);
+#endif
+ _mm_storeu_si128((__m128i*)&out[i + 0], _mm_add_epi32(a0, b0));
+ _mm_storeu_si128((__m128i*)&out[i + 4], _mm_add_epi32(a1, b1));
+#if (LINE_SIZE == 16)
+ _mm_storeu_si128((__m128i*)&out[i + 8], _mm_add_epi32(a2, b2));
+ _mm_storeu_si128((__m128i*)&out[i + 12], _mm_add_epi32(a3, b3));
+#endif
+ }
+}
+
+static void AddVectorEq(const uint32_t* a, uint32_t* out, int size) {
+ int i;
+ assert(size % LINE_SIZE == 0);
+ for (i = 0; i < size; i += LINE_SIZE) {
+ const __m128i a0 = _mm_loadu_si128((const __m128i*)&a[i + 0]);
+ const __m128i a1 = _mm_loadu_si128((const __m128i*)&a[i + 4]);
+#if (LINE_SIZE == 16)
+ const __m128i a2 = _mm_loadu_si128((const __m128i*)&a[i + 8]);
+ const __m128i a3 = _mm_loadu_si128((const __m128i*)&a[i + 12]);
+#endif
+ const __m128i b0 = _mm_loadu_si128((const __m128i*)&out[i + 0]);
+ const __m128i b1 = _mm_loadu_si128((const __m128i*)&out[i + 4]);
+#if (LINE_SIZE == 16)
+ const __m128i b2 = _mm_loadu_si128((const __m128i*)&out[i + 8]);
+ const __m128i b3 = _mm_loadu_si128((const __m128i*)&out[i + 12]);
+#endif
+ _mm_storeu_si128((__m128i*)&out[i + 0], _mm_add_epi32(a0, b0));
+ _mm_storeu_si128((__m128i*)&out[i + 4], _mm_add_epi32(a1, b1));
+#if (LINE_SIZE == 16)
+ _mm_storeu_si128((__m128i*)&out[i + 8], _mm_add_epi32(a2, b2));
+ _mm_storeu_si128((__m128i*)&out[i + 12], _mm_add_epi32(a3, b3));
+#endif
+ }
+}
+#undef LINE_SIZE
+
+// Note we are adding uint32_t's as *signed* int32's (using _mm_add_epi32). But
+// that's ok since the histogram values are less than 1<<28 (max picture size).
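+// (Two such values sum to less than 1<<29, comfortably within int32 range.)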
+static void HistogramAdd(const VP8LHistogram* const a,
+ const VP8LHistogram* const b,
+ VP8LHistogram* const out) {
+ int i;
+ const int literal_size = VP8LHistogramNumCodes(a->palette_code_bits_);
+ assert(a->palette_code_bits_ == b->palette_code_bits_);
+ if (b != out) {
+ AddVector(a->literal_, b->literal_, out->literal_, NUM_LITERAL_CODES);
+ AddVector(a->red_, b->red_, out->red_, NUM_LITERAL_CODES);
+ AddVector(a->blue_, b->blue_, out->blue_, NUM_LITERAL_CODES);
+ AddVector(a->alpha_, b->alpha_, out->alpha_, NUM_LITERAL_CODES);
+ } else {
+ AddVectorEq(a->literal_, out->literal_, NUM_LITERAL_CODES);
+ AddVectorEq(a->red_, out->red_, NUM_LITERAL_CODES);
+ AddVectorEq(a->blue_, out->blue_, NUM_LITERAL_CODES);
+ AddVectorEq(a->alpha_, out->alpha_, NUM_LITERAL_CODES);
+ }
+ for (i = NUM_LITERAL_CODES; i < literal_size; ++i) {
+ out->literal_[i] = a->literal_[i] + b->literal_[i];
+ }
+ for (i = 0; i < NUM_DISTANCE_CODES; ++i) {
+ out->distance_[i] = a->distance_[i] + b->distance_[i];
+ }
+}
+
+//------------------------------------------------------------------------------
+// Entropy
+
+// Accumulates the -v * log2(v) entropy term of X[i + j] (or Y[i + j]) when it
+// is non-zero; zero entries contribute nothing. Used in loop unrolling.
+#define ANALYZE_X_OR_Y(x_or_y, j) \
+ do { \
+ if (x_or_y[i + j] != 0) retval -= VP8LFastSLog2(x_or_y[i + j]); \
+ } while (0)
+
+// Same for the combined count X[i + j] + Y[i + j]; when it is non-zero, the
+// corresponding X term is accumulated as well. Used in loop unrolling.
+#define ANALYZE_XY(j) \
+ do { \
+ if (tmp[j] != 0) { \
+ retval -= VP8LFastSLog2(tmp[j]); \
+ ANALYZE_X_OR_Y(X, j); \
+ } \
+ } while (0)
+
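+// With S(v) := v * log2(v) (what VP8LFastSLog2 approximates), the function
+// below computes the sum of the two unnormalized entropies
+//   H(X) + H(X+Y) == S(sumX) - sum_i S(X[i]) + S(sumXY) - sum_i S(X[i]+Y[i]),
+// accumulating the negative terms in the loop and adding the two positive
+// terms at the end.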
+static float CombinedShannonEntropy(const int X[256], const int Y[256]) {
+ int i;
+ double retval = 0.;
+ int sumX, sumXY;
+ int32_t tmp[4];
+ __m128i zero = _mm_setzero_si128();
+ // Sums X + Y four ints at a time (merged into sumXY at the end).
+ __m128i sumXY_128 = zero;
+ __m128i sumX_128 = zero;
+
+ for (i = 0; i < 256; i += 4) {
+ const __m128i x = _mm_loadu_si128((const __m128i*)(X + i));
+ const __m128i y = _mm_loadu_si128((const __m128i*)(Y + i));
+
+ // Check if any X is non-zero: this actually provides a speedup as X is
+ // usually sparse.
+ if (_mm_movemask_epi8(_mm_cmpeq_epi32(x, zero)) != 0xFFFF) {
+ const __m128i xy_128 = _mm_add_epi32(x, y);
+ sumXY_128 = _mm_add_epi32(sumXY_128, xy_128);
+
+ sumX_128 = _mm_add_epi32(sumX_128, x);
+
+ // Analyze the different X + Y.
+ _mm_storeu_si128((__m128i*)tmp, xy_128);
+
+ ANALYZE_XY(0);
+ ANALYZE_XY(1);
+ ANALYZE_XY(2);
+ ANALYZE_XY(3);
+ } else {
+ // X is fully 0, so only deal with Y.
+ sumXY_128 = _mm_add_epi32(sumXY_128, y);
+
+ ANALYZE_X_OR_Y(Y, 0);
+ ANALYZE_X_OR_Y(Y, 1);
+ ANALYZE_X_OR_Y(Y, 2);
+ ANALYZE_X_OR_Y(Y, 3);
+ }
+ }
+
+ // Sum up sumX_128 to get sumX.
+ _mm_storeu_si128((__m128i*)tmp, sumX_128);
+ sumX = tmp[3] + tmp[2] + tmp[1] + tmp[0];
+
+ // Sum up sumXY_128 to get sumXY.
+ _mm_storeu_si128((__m128i*)tmp, sumXY_128);
+ sumXY = tmp[3] + tmp[2] + tmp[1] + tmp[0];
+
+ retval += VP8LFastSLog2(sumX) + VP8LFastSLog2(sumXY);
+ return (float)retval;
+}
+#undef ANALYZE_X_OR_Y
+#undef ANALYZE_XY
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8LEncDspInitSSE2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE2(void) {
+ VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
+ VP8LTransformColor = TransformColor;
+ VP8LCollectColorBlueTransforms = CollectColorBlueTransforms;
+ VP8LCollectColorRedTransforms = CollectColorRedTransforms;
+ VP8LHistogramAdd = HistogramAdd;
+ VP8LCombinedShannonEntropy = CombinedShannonEntropy;
+}
+
+#else // !WEBP_USE_SSE2
+
+WEBP_DSP_INIT_STUB(VP8LEncDspInitSSE2)
+
+#endif // WEBP_USE_SSE2
diff --git a/src/3rdparty/libwebp/src/dsp/lossless_enc_sse41.c b/src/3rdparty/libwebp/src/dsp/lossless_enc_sse41.c
new file mode 100644
index 0000000..3e49319
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/lossless_enc_sse41.c
@@ -0,0 +1,51 @@
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// SSE4.1 variant of methods for lossless encoder
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_SSE41)
+#include <assert.h>
+#include <smmintrin.h>
+#include "./lossless.h"
+
+//------------------------------------------------------------------------------
+// Subtract-Green Transform
+
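+// _mm_shuffle_epi8 (PSHUFB) zeroes every destination byte whose control byte
+// has its top bit set, so the -1 entries below clear the alpha/red/blue slots
+// and a single shuffle yields the 0g0g pattern that SSE2 builds in three ops.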
+static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
+ int i;
+ const __m128i kCstShuffle = _mm_set_epi8(-1, 13, -1, 13, -1, 9, -1, 9,
+ -1, 5, -1, 5, -1, 1, -1, 1);
+ for (i = 0; i + 4 <= num_pixels; i += 4) {
+ const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]);
+ const __m128i in_0g0g = _mm_shuffle_epi8(in, kCstShuffle);
+ const __m128i out = _mm_sub_epi8(in, in_0g0g);
+ _mm_storeu_si128((__m128i*)&argb_data[i], out);
+ }
+ // fallthrough and finish off with plain-C
+ VP8LSubtractGreenFromBlueAndRed_C(argb_data + i, num_pixels - i);
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8LEncDspInitSSE41(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE41(void) {
+ VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
+}
+
+#else // !WEBP_USE_SSE41
+
+WEBP_DSP_INIT_STUB(VP8LEncDspInitSSE41)
+
+#endif // WEBP_USE_SSE41
diff --git a/src/3rdparty/libwebp/src/dsp/lossless_mips_dsp_r2.c b/src/3rdparty/libwebp/src/dsp/lossless_mips_dsp_r2.c
new file mode 100644
index 0000000..90aed7f
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/lossless_mips_dsp_r2.c
@@ -0,0 +1,680 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Image transforms and color space conversion methods for lossless decoder.
+//
+// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
+// Jovan Zelincevic (jovan.zelincevic@imgtec.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_MIPS_DSP_R2)
+
+#include "./lossless.h"
+
+#define MAP_COLOR_FUNCS(FUNC_NAME, TYPE, GET_INDEX, GET_VALUE) \
+static void FUNC_NAME(const TYPE* src, \
+ const uint32_t* const color_map, \
+ TYPE* dst, int y_start, int y_end, \
+ int width) { \
+ int y; \
+ for (y = y_start; y < y_end; ++y) { \
+ int x; \
+ for (x = 0; x < (width >> 2); ++x) { \
+ int tmp1, tmp2, tmp3, tmp4; \
+ __asm__ volatile ( \
+ ".ifc " #TYPE ", uint8_t \n\t" \
+ "lbu %[tmp1], 0(%[src]) \n\t" \
+ "lbu %[tmp2], 1(%[src]) \n\t" \
+ "lbu %[tmp3], 2(%[src]) \n\t" \
+ "lbu %[tmp4], 3(%[src]) \n\t" \
+ "addiu %[src], %[src], 4 \n\t" \
+ ".endif \n\t" \
+ ".ifc " #TYPE ", uint32_t \n\t" \
+ "lw %[tmp1], 0(%[src]) \n\t" \
+ "lw %[tmp2], 4(%[src]) \n\t" \
+ "lw %[tmp3], 8(%[src]) \n\t" \
+ "lw %[tmp4], 12(%[src]) \n\t" \
+ "ext %[tmp1], %[tmp1], 8, 8 \n\t" \
+ "ext %[tmp2], %[tmp2], 8, 8 \n\t" \
+ "ext %[tmp3], %[tmp3], 8, 8 \n\t" \
+ "ext %[tmp4], %[tmp4], 8, 8 \n\t" \
+ "addiu %[src], %[src], 16 \n\t" \
+ ".endif \n\t" \
+ "sll %[tmp1], %[tmp1], 2 \n\t" \
+ "sll %[tmp2], %[tmp2], 2 \n\t" \
+ "sll %[tmp3], %[tmp3], 2 \n\t" \
+ "sll %[tmp4], %[tmp4], 2 \n\t" \
+ "lwx %[tmp1], %[tmp1](%[color_map]) \n\t" \
+ "lwx %[tmp2], %[tmp2](%[color_map]) \n\t" \
+ "lwx %[tmp3], %[tmp3](%[color_map]) \n\t" \
+ "lwx %[tmp4], %[tmp4](%[color_map]) \n\t" \
+ ".ifc " #TYPE ", uint8_t \n\t" \
+ "ext %[tmp1], %[tmp1], 8, 8 \n\t" \
+ "ext %[tmp2], %[tmp2], 8, 8 \n\t" \
+ "ext %[tmp3], %[tmp3], 8, 8 \n\t" \
+ "ext %[tmp4], %[tmp4], 8, 8 \n\t" \
+ "sb %[tmp1], 0(%[dst]) \n\t" \
+ "sb %[tmp2], 1(%[dst]) \n\t" \
+ "sb %[tmp3], 2(%[dst]) \n\t" \
+ "sb %[tmp4], 3(%[dst]) \n\t" \
+ "addiu %[dst], %[dst], 4 \n\t" \
+ ".endif \n\t" \
+ ".ifc " #TYPE ", uint32_t \n\t" \
+ "sw %[tmp1], 0(%[dst]) \n\t" \
+ "sw %[tmp2], 4(%[dst]) \n\t" \
+ "sw %[tmp3], 8(%[dst]) \n\t" \
+ "sw %[tmp4], 12(%[dst]) \n\t" \
+ "addiu %[dst], %[dst], 16 \n\t" \
+ ".endif \n\t" \
+ : [tmp1]"=&r"(tmp1), [tmp2]"=&r"(tmp2), [tmp3]"=&r"(tmp3), \
+ [tmp4]"=&r"(tmp4), [src]"+&r"(src), [dst]"+r"(dst) \
+ : [color_map]"r"(color_map) \
+ : "memory" \
+ ); \
+ } \
+ for (x = 0; x < (width & 3); ++x) { \
+ *dst++ = GET_VALUE(color_map[GET_INDEX(*src++)]); \
+ } \
+ } \
+}
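+// GNU as resolves ".ifc" at assembly time by comparing the two strings, so
+// the single template above expands to lbu/sb byte accesses when TYPE is
+// uint8_t, and to lw/sw word accesses (plus the green-channel "ext") when
+// TYPE is uint32_t.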
+
+MAP_COLOR_FUNCS(MapARGB, uint32_t, VP8GetARGBIndex, VP8GetARGBValue)
+MAP_COLOR_FUNCS(MapAlpha, uint8_t, VP8GetAlphaIndex, VP8GetAlphaValue)
+
+#undef MAP_COLOR_FUNCS
+
+static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1,
+ uint32_t c2) {
+ int temp0, temp1, temp2, temp3, temp4, temp5;
+ __asm__ volatile (
+ "preceu.ph.qbr %[temp1], %[c0] \n\t"
+ "preceu.ph.qbl %[temp2], %[c0] \n\t"
+ "preceu.ph.qbr %[temp3], %[c1] \n\t"
+ "preceu.ph.qbl %[temp4], %[c1] \n\t"
+ "preceu.ph.qbr %[temp5], %[c2] \n\t"
+ "preceu.ph.qbl %[temp0], %[c2] \n\t"
+ "subq.ph %[temp3], %[temp3], %[temp5] \n\t"
+ "subq.ph %[temp4], %[temp4], %[temp0] \n\t"
+ "addq.ph %[temp1], %[temp1], %[temp3] \n\t"
+ "addq.ph %[temp2], %[temp2], %[temp4] \n\t"
+ "shll_s.ph %[temp1], %[temp1], 7 \n\t"
+ "shll_s.ph %[temp2], %[temp2], 7 \n\t"
+ "precrqu_s.qb.ph %[temp2], %[temp2], %[temp1] \n\t"
+ : [temp0]"=r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5)
+ : [c0]"r"(c0), [c1]"r"(c1), [c2]"r"(c2)
+ : "memory"
+ );
+ return temp2;
+}
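+// Net effect of ClampedAddSubtractFull above: clamp(c0 + c1 - c2, 0, 255) per
+// channel. The bytes are widened to halfwords, combined in 16-bit arithmetic,
+// then the saturating shll_s.ph / precrqu_s.qb.ph pair clamps and repacks.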
+
+static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1,
+ uint32_t c2) {
+ int temp0, temp1, temp2, temp3, temp4, temp5;
+ __asm__ volatile (
+ "adduh.qb %[temp5], %[c0], %[c1] \n\t"
+ "preceu.ph.qbr %[temp3], %[c2] \n\t"
+ "preceu.ph.qbr %[temp1], %[temp5] \n\t"
+ "preceu.ph.qbl %[temp2], %[temp5] \n\t"
+ "preceu.ph.qbl %[temp4], %[c2] \n\t"
+ "subq.ph %[temp3], %[temp1], %[temp3] \n\t"
+ "subq.ph %[temp4], %[temp2], %[temp4] \n\t"
+ "shrl.ph %[temp5], %[temp3], 15 \n\t"
+ "shrl.ph %[temp0], %[temp4], 15 \n\t"
+ "addq.ph %[temp3], %[temp3], %[temp5] \n\t"
+ "addq.ph %[temp4], %[temp0], %[temp4] \n\t"
+ "shra.ph %[temp3], %[temp3], 1 \n\t"
+ "shra.ph %[temp4], %[temp4], 1 \n\t"
+ "addq.ph %[temp1], %[temp1], %[temp3] \n\t"
+ "addq.ph %[temp2], %[temp2], %[temp4] \n\t"
+ "shll_s.ph %[temp1], %[temp1], 7 \n\t"
+ "shll_s.ph %[temp2], %[temp2], 7 \n\t"
+ "precrqu_s.qb.ph %[temp1], %[temp2], %[temp1] \n\t"
+ : [temp0]"=r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=r"(temp4), [temp5]"=&r"(temp5)
+ : [c0]"r"(c0), [c1]"r"(c1), [c2]"r"(c2)
+ : "memory"
+ );
+ return temp1;
+}
+
+static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
+ int temp0, temp1, temp2, temp3, temp4, temp5;
+ __asm__ volatile (
+ "cmpgdu.lt.qb %[temp1], %[c], %[b] \n\t"
+ "pick.qb %[temp1], %[b], %[c] \n\t"
+ "pick.qb %[temp2], %[c], %[b] \n\t"
+ "cmpgdu.lt.qb %[temp4], %[c], %[a] \n\t"
+ "pick.qb %[temp4], %[a], %[c] \n\t"
+ "pick.qb %[temp5], %[c], %[a] \n\t"
+ "subu.qb %[temp3], %[temp1], %[temp2] \n\t"
+ "subu.qb %[temp0], %[temp4], %[temp5] \n\t"
+ "raddu.w.qb %[temp3], %[temp3] \n\t"
+ "raddu.w.qb %[temp0], %[temp0] \n\t"
+ "subu %[temp3], %[temp3], %[temp0] \n\t"
+ "slti %[temp0], %[temp3], 0x1 \n\t"
+ "movz %[a], %[b], %[temp0] \n\t"
+ : [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+ [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [temp0]"=&r"(temp0),
+ [a]"+&r"(a)
+ : [b]"r"(b), [c]"r"(c)
+ );
+ return a;
+}
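+// This is the spec's Select predictor: with p = a + b - c per channel,
+// |p - a| reduces to |b - c| and |p - b| to |a - c|, so the code sums those
+// byte-wise distances and returns whichever of a and b is closer to p.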
+
+static WEBP_INLINE uint32_t Average2(uint32_t a0, uint32_t a1) {
+ __asm__ volatile (
+ "adduh.qb %[a0], %[a0], %[a1] \n\t"
+ : [a0]"+r"(a0)
+ : [a1]"r"(a1)
+ );
+ return a0;
+}
+
+static WEBP_INLINE uint32_t Average3(uint32_t a0, uint32_t a1, uint32_t a2) {
+ return Average2(Average2(a0, a2), a1);
+}
+
+static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1,
+ uint32_t a2, uint32_t a3) {
+ return Average2(Average2(a0, a1), Average2(a2, a3));
+}
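+// adduh.qb is a per-byte truncating average, (x + y) >> 1, so each Average2
+// call costs a single instruction and Average3/Average4 nest it pairwise for
+// the byte averages used by Predictor5..Predictor10 below.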
+
+static uint32_t Predictor5(uint32_t left, const uint32_t* const top) {
+ return Average3(left, top[0], top[1]);
+}
+
+static uint32_t Predictor6(uint32_t left, const uint32_t* const top) {
+ return Average2(left, top[-1]);
+}
+
+static uint32_t Predictor7(uint32_t left, const uint32_t* const top) {
+ return Average2(left, top[0]);
+}
+
+static uint32_t Predictor8(uint32_t left, const uint32_t* const top) {
+ (void)left;
+ return Average2(top[-1], top[0]);
+}
+
+static uint32_t Predictor9(uint32_t left, const uint32_t* const top) {
+ (void)left;
+ return Average2(top[0], top[1]);
+}
+
+static uint32_t Predictor10(uint32_t left, const uint32_t* const top) {
+ return Average4(left, top[-1], top[0], top[1]);
+}
+
+static uint32_t Predictor11(uint32_t left, const uint32_t* const top) {
+ return Select(top[0], left, top[-1]);
+}
+
+static uint32_t Predictor12(uint32_t left, const uint32_t* const top) {
+ return ClampedAddSubtractFull(left, top[0], top[-1]);
+}
+
+static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
+ return ClampedAddSubtractHalf(left, top[0], top[-1]);
+}
+
+// Add green to blue and red channels (i.e. perform the inverse transform of
+// 'subtract green').
+static void AddGreenToBlueAndRed(uint32_t* data, int num_pixels) {
+ uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+ uint32_t* const p_loop1_end = data + (num_pixels & ~3);
+ uint32_t* const p_loop2_end = data + num_pixels;
+ __asm__ volatile (
+ ".set push \n\t"
+ ".set noreorder \n\t"
+ "beq %[data], %[p_loop1_end], 3f \n\t"
+ " nop \n\t"
+ "0: \n\t"
+ "lw %[temp0], 0(%[data]) \n\t"
+ "lw %[temp1], 4(%[data]) \n\t"
+ "lw %[temp2], 8(%[data]) \n\t"
+ "lw %[temp3], 12(%[data]) \n\t"
+ "ext %[temp4], %[temp0], 8, 8 \n\t"
+ "ext %[temp5], %[temp1], 8, 8 \n\t"
+ "ext %[temp6], %[temp2], 8, 8 \n\t"
+ "ext %[temp7], %[temp3], 8, 8 \n\t"
+ "addiu %[data], %[data], 16 \n\t"
+ "replv.ph %[temp4], %[temp4] \n\t"
+ "replv.ph %[temp5], %[temp5] \n\t"
+ "replv.ph %[temp6], %[temp6] \n\t"
+ "replv.ph %[temp7], %[temp7] \n\t"
+ "addu.qb %[temp0], %[temp0], %[temp4] \n\t"
+ "addu.qb %[temp1], %[temp1], %[temp5] \n\t"
+ "addu.qb %[temp2], %[temp2], %[temp6] \n\t"
+ "addu.qb %[temp3], %[temp3], %[temp7] \n\t"
+ "sw %[temp0], -16(%[data]) \n\t"
+ "sw %[temp1], -12(%[data]) \n\t"
+ "sw %[temp2], -8(%[data]) \n\t"
+ "bne %[data], %[p_loop1_end], 0b \n\t"
+ " sw %[temp3], -4(%[data]) \n\t"
+ "3: \n\t"
+ "beq %[data], %[p_loop2_end], 2f \n\t"
+ " nop \n\t"
+ "1: \n\t"
+ "lw %[temp0], 0(%[data]) \n\t"
+ "addiu %[data], %[data], 4 \n\t"
+ "ext %[temp4], %[temp0], 8, 8 \n\t"
+ "replv.ph %[temp4], %[temp4] \n\t"
+ "addu.qb %[temp0], %[temp0], %[temp4] \n\t"
+ "bne %[data], %[p_loop2_end], 1b \n\t"
+ " sw %[temp0], -4(%[data]) \n\t"
+ "2: \n\t"
+ ".set pop \n\t"
+ : [data]"+&r"(data), [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
+ [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [temp4]"=&r"(temp4),
+ [temp5]"=&r"(temp5), [temp6]"=&r"(temp6), [temp7]"=&r"(temp7)
+ : [p_loop1_end]"r"(p_loop1_end), [p_loop2_end]"r"(p_loop2_end)
+ : "memory"
+ );
+}
+
+static void TransformColorInverse(const VP8LMultipliers* const m,
+ uint32_t* data, int num_pixels) {
+ int temp0, temp1, temp2, temp3, temp4, temp5;
+ uint32_t argb, argb1, new_red;
+ const uint32_t G_to_R = m->green_to_red_;
+ const uint32_t G_to_B = m->green_to_blue_;
+ const uint32_t R_to_B = m->red_to_blue_;
+ uint32_t* const p_loop_end = data + (num_pixels & ~1);
+ __asm__ volatile (
+ ".set push \n\t"
+ ".set noreorder \n\t"
+ "beq %[data], %[p_loop_end], 1f \n\t"
+ " nop \n\t"
+ "replv.ph %[temp0], %[G_to_R] \n\t"
+ "replv.ph %[temp1], %[G_to_B] \n\t"
+ "replv.ph %[temp2], %[R_to_B] \n\t"
+ "shll.ph %[temp0], %[temp0], 8 \n\t"
+ "shll.ph %[temp1], %[temp1], 8 \n\t"
+ "shll.ph %[temp2], %[temp2], 8 \n\t"
+ "shra.ph %[temp0], %[temp0], 8 \n\t"
+ "shra.ph %[temp1], %[temp1], 8 \n\t"
+ "shra.ph %[temp2], %[temp2], 8 \n\t"
+ "0: \n\t"
+ "lw %[argb], 0(%[data]) \n\t"
+ "lw %[argb1], 4(%[data]) \n\t"
+ "addiu %[data], %[data], 8 \n\t"
+ "precrq.qb.ph %[temp3], %[argb], %[argb1] \n\t"
+ "preceu.ph.qbra %[temp3], %[temp3] \n\t"
+ "shll.ph %[temp3], %[temp3], 8 \n\t"
+ "shra.ph %[temp3], %[temp3], 8 \n\t"
+ "mul.ph %[temp5], %[temp3], %[temp0] \n\t"
+ "mul.ph %[temp3], %[temp3], %[temp1] \n\t"
+ "precrq.ph.w %[new_red], %[argb], %[argb1] \n\t"
+ "ins %[argb1], %[argb], 16, 16 \n\t"
+ "shra.ph %[temp5], %[temp5], 5 \n\t"
+ "shra.ph %[temp3], %[temp3], 5 \n\t"
+ "addu.ph %[new_red], %[new_red], %[temp5] \n\t"
+ "addu.ph %[argb1], %[argb1], %[temp3] \n\t"
+ "preceu.ph.qbra %[temp5], %[new_red] \n\t"
+ "shll.ph %[temp4], %[temp5], 8 \n\t"
+ "shra.ph %[temp4], %[temp4], 8 \n\t"
+ "mul.ph %[temp4], %[temp4], %[temp2] \n\t"
+ "sb %[temp5], -2(%[data]) \n\t"
+ "sra %[temp5], %[temp5], 16 \n\t"
+ "shra.ph %[temp4], %[temp4], 5 \n\t"
+ "addu.ph %[argb1], %[argb1], %[temp4] \n\t"
+ "preceu.ph.qbra %[temp3], %[argb1] \n\t"
+ "sb %[temp5], -6(%[data]) \n\t"
+ "sb %[temp3], -4(%[data]) \n\t"
+ "sra %[temp3], %[temp3], 16 \n\t"
+ "bne %[data], %[p_loop_end], 0b \n\t"
+ " sb %[temp3], -8(%[data]) \n\t"
+ "1: \n\t"
+ ".set pop \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [new_red]"=&r"(new_red), [argb]"=&r"(argb),
+ [argb1]"=&r"(argb1), [data]"+&r"(data)
+ : [G_to_R]"r"(G_to_R), [R_to_B]"r"(R_to_B),
+ [G_to_B]"r"(G_to_B), [p_loop_end]"r"(p_loop_end)
+ : "memory", "hi", "lo"
+ );
+
+ // Fall-back to C-version for left-overs.
+ if (num_pixels & 1) VP8LTransformColorInverse_C(m, data, 1);
+}
+
+static void ConvertBGRAToRGB(const uint32_t* src,
+ int num_pixels, uint8_t* dst) {
+ int temp0, temp1, temp2, temp3;
+ const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
+ const uint32_t* const p_loop2_end = src + num_pixels;
+ __asm__ volatile (
+ ".set push \n\t"
+ ".set noreorder \n\t"
+ "beq %[src], %[p_loop1_end], 3f \n\t"
+ " nop \n\t"
+ "0: \n\t"
+ "lw %[temp3], 12(%[src]) \n\t"
+ "lw %[temp2], 8(%[src]) \n\t"
+ "lw %[temp1], 4(%[src]) \n\t"
+ "lw %[temp0], 0(%[src]) \n\t"
+ "ins %[temp3], %[temp2], 24, 8 \n\t"
+ "sll %[temp2], %[temp2], 8 \n\t"
+ "rotr %[temp3], %[temp3], 16 \n\t"
+ "ins %[temp2], %[temp1], 0, 16 \n\t"
+ "sll %[temp1], %[temp1], 8 \n\t"
+ "wsbh %[temp3], %[temp3] \n\t"
+ "balign %[temp0], %[temp1], 1 \n\t"
+ "wsbh %[temp2], %[temp2] \n\t"
+ "wsbh %[temp0], %[temp0] \n\t"
+ "usw %[temp3], 8(%[dst]) \n\t"
+ "rotr %[temp0], %[temp0], 16 \n\t"
+ "usw %[temp2], 4(%[dst]) \n\t"
+ "addiu %[src], %[src], 16 \n\t"
+ "usw %[temp0], 0(%[dst]) \n\t"
+ "bne %[src], %[p_loop1_end], 0b \n\t"
+ " addiu %[dst], %[dst], 12 \n\t"
+ "3: \n\t"
+ "beq %[src], %[p_loop2_end], 2f \n\t"
+ " nop \n\t"
+ "1: \n\t"
+ "lw %[temp0], 0(%[src]) \n\t"
+ "addiu %[src], %[src], 4 \n\t"
+ "wsbh %[temp1], %[temp0] \n\t"
+ "addiu %[dst], %[dst], 3 \n\t"
+ "ush %[temp1], -2(%[dst]) \n\t"
+ "sra %[temp0], %[temp0], 16 \n\t"
+ "bne %[src], %[p_loop2_end], 1b \n\t"
+ " sb %[temp0], -3(%[dst]) \n\t"
+ "2: \n\t"
+ ".set pop \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [dst]"+&r"(dst), [src]"+&r"(src)
+ : [p_loop1_end]"r"(p_loop1_end), [p_loop2_end]"r"(p_loop2_end)
+ : "memory"
+ );
+}
+
+static void ConvertBGRAToRGBA(const uint32_t* src,
+ int num_pixels, uint8_t* dst) {
+ int temp0, temp1, temp2, temp3;
+ const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
+ const uint32_t* const p_loop2_end = src + num_pixels;
+ __asm__ volatile (
+ ".set push \n\t"
+ ".set noreorder \n\t"
+ "beq %[src], %[p_loop1_end], 3f \n\t"
+ " nop \n\t"
+ "0: \n\t"
+ "lw %[temp0], 0(%[src]) \n\t"
+ "lw %[temp1], 4(%[src]) \n\t"
+ "lw %[temp2], 8(%[src]) \n\t"
+ "lw %[temp3], 12(%[src]) \n\t"
+ "wsbh %[temp0], %[temp0] \n\t"
+ "wsbh %[temp1], %[temp1] \n\t"
+ "wsbh %[temp2], %[temp2] \n\t"
+ "wsbh %[temp3], %[temp3] \n\t"
+ "addiu %[src], %[src], 16 \n\t"
+ "balign %[temp0], %[temp0], 1 \n\t"
+ "balign %[temp1], %[temp1], 1 \n\t"
+ "balign %[temp2], %[temp2], 1 \n\t"
+ "balign %[temp3], %[temp3], 1 \n\t"
+ "usw %[temp0], 0(%[dst]) \n\t"
+ "usw %[temp1], 4(%[dst]) \n\t"
+ "usw %[temp2], 8(%[dst]) \n\t"
+ "usw %[temp3], 12(%[dst]) \n\t"
+ "bne %[src], %[p_loop1_end], 0b \n\t"
+ " addiu %[dst], %[dst], 16 \n\t"
+ "3: \n\t"
+ "beq %[src], %[p_loop2_end], 2f \n\t"
+ " nop \n\t"
+ "1: \n\t"
+ "lw %[temp0], 0(%[src]) \n\t"
+ "wsbh %[temp0], %[temp0] \n\t"
+ "addiu %[src], %[src], 4 \n\t"
+ "balign %[temp0], %[temp0], 1 \n\t"
+ "usw %[temp0], 0(%[dst]) \n\t"
+ "bne %[src], %[p_loop2_end], 1b \n\t"
+ " addiu %[dst], %[dst], 4 \n\t"
+ "2: \n\t"
+ ".set pop \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [dst]"+&r"(dst), [src]"+&r"(src)
+ : [p_loop1_end]"r"(p_loop1_end), [p_loop2_end]"r"(p_loop2_end)
+ : "memory"
+ );
+}
+
+static void ConvertBGRAToRGBA4444(const uint32_t* src,
+ int num_pixels, uint8_t* dst) {
+ int temp0, temp1, temp2, temp3, temp4, temp5;
+ const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
+ const uint32_t* const p_loop2_end = src + num_pixels;
+ __asm__ volatile (
+ ".set push \n\t"
+ ".set noreorder \n\t"
+ "beq %[src], %[p_loop1_end], 3f \n\t"
+ " nop \n\t"
+ "0: \n\t"
+ "lw %[temp0], 0(%[src]) \n\t"
+ "lw %[temp1], 4(%[src]) \n\t"
+ "lw %[temp2], 8(%[src]) \n\t"
+ "lw %[temp3], 12(%[src]) \n\t"
+ "ext %[temp4], %[temp0], 28, 4 \n\t"
+ "ext %[temp5], %[temp0], 12, 4 \n\t"
+ "ins %[temp0], %[temp4], 0, 4 \n\t"
+ "ext %[temp4], %[temp1], 28, 4 \n\t"
+ "ins %[temp0], %[temp5], 16, 4 \n\t"
+ "ext %[temp5], %[temp1], 12, 4 \n\t"
+ "ins %[temp1], %[temp4], 0, 4 \n\t"
+ "ext %[temp4], %[temp2], 28, 4 \n\t"
+ "ins %[temp1], %[temp5], 16, 4 \n\t"
+ "ext %[temp5], %[temp2], 12, 4 \n\t"
+ "ins %[temp2], %[temp4], 0, 4 \n\t"
+ "ext %[temp4], %[temp3], 28, 4 \n\t"
+ "ins %[temp2], %[temp5], 16, 4 \n\t"
+ "ext %[temp5], %[temp3], 12, 4 \n\t"
+ "ins %[temp3], %[temp4], 0, 4 \n\t"
+ "precr.qb.ph %[temp1], %[temp1], %[temp0] \n\t"
+ "ins %[temp3], %[temp5], 16, 4 \n\t"
+ "addiu %[src], %[src], 16 \n\t"
+ "precr.qb.ph %[temp3], %[temp3], %[temp2] \n\t"
+#ifdef WEBP_SWAP_16BIT_CSP
+ "usw %[temp1], 0(%[dst]) \n\t"
+ "usw %[temp3], 4(%[dst]) \n\t"
+#else
+ "wsbh %[temp1], %[temp1] \n\t"
+ "wsbh %[temp3], %[temp3] \n\t"
+ "usw %[temp1], 0(%[dst]) \n\t"
+ "usw %[temp3], 4(%[dst]) \n\t"
+#endif
+ "bne %[src], %[p_loop1_end], 0b \n\t"
+ " addiu %[dst], %[dst], 8 \n\t"
+ "3: \n\t"
+ "beq %[src], %[p_loop2_end], 2f \n\t"
+ " nop \n\t"
+ "1: \n\t"
+ "lw %[temp0], 0(%[src]) \n\t"
+ "ext %[temp4], %[temp0], 28, 4 \n\t"
+ "ext %[temp5], %[temp0], 12, 4 \n\t"
+ "ins %[temp0], %[temp4], 0, 4 \n\t"
+ "ins %[temp0], %[temp5], 16, 4 \n\t"
+ "addiu %[src], %[src], 4 \n\t"
+ "precr.qb.ph %[temp0], %[temp0], %[temp0] \n\t"
+#ifdef WEBP_SWAP_16BIT_CSP
+ "ush %[temp0], 0(%[dst]) \n\t"
+#else
+ "wsbh %[temp0], %[temp0] \n\t"
+ "ush %[temp0], 0(%[dst]) \n\t"
+#endif
+ "bne %[src], %[p_loop2_end], 1b \n\t"
+ " addiu %[dst], %[dst], 2 \n\t"
+ "2: \n\t"
+ ".set pop \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [dst]"+&r"(dst), [src]"+&r"(src)
+ : [p_loop1_end]"r"(p_loop1_end), [p_loop2_end]"r"(p_loop2_end)
+ : "memory"
+ );
+}
+
+static void ConvertBGRAToRGB565(const uint32_t* src,
+ int num_pixels, uint8_t* dst) {
+ int temp0, temp1, temp2, temp3, temp4, temp5;
+ const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
+ const uint32_t* const p_loop2_end = src + num_pixels;
+ __asm__ volatile (
+ ".set push \n\t"
+ ".set noreorder \n\t"
+ "beq %[src], %[p_loop1_end], 3f \n\t"
+ " nop \n\t"
+ "0: \n\t"
+ "lw %[temp0], 0(%[src]) \n\t"
+ "lw %[temp1], 4(%[src]) \n\t"
+ "lw %[temp2], 8(%[src]) \n\t"
+ "lw %[temp3], 12(%[src]) \n\t"
+ "ext %[temp4], %[temp0], 8, 16 \n\t"
+ "ext %[temp5], %[temp0], 5, 11 \n\t"
+ "ext %[temp0], %[temp0], 3, 5 \n\t"
+ "ins %[temp4], %[temp5], 0, 11 \n\t"
+ "ext %[temp5], %[temp1], 5, 11 \n\t"
+ "ins %[temp4], %[temp0], 0, 5 \n\t"
+ "ext %[temp0], %[temp1], 8, 16 \n\t"
+ "ext %[temp1], %[temp1], 3, 5 \n\t"
+ "ins %[temp0], %[temp5], 0, 11 \n\t"
+ "ext %[temp5], %[temp2], 5, 11 \n\t"
+ "ins %[temp0], %[temp1], 0, 5 \n\t"
+ "ext %[temp1], %[temp2], 8, 16 \n\t"
+ "ext %[temp2], %[temp2], 3, 5 \n\t"
+ "ins %[temp1], %[temp5], 0, 11 \n\t"
+ "ext %[temp5], %[temp3], 5, 11 \n\t"
+ "ins %[temp1], %[temp2], 0, 5 \n\t"
+ "ext %[temp2], %[temp3], 8, 16 \n\t"
+ "ext %[temp3], %[temp3], 3, 5 \n\t"
+ "ins %[temp2], %[temp5], 0, 11 \n\t"
+ "append %[temp0], %[temp4], 16 \n\t"
+ "ins %[temp2], %[temp3], 0, 5 \n\t"
+ "addiu %[src], %[src], 16 \n\t"
+ "append %[temp2], %[temp1], 16 \n\t"
+#ifdef WEBP_SWAP_16BIT_CSP
+ "usw %[temp0], 0(%[dst]) \n\t"
+ "usw %[temp2], 4(%[dst]) \n\t"
+#else
+ "wsbh %[temp0], %[temp0] \n\t"
+ "wsbh %[temp2], %[temp2] \n\t"
+ "usw %[temp0], 0(%[dst]) \n\t"
+ "usw %[temp2], 4(%[dst]) \n\t"
+#endif
+ "bne %[src], %[p_loop1_end], 0b \n\t"
+ " addiu %[dst], %[dst], 8 \n\t"
+ "3: \n\t"
+ "beq %[src], %[p_loop2_end], 2f \n\t"
+ " nop \n\t"
+ "1: \n\t"
+ "lw %[temp0], 0(%[src]) \n\t"
+ "ext %[temp4], %[temp0], 8, 16 \n\t"
+ "ext %[temp5], %[temp0], 5, 11 \n\t"
+ "ext %[temp0], %[temp0], 3, 5 \n\t"
+ "ins %[temp4], %[temp5], 0, 11 \n\t"
+ "addiu %[src], %[src], 4 \n\t"
+ "ins %[temp4], %[temp0], 0, 5 \n\t"
+#ifdef WEBP_SWAP_16BIT_CSP
+ "ush %[temp4], 0(%[dst]) \n\t"
+#else
+ "wsbh %[temp4], %[temp4] \n\t"
+ "ush %[temp4], 0(%[dst]) \n\t"
+#endif
+ "bne %[src], %[p_loop2_end], 1b \n\t"
+ " addiu %[dst], %[dst], 2 \n\t"
+ "2: \n\t"
+ ".set pop \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
+ [dst]"+&r"(dst), [src]"+&r"(src)
+ : [p_loop1_end]"r"(p_loop1_end), [p_loop2_end]"r"(p_loop2_end)
+ : "memory"
+ );
+}
+
+static void ConvertBGRAToBGR(const uint32_t* src,
+ int num_pixels, uint8_t* dst) {
+ int temp0, temp1, temp2, temp3;
+ const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
+ const uint32_t* const p_loop2_end = src + num_pixels;
+ __asm__ volatile (
+ ".set push \n\t"
+ ".set noreorder \n\t"
+ "beq %[src], %[p_loop1_end], 3f \n\t"
+ " nop \n\t"
+ "0: \n\t"
+ "lw %[temp0], 0(%[src]) \n\t"
+ "lw %[temp1], 4(%[src]) \n\t"
+ "lw %[temp2], 8(%[src]) \n\t"
+ "lw %[temp3], 12(%[src]) \n\t"
+ "ins %[temp0], %[temp1], 24, 8 \n\t"
+ "sra %[temp1], %[temp1], 8 \n\t"
+ "ins %[temp1], %[temp2], 16, 16 \n\t"
+ "sll %[temp2], %[temp2], 8 \n\t"
+ "balign %[temp3], %[temp2], 1 \n\t"
+ "addiu %[src], %[src], 16 \n\t"
+ "usw %[temp0], 0(%[dst]) \n\t"
+ "usw %[temp1], 4(%[dst]) \n\t"
+ "usw %[temp3], 8(%[dst]) \n\t"
+ "bne %[src], %[p_loop1_end], 0b \n\t"
+ " addiu %[dst], %[dst], 12 \n\t"
+ "3: \n\t"
+ "beq %[src], %[p_loop2_end], 2f \n\t"
+ " nop \n\t"
+ "1: \n\t"
+ "lw %[temp0], 0(%[src]) \n\t"
+ "addiu %[src], %[src], 4 \n\t"
+ "addiu %[dst], %[dst], 3 \n\t"
+ "ush %[temp0], -3(%[dst]) \n\t"
+ "sra %[temp0], %[temp0], 16 \n\t"
+ "bne %[src], %[p_loop2_end], 1b \n\t"
+ " sb %[temp0], -1(%[dst]) \n\t"
+ "2: \n\t"
+ ".set pop \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+ [temp3]"=&r"(temp3), [dst]"+&r"(dst), [src]"+&r"(src)
+ : [p_loop1_end]"r"(p_loop1_end), [p_loop2_end]"r"(p_loop2_end)
+ : "memory"
+ );
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8LDspInitMIPSdspR2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitMIPSdspR2(void) {
+ VP8LMapColor32b = MapARGB;
+ VP8LMapColor8b = MapAlpha;
+ VP8LPredictors[5] = Predictor5;
+ VP8LPredictors[6] = Predictor6;
+ VP8LPredictors[7] = Predictor7;
+ VP8LPredictors[8] = Predictor8;
+ VP8LPredictors[9] = Predictor9;
+ VP8LPredictors[10] = Predictor10;
+ VP8LPredictors[11] = Predictor11;
+ VP8LPredictors[12] = Predictor12;
+ VP8LPredictors[13] = Predictor13;
+ VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed;
+ VP8LTransformColorInverse = TransformColorInverse;
+ VP8LConvertBGRAToRGB = ConvertBGRAToRGB;
+ VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA;
+ VP8LConvertBGRAToRGBA4444 = ConvertBGRAToRGBA4444;
+ VP8LConvertBGRAToRGB565 = ConvertBGRAToRGB565;
+ VP8LConvertBGRAToBGR = ConvertBGRAToBGR;
+}
+
+#else // !WEBP_USE_MIPS_DSP_R2
+
+WEBP_DSP_INIT_STUB(VP8LDspInitMIPSdspR2)
+
+#endif // WEBP_USE_MIPS_DSP_R2
diff --git a/src/3rdparty/libwebp/src/dsp/lossless_neon.c b/src/3rdparty/libwebp/src/dsp/lossless_neon.c
index 8c82b19..6faccb8 100644
--- a/src/3rdparty/libwebp/src/dsp/lossless_neon.c
+++ b/src/3rdparty/libwebp/src/dsp/lossless_neon.c
@@ -140,123 +140,6 @@ static void ConvertBGRAToRGB(const uint32_t* src,
#endif // !WORK_AROUND_GCC
//------------------------------------------------------------------------------
-
-#ifdef USE_INTRINSICS
-
-static WEBP_INLINE uint32_t Average2(const uint32_t* const a,
- const uint32_t* const b) {
- const uint8x8_t a0 = vreinterpret_u8_u64(vcreate_u64(*a));
- const uint8x8_t b0 = vreinterpret_u8_u64(vcreate_u64(*b));
- const uint8x8_t avg = vhadd_u8(a0, b0);
- return vget_lane_u32(vreinterpret_u32_u8(avg), 0);
-}
-
-static WEBP_INLINE uint32_t Average3(const uint32_t* const a,
- const uint32_t* const b,
- const uint32_t* const c) {
- const uint8x8_t a0 = vreinterpret_u8_u64(vcreate_u64(*a));
- const uint8x8_t b0 = vreinterpret_u8_u64(vcreate_u64(*b));
- const uint8x8_t c0 = vreinterpret_u8_u64(vcreate_u64(*c));
- const uint8x8_t avg1 = vhadd_u8(a0, c0);
- const uint8x8_t avg2 = vhadd_u8(avg1, b0);
- return vget_lane_u32(vreinterpret_u32_u8(avg2), 0);
-}
-
-static WEBP_INLINE uint32_t Average4(const uint32_t* const a,
- const uint32_t* const b,
- const uint32_t* const c,
- const uint32_t* const d) {
- const uint8x8_t a0 = vreinterpret_u8_u64(vcreate_u64(*a));
- const uint8x8_t b0 = vreinterpret_u8_u64(vcreate_u64(*b));
- const uint8x8_t c0 = vreinterpret_u8_u64(vcreate_u64(*c));
- const uint8x8_t d0 = vreinterpret_u8_u64(vcreate_u64(*d));
- const uint8x8_t avg1 = vhadd_u8(a0, b0);
- const uint8x8_t avg2 = vhadd_u8(c0, d0);
- const uint8x8_t avg3 = vhadd_u8(avg1, avg2);
- return vget_lane_u32(vreinterpret_u32_u8(avg3), 0);
-}
-
-static uint32_t Predictor5(uint32_t left, const uint32_t* const top) {
- return Average3(&left, top + 0, top + 1);
-}
-
-static uint32_t Predictor6(uint32_t left, const uint32_t* const top) {
- return Average2(&left, top - 1);
-}
-
-static uint32_t Predictor7(uint32_t left, const uint32_t* const top) {
- return Average2(&left, top + 0);
-}
-
-static uint32_t Predictor8(uint32_t left, const uint32_t* const top) {
- (void)left;
- return Average2(top - 1, top + 0);
-}
-
-static uint32_t Predictor9(uint32_t left, const uint32_t* const top) {
- (void)left;
- return Average2(top + 0, top + 1);
-}
-
-static uint32_t Predictor10(uint32_t left, const uint32_t* const top) {
- return Average4(&left, top - 1, top + 0, top + 1);
-}
-
-//------------------------------------------------------------------------------
-
-static WEBP_INLINE uint32_t Select(const uint32_t* const c0,
- const uint32_t* const c1,
- const uint32_t* const c2) {
- const uint8x8_t p0 = vreinterpret_u8_u64(vcreate_u64(*c0));
- const uint8x8_t p1 = vreinterpret_u8_u64(vcreate_u64(*c1));
- const uint8x8_t p2 = vreinterpret_u8_u64(vcreate_u64(*c2));
- const uint8x8_t bc = vabd_u8(p1, p2); // |b-c|
- const uint8x8_t ac = vabd_u8(p0, p2); // |a-c|
- const int16x4_t sum_bc = vreinterpret_s16_u16(vpaddl_u8(bc));
- const int16x4_t sum_ac = vreinterpret_s16_u16(vpaddl_u8(ac));
- const int32x2_t diff = vpaddl_s16(vsub_s16(sum_bc, sum_ac));
- const int32_t pa_minus_pb = vget_lane_s32(diff, 0);
- return (pa_minus_pb <= 0) ? *c0 : *c1;
-}
-
-static uint32_t Predictor11(uint32_t left, const uint32_t* const top) {
- return Select(top + 0, &left, top - 1);
-}
-
-static WEBP_INLINE uint32_t ClampedAddSubtractFull(const uint32_t* const c0,
- const uint32_t* const c1,
- const uint32_t* const c2) {
- const uint8x8_t p0 = vreinterpret_u8_u64(vcreate_u64(*c0));
- const uint8x8_t p1 = vreinterpret_u8_u64(vcreate_u64(*c1));
- const uint8x8_t p2 = vreinterpret_u8_u64(vcreate_u64(*c2));
- const uint16x8_t sum0 = vaddl_u8(p0, p1); // add and widen
- const uint16x8_t sum1 = vqsubq_u16(sum0, vmovl_u8(p2)); // widen and subtract
- const uint8x8_t out = vqmovn_u16(sum1); // narrow and clamp
- return vget_lane_u32(vreinterpret_u32_u8(out), 0);
-}
-
-static uint32_t Predictor12(uint32_t left, const uint32_t* const top) {
- return ClampedAddSubtractFull(&left, top + 0, top - 1);
-}
-
-static WEBP_INLINE uint32_t ClampedAddSubtractHalf(const uint32_t* const c0,
- const uint32_t* const c1,
- const uint32_t* const c2) {
- const uint8x8_t p0 = vreinterpret_u8_u64(vcreate_u64(*c0));
- const uint8x8_t p1 = vreinterpret_u8_u64(vcreate_u64(*c1));
- const uint8x8_t p2 = vreinterpret_u8_u64(vcreate_u64(*c2));
- const uint8x8_t avg = vhadd_u8(p0, p1); // Average(c0,c1)
- const uint8x8_t ab = vshr_n_u8(vqsub_u8(avg, p2), 1); // (a-b)>>1 saturated
- const uint8x8_t ba = vshr_n_u8(vqsub_u8(p2, avg), 1); // (b-a)>>1 saturated
- const uint8x8_t out = vqsub_u8(vqadd_u8(avg, ab), ba);
- return vget_lane_u32(vreinterpret_u32_u8(out), 0);
-}
-
-static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
- return ClampedAddSubtractHalf(&left, top + 0, top - 1);
-}
-
-//------------------------------------------------------------------------------
// Subtract-Green Transform
// vtbl?_u8 are marked unavailable for iOS arm64 with Xcode < 6.3, use
@@ -288,7 +171,7 @@ static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb,
}
#endif // USE_VTBLQ
-static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
+static void AddGreenToBlueAndRed(uint32_t* argb_data, int num_pixels) {
const uint32_t* const end = argb_data + (num_pixels & ~3);
#ifdef USE_VTBLQ
const uint8x16_t shuffle = vld1q_u8(kGreenShuffle);
@@ -298,60 +181,89 @@ static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
for (; argb_data < end; argb_data += 4) {
const uint8x16_t argb = vld1q_u8((uint8_t*)argb_data);
const uint8x16_t greens = DoGreenShuffle(argb, shuffle);
- vst1q_u8((uint8_t*)argb_data, vsubq_u8(argb, greens));
+ vst1q_u8((uint8_t*)argb_data, vaddq_u8(argb, greens));
}
// fallthrough and finish off with plain-C
- VP8LSubtractGreenFromBlueAndRed_C(argb_data, num_pixels & 3);
+ VP8LAddGreenToBlueAndRed_C(argb_data, num_pixels & 3);
}
-static void AddGreenToBlueAndRed(uint32_t* argb_data, int num_pixels) {
- const uint32_t* const end = argb_data + (num_pixels & ~3);
+//------------------------------------------------------------------------------
+// Color Transform
+
+static void TransformColorInverse(const VP8LMultipliers* const m,
+ uint32_t* argb_data, int num_pixels) {
+ // sign-extended multiplying constants, pre-shifted by 6.
+#define CST(X) (((int16_t)(m->X << 8)) >> 6)
+ const int16_t rb[8] = {
+ CST(green_to_blue_), CST(green_to_red_),
+ CST(green_to_blue_), CST(green_to_red_),
+ CST(green_to_blue_), CST(green_to_red_),
+ CST(green_to_blue_), CST(green_to_red_)
+ };
+ const int16x8_t mults_rb = vld1q_s16(rb);
+ const int16_t b2[8] = {
+ 0, CST(red_to_blue_), 0, CST(red_to_blue_),
+ 0, CST(red_to_blue_), 0, CST(red_to_blue_),
+ };
+ const int16x8_t mults_b2 = vld1q_s16(b2);
+#undef CST
#ifdef USE_VTBLQ
- const uint8x16_t shuffle = vld1q_u8(kGreenShuffle);
+ static const uint8_t kg0g0[16] = {
+ 255, 1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255, 13, 255, 13
+ };
+ const uint8x16_t shuffle = vld1q_u8(kg0g0);
#else
- const uint8x8_t shuffle = vld1_u8(kGreenShuffle);
+ static const uint8_t k0g0g[8] = { 255, 1, 255, 1, 255, 5, 255, 5 };
+ const uint8x8_t shuffle = vld1_u8(k0g0g);
#endif
- for (; argb_data < end; argb_data += 4) {
- const uint8x16_t argb = vld1q_u8((uint8_t*)argb_data);
- const uint8x16_t greens = DoGreenShuffle(argb, shuffle);
- vst1q_u8((uint8_t*)argb_data, vaddq_u8(argb, greens));
+ const uint32x4_t mask_ag = vdupq_n_u32(0xff00ff00u);
+ int i;
+ for (i = 0; i + 4 <= num_pixels; i += 4) {
+ const uint8x16_t in = vld1q_u8((uint8_t*)(argb_data + i));
+ const uint32x4_t a0g0 = vandq_u32(vreinterpretq_u32_u8(in), mask_ag);
+ // 0 g 0 g
+ const uint8x16_t greens = DoGreenShuffle(in, shuffle);
+ // x dr x db1
+ const int16x8_t A = vqdmulhq_s16(vreinterpretq_s16_u8(greens), mults_rb);
+ // x r' x b'
+ const int8x16_t B = vaddq_s8(vreinterpretq_s8_u8(in),
+ vreinterpretq_s8_s16(A));
+ // r' 0 b' 0
+ const int16x8_t C = vshlq_n_s16(vreinterpretq_s16_s8(B), 8);
+ // x db2 0 0
+ const int16x8_t D = vqdmulhq_s16(C, mults_b2);
+ // 0 x db2 0
+ const uint32x4_t E = vshrq_n_u32(vreinterpretq_u32_s16(D), 8);
+ // r' x b'' 0
+ const int8x16_t F = vaddq_s8(vreinterpretq_s8_u32(E),
+ vreinterpretq_s8_s16(C));
+ // 0 r' 0 b''
+ const uint16x8_t G = vshrq_n_u16(vreinterpretq_u16_s8(F), 8);
+ const uint32x4_t out = vorrq_u32(vreinterpretq_u32_u16(G), a0g0);
+ vst1q_u32(argb_data + i, out);
}
- // fallthrough and finish off with plain-C
- VP8LAddGreenToBlueAndRed_C(argb_data, num_pixels & 3);
+ // Fall-back to C-version for left-overs.
+ VP8LTransformColorInverse_C(m, argb_data + i, num_pixels - i);
}
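Editor's note: the CST trick above is worth unpacking. vqdmulhq_s16 returns (2*a*b) >> 16 per 16-bit lane, so with the green byte pre-positioned in a lane's high byte and the multiplier pre-shifted by 6, each lane reproduces libwebp's color-transform delta (int8)c * (int8)g >> 5. A standalone scalar self-check of that identity (an illustrative sketch, not part of the patch; QDMulH is a hypothetical helper, the code relies on arithmetic right shift of negatives, and vqdmulh saturation is ignored since these operands never hit INT16_MIN together):

#include <assert.h>
#include <stdint.h>

// Scalar model of one vqdmulhq_s16 lane: (2 * a * b) >> 16.
static int16_t QDMulH(int16_t a, int16_t b) {
  return (int16_t)(((int32_t)a * b * 2) >> 16);
}

int main(void) {
  int c, g;
  for (c = 0; c < 256; ++c) {
    for (g = 0; g < 256; ++g) {
      const int16_t lane = (int16_t)(g << 8);                     // green in high byte
      const int16_t cst = (int16_t)(((int16_t)(c << 8)) >> 6);    // CST(X) above
      const int8_t ref = (int8_t)(((int8_t)c * (int8_t)g) >> 5);  // transform delta
      assert((int8_t)QDMulH(lane, cst) == ref);
    }
  }
  return 0;
}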
#undef USE_VTBLQ
-#endif // USE_INTRINSICS
-
-#endif // WEBP_USE_NEON
-
//------------------------------------------------------------------------------
+// Entry point
extern void VP8LDspInitNEON(void);
-void VP8LDspInitNEON(void) {
-#if defined(WEBP_USE_NEON)
+WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitNEON(void) {
VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA;
VP8LConvertBGRAToBGR = ConvertBGRAToBGR;
VP8LConvertBGRAToRGB = ConvertBGRAToRGB;
-#ifdef USE_INTRINSICS
- VP8LPredictors[5] = Predictor5;
- VP8LPredictors[6] = Predictor6;
- VP8LPredictors[7] = Predictor7;
- VP8LPredictors[8] = Predictor8;
- VP8LPredictors[9] = Predictor9;
- VP8LPredictors[10] = Predictor10;
- VP8LPredictors[11] = Predictor11;
- VP8LPredictors[12] = Predictor12;
- VP8LPredictors[13] = Predictor13;
-
- VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed;
-#endif
-
-#endif // WEBP_USE_NEON
+ VP8LTransformColorInverse = TransformColorInverse;
}
-//------------------------------------------------------------------------------
+#else // !WEBP_USE_NEON
+
+WEBP_DSP_INIT_STUB(VP8LDspInitNEON)
+
+#endif // WEBP_USE_NEON
diff --git a/src/3rdparty/libwebp/src/dsp/lossless_sse2.c b/src/3rdparty/libwebp/src/dsp/lossless_sse2.c
index 7130909..2d016c2 100644
--- a/src/3rdparty/libwebp/src/dsp/lossless_sse2.c
+++ b/src/3rdparty/libwebp/src/dsp/lossless_sse2.c
@@ -13,9 +13,8 @@
#include "./dsp.h"
-#include <assert.h>
-
#if defined(WEBP_USE_SSE2)
+#include <assert.h>
#include <emmintrin.h>
#include "./lossless.h"
@@ -156,32 +155,14 @@ static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
//------------------------------------------------------------------------------
// Subtract-Green Transform
-static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
- const __m128i mask = _mm_set1_epi32(0x0000ff00);
- int i;
- for (i = 0; i + 4 <= num_pixels; i += 4) {
- const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]);
- const __m128i in_00g0 = _mm_and_si128(in, mask); // 00g0|00g0|...
- const __m128i in_0g00 = _mm_slli_epi32(in_00g0, 8); // 0g00|0g00|...
- const __m128i in_000g = _mm_srli_epi32(in_00g0, 8); // 000g|000g|...
- const __m128i in_0g0g = _mm_or_si128(in_0g00, in_000g);
- const __m128i out = _mm_sub_epi8(in, in_0g0g);
- _mm_storeu_si128((__m128i*)&argb_data[i], out);
- }
- // fallthrough and finish off with plain-C
- VP8LSubtractGreenFromBlueAndRed_C(argb_data + i, num_pixels - i);
-}
-
static void AddGreenToBlueAndRed(uint32_t* argb_data, int num_pixels) {
- const __m128i mask = _mm_set1_epi32(0x0000ff00);
int i;
for (i = 0; i + 4 <= num_pixels; i += 4) {
- const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]);
- const __m128i in_00g0 = _mm_and_si128(in, mask); // 00g0|00g0|...
- const __m128i in_0g00 = _mm_slli_epi32(in_00g0, 8); // 0g00|0g00|...
- const __m128i in_000g = _mm_srli_epi32(in_00g0, 8); // 000g|000g|...
- const __m128i in_0g0g = _mm_or_si128(in_0g00, in_000g);
- const __m128i out = _mm_add_epi8(in, in_0g0g);
+ const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]); // argb
+ const __m128i A = _mm_srli_epi16(in, 8); // 0 a 0 g
+ const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
+ const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0)); // 0g0g
+ const __m128i out = _mm_add_epi8(in, C);
_mm_storeu_si128((__m128i*)&argb_data[i], out);
}
// fallthrough and finish off with plain-C
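Editor's note: the rewritten loop replaces the old mask/shift/or sequence with one shift and two shuffles: _mm_srli_epi16 drops each pixel's green (and alpha) byte into the low byte of its 16-bit lane, and the (2, 2, 0, 0) shuffles duplicate the green lanes into the 0g0g pattern that _mm_add_epi8 needs. A scalar model of one pixel (an illustrative sketch, not part of the patch; AddGreenScalar is a hypothetical name):

#include <stdint.h>

// One 0xAARRGGBB pixel: add green to red and blue, each byte wrapping
// mod 256 exactly as an _mm_add_epi8 lane does.
static uint32_t AddGreenScalar(uint32_t argb) {
  const uint8_t g = (uint8_t)(argb >> 8);
  const uint8_t r = (uint8_t)((uint8_t)(argb >> 16) + g);
  const uint8_t b = (uint8_t)((uint8_t)argb + g);
  return (argb & 0xff00ff00u) | ((uint32_t)r << 16) | b;
}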
@@ -191,94 +172,36 @@ static void AddGreenToBlueAndRed(uint32_t* argb_data, int num_pixels) {
//------------------------------------------------------------------------------
// Color Transform
-static WEBP_INLINE __m128i ColorTransformDelta(__m128i color_pred,
- __m128i color) {
- // We simulate signed 8-bit multiplication as:
- // * Left shift the two (8-bit) numbers by 8 bits,
- // * Perform a 16-bit signed multiplication and retain the higher 16-bits.
- const __m128i color_pred_shifted = _mm_slli_epi32(color_pred, 8);
- const __m128i color_shifted = _mm_slli_epi32(color, 8);
- // Note: This performs multiplication on 8 packed 16-bit numbers, 4 of which
- // happen to be zeroes.
- const __m128i signed_mult =
- _mm_mulhi_epi16(color_pred_shifted, color_shifted);
- return _mm_srli_epi32(signed_mult, 5);
-}
-
-static WEBP_INLINE void TransformColor(const VP8LMultipliers* const m,
- uint32_t* argb_data,
- int num_pixels) {
- const __m128i g_to_r = _mm_set1_epi32(m->green_to_red_); // multipliers
- const __m128i g_to_b = _mm_set1_epi32(m->green_to_blue_);
- const __m128i r_to_b = _mm_set1_epi32(m->red_to_blue_);
-
- int i;
-
- for (i = 0; i + 4 <= num_pixels; i += 4) {
- const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]);
- const __m128i alpha_green_mask = _mm_set1_epi32(0xff00ff00); // masks
- const __m128i red_mask = _mm_set1_epi32(0x00ff0000);
- const __m128i green_mask = _mm_set1_epi32(0x0000ff00);
- const __m128i lower_8bit_mask = _mm_set1_epi32(0x000000ff);
- const __m128i ag = _mm_and_si128(in, alpha_green_mask); // alpha, green
- const __m128i r = _mm_srli_epi32(_mm_and_si128(in, red_mask), 16);
- const __m128i g = _mm_srli_epi32(_mm_and_si128(in, green_mask), 8);
- const __m128i b = in;
-
- const __m128i r_delta = ColorTransformDelta(g_to_r, g); // red
- const __m128i r_new =
- _mm_and_si128(_mm_sub_epi32(r, r_delta), lower_8bit_mask);
- const __m128i r_new_shifted = _mm_slli_epi32(r_new, 16);
-
- const __m128i b_delta_1 = ColorTransformDelta(g_to_b, g); // blue
- const __m128i b_delta_2 = ColorTransformDelta(r_to_b, r);
- const __m128i b_delta = _mm_add_epi32(b_delta_1, b_delta_2);
- const __m128i b_new =
- _mm_and_si128(_mm_sub_epi32(b, b_delta), lower_8bit_mask);
-
- const __m128i out = _mm_or_si128(_mm_or_si128(ag, r_new_shifted), b_new);
- _mm_storeu_si128((__m128i*)&argb_data[i], out);
- }
-
- // Fall-back to C-version for left-overs.
- VP8LTransformColor_C(m, argb_data + i, num_pixels - i);
-}
-
-static WEBP_INLINE void TransformColorInverse(const VP8LMultipliers* const m,
- uint32_t* argb_data,
- int num_pixels) {
- const __m128i g_to_r = _mm_set1_epi32(m->green_to_red_); // multipliers
- const __m128i g_to_b = _mm_set1_epi32(m->green_to_blue_);
- const __m128i r_to_b = _mm_set1_epi32(m->red_to_blue_);
-
+static void TransformColorInverse(const VP8LMultipliers* const m,
+ uint32_t* argb_data, int num_pixels) {
+ // sign-extended multiplying constants, pre-shifted by 5.
+#define CST(X) (((int16_t)(m->X << 8)) >> 5) // sign-extend
+ const __m128i mults_rb = _mm_set_epi16(
+ CST(green_to_red_), CST(green_to_blue_),
+ CST(green_to_red_), CST(green_to_blue_),
+ CST(green_to_red_), CST(green_to_blue_),
+ CST(green_to_red_), CST(green_to_blue_));
+ const __m128i mults_b2 = _mm_set_epi16(
+ CST(red_to_blue_), 0, CST(red_to_blue_), 0,
+ CST(red_to_blue_), 0, CST(red_to_blue_), 0);
+#undef CST
+ const __m128i mask_ag = _mm_set1_epi32(0xff00ff00); // alpha-green masks
int i;
-
for (i = 0; i + 4 <= num_pixels; i += 4) {
- const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]);
- const __m128i alpha_green_mask = _mm_set1_epi32(0xff00ff00); // masks
- const __m128i red_mask = _mm_set1_epi32(0x00ff0000);
- const __m128i green_mask = _mm_set1_epi32(0x0000ff00);
- const __m128i lower_8bit_mask = _mm_set1_epi32(0x000000ff);
- const __m128i ag = _mm_and_si128(in, alpha_green_mask); // alpha, green
- const __m128i r = _mm_srli_epi32(_mm_and_si128(in, red_mask), 16);
- const __m128i g = _mm_srli_epi32(_mm_and_si128(in, green_mask), 8);
- const __m128i b = in;
-
- const __m128i r_delta = ColorTransformDelta(g_to_r, g); // red
- const __m128i r_new =
- _mm_and_si128(_mm_add_epi32(r, r_delta), lower_8bit_mask);
- const __m128i r_new_shifted = _mm_slli_epi32(r_new, 16);
-
- const __m128i b_delta_1 = ColorTransformDelta(g_to_b, g); // blue
- const __m128i b_delta_2 = ColorTransformDelta(r_to_b, r_new);
- const __m128i b_delta = _mm_add_epi32(b_delta_1, b_delta_2);
- const __m128i b_new =
- _mm_and_si128(_mm_add_epi32(b, b_delta), lower_8bit_mask);
-
- const __m128i out = _mm_or_si128(_mm_or_si128(ag, r_new_shifted), b_new);
+ const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]); // argb
+ const __m128i A = _mm_and_si128(in, mask_ag); // a 0 g 0
+ const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
+ const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0)); // g0g0
+ const __m128i D = _mm_mulhi_epi16(C, mults_rb); // x dr x db1
+ const __m128i E = _mm_add_epi8(in, D); // x r' x b'
+ const __m128i F = _mm_slli_epi16(E, 8); // r' 0 b' 0
+ const __m128i G = _mm_mulhi_epi16(F, mults_b2); // x db2 0 0
+ const __m128i H = _mm_srli_epi32(G, 8); // 0 x db2 0
+ const __m128i I = _mm_add_epi8(H, F); // r' x b'' 0
+ const __m128i J = _mm_srli_epi16(I, 8); // 0 r' 0 b''
+ const __m128i out = _mm_or_si128(J, A);
_mm_storeu_si128((__m128i*)&argb_data[i], out);
}
-
// Fall-back to C-version for left-overs.
VP8LTransformColorInverse_C(m, argb_data + i, num_pixels - i);
}
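Editor's note: this uses the same pre-shift trick as the NEON version, but _mm_mulhi_epi16 returns the plain high half (a*b) >> 16 with no doubling, hence the shift by 5 here instead of 6. Worked out for one lane, with the channel value v sitting in the lane's high byte:

  CST = ((int16_t)(c << 8)) >> 5 = sign_extend8(c) << 3
  ((sign_extend8(v) << 8) * (sign_extend8(c) << 3)) >> 16
      == (sign_extend8(v) * sign_extend8(c)) >> 5

which is exactly the (int8 * int8) >> 5 color-transform delta.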
@@ -418,95 +341,11 @@ static void ConvertBGRAToBGR(const uint32_t* src,
}
//------------------------------------------------------------------------------
-
-#define LINE_SIZE 16 // 8 or 16
-static void AddVector(const uint32_t* a, const uint32_t* b, uint32_t* out,
- int size) {
- int i;
- assert(size % LINE_SIZE == 0);
- for (i = 0; i < size; i += LINE_SIZE) {
- const __m128i a0 = _mm_loadu_si128((__m128i*)&a[i + 0]);
- const __m128i a1 = _mm_loadu_si128((__m128i*)&a[i + 4]);
-#if (LINE_SIZE == 16)
- const __m128i a2 = _mm_loadu_si128((__m128i*)&a[i + 8]);
- const __m128i a3 = _mm_loadu_si128((__m128i*)&a[i + 12]);
-#endif
- const __m128i b0 = _mm_loadu_si128((__m128i*)&b[i + 0]);
- const __m128i b1 = _mm_loadu_si128((__m128i*)&b[i + 4]);
-#if (LINE_SIZE == 16)
- const __m128i b2 = _mm_loadu_si128((__m128i*)&b[i + 8]);
- const __m128i b3 = _mm_loadu_si128((__m128i*)&b[i + 12]);
-#endif
- _mm_storeu_si128((__m128i*)&out[i + 0], _mm_add_epi32(a0, b0));
- _mm_storeu_si128((__m128i*)&out[i + 4], _mm_add_epi32(a1, b1));
-#if (LINE_SIZE == 16)
- _mm_storeu_si128((__m128i*)&out[i + 8], _mm_add_epi32(a2, b2));
- _mm_storeu_si128((__m128i*)&out[i + 12], _mm_add_epi32(a3, b3));
-#endif
- }
-}
-
-static void AddVectorEq(const uint32_t* a, uint32_t* out, int size) {
- int i;
- assert(size % LINE_SIZE == 0);
- for (i = 0; i < size; i += LINE_SIZE) {
- const __m128i a0 = _mm_loadu_si128((__m128i*)&a[i + 0]);
- const __m128i a1 = _mm_loadu_si128((__m128i*)&a[i + 4]);
-#if (LINE_SIZE == 16)
- const __m128i a2 = _mm_loadu_si128((__m128i*)&a[i + 8]);
- const __m128i a3 = _mm_loadu_si128((__m128i*)&a[i + 12]);
-#endif
- const __m128i b0 = _mm_loadu_si128((__m128i*)&out[i + 0]);
- const __m128i b1 = _mm_loadu_si128((__m128i*)&out[i + 4]);
-#if (LINE_SIZE == 16)
- const __m128i b2 = _mm_loadu_si128((__m128i*)&out[i + 8]);
- const __m128i b3 = _mm_loadu_si128((__m128i*)&out[i + 12]);
-#endif
- _mm_storeu_si128((__m128i*)&out[i + 0], _mm_add_epi32(a0, b0));
- _mm_storeu_si128((__m128i*)&out[i + 4], _mm_add_epi32(a1, b1));
-#if (LINE_SIZE == 16)
- _mm_storeu_si128((__m128i*)&out[i + 8], _mm_add_epi32(a2, b2));
- _mm_storeu_si128((__m128i*)&out[i + 12], _mm_add_epi32(a3, b3));
-#endif
- }
-}
-#undef LINE_SIZE
-
-// Note we are adding uint32_t's as *signed* int32's (using _mm_add_epi32). But
-// that's ok since the histogram values are less than 1<<28 (max picture size).
-static void HistogramAdd(const VP8LHistogram* const a,
- const VP8LHistogram* const b,
- VP8LHistogram* const out) {
- int i;
- const int literal_size = VP8LHistogramNumCodes(a->palette_code_bits_);
- assert(a->palette_code_bits_ == b->palette_code_bits_);
- if (b != out) {
- AddVector(a->literal_, b->literal_, out->literal_, NUM_LITERAL_CODES);
- AddVector(a->red_, b->red_, out->red_, NUM_LITERAL_CODES);
- AddVector(a->blue_, b->blue_, out->blue_, NUM_LITERAL_CODES);
- AddVector(a->alpha_, b->alpha_, out->alpha_, NUM_LITERAL_CODES);
- } else {
- AddVectorEq(a->literal_, out->literal_, NUM_LITERAL_CODES);
- AddVectorEq(a->red_, out->red_, NUM_LITERAL_CODES);
- AddVectorEq(a->blue_, out->blue_, NUM_LITERAL_CODES);
- AddVectorEq(a->alpha_, out->alpha_, NUM_LITERAL_CODES);
- }
- for (i = NUM_LITERAL_CODES; i < literal_size; ++i) {
- out->literal_[i] = a->literal_[i] + b->literal_[i];
- }
- for (i = 0; i < NUM_DISTANCE_CODES; ++i) {
- out->distance_[i] = a->distance_[i] + b->distance_[i];
- }
-}
-
-#endif // WEBP_USE_SSE2
-
-//------------------------------------------------------------------------------
+// Entry point
extern void VP8LDspInitSSE2(void);
-void VP8LDspInitSSE2(void) {
-#if defined(WEBP_USE_SSE2)
+WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitSSE2(void) {
VP8LPredictors[5] = Predictor5;
VP8LPredictors[6] = Predictor6;
VP8LPredictors[7] = Predictor7;
@@ -517,19 +356,17 @@ void VP8LDspInitSSE2(void) {
VP8LPredictors[12] = Predictor12;
VP8LPredictors[13] = Predictor13;
- VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed;
-
- VP8LTransformColor = TransformColor;
VP8LTransformColorInverse = TransformColorInverse;
VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA;
VP8LConvertBGRAToRGBA4444 = ConvertBGRAToRGBA4444;
VP8LConvertBGRAToRGB565 = ConvertBGRAToRGB565;
VP8LConvertBGRAToBGR = ConvertBGRAToBGR;
-
- VP8LHistogramAdd = HistogramAdd;
-#endif // WEBP_USE_SSE2
}
-//------------------------------------------------------------------------------
+#else // !WEBP_USE_SSE2
+
+WEBP_DSP_INIT_STUB(VP8LDspInitSSE2)
+
+#endif // WEBP_USE_SSE2
diff --git a/src/3rdparty/libwebp/src/dsp/mips_macro.h b/src/3rdparty/libwebp/src/dsp/mips_macro.h
new file mode 100644
index 0000000..44aba9b
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/mips_macro.h
@@ -0,0 +1,200 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MIPS common macros
+
+#ifndef WEBP_DSP_MIPS_MACRO_H_
+#define WEBP_DSP_MIPS_MACRO_H_
+
+#if defined(__GNUC__) && defined(__ANDROID__) && LOCAL_GCC_VERSION == 0x409
+#define WORK_AROUND_GCC
+#endif
+
+#define STR(s) #s
+#define XSTR(s) STR(s)
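Editor's note: STR/XSTR is the standard two-level stringification idiom; the extra level forces the argument to be macro-expanded before it is stringified, which is how LOAD_WITH_OFFSET_X4 and STORE_SAT_SUM_X2 below paste a computed stride (their I9/I13 argument, routed through XSTR) into the asm templates as a literal number. A minimal sketch (OFF is a hypothetical example macro):

#include <stdio.h>

#define STR(s) #s
#define XSTR(s) STR(s)
#define OFF 16  // hypothetical example macro

int main(void) {
  printf("%s %s\n", STR(OFF), XSTR(OFF));  // prints: OFF 16
  return 0;
}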
+
+// O0[31..16 | 15..0] = I0[31..16 | 15..0] + I1[31..16 | 15..0]
+// O1[31..16 | 15..0] = I0[31..16 | 15..0] - I1[31..16 | 15..0]
+// O - output
+// I - input (macro doesn't change it)
+#define ADD_SUB_HALVES(O0, O1, \
+ I0, I1) \
+ "addq.ph %[" #O0 "], %[" #I0 "], %[" #I1 "] \n\t" \
+ "subq.ph %[" #O1 "], %[" #I0 "], %[" #I1 "] \n\t"
+
+// O - output
+// I - input (macro doesn't change it)
+// I[0/1] - offset in bytes
+#define LOAD_IN_X2(O0, O1, \
+ I0, I1) \
+ "lh %[" #O0 "], " #I0 "(%[in]) \n\t" \
+ "lh %[" #O1 "], " #I1 "(%[in]) \n\t"
+
+// I0 - location
+// I1..I9 - offsets in bytes
+#define LOAD_WITH_OFFSET_X4(O0, O1, O2, O3, \
+ I0, I1, I2, I3, I4, I5, I6, I7, I8, I9) \
+ "ulw %[" #O0 "], " #I1 "+" XSTR(I9) "*" #I5 "(%[" #I0 "]) \n\t" \
+ "ulw %[" #O1 "], " #I2 "+" XSTR(I9) "*" #I6 "(%[" #I0 "]) \n\t" \
+ "ulw %[" #O2 "], " #I3 "+" XSTR(I9) "*" #I7 "(%[" #I0 "]) \n\t" \
+ "ulw %[" #O3 "], " #I4 "+" XSTR(I9) "*" #I8 "(%[" #I0 "]) \n\t"
+
+// O - output
+// IO - input/output
+// I - input (macro doesn't change it)
+#define MUL_SHIFT_SUM(O0, O1, O2, O3, O4, O5, O6, O7, \
+ IO0, IO1, IO2, IO3, \
+ I0, I1, I2, I3, I4, I5, I6, I7) \
+ "mul %[" #O0 "], %[" #I0 "], %[kC2] \n\t" \
+ "mul %[" #O1 "], %[" #I0 "], %[kC1] \n\t" \
+ "mul %[" #O2 "], %[" #I1 "], %[kC2] \n\t" \
+ "mul %[" #O3 "], %[" #I1 "], %[kC1] \n\t" \
+ "mul %[" #O4 "], %[" #I2 "], %[kC2] \n\t" \
+ "mul %[" #O5 "], %[" #I2 "], %[kC1] \n\t" \
+ "mul %[" #O6 "], %[" #I3 "], %[kC2] \n\t" \
+ "mul %[" #O7 "], %[" #I3 "], %[kC1] \n\t" \
+ "sra %[" #O0 "], %[" #O0 "], 16 \n\t" \
+ "sra %[" #O1 "], %[" #O1 "], 16 \n\t" \
+ "sra %[" #O2 "], %[" #O2 "], 16 \n\t" \
+ "sra %[" #O3 "], %[" #O3 "], 16 \n\t" \
+ "sra %[" #O4 "], %[" #O4 "], 16 \n\t" \
+ "sra %[" #O5 "], %[" #O5 "], 16 \n\t" \
+ "sra %[" #O6 "], %[" #O6 "], 16 \n\t" \
+ "sra %[" #O7 "], %[" #O7 "], 16 \n\t" \
+ "addu %[" #IO0 "], %[" #IO0 "], %[" #I4 "] \n\t" \
+ "addu %[" #IO1 "], %[" #IO1 "], %[" #I5 "] \n\t" \
+ "subu %[" #IO2 "], %[" #IO2 "], %[" #I6 "] \n\t" \
+ "subu %[" #IO3 "], %[" #IO3 "], %[" #I7 "] \n\t"
+
+// O - output
+// I - input (macro doesn't change it)
+#define INSERT_HALF_X2(O0, O1, \
+ I0, I1) \
+ "ins %[" #O0 "], %[" #I0 "], 16, 16 \n\t" \
+ "ins %[" #O1 "], %[" #I1 "], 16, 16 \n\t"
+
+// O - output
+// I - input (macro doesn't change it)
+#define SRA_16(O0, O1, O2, O3, \
+ I0, I1, I2, I3) \
+ "sra %[" #O0 "], %[" #I0 "], 16 \n\t" \
+ "sra %[" #O1 "], %[" #I1 "], 16 \n\t" \
+ "sra %[" #O2 "], %[" #I2 "], 16 \n\t" \
+ "sra %[" #O3 "], %[" #I3 "], 16 \n\t"
+
+// temp0[31..16 | 15..0] = temp8[31..16 | 15..0] + temp12[31..16 | 15..0]
+// temp1[31..16 | 15..0] = temp8[31..16 | 15..0] - temp12[31..16 | 15..0]
+// temp0[31..16 | 15..0] = temp0[31..16 >> 3 | 15..0 >> 3]
+// temp1[31..16 | 15..0] = temp1[31..16 >> 3 | 15..0 >> 3]
+// O - output
+// I - input (macro doesn't change it)
+#define SHIFT_R_SUM_X2(O0, O1, O2, O3, O4, O5, O6, O7, \
+ I0, I1, I2, I3, I4, I5, I6, I7) \
+ "addq.ph %[" #O0 "], %[" #I0 "], %[" #I4 "] \n\t" \
+ "subq.ph %[" #O1 "], %[" #I0 "], %[" #I4 "] \n\t" \
+ "addq.ph %[" #O2 "], %[" #I1 "], %[" #I5 "] \n\t" \
+ "subq.ph %[" #O3 "], %[" #I1 "], %[" #I5 "] \n\t" \
+ "addq.ph %[" #O4 "], %[" #I2 "], %[" #I6 "] \n\t" \
+ "subq.ph %[" #O5 "], %[" #I2 "], %[" #I6 "] \n\t" \
+ "addq.ph %[" #O6 "], %[" #I3 "], %[" #I7 "] \n\t" \
+ "subq.ph %[" #O7 "], %[" #I3 "], %[" #I7 "] \n\t" \
+ "shra.ph %[" #O0 "], %[" #O0 "], 3 \n\t" \
+ "shra.ph %[" #O1 "], %[" #O1 "], 3 \n\t" \
+ "shra.ph %[" #O2 "], %[" #O2 "], 3 \n\t" \
+ "shra.ph %[" #O3 "], %[" #O3 "], 3 \n\t" \
+ "shra.ph %[" #O4 "], %[" #O4 "], 3 \n\t" \
+ "shra.ph %[" #O5 "], %[" #O5 "], 3 \n\t" \
+ "shra.ph %[" #O6 "], %[" #O6 "], 3 \n\t" \
+ "shra.ph %[" #O7 "], %[" #O7 "], 3 \n\t"
+
+// precrq.ph.w temp0, temp8, temp2
+// temp0 = temp8[31..16] | temp2[31..16]
+// ins temp2, temp8, 16, 16
+// temp2 = temp8[31..16] | temp2[15..0]
+// O - output
+// IO - input/output
+// I - input (macro doesn't change it)
+#define PACK_2_HALVES_TO_WORD(O0, O1, O2, O3, \
+ IO0, IO1, IO2, IO3, \
+ I0, I1, I2, I3) \
+ "precrq.ph.w %[" #O0 "], %[" #I0 "], %[" #IO0 "] \n\t" \
+ "precrq.ph.w %[" #O1 "], %[" #I1 "], %[" #IO1 "] \n\t" \
+ "ins %[" #IO0 "], %[" #I0 "], 16, 16 \n\t" \
+ "ins %[" #IO1 "], %[" #I1 "], 16, 16 \n\t" \
+ "precrq.ph.w %[" #O2 "], %[" #I2 "], %[" #IO2 "] \n\t" \
+ "precrq.ph.w %[" #O3 "], %[" #I3 "], %[" #IO3 "] \n\t" \
+ "ins %[" #IO2 "], %[" #I2 "], 16, 16 \n\t" \
+ "ins %[" #IO3 "], %[" #I3 "], 16, 16 \n\t"
+
+// preceu.ph.qbr temp0, temp8
+// temp0 = 0 | 0 | temp8[23..16] | temp8[7..0]
+// preceu.ph.qbl temp1, temp8
+// temp1 = temp8[23..16] | temp8[7..0] | 0 | 0
+// O - output
+// I - input (macro doesn't change it)
+#define CONVERT_2_BYTES_TO_HALF(O0, O1, O2, O3, O4, O5, O6, O7, \
+ I0, I1, I2, I3) \
+ "preceu.ph.qbr %[" #O0 "], %[" #I0 "] \n\t" \
+ "preceu.ph.qbl %[" #O1 "], %[" #I0 "] \n\t" \
+ "preceu.ph.qbr %[" #O2 "], %[" #I1 "] \n\t" \
+ "preceu.ph.qbl %[" #O3 "], %[" #I1 "] \n\t" \
+ "preceu.ph.qbr %[" #O4 "], %[" #I2 "] \n\t" \
+ "preceu.ph.qbl %[" #O5 "], %[" #I2 "] \n\t" \
+ "preceu.ph.qbr %[" #O6 "], %[" #I3 "] \n\t" \
+ "preceu.ph.qbl %[" #O7 "], %[" #I3 "] \n\t"
+
+// temp0[31..16 | 15..0] = temp0[31..16 | 15..0] + temp8[31..16 | 15..0]
+// temp0[31..16 | 15..0] = temp0[31..16 <<(s) 7 | 15..0 <<(s) 7]
+// temp1..temp7 same as temp0
+// precrqu_s.qb.ph temp0, temp1, temp0:
+// temp0 = temp1[31..24] | temp1[15..8] | temp0[31..24] | temp0[15..8]
+// store temp0 to dst
+// IO - input/output
+// I - input (macro doesn't change it)
+#define STORE_SAT_SUM_X2(IO0, IO1, IO2, IO3, IO4, IO5, IO6, IO7, \
+ I0, I1, I2, I3, I4, I5, I6, I7, \
+ I8, I9, I10, I11, I12, I13) \
+ "addq.ph %[" #IO0 "], %[" #IO0 "], %[" #I0 "] \n\t" \
+ "addq.ph %[" #IO1 "], %[" #IO1 "], %[" #I1 "] \n\t" \
+ "addq.ph %[" #IO2 "], %[" #IO2 "], %[" #I2 "] \n\t" \
+ "addq.ph %[" #IO3 "], %[" #IO3 "], %[" #I3 "] \n\t" \
+ "addq.ph %[" #IO4 "], %[" #IO4 "], %[" #I4 "] \n\t" \
+ "addq.ph %[" #IO5 "], %[" #IO5 "], %[" #I5 "] \n\t" \
+ "addq.ph %[" #IO6 "], %[" #IO6 "], %[" #I6 "] \n\t" \
+ "addq.ph %[" #IO7 "], %[" #IO7 "], %[" #I7 "] \n\t" \
+ "shll_s.ph %[" #IO0 "], %[" #IO0 "], 7 \n\t" \
+ "shll_s.ph %[" #IO1 "], %[" #IO1 "], 7 \n\t" \
+ "shll_s.ph %[" #IO2 "], %[" #IO2 "], 7 \n\t" \
+ "shll_s.ph %[" #IO3 "], %[" #IO3 "], 7 \n\t" \
+ "shll_s.ph %[" #IO4 "], %[" #IO4 "], 7 \n\t" \
+ "shll_s.ph %[" #IO5 "], %[" #IO5 "], 7 \n\t" \
+ "shll_s.ph %[" #IO6 "], %[" #IO6 "], 7 \n\t" \
+ "shll_s.ph %[" #IO7 "], %[" #IO7 "], 7 \n\t" \
+ "precrqu_s.qb.ph %[" #IO0 "], %[" #IO1 "], %[" #IO0 "] \n\t" \
+ "precrqu_s.qb.ph %[" #IO2 "], %[" #IO3 "], %[" #IO2 "] \n\t" \
+ "precrqu_s.qb.ph %[" #IO4 "], %[" #IO5 "], %[" #IO4 "] \n\t" \
+ "precrqu_s.qb.ph %[" #IO6 "], %[" #IO7 "], %[" #IO6 "] \n\t" \
+ "usw %[" #IO0 "], " XSTR(I13) "*" #I9 "(%[" #I8 "]) \n\t" \
+ "usw %[" #IO2 "], " XSTR(I13) "*" #I10 "(%[" #I8 "]) \n\t" \
+ "usw %[" #IO4 "], " XSTR(I13) "*" #I11 "(%[" #I8 "]) \n\t" \
+ "usw %[" #IO6 "], " XSTR(I13) "*" #I12 "(%[" #I8 "]) \n\t"
+
+#define OUTPUT_EARLY_CLOBBER_REGS_10() \
+ : [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), \
+ [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [temp6]"=&r"(temp6), \
+ [temp7]"=&r"(temp7), [temp8]"=&r"(temp8), [temp9]"=&r"(temp9), \
+ [temp10]"=&r"(temp10)
+
+#define OUTPUT_EARLY_CLOBBER_REGS_18() \
+ OUTPUT_EARLY_CLOBBER_REGS_10(), \
+ [temp11]"=&r"(temp11), [temp12]"=&r"(temp12), [temp13]"=&r"(temp13), \
+ [temp14]"=&r"(temp14), [temp15]"=&r"(temp15), [temp16]"=&r"(temp16), \
+ [temp17]"=&r"(temp17), [temp18]"=&r"(temp18)
+
+#endif // WEBP_DSP_MIPS_MACRO_H_
diff --git a/src/3rdparty/libwebp/src/dsp/neon.h b/src/3rdparty/libwebp/src/dsp/neon.h
index 7e06eae..0a06266 100644
--- a/src/3rdparty/libwebp/src/dsp/neon.h
+++ b/src/3rdparty/libwebp/src/dsp/neon.h
@@ -19,7 +19,7 @@
// Right now, some intrinsic functions seem slower, so we disable them
// everywhere except aarch64 where the inline assembly is incompatible.
#if defined(__aarch64__)
-#define USE_INTRINSICS // use intrinsics when possible
+#define WEBP_USE_INTRINSICS // use intrinsics when possible
#endif
#define INIT_VECTOR2(v, a, b) do { \
diff --git a/src/3rdparty/libwebp/src/dsp/rescaler.c b/src/3rdparty/libwebp/src/dsp/rescaler.c
new file mode 100644
index 0000000..bc743d5
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/rescaler.c
@@ -0,0 +1,238 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Rescaling functions
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <assert.h>
+
+#include "./dsp.h"
+#include "../utils/rescaler.h"
+
+//------------------------------------------------------------------------------
+// Implementations of critical functions ImportRow / ExportRow
+
+#define ROUNDER (WEBP_RESCALER_ONE >> 1)
+#define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
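Editor's note: MULT_FIX is a round-to-nearest fixed-point scale, i.e. round(x * y / 2^WEBP_RESCALER_RFIX). A minimal sketch, assuming WEBP_RESCALER_RFIX == 32 as ../utils/rescaler.h defines it in this release:

#include <stdint.h>
#include <stdio.h>

int main(void) {
  const uint64_t kOne = (uint64_t)1 << 32;  // WEBP_RESCALER_ONE
  const uint64_t kRounder = kOne >> 1;      // ROUNDER
  const uint64_t half = kOne / 2;           // the scale 0.5 in Q32
  // MULT_FIX(255, half): 127.5 rounds up to 128.
  printf("%llu\n", (unsigned long long)((255u * half + kRounder) >> 32));
  return 0;
}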
+
+//------------------------------------------------------------------------------
+// Row import
+
+void WebPRescalerImportRowExpandC(WebPRescaler* const wrk, const uint8_t* src) {
+ const int x_stride = wrk->num_channels;
+ const int x_out_max = wrk->dst_width * wrk->num_channels;
+ int channel;
+ assert(!WebPRescalerInputDone(wrk));
+ assert(wrk->x_expand);
+ for (channel = 0; channel < x_stride; ++channel) {
+ int x_in = channel;
+ int x_out = channel;
+ // simple bilinear interpolation
+ int accum = wrk->x_add;
+ int left = src[x_in];
+ int right = (wrk->src_width > 1) ? src[x_in + x_stride] : left;
+ x_in += x_stride;
+ while (1) {
+ wrk->frow[x_out] = right * wrk->x_add + (left - right) * accum;
+ x_out += x_stride;
+ if (x_out >= x_out_max) break;
+ accum -= wrk->x_sub;
+ if (accum < 0) {
+ left = right;
+ x_in += x_stride;
+ assert(x_in < wrk->src_width * x_stride);
+ right = src[x_in];
+ accum += wrk->x_add;
+ }
+ }
+ assert(wrk->x_sub == 0 /* <- special case for src_width=1 */ || accum == 0);
+ }
+}
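Editor's note (worked equation): the store in the loop above is the fixed-point blend

  frow[x_out] = right * x_add + (left - right) * accum
              = left * accum + right * (x_add - accum)

whose weights always sum to x_add; accum counts down by x_sub per output sample and reloads by x_add whenever the (left, right) source pair advances.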
+
+void WebPRescalerImportRowShrinkC(WebPRescaler* const wrk, const uint8_t* src) {
+ const int x_stride = wrk->num_channels;
+ const int x_out_max = wrk->dst_width * wrk->num_channels;
+ int channel;
+ assert(!WebPRescalerInputDone(wrk));
+ assert(!wrk->x_expand);
+ for (channel = 0; channel < x_stride; ++channel) {
+ int x_in = channel;
+ int x_out = channel;
+ uint32_t sum = 0;
+ int accum = 0;
+ while (x_out < x_out_max) {
+ uint32_t base = 0;
+ accum += wrk->x_add;
+ while (accum > 0) {
+ accum -= wrk->x_sub;
+ assert(x_in < wrk->src_width * x_stride);
+ base = src[x_in];
+ sum += base;
+ x_in += x_stride;
+ }
+ { // Emit next horizontal pixel.
+ const rescaler_t frac = base * (-accum);
+ wrk->frow[x_out] = sum * wrk->x_sub - frac;
+ // fresh fractional start for next pixel
+ sum = (int)MULT_FIX(frac, wrk->fx_scale);
+ }
+ x_out += x_stride;
+ }
+ assert(accum == 0);
+ }
+}
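Editor's note (worked equation): the inner loop consumes whole source pixels at weight x_sub each and overshoots the output boundary by (-accum) on the last one, so

  frow[x_out] = sum * x_sub - base * (-accum)

trims the overshoot; the trimmed fraction is then rescaled by fx_scale to seed sum for the next output pixel, keeping the box filter exact across pixel boundaries.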
+
+//------------------------------------------------------------------------------
+// Row export
+
+void WebPRescalerExportRowExpandC(WebPRescaler* const wrk) {
+ int x_out;
+ uint8_t* const dst = wrk->dst;
+ rescaler_t* const irow = wrk->irow;
+ const int x_out_max = wrk->dst_width * wrk->num_channels;
+ const rescaler_t* const frow = wrk->frow;
+ assert(!WebPRescalerOutputDone(wrk));
+ assert(wrk->y_accum <= 0);
+ assert(wrk->y_expand);
+ assert(wrk->y_sub != 0);
+ if (wrk->y_accum == 0) {
+ for (x_out = 0; x_out < x_out_max; ++x_out) {
+ const uint32_t J = frow[x_out];
+ const int v = (int)MULT_FIX(J, wrk->fy_scale);
+ assert(v >= 0 && v <= 255);
+ dst[x_out] = v;
+ }
+ } else {
+ const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub);
+ const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B);
+ for (x_out = 0; x_out < x_out_max; ++x_out) {
+ const uint64_t I = (uint64_t)A * frow[x_out]
+ + (uint64_t)B * irow[x_out];
+ const uint32_t J = (uint32_t)((I + ROUNDER) >> WEBP_RESCALER_RFIX);
+ const int v = (int)MULT_FIX(J, wrk->fy_scale);
+ assert(v >= 0 && v <= 255);
+ dst[x_out] = v;
+ }
+ }
+}
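Editor's note (worked equation): B = WEBP_RESCALER_FRAC(-y_accum, y_sub) is the vertical fraction in Q32 and A = ONE - B its complement, so I = A * frow + B * irow blends the two row accumulators without losing precision; J = (I + ROUNDER) >> WEBP_RESCALER_RFIX then renormalizes back into rescaler_t range before the fy_scale multiply reduces it to a byte.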
+
+void WebPRescalerExportRowShrinkC(WebPRescaler* const wrk) {
+ int x_out;
+ uint8_t* const dst = wrk->dst;
+ rescaler_t* const irow = wrk->irow;
+ const int x_out_max = wrk->dst_width * wrk->num_channels;
+ const rescaler_t* const frow = wrk->frow;
+ const uint32_t yscale = wrk->fy_scale * (-wrk->y_accum);
+ assert(!WebPRescalerOutputDone(wrk));
+ assert(wrk->y_accum <= 0);
+ assert(!wrk->y_expand);
+ if (yscale) {
+ for (x_out = 0; x_out < x_out_max; ++x_out) {
+ const uint32_t frac = (uint32_t)MULT_FIX(frow[x_out], yscale);
+ const int v = (int)MULT_FIX(irow[x_out] - frac, wrk->fxy_scale);
+ assert(v >= 0 && v <= 255);
+ dst[x_out] = v;
+ irow[x_out] = frac; // new fractional start
+ }
+ } else {
+ for (x_out = 0; x_out < x_out_max; ++x_out) {
+ const int v = (int)MULT_FIX(irow[x_out], wrk->fxy_scale);
+ assert(v >= 0 && v <= 255);
+ dst[x_out] = v;
+ irow[x_out] = 0;
+ }
+ }
+}
+
+#undef MULT_FIX
+#undef ROUNDER
+
+//------------------------------------------------------------------------------
+// Main entry calls
+
+void WebPRescalerImportRow(WebPRescaler* const wrk, const uint8_t* src) {
+ assert(!WebPRescalerInputDone(wrk));
+ if (!wrk->x_expand) {
+ WebPRescalerImportRowShrink(wrk, src);
+ } else {
+ WebPRescalerImportRowExpand(wrk, src);
+ }
+}
+
+void WebPRescalerExportRow(WebPRescaler* const wrk) {
+ if (wrk->y_accum <= 0) {
+ assert(!WebPRescalerOutputDone(wrk));
+ if (wrk->y_expand) {
+ WebPRescalerExportRowExpand(wrk);
+ } else if (wrk->fxy_scale) {
+ WebPRescalerExportRowShrink(wrk);
+ } else { // very special case for src = dst = 1x1
+ int i;
+ assert(wrk->src_width == 1 && wrk->dst_width <= 2);
+ assert(wrk->src_height == 1 && wrk->dst_height == 1);
+ for (i = 0; i < wrk->num_channels * wrk->dst_width; ++i) {
+ wrk->dst[i] = wrk->irow[i];
+ wrk->irow[i] = 0;
+ }
+ }
+ wrk->y_accum += wrk->y_add;
+ wrk->dst += wrk->dst_stride;
+ ++wrk->dst_y;
+ }
+}
+
+//------------------------------------------------------------------------------
+
+WebPRescalerImportRowFunc WebPRescalerImportRowExpand;
+WebPRescalerImportRowFunc WebPRescalerImportRowShrink;
+
+WebPRescalerExportRowFunc WebPRescalerExportRowExpand;
+WebPRescalerExportRowFunc WebPRescalerExportRowShrink;
+
+extern void WebPRescalerDspInitSSE2(void);
+extern void WebPRescalerDspInitMIPS32(void);
+extern void WebPRescalerDspInitMIPSdspR2(void);
+extern void WebPRescalerDspInitNEON(void);
+
+static volatile VP8CPUInfo rescaler_last_cpuinfo_used =
+ (VP8CPUInfo)&rescaler_last_cpuinfo_used;
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInit(void) {
+ if (rescaler_last_cpuinfo_used == VP8GetCPUInfo) return;
+
+ WebPRescalerImportRowExpand = WebPRescalerImportRowExpandC;
+ WebPRescalerImportRowShrink = WebPRescalerImportRowShrinkC;
+ WebPRescalerExportRowExpand = WebPRescalerExportRowExpandC;
+ WebPRescalerExportRowShrink = WebPRescalerExportRowShrinkC;
+
+ if (VP8GetCPUInfo != NULL) {
+#if defined(WEBP_USE_SSE2)
+ if (VP8GetCPUInfo(kSSE2)) {
+ WebPRescalerDspInitSSE2();
+ }
+#endif
+#if defined(WEBP_USE_NEON)
+ if (VP8GetCPUInfo(kNEON)) {
+ WebPRescalerDspInitNEON();
+ }
+#endif
+#if defined(WEBP_USE_MIPS32)
+ if (VP8GetCPUInfo(kMIPS32)) {
+ WebPRescalerDspInitMIPS32();
+ }
+#endif
+#if defined(WEBP_USE_MIPS_DSP_R2)
+ if (VP8GetCPUInfo(kMIPSdspR2)) {
+ WebPRescalerDspInitMIPSdspR2();
+ }
+#endif
+ }
+ rescaler_last_cpuinfo_used = VP8GetCPUInfo;
+}
diff --git a/src/3rdparty/libwebp/src/dsp/rescaler_mips32.c b/src/3rdparty/libwebp/src/dsp/rescaler_mips32.c
new file mode 100644
index 0000000..ddaa391
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/rescaler_mips32.c
@@ -0,0 +1,291 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MIPS version of rescaling functions
+//
+// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_MIPS32)
+
+#include <assert.h>
+#include "../utils/rescaler.h"
+
+//------------------------------------------------------------------------------
+// Row import
+
+static void ImportRowShrink(WebPRescaler* const wrk, const uint8_t* src) {
+ const int x_stride = wrk->num_channels;
+ const int x_out_max = wrk->dst_width * wrk->num_channels;
+ const int fx_scale = wrk->fx_scale;
+ const int x_add = wrk->x_add;
+ const int x_sub = wrk->x_sub;
+ const int x_stride1 = x_stride << 2;
+ int channel;
+ assert(!wrk->x_expand);
+ assert(!WebPRescalerInputDone(wrk));
+
+ for (channel = 0; channel < x_stride; ++channel) {
+ const uint8_t* src1 = src + channel;
+ rescaler_t* frow = wrk->frow + channel;
+ int temp1, temp2, temp3;
+ int base, frac, sum;
+ int accum, accum1;
+ int loop_c = x_out_max - channel;
+
+ __asm__ volatile (
+ "li %[temp1], 0x8000 \n\t"
+ "li %[temp2], 0x10000 \n\t"
+ "li %[sum], 0 \n\t"
+ "li %[accum], 0 \n\t"
+ "1: \n\t"
+ "addu %[accum], %[accum], %[x_add] \n\t"
+ "li %[base], 0 \n\t"
+ "blez %[accum], 3f \n\t"
+ "2: \n\t"
+ "lbu %[base], 0(%[src1]) \n\t"
+ "subu %[accum], %[accum], %[x_sub] \n\t"
+ "addu %[src1], %[src1], %[x_stride] \n\t"
+ "addu %[sum], %[sum], %[base] \n\t"
+ "bgtz %[accum], 2b \n\t"
+ "3: \n\t"
+ "negu %[accum1], %[accum] \n\t"
+ "mul %[frac], %[base], %[accum1] \n\t"
+ "mul %[temp3], %[sum], %[x_sub] \n\t"
+ "subu %[loop_c], %[loop_c], %[x_stride] \n\t"
+ "mult %[temp1], %[temp2] \n\t"
+ "maddu %[frac], %[fx_scale] \n\t"
+ "mfhi %[sum] \n\t"
+ "subu %[temp3], %[temp3], %[frac] \n\t"
+ "sw %[temp3], 0(%[frow]) \n\t"
+ "addu %[frow], %[frow], %[x_stride1] \n\t"
+ "bgtz %[loop_c], 1b \n\t"
+ : [accum]"=&r"(accum), [src1]"+r"(src1), [temp3]"=&r"(temp3),
+ [sum]"=&r"(sum), [base]"=&r"(base), [frac]"=&r"(frac),
+ [frow]"+r"(frow), [accum1]"=&r"(accum1),
+ [temp2]"=&r"(temp2), [temp1]"=&r"(temp1)
+ : [x_stride]"r"(x_stride), [fx_scale]"r"(fx_scale),
+ [x_sub]"r"(x_sub), [x_add]"r"(x_add),
+ [loop_c]"r"(loop_c), [x_stride1]"r"(x_stride1)
+ : "memory", "hi", "lo"
+ );
+ assert(accum == 0);
+ }
+}
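Editor's note: the recurring "li 0x8000 / li 0x10000 / mult / maddu / mfhi" idiom in these files computes MULT_FIX entirely in the hi register: mult seeds the 64-bit accumulator with 0x8000 * 0x10000 = 2^31, which is ROUNDER for WEBP_RESCALER_RFIX == 32, maddu adds the unsigned product (here frac * fx_scale), and mfhi reads bits 63..32, i.e. (frac * fx_scale + ROUNDER) >> 32.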
+
+static void ImportRowExpand(WebPRescaler* const wrk, const uint8_t* src) {
+ const int x_stride = wrk->num_channels;
+ const int x_out_max = wrk->dst_width * wrk->num_channels;
+ const int x_add = wrk->x_add;
+ const int x_sub = wrk->x_sub;
+ const int src_width = wrk->src_width;
+ const int x_stride1 = x_stride << 2;
+ int channel;
+ assert(wrk->x_expand);
+ assert(!WebPRescalerInputDone(wrk));
+
+ for (channel = 0; channel < x_stride; ++channel) {
+ const uint8_t* src1 = src + channel;
+ rescaler_t* frow = wrk->frow + channel;
+ int temp1, temp2, temp3, temp4;
+ int frac;
+ int accum;
+ int x_out = channel;
+
+ __asm__ volatile (
+ "addiu %[temp3], %[src_width], -1 \n\t"
+ "lbu %[temp2], 0(%[src1]) \n\t"
+ "addu %[src1], %[src1], %[x_stride] \n\t"
+ "bgtz %[temp3], 0f \n\t"
+ "addiu %[temp1], %[temp2], 0 \n\t"
+ "b 3f \n\t"
+ "0: \n\t"
+ "lbu %[temp1], 0(%[src1]) \n\t"
+ "3: \n\t"
+ "addiu %[accum], %[x_add], 0 \n\t"
+ "1: \n\t"
+ "subu %[temp3], %[temp2], %[temp1] \n\t"
+ "mul %[temp3], %[temp3], %[accum] \n\t"
+ "mul %[temp4], %[temp1], %[x_add] \n\t"
+ "addu %[temp3], %[temp4], %[temp3] \n\t"
+ "sw %[temp3], 0(%[frow]) \n\t"
+ "addu %[frow], %[frow], %[x_stride1] \n\t"
+ "addu %[x_out], %[x_out], %[x_stride] \n\t"
+ "subu %[temp3], %[x_out], %[x_out_max] \n\t"
+ "bgez %[temp3], 2f \n\t"
+ "subu %[accum], %[accum], %[x_sub] \n\t"
+ "bgez %[accum], 4f \n\t"
+ "addiu %[temp2], %[temp1], 0 \n\t"
+ "addu %[src1], %[src1], %[x_stride] \n\t"
+ "lbu %[temp1], 0(%[src1]) \n\t"
+ "addu %[accum], %[accum], %[x_add] \n\t"
+ "4: \n\t"
+ "b 1b \n\t"
+ "2: \n\t"
+ : [src1]"+r"(src1), [accum]"=&r"(accum), [temp1]"=&r"(temp1),
+ [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [temp4]"=&r"(temp4),
+ [x_out]"+r"(x_out), [frac]"=&r"(frac), [frow]"+r"(frow)
+ : [x_stride]"r"(x_stride), [x_add]"r"(x_add), [x_sub]"r"(x_sub),
+ [x_stride1]"r"(x_stride1), [src_width]"r"(src_width),
+ [x_out_max]"r"(x_out_max)
+ : "memory", "hi", "lo"
+ );
+ assert(wrk->x_sub == 0 /* <- special case for src_width=1 */ || accum == 0);
+ }
+}
+
+//------------------------------------------------------------------------------
+// Row export
+
+static void ExportRowExpand(WebPRescaler* const wrk) {
+ uint8_t* dst = wrk->dst;
+ rescaler_t* irow = wrk->irow;
+ const int x_out_max = wrk->dst_width * wrk->num_channels;
+ const rescaler_t* frow = wrk->frow;
+ int temp0, temp1, temp3, temp4, temp5, loop_end;
+ const int temp2 = (int)wrk->fy_scale;
+ const int temp6 = x_out_max << 2;
+ assert(!WebPRescalerOutputDone(wrk));
+ assert(wrk->y_accum <= 0);
+ assert(wrk->y_expand);
+ assert(wrk->y_sub != 0);
+ if (wrk->y_accum == 0) {
+ __asm__ volatile (
+ "li %[temp3], 0x10000 \n\t"
+ "li %[temp4], 0x8000 \n\t"
+ "addu %[loop_end], %[frow], %[temp6] \n\t"
+ "1: \n\t"
+ "lw %[temp0], 0(%[frow]) \n\t"
+ "addiu %[dst], %[dst], 1 \n\t"
+ "addiu %[frow], %[frow], 4 \n\t"
+ "mult %[temp3], %[temp4] \n\t"
+ "maddu %[temp0], %[temp2] \n\t"
+ "mfhi %[temp5] \n\t"
+ "sb %[temp5], -1(%[dst]) \n\t"
+ "bne %[frow], %[loop_end], 1b \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
+ [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [frow]"+r"(frow),
+ [dst]"+r"(dst), [loop_end]"=&r"(loop_end)
+ : [temp2]"r"(temp2), [temp6]"r"(temp6)
+ : "memory", "hi", "lo"
+ );
+ } else {
+ const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub);
+ const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B);
+ __asm__ volatile (
+ "li %[temp3], 0x10000 \n\t"
+ "li %[temp4], 0x8000 \n\t"
+ "addu %[loop_end], %[frow], %[temp6] \n\t"
+ "1: \n\t"
+ "lw %[temp0], 0(%[frow]) \n\t"
+ "lw %[temp1], 0(%[irow]) \n\t"
+ "addiu %[dst], %[dst], 1 \n\t"
+ "mult %[temp3], %[temp4] \n\t"
+ "maddu %[A], %[temp0] \n\t"
+ "maddu %[B], %[temp1] \n\t"
+ "addiu %[frow], %[frow], 4 \n\t"
+ "addiu %[irow], %[irow], 4 \n\t"
+ "mfhi %[temp5] \n\t"
+ "mult %[temp3], %[temp4] \n\t"
+ "maddu %[temp5], %[temp2] \n\t"
+ "mfhi %[temp5] \n\t"
+ "sb %[temp5], -1(%[dst]) \n\t"
+ "bne %[frow], %[loop_end], 1b \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
+ [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [frow]"+r"(frow),
+ [irow]"+r"(irow), [dst]"+r"(dst), [loop_end]"=&r"(loop_end)
+ : [temp2]"r"(temp2), [temp6]"r"(temp6), [A]"r"(A), [B]"r"(B)
+ : "memory", "hi", "lo"
+ );
+ }
+}
+
+static void ExportRowShrink(WebPRescaler* const wrk) {
+ const int x_out_max = wrk->dst_width * wrk->num_channels;
+ uint8_t* dst = wrk->dst;
+ rescaler_t* irow = wrk->irow;
+ const rescaler_t* frow = wrk->frow;
+ const int yscale = wrk->fy_scale * (-wrk->y_accum);
+ int temp0, temp1, temp3, temp4, temp5, loop_end;
+ const int temp2 = (int)wrk->fxy_scale;
+ const int temp6 = x_out_max << 2;
+
+ assert(!WebPRescalerOutputDone(wrk));
+ assert(wrk->y_accum <= 0);
+ assert(!wrk->y_expand);
+ assert(wrk->fxy_scale != 0);
+ if (yscale) {
+ __asm__ volatile (
+ "li %[temp3], 0x10000 \n\t"
+ "li %[temp4], 0x8000 \n\t"
+ "addu %[loop_end], %[frow], %[temp6] \n\t"
+ "1: \n\t"
+ "lw %[temp0], 0(%[frow]) \n\t"
+ "mult %[temp3], %[temp4] \n\t"
+ "addiu %[frow], %[frow], 4 \n\t"
+ "maddu %[temp0], %[yscale] \n\t"
+ "mfhi %[temp1] \n\t"
+ "lw %[temp0], 0(%[irow]) \n\t"
+ "addiu %[dst], %[dst], 1 \n\t"
+ "addiu %[irow], %[irow], 4 \n\t"
+ "subu %[temp0], %[temp0], %[temp1] \n\t"
+ "mult %[temp3], %[temp4] \n\t"
+ "maddu %[temp0], %[temp2] \n\t"
+ "mfhi %[temp5] \n\t"
+ "sw %[temp1], -4(%[irow]) \n\t"
+ "sb %[temp5], -1(%[dst]) \n\t"
+ "bne %[frow], %[loop_end], 1b \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
+ [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [frow]"+r"(frow),
+ [irow]"+r"(irow), [dst]"+r"(dst), [loop_end]"=&r"(loop_end)
+ : [temp2]"r"(temp2), [yscale]"r"(yscale), [temp6]"r"(temp6)
+ : "memory", "hi", "lo"
+ );
+ } else {
+ __asm__ volatile (
+ "li %[temp3], 0x10000 \n\t"
+ "li %[temp4], 0x8000 \n\t"
+ "addu %[loop_end], %[irow], %[temp6] \n\t"
+ "1: \n\t"
+ "lw %[temp0], 0(%[irow]) \n\t"
+ "addiu %[dst], %[dst], 1 \n\t"
+ "addiu %[irow], %[irow], 4 \n\t"
+ "mult %[temp3], %[temp4] \n\t"
+ "maddu %[temp0], %[temp2] \n\t"
+ "mfhi %[temp5] \n\t"
+ "sw $zero, -4(%[irow]) \n\t"
+ "sb %[temp5], -1(%[dst]) \n\t"
+ "bne %[irow], %[loop_end], 1b \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
+ [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [irow]"+r"(irow),
+ [dst]"+r"(dst), [loop_end]"=&r"(loop_end)
+ : [temp2]"r"(temp2), [temp6]"r"(temp6)
+ : "memory", "hi", "lo"
+ );
+ }
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void WebPRescalerDspInitMIPS32(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitMIPS32(void) {
+ WebPRescalerImportRowExpand = ImportRowExpand;
+ WebPRescalerImportRowShrink = ImportRowShrink;
+ WebPRescalerExportRowExpand = ExportRowExpand;
+ WebPRescalerExportRowShrink = ExportRowShrink;
+}
+
+#else // !WEBP_USE_MIPS32
+
+WEBP_DSP_INIT_STUB(WebPRescalerDspInitMIPS32)
+
+#endif // WEBP_USE_MIPS32
diff --git a/src/3rdparty/libwebp/src/dsp/rescaler_mips_dsp_r2.c b/src/3rdparty/libwebp/src/dsp/rescaler_mips_dsp_r2.c
new file mode 100644
index 0000000..b457d0a
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/rescaler_mips_dsp_r2.c
@@ -0,0 +1,314 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MIPS version of rescaling functions
+//
+// Author(s): Djordje Pesut (djordje.pesut@imgtec.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_MIPS_DSP_R2)
+
+#include <assert.h>
+#include "../utils/rescaler.h"
+
+#define ROUNDER (WEBP_RESCALER_ONE >> 1)
+#define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
+
+//------------------------------------------------------------------------------
+// Row export
+
+static void ExportRowShrink(WebPRescaler* const wrk) {
+ int i;
+ const int x_out_max = wrk->dst_width * wrk->num_channels;
+ uint8_t* dst = wrk->dst;
+ rescaler_t* irow = wrk->irow;
+ const rescaler_t* frow = wrk->frow;
+ const int yscale = wrk->fy_scale * (-wrk->y_accum);
+ int temp0, temp1, temp2, temp3, temp4, temp5, loop_end;
+ const int temp7 = (int)wrk->fxy_scale;
+ const int temp6 = (x_out_max & ~0x3) << 2;
+ assert(!WebPRescalerOutputDone(wrk));
+ assert(wrk->y_accum <= 0);
+ assert(!wrk->y_expand);
+ assert(wrk->fxy_scale != 0);
+ if (yscale) {
+ if (x_out_max >= 4) {
+ int temp8, temp9, temp10, temp11;
+ __asm__ volatile (
+ "li %[temp3], 0x10000 \n\t"
+ "li %[temp4], 0x8000 \n\t"
+ "addu %[loop_end], %[frow], %[temp6] \n\t"
+ "1: \n\t"
+ "lw %[temp0], 0(%[frow]) \n\t"
+ "lw %[temp1], 4(%[frow]) \n\t"
+ "lw %[temp2], 8(%[frow]) \n\t"
+ "lw %[temp5], 12(%[frow]) \n\t"
+ "mult $ac0, %[temp3], %[temp4] \n\t"
+ "maddu $ac0, %[temp0], %[yscale] \n\t"
+ "mult $ac1, %[temp3], %[temp4] \n\t"
+ "maddu $ac1, %[temp1], %[yscale] \n\t"
+ "mult $ac2, %[temp3], %[temp4] \n\t"
+ "maddu $ac2, %[temp2], %[yscale] \n\t"
+ "mult $ac3, %[temp3], %[temp4] \n\t"
+ "maddu $ac3, %[temp5], %[yscale] \n\t"
+ "addiu %[frow], %[frow], 16 \n\t"
+ "mfhi %[temp0], $ac0 \n\t"
+ "mfhi %[temp1], $ac1 \n\t"
+ "mfhi %[temp2], $ac2 \n\t"
+ "mfhi %[temp5], $ac3 \n\t"
+ "lw %[temp8], 0(%[irow]) \n\t"
+ "lw %[temp9], 4(%[irow]) \n\t"
+ "lw %[temp10], 8(%[irow]) \n\t"
+ "lw %[temp11], 12(%[irow]) \n\t"
+ "addiu %[dst], %[dst], 4 \n\t"
+ "addiu %[irow], %[irow], 16 \n\t"
+ "subu %[temp8], %[temp8], %[temp0] \n\t"
+ "subu %[temp9], %[temp9], %[temp1] \n\t"
+ "subu %[temp10], %[temp10], %[temp2] \n\t"
+ "subu %[temp11], %[temp11], %[temp5] \n\t"
+ "mult $ac0, %[temp3], %[temp4] \n\t"
+ "maddu $ac0, %[temp8], %[temp7] \n\t"
+ "mult $ac1, %[temp3], %[temp4] \n\t"
+ "maddu $ac1, %[temp9], %[temp7] \n\t"
+ "mult $ac2, %[temp3], %[temp4] \n\t"
+ "maddu $ac2, %[temp10], %[temp7] \n\t"
+ "mult $ac3, %[temp3], %[temp4] \n\t"
+ "maddu $ac3, %[temp11], %[temp7] \n\t"
+ "mfhi %[temp8], $ac0 \n\t"
+ "mfhi %[temp9], $ac1 \n\t"
+ "mfhi %[temp10], $ac2 \n\t"
+ "mfhi %[temp11], $ac3 \n\t"
+ "sw %[temp0], -16(%[irow]) \n\t"
+ "sw %[temp1], -12(%[irow]) \n\t"
+ "sw %[temp2], -8(%[irow]) \n\t"
+ "sw %[temp5], -4(%[irow]) \n\t"
+ "sb %[temp8], -4(%[dst]) \n\t"
+ "sb %[temp9], -3(%[dst]) \n\t"
+ "sb %[temp10], -2(%[dst]) \n\t"
+ "sb %[temp11], -1(%[dst]) \n\t"
+ "bne %[frow], %[loop_end], 1b \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
+ [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [frow]"+r"(frow),
+ [irow]"+r"(irow), [dst]"+r"(dst), [loop_end]"=&r"(loop_end),
+ [temp8]"=&r"(temp8), [temp9]"=&r"(temp9), [temp10]"=&r"(temp10),
+ [temp11]"=&r"(temp11), [temp2]"=&r"(temp2)
+ : [temp7]"r"(temp7), [yscale]"r"(yscale), [temp6]"r"(temp6)
+ : "memory", "hi", "lo", "$ac1hi", "$ac1lo",
+ "$ac2hi", "$ac2lo", "$ac3hi", "$ac3lo"
+ );
+ }
+ for (i = 0; i < (x_out_max & 0x3); ++i) {
+ const uint32_t frac = (uint32_t)MULT_FIX(*frow++, yscale);
+ const int v = (int)MULT_FIX(*irow - frac, wrk->fxy_scale);
+ assert(v >= 0 && v <= 255);
+ *dst++ = v;
+ *irow++ = frac; // new fractional start
+ }
+ } else {
+ if (x_out_max >= 4) {
+ __asm__ volatile (
+ "li %[temp3], 0x10000 \n\t"
+ "li %[temp4], 0x8000 \n\t"
+ "addu %[loop_end], %[irow], %[temp6] \n\t"
+ "1: \n\t"
+ "lw %[temp0], 0(%[irow]) \n\t"
+ "lw %[temp1], 4(%[irow]) \n\t"
+ "lw %[temp2], 8(%[irow]) \n\t"
+ "lw %[temp5], 12(%[irow]) \n\t"
+ "addiu %[dst], %[dst], 4 \n\t"
+ "addiu %[irow], %[irow], 16 \n\t"
+ "mult $ac0, %[temp3], %[temp4] \n\t"
+ "maddu $ac0, %[temp0], %[temp7] \n\t"
+ "mult $ac1, %[temp3], %[temp4] \n\t"
+ "maddu $ac1, %[temp1], %[temp7] \n\t"
+ "mult $ac2, %[temp3], %[temp4] \n\t"
+ "maddu $ac2, %[temp2], %[temp7] \n\t"
+ "mult $ac3, %[temp3], %[temp4] \n\t"
+ "maddu $ac3, %[temp5], %[temp7] \n\t"
+ "mfhi %[temp0], $ac0 \n\t"
+ "mfhi %[temp1], $ac1 \n\t"
+ "mfhi %[temp2], $ac2 \n\t"
+ "mfhi %[temp5], $ac3 \n\t"
+ "sw $zero, -16(%[irow]) \n\t"
+ "sw $zero, -12(%[irow]) \n\t"
+ "sw $zero, -8(%[irow]) \n\t"
+ "sw $zero, -4(%[irow]) \n\t"
+ "sb %[temp0], -4(%[dst]) \n\t"
+ "sb %[temp1], -3(%[dst]) \n\t"
+ "sb %[temp2], -2(%[dst]) \n\t"
+ "sb %[temp5], -1(%[dst]) \n\t"
+ "bne %[irow], %[loop_end], 1b \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
+ [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [irow]"+r"(irow),
+ [dst]"+r"(dst), [loop_end]"=&r"(loop_end), [temp2]"=&r"(temp2)
+ : [temp7]"r"(temp7), [temp6]"r"(temp6)
+ : "memory", "hi", "lo", "$ac1hi", "$ac1lo",
+ "$ac2hi", "$ac2lo", "$ac3hi", "$ac3lo"
+ );
+ }
+ for (i = 0; i < (x_out_max & 0x3); ++i) {
+ const int v = (int)MULT_FIX(*irow, wrk->fxy_scale);
+ assert(v >= 0 && v <= 255);
+ *dst++ = v;
+ *irow++ = 0;
+ }
+ }
+}
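Editor's note: this is the same rounder-seeded multiply-accumulate idiom as the MIPS32 version, but the DSP R2 ASE provides four accumulators ($ac0..$ac3), so each iteration keeps four MULT_FIX computations in flight and batches the mfhi reads, producing four output bytes per loop.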
+
+static void ExportRowExpand(WebPRescaler* const wrk) {
+ int i;
+ uint8_t* dst = wrk->dst;
+ rescaler_t* irow = wrk->irow;
+ const int x_out_max = wrk->dst_width * wrk->num_channels;
+ const rescaler_t* frow = wrk->frow;
+ int temp0, temp1, temp2, temp3, temp4, temp5, loop_end;
+ const int temp6 = (x_out_max & ~0x3) << 2;
+ const int temp7 = (int)wrk->fy_scale;
+ assert(!WebPRescalerOutputDone(wrk));
+ assert(wrk->y_accum <= 0);
+ assert(wrk->y_expand);
+ assert(wrk->y_sub != 0);
+ if (wrk->y_accum == 0) {
+ if (x_out_max >= 4) {
+ __asm__ volatile (
+ "li %[temp4], 0x10000 \n\t"
+ "li %[temp5], 0x8000 \n\t"
+ "addu %[loop_end], %[frow], %[temp6] \n\t"
+ "1: \n\t"
+ "lw %[temp0], 0(%[frow]) \n\t"
+ "lw %[temp1], 4(%[frow]) \n\t"
+ "lw %[temp2], 8(%[frow]) \n\t"
+ "lw %[temp3], 12(%[frow]) \n\t"
+ "addiu %[dst], %[dst], 4 \n\t"
+ "addiu %[frow], %[frow], 16 \n\t"
+ "mult $ac0, %[temp4], %[temp5] \n\t"
+ "maddu $ac0, %[temp0], %[temp7] \n\t"
+ "mult $ac1, %[temp4], %[temp5] \n\t"
+ "maddu $ac1, %[temp1], %[temp7] \n\t"
+ "mult $ac2, %[temp4], %[temp5] \n\t"
+ "maddu $ac2, %[temp2], %[temp7] \n\t"
+ "mult $ac3, %[temp4], %[temp5] \n\t"
+ "maddu $ac3, %[temp3], %[temp7] \n\t"
+ "mfhi %[temp0], $ac0 \n\t"
+ "mfhi %[temp1], $ac1 \n\t"
+ "mfhi %[temp2], $ac2 \n\t"
+ "mfhi %[temp3], $ac3 \n\t"
+ "sb %[temp0], -4(%[dst]) \n\t"
+ "sb %[temp1], -3(%[dst]) \n\t"
+ "sb %[temp2], -2(%[dst]) \n\t"
+ "sb %[temp3], -1(%[dst]) \n\t"
+ "bne %[frow], %[loop_end], 1b \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
+ [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [frow]"+r"(frow),
+ [dst]"+r"(dst), [loop_end]"=&r"(loop_end), [temp2]"=&r"(temp2)
+ : [temp7]"r"(temp7), [temp6]"r"(temp6)
+ : "memory", "hi", "lo", "$ac1hi", "$ac1lo",
+ "$ac2hi", "$ac2lo", "$ac3hi", "$ac3lo"
+ );
+ }
+ for (i = 0; i < (x_out_max & 0x3); ++i) {
+ const uint32_t J = *frow++;
+ const int v = (int)MULT_FIX(J, wrk->fy_scale);
+ assert(v >= 0 && v <= 255);
+ *dst++ = v;
+ }
+ } else {
+ const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub);
+ const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B);
+ if (x_out_max >= 4) {
+ int temp8, temp9, temp10, temp11;
+ __asm__ volatile (
+ "li %[temp8], 0x10000 \n\t"
+ "li %[temp9], 0x8000 \n\t"
+ "addu %[loop_end], %[frow], %[temp6] \n\t"
+ "1: \n\t"
+ "lw %[temp0], 0(%[frow]) \n\t"
+ "lw %[temp1], 4(%[frow]) \n\t"
+ "lw %[temp2], 8(%[frow]) \n\t"
+ "lw %[temp3], 12(%[frow]) \n\t"
+ "lw %[temp4], 0(%[irow]) \n\t"
+ "lw %[temp5], 4(%[irow]) \n\t"
+ "lw %[temp10], 8(%[irow]) \n\t"
+ "lw %[temp11], 12(%[irow]) \n\t"
+ "addiu %[dst], %[dst], 4 \n\t"
+ "mult $ac0, %[temp8], %[temp9] \n\t"
+ "maddu $ac0, %[A], %[temp0] \n\t"
+ "maddu $ac0, %[B], %[temp4] \n\t"
+ "mult $ac1, %[temp8], %[temp9] \n\t"
+ "maddu $ac1, %[A], %[temp1] \n\t"
+ "maddu $ac1, %[B], %[temp5] \n\t"
+ "mult $ac2, %[temp8], %[temp9] \n\t"
+ "maddu $ac2, %[A], %[temp2] \n\t"
+ "maddu $ac2, %[B], %[temp10] \n\t"
+ "mult $ac3, %[temp8], %[temp9] \n\t"
+ "maddu $ac3, %[A], %[temp3] \n\t"
+ "maddu $ac3, %[B], %[temp11] \n\t"
+ "addiu %[frow], %[frow], 16 \n\t"
+ "addiu %[irow], %[irow], 16 \n\t"
+ "mfhi %[temp0], $ac0 \n\t"
+ "mfhi %[temp1], $ac1 \n\t"
+ "mfhi %[temp2], $ac2 \n\t"
+ "mfhi %[temp3], $ac3 \n\t"
+ "mult $ac0, %[temp8], %[temp9] \n\t"
+ "maddu $ac0, %[temp0], %[temp7] \n\t"
+ "mult $ac1, %[temp8], %[temp9] \n\t"
+ "maddu $ac1, %[temp1], %[temp7] \n\t"
+ "mult $ac2, %[temp8], %[temp9] \n\t"
+ "maddu $ac2, %[temp2], %[temp7] \n\t"
+ "mult $ac3, %[temp8], %[temp9] \n\t"
+ "maddu $ac3, %[temp3], %[temp7] \n\t"
+ "mfhi %[temp0], $ac0 \n\t"
+ "mfhi %[temp1], $ac1 \n\t"
+ "mfhi %[temp2], $ac2 \n\t"
+ "mfhi %[temp3], $ac3 \n\t"
+ "sb %[temp0], -4(%[dst]) \n\t"
+ "sb %[temp1], -3(%[dst]) \n\t"
+ "sb %[temp2], -2(%[dst]) \n\t"
+ "sb %[temp3], -1(%[dst]) \n\t"
+ "bne %[frow], %[loop_end], 1b \n\t"
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp3]"=&r"(temp3),
+ [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [frow]"+r"(frow),
+ [irow]"+r"(irow), [dst]"+r"(dst), [loop_end]"=&r"(loop_end),
+ [temp8]"=&r"(temp8), [temp9]"=&r"(temp9), [temp10]"=&r"(temp10),
+ [temp11]"=&r"(temp11), [temp2]"=&r"(temp2)
+ : [temp7]"r"(temp7), [temp6]"r"(temp6), [A]"r"(A), [B]"r"(B)
+ : "memory", "hi", "lo", "$ac1hi", "$ac1lo",
+ "$ac2hi", "$ac2lo", "$ac3hi", "$ac3lo"
+ );
+ }
+ for (i = 0; i < (x_out_max & 0x3); ++i) {
+ const uint64_t I = (uint64_t)A * *frow++
+ + (uint64_t)B * *irow++;
+ const uint32_t J = (uint32_t)((I + ROUNDER) >> WEBP_RESCALER_RFIX);
+ const int v = (int)MULT_FIX(J, wrk->fy_scale);
+ assert(v >= 0 && v <= 255);
+ *dst++ = v;
+ }
+ }
+}
+
+#undef MULT_FIX
+#undef ROUNDER
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void WebPRescalerDspInitMIPSdspR2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitMIPSdspR2(void) {
+ WebPRescalerExportRowExpand = ExportRowExpand;
+ WebPRescalerExportRowShrink = ExportRowShrink;
+}
+
+#else // !WEBP_USE_MIPS_DSP_R2
+
+WEBP_DSP_INIT_STUB(WebPRescalerDspInitMIPSdspR2)
+
+#endif // WEBP_USE_MIPS_DSP_R2
diff --git a/src/3rdparty/libwebp/src/dsp/rescaler_neon.c b/src/3rdparty/libwebp/src/dsp/rescaler_neon.c
new file mode 100644
index 0000000..16fd450
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/rescaler_neon.c
@@ -0,0 +1,186 @@
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// NEON version of rescaling functions
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_NEON)
+
+#include <arm_neon.h>
+#include <assert.h>
+#include "./neon.h"
+#include "../utils/rescaler.h"
+
+#define ROUNDER (WEBP_RESCALER_ONE >> 1)
+#define MULT_FIX_C(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
+
+#define LOAD_32x4(SRC, DST) const uint32x4_t DST = vld1q_u32((SRC))
+#define LOAD_32x8(SRC, DST0, DST1) \
+ LOAD_32x4(SRC + 0, DST0); \
+ LOAD_32x4(SRC + 4, DST1)
+
+#define STORE_32x8(SRC0, SRC1, DST) do { \
+ vst1q_u32((DST) + 0, SRC0); \
+ vst1q_u32((DST) + 4, SRC1); \
+} while (0)
+
+#if (WEBP_RESCALER_RFIX == 32)
+#define MAKE_HALF_CST(C) vdupq_n_s32((int32_t)((C) >> 1))
+#define MULT_FIX(A, B) /* note: B is actually scale>>1. See MAKE_HALF_CST */ \
+ vreinterpretq_u32_s32(vqrdmulhq_s32(vreinterpretq_s32_u32((A)), (B)))
+#else
+#error "MULT_FIX/WEBP_RESCALER_RFIX need some more work"
+#endif
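Editor's note (worked equation): vqrdmulhq_s32(a, b) yields (2*a*b + 2^31) >> 32 per lane, so with b = scale >> 1 from MAKE_HALF_CST it computes (a * (scale & ~1u) + 2^31) >> 32, matching MULT_FIX_C(a, scale) up to the scale's dropped low bit; that reliance on the fixed 32-bit shift is why other WEBP_RESCALER_RFIX values hit the #error above.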
+
+static uint32x4_t Interpolate(const rescaler_t* const frow,
+ const rescaler_t* const irow,
+ uint32_t A, uint32_t B) {
+ LOAD_32x4(frow, A0);
+ LOAD_32x4(irow, B0);
+ const uint64x2_t C0 = vmull_n_u32(vget_low_u32(A0), A);
+ const uint64x2_t C1 = vmull_n_u32(vget_high_u32(A0), A);
+ const uint64x2_t D0 = vmlal_n_u32(C0, vget_low_u32(B0), B);
+ const uint64x2_t D1 = vmlal_n_u32(C1, vget_high_u32(B0), B);
+ const uint32x4_t E = vcombine_u32(
+ vrshrn_n_u64(D0, WEBP_RESCALER_RFIX),
+ vrshrn_n_u64(D1, WEBP_RESCALER_RFIX));
+ return E;
+}
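Editor's note (worked equation): per 32-bit lane, Interpolate computes (A * frow + B * irow + 2^31) >> 32 via widening multiply-accumulate (vmull_n_u32 / vmlal_n_u32 into 64 bits) and a rounding narrowing shift (vrshrn_n_u64 adds 2^31 before shifting by 32), the exact vector counterpart of the scalar J = (I + ROUNDER) >> WEBP_RESCALER_RFIX path in rescaler.c.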
+
+static void RescalerExportRowExpand(WebPRescaler* const wrk) {
+ int x_out;
+ uint8_t* const dst = wrk->dst;
+ rescaler_t* const irow = wrk->irow;
+ const int x_out_max = wrk->dst_width * wrk->num_channels;
+ const int max_span = x_out_max & ~7;
+ const rescaler_t* const frow = wrk->frow;
+ const uint32_t fy_scale = wrk->fy_scale;
+ const int32x4_t fy_scale_half = MAKE_HALF_CST(fy_scale);
+ assert(!WebPRescalerOutputDone(wrk));
+ assert(wrk->y_accum <= 0);
+ assert(wrk->y_expand);
+ assert(wrk->y_sub != 0);
+ if (wrk->y_accum == 0) {
+ for (x_out = 0; x_out < max_span; x_out += 8) {
+ LOAD_32x4(frow + x_out + 0, A0);
+ LOAD_32x4(frow + x_out + 4, A1);
+ const uint32x4_t B0 = MULT_FIX(A0, fy_scale_half);
+ const uint32x4_t B1 = MULT_FIX(A1, fy_scale_half);
+ const uint16x4_t C0 = vmovn_u32(B0);
+ const uint16x4_t C1 = vmovn_u32(B1);
+ const uint8x8_t D = vmovn_u16(vcombine_u16(C0, C1));
+ vst1_u8(dst + x_out, D);
+ }
+ for (; x_out < x_out_max; ++x_out) {
+ const uint32_t J = frow[x_out];
+ const int v = (int)MULT_FIX_C(J, fy_scale);
+ assert(v >= 0 && v <= 255);
+ dst[x_out] = v;
+ }
+ } else {
+ const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub);
+ const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B);
+ for (x_out = 0; x_out < max_span; x_out += 8) {
+ const uint32x4_t C0 =
+ Interpolate(frow + x_out + 0, irow + x_out + 0, A, B);
+ const uint32x4_t C1 =
+ Interpolate(frow + x_out + 4, irow + x_out + 4, A, B);
+ const uint32x4_t D0 = MULT_FIX(C0, fy_scale_half);
+ const uint32x4_t D1 = MULT_FIX(C1, fy_scale_half);
+ const uint16x4_t E0 = vmovn_u32(D0);
+ const uint16x4_t E1 = vmovn_u32(D1);
+ const uint8x8_t F = vmovn_u16(vcombine_u16(E0, E1));
+ vst1_u8(dst + x_out, F);
+ }
+ for (; x_out < x_out_max; ++x_out) {
+ const uint64_t I = (uint64_t)A * frow[x_out]
+ + (uint64_t)B * irow[x_out];
+ const uint32_t J = (uint32_t)((I + ROUNDER) >> WEBP_RESCALER_RFIX);
+ const int v = (int)MULT_FIX_C(J, fy_scale);
+ assert(v >= 0 && v <= 255);
+ dst[x_out] = v;
+ }
+ }
+}
+
+static void RescalerExportRowShrink(WebPRescaler* const wrk) {
+ int x_out;
+ uint8_t* const dst = wrk->dst;
+ rescaler_t* const irow = wrk->irow;
+ const int x_out_max = wrk->dst_width * wrk->num_channels;
+ const int max_span = x_out_max & ~7;
+ const rescaler_t* const frow = wrk->frow;
+ const uint32_t yscale = wrk->fy_scale * (-wrk->y_accum);
+ const uint32_t fxy_scale = wrk->fxy_scale;
+ const uint32x4_t zero = vdupq_n_u32(0);
+ const int32x4_t yscale_half = MAKE_HALF_CST(yscale);
+ const int32x4_t fxy_scale_half = MAKE_HALF_CST(fxy_scale);
+ assert(!WebPRescalerOutputDone(wrk));
+ assert(wrk->y_accum <= 0);
+ assert(!wrk->y_expand);
+ if (yscale) {
+ for (x_out = 0; x_out < max_span; x_out += 8) {
+ LOAD_32x8(frow + x_out, in0, in1);
+ LOAD_32x8(irow + x_out, in2, in3);
+ const uint32x4_t A0 = MULT_FIX(in0, yscale_half);
+ const uint32x4_t A1 = MULT_FIX(in1, yscale_half);
+ const uint32x4_t B0 = vqsubq_u32(in2, A0);
+ const uint32x4_t B1 = vqsubq_u32(in3, A1);
+ const uint32x4_t C0 = MULT_FIX(B0, fxy_scale_half);
+ const uint32x4_t C1 = MULT_FIX(B1, fxy_scale_half);
+ const uint16x4_t D0 = vmovn_u32(C0);
+ const uint16x4_t D1 = vmovn_u32(C1);
+ const uint8x8_t E = vmovn_u16(vcombine_u16(D0, D1));
+ vst1_u8(dst + x_out, E);
+ STORE_32x8(A0, A1, irow + x_out);
+ }
+ for (; x_out < x_out_max; ++x_out) {
+ const uint32_t frac = (uint32_t)MULT_FIX_C(frow[x_out], yscale);
+ const int v = (int)MULT_FIX_C(irow[x_out] - frac, wrk->fxy_scale);
+ assert(v >= 0 && v <= 255);
+ dst[x_out] = v;
+ irow[x_out] = frac; // new fractional start
+ }
+ } else {
+ for (x_out = 0; x_out < max_span; x_out += 8) {
+ LOAD_32x8(irow + x_out, in0, in1);
+ const uint32x4_t A0 = MULT_FIX(in0, fxy_scale_half);
+ const uint32x4_t A1 = MULT_FIX(in1, fxy_scale_half);
+ const uint16x4_t B0 = vmovn_u32(A0);
+ const uint16x4_t B1 = vmovn_u32(A1);
+ const uint8x8_t C = vmovn_u16(vcombine_u16(B0, B1));
+ vst1_u8(dst + x_out, C);
+ STORE_32x8(zero, zero, irow + x_out);
+ }
+ for (; x_out < x_out_max; ++x_out) {
+ const int v = (int)MULT_FIX_C(irow[x_out], fxy_scale);
+ assert(v >= 0 && v <= 255);
+ dst[x_out] = v;
+ irow[x_out] = 0;
+ }
+ }
+}
+
+//------------------------------------------------------------------------------
+
+extern void WebPRescalerDspInitNEON(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitNEON(void) {
+ WebPRescalerExportRowExpand = RescalerExportRowExpand;
+ WebPRescalerExportRowShrink = RescalerExportRowShrink;
+}
+
+#else // !WEBP_USE_NEON
+
+WEBP_DSP_INIT_STUB(WebPRescalerDspInitNEON)
+
+#endif // WEBP_USE_NEON
diff --git a/src/3rdparty/libwebp/src/dsp/rescaler_sse2.c b/src/3rdparty/libwebp/src/dsp/rescaler_sse2.c
new file mode 100644
index 0000000..5ea4ddb
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/rescaler_sse2.c
@@ -0,0 +1,374 @@
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// SSE2 Rescaling functions
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_SSE2)
+#include <emmintrin.h>
+
+#include <assert.h>
+#include "../utils/rescaler.h"
+
+//------------------------------------------------------------------------------
+// Implementations of critical functions ImportRow / ExportRow
+
+#define ROUNDER (WEBP_RESCALER_ONE >> 1)
+#define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
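+// note: with WEBP_RESCALER_RFIX == 32 this is a round-to-nearest 32.32
+// fixed-point multiply: MULT_FIX(x, y) ~= x * y / 2^32.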
+
+// input: 8 bytes ABCDEFGH -> output: A0E0B0F0C0G0D0H0
+static void LoadTwoPixels(const uint8_t* const src, __m128i* out) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i A = _mm_loadl_epi64((const __m128i*)(src)); // ABCDEFGH
+ const __m128i B = _mm_unpacklo_epi8(A, zero); // A0B0C0D0E0F0G0H0
+ const __m128i C = _mm_srli_si128(B, 8); // E0F0G0H0
+ *out = _mm_unpacklo_epi16(B, C);
+}
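+// note: this A0E0B0F0... interleaving pairs each channel of pixel 0 with
+// the same channel of pixel 1, so that _mm_madd_epi16 against
+// mult = ((x_add - accum) << 16) | accum evaluates the per-channel blend
+// accum * cur + (x_add - accum) * next in a single instruction.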
+
+// input: 8 bytes ABCDEFGH -> output: A0B0C0D0E0F0G0H0
+static void LoadEightPixels(const uint8_t* const src, __m128i* out) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i A = _mm_loadl_epi64((const __m128i*)(src)); // ABCDEFGH
+ *out = _mm_unpacklo_epi8(A, zero);
+}
+
+static void RescalerImportRowExpandSSE2(WebPRescaler* const wrk,
+ const uint8_t* src) {
+ rescaler_t* frow = wrk->frow;
+ const rescaler_t* const frow_end = frow + wrk->dst_width * wrk->num_channels;
+ const int x_add = wrk->x_add;
+ int accum = x_add;
+ __m128i cur_pixels;
+
+ assert(!WebPRescalerInputDone(wrk));
+ assert(wrk->x_expand);
+ if (wrk->num_channels == 4) {
+ if (wrk->src_width < 2) {
+ WebPRescalerImportRowExpandC(wrk, src);
+ return;
+ }
+ LoadTwoPixels(src, &cur_pixels);
+ src += 4;
+ while (1) {
+ const __m128i mult = _mm_set1_epi32(((x_add - accum) << 16) | accum);
+ const __m128i out = _mm_madd_epi16(cur_pixels, mult);
+ _mm_storeu_si128((__m128i*)frow, out);
+ frow += 4;
+ if (frow >= frow_end) break;
+ accum -= wrk->x_sub;
+ if (accum < 0) {
+ LoadTwoPixels(src, &cur_pixels);
+ src += 4;
+ accum += x_add;
+ }
+ }
+ } else {
+ int left;
+ const uint8_t* const src_limit = src + wrk->src_width - 8;
+ if (wrk->src_width < 8) {
+ WebPRescalerImportRowExpandC(wrk, src);
+ return;
+ }
+    LoadEightPixels(src, &cur_pixels);
+ src += 7;
+ left = 7;
+ while (1) {
+ const __m128i mult = _mm_cvtsi32_si128(((x_add - accum) << 16) | accum);
+ const __m128i out = _mm_madd_epi16(cur_pixels, mult);
+ assert(sizeof(*frow) == sizeof(uint32_t));
+ WebPUint32ToMem((uint8_t*)frow, _mm_cvtsi128_si32(out));
+ frow += 1;
+ if (frow >= frow_end) break;
+ accum -= wrk->x_sub;
+ if (accum < 0) {
+ if (--left) {
+ cur_pixels = _mm_srli_si128(cur_pixels, 2);
+ } else if (src <= src_limit) {
+        LoadEightPixels(src, &cur_pixels);
+ src += 7;
+ left = 7;
+ } else { // tail
+ cur_pixels = _mm_srli_si128(cur_pixels, 2);
+ cur_pixels = _mm_insert_epi16(cur_pixels, src[1], 1);
+ src += 1;
+ left = 1;
+ }
+ accum += x_add;
+ }
+ }
+ }
+ assert(accum == 0);
+}
+
+static void RescalerImportRowShrinkSSE2(WebPRescaler* const wrk,
+ const uint8_t* src) {
+ const int x_sub = wrk->x_sub;
+ int accum = 0;
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i mult0 = _mm_set1_epi16(x_sub);
+ const __m128i mult1 = _mm_set1_epi32(wrk->fx_scale);
+ const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER);
+ __m128i sum = zero;
+ rescaler_t* frow = wrk->frow;
+ const rescaler_t* const frow_end = wrk->frow + 4 * wrk->dst_width;
+
+ if (wrk->num_channels != 4 || wrk->x_add > (x_sub << 7)) {
+ WebPRescalerImportRowShrinkC(wrk, src);
+ return;
+ }
+ assert(!WebPRescalerInputDone(wrk));
+ assert(!wrk->x_expand);
+
+ for (; frow < frow_end; frow += 4) {
+ __m128i base = zero;
+ accum += wrk->x_add;
+ while (accum > 0) {
+ const __m128i A = _mm_cvtsi32_si128(WebPMemToUint32(src));
+ src += 4;
+ base = _mm_unpacklo_epi8(A, zero);
+ // To avoid overflow, we need: base * x_add / x_sub < 32768
+ // => x_add < x_sub << 7. That's a 1/128 reduction ratio limit.
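+      // (with base <= 255 and the x_add <= x_sub * 128 early-out above,
+      //  the running sum stays below 255 * 128 = 32640 < 32768.)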
+ sum = _mm_add_epi16(sum, base);
+ accum -= x_sub;
+ }
+ { // Emit next horizontal pixel.
+ const __m128i mult = _mm_set1_epi16(-accum);
+ const __m128i frac0 = _mm_mullo_epi16(base, mult); // 16b x 16b -> 32b
+ const __m128i frac1 = _mm_mulhi_epu16(base, mult);
+ const __m128i frac = _mm_unpacklo_epi16(frac0, frac1); // frac is 32b
+ const __m128i A0 = _mm_mullo_epi16(sum, mult0);
+ const __m128i A1 = _mm_mulhi_epu16(sum, mult0);
+ const __m128i B0 = _mm_unpacklo_epi16(A0, A1); // sum * x_sub
+ const __m128i frow_out = _mm_sub_epi32(B0, frac); // sum * x_sub - frac
+ const __m128i D0 = _mm_srli_epi64(frac, 32);
+ const __m128i D1 = _mm_mul_epu32(frac, mult1); // 32b x 16b -> 64b
+ const __m128i D2 = _mm_mul_epu32(D0, mult1);
+ const __m128i E1 = _mm_add_epi64(D1, rounder);
+ const __m128i E2 = _mm_add_epi64(D2, rounder);
+ const __m128i F1 = _mm_shuffle_epi32(E1, 1 | (3 << 2));
+ const __m128i F2 = _mm_shuffle_epi32(E2, 1 | (3 << 2));
+ const __m128i G = _mm_unpacklo_epi32(F1, F2);
+ sum = _mm_packs_epi32(G, zero);
+ _mm_storeu_si128((__m128i*)frow, frow_out);
+ }
+ }
+ assert(accum == 0);
+}
+
+//------------------------------------------------------------------------------
+// Row export
+
+// load 8 x 32-bit values from *src, dispatch them into even epi64 lanes,
+// multiply by mult (when non-NULL) and store the results in [out0 ... out3]
+static WEBP_INLINE void LoadDispatchAndMult(const rescaler_t* const src,
+ const __m128i* const mult,
+ __m128i* const out0,
+ __m128i* const out1,
+ __m128i* const out2,
+ __m128i* const out3) {
+ const __m128i A0 = _mm_loadu_si128((const __m128i*)(src + 0));
+ const __m128i A1 = _mm_loadu_si128((const __m128i*)(src + 4));
+ const __m128i A2 = _mm_srli_epi64(A0, 32);
+ const __m128i A3 = _mm_srli_epi64(A1, 32);
+ if (mult != NULL) {
+ *out0 = _mm_mul_epu32(A0, *mult);
+ *out1 = _mm_mul_epu32(A1, *mult);
+ *out2 = _mm_mul_epu32(A2, *mult);
+ *out3 = _mm_mul_epu32(A3, *mult);
+ } else {
+ *out0 = A0;
+ *out1 = A1;
+ *out2 = A2;
+ *out3 = A3;
+ }
+}
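+// note: _mm_mul_epu32 only multiplies the even 32-bit lanes, hence the
+// split into A0/A1 (even lanes of each 4-lane load) and A2/A3 (odd lanes
+// shifted down); each output register then holds two 64-bit products.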
+
+static WEBP_INLINE void ProcessRow(const __m128i* const A0,
+ const __m128i* const A1,
+ const __m128i* const A2,
+ const __m128i* const A3,
+ const __m128i* const mult,
+ uint8_t* const dst) {
+ const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER);
+ const __m128i mask = _mm_set_epi32(0xffffffffu, 0, 0xffffffffu, 0);
+ const __m128i B0 = _mm_mul_epu32(*A0, *mult);
+ const __m128i B1 = _mm_mul_epu32(*A1, *mult);
+ const __m128i B2 = _mm_mul_epu32(*A2, *mult);
+ const __m128i B3 = _mm_mul_epu32(*A3, *mult);
+ const __m128i C0 = _mm_add_epi64(B0, rounder);
+ const __m128i C1 = _mm_add_epi64(B1, rounder);
+ const __m128i C2 = _mm_add_epi64(B2, rounder);
+ const __m128i C3 = _mm_add_epi64(B3, rounder);
+ const __m128i D0 = _mm_srli_epi64(C0, WEBP_RESCALER_RFIX);
+ const __m128i D1 = _mm_srli_epi64(C1, WEBP_RESCALER_RFIX);
+#if (WEBP_RESCALER_FIX < 32)
+ const __m128i D2 =
+ _mm_and_si128(_mm_slli_epi64(C2, 32 - WEBP_RESCALER_RFIX), mask);
+ const __m128i D3 =
+ _mm_and_si128(_mm_slli_epi64(C3, 32 - WEBP_RESCALER_RFIX), mask);
+#else
+ const __m128i D2 = _mm_and_si128(C2, mask);
+ const __m128i D3 = _mm_and_si128(C3, mask);
+#endif
+ const __m128i E0 = _mm_or_si128(D0, D2);
+ const __m128i E1 = _mm_or_si128(D1, D3);
+ const __m128i F = _mm_packs_epi32(E0, E1);
+ const __m128i G = _mm_packus_epi16(F, F);
+ _mm_storel_epi64((__m128i*)dst, G);
+}
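+// note: in ProcessRow, D0/D1 keep the rounded lanes 0/2 in the low 32 bits
+// of each 64-bit half, while D2/D3 carry lanes 1/3 already positioned in
+// the high 32 bits; the OR re-interleaves all four 32-bit results before
+// the final 32 -> 16 -> 8 bit packing.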
+
+static void RescalerExportRowExpandSSE2(WebPRescaler* const wrk) {
+ int x_out;
+ uint8_t* const dst = wrk->dst;
+ rescaler_t* const irow = wrk->irow;
+ const int x_out_max = wrk->dst_width * wrk->num_channels;
+ const rescaler_t* const frow = wrk->frow;
+ const __m128i mult = _mm_set_epi32(0, wrk->fy_scale, 0, wrk->fy_scale);
+
+ assert(!WebPRescalerOutputDone(wrk));
+ assert(wrk->y_accum <= 0 && wrk->y_sub + wrk->y_accum >= 0);
+ assert(wrk->y_expand);
+ if (wrk->y_accum == 0) {
+ for (x_out = 0; x_out + 8 <= x_out_max; x_out += 8) {
+ __m128i A0, A1, A2, A3;
+ LoadDispatchAndMult(frow + x_out, NULL, &A0, &A1, &A2, &A3);
+ ProcessRow(&A0, &A1, &A2, &A3, &mult, dst + x_out);
+ }
+ for (; x_out < x_out_max; ++x_out) {
+ const uint32_t J = frow[x_out];
+ const int v = (int)MULT_FIX(J, wrk->fy_scale);
+ assert(v >= 0 && v <= 255);
+ dst[x_out] = v;
+ }
+ } else {
+ const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub);
+ const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B);
+ const __m128i mA = _mm_set_epi32(0, A, 0, A);
+ const __m128i mB = _mm_set_epi32(0, B, 0, B);
+ const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER);
+ for (x_out = 0; x_out + 8 <= x_out_max; x_out += 8) {
+ __m128i A0, A1, A2, A3, B0, B1, B2, B3;
+ LoadDispatchAndMult(frow + x_out, &mA, &A0, &A1, &A2, &A3);
+ LoadDispatchAndMult(irow + x_out, &mB, &B0, &B1, &B2, &B3);
+ {
+ const __m128i C0 = _mm_add_epi64(A0, B0);
+ const __m128i C1 = _mm_add_epi64(A1, B1);
+ const __m128i C2 = _mm_add_epi64(A2, B2);
+ const __m128i C3 = _mm_add_epi64(A3, B3);
+ const __m128i D0 = _mm_add_epi64(C0, rounder);
+ const __m128i D1 = _mm_add_epi64(C1, rounder);
+ const __m128i D2 = _mm_add_epi64(C2, rounder);
+ const __m128i D3 = _mm_add_epi64(C3, rounder);
+ const __m128i E0 = _mm_srli_epi64(D0, WEBP_RESCALER_RFIX);
+ const __m128i E1 = _mm_srli_epi64(D1, WEBP_RESCALER_RFIX);
+ const __m128i E2 = _mm_srli_epi64(D2, WEBP_RESCALER_RFIX);
+ const __m128i E3 = _mm_srli_epi64(D3, WEBP_RESCALER_RFIX);
+ ProcessRow(&E0, &E1, &E2, &E3, &mult, dst + x_out);
+ }
+ }
+ for (; x_out < x_out_max; ++x_out) {
+ const uint64_t I = (uint64_t)A * frow[x_out]
+ + (uint64_t)B * irow[x_out];
+ const uint32_t J = (uint32_t)((I + ROUNDER) >> WEBP_RESCALER_RFIX);
+ const int v = (int)MULT_FIX(J, wrk->fy_scale);
+ assert(v >= 0 && v <= 255);
+ dst[x_out] = v;
+ }
+ }
+}
+
+static void RescalerExportRowShrinkSSE2(WebPRescaler* const wrk) {
+ int x_out;
+ uint8_t* const dst = wrk->dst;
+ rescaler_t* const irow = wrk->irow;
+ const int x_out_max = wrk->dst_width * wrk->num_channels;
+ const rescaler_t* const frow = wrk->frow;
+ const uint32_t yscale = wrk->fy_scale * (-wrk->y_accum);
+ assert(!WebPRescalerOutputDone(wrk));
+ assert(wrk->y_accum <= 0);
+ assert(!wrk->y_expand);
+ if (yscale) {
+ const int scale_xy = wrk->fxy_scale;
+ const __m128i mult_xy = _mm_set_epi32(0, scale_xy, 0, scale_xy);
+ const __m128i mult_y = _mm_set_epi32(0, yscale, 0, yscale);
+ const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER);
+ for (x_out = 0; x_out + 8 <= x_out_max; x_out += 8) {
+ __m128i A0, A1, A2, A3, B0, B1, B2, B3;
+ LoadDispatchAndMult(irow + x_out, NULL, &A0, &A1, &A2, &A3);
+ LoadDispatchAndMult(frow + x_out, &mult_y, &B0, &B1, &B2, &B3);
+ {
+ const __m128i C0 = _mm_add_epi64(B0, rounder);
+ const __m128i C1 = _mm_add_epi64(B1, rounder);
+ const __m128i C2 = _mm_add_epi64(B2, rounder);
+ const __m128i C3 = _mm_add_epi64(B3, rounder);
+ const __m128i D0 = _mm_srli_epi64(C0, WEBP_RESCALER_RFIX); // = frac
+ const __m128i D1 = _mm_srli_epi64(C1, WEBP_RESCALER_RFIX);
+ const __m128i D2 = _mm_srli_epi64(C2, WEBP_RESCALER_RFIX);
+ const __m128i D3 = _mm_srli_epi64(C3, WEBP_RESCALER_RFIX);
+ const __m128i E0 = _mm_sub_epi64(A0, D0); // irow[x] - frac
+ const __m128i E1 = _mm_sub_epi64(A1, D1);
+ const __m128i E2 = _mm_sub_epi64(A2, D2);
+ const __m128i E3 = _mm_sub_epi64(A3, D3);
+ const __m128i F2 = _mm_slli_epi64(D2, 32);
+ const __m128i F3 = _mm_slli_epi64(D3, 32);
+ const __m128i G0 = _mm_or_si128(D0, F2);
+ const __m128i G1 = _mm_or_si128(D1, F3);
+ _mm_storeu_si128((__m128i*)(irow + x_out + 0), G0);
+ _mm_storeu_si128((__m128i*)(irow + x_out + 4), G1);
+ ProcessRow(&E0, &E1, &E2, &E3, &mult_xy, dst + x_out);
+ }
+ }
+ for (; x_out < x_out_max; ++x_out) {
+      const uint32_t frac = (uint32_t)MULT_FIX(frow[x_out], yscale);
+ const int v = (int)MULT_FIX(irow[x_out] - frac, wrk->fxy_scale);
+ assert(v >= 0 && v <= 255);
+ dst[x_out] = v;
+ irow[x_out] = frac; // new fractional start
+ }
+ } else {
+ const uint32_t scale = wrk->fxy_scale;
+ const __m128i mult = _mm_set_epi32(0, scale, 0, scale);
+ const __m128i zero = _mm_setzero_si128();
+ for (x_out = 0; x_out + 8 <= x_out_max; x_out += 8) {
+ __m128i A0, A1, A2, A3;
+ LoadDispatchAndMult(irow + x_out, NULL, &A0, &A1, &A2, &A3);
+ _mm_storeu_si128((__m128i*)(irow + x_out + 0), zero);
+ _mm_storeu_si128((__m128i*)(irow + x_out + 4), zero);
+ ProcessRow(&A0, &A1, &A2, &A3, &mult, dst + x_out);
+ }
+ for (; x_out < x_out_max; ++x_out) {
+ const int v = (int)MULT_FIX(irow[x_out], scale);
+ assert(v >= 0 && v <= 255);
+ dst[x_out] = v;
+ irow[x_out] = 0;
+ }
+ }
+}
+
+#undef MULT_FIX
+#undef ROUNDER
+
+//------------------------------------------------------------------------------
+
+extern void WebPRescalerDspInitSSE2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitSSE2(void) {
+ WebPRescalerImportRowExpand = RescalerImportRowExpandSSE2;
+ WebPRescalerImportRowShrink = RescalerImportRowShrinkSSE2;
+ WebPRescalerExportRowExpand = RescalerExportRowExpandSSE2;
+ WebPRescalerExportRowShrink = RescalerExportRowShrinkSSE2;
+}
+
+#else // !WEBP_USE_SSE2
+
+WEBP_DSP_INIT_STUB(WebPRescalerDspInitSSE2)
+
+#endif // WEBP_USE_SSE2
diff --git a/src/3rdparty/libwebp/src/dsp/upsampling.c b/src/3rdparty/libwebp/src/dsp/upsampling.c
index 53c68d5..651274f 100644
--- a/src/3rdparty/libwebp/src/dsp/upsampling.c
+++ b/src/3rdparty/libwebp/src/dsp/upsampling.c
@@ -153,46 +153,73 @@ WebPUpsampleLinePairFunc WebPGetLinePairConverter(int alpha_is_last) {
// YUV444 converter
#define YUV444_FUNC(FUNC_NAME, FUNC, XSTEP) \
-static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \
- uint8_t* dst, int len) { \
+extern void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \
+ uint8_t* dst, int len); \
+void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \
+ uint8_t* dst, int len) { \
int i; \
for (i = 0; i < len; ++i) FUNC(y[i], u[i], v[i], &dst[i * XSTEP]); \
}
-YUV444_FUNC(Yuv444ToRgb, VP8YuvToRgb, 3)
-YUV444_FUNC(Yuv444ToBgr, VP8YuvToBgr, 3)
-YUV444_FUNC(Yuv444ToRgba, VP8YuvToRgba, 4)
-YUV444_FUNC(Yuv444ToBgra, VP8YuvToBgra, 4)
-YUV444_FUNC(Yuv444ToArgb, VP8YuvToArgb, 4)
-YUV444_FUNC(Yuv444ToRgba4444, VP8YuvToRgba4444, 2)
-YUV444_FUNC(Yuv444ToRgb565, VP8YuvToRgb565, 2)
+YUV444_FUNC(WebPYuv444ToRgbC, VP8YuvToRgb, 3)
+YUV444_FUNC(WebPYuv444ToBgrC, VP8YuvToBgr, 3)
+YUV444_FUNC(WebPYuv444ToRgbaC, VP8YuvToRgba, 4)
+YUV444_FUNC(WebPYuv444ToBgraC, VP8YuvToBgra, 4)
+YUV444_FUNC(WebPYuv444ToArgbC, VP8YuvToArgb, 4)
+YUV444_FUNC(WebPYuv444ToRgba4444C, VP8YuvToRgba4444, 2)
+YUV444_FUNC(WebPYuv444ToRgb565C, VP8YuvToRgb565, 2)
#undef YUV444_FUNC
-const WebPYUV444Converter WebPYUV444Converters[MODE_LAST] = {
- Yuv444ToRgb, // MODE_RGB
- Yuv444ToRgba, // MODE_RGBA
- Yuv444ToBgr, // MODE_BGR
- Yuv444ToBgra, // MODE_BGRA
- Yuv444ToArgb, // MODE_ARGB
- Yuv444ToRgba4444, // MODE_RGBA_4444
- Yuv444ToRgb565, // MODE_RGB_565
- Yuv444ToRgba, // MODE_rgbA
- Yuv444ToBgra, // MODE_bgrA
- Yuv444ToArgb, // MODE_Argb
- Yuv444ToRgba4444 // MODE_rgbA_4444
-};
+WebPYUV444Converter WebPYUV444Converters[MODE_LAST];
+
+extern void WebPInitYUV444ConvertersMIPSdspR2(void);
+extern void WebPInitYUV444ConvertersSSE2(void);
+
+static volatile VP8CPUInfo upsampling_last_cpuinfo_used1 =
+ (VP8CPUInfo)&upsampling_last_cpuinfo_used1;
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitYUV444Converters(void) {
+ if (upsampling_last_cpuinfo_used1 == VP8GetCPUInfo) return;
+
+ WebPYUV444Converters[MODE_RGB] = WebPYuv444ToRgbC;
+ WebPYUV444Converters[MODE_RGBA] = WebPYuv444ToRgbaC;
+ WebPYUV444Converters[MODE_BGR] = WebPYuv444ToBgrC;
+ WebPYUV444Converters[MODE_BGRA] = WebPYuv444ToBgraC;
+ WebPYUV444Converters[MODE_ARGB] = WebPYuv444ToArgbC;
+ WebPYUV444Converters[MODE_RGBA_4444] = WebPYuv444ToRgba4444C;
+ WebPYUV444Converters[MODE_RGB_565] = WebPYuv444ToRgb565C;
+ WebPYUV444Converters[MODE_rgbA] = WebPYuv444ToRgbaC;
+ WebPYUV444Converters[MODE_bgrA] = WebPYuv444ToBgraC;
+ WebPYUV444Converters[MODE_Argb] = WebPYuv444ToArgbC;
+ WebPYUV444Converters[MODE_rgbA_4444] = WebPYuv444ToRgba4444C;
+
+ if (VP8GetCPUInfo != NULL) {
+#if defined(WEBP_USE_SSE2)
+ if (VP8GetCPUInfo(kSSE2)) {
+ WebPInitYUV444ConvertersSSE2();
+ }
+#endif
+#if defined(WEBP_USE_MIPS_DSP_R2)
+ if (VP8GetCPUInfo(kMIPSdspR2)) {
+ WebPInitYUV444ConvertersMIPSdspR2();
+ }
+#endif
+ }
+ upsampling_last_cpuinfo_used1 = VP8GetCPUInfo;
+}
//------------------------------------------------------------------------------
// Main calls
extern void WebPInitUpsamplersSSE2(void);
extern void WebPInitUpsamplersNEON(void);
+extern void WebPInitUpsamplersMIPSdspR2(void);
static volatile VP8CPUInfo upsampling_last_cpuinfo_used2 =
(VP8CPUInfo)&upsampling_last_cpuinfo_used2;
-void WebPInitUpsamplers(void) {
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplers(void) {
if (upsampling_last_cpuinfo_used2 == VP8GetCPUInfo) return;
#ifdef FANCY_UPSAMPLING
@@ -220,6 +247,11 @@ void WebPInitUpsamplers(void) {
WebPInitUpsamplersNEON();
}
#endif
+#if defined(WEBP_USE_MIPS_DSP_R2)
+ if (VP8GetCPUInfo(kMIPSdspR2)) {
+ WebPInitUpsamplersMIPSdspR2();
+ }
+#endif
}
#endif // FANCY_UPSAMPLING
upsampling_last_cpuinfo_used2 = VP8GetCPUInfo;
diff --git a/src/3rdparty/libwebp/src/dsp/upsampling_mips_dsp_r2.c b/src/3rdparty/libwebp/src/dsp/upsampling_mips_dsp_r2.c
new file mode 100644
index 0000000..d4ccbe0
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/upsampling_mips_dsp_r2.c
@@ -0,0 +1,284 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// YUV to RGB upsampling functions.
+//
+// Author(s): Branimir Vasic (branimir.vasic@imgtec.com)
+// Djordje Pesut (djordje.pesut@imgtec.com)
+
+#include "./dsp.h"
+
+// Code is disabled for now, in favor of the plain-C version
+// TODO(djordje.pesut): adapt the code to reflect the C-version.
+#if 0 // defined(WEBP_USE_MIPS_DSP_R2)
+
+#include <assert.h>
+#include "./yuv.h"
+
+#if !defined(WEBP_YUV_USE_TABLE)
+
+#define YUV_TO_RGB(Y, U, V, R, G, B) do { \
+ const int t1 = kYScale * Y; \
+ const int t2 = kVToG * V; \
+ R = kVToR * V; \
+ G = kUToG * U; \
+ B = kUToB * U; \
+ R = t1 + R; \
+ G = t1 - G; \
+ B = t1 + B; \
+ R = R + kRCst; \
+ G = G - t2 + kGCst; \
+ B = B + kBCst; \
+ __asm__ volatile ( \
+ "shll_s.w %[" #R "], %[" #R "], 9 \n\t" \
+ "shll_s.w %[" #G "], %[" #G "], 9 \n\t" \
+ "shll_s.w %[" #B "], %[" #B "], 9 \n\t" \
+ "precrqu_s.qb.ph %[" #R "], %[" #R "], $zero \n\t" \
+ "precrqu_s.qb.ph %[" #G "], %[" #G "], $zero \n\t" \
+ "precrqu_s.qb.ph %[" #B "], %[" #B "], $zero \n\t" \
+ "srl %[" #R "], %[" #R "], 24 \n\t" \
+ "srl %[" #G "], %[" #G "], 24 \n\t" \
+ "srl %[" #B "], %[" #B "], 24 \n\t" \
+ : [R]"+r"(R), [G]"+r"(G), [B]"+r"(B) \
+ : \
+ ); \
+ } while (0)
+
+static WEBP_INLINE void YuvToRgb(int y, int u, int v, uint8_t* const rgb) {
+ int r, g, b;
+ YUV_TO_RGB(y, u, v, r, g, b);
+ rgb[0] = r;
+ rgb[1] = g;
+ rgb[2] = b;
+}
+static WEBP_INLINE void YuvToBgr(int y, int u, int v, uint8_t* const bgr) {
+ int r, g, b;
+ YUV_TO_RGB(y, u, v, r, g, b);
+ bgr[0] = b;
+ bgr[1] = g;
+ bgr[2] = r;
+}
+static WEBP_INLINE void YuvToRgb565(int y, int u, int v, uint8_t* const rgb) {
+ int r, g, b;
+ YUV_TO_RGB(y, u, v, r, g, b);
+ {
+ const int rg = (r & 0xf8) | (g >> 5);
+ const int gb = ((g << 3) & 0xe0) | (b >> 3);
+#ifdef WEBP_SWAP_16BIT_CSP
+ rgb[0] = gb;
+ rgb[1] = rg;
+#else
+ rgb[0] = rg;
+ rgb[1] = gb;
+#endif
+ }
+}
+static WEBP_INLINE void YuvToRgba4444(int y, int u, int v,
+ uint8_t* const argb) {
+ int r, g, b;
+ YUV_TO_RGB(y, u, v, r, g, b);
+ {
+ const int rg = (r & 0xf0) | (g >> 4);
+ const int ba = (b & 0xf0) | 0x0f; // overwrite the lower 4 bits
+#ifdef WEBP_SWAP_16BIT_CSP
+ argb[0] = ba;
+ argb[1] = rg;
+#else
+ argb[0] = rg;
+ argb[1] = ba;
+#endif
+ }
+}
+#endif // WEBP_YUV_USE_TABLE
+
+//-----------------------------------------------------------------------------
+// Alpha handling variants
+
+static WEBP_INLINE void YuvToArgb(uint8_t y, uint8_t u, uint8_t v,
+ uint8_t* const argb) {
+ int r, g, b;
+ YUV_TO_RGB(y, u, v, r, g, b);
+ argb[0] = 0xff;
+ argb[1] = r;
+ argb[2] = g;
+ argb[3] = b;
+}
+static WEBP_INLINE void YuvToBgra(uint8_t y, uint8_t u, uint8_t v,
+ uint8_t* const bgra) {
+ int r, g, b;
+ YUV_TO_RGB(y, u, v, r, g, b);
+ bgra[0] = b;
+ bgra[1] = g;
+ bgra[2] = r;
+ bgra[3] = 0xff;
+}
+static WEBP_INLINE void YuvToRgba(uint8_t y, uint8_t u, uint8_t v,
+ uint8_t* const rgba) {
+ int r, g, b;
+ YUV_TO_RGB(y, u, v, r, g, b);
+ rgba[0] = r;
+ rgba[1] = g;
+ rgba[2] = b;
+ rgba[3] = 0xff;
+}
+
+//------------------------------------------------------------------------------
+// Fancy upsampler
+
+#ifdef FANCY_UPSAMPLING
+
+// Given samples laid out in a square as:
+// [a b]
+// [c d]
+// we interpolate u/v as:
+// ([9*a + 3*b + 3*c +   d   3*a + 9*b +   c + 3*d] + [8 8]) / 16
+// ([3*a +   b + 9*c + 3*d     a + 3*b + 3*c + 9*d] + [8 8]) / 16
+
+// We process u and v together stashed into 32bit (16bit each).
+#define LOAD_UV(u, v) ((u) | ((v) << 16))
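+// e.g. LOAD_UV(0x40, 0x80) == 0x00800040; the 0x00020002u terms below bias
+// both 16-bit halves by 2 before their rounding shift.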
+
+#define UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \
+static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \
+ const uint8_t* top_u, const uint8_t* top_v, \
+ const uint8_t* cur_u, const uint8_t* cur_v, \
+ uint8_t* top_dst, uint8_t* bottom_dst, int len) { \
+ int x; \
+ const int last_pixel_pair = (len - 1) >> 1; \
+ uint32_t tl_uv = LOAD_UV(top_u[0], top_v[0]); /* top-left sample */ \
+  uint32_t l_uv = LOAD_UV(cur_u[0], cur_v[0]); /* left sample */ \
+ assert(top_y != NULL); \
+ { \
+ const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2; \
+ FUNC(top_y[0], uv0 & 0xff, (uv0 >> 16), top_dst); \
+ } \
+ if (bottom_y != NULL) { \
+ const uint32_t uv0 = (3 * l_uv + tl_uv + 0x00020002u) >> 2; \
+ FUNC(bottom_y[0], uv0 & 0xff, (uv0 >> 16), bottom_dst); \
+ } \
+ for (x = 1; x <= last_pixel_pair; ++x) { \
+ const uint32_t t_uv = LOAD_UV(top_u[x], top_v[x]); /* top sample */ \
+ const uint32_t uv = LOAD_UV(cur_u[x], cur_v[x]); /* sample */ \
+ /* precompute invariant values associated with first and second diagonals*/\
+ const uint32_t avg = tl_uv + t_uv + l_uv + uv + 0x00080008u; \
+ const uint32_t diag_12 = (avg + 2 * (t_uv + l_uv)) >> 3; \
+ const uint32_t diag_03 = (avg + 2 * (tl_uv + uv)) >> 3; \
+ { \
+ const uint32_t uv0 = (diag_12 + tl_uv) >> 1; \
+ const uint32_t uv1 = (diag_03 + t_uv) >> 1; \
+ FUNC(top_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16), \
+ top_dst + (2 * x - 1) * XSTEP); \
+ FUNC(top_y[2 * x - 0], uv1 & 0xff, (uv1 >> 16), \
+ top_dst + (2 * x - 0) * XSTEP); \
+ } \
+ if (bottom_y != NULL) { \
+ const uint32_t uv0 = (diag_03 + l_uv) >> 1; \
+ const uint32_t uv1 = (diag_12 + uv) >> 1; \
+ FUNC(bottom_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16), \
+ bottom_dst + (2 * x - 1) * XSTEP); \
+ FUNC(bottom_y[2 * x + 0], uv1 & 0xff, (uv1 >> 16), \
+ bottom_dst + (2 * x + 0) * XSTEP); \
+ } \
+ tl_uv = t_uv; \
+ l_uv = uv; \
+ } \
+ if (!(len & 1)) { \
+ { \
+ const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2; \
+ FUNC(top_y[len - 1], uv0 & 0xff, (uv0 >> 16), \
+ top_dst + (len - 1) * XSTEP); \
+ } \
+ if (bottom_y != NULL) { \
+ const uint32_t uv0 = (3 * l_uv + tl_uv + 0x00020002u) >> 2; \
+ FUNC(bottom_y[len - 1], uv0 & 0xff, (uv0 >> 16), \
+ bottom_dst + (len - 1) * XSTEP); \
+ } \
+ } \
+}
+
+// All variants implemented.
+UPSAMPLE_FUNC(UpsampleRgbLinePair, YuvToRgb, 3)
+UPSAMPLE_FUNC(UpsampleBgrLinePair, YuvToBgr, 3)
+UPSAMPLE_FUNC(UpsampleRgbaLinePair, YuvToRgba, 4)
+UPSAMPLE_FUNC(UpsampleBgraLinePair, YuvToBgra, 4)
+UPSAMPLE_FUNC(UpsampleArgbLinePair, YuvToArgb, 4)
+UPSAMPLE_FUNC(UpsampleRgba4444LinePair, YuvToRgba4444, 2)
+UPSAMPLE_FUNC(UpsampleRgb565LinePair, YuvToRgb565, 2)
+
+#undef LOAD_UV
+#undef UPSAMPLE_FUNC
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void WebPInitUpsamplersMIPSdspR2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersMIPSdspR2(void) {
+ WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair;
+ WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair;
+ WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair;
+ WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair;
+ WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair;
+ WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair;
+ WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair;
+ WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair;
+ WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair;
+ WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair;
+ WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair;
+}
+
+#endif // FANCY_UPSAMPLING
+
+//------------------------------------------------------------------------------
+// YUV444 converter
+
+#define YUV444_FUNC(FUNC_NAME, FUNC, XSTEP) \
+static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \
+ uint8_t* dst, int len) { \
+ int i; \
+ for (i = 0; i < len; ++i) FUNC(y[i], u[i], v[i], &dst[i * XSTEP]); \
+}
+
+YUV444_FUNC(Yuv444ToRgb, YuvToRgb, 3)
+YUV444_FUNC(Yuv444ToBgr, YuvToBgr, 3)
+YUV444_FUNC(Yuv444ToRgba, YuvToRgba, 4)
+YUV444_FUNC(Yuv444ToBgra, YuvToBgra, 4)
+YUV444_FUNC(Yuv444ToArgb, YuvToArgb, 4)
+YUV444_FUNC(Yuv444ToRgba4444, YuvToRgba4444, 2)
+YUV444_FUNC(Yuv444ToRgb565, YuvToRgb565, 2)
+
+#undef YUV444_FUNC
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void WebPInitYUV444ConvertersMIPSdspR2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitYUV444ConvertersMIPSdspR2(void) {
+ WebPYUV444Converters[MODE_RGB] = Yuv444ToRgb;
+ WebPYUV444Converters[MODE_RGBA] = Yuv444ToRgba;
+ WebPYUV444Converters[MODE_BGR] = Yuv444ToBgr;
+ WebPYUV444Converters[MODE_BGRA] = Yuv444ToBgra;
+ WebPYUV444Converters[MODE_ARGB] = Yuv444ToArgb;
+ WebPYUV444Converters[MODE_RGBA_4444] = Yuv444ToRgba4444;
+ WebPYUV444Converters[MODE_RGB_565] = Yuv444ToRgb565;
+ WebPYUV444Converters[MODE_rgbA] = Yuv444ToRgba;
+ WebPYUV444Converters[MODE_bgrA] = Yuv444ToBgra;
+ WebPYUV444Converters[MODE_Argb] = Yuv444ToArgb;
+ WebPYUV444Converters[MODE_rgbA_4444] = Yuv444ToRgba4444;
+}
+
+#else // !WEBP_USE_MIPS_DSP_R2
+
+WEBP_DSP_INIT_STUB(WebPInitYUV444ConvertersMIPSdspR2)
+
+#endif // WEBP_USE_MIPS_DSP_R2
+
+#if 1 // !(defined(FANCY_UPSAMPLING) && defined(WEBP_USE_MIPS_DSP_R2))
+WEBP_DSP_INIT_STUB(WebPInitUpsamplersMIPSdspR2)
+#endif
diff --git a/src/3rdparty/libwebp/src/dsp/upsampling_neon.c b/src/3rdparty/libwebp/src/dsp/upsampling_neon.c
index d31ed4d..2b0c99b 100644
--- a/src/3rdparty/libwebp/src/dsp/upsampling_neon.c
+++ b/src/3rdparty/libwebp/src/dsp/upsampling_neon.c
@@ -89,9 +89,11 @@ static void Upsample16Pixels(const uint8_t *r1, const uint8_t *r2,
//-----------------------------------------------------------------------------
// YUV->RGB conversion
-static const int16_t kCoeffs[4] = { kYScale, kVToR, kUToG, kVToG };
+// note: the large constant 33050 is represented as 32768 + 282
+static const int16_t kCoeffs1[4] = { 19077, 26149, 6419, 13320 };
#define v255 vdup_n_u8(255)
+#define v_0x0f vdup_n_u8(15)
#define STORE_Rgb(out, r, g, b) do { \
uint8x8x3_t r_g_b; \
@@ -117,38 +119,67 @@ static const int16_t kCoeffs[4] = { kYScale, kVToR, kUToG, kVToG };
vst4_u8(out, b_g_r_v255); \
} while (0)
-#define CONVERT8(FMT, XSTEP, N, src_y, src_uv, out, cur_x) { \
+#define STORE_Argb(out, r, g, b) do { \
+ uint8x8x4_t v255_r_g_b; \
+ INIT_VECTOR4(v255_r_g_b, v255, r, g, b); \
+ vst4_u8(out, v255_r_g_b); \
+} while (0)
+
+#if !defined(WEBP_SWAP_16BIT_CSP)
+#define ZIP_U8(lo, hi) vzip_u8((lo), (hi))
+#else
+#define ZIP_U8(lo, hi) vzip_u8((hi), (lo))
+#endif
+
+#define STORE_Rgba4444(out, r, g, b) do { \
+ const uint8x8_t r1 = vshl_n_u8(vshr_n_u8(r, 4), 4); /* 4bits */ \
+ const uint8x8_t g1 = vshr_n_u8(g, 4); \
+ const uint8x8_t ba = vorr_u8(b, v_0x0f); \
+ const uint8x8_t rg = vorr_u8(r1, g1); \
+ const uint8x8x2_t rgba4444 = ZIP_U8(rg, ba); \
+ vst1q_u8(out, vcombine_u8(rgba4444.val[0], rgba4444.val[1])); \
+} while (0)
+
+#define STORE_Rgb565(out, r, g, b) do { \
+ const uint8x8_t r1 = vshl_n_u8(vshr_n_u8(r, 3), 3); /* 5bits */ \
+ const uint8x8_t g1 = vshr_n_u8(g, 5); /* upper 3bits */\
+ const uint8x8_t g2 = vshl_n_u8(vshr_n_u8(g, 2), 5); /* lower 3bits */\
+ const uint8x8_t b1 = vshr_n_u8(b, 3); /* 5bits */ \
+ const uint8x8_t rg = vorr_u8(r1, g1); \
+ const uint8x8_t gb = vorr_u8(g2, b1); \
+ const uint8x8x2_t rgb565 = ZIP_U8(rg, gb); \
+ vst1q_u8(out, vcombine_u8(rgb565.val[0], rgb565.val[1])); \
+} while (0)
+
+#define CONVERT8(FMT, XSTEP, N, src_y, src_uv, out, cur_x) do { \
int i; \
for (i = 0; i < N; i += 8) { \
const int off = ((cur_x) + i) * XSTEP; \
- uint8x8_t y = vld1_u8((src_y) + (cur_x) + i); \
- uint8x8_t u = vld1_u8((src_uv) + i); \
- uint8x8_t v = vld1_u8((src_uv) + i + 16); \
- const int16x8_t yy = vreinterpretq_s16_u16(vsubl_u8(y, u16)); \
- const int16x8_t uu = vreinterpretq_s16_u16(vsubl_u8(u, u128)); \
- const int16x8_t vv = vreinterpretq_s16_u16(vsubl_u8(v, u128)); \
- int32x4_t yl = vmull_lane_s16(vget_low_s16(yy), cf16, 0); \
- int32x4_t yh = vmull_lane_s16(vget_high_s16(yy), cf16, 0); \
- const int32x4_t rl = vmlal_lane_s16(yl, vget_low_s16(vv), cf16, 1);\
- const int32x4_t rh = vmlal_lane_s16(yh, vget_high_s16(vv), cf16, 1);\
- int32x4_t gl = vmlsl_lane_s16(yl, vget_low_s16(uu), cf16, 2); \
- int32x4_t gh = vmlsl_lane_s16(yh, vget_high_s16(uu), cf16, 2); \
- const int32x4_t bl = vmovl_s16(vget_low_s16(uu)); \
- const int32x4_t bh = vmovl_s16(vget_high_s16(uu)); \
- gl = vmlsl_lane_s16(gl, vget_low_s16(vv), cf16, 3); \
- gh = vmlsl_lane_s16(gh, vget_high_s16(vv), cf16, 3); \
- yl = vmlaq_lane_s32(yl, bl, cf32, 0); \
- yh = vmlaq_lane_s32(yh, bh, cf32, 0); \
- /* vrshrn_n_s32() already incorporates the rounding constant */ \
- y = vqmovun_s16(vcombine_s16(vrshrn_n_s32(rl, YUV_FIX2), \
- vrshrn_n_s32(rh, YUV_FIX2))); \
- u = vqmovun_s16(vcombine_s16(vrshrn_n_s32(gl, YUV_FIX2), \
- vrshrn_n_s32(gh, YUV_FIX2))); \
- v = vqmovun_s16(vcombine_s16(vrshrn_n_s32(yl, YUV_FIX2), \
- vrshrn_n_s32(yh, YUV_FIX2))); \
- STORE_ ## FMT(out + off, y, u, v); \
+ const uint8x8_t y = vld1_u8((src_y) + (cur_x) + i); \
+ const uint8x8_t u = vld1_u8((src_uv) + i + 0); \
+ const uint8x8_t v = vld1_u8((src_uv) + i + 16); \
+ const int16x8_t Y0 = vreinterpretq_s16_u16(vshll_n_u8(y, 7)); \
+ const int16x8_t U0 = vreinterpretq_s16_u16(vshll_n_u8(u, 7)); \
+ const int16x8_t V0 = vreinterpretq_s16_u16(vshll_n_u8(v, 7)); \
+ const int16x8_t Y1 = vqdmulhq_lane_s16(Y0, coeff1, 0); \
+ const int16x8_t R0 = vqdmulhq_lane_s16(V0, coeff1, 1); \
+ const int16x8_t G0 = vqdmulhq_lane_s16(U0, coeff1, 2); \
+ const int16x8_t G1 = vqdmulhq_lane_s16(V0, coeff1, 3); \
+ const int16x8_t B0 = vqdmulhq_n_s16(U0, 282); \
+ const int16x8_t R1 = vqaddq_s16(Y1, R_Rounder); \
+ const int16x8_t G2 = vqaddq_s16(Y1, G_Rounder); \
+ const int16x8_t B1 = vqaddq_s16(Y1, B_Rounder); \
+ const int16x8_t R2 = vqaddq_s16(R0, R1); \
+ const int16x8_t G3 = vqaddq_s16(G0, G1); \
+ const int16x8_t B2 = vqaddq_s16(B0, B1); \
+ const int16x8_t G4 = vqsubq_s16(G2, G3); \
+ const int16x8_t B3 = vqaddq_s16(B2, U0); \
+ const uint8x8_t R = vqshrun_n_s16(R2, YUV_FIX2); \
+ const uint8x8_t G = vqshrun_n_s16(G4, YUV_FIX2); \
+ const uint8x8_t B = vqshrun_n_s16(B3, YUV_FIX2); \
+ STORE_ ## FMT(out + off, R, G, B); \
} \
-}
+} while (0)
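+
+// note (illustrative): with Y0 = y << 7, vqdmulhq_lane_s16(Y0, coeff1, 0)
+// computes (2 * (y << 7) * 19077) >> 16 == (y * 19077) >> 8, i.e. the
+// scalar MultHi(y, 19077). The U coefficient 33050 does not fit in a
+// signed 16-bit lane, so it is applied as vqdmulhq by 282 plus a direct
+// vqaddq_s16 with U0 (== MultHi(u, 32768)).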
#define CONVERT1(FUNC, XSTEP, N, src_y, src_uv, rgb, cur_x) { \
int i; \
@@ -163,9 +194,9 @@ static const int16_t kCoeffs[4] = { kYScale, kVToR, kUToG, kVToG };
#define CONVERT2RGB_8(FMT, XSTEP, top_y, bottom_y, uv, \
top_dst, bottom_dst, cur_x, len) { \
- CONVERT8(FMT, XSTEP, len, top_y, uv, top_dst, cur_x) \
+ CONVERT8(FMT, XSTEP, len, top_y, uv, top_dst, cur_x); \
if (bottom_y != NULL) { \
- CONVERT8(FMT, XSTEP, len, bottom_y, (uv) + 32, bottom_dst, cur_x) \
+ CONVERT8(FMT, XSTEP, len, bottom_y, (uv) + 32, bottom_dst, cur_x); \
} \
}
@@ -195,10 +226,10 @@ static void FUNC_NAME(const uint8_t *top_y, const uint8_t *bottom_y, \
const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1; \
const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1; \
\
- const int16x4_t cf16 = vld1_s16(kCoeffs); \
- const int32x2_t cf32 = vdup_n_s32(kUToB); \
- const uint8x8_t u16 = vdup_n_u8(16); \
- const uint8x8_t u128 = vdup_n_u8(128); \
+ const int16x4_t coeff1 = vld1_s16(kCoeffs1); \
+ const int16x8_t R_Rounder = vdupq_n_s16(-14234); \
+ const int16x8_t G_Rounder = vdupq_n_s16(8708); \
+ const int16x8_t B_Rounder = vdupq_n_s16(-17685); \
\
/* Treat the first pixel in regular way */ \
assert(top_y != NULL); \
@@ -235,33 +266,35 @@ NEON_UPSAMPLE_FUNC(UpsampleRgbLinePair, Rgb, 3)
NEON_UPSAMPLE_FUNC(UpsampleBgrLinePair, Bgr, 3)
NEON_UPSAMPLE_FUNC(UpsampleRgbaLinePair, Rgba, 4)
NEON_UPSAMPLE_FUNC(UpsampleBgraLinePair, Bgra, 4)
-
-#endif // FANCY_UPSAMPLING
-
-#endif // WEBP_USE_NEON
+NEON_UPSAMPLE_FUNC(UpsampleArgbLinePair, Argb, 4)
+NEON_UPSAMPLE_FUNC(UpsampleRgba4444LinePair, Rgba4444, 2)
+NEON_UPSAMPLE_FUNC(UpsampleRgb565LinePair, Rgb565, 2)
//------------------------------------------------------------------------------
-
-extern void WebPInitUpsamplersNEON(void);
-
-#ifdef FANCY_UPSAMPLING
+// Entry point
extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
-void WebPInitUpsamplersNEON(void) {
-#if defined(WEBP_USE_NEON)
+extern void WebPInitUpsamplersNEON(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersNEON(void) {
WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair;
WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair;
WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair;
WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair;
+ WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair;
WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair;
WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair;
-#endif // WEBP_USE_NEON
+ WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair;
+ WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair;
+ WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair;
+ WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair;
}
-#else
+#endif // FANCY_UPSAMPLING
-// this empty function is to avoid an empty .o
-void WebPInitUpsamplersNEON(void) {}
+#endif // WEBP_USE_NEON
-#endif // FANCY_UPSAMPLING
+#if !(defined(FANCY_UPSAMPLING) && defined(WEBP_USE_NEON))
+WEBP_DSP_INIT_STUB(WebPInitUpsamplersNEON)
+#endif
diff --git a/src/3rdparty/libwebp/src/dsp/upsampling_sse2.c b/src/3rdparty/libwebp/src/dsp/upsampling_sse2.c
index 45cf090..b5b6689 100644
--- a/src/3rdparty/libwebp/src/dsp/upsampling_sse2.c
+++ b/src/3rdparty/libwebp/src/dsp/upsampling_sse2.c
@@ -60,10 +60,10 @@
// Loads 17 pixels each from rows r1 and r2 and generates 32 pixels.
#define UPSAMPLE_32PIXELS(r1, r2, out) { \
const __m128i one = _mm_set1_epi8(1); \
- const __m128i a = _mm_loadu_si128((__m128i*)&(r1)[0]); \
- const __m128i b = _mm_loadu_si128((__m128i*)&(r1)[1]); \
- const __m128i c = _mm_loadu_si128((__m128i*)&(r2)[0]); \
- const __m128i d = _mm_loadu_si128((__m128i*)&(r2)[1]); \
+ const __m128i a = _mm_loadu_si128((const __m128i*)&(r1)[0]); \
+ const __m128i b = _mm_loadu_si128((const __m128i*)&(r1)[1]); \
+ const __m128i c = _mm_loadu_si128((const __m128i*)&(r2)[0]); \
+ const __m128i d = _mm_loadu_si128((const __m128i*)&(r2)[1]); \
\
const __m128i s = _mm_avg_epu8(a, d); /* s = (a + d + 1) / 2 */ \
const __m128i t = _mm_avg_epu8(b, c); /* t = (b + c + 1) / 2 */ \
@@ -173,6 +173,9 @@ SSE2_UPSAMPLE_FUNC(UpsampleRgbLinePair, VP8YuvToRgb, 3)
SSE2_UPSAMPLE_FUNC(UpsampleBgrLinePair, VP8YuvToBgr, 3)
SSE2_UPSAMPLE_FUNC(UpsampleRgbaLinePair, VP8YuvToRgba, 4)
SSE2_UPSAMPLE_FUNC(UpsampleBgraLinePair, VP8YuvToBgra, 4)
+SSE2_UPSAMPLE_FUNC(UpsampleArgbLinePair, VP8YuvToArgb, 4)
+SSE2_UPSAMPLE_FUNC(UpsampleRgba4444LinePair, VP8YuvToRgba4444, 2)
+SSE2_UPSAMPLE_FUNC(UpsampleRgb565LinePair, VP8YuvToRgb565, 2)
#undef GET_M
#undef PACK_AND_STORE
@@ -182,33 +185,65 @@ SSE2_UPSAMPLE_FUNC(UpsampleBgraLinePair, VP8YuvToBgra, 4)
#undef CONVERT2RGB_32
#undef SSE2_UPSAMPLE_FUNC
-#endif // FANCY_UPSAMPLING
-
-#endif // WEBP_USE_SSE2
-
//------------------------------------------------------------------------------
-
-extern void WebPInitUpsamplersSSE2(void);
-
-#ifdef FANCY_UPSAMPLING
+// Entry point
extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
-void WebPInitUpsamplersSSE2(void) {
-#if defined(WEBP_USE_SSE2)
- VP8YUVInitSSE2();
+extern void WebPInitUpsamplersSSE2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersSSE2(void) {
WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair;
WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair;
WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair;
WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair;
+ WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair;
WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair;
WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair;
-#endif // WEBP_USE_SSE2
+ WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair;
+ WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair;
+ WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair;
+ WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair;
+}
+
+#endif // FANCY_UPSAMPLING
+
+//------------------------------------------------------------------------------
+
+extern WebPYUV444Converter WebPYUV444Converters[/* MODE_LAST */];
+extern void WebPInitYUV444ConvertersSSE2(void);
+
+#define YUV444_FUNC(FUNC_NAME, CALL, XSTEP) \
+extern void WebP##FUNC_NAME##C(const uint8_t* y, const uint8_t* u, \
+ const uint8_t* v, uint8_t* dst, int len); \
+static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \
+ uint8_t* dst, int len) { \
+ int i; \
+ const int max_len = len & ~31; \
+ for (i = 0; i < max_len; i += 32) CALL(y + i, u + i, v + i, dst + i * XSTEP);\
+ if (i < len) { /* C-fallback */ \
+ WebP##FUNC_NAME##C(y + i, u + i, v + i, dst + i * XSTEP, len - i); \
+ } \
+}
+
+YUV444_FUNC(Yuv444ToRgba, VP8YuvToRgba32, 4);
+YUV444_FUNC(Yuv444ToBgra, VP8YuvToBgra32, 4);
+YUV444_FUNC(Yuv444ToRgb, VP8YuvToRgb32, 3);
+YUV444_FUNC(Yuv444ToBgr, VP8YuvToBgr32, 3);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitYUV444ConvertersSSE2(void) {
+ WebPYUV444Converters[MODE_RGBA] = Yuv444ToRgba;
+ WebPYUV444Converters[MODE_BGRA] = Yuv444ToBgra;
+ WebPYUV444Converters[MODE_RGB] = Yuv444ToRgb;
+ WebPYUV444Converters[MODE_BGR] = Yuv444ToBgr;
}
#else
-// this empty function is to avoid an empty .o
-void WebPInitUpsamplersSSE2(void) {}
+WEBP_DSP_INIT_STUB(WebPInitYUV444ConvertersSSE2)
-#endif // FANCY_UPSAMPLING
+#endif // WEBP_USE_SSE2
+
+#if !(defined(FANCY_UPSAMPLING) && defined(WEBP_USE_SSE2))
+WEBP_DSP_INIT_STUB(WebPInitUpsamplersSSE2)
+#endif
diff --git a/src/3rdparty/libwebp/src/dsp/yuv.c b/src/3rdparty/libwebp/src/dsp/yuv.c
index 6f422da..f50a253 100644
--- a/src/3rdparty/libwebp/src/dsp/yuv.c
+++ b/src/3rdparty/libwebp/src/dsp/yuv.c
@@ -26,7 +26,7 @@ int32_t VP8kVToG[256], VP8kUToG[256];
uint8_t VP8kClip[YUV_RANGE_MAX - YUV_RANGE_MIN];
uint8_t VP8kClip4Bits[YUV_RANGE_MAX - YUV_RANGE_MIN];
-void VP8YUVInit(void) {
+WEBP_TSAN_IGNORE_FUNCTION void VP8YUVInit(void) {
int i;
if (done) {
return;
@@ -62,7 +62,7 @@ void VP8YUVInit(void) {
#else
-void VP8YUVInit(void) {}
+WEBP_TSAN_IGNORE_FUNCTION void VP8YUVInit(void) {}
#endif // WEBP_YUV_USE_TABLE
@@ -122,11 +122,12 @@ WebPSamplerRowFunc WebPSamplers[MODE_LAST];
extern void WebPInitSamplersSSE2(void);
extern void WebPInitSamplersMIPS32(void);
+extern void WebPInitSamplersMIPSdspR2(void);
static volatile VP8CPUInfo yuv_last_cpuinfo_used =
(VP8CPUInfo)&yuv_last_cpuinfo_used;
-void WebPInitSamplers(void) {
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplers(void) {
if (yuv_last_cpuinfo_used == VP8GetCPUInfo) return;
WebPSamplers[MODE_RGB] = YuvToRgbRow;
@@ -153,8 +154,127 @@ void WebPInitSamplers(void) {
WebPInitSamplersMIPS32();
}
#endif // WEBP_USE_MIPS32
+#if defined(WEBP_USE_MIPS_DSP_R2)
+ if (VP8GetCPUInfo(kMIPSdspR2)) {
+ WebPInitSamplersMIPSdspR2();
+ }
+#endif // WEBP_USE_MIPS_DSP_R2
}
yuv_last_cpuinfo_used = VP8GetCPUInfo;
}
//-----------------------------------------------------------------------------
+// ARGB -> YUV converters
+
+static void ConvertARGBToY(const uint32_t* argb, uint8_t* y, int width) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ const uint32_t p = argb[i];
+ y[i] = VP8RGBToY((p >> 16) & 0xff, (p >> 8) & 0xff, (p >> 0) & 0xff,
+ YUV_HALF);
+ }
+}
+
+void WebPConvertARGBToUV_C(const uint32_t* argb, uint8_t* u, uint8_t* v,
+ int src_width, int do_store) {
+ // No rounding. Last pixel is dealt with separately.
+ const int uv_width = src_width >> 1;
+ int i;
+ for (i = 0; i < uv_width; ++i) {
+ const uint32_t v0 = argb[2 * i + 0];
+ const uint32_t v1 = argb[2 * i + 1];
+ // VP8RGBToU/V expects four accumulated pixels. Hence we need to
+ // scale r/g/b value by a factor 2. We just shift v0/v1 one bit less.
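+    // (e.g. ((v0 >> 15) & 0x1fe) == 2 * ((v0 >> 16) & 0xff), the red byte
+    //  doubled; likewise for g and b.)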
+ const int r = ((v0 >> 15) & 0x1fe) + ((v1 >> 15) & 0x1fe);
+ const int g = ((v0 >> 7) & 0x1fe) + ((v1 >> 7) & 0x1fe);
+ const int b = ((v0 << 1) & 0x1fe) + ((v1 << 1) & 0x1fe);
+ const int tmp_u = VP8RGBToU(r, g, b, YUV_HALF << 2);
+ const int tmp_v = VP8RGBToV(r, g, b, YUV_HALF << 2);
+ if (do_store) {
+ u[i] = tmp_u;
+ v[i] = tmp_v;
+ } else {
+      // Approximated average-of-four, but the difference is acceptable.
+ u[i] = (u[i] + tmp_u + 1) >> 1;
+ v[i] = (v[i] + tmp_v + 1) >> 1;
+ }
+ }
+ if (src_width & 1) { // last pixel
+ const uint32_t v0 = argb[2 * i + 0];
+ const int r = (v0 >> 14) & 0x3fc;
+ const int g = (v0 >> 6) & 0x3fc;
+ const int b = (v0 << 2) & 0x3fc;
+ const int tmp_u = VP8RGBToU(r, g, b, YUV_HALF << 2);
+ const int tmp_v = VP8RGBToV(r, g, b, YUV_HALF << 2);
+ if (do_store) {
+ u[i] = tmp_u;
+ v[i] = tmp_v;
+ } else {
+ u[i] = (u[i] + tmp_u + 1) >> 1;
+ v[i] = (v[i] + tmp_v + 1) >> 1;
+ }
+ }
+}
+
+//-----------------------------------------------------------------------------
+
+static void ConvertRGB24ToY(const uint8_t* rgb, uint8_t* y, int width) {
+ int i;
+ for (i = 0; i < width; ++i, rgb += 3) {
+ y[i] = VP8RGBToY(rgb[0], rgb[1], rgb[2], YUV_HALF);
+ }
+}
+
+static void ConvertBGR24ToY(const uint8_t* bgr, uint8_t* y, int width) {
+ int i;
+ for (i = 0; i < width; ++i, bgr += 3) {
+ y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF);
+ }
+}
+
+void WebPConvertRGBA32ToUV_C(const uint16_t* rgb,
+ uint8_t* u, uint8_t* v, int width) {
+ int i;
+ for (i = 0; i < width; i += 1, rgb += 4) {
+ const int r = rgb[0], g = rgb[1], b = rgb[2];
+ u[i] = VP8RGBToU(r, g, b, YUV_HALF << 2);
+ v[i] = VP8RGBToV(r, g, b, YUV_HALF << 2);
+ }
+}
+
+//-----------------------------------------------------------------------------
+
+void (*WebPConvertRGB24ToY)(const uint8_t* rgb, uint8_t* y, int width);
+void (*WebPConvertBGR24ToY)(const uint8_t* bgr, uint8_t* y, int width);
+void (*WebPConvertRGBA32ToUV)(const uint16_t* rgb,
+ uint8_t* u, uint8_t* v, int width);
+
+void (*WebPConvertARGBToY)(const uint32_t* argb, uint8_t* y, int width);
+void (*WebPConvertARGBToUV)(const uint32_t* argb, uint8_t* u, uint8_t* v,
+ int src_width, int do_store);
+
+static volatile VP8CPUInfo rgba_to_yuv_last_cpuinfo_used =
+ (VP8CPUInfo)&rgba_to_yuv_last_cpuinfo_used;
+
+extern void WebPInitConvertARGBToYUVSSE2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUV(void) {
+ if (rgba_to_yuv_last_cpuinfo_used == VP8GetCPUInfo) return;
+
+ WebPConvertARGBToY = ConvertARGBToY;
+ WebPConvertARGBToUV = WebPConvertARGBToUV_C;
+
+ WebPConvertRGB24ToY = ConvertRGB24ToY;
+ WebPConvertBGR24ToY = ConvertBGR24ToY;
+
+ WebPConvertRGBA32ToUV = WebPConvertRGBA32ToUV_C;
+
+ if (VP8GetCPUInfo != NULL) {
+#if defined(WEBP_USE_SSE2)
+ if (VP8GetCPUInfo(kSSE2)) {
+ WebPInitConvertARGBToYUVSSE2();
+ }
+#endif // WEBP_USE_SSE2
+ }
+ rgba_to_yuv_last_cpuinfo_used = VP8GetCPUInfo;
+}
diff --git a/src/3rdparty/libwebp/src/dsp/yuv.h b/src/3rdparty/libwebp/src/dsp/yuv.h
index 8a47edd..01c40fc 100644
--- a/src/3rdparty/libwebp/src/dsp/yuv.h
+++ b/src/3rdparty/libwebp/src/dsp/yuv.h
@@ -21,16 +21,15 @@
// G = 1.164 * (Y-16) - 0.813 * (V-128) - 0.391 * (U-128)
// B = 1.164 * (Y-16) + 2.018 * (U-128)
// where Y is in the [16,235] range, and U/V in the [16,240] range.
-// In the table-lookup version (WEBP_YUV_USE_TABLE), the common factor
-// "1.164 * (Y-16)" can be handled as an offset in the VP8kClip[] table.
-// So in this case the formulae should read:
-// R = 1.164 * [Y + 1.371 * (V-128) ] - 18.624
-// G = 1.164 * [Y - 0.698 * (V-128) - 0.336 * (U-128)] - 18.624
-// B = 1.164 * [Y + 1.733 * (U-128)] - 18.624
-// once factorized.
-// For YUV->RGB conversion, only 14bit fixed precision is used (YUV_FIX2).
-// That's the maximum possible for a convenient ARM implementation.
//
+// The fixed-point implementation used here is:
+// R = (19077 . y + 26149 . v - 14234) >> 6
+// G = (19077 . y - 6419 . u - 13320 . v + 8708) >> 6
+// B = (19077 . y + 33050 . u - 17685) >> 6
+// where the '.' operator is the mulhi_epu16 variant:
+// a . b = ((a << 8) * b) >> 16
+// that preserves 8 bits of fractional precision before final descaling.
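+// For example, mid-gray y = 128, u = v = 128 gives
+// R = ((128 * 19077) >> 8) + ((128 * 26149) >> 8) - 14234 = 8378, and
+// 8378 >> 6 == 130, matching the float formula 1.164 * (128 - 16) ~= 130.4.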
+
// Author: Skal (pascal.massimino@gmail.com)
#ifndef WEBP_DSP_YUV_H_
@@ -39,9 +38,6 @@
#include "./dsp.h"
#include "../dec/decode_vp8.h"
-// Define the following to use the LUT-based code:
-// #define WEBP_YUV_USE_TABLE
-
#if defined(WEBP_EXPERIMENTAL_FEATURES)
// Do NOT activate this feature for real compression. This is only experimental!
// This flag is for comparison purpose against JPEG's "YUVj" natural colorspace.
@@ -66,41 +62,32 @@ enum {
YUV_RANGE_MIN = -227, // min value of r/g/b output
YUV_RANGE_MAX = 256 + 226, // max value of r/g/b output
- YUV_FIX2 = 14, // fixed-point precision for YUV->RGB
- YUV_HALF2 = 1 << (YUV_FIX2 - 1),
+ YUV_FIX2 = 6, // fixed-point precision for YUV->RGB
+  YUV_HALF2 = (1 << YUV_FIX2) >> 1,
YUV_MASK2 = (256 << YUV_FIX2) - 1
};
-// These constants are 14b fixed-point version of ITU-R BT.601 constants.
-#define kYScale 19077 // 1.164 = 255 / 219
-#define kVToR 26149 // 1.596 = 255 / 112 * 0.701
-#define kUToG 6419 // 0.391 = 255 / 112 * 0.886 * 0.114 / 0.587
-#define kVToG 13320 // 0.813 = 255 / 112 * 0.701 * 0.299 / 0.587
-#define kUToB 33050 // 2.018 = 255 / 112 * 0.886
-#define kRCst (-kYScale * 16 - kVToR * 128 + YUV_HALF2)
-#define kGCst (-kYScale * 16 + kUToG * 128 + kVToG * 128 + YUV_HALF2)
-#define kBCst (-kYScale * 16 - kUToB * 128 + YUV_HALF2)
-
//------------------------------------------------------------------------------
+// slower on x86 by ~7-8%, but bit-exact with the SSE2/NEON version
-#if !defined(WEBP_YUV_USE_TABLE)
-
-// slower on x86 by ~7-8%, but bit-exact with the SSE2 version
+static WEBP_INLINE int MultHi(int v, int coeff) { // _mm_mulhi_epu16 emulation
+ return (v * coeff) >> 8;
+}
static WEBP_INLINE int VP8Clip8(int v) {
return ((v & ~YUV_MASK2) == 0) ? (v >> YUV_FIX2) : (v < 0) ? 0 : 255;
}
static WEBP_INLINE int VP8YUVToR(int y, int v) {
- return VP8Clip8(kYScale * y + kVToR * v + kRCst);
+ return VP8Clip8(MultHi(y, 19077) + MultHi(v, 26149) - 14234);
}
static WEBP_INLINE int VP8YUVToG(int y, int u, int v) {
- return VP8Clip8(kYScale * y - kUToG * u - kVToG * v + kGCst);
+ return VP8Clip8(MultHi(y, 19077) - MultHi(u, 6419) - MultHi(v, 13320) + 8708);
}
static WEBP_INLINE int VP8YUVToB(int y, int u) {
- return VP8Clip8(kYScale * y + kUToB * u + kBCst);
+ return VP8Clip8(MultHi(y, 19077) + MultHi(u, 33050) - 17685);
}
static WEBP_INLINE void VP8YuvToRgb(int y, int u, int v,
@@ -149,73 +136,6 @@ static WEBP_INLINE void VP8YuvToRgba4444(int y, int u, int v,
#endif
}
-#else
-
-// Table-based version, not totally equivalent to the SSE2 version.
-// Rounding diff is only +/-1 though.
-
-extern int16_t VP8kVToR[256], VP8kUToB[256];
-extern int32_t VP8kVToG[256], VP8kUToG[256];
-extern uint8_t VP8kClip[YUV_RANGE_MAX - YUV_RANGE_MIN];
-extern uint8_t VP8kClip4Bits[YUV_RANGE_MAX - YUV_RANGE_MIN];
-
-static WEBP_INLINE void VP8YuvToRgb(int y, int u, int v,
- uint8_t* const rgb) {
- const int r_off = VP8kVToR[v];
- const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
- const int b_off = VP8kUToB[u];
- rgb[0] = VP8kClip[y + r_off - YUV_RANGE_MIN];
- rgb[1] = VP8kClip[y + g_off - YUV_RANGE_MIN];
- rgb[2] = VP8kClip[y + b_off - YUV_RANGE_MIN];
-}
-
-static WEBP_INLINE void VP8YuvToBgr(int y, int u, int v,
- uint8_t* const bgr) {
- const int r_off = VP8kVToR[v];
- const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
- const int b_off = VP8kUToB[u];
- bgr[0] = VP8kClip[y + b_off - YUV_RANGE_MIN];
- bgr[1] = VP8kClip[y + g_off - YUV_RANGE_MIN];
- bgr[2] = VP8kClip[y + r_off - YUV_RANGE_MIN];
-}
-
-static WEBP_INLINE void VP8YuvToRgb565(int y, int u, int v,
- uint8_t* const rgb) {
- const int r_off = VP8kVToR[v];
- const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
- const int b_off = VP8kUToB[u];
- const int rg = ((VP8kClip[y + r_off - YUV_RANGE_MIN] & 0xf8) |
- (VP8kClip[y + g_off - YUV_RANGE_MIN] >> 5));
- const int gb = (((VP8kClip[y + g_off - YUV_RANGE_MIN] << 3) & 0xe0) |
- (VP8kClip[y + b_off - YUV_RANGE_MIN] >> 3));
-#ifdef WEBP_SWAP_16BIT_CSP
- rgb[0] = gb;
- rgb[1] = rg;
-#else
- rgb[0] = rg;
- rgb[1] = gb;
-#endif
-}
-
-static WEBP_INLINE void VP8YuvToRgba4444(int y, int u, int v,
- uint8_t* const argb) {
- const int r_off = VP8kVToR[v];
- const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
- const int b_off = VP8kUToB[u];
- const int rg = ((VP8kClip4Bits[y + r_off - YUV_RANGE_MIN] << 4) |
- VP8kClip4Bits[y + g_off - YUV_RANGE_MIN]);
- const int ba = (VP8kClip4Bits[y + b_off - YUV_RANGE_MIN] << 4) | 0x0f;
-#ifdef WEBP_SWAP_16BIT_CSP
- argb[0] = ba;
- argb[1] = rg;
-#else
- argb[0] = rg;
- argb[1] = ba;
-#endif
-}
-
-#endif // WEBP_YUV_USE_TABLE
-
//-----------------------------------------------------------------------------
// Alpha handling variants
@@ -245,12 +165,7 @@ void VP8YUVInit(void);
#if defined(WEBP_USE_SSE2)
-// When the following is defined, tables are initialized statically, adding ~12k
-// to the binary size. Otherwise, they are initialized at run-time (small cost).
-#define WEBP_YUV_USE_SSE2_TABLES
-
-#if defined(FANCY_UPSAMPLING)
-// Process 32 pixels and store the result (24b or 32b per pixel) in *dst.
+// Process 32 pixels and store the result (16b, 24b or 32b per pixel) in *dst.
void VP8YuvToRgba32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst);
void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
@@ -259,10 +174,12 @@ void VP8YuvToBgra32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst);
void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst);
-#endif // FANCY_UPSAMPLING
-
-// Must be called to initialize tables before using the functions.
-void VP8YUVInitSSE2(void);
+void VP8YuvToArgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+ uint8_t* dst);
+void VP8YuvToRgba444432(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+ uint8_t* dst);
+void VP8YuvToRgb56532(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+ uint8_t* dst);
#endif // WEBP_USE_SSE2
diff --git a/src/3rdparty/libwebp/src/dsp/yuv_mips32.c b/src/3rdparty/libwebp/src/dsp/yuv_mips32.c
index c82b4df..b8fe512 100644
--- a/src/3rdparty/libwebp/src/dsp/yuv_mips32.c
+++ b/src/3rdparty/libwebp/src/dsp/yuv_mips32.c
@@ -14,7 +14,8 @@
#include "./dsp.h"
-#if defined(WEBP_USE_MIPS32)
+// Code is disabled for now, in favor of the plain-C version
+#if 0 // defined(WEBP_USE_MIPS32)
#include "./yuv.h"
@@ -84,17 +85,20 @@ ROW_FUNC(YuvToBgraRow, 4, 2, 1, 0, 3)
#undef ROW_FUNC
-#endif // WEBP_USE_MIPS32
-
//------------------------------------------------------------------------------
+// Entry point
extern void WebPInitSamplersMIPS32(void);
-void WebPInitSamplersMIPS32(void) {
-#if defined(WEBP_USE_MIPS32)
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersMIPS32(void) {
WebPSamplers[MODE_RGB] = YuvToRgbRow;
WebPSamplers[MODE_RGBA] = YuvToRgbaRow;
WebPSamplers[MODE_BGR] = YuvToBgrRow;
WebPSamplers[MODE_BGRA] = YuvToBgraRow;
-#endif // WEBP_USE_MIPS32
}
+
+#else // !WEBP_USE_MIPS32
+
+WEBP_DSP_INIT_STUB(WebPInitSamplersMIPS32)
+
+#endif // WEBP_USE_MIPS32
diff --git a/src/3rdparty/libwebp/src/dsp/yuv_mips_dsp_r2.c b/src/3rdparty/libwebp/src/dsp/yuv_mips_dsp_r2.c
new file mode 100644
index 0000000..dea0fdb
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/yuv_mips_dsp_r2.c
@@ -0,0 +1,135 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MIPS DSPr2 version of YUV to RGB upsampling functions.
+//
+// Author(s): Branimir Vasic (branimir.vasic@imgtec.com)
+// Djordje Pesut (djordje.pesut@imgtec.com)
+
+#include "./dsp.h"
+
+// Code is disabled for now, in favor of the plain-C version
+#if 0 // defined(WEBP_USE_MIPS_DSP_R2)
+
+#include "./yuv.h"
+
+//------------------------------------------------------------------------------
+// simple point-sampling
+
+#define ROW_FUNC_PART_1() \
+ "lbu %[temp3], 0(%[v]) \n\t" \
+ "lbu %[temp4], 0(%[u]) \n\t" \
+ "lbu %[temp0], 0(%[y]) \n\t" \
+ "mul %[temp1], %[t_con_1], %[temp3] \n\t" \
+ "mul %[temp3], %[t_con_2], %[temp3] \n\t" \
+ "mul %[temp2], %[t_con_3], %[temp4] \n\t" \
+ "mul %[temp4], %[t_con_4], %[temp4] \n\t" \
+ "mul %[temp0], %[t_con_5], %[temp0] \n\t" \
+ "addu %[temp1], %[temp1], %[t_con_6] \n\t" \
+ "subu %[temp3], %[temp3], %[t_con_7] \n\t" \
+ "addu %[temp2], %[temp2], %[temp3] \n\t" \
+ "addu %[temp4], %[temp4], %[t_con_8] \n\t" \
+
+#define ROW_FUNC_PART_2(R, G, B, K) \
+ "addu %[temp5], %[temp0], %[temp1] \n\t" \
+ "subu %[temp6], %[temp0], %[temp2] \n\t" \
+ "addu %[temp7], %[temp0], %[temp4] \n\t" \
+".if " #K " \n\t" \
+ "lbu %[temp0], 1(%[y]) \n\t" \
+".endif \n\t" \
+ "shll_s.w %[temp5], %[temp5], 9 \n\t" \
+ "shll_s.w %[temp6], %[temp6], 9 \n\t" \
+".if " #K " \n\t" \
+ "mul %[temp0], %[t_con_5], %[temp0] \n\t" \
+".endif \n\t" \
+ "shll_s.w %[temp7], %[temp7], 9 \n\t" \
+ "precrqu_s.qb.ph %[temp5], %[temp5], $zero \n\t" \
+ "precrqu_s.qb.ph %[temp6], %[temp6], $zero \n\t" \
+ "precrqu_s.qb.ph %[temp7], %[temp7], $zero \n\t" \
+ "srl %[temp5], %[temp5], 24 \n\t" \
+ "srl %[temp6], %[temp6], 24 \n\t" \
+ "srl %[temp7], %[temp7], 24 \n\t" \
+ "sb %[temp5], " #R "(%[dst]) \n\t" \
+ "sb %[temp6], " #G "(%[dst]) \n\t" \
+ "sb %[temp7], " #B "(%[dst]) \n\t" \
+
+#define ASM_CLOBBER_LIST() \
+ : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), \
+ [temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), \
+ [temp6]"=&r"(temp6), [temp7]"=&r"(temp7) \
+ : [t_con_1]"r"(t_con_1), [t_con_2]"r"(t_con_2), [t_con_3]"r"(t_con_3), \
+ [t_con_4]"r"(t_con_4), [t_con_5]"r"(t_con_5), [t_con_6]"r"(t_con_6), \
+ [u]"r"(u), [v]"r"(v), [y]"r"(y), [dst]"r"(dst), \
+ [t_con_7]"r"(t_con_7), [t_con_8]"r"(t_con_8) \
+ : "memory", "hi", "lo" \
+
+#define ROW_FUNC(FUNC_NAME, XSTEP, R, G, B, A) \
+static void FUNC_NAME(const uint8_t* y, \
+ const uint8_t* u, const uint8_t* v, \
+ uint8_t* dst, int len) { \
+ int i; \
+ uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; \
+ const int t_con_1 = kVToR; \
+ const int t_con_2 = kVToG; \
+ const int t_con_3 = kUToG; \
+ const int t_con_4 = kUToB; \
+ const int t_con_5 = kYScale; \
+ const int t_con_6 = kRCst; \
+ const int t_con_7 = kGCst; \
+ const int t_con_8 = kBCst; \
+ for (i = 0; i < (len >> 1); i++) { \
+ __asm__ volatile ( \
+ ROW_FUNC_PART_1() \
+ ROW_FUNC_PART_2(R, G, B, 1) \
+ ROW_FUNC_PART_2(R + XSTEP, G + XSTEP, B + XSTEP, 0) \
+ ASM_CLOBBER_LIST() \
+ ); \
+ if (A) dst[A] = dst[A + XSTEP] = 0xff; \
+ y += 2; \
+ ++u; \
+ ++v; \
+ dst += 2 * XSTEP; \
+ } \
+ if (len & 1) { \
+ __asm__ volatile ( \
+ ROW_FUNC_PART_1() \
+ ROW_FUNC_PART_2(R, G, B, 0) \
+ ASM_CLOBBER_LIST() \
+ ); \
+ if (A) dst[A] = 0xff; \
+ } \
+}
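(Editor's sketch, not part of the patch: the C-level shape that the ROW_FUNC() macro above expands to. Two Y samples share one U/V pair, matching the 4:2:0 horizontal subsampling; EmitPixel() is a hypothetical stand-in for the inline-asm body (ROW_FUNC_PART_1/2), and the alpha handling is omitted.)

#include <stdint.h>

static void EmitPixel(uint8_t y, uint8_t u, uint8_t v, uint8_t* dst) {
  (void)y; (void)u; (void)v; (void)dst;  // stand-in for the asm body
}

static void YuvRowShape(const uint8_t* y, const uint8_t* u, const uint8_t* v,
                        uint8_t* dst, int len, int xstep) {
  int i;
  for (i = 0; i < (len >> 1); ++i) {  // each iteration emits two pixels
    EmitPixel(y[0], u[0], v[0], dst);
    EmitPixel(y[1], u[0], v[0], dst + xstep);
    y += 2;
    ++u;
    ++v;
    dst += 2 * xstep;
  }
  if (len & 1) {                      // odd trailing pixel
    EmitPixel(y[0], u[0], v[0], dst);
  }
}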
+
+ROW_FUNC(YuvToRgbRow, 3, 0, 1, 2, 0)
+ROW_FUNC(YuvToRgbaRow, 4, 0, 1, 2, 3)
+ROW_FUNC(YuvToBgrRow, 3, 2, 1, 0, 0)
+ROW_FUNC(YuvToBgraRow, 4, 2, 1, 0, 3)
+
+#undef ROW_FUNC
+#undef ASM_CLOBBER_LIST
+#undef ROW_FUNC_PART_2
+#undef ROW_FUNC_PART_1
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void WebPInitSamplersMIPSdspR2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersMIPSdspR2(void) {
+ WebPSamplers[MODE_RGB] = YuvToRgbRow;
+ WebPSamplers[MODE_RGBA] = YuvToRgbaRow;
+ WebPSamplers[MODE_BGR] = YuvToBgrRow;
+ WebPSamplers[MODE_BGRA] = YuvToBgraRow;
+}
+
+#else // !WEBP_USE_MIPS_DSP_R2
+
+WEBP_DSP_INIT_STUB(WebPInitSamplersMIPSdspR2)
+
+#endif // WEBP_USE_MIPS_DSP_R2
diff --git a/src/3rdparty/libwebp/src/dsp/yuv_sse2.c b/src/3rdparty/libwebp/src/dsp/yuv_sse2.c
index 6fe0f3b..f72fe32 100644
--- a/src/3rdparty/libwebp/src/dsp/yuv_sse2.c
+++ b/src/3rdparty/libwebp/src/dsp/yuv_sse2.c
@@ -16,307 +16,759 @@
#if defined(WEBP_USE_SSE2)
#include <emmintrin.h>
-#include <string.h> // for memcpy
-typedef union { // handy struct for converting SSE2 registers
- int32_t i32[4];
- uint8_t u8[16];
- __m128i m;
-} VP8kCstSSE2;
-
-#if defined(WEBP_YUV_USE_SSE2_TABLES)
-
-#include "./yuv_tables_sse2.h"
-
-void VP8YUVInitSSE2(void) {}
-
-#else
+//-----------------------------------------------------------------------------
+// Convert spans of 32 pixels to various RGB formats for the fancy upsampler.
-static int done_sse2 = 0;
-static VP8kCstSSE2 VP8kUtoRGBA[256], VP8kVtoRGBA[256], VP8kYtoRGBA[256];
-
-void VP8YUVInitSSE2(void) {
- if (!done_sse2) {
- int i;
- for (i = 0; i < 256; ++i) {
- VP8kYtoRGBA[i].i32[0] =
- VP8kYtoRGBA[i].i32[1] =
- VP8kYtoRGBA[i].i32[2] = (i - 16) * kYScale + YUV_HALF2;
- VP8kYtoRGBA[i].i32[3] = 0xff << YUV_FIX2;
-
- VP8kUtoRGBA[i].i32[0] = 0;
- VP8kUtoRGBA[i].i32[1] = -kUToG * (i - 128);
- VP8kUtoRGBA[i].i32[2] = kUToB * (i - 128);
- VP8kUtoRGBA[i].i32[3] = 0;
-
- VP8kVtoRGBA[i].i32[0] = kVToR * (i - 128);
- VP8kVtoRGBA[i].i32[1] = -kVToG * (i - 128);
- VP8kVtoRGBA[i].i32[2] = 0;
- VP8kVtoRGBA[i].i32[3] = 0;
- }
- done_sse2 = 1;
-
-#if 0 // code used to generate 'yuv_tables_sse2.h'
- printf("static const VP8kCstSSE2 VP8kYtoRGBA[256] = {\n");
- for (i = 0; i < 256; ++i) {
- printf(" {{0x%.8x, 0x%.8x, 0x%.8x, 0x%.8x}},\n",
- VP8kYtoRGBA[i].i32[0], VP8kYtoRGBA[i].i32[1],
- VP8kYtoRGBA[i].i32[2], VP8kYtoRGBA[i].i32[3]);
- }
- printf("};\n\n");
- printf("static const VP8kCstSSE2 VP8kUtoRGBA[256] = {\n");
- for (i = 0; i < 256; ++i) {
- printf(" {{0, 0x%.8x, 0x%.8x, 0}},\n",
- VP8kUtoRGBA[i].i32[1], VP8kUtoRGBA[i].i32[2]);
- }
- printf("};\n\n");
- printf("static VP8kCstSSE2 VP8kVtoRGBA[256] = {\n");
- for (i = 0; i < 256; ++i) {
- printf(" {{0x%.8x, 0x%.8x, 0, 0}},\n",
- VP8kVtoRGBA[i].i32[0], VP8kVtoRGBA[i].i32[1]);
- }
- printf("};\n\n");
-#endif
- }
+// These constants are 14b fixed-point version of ITU-R BT.601 constants.
+// R = (19077 * y + 26149 * v - 14234) >> 6
+// G = (19077 * y - 6419 * u - 13320 * v + 8708) >> 6
+// B = (19077 * y + 33050 * u - 17685) >> 6
+static void ConvertYUV444ToRGB(const __m128i* const Y0,
+ const __m128i* const U0,
+ const __m128i* const V0,
+ __m128i* const R,
+ __m128i* const G,
+ __m128i* const B) {
+ const __m128i k19077 = _mm_set1_epi16(19077);
+ const __m128i k26149 = _mm_set1_epi16(26149);
+ const __m128i k14234 = _mm_set1_epi16(14234);
+ const __m128i k33050 = _mm_set1_epi16(33050);
+ const __m128i k17685 = _mm_set1_epi16(17685);
+ const __m128i k6419 = _mm_set1_epi16(6419);
+ const __m128i k13320 = _mm_set1_epi16(13320);
+ const __m128i k8708 = _mm_set1_epi16(8708);
+
+ const __m128i Y1 = _mm_mulhi_epu16(*Y0, k19077);
+
+ const __m128i R0 = _mm_mulhi_epu16(*V0, k26149);
+ const __m128i R1 = _mm_sub_epi16(Y1, k14234);
+ const __m128i R2 = _mm_add_epi16(R1, R0);
+
+ const __m128i G0 = _mm_mulhi_epu16(*U0, k6419);
+ const __m128i G1 = _mm_mulhi_epu16(*V0, k13320);
+ const __m128i G2 = _mm_add_epi16(Y1, k8708);
+ const __m128i G3 = _mm_add_epi16(G0, G1);
+ const __m128i G4 = _mm_sub_epi16(G2, G3);
+
+ // be careful with the saturated *unsigned* arithmetic here!
+ const __m128i B0 = _mm_mulhi_epu16(*U0, k33050);
+ const __m128i B1 = _mm_adds_epu16(B0, Y1);
+ const __m128i B2 = _mm_subs_epu16(B1, k17685);
+
+ // use logical shift for B2, which can be larger than 32767
+ *R = _mm_srai_epi16(R2, 6); // range: [-14234, 30815]
+ *G = _mm_srai_epi16(G4, 6); // range: [-10953, 27710]
+ *B = _mm_srli_epi16(B2, 6); // range: [0, 34238]
}
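(Editor's sketch, not part of the patch: the same arithmetic in scalar form. Load_HI_16() below pre-shifts each 8-bit sample left by 8, and _mm_mulhi_epu16(x << 8, k) equals (x * k) >> 8, so per pixel the function computes the following; Sat8() mirrors the final saturating pack, and the SIMD B path reaches the same clipped result via saturating unsigned ops.)

#include <stdint.h>

static uint8_t Sat8(int v) { return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v); }

static void ScalarYuvToRgb(int y, int u, int v, uint8_t rgb[3]) {
  const int y1 = (19077 * y) >> 8;  // _mm_mulhi_epu16(Y0, k19077)
  rgb[0] = Sat8((y1 + ((26149 * v) >> 8) - 14234) >> 6);                     // R
  rgb[1] = Sat8((y1 - ((6419 * u) >> 8) - ((13320 * v) >> 8) + 8708) >> 6);  // G
  rgb[2] = Sat8((y1 + ((33050 * u) >> 8) - 17685) >> 6);                     // B
}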
-#endif // WEBP_YUV_USE_SSE2_TABLES
+// Load the bytes into the *upper* part of 16b words. That's "<< 8", basically.
+static WEBP_INLINE __m128i Load_HI_16(const uint8_t* src) {
+ const __m128i zero = _mm_setzero_si128();
+ return _mm_unpacklo_epi8(zero, _mm_loadl_epi64((const __m128i*)src));
+}
-//-----------------------------------------------------------------------------
+// Load and replicate the U/V samples
+static WEBP_INLINE __m128i Load_UV_HI_8(const uint8_t* src) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i tmp0 = _mm_cvtsi32_si128(*(const uint32_t*)src);
+ const __m128i tmp1 = _mm_unpacklo_epi8(zero, tmp0);
+ return _mm_unpacklo_epi16(tmp1, tmp1); // replicate samples
+}
-static WEBP_INLINE __m128i LoadUVPart(int u, int v) {
- const __m128i u_part = _mm_loadu_si128(&VP8kUtoRGBA[u].m);
- const __m128i v_part = _mm_loadu_si128(&VP8kVtoRGBA[v].m);
- const __m128i uv_part = _mm_add_epi32(u_part, v_part);
- return uv_part;
+// Convert 8 samples of YUV444 to R/G/B
+static void YUV444ToRGB(const uint8_t* const y,
+ const uint8_t* const u,
+ const uint8_t* const v,
+ __m128i* const R, __m128i* const G, __m128i* const B) {
+ const __m128i Y0 = Load_HI_16(y), U0 = Load_HI_16(u), V0 = Load_HI_16(v);
+ ConvertYUV444ToRGB(&Y0, &U0, &V0, R, G, B);
}
-static WEBP_INLINE __m128i GetRGBA32bWithUV(int y, const __m128i uv_part) {
- const __m128i y_part = _mm_loadu_si128(&VP8kYtoRGBA[y].m);
- const __m128i rgba1 = _mm_add_epi32(y_part, uv_part);
- const __m128i rgba2 = _mm_srai_epi32(rgba1, YUV_FIX2);
- return rgba2;
+// Convert 8 samples of YUV420 to R/G/B
+static void YUV420ToRGB(const uint8_t* const y,
+ const uint8_t* const u,
+ const uint8_t* const v,
+ __m128i* const R, __m128i* const G, __m128i* const B) {
+ const __m128i Y0 = Load_HI_16(y), U0 = Load_UV_HI_8(u), V0 = Load_UV_HI_8(v);
+ ConvertYUV444ToRGB(&Y0, &U0, &V0, R, G, B);
}
-static WEBP_INLINE __m128i GetRGBA32b(int y, int u, int v) {
- const __m128i uv_part = LoadUVPart(u, v);
- return GetRGBA32bWithUV(y, uv_part);
+// Pack R/G/B/A results into 32b output.
+static WEBP_INLINE void PackAndStore4(const __m128i* const R,
+ const __m128i* const G,
+ const __m128i* const B,
+ const __m128i* const A,
+ uint8_t* const dst) {
+ const __m128i rb = _mm_packus_epi16(*R, *B);
+ const __m128i ga = _mm_packus_epi16(*G, *A);
+ const __m128i rg = _mm_unpacklo_epi8(rb, ga);
+ const __m128i ba = _mm_unpackhi_epi8(rb, ga);
+ const __m128i RGBA_lo = _mm_unpacklo_epi16(rg, ba);
+ const __m128i RGBA_hi = _mm_unpackhi_epi16(rg, ba);
+ _mm_storeu_si128((__m128i*)(dst + 0), RGBA_lo);
+ _mm_storeu_si128((__m128i*)(dst + 16), RGBA_hi);
}
-static WEBP_INLINE void YuvToRgbSSE2(uint8_t y, uint8_t u, uint8_t v,
- uint8_t* const rgb) {
- const __m128i tmp0 = GetRGBA32b(y, u, v);
- const __m128i tmp1 = _mm_packs_epi32(tmp0, tmp0);
- const __m128i tmp2 = _mm_packus_epi16(tmp1, tmp1);
- // Note: we store 8 bytes at a time, not 3 bytes! -> memory stomp
- _mm_storel_epi64((__m128i*)rgb, tmp2);
+// Pack R/G/B/A results into 16b output.
+static WEBP_INLINE void PackAndStore4444(const __m128i* const R,
+ const __m128i* const G,
+ const __m128i* const B,
+ const __m128i* const A,
+ uint8_t* const dst) {
+#if !defined(WEBP_SWAP_16BIT_CSP)
+ const __m128i rg0 = _mm_packus_epi16(*R, *G);
+ const __m128i ba0 = _mm_packus_epi16(*B, *A);
+#else
+ const __m128i rg0 = _mm_packus_epi16(*B, *A);
+ const __m128i ba0 = _mm_packus_epi16(*R, *G);
+#endif
+ const __m128i mask_0xf0 = _mm_set1_epi8(0xf0);
+ const __m128i rb1 = _mm_unpacklo_epi8(rg0, ba0); // rbrbrbrbrb...
+ const __m128i ga1 = _mm_unpackhi_epi8(rg0, ba0); // gagagagaga...
+ const __m128i rb2 = _mm_and_si128(rb1, mask_0xf0);
+ const __m128i ga2 = _mm_srli_epi16(_mm_and_si128(ga1, mask_0xf0), 4);
+ const __m128i rgba4444 = _mm_or_si128(rb2, ga2);
+ _mm_storeu_si128((__m128i*)dst, rgba4444);
}
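(Editor's sketch: per pixel, the 0xf0 masking above reduces to the packing below; the non-swapped WEBP_SWAP_16BIT_CSP layout is assumed.)

#include <stdint.h>

// One RGBA4444 pixel: byte 0 = RRRRGGGG, byte 1 = BBBBAAAA.
static void Pack4444(uint8_t r, uint8_t g, uint8_t b, uint8_t a,
                     uint8_t out[2]) {
  out[0] = (uint8_t)((r & 0xf0) | (g >> 4));
  out[1] = (uint8_t)((b & 0xf0) | (a >> 4));
}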
-static WEBP_INLINE void YuvToBgrSSE2(uint8_t y, uint8_t u, uint8_t v,
- uint8_t* const bgr) {
- const __m128i tmp0 = GetRGBA32b(y, u, v);
- const __m128i tmp1 = _mm_shuffle_epi32(tmp0, _MM_SHUFFLE(3, 0, 1, 2));
- const __m128i tmp2 = _mm_packs_epi32(tmp1, tmp1);
- const __m128i tmp3 = _mm_packus_epi16(tmp2, tmp2);
- // Note: we store 8 bytes at a time, not 3 bytes! -> memory stomp
- _mm_storel_epi64((__m128i*)bgr, tmp3);
+// Pack R/G/B results into 16b output.
+static WEBP_INLINE void PackAndStore565(const __m128i* const R,
+ const __m128i* const G,
+ const __m128i* const B,
+ uint8_t* const dst) {
+ const __m128i r0 = _mm_packus_epi16(*R, *R);
+ const __m128i g0 = _mm_packus_epi16(*G, *G);
+ const __m128i b0 = _mm_packus_epi16(*B, *B);
+ const __m128i r1 = _mm_and_si128(r0, _mm_set1_epi8(0xf8));
+ const __m128i b1 = _mm_and_si128(_mm_srli_epi16(b0, 3), _mm_set1_epi8(0x1f));
+ const __m128i g1 = _mm_srli_epi16(_mm_and_si128(g0, _mm_set1_epi8(0xe0)), 5);
+ const __m128i g2 = _mm_slli_epi16(_mm_and_si128(g0, _mm_set1_epi8(0x1c)), 3);
+ const __m128i rg = _mm_or_si128(r1, g1);
+ const __m128i gb = _mm_or_si128(g2, b1);
+#if !defined(WEBP_SWAP_16BIT_CSP)
+ const __m128i rgb565 = _mm_unpacklo_epi8(rg, gb);
+#else
+ const __m128i rgb565 = _mm_unpacklo_epi8(gb, rg);
+#endif
+ _mm_storeu_si128((__m128i*)dst, rgb565);
}
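(Editor's sketch: the same bit layout per pixel in scalar form, again assuming the non-swapped case.)

#include <stdint.h>

// One RGB565 pixel: byte 0 = RRRRRGGG, byte 1 = GGGBBBBB.
static void Pack565(uint8_t r, uint8_t g, uint8_t b, uint8_t out[2]) {
  out[0] = (uint8_t)((r & 0xf8) | (g >> 5));
  out[1] = (uint8_t)(((g << 3) & 0xe0) | (b >> 3));
}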
-//-----------------------------------------------------------------------------
-// Convert spans of 32 pixels to various RGB formats for the fancy upsampler.
+// Function used several times in PlanarTo24b.
+// It samples the 'in' buffer as follows: every other unsigned char is stored
+// at the beginning of the buffer, while the remaining half is stored at the
+// end.
+static WEBP_INLINE void PlanarTo24bHelper(const __m128i* const in /*in[6]*/,
+ __m128i* const out /*out[6]*/) {
+ const __m128i v_mask = _mm_set1_epi16(0x00ff);
+
+  // Take every other 8b value (the lower byte of each 16b word).
+ out[0] = _mm_packus_epi16(_mm_and_si128(in[0], v_mask),
+ _mm_and_si128(in[1], v_mask));
+ out[1] = _mm_packus_epi16(_mm_and_si128(in[2], v_mask),
+ _mm_and_si128(in[3], v_mask));
+ out[2] = _mm_packus_epi16(_mm_and_si128(in[4], v_mask),
+ _mm_and_si128(in[5], v_mask));
+  // Take every other 8b value (the upper byte of each 16b word).
+ out[3] = _mm_packus_epi16(_mm_srli_epi16(in[0], 8), _mm_srli_epi16(in[1], 8));
+ out[4] = _mm_packus_epi16(_mm_srli_epi16(in[2], 8), _mm_srli_epi16(in[3], 8));
+ out[5] = _mm_packus_epi16(_mm_srli_epi16(in[4], 8), _mm_srli_epi16(in[5], 8));
+}
-#ifdef FANCY_UPSAMPLING
+// Pack the planar buffers
+// rrrr... rrrr... gggg... gggg... bbbb... bbbb....
+// triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ...
+static WEBP_INLINE void PlanarTo24b(__m128i* const in /*in[6]*/, uint8_t* rgb) {
+  // The input is 6 registers of sixteen 8b values each but, for the sake of
+  // explanation, let's take 6 registers of four 8b values.
+ // To pack, we will keep taking one every two 8b integer and move it
+ // around as follows:
+ // Input:
+ // r0r1r2r3 | r4r5r6r7 | g0g1g2g3 | g4g5g6g7 | b0b1b2b3 | b4b5b6b7
+ // Split the 6 registers in two sets of 3 registers: the first set as the even
+ // 8b bytes, the second the odd ones:
+ // r0r2r4r6 | g0g2g4g6 | b0b2b4b6 | r1r3r5r7 | g1g3g5g7 | b1b3b5b7
+ // Repeat the same permutations twice more:
+ // r0r4g0g4 | b0b4r1r5 | g1g5b1b5 | r2r6g2g6 | b2b6r3r7 | g3g7b3b7
+ // r0g0b0r1 | g1b1r2g2 | b2r3g3b3 | r4g4b4r5 | g5b5r6g6 | b6r7g7b7
+ __m128i tmp[6];
+ PlanarTo24bHelper(in, tmp);
+ PlanarTo24bHelper(tmp, in);
+ PlanarTo24bHelper(in, tmp);
+  // We need two more passes than in the example above, since each register
+  // holds sixteen bytes.
+ PlanarTo24bHelper(tmp, in);
+ PlanarTo24bHelper(in, tmp);
+
+ _mm_storeu_si128((__m128i*)(rgb + 0), tmp[0]);
+ _mm_storeu_si128((__m128i*)(rgb + 16), tmp[1]);
+ _mm_storeu_si128((__m128i*)(rgb + 32), tmp[2]);
+ _mm_storeu_si128((__m128i*)(rgb + 48), tmp[3]);
+ _mm_storeu_si128((__m128i*)(rgb + 64), tmp[4]);
+ _mm_storeu_si128((__m128i*)(rgb + 80), tmp[5]);
+}
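(Editor's sketch: the net effect of the five shuffle passes is a plain planar-to-interleaved copy; the six input registers hold 32 R, 32 G and 32 B bytes.)

#include <stdint.h>

static void PlanarTo24bScalar(const uint8_t r[32], const uint8_t g[32],
                              const uint8_t b[32], uint8_t rgb[96]) {
  int i;
  for (i = 0; i < 32; ++i) {
    rgb[3 * i + 0] = r[i];
    rgb[3 * i + 1] = g[i];
    rgb[3 * i + 2] = b[i];
  }
}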
+#undef MK_UINT32
void VP8YuvToRgba32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst) {
+ const __m128i kAlpha = _mm_set1_epi16(255);
int n;
- for (n = 0; n < 32; n += 4) {
- const __m128i tmp0_1 = GetRGBA32b(y[n + 0], u[n + 0], v[n + 0]);
- const __m128i tmp0_2 = GetRGBA32b(y[n + 1], u[n + 1], v[n + 1]);
- const __m128i tmp0_3 = GetRGBA32b(y[n + 2], u[n + 2], v[n + 2]);
- const __m128i tmp0_4 = GetRGBA32b(y[n + 3], u[n + 3], v[n + 3]);
- const __m128i tmp1_1 = _mm_packs_epi32(tmp0_1, tmp0_2);
- const __m128i tmp1_2 = _mm_packs_epi32(tmp0_3, tmp0_4);
- const __m128i tmp2 = _mm_packus_epi16(tmp1_1, tmp1_2);
- _mm_storeu_si128((__m128i*)dst, tmp2);
- dst += 4 * 4;
+ for (n = 0; n < 32; n += 8, dst += 32) {
+ __m128i R, G, B;
+ YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B);
+ PackAndStore4(&R, &G, &B, &kAlpha, dst);
}
}
void VP8YuvToBgra32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst) {
+ const __m128i kAlpha = _mm_set1_epi16(255);
int n;
- for (n = 0; n < 32; n += 2) {
- const __m128i tmp0_1 = GetRGBA32b(y[n + 0], u[n + 0], v[n + 0]);
- const __m128i tmp0_2 = GetRGBA32b(y[n + 1], u[n + 1], v[n + 1]);
- const __m128i tmp1_1 = _mm_shuffle_epi32(tmp0_1, _MM_SHUFFLE(3, 0, 1, 2));
- const __m128i tmp1_2 = _mm_shuffle_epi32(tmp0_2, _MM_SHUFFLE(3, 0, 1, 2));
- const __m128i tmp2_1 = _mm_packs_epi32(tmp1_1, tmp1_2);
- const __m128i tmp3 = _mm_packus_epi16(tmp2_1, tmp2_1);
- _mm_storel_epi64((__m128i*)dst, tmp3);
- dst += 4 * 2;
+ for (n = 0; n < 32; n += 8, dst += 32) {
+ __m128i R, G, B;
+ YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B);
+ PackAndStore4(&B, &G, &R, &kAlpha, dst);
}
}
-void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
- uint8_t* dst) {
+void VP8YuvToArgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+ uint8_t* dst) {
+ const __m128i kAlpha = _mm_set1_epi16(255);
int n;
- uint8_t tmp0[2 * 3 + 5 + 15];
- uint8_t* const tmp = (uint8_t*)((uintptr_t)(tmp0 + 15) & ~15); // align
- for (n = 0; n < 30; ++n) { // we directly stomp the *dst memory
- YuvToRgbSSE2(y[n], u[n], v[n], dst + n * 3);
+ for (n = 0; n < 32; n += 8, dst += 32) {
+ __m128i R, G, B;
+ YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B);
+ PackAndStore4(&kAlpha, &R, &G, &B, dst);
}
- // Last two pixels are special: we write in a tmp buffer before sending
- // to dst.
- YuvToRgbSSE2(y[n + 0], u[n + 0], v[n + 0], tmp + 0);
- YuvToRgbSSE2(y[n + 1], u[n + 1], v[n + 1], tmp + 3);
- memcpy(dst + n * 3, tmp, 2 * 3);
}
-void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
- uint8_t* dst) {
+void VP8YuvToRgba444432(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+ uint8_t* dst) {
+ const __m128i kAlpha = _mm_set1_epi16(255);
int n;
- uint8_t tmp0[2 * 3 + 5 + 15];
- uint8_t* const tmp = (uint8_t*)((uintptr_t)(tmp0 + 15) & ~15); // align
- for (n = 0; n < 30; ++n) {
- YuvToBgrSSE2(y[n], u[n], v[n], dst + n * 3);
+ for (n = 0; n < 32; n += 8, dst += 16) {
+ __m128i R, G, B;
+ YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B);
+ PackAndStore4444(&R, &G, &B, &kAlpha, dst);
}
- YuvToBgrSSE2(y[n + 0], u[n + 0], v[n + 0], tmp + 0);
- YuvToBgrSSE2(y[n + 1], u[n + 1], v[n + 1], tmp + 3);
- memcpy(dst + n * 3, tmp, 2 * 3);
}
-#endif // FANCY_UPSAMPLING
+void VP8YuvToRgb56532(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+ uint8_t* dst) {
+ int n;
+ for (n = 0; n < 32; n += 8, dst += 16) {
+ __m128i R, G, B;
+ YUV444ToRGB(y + n, u + n, v + n, &R, &G, &B);
+ PackAndStore565(&R, &G, &B, dst);
+ }
+}
+
+void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+ uint8_t* dst) {
+ __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
+ __m128i rgb[6];
+
+ YUV444ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0);
+ YUV444ToRGB(y + 8, u + 8, v + 8, &R1, &G1, &B1);
+ YUV444ToRGB(y + 16, u + 16, v + 16, &R2, &G2, &B2);
+ YUV444ToRGB(y + 24, u + 24, v + 24, &R3, &G3, &B3);
+
+ // Cast to 8b and store as RRRRGGGGBBBB.
+ rgb[0] = _mm_packus_epi16(R0, R1);
+ rgb[1] = _mm_packus_epi16(R2, R3);
+ rgb[2] = _mm_packus_epi16(G0, G1);
+ rgb[3] = _mm_packus_epi16(G2, G3);
+ rgb[4] = _mm_packus_epi16(B0, B1);
+ rgb[5] = _mm_packus_epi16(B2, B3);
+
+ // Pack as RGBRGBRGBRGB.
+ PlanarTo24b(rgb, dst);
+}
+
+void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+ uint8_t* dst) {
+ __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
+ __m128i bgr[6];
+
+ YUV444ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0);
+ YUV444ToRGB(y + 8, u + 8, v + 8, &R1, &G1, &B1);
+ YUV444ToRGB(y + 16, u + 16, v + 16, &R2, &G2, &B2);
+ YUV444ToRGB(y + 24, u + 24, v + 24, &R3, &G3, &B3);
+
+ // Cast to 8b and store as BBBBGGGGRRRR.
+ bgr[0] = _mm_packus_epi16(B0, B1);
+ bgr[1] = _mm_packus_epi16(B2, B3);
+ bgr[2] = _mm_packus_epi16(G0, G1);
+ bgr[3] = _mm_packus_epi16(G2, G3);
+ bgr[4] = _mm_packus_epi16(R0, R1);
+ bgr[5] = _mm_packus_epi16(R2, R3);
+
+ // Pack as BGRBGRBGRBGR.
+ PlanarTo24b(bgr, dst);
+}
//-----------------------------------------------------------------------------
// Arbitrary-length row conversion functions
-static void YuvToRgbaRowSSE2(const uint8_t* y,
- const uint8_t* u, const uint8_t* v,
- uint8_t* dst, int len) {
+static void YuvToRgbaRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+ uint8_t* dst, int len) {
+ const __m128i kAlpha = _mm_set1_epi16(255);
int n;
- for (n = 0; n + 4 <= len; n += 4) {
- const __m128i uv_0 = LoadUVPart(u[0], v[0]);
- const __m128i uv_1 = LoadUVPart(u[1], v[1]);
- const __m128i tmp0_1 = GetRGBA32bWithUV(y[0], uv_0);
- const __m128i tmp0_2 = GetRGBA32bWithUV(y[1], uv_0);
- const __m128i tmp0_3 = GetRGBA32bWithUV(y[2], uv_1);
- const __m128i tmp0_4 = GetRGBA32bWithUV(y[3], uv_1);
- const __m128i tmp1_1 = _mm_packs_epi32(tmp0_1, tmp0_2);
- const __m128i tmp1_2 = _mm_packs_epi32(tmp0_3, tmp0_4);
- const __m128i tmp2 = _mm_packus_epi16(tmp1_1, tmp1_2);
- _mm_storeu_si128((__m128i*)dst, tmp2);
- dst += 4 * 4;
- y += 4;
- u += 2;
- v += 2;
+ for (n = 0; n + 8 <= len; n += 8, dst += 32) {
+ __m128i R, G, B;
+ YUV420ToRGB(y, u, v, &R, &G, &B);
+ PackAndStore4(&R, &G, &B, &kAlpha, dst);
+ y += 8;
+ u += 4;
+ v += 4;
}
- // Finish off
- while (n < len) {
+ for (; n < len; ++n) { // Finish off
VP8YuvToRgba(y[0], u[0], v[0], dst);
dst += 4;
- ++y;
+ y += 1;
u += (n & 1);
v += (n & 1);
- ++n;
}
}
-static void YuvToBgraRowSSE2(const uint8_t* y,
- const uint8_t* u, const uint8_t* v,
- uint8_t* dst, int len) {
+static void YuvToBgraRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+ uint8_t* dst, int len) {
+ const __m128i kAlpha = _mm_set1_epi16(255);
int n;
- for (n = 0; n + 2 <= len; n += 2) {
- const __m128i uv_0 = LoadUVPart(u[0], v[0]);
- const __m128i tmp0_1 = GetRGBA32bWithUV(y[0], uv_0);
- const __m128i tmp0_2 = GetRGBA32bWithUV(y[1], uv_0);
- const __m128i tmp1_1 = _mm_shuffle_epi32(tmp0_1, _MM_SHUFFLE(3, 0, 1, 2));
- const __m128i tmp1_2 = _mm_shuffle_epi32(tmp0_2, _MM_SHUFFLE(3, 0, 1, 2));
- const __m128i tmp2_1 = _mm_packs_epi32(tmp1_1, tmp1_2);
- const __m128i tmp3 = _mm_packus_epi16(tmp2_1, tmp2_1);
- _mm_storel_epi64((__m128i*)dst, tmp3);
- dst += 4 * 2;
- y += 2;
- ++u;
- ++v;
+ for (n = 0; n + 8 <= len; n += 8, dst += 32) {
+ __m128i R, G, B;
+ YUV420ToRGB(y, u, v, &R, &G, &B);
+ PackAndStore4(&B, &G, &R, &kAlpha, dst);
+ y += 8;
+ u += 4;
+ v += 4;
}
- // Finish off
- if (len & 1) {
+ for (; n < len; ++n) { // Finish off
VP8YuvToBgra(y[0], u[0], v[0], dst);
+ dst += 4;
+ y += 1;
+ u += (n & 1);
+ v += (n & 1);
}
}
-static void YuvToArgbRowSSE2(const uint8_t* y,
- const uint8_t* u, const uint8_t* v,
- uint8_t* dst, int len) {
+static void YuvToArgbRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+ uint8_t* dst, int len) {
+ const __m128i kAlpha = _mm_set1_epi16(255);
int n;
- for (n = 0; n + 2 <= len; n += 2) {
- const __m128i uv_0 = LoadUVPart(u[0], v[0]);
- const __m128i tmp0_1 = GetRGBA32bWithUV(y[0], uv_0);
- const __m128i tmp0_2 = GetRGBA32bWithUV(y[1], uv_0);
- const __m128i tmp1_1 = _mm_shuffle_epi32(tmp0_1, _MM_SHUFFLE(2, 1, 0, 3));
- const __m128i tmp1_2 = _mm_shuffle_epi32(tmp0_2, _MM_SHUFFLE(2, 1, 0, 3));
- const __m128i tmp2_1 = _mm_packs_epi32(tmp1_1, tmp1_2);
- const __m128i tmp3 = _mm_packus_epi16(tmp2_1, tmp2_1);
- _mm_storel_epi64((__m128i*)dst, tmp3);
- dst += 4 * 2;
- y += 2;
- ++u;
- ++v;
+ for (n = 0; n + 8 <= len; n += 8, dst += 32) {
+ __m128i R, G, B;
+ YUV420ToRGB(y, u, v, &R, &G, &B);
+ PackAndStore4(&kAlpha, &R, &G, &B, dst);
+ y += 8;
+ u += 4;
+ v += 4;
}
- // Finish off
- if (len & 1) {
+ for (; n < len; ++n) { // Finish off
VP8YuvToArgb(y[0], u[0], v[0], dst);
+ dst += 4;
+ y += 1;
+ u += (n & 1);
+ v += (n & 1);
}
}
-static void YuvToRgbRowSSE2(const uint8_t* y,
- const uint8_t* u, const uint8_t* v,
- uint8_t* dst, int len) {
+static void YuvToRgbRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+ uint8_t* dst, int len) {
int n;
- for (n = 0; n + 2 < len; ++n) { // we directly stomp the *dst memory
- YuvToRgbSSE2(y[0], u[0], v[0], dst); // stomps 8 bytes
+ for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
+ __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
+ __m128i rgb[6];
+
+ YUV420ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0);
+ YUV420ToRGB(y + 8, u + 4, v + 4, &R1, &G1, &B1);
+ YUV420ToRGB(y + 16, u + 8, v + 8, &R2, &G2, &B2);
+ YUV420ToRGB(y + 24, u + 12, v + 12, &R3, &G3, &B3);
+
+ // Cast to 8b and store as RRRRGGGGBBBB.
+ rgb[0] = _mm_packus_epi16(R0, R1);
+ rgb[1] = _mm_packus_epi16(R2, R3);
+ rgb[2] = _mm_packus_epi16(G0, G1);
+ rgb[3] = _mm_packus_epi16(G2, G3);
+ rgb[4] = _mm_packus_epi16(B0, B1);
+ rgb[5] = _mm_packus_epi16(B2, B3);
+
+ // Pack as RGBRGBRGBRGB.
+ PlanarTo24b(rgb, dst);
+
+ y += 32;
+ u += 16;
+ v += 16;
+ }
+ for (; n < len; ++n) { // Finish off
+ VP8YuvToRgb(y[0], u[0], v[0], dst);
dst += 3;
- ++y;
+ y += 1;
u += (n & 1);
v += (n & 1);
}
- VP8YuvToRgb(y[0], u[0], v[0], dst);
- if (len > 1) {
- VP8YuvToRgb(y[1], u[n & 1], v[n & 1], dst + 3);
- }
}
-static void YuvToBgrRowSSE2(const uint8_t* y,
- const uint8_t* u, const uint8_t* v,
- uint8_t* dst, int len) {
+static void YuvToBgrRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+ uint8_t* dst, int len) {
int n;
- for (n = 0; n + 2 < len; ++n) { // we directly stomp the *dst memory
- YuvToBgrSSE2(y[0], u[0], v[0], dst); // stomps 8 bytes
+ for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
+ __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
+ __m128i bgr[6];
+
+ YUV420ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0);
+ YUV420ToRGB(y + 8, u + 4, v + 4, &R1, &G1, &B1);
+ YUV420ToRGB(y + 16, u + 8, v + 8, &R2, &G2, &B2);
+ YUV420ToRGB(y + 24, u + 12, v + 12, &R3, &G3, &B3);
+
+ // Cast to 8b and store as BBBBGGGGRRRR.
+ bgr[0] = _mm_packus_epi16(B0, B1);
+ bgr[1] = _mm_packus_epi16(B2, B3);
+ bgr[2] = _mm_packus_epi16(G0, G1);
+ bgr[3] = _mm_packus_epi16(G2, G3);
+ bgr[4] = _mm_packus_epi16(R0, R1);
+ bgr[5] = _mm_packus_epi16(R2, R3);
+
+ // Pack as BGRBGRBGRBGR.
+ PlanarTo24b(bgr, dst);
+
+ y += 32;
+ u += 16;
+ v += 16;
+ }
+ for (; n < len; ++n) { // Finish off
+ VP8YuvToBgr(y[0], u[0], v[0], dst);
dst += 3;
- ++y;
+ y += 1;
u += (n & 1);
v += (n & 1);
}
- VP8YuvToBgr(y[0], u[0], v[0], dst + 0);
- if (len > 1) {
- VP8YuvToBgr(y[1], u[n & 1], v[n & 1], dst + 3);
- }
}
-#endif // WEBP_USE_SSE2
-
//------------------------------------------------------------------------------
// Entry point
extern void WebPInitSamplersSSE2(void);
-void WebPInitSamplersSSE2(void) {
-#if defined(WEBP_USE_SSE2)
- WebPSamplers[MODE_RGB] = YuvToRgbRowSSE2;
- WebPSamplers[MODE_RGBA] = YuvToRgbaRowSSE2;
- WebPSamplers[MODE_BGR] = YuvToBgrRowSSE2;
- WebPSamplers[MODE_BGRA] = YuvToBgraRowSSE2;
- WebPSamplers[MODE_ARGB] = YuvToArgbRowSSE2;
-#endif // WEBP_USE_SSE2
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersSSE2(void) {
+ WebPSamplers[MODE_RGB] = YuvToRgbRow;
+ WebPSamplers[MODE_RGBA] = YuvToRgbaRow;
+ WebPSamplers[MODE_BGR] = YuvToBgrRow;
+ WebPSamplers[MODE_BGRA] = YuvToBgraRow;
+ WebPSamplers[MODE_ARGB] = YuvToArgbRow;
}
+
+//------------------------------------------------------------------------------
+// RGB24/32 -> YUV converters
+
+// Load eight 16b-words from *src.
+#define LOAD_16(src) _mm_loadu_si128((const __m128i*)(src))
+// Store eight 16b-words into *dst
+#define STORE_16(V, dst) _mm_storeu_si128((__m128i*)(dst), (V))
+
+// Function that interleaves the values of the second half of the 'in' buffer
+// between consecutive chars of the first half.
+static WEBP_INLINE void RGB24PackedToPlanarHelper(
+ const __m128i* const in /*in[6]*/, __m128i* const out /*out[6]*/) {
+ out[0] = _mm_unpacklo_epi8(in[0], in[3]);
+ out[1] = _mm_unpackhi_epi8(in[0], in[3]);
+ out[2] = _mm_unpacklo_epi8(in[1], in[4]);
+ out[3] = _mm_unpackhi_epi8(in[1], in[4]);
+ out[4] = _mm_unpacklo_epi8(in[2], in[5]);
+ out[5] = _mm_unpackhi_epi8(in[2], in[5]);
+}
+
+// Unpack the 8b input rgbrgbrgbrgb ... as contiguous registers:
+// rrrr... rrrr... gggg... gggg... bbbb... bbbb....
+// Similar to PlanarTo24bHelper(), but in reverse order.
+static WEBP_INLINE void RGB24PackedToPlanar(const uint8_t* const rgb,
+ __m128i* const out /*out[6]*/) {
+ __m128i tmp[6];
+ tmp[0] = _mm_loadu_si128((const __m128i*)(rgb + 0));
+ tmp[1] = _mm_loadu_si128((const __m128i*)(rgb + 16));
+ tmp[2] = _mm_loadu_si128((const __m128i*)(rgb + 32));
+ tmp[3] = _mm_loadu_si128((const __m128i*)(rgb + 48));
+ tmp[4] = _mm_loadu_si128((const __m128i*)(rgb + 64));
+ tmp[5] = _mm_loadu_si128((const __m128i*)(rgb + 80));
+
+ RGB24PackedToPlanarHelper(tmp, out);
+ RGB24PackedToPlanarHelper(out, tmp);
+ RGB24PackedToPlanarHelper(tmp, out);
+ RGB24PackedToPlanarHelper(out, tmp);
+ RGB24PackedToPlanarHelper(tmp, out);
+}
+
+// Convert 8 packed ARGB to r[], g[], b[]
+static WEBP_INLINE void RGB32PackedToPlanar(const uint32_t* const argb,
+ __m128i* const r,
+ __m128i* const g,
+ __m128i* const b) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i in0 = LOAD_16(argb + 0); // argb3 | argb2 | argb1 | argb0
+ const __m128i in1 = LOAD_16(argb + 4); // argb7 | argb6 | argb5 | argb4
+ // column-wise transpose
+ const __m128i A0 = _mm_unpacklo_epi8(in0, in1);
+ const __m128i A1 = _mm_unpackhi_epi8(in0, in1);
+ const __m128i B0 = _mm_unpacklo_epi8(A0, A1);
+ const __m128i B1 = _mm_unpackhi_epi8(A0, A1);
+ // C0 = g7 g6 ... g1 g0 | b7 b6 ... b1 b0
+ // C1 = a7 a6 ... a1 a0 | r7 r6 ... r1 r0
+ const __m128i C0 = _mm_unpacklo_epi8(B0, B1);
+ const __m128i C1 = _mm_unpackhi_epi8(B0, B1);
+ // store 16b
+ *r = _mm_unpacklo_epi8(C1, zero);
+ *g = _mm_unpackhi_epi8(C0, zero);
+ *b = _mm_unpacklo_epi8(C0, zero);
+}
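(Editor's sketch: what the unpack cascade computes per pixel; alpha is discarded and the channels are widened to 16 bits, as in the intrinsics version.)

#include <stdint.h>

static void ArgbToPlanarScalar(const uint32_t argb[8], uint16_t r[8],
                               uint16_t g[8], uint16_t b[8]) {
  int i;
  for (i = 0; i < 8; ++i) {
    r[i] = (uint16_t)((argb[i] >> 16) & 0xff);
    g[i] = (uint16_t)((argb[i] >> 8) & 0xff);
    b[i] = (uint16_t)((argb[i] >> 0) & 0xff);
  }
}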
+
+// This macro computes (RG * MULT_RG + GB * MULT_GB + ROUNDER) >> DESCALE_FIX
+// It's a macro and not a function because _mm_srai_epi32() requires its
+// shift amount (DESCALE_FIX) to be a compile-time immediate.
+#define TRANSFORM(RG_LO, RG_HI, GB_LO, GB_HI, MULT_RG, MULT_GB, \
+ ROUNDER, DESCALE_FIX, OUT) do { \
+ const __m128i V0_lo = _mm_madd_epi16(RG_LO, MULT_RG); \
+ const __m128i V0_hi = _mm_madd_epi16(RG_HI, MULT_RG); \
+ const __m128i V1_lo = _mm_madd_epi16(GB_LO, MULT_GB); \
+ const __m128i V1_hi = _mm_madd_epi16(GB_HI, MULT_GB); \
+ const __m128i V2_lo = _mm_add_epi32(V0_lo, V1_lo); \
+ const __m128i V2_hi = _mm_add_epi32(V0_hi, V1_hi); \
+ const __m128i V3_lo = _mm_add_epi32(V2_lo, ROUNDER); \
+ const __m128i V3_hi = _mm_add_epi32(V2_hi, ROUNDER); \
+ const __m128i V5_lo = _mm_srai_epi32(V3_lo, DESCALE_FIX); \
+ const __m128i V5_hi = _mm_srai_epi32(V3_hi, DESCALE_FIX); \
+ (OUT) = _mm_packs_epi32(V5_lo, V5_hi); \
+} while (0)
+
+#define MK_CST_16(A, B) _mm_set_epi16((B), (A), (B), (A), (B), (A), (B), (A))
+static WEBP_INLINE void ConvertRGBToY(const __m128i* const R,
+ const __m128i* const G,
+ const __m128i* const B,
+ __m128i* const Y) {
+ const __m128i kRG_y = MK_CST_16(16839, 33059 - 16384);
+ const __m128i kGB_y = MK_CST_16(16384, 6420);
+ const __m128i kHALF_Y = _mm_set1_epi32((16 << YUV_FIX) + YUV_HALF);
+
+ const __m128i RG_lo = _mm_unpacklo_epi16(*R, *G);
+ const __m128i RG_hi = _mm_unpackhi_epi16(*R, *G);
+ const __m128i GB_lo = _mm_unpacklo_epi16(*G, *B);
+ const __m128i GB_hi = _mm_unpackhi_epi16(*G, *B);
+ TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_y, kGB_y, kHALF_Y, YUV_FIX, *Y);
+}
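(Editor's sketch: G's full weight 33059 does not fit a signed 16-bit madd operand, so it is split across the two pairs, (33059 - 16384) + 16384 == 33059. Assuming, as in this release's yuv.h, YUV_FIX == 16, the scalar equivalent is:)

#include <stdint.h>

static uint8_t ScalarRGBToY(int r, int g, int b) {
  const int luma = 16839 * r + 33059 * g + 6420 * b;
  return (uint8_t)((luma + (16 << 16) + (1 << 15)) >> 16);  // bias + rounding
}

For 8-bit inputs the result already lies in [16, 235], so no clamp is needed.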
+
+static WEBP_INLINE void ConvertRGBToUV(const __m128i* const R,
+ const __m128i* const G,
+ const __m128i* const B,
+ __m128i* const U, __m128i* const V) {
+ const __m128i kRG_u = MK_CST_16(-9719, -19081);
+ const __m128i kGB_u = MK_CST_16(0, 28800);
+ const __m128i kRG_v = MK_CST_16(28800, 0);
+ const __m128i kGB_v = MK_CST_16(-24116, -4684);
+ const __m128i kHALF_UV = _mm_set1_epi32(((128 << YUV_FIX) + YUV_HALF) << 2);
+
+ const __m128i RG_lo = _mm_unpacklo_epi16(*R, *G);
+ const __m128i RG_hi = _mm_unpackhi_epi16(*R, *G);
+ const __m128i GB_lo = _mm_unpacklo_epi16(*G, *B);
+ const __m128i GB_hi = _mm_unpackhi_epi16(*G, *B);
+ TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_u, kGB_u,
+ kHALF_UV, YUV_FIX + 2, *U);
+ TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_v, kGB_v,
+ kHALF_UV, YUV_FIX + 2, *V);
+}
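(Editor's sketch: the callers feed 4x-scaled sums of neighboring pixels, produced by HorizontalAddPack() further down, which the '+ 2' on the descale shift and the '<< 2' on the rounding constant compensate for. Again assuming YUV_FIX == 16:)

#include <stdint.h>

static uint8_t Sat8u(int v) { return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v); }

// r4/g4/b4 are 4x-scaled sums of neighboring pixels.
static void ScalarRGBToUV(int r4, int g4, int b4, uint8_t* u, uint8_t* v) {
  const int half = ((128 << 16) + (1 << 15)) << 2;
  *u = Sat8u((-9719 * r4 - 19081 * g4 + 28800 * b4 + half) >> 18);  // YUV_FIX + 2
  *v = Sat8u((28800 * r4 - 24116 * g4 - 4684 * b4 + half) >> 18);
}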
+
+#undef MK_CST_16
+#undef TRANSFORM
+
+static void ConvertRGB24ToY(const uint8_t* rgb, uint8_t* y, int width) {
+ const int max_width = width & ~31;
+ int i;
+ for (i = 0; i < max_width; rgb += 3 * 16 * 2) {
+ __m128i rgb_plane[6];
+ int j;
+
+ RGB24PackedToPlanar(rgb, rgb_plane);
+
+ for (j = 0; j < 2; ++j, i += 16) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i r, g, b, Y0, Y1;
+
+ // Convert to 16-bit Y.
+ r = _mm_unpacklo_epi8(rgb_plane[0 + j], zero);
+ g = _mm_unpacklo_epi8(rgb_plane[2 + j], zero);
+ b = _mm_unpacklo_epi8(rgb_plane[4 + j], zero);
+ ConvertRGBToY(&r, &g, &b, &Y0);
+
+ // Convert to 16-bit Y.
+ r = _mm_unpackhi_epi8(rgb_plane[0 + j], zero);
+ g = _mm_unpackhi_epi8(rgb_plane[2 + j], zero);
+ b = _mm_unpackhi_epi8(rgb_plane[4 + j], zero);
+ ConvertRGBToY(&r, &g, &b, &Y1);
+
+ // Cast to 8-bit and store.
+ STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
+ }
+ }
+ for (; i < width; ++i, rgb += 3) { // left-over
+ y[i] = VP8RGBToY(rgb[0], rgb[1], rgb[2], YUV_HALF);
+ }
+}
+
+static void ConvertBGR24ToY(const uint8_t* bgr, uint8_t* y, int width) {
+ const int max_width = width & ~31;
+ int i;
+ for (i = 0; i < max_width; bgr += 3 * 16 * 2) {
+ __m128i bgr_plane[6];
+ int j;
+
+ RGB24PackedToPlanar(bgr, bgr_plane);
+
+ for (j = 0; j < 2; ++j, i += 16) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i r, g, b, Y0, Y1;
+
+ // Convert to 16-bit Y.
+ b = _mm_unpacklo_epi8(bgr_plane[0 + j], zero);
+ g = _mm_unpacklo_epi8(bgr_plane[2 + j], zero);
+ r = _mm_unpacklo_epi8(bgr_plane[4 + j], zero);
+ ConvertRGBToY(&r, &g, &b, &Y0);
+
+ // Convert to 16-bit Y.
+ b = _mm_unpackhi_epi8(bgr_plane[0 + j], zero);
+ g = _mm_unpackhi_epi8(bgr_plane[2 + j], zero);
+ r = _mm_unpackhi_epi8(bgr_plane[4 + j], zero);
+ ConvertRGBToY(&r, &g, &b, &Y1);
+
+ // Cast to 8-bit and store.
+ STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
+ }
+ }
+ for (; i < width; ++i, bgr += 3) { // left-over
+ y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF);
+ }
+}
+
+static void ConvertARGBToY(const uint32_t* argb, uint8_t* y, int width) {
+ const int max_width = width & ~15;
+ int i;
+ for (i = 0; i < max_width; i += 16) {
+ __m128i r, g, b, Y0, Y1;
+ RGB32PackedToPlanar(&argb[i + 0], &r, &g, &b);
+ ConvertRGBToY(&r, &g, &b, &Y0);
+ RGB32PackedToPlanar(&argb[i + 8], &r, &g, &b);
+ ConvertRGBToY(&r, &g, &b, &Y1);
+ STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
+ }
+ for (; i < width; ++i) { // left-over
+ const uint32_t p = argb[i];
+ y[i] = VP8RGBToY((p >> 16) & 0xff, (p >> 8) & 0xff, (p >> 0) & 0xff,
+ YUV_HALF);
+ }
+}
+
+// Horizontal add (doubled) of two 16b values, result is 16b.
+// in: A | B | C | D | ... -> out: 2*(A+B) | 2*(C+D) | ...
+static void HorizontalAddPack(const __m128i* const A, const __m128i* const B,
+ __m128i* const out) {
+ const __m128i k2 = _mm_set1_epi16(2);
+ const __m128i C = _mm_madd_epi16(*A, k2);
+ const __m128i D = _mm_madd_epi16(*B, k2);
+ *out = _mm_packs_epi32(C, D);
+}
+
+static void ConvertARGBToUV(const uint32_t* argb, uint8_t* u, uint8_t* v,
+ int src_width, int do_store) {
+ const int max_width = src_width & ~31;
+ int i;
+ for (i = 0; i < max_width; i += 32, u += 16, v += 16) {
+ __m128i r0, g0, b0, r1, g1, b1, U0, V0, U1, V1;
+ RGB32PackedToPlanar(&argb[i + 0], &r0, &g0, &b0);
+ RGB32PackedToPlanar(&argb[i + 8], &r1, &g1, &b1);
+ HorizontalAddPack(&r0, &r1, &r0);
+ HorizontalAddPack(&g0, &g1, &g0);
+ HorizontalAddPack(&b0, &b1, &b0);
+ ConvertRGBToUV(&r0, &g0, &b0, &U0, &V0);
+
+ RGB32PackedToPlanar(&argb[i + 16], &r0, &g0, &b0);
+ RGB32PackedToPlanar(&argb[i + 24], &r1, &g1, &b1);
+ HorizontalAddPack(&r0, &r1, &r0);
+ HorizontalAddPack(&g0, &g1, &g0);
+ HorizontalAddPack(&b0, &b1, &b0);
+ ConvertRGBToUV(&r0, &g0, &b0, &U1, &V1);
+
+ U0 = _mm_packus_epi16(U0, U1);
+ V0 = _mm_packus_epi16(V0, V1);
+ if (!do_store) {
+ const __m128i prev_u = LOAD_16(u);
+ const __m128i prev_v = LOAD_16(v);
+ U0 = _mm_avg_epu8(U0, prev_u);
+ V0 = _mm_avg_epu8(V0, prev_v);
+ }
+ STORE_16(U0, u);
+ STORE_16(V0, v);
+ }
+ if (i < src_width) { // left-over
+ WebPConvertARGBToUV_C(argb + i, u, v, src_width - i, do_store);
+ }
+}
+
+// Convert 8 packed RGBA pixels (16b per channel) to r[], g[], b[]
+static WEBP_INLINE void RGBA32PackedToPlanar_16b(const uint16_t* const rgbx,
+ __m128i* const r,
+ __m128i* const g,
+ __m128i* const b) {
+ const __m128i in0 = LOAD_16(rgbx + 0); // r0 | g0 | b0 |x| r1 | g1 | b1 |x
+ const __m128i in1 = LOAD_16(rgbx + 8); // r2 | g2 | b2 |x| r3 | g3 | b3 |x
+ const __m128i in2 = LOAD_16(rgbx + 16); // r4 | ...
+ const __m128i in3 = LOAD_16(rgbx + 24); // r6 | ...
+ // column-wise transpose
+ const __m128i A0 = _mm_unpacklo_epi16(in0, in1);
+ const __m128i A1 = _mm_unpackhi_epi16(in0, in1);
+ const __m128i A2 = _mm_unpacklo_epi16(in2, in3);
+ const __m128i A3 = _mm_unpackhi_epi16(in2, in3);
+ const __m128i B0 = _mm_unpacklo_epi16(A0, A1); // r0 r1 r2 r3 | g0 g1 ..
+ const __m128i B1 = _mm_unpackhi_epi16(A0, A1); // b0 b1 b2 b3 | x x x x
+ const __m128i B2 = _mm_unpacklo_epi16(A2, A3); // r4 r5 r6 r7 | g4 g5 ..
+ const __m128i B3 = _mm_unpackhi_epi16(A2, A3); // b4 b5 b6 b7 | x x x x
+ *r = _mm_unpacklo_epi64(B0, B2);
+ *g = _mm_unpackhi_epi64(B0, B2);
+ *b = _mm_unpacklo_epi64(B1, B3);
+}
+
+static void ConvertRGBA32ToUV(const uint16_t* rgb,
+ uint8_t* u, uint8_t* v, int width) {
+ const int max_width = width & ~15;
+ const uint16_t* const last_rgb = rgb + 4 * max_width;
+ while (rgb < last_rgb) {
+ __m128i r, g, b, U0, V0, U1, V1;
+ RGBA32PackedToPlanar_16b(rgb + 0, &r, &g, &b);
+ ConvertRGBToUV(&r, &g, &b, &U0, &V0);
+ RGBA32PackedToPlanar_16b(rgb + 32, &r, &g, &b);
+ ConvertRGBToUV(&r, &g, &b, &U1, &V1);
+ STORE_16(_mm_packus_epi16(U0, U1), u);
+ STORE_16(_mm_packus_epi16(V0, V1), v);
+ u += 16;
+ v += 16;
+ rgb += 2 * 32;
+ }
+ if (max_width < width) { // left-over
+ WebPConvertRGBA32ToUV_C(rgb, u, v, width - max_width);
+ }
+}
+
+//------------------------------------------------------------------------------
+
+extern void WebPInitConvertARGBToYUVSSE2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVSSE2(void) {
+ WebPConvertARGBToY = ConvertARGBToY;
+ WebPConvertARGBToUV = ConvertARGBToUV;
+
+ WebPConvertRGB24ToY = ConvertRGB24ToY;
+ WebPConvertBGR24ToY = ConvertBGR24ToY;
+
+ WebPConvertRGBA32ToUV = ConvertRGBA32ToUV;
+}
+
+#else // !WEBP_USE_SSE2
+
+WEBP_DSP_INIT_STUB(WebPInitSamplersSSE2)
+WEBP_DSP_INIT_STUB(WebPInitConvertARGBToYUVSSE2)
+
+#endif // WEBP_USE_SSE2
diff --git a/src/3rdparty/libwebp/src/dsp/yuv_tables_sse2.h b/src/3rdparty/libwebp/src/dsp/yuv_tables_sse2.h
deleted file mode 100644
index 2b0f057..0000000
--- a/src/3rdparty/libwebp/src/dsp/yuv_tables_sse2.h
+++ /dev/null
@@ -1,536 +0,0 @@
-// Copyright 2014 Google Inc. All Rights Reserved.
-//
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
-// -----------------------------------------------------------------------------
-//
-// SSE2 tables for YUV->RGB conversion (12kB overall)
-//
-// Author: Skal (pascal.massimino@gmail.com)
-
-// This file is not compiled, but #include'd directly from yuv.c
-// Only used if WEBP_YUV_USE_SSE2_TABLES is defined.
-
-static const VP8kCstSSE2 VP8kYtoRGBA[256] = {
- {{0xfffb77b0, 0xfffb77b0, 0xfffb77b0, 0x003fc000}},
- {{0xfffbc235, 0xfffbc235, 0xfffbc235, 0x003fc000}},
- {{0xfffc0cba, 0xfffc0cba, 0xfffc0cba, 0x003fc000}},
- {{0xfffc573f, 0xfffc573f, 0xfffc573f, 0x003fc000}},
- {{0xfffca1c4, 0xfffca1c4, 0xfffca1c4, 0x003fc000}},
- {{0xfffcec49, 0xfffcec49, 0xfffcec49, 0x003fc000}},
- {{0xfffd36ce, 0xfffd36ce, 0xfffd36ce, 0x003fc000}},
- {{0xfffd8153, 0xfffd8153, 0xfffd8153, 0x003fc000}},
- {{0xfffdcbd8, 0xfffdcbd8, 0xfffdcbd8, 0x003fc000}},
- {{0xfffe165d, 0xfffe165d, 0xfffe165d, 0x003fc000}},
- {{0xfffe60e2, 0xfffe60e2, 0xfffe60e2, 0x003fc000}},
- {{0xfffeab67, 0xfffeab67, 0xfffeab67, 0x003fc000}},
- {{0xfffef5ec, 0xfffef5ec, 0xfffef5ec, 0x003fc000}},
- {{0xffff4071, 0xffff4071, 0xffff4071, 0x003fc000}},
- {{0xffff8af6, 0xffff8af6, 0xffff8af6, 0x003fc000}},
- {{0xffffd57b, 0xffffd57b, 0xffffd57b, 0x003fc000}},
- {{0x00002000, 0x00002000, 0x00002000, 0x003fc000}},
- {{0x00006a85, 0x00006a85, 0x00006a85, 0x003fc000}},
- {{0x0000b50a, 0x0000b50a, 0x0000b50a, 0x003fc000}},
- {{0x0000ff8f, 0x0000ff8f, 0x0000ff8f, 0x003fc000}},
- {{0x00014a14, 0x00014a14, 0x00014a14, 0x003fc000}},
- {{0x00019499, 0x00019499, 0x00019499, 0x003fc000}},
- {{0x0001df1e, 0x0001df1e, 0x0001df1e, 0x003fc000}},
- {{0x000229a3, 0x000229a3, 0x000229a3, 0x003fc000}},
- {{0x00027428, 0x00027428, 0x00027428, 0x003fc000}},
- {{0x0002bead, 0x0002bead, 0x0002bead, 0x003fc000}},
- {{0x00030932, 0x00030932, 0x00030932, 0x003fc000}},
- {{0x000353b7, 0x000353b7, 0x000353b7, 0x003fc000}},
- {{0x00039e3c, 0x00039e3c, 0x00039e3c, 0x003fc000}},
- {{0x0003e8c1, 0x0003e8c1, 0x0003e8c1, 0x003fc000}},
- {{0x00043346, 0x00043346, 0x00043346, 0x003fc000}},
- {{0x00047dcb, 0x00047dcb, 0x00047dcb, 0x003fc000}},
- {{0x0004c850, 0x0004c850, 0x0004c850, 0x003fc000}},
- {{0x000512d5, 0x000512d5, 0x000512d5, 0x003fc000}},
- {{0x00055d5a, 0x00055d5a, 0x00055d5a, 0x003fc000}},
- {{0x0005a7df, 0x0005a7df, 0x0005a7df, 0x003fc000}},
- {{0x0005f264, 0x0005f264, 0x0005f264, 0x003fc000}},
- {{0x00063ce9, 0x00063ce9, 0x00063ce9, 0x003fc000}},
- {{0x0006876e, 0x0006876e, 0x0006876e, 0x003fc000}},
- {{0x0006d1f3, 0x0006d1f3, 0x0006d1f3, 0x003fc000}},
- {{0x00071c78, 0x00071c78, 0x00071c78, 0x003fc000}},
- {{0x000766fd, 0x000766fd, 0x000766fd, 0x003fc000}},
- {{0x0007b182, 0x0007b182, 0x0007b182, 0x003fc000}},
- {{0x0007fc07, 0x0007fc07, 0x0007fc07, 0x003fc000}},
- {{0x0008468c, 0x0008468c, 0x0008468c, 0x003fc000}},
- {{0x00089111, 0x00089111, 0x00089111, 0x003fc000}},
- {{0x0008db96, 0x0008db96, 0x0008db96, 0x003fc000}},
- {{0x0009261b, 0x0009261b, 0x0009261b, 0x003fc000}},
- {{0x000970a0, 0x000970a0, 0x000970a0, 0x003fc000}},
- {{0x0009bb25, 0x0009bb25, 0x0009bb25, 0x003fc000}},
- {{0x000a05aa, 0x000a05aa, 0x000a05aa, 0x003fc000}},
- {{0x000a502f, 0x000a502f, 0x000a502f, 0x003fc000}},
- {{0x000a9ab4, 0x000a9ab4, 0x000a9ab4, 0x003fc000}},
- {{0x000ae539, 0x000ae539, 0x000ae539, 0x003fc000}},
- {{0x000b2fbe, 0x000b2fbe, 0x000b2fbe, 0x003fc000}},
- {{0x000b7a43, 0x000b7a43, 0x000b7a43, 0x003fc000}},
- {{0x000bc4c8, 0x000bc4c8, 0x000bc4c8, 0x003fc000}},
- {{0x000c0f4d, 0x000c0f4d, 0x000c0f4d, 0x003fc000}},
- {{0x000c59d2, 0x000c59d2, 0x000c59d2, 0x003fc000}},
- {{0x000ca457, 0x000ca457, 0x000ca457, 0x003fc000}},
- {{0x000ceedc, 0x000ceedc, 0x000ceedc, 0x003fc000}},
- {{0x000d3961, 0x000d3961, 0x000d3961, 0x003fc000}},
- {{0x000d83e6, 0x000d83e6, 0x000d83e6, 0x003fc000}},
- {{0x000dce6b, 0x000dce6b, 0x000dce6b, 0x003fc000}},
- {{0x000e18f0, 0x000e18f0, 0x000e18f0, 0x003fc000}},
- {{0x000e6375, 0x000e6375, 0x000e6375, 0x003fc000}},
- {{0x000eadfa, 0x000eadfa, 0x000eadfa, 0x003fc000}},
- {{0x000ef87f, 0x000ef87f, 0x000ef87f, 0x003fc000}},
- {{0x000f4304, 0x000f4304, 0x000f4304, 0x003fc000}},
- {{0x000f8d89, 0x000f8d89, 0x000f8d89, 0x003fc000}},
- {{0x000fd80e, 0x000fd80e, 0x000fd80e, 0x003fc000}},
- {{0x00102293, 0x00102293, 0x00102293, 0x003fc000}},
- {{0x00106d18, 0x00106d18, 0x00106d18, 0x003fc000}},
- {{0x0010b79d, 0x0010b79d, 0x0010b79d, 0x003fc000}},
- {{0x00110222, 0x00110222, 0x00110222, 0x003fc000}},
- {{0x00114ca7, 0x00114ca7, 0x00114ca7, 0x003fc000}},
- {{0x0011972c, 0x0011972c, 0x0011972c, 0x003fc000}},
- {{0x0011e1b1, 0x0011e1b1, 0x0011e1b1, 0x003fc000}},
- {{0x00122c36, 0x00122c36, 0x00122c36, 0x003fc000}},
- {{0x001276bb, 0x001276bb, 0x001276bb, 0x003fc000}},
- {{0x0012c140, 0x0012c140, 0x0012c140, 0x003fc000}},
- {{0x00130bc5, 0x00130bc5, 0x00130bc5, 0x003fc000}},
- {{0x0013564a, 0x0013564a, 0x0013564a, 0x003fc000}},
- {{0x0013a0cf, 0x0013a0cf, 0x0013a0cf, 0x003fc000}},
- {{0x0013eb54, 0x0013eb54, 0x0013eb54, 0x003fc000}},
- {{0x001435d9, 0x001435d9, 0x001435d9, 0x003fc000}},
- {{0x0014805e, 0x0014805e, 0x0014805e, 0x003fc000}},
- {{0x0014cae3, 0x0014cae3, 0x0014cae3, 0x003fc000}},
- {{0x00151568, 0x00151568, 0x00151568, 0x003fc000}},
- {{0x00155fed, 0x00155fed, 0x00155fed, 0x003fc000}},
- {{0x0015aa72, 0x0015aa72, 0x0015aa72, 0x003fc000}},
- {{0x0015f4f7, 0x0015f4f7, 0x0015f4f7, 0x003fc000}},
- {{0x00163f7c, 0x00163f7c, 0x00163f7c, 0x003fc000}},
- {{0x00168a01, 0x00168a01, 0x00168a01, 0x003fc000}},
- {{0x0016d486, 0x0016d486, 0x0016d486, 0x003fc000}},
- {{0x00171f0b, 0x00171f0b, 0x00171f0b, 0x003fc000}},
- {{0x00176990, 0x00176990, 0x00176990, 0x003fc000}},
- {{0x0017b415, 0x0017b415, 0x0017b415, 0x003fc000}},
- {{0x0017fe9a, 0x0017fe9a, 0x0017fe9a, 0x003fc000}},
- {{0x0018491f, 0x0018491f, 0x0018491f, 0x003fc000}},
- {{0x001893a4, 0x001893a4, 0x001893a4, 0x003fc000}},
- {{0x0018de29, 0x0018de29, 0x0018de29, 0x003fc000}},
- {{0x001928ae, 0x001928ae, 0x001928ae, 0x003fc000}},
- {{0x00197333, 0x00197333, 0x00197333, 0x003fc000}},
- {{0x0019bdb8, 0x0019bdb8, 0x0019bdb8, 0x003fc000}},
- {{0x001a083d, 0x001a083d, 0x001a083d, 0x003fc000}},
- {{0x001a52c2, 0x001a52c2, 0x001a52c2, 0x003fc000}},
- {{0x001a9d47, 0x001a9d47, 0x001a9d47, 0x003fc000}},
- {{0x001ae7cc, 0x001ae7cc, 0x001ae7cc, 0x003fc000}},
- {{0x001b3251, 0x001b3251, 0x001b3251, 0x003fc000}},
- {{0x001b7cd6, 0x001b7cd6, 0x001b7cd6, 0x003fc000}},
- {{0x001bc75b, 0x001bc75b, 0x001bc75b, 0x003fc000}},
- {{0x001c11e0, 0x001c11e0, 0x001c11e0, 0x003fc000}},
- {{0x001c5c65, 0x001c5c65, 0x001c5c65, 0x003fc000}},
- {{0x001ca6ea, 0x001ca6ea, 0x001ca6ea, 0x003fc000}},
- {{0x001cf16f, 0x001cf16f, 0x001cf16f, 0x003fc000}},
- {{0x001d3bf4, 0x001d3bf4, 0x001d3bf4, 0x003fc000}},
- {{0x001d8679, 0x001d8679, 0x001d8679, 0x003fc000}},
- {{0x001dd0fe, 0x001dd0fe, 0x001dd0fe, 0x003fc000}},
- {{0x001e1b83, 0x001e1b83, 0x001e1b83, 0x003fc000}},
- {{0x001e6608, 0x001e6608, 0x001e6608, 0x003fc000}},
- {{0x001eb08d, 0x001eb08d, 0x001eb08d, 0x003fc000}},
- {{0x001efb12, 0x001efb12, 0x001efb12, 0x003fc000}},
- {{0x001f4597, 0x001f4597, 0x001f4597, 0x003fc000}},
- {{0x001f901c, 0x001f901c, 0x001f901c, 0x003fc000}},
- {{0x001fdaa1, 0x001fdaa1, 0x001fdaa1, 0x003fc000}},
- {{0x00202526, 0x00202526, 0x00202526, 0x003fc000}},
- {{0x00206fab, 0x00206fab, 0x00206fab, 0x003fc000}},
- {{0x0020ba30, 0x0020ba30, 0x0020ba30, 0x003fc000}},
- {{0x002104b5, 0x002104b5, 0x002104b5, 0x003fc000}},
- {{0x00214f3a, 0x00214f3a, 0x00214f3a, 0x003fc000}},
- {{0x002199bf, 0x002199bf, 0x002199bf, 0x003fc000}},
- {{0x0021e444, 0x0021e444, 0x0021e444, 0x003fc000}},
- {{0x00222ec9, 0x00222ec9, 0x00222ec9, 0x003fc000}},
- {{0x0022794e, 0x0022794e, 0x0022794e, 0x003fc000}},
- {{0x0022c3d3, 0x0022c3d3, 0x0022c3d3, 0x003fc000}},
- {{0x00230e58, 0x00230e58, 0x00230e58, 0x003fc000}},
- {{0x002358dd, 0x002358dd, 0x002358dd, 0x003fc000}},
- {{0x0023a362, 0x0023a362, 0x0023a362, 0x003fc000}},
- {{0x0023ede7, 0x0023ede7, 0x0023ede7, 0x003fc000}},
- {{0x0024386c, 0x0024386c, 0x0024386c, 0x003fc000}},
- {{0x002482f1, 0x002482f1, 0x002482f1, 0x003fc000}},
- {{0x0024cd76, 0x0024cd76, 0x0024cd76, 0x003fc000}},
- {{0x002517fb, 0x002517fb, 0x002517fb, 0x003fc000}},
- {{0x00256280, 0x00256280, 0x00256280, 0x003fc000}},
- {{0x0025ad05, 0x0025ad05, 0x0025ad05, 0x003fc000}},
- {{0x0025f78a, 0x0025f78a, 0x0025f78a, 0x003fc000}},
- {{0x0026420f, 0x0026420f, 0x0026420f, 0x003fc000}},
- {{0x00268c94, 0x00268c94, 0x00268c94, 0x003fc000}},
- {{0x0026d719, 0x0026d719, 0x0026d719, 0x003fc000}},
- {{0x0027219e, 0x0027219e, 0x0027219e, 0x003fc000}},
- {{0x00276c23, 0x00276c23, 0x00276c23, 0x003fc000}},
- {{0x0027b6a8, 0x0027b6a8, 0x0027b6a8, 0x003fc000}},
- {{0x0028012d, 0x0028012d, 0x0028012d, 0x003fc000}},
- {{0x00284bb2, 0x00284bb2, 0x00284bb2, 0x003fc000}},
- {{0x00289637, 0x00289637, 0x00289637, 0x003fc000}},
- {{0x0028e0bc, 0x0028e0bc, 0x0028e0bc, 0x003fc000}},
- {{0x00292b41, 0x00292b41, 0x00292b41, 0x003fc000}},
- {{0x002975c6, 0x002975c6, 0x002975c6, 0x003fc000}},
- {{0x0029c04b, 0x0029c04b, 0x0029c04b, 0x003fc000}},
- {{0x002a0ad0, 0x002a0ad0, 0x002a0ad0, 0x003fc000}},
- {{0x002a5555, 0x002a5555, 0x002a5555, 0x003fc000}},
- {{0x002a9fda, 0x002a9fda, 0x002a9fda, 0x003fc000}},
- {{0x002aea5f, 0x002aea5f, 0x002aea5f, 0x003fc000}},
- {{0x002b34e4, 0x002b34e4, 0x002b34e4, 0x003fc000}},
- {{0x002b7f69, 0x002b7f69, 0x002b7f69, 0x003fc000}},
- {{0x002bc9ee, 0x002bc9ee, 0x002bc9ee, 0x003fc000}},
- {{0x002c1473, 0x002c1473, 0x002c1473, 0x003fc000}},
- {{0x002c5ef8, 0x002c5ef8, 0x002c5ef8, 0x003fc000}},
- {{0x002ca97d, 0x002ca97d, 0x002ca97d, 0x003fc000}},
- {{0x002cf402, 0x002cf402, 0x002cf402, 0x003fc000}},
- {{0x002d3e87, 0x002d3e87, 0x002d3e87, 0x003fc000}},
- {{0x002d890c, 0x002d890c, 0x002d890c, 0x003fc000}},
- {{0x002dd391, 0x002dd391, 0x002dd391, 0x003fc000}},
- {{0x002e1e16, 0x002e1e16, 0x002e1e16, 0x003fc000}},
- {{0x002e689b, 0x002e689b, 0x002e689b, 0x003fc000}},
- {{0x002eb320, 0x002eb320, 0x002eb320, 0x003fc000}},
- {{0x002efda5, 0x002efda5, 0x002efda5, 0x003fc000}},
- {{0x002f482a, 0x002f482a, 0x002f482a, 0x003fc000}},
- {{0x002f92af, 0x002f92af, 0x002f92af, 0x003fc000}},
- {{0x002fdd34, 0x002fdd34, 0x002fdd34, 0x003fc000}},
- {{0x003027b9, 0x003027b9, 0x003027b9, 0x003fc000}},
- {{0x0030723e, 0x0030723e, 0x0030723e, 0x003fc000}},
- {{0x0030bcc3, 0x0030bcc3, 0x0030bcc3, 0x003fc000}},
- {{0x00310748, 0x00310748, 0x00310748, 0x003fc000}},
- {{0x003151cd, 0x003151cd, 0x003151cd, 0x003fc000}},
- {{0x00319c52, 0x00319c52, 0x00319c52, 0x003fc000}},
- {{0x0031e6d7, 0x0031e6d7, 0x0031e6d7, 0x003fc000}},
- {{0x0032315c, 0x0032315c, 0x0032315c, 0x003fc000}},
- {{0x00327be1, 0x00327be1, 0x00327be1, 0x003fc000}},
- {{0x0032c666, 0x0032c666, 0x0032c666, 0x003fc000}},
- {{0x003310eb, 0x003310eb, 0x003310eb, 0x003fc000}},
- {{0x00335b70, 0x00335b70, 0x00335b70, 0x003fc000}},
- {{0x0033a5f5, 0x0033a5f5, 0x0033a5f5, 0x003fc000}},
- {{0x0033f07a, 0x0033f07a, 0x0033f07a, 0x003fc000}},
- {{0x00343aff, 0x00343aff, 0x00343aff, 0x003fc000}},
- {{0x00348584, 0x00348584, 0x00348584, 0x003fc000}},
- {{0x0034d009, 0x0034d009, 0x0034d009, 0x003fc000}},
- {{0x00351a8e, 0x00351a8e, 0x00351a8e, 0x003fc000}},
- {{0x00356513, 0x00356513, 0x00356513, 0x003fc000}},
- {{0x0035af98, 0x0035af98, 0x0035af98, 0x003fc000}},
- {{0x0035fa1d, 0x0035fa1d, 0x0035fa1d, 0x003fc000}},
- {{0x003644a2, 0x003644a2, 0x003644a2, 0x003fc000}},
- {{0x00368f27, 0x00368f27, 0x00368f27, 0x003fc000}},
- {{0x0036d9ac, 0x0036d9ac, 0x0036d9ac, 0x003fc000}},
- {{0x00372431, 0x00372431, 0x00372431, 0x003fc000}},
- {{0x00376eb6, 0x00376eb6, 0x00376eb6, 0x003fc000}},
- {{0x0037b93b, 0x0037b93b, 0x0037b93b, 0x003fc000}},
- {{0x003803c0, 0x003803c0, 0x003803c0, 0x003fc000}},
- {{0x00384e45, 0x00384e45, 0x00384e45, 0x003fc000}},
- {{0x003898ca, 0x003898ca, 0x003898ca, 0x003fc000}},
- {{0x0038e34f, 0x0038e34f, 0x0038e34f, 0x003fc000}},
- {{0x00392dd4, 0x00392dd4, 0x00392dd4, 0x003fc000}},
- {{0x00397859, 0x00397859, 0x00397859, 0x003fc000}},
- {{0x0039c2de, 0x0039c2de, 0x0039c2de, 0x003fc000}},
- {{0x003a0d63, 0x003a0d63, 0x003a0d63, 0x003fc000}},
- {{0x003a57e8, 0x003a57e8, 0x003a57e8, 0x003fc000}},
- {{0x003aa26d, 0x003aa26d, 0x003aa26d, 0x003fc000}},
- {{0x003aecf2, 0x003aecf2, 0x003aecf2, 0x003fc000}},
- {{0x003b3777, 0x003b3777, 0x003b3777, 0x003fc000}},
- {{0x003b81fc, 0x003b81fc, 0x003b81fc, 0x003fc000}},
- {{0x003bcc81, 0x003bcc81, 0x003bcc81, 0x003fc000}},
- {{0x003c1706, 0x003c1706, 0x003c1706, 0x003fc000}},
- {{0x003c618b, 0x003c618b, 0x003c618b, 0x003fc000}},
- {{0x003cac10, 0x003cac10, 0x003cac10, 0x003fc000}},
- {{0x003cf695, 0x003cf695, 0x003cf695, 0x003fc000}},
- {{0x003d411a, 0x003d411a, 0x003d411a, 0x003fc000}},
- {{0x003d8b9f, 0x003d8b9f, 0x003d8b9f, 0x003fc000}},
- {{0x003dd624, 0x003dd624, 0x003dd624, 0x003fc000}},
- {{0x003e20a9, 0x003e20a9, 0x003e20a9, 0x003fc000}},
- {{0x003e6b2e, 0x003e6b2e, 0x003e6b2e, 0x003fc000}},
- {{0x003eb5b3, 0x003eb5b3, 0x003eb5b3, 0x003fc000}},
- {{0x003f0038, 0x003f0038, 0x003f0038, 0x003fc000}},
- {{0x003f4abd, 0x003f4abd, 0x003f4abd, 0x003fc000}},
- {{0x003f9542, 0x003f9542, 0x003f9542, 0x003fc000}},
- {{0x003fdfc7, 0x003fdfc7, 0x003fdfc7, 0x003fc000}},
- {{0x00402a4c, 0x00402a4c, 0x00402a4c, 0x003fc000}},
- {{0x004074d1, 0x004074d1, 0x004074d1, 0x003fc000}},
- {{0x0040bf56, 0x0040bf56, 0x0040bf56, 0x003fc000}},
- {{0x004109db, 0x004109db, 0x004109db, 0x003fc000}},
- {{0x00415460, 0x00415460, 0x00415460, 0x003fc000}},
- {{0x00419ee5, 0x00419ee5, 0x00419ee5, 0x003fc000}},
- {{0x0041e96a, 0x0041e96a, 0x0041e96a, 0x003fc000}},
- {{0x004233ef, 0x004233ef, 0x004233ef, 0x003fc000}},
- {{0x00427e74, 0x00427e74, 0x00427e74, 0x003fc000}},
- {{0x0042c8f9, 0x0042c8f9, 0x0042c8f9, 0x003fc000}},
- {{0x0043137e, 0x0043137e, 0x0043137e, 0x003fc000}},
- {{0x00435e03, 0x00435e03, 0x00435e03, 0x003fc000}},
- {{0x0043a888, 0x0043a888, 0x0043a888, 0x003fc000}},
- {{0x0043f30d, 0x0043f30d, 0x0043f30d, 0x003fc000}},
- {{0x00443d92, 0x00443d92, 0x00443d92, 0x003fc000}},
- {{0x00448817, 0x00448817, 0x00448817, 0x003fc000}},
- {{0x0044d29c, 0x0044d29c, 0x0044d29c, 0x003fc000}},
- {{0x00451d21, 0x00451d21, 0x00451d21, 0x003fc000}},
- {{0x004567a6, 0x004567a6, 0x004567a6, 0x003fc000}},
- {{0x0045b22b, 0x0045b22b, 0x0045b22b, 0x003fc000}}
-};
-
-static const VP8kCstSSE2 VP8kUtoRGBA[256] = {
- {{0, 0x000c8980, 0xffbf7300, 0}}, {{0, 0x000c706d, 0xffbff41a, 0}},
- {{0, 0x000c575a, 0xffc07534, 0}}, {{0, 0x000c3e47, 0xffc0f64e, 0}},
- {{0, 0x000c2534, 0xffc17768, 0}}, {{0, 0x000c0c21, 0xffc1f882, 0}},
- {{0, 0x000bf30e, 0xffc2799c, 0}}, {{0, 0x000bd9fb, 0xffc2fab6, 0}},
- {{0, 0x000bc0e8, 0xffc37bd0, 0}}, {{0, 0x000ba7d5, 0xffc3fcea, 0}},
- {{0, 0x000b8ec2, 0xffc47e04, 0}}, {{0, 0x000b75af, 0xffc4ff1e, 0}},
- {{0, 0x000b5c9c, 0xffc58038, 0}}, {{0, 0x000b4389, 0xffc60152, 0}},
- {{0, 0x000b2a76, 0xffc6826c, 0}}, {{0, 0x000b1163, 0xffc70386, 0}},
- {{0, 0x000af850, 0xffc784a0, 0}}, {{0, 0x000adf3d, 0xffc805ba, 0}},
- {{0, 0x000ac62a, 0xffc886d4, 0}}, {{0, 0x000aad17, 0xffc907ee, 0}},
- {{0, 0x000a9404, 0xffc98908, 0}}, {{0, 0x000a7af1, 0xffca0a22, 0}},
- {{0, 0x000a61de, 0xffca8b3c, 0}}, {{0, 0x000a48cb, 0xffcb0c56, 0}},
- {{0, 0x000a2fb8, 0xffcb8d70, 0}}, {{0, 0x000a16a5, 0xffcc0e8a, 0}},
- {{0, 0x0009fd92, 0xffcc8fa4, 0}}, {{0, 0x0009e47f, 0xffcd10be, 0}},
- {{0, 0x0009cb6c, 0xffcd91d8, 0}}, {{0, 0x0009b259, 0xffce12f2, 0}},
- {{0, 0x00099946, 0xffce940c, 0}}, {{0, 0x00098033, 0xffcf1526, 0}},
- {{0, 0x00096720, 0xffcf9640, 0}}, {{0, 0x00094e0d, 0xffd0175a, 0}},
- {{0, 0x000934fa, 0xffd09874, 0}}, {{0, 0x00091be7, 0xffd1198e, 0}},
- {{0, 0x000902d4, 0xffd19aa8, 0}}, {{0, 0x0008e9c1, 0xffd21bc2, 0}},
- {{0, 0x0008d0ae, 0xffd29cdc, 0}}, {{0, 0x0008b79b, 0xffd31df6, 0}},
- {{0, 0x00089e88, 0xffd39f10, 0}}, {{0, 0x00088575, 0xffd4202a, 0}},
- {{0, 0x00086c62, 0xffd4a144, 0}}, {{0, 0x0008534f, 0xffd5225e, 0}},
- {{0, 0x00083a3c, 0xffd5a378, 0}}, {{0, 0x00082129, 0xffd62492, 0}},
- {{0, 0x00080816, 0xffd6a5ac, 0}}, {{0, 0x0007ef03, 0xffd726c6, 0}},
- {{0, 0x0007d5f0, 0xffd7a7e0, 0}}, {{0, 0x0007bcdd, 0xffd828fa, 0}},
- {{0, 0x0007a3ca, 0xffd8aa14, 0}}, {{0, 0x00078ab7, 0xffd92b2e, 0}},
- {{0, 0x000771a4, 0xffd9ac48, 0}}, {{0, 0x00075891, 0xffda2d62, 0}},
- {{0, 0x00073f7e, 0xffdaae7c, 0}}, {{0, 0x0007266b, 0xffdb2f96, 0}},
- {{0, 0x00070d58, 0xffdbb0b0, 0}}, {{0, 0x0006f445, 0xffdc31ca, 0}},
- {{0, 0x0006db32, 0xffdcb2e4, 0}}, {{0, 0x0006c21f, 0xffdd33fe, 0}},
- {{0, 0x0006a90c, 0xffddb518, 0}}, {{0, 0x00068ff9, 0xffde3632, 0}},
- {{0, 0x000676e6, 0xffdeb74c, 0}}, {{0, 0x00065dd3, 0xffdf3866, 0}},
- {{0, 0x000644c0, 0xffdfb980, 0}}, {{0, 0x00062bad, 0xffe03a9a, 0}},
- {{0, 0x0006129a, 0xffe0bbb4, 0}}, {{0, 0x0005f987, 0xffe13cce, 0}},
- {{0, 0x0005e074, 0xffe1bde8, 0}}, {{0, 0x0005c761, 0xffe23f02, 0}},
- {{0, 0x0005ae4e, 0xffe2c01c, 0}}, {{0, 0x0005953b, 0xffe34136, 0}},
- {{0, 0x00057c28, 0xffe3c250, 0}}, {{0, 0x00056315, 0xffe4436a, 0}},
- {{0, 0x00054a02, 0xffe4c484, 0}}, {{0, 0x000530ef, 0xffe5459e, 0}},
- {{0, 0x000517dc, 0xffe5c6b8, 0}}, {{0, 0x0004fec9, 0xffe647d2, 0}},
- {{0, 0x0004e5b6, 0xffe6c8ec, 0}}, {{0, 0x0004cca3, 0xffe74a06, 0}},
- {{0, 0x0004b390, 0xffe7cb20, 0}}, {{0, 0x00049a7d, 0xffe84c3a, 0}},
- {{0, 0x0004816a, 0xffe8cd54, 0}}, {{0, 0x00046857, 0xffe94e6e, 0}},
- {{0, 0x00044f44, 0xffe9cf88, 0}}, {{0, 0x00043631, 0xffea50a2, 0}},
- {{0, 0x00041d1e, 0xffead1bc, 0}}, {{0, 0x0004040b, 0xffeb52d6, 0}},
- {{0, 0x0003eaf8, 0xffebd3f0, 0}}, {{0, 0x0003d1e5, 0xffec550a, 0}},
- {{0, 0x0003b8d2, 0xffecd624, 0}}, {{0, 0x00039fbf, 0xffed573e, 0}},
- {{0, 0x000386ac, 0xffedd858, 0}}, {{0, 0x00036d99, 0xffee5972, 0}},
- {{0, 0x00035486, 0xffeeda8c, 0}}, {{0, 0x00033b73, 0xffef5ba6, 0}},
- {{0, 0x00032260, 0xffefdcc0, 0}}, {{0, 0x0003094d, 0xfff05dda, 0}},
- {{0, 0x0002f03a, 0xfff0def4, 0}}, {{0, 0x0002d727, 0xfff1600e, 0}},
- {{0, 0x0002be14, 0xfff1e128, 0}}, {{0, 0x0002a501, 0xfff26242, 0}},
- {{0, 0x00028bee, 0xfff2e35c, 0}}, {{0, 0x000272db, 0xfff36476, 0}},
- {{0, 0x000259c8, 0xfff3e590, 0}}, {{0, 0x000240b5, 0xfff466aa, 0}},
- {{0, 0x000227a2, 0xfff4e7c4, 0}}, {{0, 0x00020e8f, 0xfff568de, 0}},
- {{0, 0x0001f57c, 0xfff5e9f8, 0}}, {{0, 0x0001dc69, 0xfff66b12, 0}},
- {{0, 0x0001c356, 0xfff6ec2c, 0}}, {{0, 0x0001aa43, 0xfff76d46, 0}},
- {{0, 0x00019130, 0xfff7ee60, 0}}, {{0, 0x0001781d, 0xfff86f7a, 0}},
- {{0, 0x00015f0a, 0xfff8f094, 0}}, {{0, 0x000145f7, 0xfff971ae, 0}},
- {{0, 0x00012ce4, 0xfff9f2c8, 0}}, {{0, 0x000113d1, 0xfffa73e2, 0}},
- {{0, 0x0000fabe, 0xfffaf4fc, 0}}, {{0, 0x0000e1ab, 0xfffb7616, 0}},
- {{0, 0x0000c898, 0xfffbf730, 0}}, {{0, 0x0000af85, 0xfffc784a, 0}},
- {{0, 0x00009672, 0xfffcf964, 0}}, {{0, 0x00007d5f, 0xfffd7a7e, 0}},
- {{0, 0x0000644c, 0xfffdfb98, 0}}, {{0, 0x00004b39, 0xfffe7cb2, 0}},
- {{0, 0x00003226, 0xfffefdcc, 0}}, {{0, 0x00001913, 0xffff7ee6, 0}},
- {{0, 0x00000000, 0x00000000, 0}}, {{0, 0xffffe6ed, 0x0000811a, 0}},
- {{0, 0xffffcdda, 0x00010234, 0}}, {{0, 0xffffb4c7, 0x0001834e, 0}},
- {{0, 0xffff9bb4, 0x00020468, 0}}, {{0, 0xffff82a1, 0x00028582, 0}},
- {{0, 0xffff698e, 0x0003069c, 0}}, {{0, 0xffff507b, 0x000387b6, 0}},
- {{0, 0xffff3768, 0x000408d0, 0}}, {{0, 0xffff1e55, 0x000489ea, 0}},
- {{0, 0xffff0542, 0x00050b04, 0}}, {{0, 0xfffeec2f, 0x00058c1e, 0}},
- {{0, 0xfffed31c, 0x00060d38, 0}}, {{0, 0xfffeba09, 0x00068e52, 0}},
- {{0, 0xfffea0f6, 0x00070f6c, 0}}, {{0, 0xfffe87e3, 0x00079086, 0}},
- {{0, 0xfffe6ed0, 0x000811a0, 0}}, {{0, 0xfffe55bd, 0x000892ba, 0}},
- {{0, 0xfffe3caa, 0x000913d4, 0}}, {{0, 0xfffe2397, 0x000994ee, 0}},
- {{0, 0xfffe0a84, 0x000a1608, 0}}, {{0, 0xfffdf171, 0x000a9722, 0}},
- {{0, 0xfffdd85e, 0x000b183c, 0}}, {{0, 0xfffdbf4b, 0x000b9956, 0}},
- {{0, 0xfffda638, 0x000c1a70, 0}}, {{0, 0xfffd8d25, 0x000c9b8a, 0}},
- {{0, 0xfffd7412, 0x000d1ca4, 0}}, {{0, 0xfffd5aff, 0x000d9dbe, 0}},
- {{0, 0xfffd41ec, 0x000e1ed8, 0}}, {{0, 0xfffd28d9, 0x000e9ff2, 0}},
- {{0, 0xfffd0fc6, 0x000f210c, 0}}, {{0, 0xfffcf6b3, 0x000fa226, 0}},
- {{0, 0xfffcdda0, 0x00102340, 0}}, {{0, 0xfffcc48d, 0x0010a45a, 0}},
- {{0, 0xfffcab7a, 0x00112574, 0}}, {{0, 0xfffc9267, 0x0011a68e, 0}},
- {{0, 0xfffc7954, 0x001227a8, 0}}, {{0, 0xfffc6041, 0x0012a8c2, 0}},
- {{0, 0xfffc472e, 0x001329dc, 0}}, {{0, 0xfffc2e1b, 0x0013aaf6, 0}},
- {{0, 0xfffc1508, 0x00142c10, 0}}, {{0, 0xfffbfbf5, 0x0014ad2a, 0}},
- {{0, 0xfffbe2e2, 0x00152e44, 0}}, {{0, 0xfffbc9cf, 0x0015af5e, 0}},
- {{0, 0xfffbb0bc, 0x00163078, 0}}, {{0, 0xfffb97a9, 0x0016b192, 0}},
- {{0, 0xfffb7e96, 0x001732ac, 0}}, {{0, 0xfffb6583, 0x0017b3c6, 0}},
- {{0, 0xfffb4c70, 0x001834e0, 0}}, {{0, 0xfffb335d, 0x0018b5fa, 0}},
- {{0, 0xfffb1a4a, 0x00193714, 0}}, {{0, 0xfffb0137, 0x0019b82e, 0}},
- {{0, 0xfffae824, 0x001a3948, 0}}, {{0, 0xfffacf11, 0x001aba62, 0}},
- {{0, 0xfffab5fe, 0x001b3b7c, 0}}, {{0, 0xfffa9ceb, 0x001bbc96, 0}},
- {{0, 0xfffa83d8, 0x001c3db0, 0}}, {{0, 0xfffa6ac5, 0x001cbeca, 0}},
- {{0, 0xfffa51b2, 0x001d3fe4, 0}}, {{0, 0xfffa389f, 0x001dc0fe, 0}},
- {{0, 0xfffa1f8c, 0x001e4218, 0}}, {{0, 0xfffa0679, 0x001ec332, 0}},
- {{0, 0xfff9ed66, 0x001f444c, 0}}, {{0, 0xfff9d453, 0x001fc566, 0}},
- {{0, 0xfff9bb40, 0x00204680, 0}}, {{0, 0xfff9a22d, 0x0020c79a, 0}},
- {{0, 0xfff9891a, 0x002148b4, 0}}, {{0, 0xfff97007, 0x0021c9ce, 0}},
- {{0, 0xfff956f4, 0x00224ae8, 0}}, {{0, 0xfff93de1, 0x0022cc02, 0}},
- {{0, 0xfff924ce, 0x00234d1c, 0}}, {{0, 0xfff90bbb, 0x0023ce36, 0}},
- {{0, 0xfff8f2a8, 0x00244f50, 0}}, {{0, 0xfff8d995, 0x0024d06a, 0}},
- {{0, 0xfff8c082, 0x00255184, 0}}, {{0, 0xfff8a76f, 0x0025d29e, 0}},
- {{0, 0xfff88e5c, 0x002653b8, 0}}, {{0, 0xfff87549, 0x0026d4d2, 0}},
- {{0, 0xfff85c36, 0x002755ec, 0}}, {{0, 0xfff84323, 0x0027d706, 0}},
- {{0, 0xfff82a10, 0x00285820, 0}}, {{0, 0xfff810fd, 0x0028d93a, 0}},
- {{0, 0xfff7f7ea, 0x00295a54, 0}}, {{0, 0xfff7ded7, 0x0029db6e, 0}},
- {{0, 0xfff7c5c4, 0x002a5c88, 0}}, {{0, 0xfff7acb1, 0x002adda2, 0}},
- {{0, 0xfff7939e, 0x002b5ebc, 0}}, {{0, 0xfff77a8b, 0x002bdfd6, 0}},
- {{0, 0xfff76178, 0x002c60f0, 0}}, {{0, 0xfff74865, 0x002ce20a, 0}},
- {{0, 0xfff72f52, 0x002d6324, 0}}, {{0, 0xfff7163f, 0x002de43e, 0}},
- {{0, 0xfff6fd2c, 0x002e6558, 0}}, {{0, 0xfff6e419, 0x002ee672, 0}},
- {{0, 0xfff6cb06, 0x002f678c, 0}}, {{0, 0xfff6b1f3, 0x002fe8a6, 0}},
- {{0, 0xfff698e0, 0x003069c0, 0}}, {{0, 0xfff67fcd, 0x0030eada, 0}},
- {{0, 0xfff666ba, 0x00316bf4, 0}}, {{0, 0xfff64da7, 0x0031ed0e, 0}},
- {{0, 0xfff63494, 0x00326e28, 0}}, {{0, 0xfff61b81, 0x0032ef42, 0}},
- {{0, 0xfff6026e, 0x0033705c, 0}}, {{0, 0xfff5e95b, 0x0033f176, 0}},
- {{0, 0xfff5d048, 0x00347290, 0}}, {{0, 0xfff5b735, 0x0034f3aa, 0}},
- {{0, 0xfff59e22, 0x003574c4, 0}}, {{0, 0xfff5850f, 0x0035f5de, 0}},
- {{0, 0xfff56bfc, 0x003676f8, 0}}, {{0, 0xfff552e9, 0x0036f812, 0}},
- {{0, 0xfff539d6, 0x0037792c, 0}}, {{0, 0xfff520c3, 0x0037fa46, 0}},
- {{0, 0xfff507b0, 0x00387b60, 0}}, {{0, 0xfff4ee9d, 0x0038fc7a, 0}},
- {{0, 0xfff4d58a, 0x00397d94, 0}}, {{0, 0xfff4bc77, 0x0039feae, 0}},
- {{0, 0xfff4a364, 0x003a7fc8, 0}}, {{0, 0xfff48a51, 0x003b00e2, 0}},
- {{0, 0xfff4713e, 0x003b81fc, 0}}, {{0, 0xfff4582b, 0x003c0316, 0}},
- {{0, 0xfff43f18, 0x003c8430, 0}}, {{0, 0xfff42605, 0x003d054a, 0}},
- {{0, 0xfff40cf2, 0x003d8664, 0}}, {{0, 0xfff3f3df, 0x003e077e, 0}},
- {{0, 0xfff3dacc, 0x003e8898, 0}}, {{0, 0xfff3c1b9, 0x003f09b2, 0}},
- {{0, 0xfff3a8a6, 0x003f8acc, 0}}, {{0, 0xfff38f93, 0x00400be6, 0}}
-};
-
-static VP8kCstSSE2 VP8kVtoRGBA[256] = {
- {{0xffcced80, 0x001a0400, 0, 0}}, {{0xffcd53a5, 0x0019cff8, 0, 0}},
- {{0xffcdb9ca, 0x00199bf0, 0, 0}}, {{0xffce1fef, 0x001967e8, 0, 0}},
- {{0xffce8614, 0x001933e0, 0, 0}}, {{0xffceec39, 0x0018ffd8, 0, 0}},
- {{0xffcf525e, 0x0018cbd0, 0, 0}}, {{0xffcfb883, 0x001897c8, 0, 0}},
- {{0xffd01ea8, 0x001863c0, 0, 0}}, {{0xffd084cd, 0x00182fb8, 0, 0}},
- {{0xffd0eaf2, 0x0017fbb0, 0, 0}}, {{0xffd15117, 0x0017c7a8, 0, 0}},
- {{0xffd1b73c, 0x001793a0, 0, 0}}, {{0xffd21d61, 0x00175f98, 0, 0}},
- {{0xffd28386, 0x00172b90, 0, 0}}, {{0xffd2e9ab, 0x0016f788, 0, 0}},
- {{0xffd34fd0, 0x0016c380, 0, 0}}, {{0xffd3b5f5, 0x00168f78, 0, 0}},
- {{0xffd41c1a, 0x00165b70, 0, 0}}, {{0xffd4823f, 0x00162768, 0, 0}},
- {{0xffd4e864, 0x0015f360, 0, 0}}, {{0xffd54e89, 0x0015bf58, 0, 0}},
- {{0xffd5b4ae, 0x00158b50, 0, 0}}, {{0xffd61ad3, 0x00155748, 0, 0}},
- {{0xffd680f8, 0x00152340, 0, 0}}, {{0xffd6e71d, 0x0014ef38, 0, 0}},
- {{0xffd74d42, 0x0014bb30, 0, 0}}, {{0xffd7b367, 0x00148728, 0, 0}},
- {{0xffd8198c, 0x00145320, 0, 0}}, {{0xffd87fb1, 0x00141f18, 0, 0}},
- {{0xffd8e5d6, 0x0013eb10, 0, 0}}, {{0xffd94bfb, 0x0013b708, 0, 0}},
- {{0xffd9b220, 0x00138300, 0, 0}}, {{0xffda1845, 0x00134ef8, 0, 0}},
- {{0xffda7e6a, 0x00131af0, 0, 0}}, {{0xffdae48f, 0x0012e6e8, 0, 0}},
- {{0xffdb4ab4, 0x0012b2e0, 0, 0}}, {{0xffdbb0d9, 0x00127ed8, 0, 0}},
- {{0xffdc16fe, 0x00124ad0, 0, 0}}, {{0xffdc7d23, 0x001216c8, 0, 0}},
- {{0xffdce348, 0x0011e2c0, 0, 0}}, {{0xffdd496d, 0x0011aeb8, 0, 0}},
- {{0xffddaf92, 0x00117ab0, 0, 0}}, {{0xffde15b7, 0x001146a8, 0, 0}},
- {{0xffde7bdc, 0x001112a0, 0, 0}}, {{0xffdee201, 0x0010de98, 0, 0}},
- {{0xffdf4826, 0x0010aa90, 0, 0}}, {{0xffdfae4b, 0x00107688, 0, 0}},
- {{0xffe01470, 0x00104280, 0, 0}}, {{0xffe07a95, 0x00100e78, 0, 0}},
- {{0xffe0e0ba, 0x000fda70, 0, 0}}, {{0xffe146df, 0x000fa668, 0, 0}},
- {{0xffe1ad04, 0x000f7260, 0, 0}}, {{0xffe21329, 0x000f3e58, 0, 0}},
- {{0xffe2794e, 0x000f0a50, 0, 0}}, {{0xffe2df73, 0x000ed648, 0, 0}},
- {{0xffe34598, 0x000ea240, 0, 0}}, {{0xffe3abbd, 0x000e6e38, 0, 0}},
- {{0xffe411e2, 0x000e3a30, 0, 0}}, {{0xffe47807, 0x000e0628, 0, 0}},
- {{0xffe4de2c, 0x000dd220, 0, 0}}, {{0xffe54451, 0x000d9e18, 0, 0}},
- {{0xffe5aa76, 0x000d6a10, 0, 0}}, {{0xffe6109b, 0x000d3608, 0, 0}},
- {{0xffe676c0, 0x000d0200, 0, 0}}, {{0xffe6dce5, 0x000ccdf8, 0, 0}},
- {{0xffe7430a, 0x000c99f0, 0, 0}}, {{0xffe7a92f, 0x000c65e8, 0, 0}},
- {{0xffe80f54, 0x000c31e0, 0, 0}}, {{0xffe87579, 0x000bfdd8, 0, 0}},
- {{0xffe8db9e, 0x000bc9d0, 0, 0}}, {{0xffe941c3, 0x000b95c8, 0, 0}},
- {{0xffe9a7e8, 0x000b61c0, 0, 0}}, {{0xffea0e0d, 0x000b2db8, 0, 0}},
- {{0xffea7432, 0x000af9b0, 0, 0}}, {{0xffeada57, 0x000ac5a8, 0, 0}},
- {{0xffeb407c, 0x000a91a0, 0, 0}}, {{0xffeba6a1, 0x000a5d98, 0, 0}},
- {{0xffec0cc6, 0x000a2990, 0, 0}}, {{0xffec72eb, 0x0009f588, 0, 0}},
- {{0xffecd910, 0x0009c180, 0, 0}}, {{0xffed3f35, 0x00098d78, 0, 0}},
- {{0xffeda55a, 0x00095970, 0, 0}}, {{0xffee0b7f, 0x00092568, 0, 0}},
- {{0xffee71a4, 0x0008f160, 0, 0}}, {{0xffeed7c9, 0x0008bd58, 0, 0}},
- {{0xffef3dee, 0x00088950, 0, 0}}, {{0xffefa413, 0x00085548, 0, 0}},
- {{0xfff00a38, 0x00082140, 0, 0}}, {{0xfff0705d, 0x0007ed38, 0, 0}},
- {{0xfff0d682, 0x0007b930, 0, 0}}, {{0xfff13ca7, 0x00078528, 0, 0}},
- {{0xfff1a2cc, 0x00075120, 0, 0}}, {{0xfff208f1, 0x00071d18, 0, 0}},
- {{0xfff26f16, 0x0006e910, 0, 0}}, {{0xfff2d53b, 0x0006b508, 0, 0}},
- {{0xfff33b60, 0x00068100, 0, 0}}, {{0xfff3a185, 0x00064cf8, 0, 0}},
- {{0xfff407aa, 0x000618f0, 0, 0}}, {{0xfff46dcf, 0x0005e4e8, 0, 0}},
- {{0xfff4d3f4, 0x0005b0e0, 0, 0}}, {{0xfff53a19, 0x00057cd8, 0, 0}},
- {{0xfff5a03e, 0x000548d0, 0, 0}}, {{0xfff60663, 0x000514c8, 0, 0}},
- {{0xfff66c88, 0x0004e0c0, 0, 0}}, {{0xfff6d2ad, 0x0004acb8, 0, 0}},
- {{0xfff738d2, 0x000478b0, 0, 0}}, {{0xfff79ef7, 0x000444a8, 0, 0}},
- {{0xfff8051c, 0x000410a0, 0, 0}}, {{0xfff86b41, 0x0003dc98, 0, 0}},
- {{0xfff8d166, 0x0003a890, 0, 0}}, {{0xfff9378b, 0x00037488, 0, 0}},
- {{0xfff99db0, 0x00034080, 0, 0}}, {{0xfffa03d5, 0x00030c78, 0, 0}},
- {{0xfffa69fa, 0x0002d870, 0, 0}}, {{0xfffad01f, 0x0002a468, 0, 0}},
- {{0xfffb3644, 0x00027060, 0, 0}}, {{0xfffb9c69, 0x00023c58, 0, 0}},
- {{0xfffc028e, 0x00020850, 0, 0}}, {{0xfffc68b3, 0x0001d448, 0, 0}},
- {{0xfffcced8, 0x0001a040, 0, 0}}, {{0xfffd34fd, 0x00016c38, 0, 0}},
- {{0xfffd9b22, 0x00013830, 0, 0}}, {{0xfffe0147, 0x00010428, 0, 0}},
- {{0xfffe676c, 0x0000d020, 0, 0}}, {{0xfffecd91, 0x00009c18, 0, 0}},
- {{0xffff33b6, 0x00006810, 0, 0}}, {{0xffff99db, 0x00003408, 0, 0}},
- {{0x00000000, 0x00000000, 0, 0}}, {{0x00006625, 0xffffcbf8, 0, 0}},
- {{0x0000cc4a, 0xffff97f0, 0, 0}}, {{0x0001326f, 0xffff63e8, 0, 0}},
- {{0x00019894, 0xffff2fe0, 0, 0}}, {{0x0001feb9, 0xfffefbd8, 0, 0}},
- {{0x000264de, 0xfffec7d0, 0, 0}}, {{0x0002cb03, 0xfffe93c8, 0, 0}},
- {{0x00033128, 0xfffe5fc0, 0, 0}}, {{0x0003974d, 0xfffe2bb8, 0, 0}},
- {{0x0003fd72, 0xfffdf7b0, 0, 0}}, {{0x00046397, 0xfffdc3a8, 0, 0}},
- {{0x0004c9bc, 0xfffd8fa0, 0, 0}}, {{0x00052fe1, 0xfffd5b98, 0, 0}},
- {{0x00059606, 0xfffd2790, 0, 0}}, {{0x0005fc2b, 0xfffcf388, 0, 0}},
- {{0x00066250, 0xfffcbf80, 0, 0}}, {{0x0006c875, 0xfffc8b78, 0, 0}},
- {{0x00072e9a, 0xfffc5770, 0, 0}}, {{0x000794bf, 0xfffc2368, 0, 0}},
- {{0x0007fae4, 0xfffbef60, 0, 0}}, {{0x00086109, 0xfffbbb58, 0, 0}},
- {{0x0008c72e, 0xfffb8750, 0, 0}}, {{0x00092d53, 0xfffb5348, 0, 0}},
- {{0x00099378, 0xfffb1f40, 0, 0}}, {{0x0009f99d, 0xfffaeb38, 0, 0}},
- {{0x000a5fc2, 0xfffab730, 0, 0}}, {{0x000ac5e7, 0xfffa8328, 0, 0}},
- {{0x000b2c0c, 0xfffa4f20, 0, 0}}, {{0x000b9231, 0xfffa1b18, 0, 0}},
- {{0x000bf856, 0xfff9e710, 0, 0}}, {{0x000c5e7b, 0xfff9b308, 0, 0}},
- {{0x000cc4a0, 0xfff97f00, 0, 0}}, {{0x000d2ac5, 0xfff94af8, 0, 0}},
- {{0x000d90ea, 0xfff916f0, 0, 0}}, {{0x000df70f, 0xfff8e2e8, 0, 0}},
- {{0x000e5d34, 0xfff8aee0, 0, 0}}, {{0x000ec359, 0xfff87ad8, 0, 0}},
- {{0x000f297e, 0xfff846d0, 0, 0}}, {{0x000f8fa3, 0xfff812c8, 0, 0}},
- {{0x000ff5c8, 0xfff7dec0, 0, 0}}, {{0x00105bed, 0xfff7aab8, 0, 0}},
- {{0x0010c212, 0xfff776b0, 0, 0}}, {{0x00112837, 0xfff742a8, 0, 0}},
- {{0x00118e5c, 0xfff70ea0, 0, 0}}, {{0x0011f481, 0xfff6da98, 0, 0}},
- {{0x00125aa6, 0xfff6a690, 0, 0}}, {{0x0012c0cb, 0xfff67288, 0, 0}},
- {{0x001326f0, 0xfff63e80, 0, 0}}, {{0x00138d15, 0xfff60a78, 0, 0}},
- {{0x0013f33a, 0xfff5d670, 0, 0}}, {{0x0014595f, 0xfff5a268, 0, 0}},
- {{0x0014bf84, 0xfff56e60, 0, 0}}, {{0x001525a9, 0xfff53a58, 0, 0}},
- {{0x00158bce, 0xfff50650, 0, 0}}, {{0x0015f1f3, 0xfff4d248, 0, 0}},
- {{0x00165818, 0xfff49e40, 0, 0}}, {{0x0016be3d, 0xfff46a38, 0, 0}},
- {{0x00172462, 0xfff43630, 0, 0}}, {{0x00178a87, 0xfff40228, 0, 0}},
- {{0x0017f0ac, 0xfff3ce20, 0, 0}}, {{0x001856d1, 0xfff39a18, 0, 0}},
- {{0x0018bcf6, 0xfff36610, 0, 0}}, {{0x0019231b, 0xfff33208, 0, 0}},
- {{0x00198940, 0xfff2fe00, 0, 0}}, {{0x0019ef65, 0xfff2c9f8, 0, 0}},
- {{0x001a558a, 0xfff295f0, 0, 0}}, {{0x001abbaf, 0xfff261e8, 0, 0}},
- {{0x001b21d4, 0xfff22de0, 0, 0}}, {{0x001b87f9, 0xfff1f9d8, 0, 0}},
- {{0x001bee1e, 0xfff1c5d0, 0, 0}}, {{0x001c5443, 0xfff191c8, 0, 0}},
- {{0x001cba68, 0xfff15dc0, 0, 0}}, {{0x001d208d, 0xfff129b8, 0, 0}},
- {{0x001d86b2, 0xfff0f5b0, 0, 0}}, {{0x001decd7, 0xfff0c1a8, 0, 0}},
- {{0x001e52fc, 0xfff08da0, 0, 0}}, {{0x001eb921, 0xfff05998, 0, 0}},
- {{0x001f1f46, 0xfff02590, 0, 0}}, {{0x001f856b, 0xffeff188, 0, 0}},
- {{0x001feb90, 0xffefbd80, 0, 0}}, {{0x002051b5, 0xffef8978, 0, 0}},
- {{0x0020b7da, 0xffef5570, 0, 0}}, {{0x00211dff, 0xffef2168, 0, 0}},
- {{0x00218424, 0xffeeed60, 0, 0}}, {{0x0021ea49, 0xffeeb958, 0, 0}},
- {{0x0022506e, 0xffee8550, 0, 0}}, {{0x0022b693, 0xffee5148, 0, 0}},
- {{0x00231cb8, 0xffee1d40, 0, 0}}, {{0x002382dd, 0xffede938, 0, 0}},
- {{0x0023e902, 0xffedb530, 0, 0}}, {{0x00244f27, 0xffed8128, 0, 0}},
- {{0x0024b54c, 0xffed4d20, 0, 0}}, {{0x00251b71, 0xffed1918, 0, 0}},
- {{0x00258196, 0xffece510, 0, 0}}, {{0x0025e7bb, 0xffecb108, 0, 0}},
- {{0x00264de0, 0xffec7d00, 0, 0}}, {{0x0026b405, 0xffec48f8, 0, 0}},
- {{0x00271a2a, 0xffec14f0, 0, 0}}, {{0x0027804f, 0xffebe0e8, 0, 0}},
- {{0x0027e674, 0xffebace0, 0, 0}}, {{0x00284c99, 0xffeb78d8, 0, 0}},
- {{0x0028b2be, 0xffeb44d0, 0, 0}}, {{0x002918e3, 0xffeb10c8, 0, 0}},
- {{0x00297f08, 0xffeadcc0, 0, 0}}, {{0x0029e52d, 0xffeaa8b8, 0, 0}},
- {{0x002a4b52, 0xffea74b0, 0, 0}}, {{0x002ab177, 0xffea40a8, 0, 0}},
- {{0x002b179c, 0xffea0ca0, 0, 0}}, {{0x002b7dc1, 0xffe9d898, 0, 0}},
- {{0x002be3e6, 0xffe9a490, 0, 0}}, {{0x002c4a0b, 0xffe97088, 0, 0}},
- {{0x002cb030, 0xffe93c80, 0, 0}}, {{0x002d1655, 0xffe90878, 0, 0}},
- {{0x002d7c7a, 0xffe8d470, 0, 0}}, {{0x002de29f, 0xffe8a068, 0, 0}},
- {{0x002e48c4, 0xffe86c60, 0, 0}}, {{0x002eaee9, 0xffe83858, 0, 0}},
- {{0x002f150e, 0xffe80450, 0, 0}}, {{0x002f7b33, 0xffe7d048, 0, 0}},
- {{0x002fe158, 0xffe79c40, 0, 0}}, {{0x0030477d, 0xffe76838, 0, 0}},
- {{0x0030ada2, 0xffe73430, 0, 0}}, {{0x003113c7, 0xffe70028, 0, 0}},
- {{0x003179ec, 0xffe6cc20, 0, 0}}, {{0x0031e011, 0xffe69818, 0, 0}},
- {{0x00324636, 0xffe66410, 0, 0}}, {{0x0032ac5b, 0xffe63008, 0, 0}}
-};
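The two 256-entry tables removed above are pure linear ramps: consecutive rows of the first table differ by exactly -0x1913 (-6419) in the green field and +0x811a (33050) in the blue field, and consecutive rows of VP8kVtoRGBA differ by +0x6625 (26149) in the red field and -0x3408 (-13320) in the green field. Divided by 2^14 these steps are approximately 0.391, 2.018, 1.596 and 0.813, i.e. the usual BT.601 limited-range chroma coefficients. The sketch below regenerates both tables from those steps; the 14-bit fixed-point reading, the assumption that the first table is the U-channel counterpart of VP8kVtoRGBA (its declaration precedes this hunk), and the output layout are inferences from the data above, not something stated in the diff itself.

    /* Sketch: regenerate the removed chroma bias tables.
     * Step constants are read directly off adjacent rows of the diff. */
    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
      int c;
      /* First table (assumed U channel): entries are {0, G, B, 0}. */
      for (c = 0; c < 256; ++c) {
        const int32_t g = -6419 * (c - 128);  /* ~ -0.391 * (u - 128), <<14 */
        const int32_t b = 33050 * (c - 128);  /* ~  2.018 * (u - 128), <<14 */
        printf("{{0, 0x%08" PRIx32 ", 0x%08" PRIx32 ", 0}},\n",
               (uint32_t)g, (uint32_t)b);
      }
      /* VP8kVtoRGBA: entries are {R, G, 0, 0}. */
      for (c = 0; c < 256; ++c) {
        const int32_t r = 26149 * (c - 128);   /* ~  1.596 * (v - 128), <<14 */
        const int32_t g = -13320 * (c - 128);  /* ~ -0.813 * (v - 128), <<14 */
        printf("{{0x%08" PRIx32 ", 0x%08" PRIx32 ", 0, 0}},\n",
               (uint32_t)r, (uint32_t)g);
      }
      return 0;
    }

As a check, row 128 of VP8kVtoRGBA is {{0x00000000, 0x00000000, 0, 0}} (zero chroma offset at v == 128), and row 0 gives 26149 * -128 = -3347072 = 0xffcced80, matching the table's first entry above.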