author     Liang Qi <liang.qi@qt.io>   2017-03-07 13:05:21 +0100
committer  Liang Qi <liang.qi@qt.io>   2017-03-13 10:47:45 +0000
commit     b7ec9e78633d8f2c75a8b02e17e169497bb103e2 (patch)
tree       e4be04af4dbcf8cd635715efdf4e769281183746 /src/3rdparty/libwebp/src
parent     f2dbc67c2b032a5f27d0224e020fb6dfcd3fd142 (diff)
Bundled libwebp updated to version 0.6.0
This commit imports libwebp 0.6.0, including the AUTHORS, COPYING, ChangeLog, NEWS, PATENTS and README files and the src directory. In src, only header and source files are included. Upstream changes since 0.5.1 have been merged in. The version in qt_attribution.json was also updated.

Conflicts:
        src/3rdparty/libwebp.pri
        src/3rdparty/libwebp/qt_attribution.json
        src/3rdparty/libwebp/src/webp/config.h

Change-Id: I001aa7a3fabf0130b54f9005c23aa822bc1d0ec1
Reviewed-by: Eirik Aavitsland <eirik.aavitsland@qt.io>
Diffstat (limited to 'src/3rdparty/libwebp/src')
-rw-r--r--  src/3rdparty/libwebp/src/dec/alpha_dec.c (renamed from src/3rdparty/libwebp/src/dec/alpha.c)  10
-rw-r--r--  src/3rdparty/libwebp/src/dec/alphai_dec.h (renamed from src/3rdparty/libwebp/src/dec/alphai.h)  4
-rw-r--r--  src/3rdparty/libwebp/src/dec/buffer_dec.c (renamed from src/3rdparty/libwebp/src/dec/buffer.c)  4
-rw-r--r--  src/3rdparty/libwebp/src/dec/common_dec.h (renamed from src/3rdparty/libwebp/src/dec/common.h)  0
-rw-r--r--  src/3rdparty/libwebp/src/dec/frame_dec.c (renamed from src/3rdparty/libwebp/src/dec/frame.c)  4
-rw-r--r--  src/3rdparty/libwebp/src/dec/idec_dec.c (renamed from src/3rdparty/libwebp/src/dec/idec.c)  6
-rw-r--r--  src/3rdparty/libwebp/src/dec/io_dec.c (renamed from src/3rdparty/libwebp/src/dec/io.c)  113
-rw-r--r--  src/3rdparty/libwebp/src/dec/quant_dec.c (renamed from src/3rdparty/libwebp/src/dec/quant.c)  2
-rw-r--r--  src/3rdparty/libwebp/src/dec/tree_dec.c (renamed from src/3rdparty/libwebp/src/dec/tree.c)  7
-rw-r--r--  src/3rdparty/libwebp/src/dec/vp8_dec.c (renamed from src/3rdparty/libwebp/src/dec/vp8.c)  72
-rw-r--r--  src/3rdparty/libwebp/src/dec/vp8_dec.h (renamed from src/3rdparty/libwebp/src/dec/decode_vp8.h)  0
-rw-r--r--  src/3rdparty/libwebp/src/dec/vp8i_dec.h (renamed from src/3rdparty/libwebp/src/dec/vp8i.h)  14
-rw-r--r--  src/3rdparty/libwebp/src/dec/vp8l_dec.c (renamed from src/3rdparty/libwebp/src/dec/vp8l.c)  37
-rw-r--r--  src/3rdparty/libwebp/src/dec/vp8li_dec.h (renamed from src/3rdparty/libwebp/src/dec/vp8li.h)  8
-rw-r--r--  src/3rdparty/libwebp/src/dec/webp_dec.c (renamed from src/3rdparty/libwebp/src/dec/webp.c)  17
-rw-r--r--  src/3rdparty/libwebp/src/dec/webpi_dec.h (renamed from src/3rdparty/libwebp/src/dec/webpi.h)  17
-rw-r--r--  src/3rdparty/libwebp/src/demux/anim_decode.c  48
-rw-r--r--  src/3rdparty/libwebp/src/demux/demux.c  5
-rw-r--r--  src/3rdparty/libwebp/src/dsp/alpha_processing.c  38
-rw-r--r--  src/3rdparty/libwebp/src/dsp/alpha_processing_neon.c  191
-rw-r--r--  src/3rdparty/libwebp/src/dsp/alpha_processing_sse2.c  141
-rw-r--r--  src/3rdparty/libwebp/src/dsp/common_sse2.h  85
-rw-r--r--  src/3rdparty/libwebp/src/dsp/cost.c  2
-rw-r--r--  src/3rdparty/libwebp/src/dsp/cost_mips32.c  2
-rw-r--r--  src/3rdparty/libwebp/src/dsp/cost_mips_dsp_r2.c  2
-rw-r--r--  src/3rdparty/libwebp/src/dsp/cost_sse2.c  4
-rw-r--r--  src/3rdparty/libwebp/src/dsp/cpu.c  45
-rw-r--r--  src/3rdparty/libwebp/src/dsp/dec.c  4
-rw-r--r--  src/3rdparty/libwebp/src/dsp/dec_clip_tables.c  8
-rw-r--r--  src/3rdparty/libwebp/src/dsp/dec_msa.c  847
-rw-r--r--  src/3rdparty/libwebp/src/dsp/dec_neon.c  40
-rw-r--r--  src/3rdparty/libwebp/src/dsp/dec_sse2.c  4
-rw-r--r--  src/3rdparty/libwebp/src/dsp/dec_sse41.c  2
-rw-r--r--  src/3rdparty/libwebp/src/dsp/dsp.h  57
-rw-r--r--  src/3rdparty/libwebp/src/dsp/enc.c  180
-rw-r--r--  src/3rdparty/libwebp/src/dsp/enc_mips32.c  4
-rw-r--r--  src/3rdparty/libwebp/src/dsp/enc_mips_dsp_r2.c  4
-rw-r--r--  src/3rdparty/libwebp/src/dsp/enc_msa.c  892
-rw-r--r--  src/3rdparty/libwebp/src/dsp/enc_neon.c  42
-rw-r--r--  src/3rdparty/libwebp/src/dsp/enc_sse2.c  182
-rw-r--r--  src/3rdparty/libwebp/src/dsp/enc_sse41.c  2
-rw-r--r--  src/3rdparty/libwebp/src/dsp/filters.c  12
-rw-r--r--  src/3rdparty/libwebp/src/dsp/filters_msa.c  202
-rw-r--r--  src/3rdparty/libwebp/src/dsp/filters_neon.c  327
-rw-r--r--  src/3rdparty/libwebp/src/dsp/lossless.c  186
-rw-r--r--  src/3rdparty/libwebp/src/dsp/lossless.h  225
-rw-r--r--  src/3rdparty/libwebp/src/dsp/lossless_common.h  210
-rw-r--r--  src/3rdparty/libwebp/src/dsp/lossless_enc.c  953
-rw-r--r--  src/3rdparty/libwebp/src/dsp/lossless_enc_mips32.c  47
-rw-r--r--  src/3rdparty/libwebp/src/dsp/lossless_enc_msa.c  147
-rw-r--r--  src/3rdparty/libwebp/src/dsp/lossless_enc_sse2.c  320
-rw-r--r--  src/3rdparty/libwebp/src/dsp/lossless_enc_sse41.c  4
-rw-r--r--  src/3rdparty/libwebp/src/dsp/lossless_mips_dsp_r2.c  79
-rw-r--r--  src/3rdparty/libwebp/src/dsp/lossless_msa.c  355
-rw-r--r--  src/3rdparty/libwebp/src/dsp/lossless_neon.c  395
-rw-r--r--  src/3rdparty/libwebp/src/dsp/lossless_sse2.c  389
-rw-r--r--  src/3rdparty/libwebp/src/dsp/msa_macro.h  1097
-rw-r--r--  src/3rdparty/libwebp/src/dsp/neon.h  18
-rw-r--r--  src/3rdparty/libwebp/src/dsp/rescaler.c  12
-rw-r--r--  src/3rdparty/libwebp/src/dsp/rescaler_mips32.c  2
-rw-r--r--  src/3rdparty/libwebp/src/dsp/rescaler_mips_dsp_r2.c  2
-rw-r--r--  src/3rdparty/libwebp/src/dsp/rescaler_msa.c  444
-rw-r--r--  src/3rdparty/libwebp/src/dsp/rescaler_neon.c  2
-rw-r--r--  src/3rdparty/libwebp/src/dsp/rescaler_sse2.c  2
-rw-r--r--  src/3rdparty/libwebp/src/dsp/upsampling.c  6
-rw-r--r--  src/3rdparty/libwebp/src/dsp/upsampling_msa.c  678
-rw-r--r--  src/3rdparty/libwebp/src/dsp/upsampling_neon.c  77
-rw-r--r--  src/3rdparty/libwebp/src/dsp/yuv.c  57
-rw-r--r--  src/3rdparty/libwebp/src/dsp/yuv.h  2
-rw-r--r--  src/3rdparty/libwebp/src/dsp/yuv_sse2.c  304
-rw-r--r--  src/3rdparty/libwebp/src/enc/alpha_enc.c (renamed from src/3rdparty/libwebp/src/enc/alpha.c)  8
-rw-r--r--  src/3rdparty/libwebp/src/enc/analysis_enc.c (renamed from src/3rdparty/libwebp/src/enc/analysis.c)  52
-rw-r--r--  src/3rdparty/libwebp/src/enc/backward_references_enc.c (renamed from src/3rdparty/libwebp/src/enc/backward_references.c)  327
-rw-r--r--  src/3rdparty/libwebp/src/enc/backward_references_enc.h (renamed from src/3rdparty/libwebp/src/enc/backward_references.h)  3
-rw-r--r--  src/3rdparty/libwebp/src/enc/config_enc.c (renamed from src/3rdparty/libwebp/src/enc/config.c)  91
-rw-r--r--  src/3rdparty/libwebp/src/enc/cost_enc.c (renamed from src/3rdparty/libwebp/src/enc/cost.c)  44
-rw-r--r--  src/3rdparty/libwebp/src/enc/cost_enc.h (renamed from src/3rdparty/libwebp/src/enc/cost.h)  16
-rw-r--r--  src/3rdparty/libwebp/src/enc/delta_palettization_enc.c (renamed from src/3rdparty/libwebp/src/enc/delta_palettization.c)  2
-rw-r--r--  src/3rdparty/libwebp/src/enc/delta_palettization_enc.h (renamed from src/3rdparty/libwebp/src/enc/delta_palettization.h)  2
-rw-r--r--  src/3rdparty/libwebp/src/enc/filter_enc.c (renamed from src/3rdparty/libwebp/src/enc/filter.c)  107
-rw-r--r--  src/3rdparty/libwebp/src/enc/frame_enc.c (renamed from src/3rdparty/libwebp/src/enc/frame.c)  32
-rw-r--r--  src/3rdparty/libwebp/src/enc/histogram_enc.c (renamed from src/3rdparty/libwebp/src/enc/histogram.c)  311
-rw-r--r--  src/3rdparty/libwebp/src/enc/histogram_enc.h (renamed from src/3rdparty/libwebp/src/enc/histogram.h)  2
-rw-r--r--  src/3rdparty/libwebp/src/enc/iterator_enc.c (renamed from src/3rdparty/libwebp/src/enc/iterator.c)  19
-rw-r--r--  src/3rdparty/libwebp/src/enc/near_lossless_enc.c (renamed from src/3rdparty/libwebp/src/enc/near_lossless.c)  4
-rw-r--r--  src/3rdparty/libwebp/src/enc/picture_csp_enc.c (renamed from src/3rdparty/libwebp/src/enc/picture_csp.c)  451
-rw-r--r--  src/3rdparty/libwebp/src/enc/picture_enc.c (renamed from src/3rdparty/libwebp/src/enc/picture.c)  7
-rw-r--r--  src/3rdparty/libwebp/src/enc/picture_psnr.c  177
-rw-r--r--  src/3rdparty/libwebp/src/enc/picture_psnr_enc.c  213
-rw-r--r--  src/3rdparty/libwebp/src/enc/picture_rescale_enc.c (renamed from src/3rdparty/libwebp/src/enc/picture_rescale.c)  4
-rw-r--r--  src/3rdparty/libwebp/src/enc/picture_tools_enc.c (renamed from src/3rdparty/libwebp/src/enc/picture_tools.c)  2
-rw-r--r--  src/3rdparty/libwebp/src/enc/predictor_enc.c  750
-rw-r--r--  src/3rdparty/libwebp/src/enc/quant_enc.c (renamed from src/3rdparty/libwebp/src/enc/quant.c)  34
-rw-r--r--  src/3rdparty/libwebp/src/enc/syntax_enc.c (renamed from src/3rdparty/libwebp/src/enc/syntax.c)  5
-rw-r--r--  src/3rdparty/libwebp/src/enc/token_enc.c (renamed from src/3rdparty/libwebp/src/enc/token.c)  59
-rw-r--r--  src/3rdparty/libwebp/src/enc/tree_enc.c (renamed from src/3rdparty/libwebp/src/enc/tree.c)  2
-rw-r--r--  src/3rdparty/libwebp/src/enc/vp8i_enc.h (renamed from src/3rdparty/libwebp/src/enc/vp8enci.h)  23
-rw-r--r--  src/3rdparty/libwebp/src/enc/vp8l_enc.c (renamed from src/3rdparty/libwebp/src/enc/vp8l.c)  264
-rw-r--r--  src/3rdparty/libwebp/src/enc/vp8li_enc.h (renamed from src/3rdparty/libwebp/src/enc/vp8li.h)  22
-rw-r--r--  src/3rdparty/libwebp/src/enc/webp_enc.c (renamed from src/3rdparty/libwebp/src/enc/webpenc.c)  27
-rw-r--r--  src/3rdparty/libwebp/src/extras/extras.c  111
-rw-r--r--  src/3rdparty/libwebp/src/mux/anim_encode.c  91
-rw-r--r--  src/3rdparty/libwebp/src/mux/animi.h  43
-rw-r--r--  src/3rdparty/libwebp/src/mux/muxedit.c  157
-rw-r--r--  src/3rdparty/libwebp/src/mux/muxi.h  16
-rw-r--r--  src/3rdparty/libwebp/src/mux/muxinternal.c  25
-rw-r--r--  src/3rdparty/libwebp/src/mux/muxread.c  48
-rw-r--r--  src/3rdparty/libwebp/src/utils/bit_reader_inl_utils.h (renamed from src/3rdparty/libwebp/src/utils/bit_reader_inl.h)  79
-rw-r--r--  src/3rdparty/libwebp/src/utils/bit_reader_utils.c (renamed from src/3rdparty/libwebp/src/utils/bit_reader.c)  2
-rw-r--r--  src/3rdparty/libwebp/src/utils/bit_reader_utils.h (renamed from src/3rdparty/libwebp/src/utils/bit_reader.h)  0
-rw-r--r--  src/3rdparty/libwebp/src/utils/bit_writer_utils.c (renamed from src/3rdparty/libwebp/src/utils/bit_writer.c)  10
-rw-r--r--  src/3rdparty/libwebp/src/utils/bit_writer_utils.h (renamed from src/3rdparty/libwebp/src/utils/bit_writer.h)  3
-rw-r--r--  src/3rdparty/libwebp/src/utils/color_cache_utils.c (renamed from src/3rdparty/libwebp/src/utils/color_cache.c)  2
-rw-r--r--  src/3rdparty/libwebp/src/utils/color_cache_utils.h (renamed from src/3rdparty/libwebp/src/utils/color_cache.h)  15
-rw-r--r--  src/3rdparty/libwebp/src/utils/endian_inl_utils.h (renamed from src/3rdparty/libwebp/src/utils/endian_inl.h)  0
-rw-r--r--  src/3rdparty/libwebp/src/utils/filters_utils.c (renamed from src/3rdparty/libwebp/src/utils/filters.c)  2
-rw-r--r--  src/3rdparty/libwebp/src/utils/filters_utils.h (renamed from src/3rdparty/libwebp/src/utils/filters.h)  0
-rw-r--r--  src/3rdparty/libwebp/src/utils/huffman_encode_utils.c (renamed from src/3rdparty/libwebp/src/utils/huffman_encode.c)  2
-rw-r--r--  src/3rdparty/libwebp/src/utils/huffman_encode_utils.h (renamed from src/3rdparty/libwebp/src/utils/huffman_encode.h)  0
-rw-r--r--  src/3rdparty/libwebp/src/utils/huffman_utils.c (renamed from src/3rdparty/libwebp/src/utils/huffman.c)  48
-rw-r--r--  src/3rdparty/libwebp/src/utils/huffman_utils.h (renamed from src/3rdparty/libwebp/src/utils/huffman.h)  0
-rw-r--r--  src/3rdparty/libwebp/src/utils/quant_levels_dec_utils.c (renamed from src/3rdparty/libwebp/src/utils/quant_levels_dec.c)  2
-rw-r--r--  src/3rdparty/libwebp/src/utils/quant_levels_dec_utils.h (renamed from src/3rdparty/libwebp/src/utils/quant_levels_dec.h)  0
-rw-r--r--  src/3rdparty/libwebp/src/utils/quant_levels_utils.c (renamed from src/3rdparty/libwebp/src/utils/quant_levels.c)  2
-rw-r--r--  src/3rdparty/libwebp/src/utils/quant_levels_utils.h (renamed from src/3rdparty/libwebp/src/utils/quant_levels.h)  0
-rw-r--r--  src/3rdparty/libwebp/src/utils/random_utils.c (renamed from src/3rdparty/libwebp/src/utils/random.c)  2
-rw-r--r--  src/3rdparty/libwebp/src/utils/random_utils.h (renamed from src/3rdparty/libwebp/src/utils/random.h)  0
-rw-r--r--  src/3rdparty/libwebp/src/utils/rescaler_utils.c (renamed from src/3rdparty/libwebp/src/utils/rescaler.c)  10
-rw-r--r--  src/3rdparty/libwebp/src/utils/rescaler_utils.h (renamed from src/3rdparty/libwebp/src/utils/rescaler.h)  0
-rw-r--r--  src/3rdparty/libwebp/src/utils/thread_utils.c (renamed from src/3rdparty/libwebp/src/utils/thread.c)  8
-rw-r--r--  src/3rdparty/libwebp/src/utils/thread_utils.h (renamed from src/3rdparty/libwebp/src/utils/thread.h)  0
-rw-r--r--  src/3rdparty/libwebp/src/utils/utils.c  45
-rw-r--r--  src/3rdparty/libwebp/src/utils/utils.h  60
-rw-r--r--  src/3rdparty/libwebp/src/webp/config.h  6
-rw-r--r--  src/3rdparty/libwebp/src/webp/decode.h  16
-rw-r--r--  src/3rdparty/libwebp/src/webp/encode.h  43
-rw-r--r--  src/3rdparty/libwebp/src/webp/extras.h  51
-rw-r--r--  src/3rdparty/libwebp/src/webp/format_constants.h  3
-rw-r--r--  src/3rdparty/libwebp/src/webp/mux.h  50
-rw-r--r--  src/3rdparty/libwebp/src/webp/mux_types.h  5
140 files changed, 10406 insertions, 3311 deletions
diff --git a/src/3rdparty/libwebp/src/dec/alpha.c b/src/3rdparty/libwebp/src/dec/alpha_dec.c
index 028eb3d..83ffd4b 100644
--- a/src/3rdparty/libwebp/src/dec/alpha.c
+++ b/src/3rdparty/libwebp/src/dec/alpha_dec.c
@@ -12,11 +12,11 @@
// Author: Skal (pascal.massimino@gmail.com)
#include <stdlib.h>
-#include "./alphai.h"
-#include "./vp8i.h"
-#include "./vp8li.h"
+#include "./alphai_dec.h"
+#include "./vp8i_dec.h"
+#include "./vp8li_dec.h"
#include "../dsp/dsp.h"
-#include "../utils/quant_levels_dec.h"
+#include "../utils/quant_levels_dec_utils.h"
#include "../utils/utils.h"
#include "../webp/format_constants.h"
@@ -67,7 +67,7 @@ static int ALPHInit(ALPHDecoder* const dec, const uint8_t* data,
}
dec->method_ = (data[0] >> 0) & 0x03;
- dec->filter_ = (data[0] >> 2) & 0x03;
+ dec->filter_ = (WEBP_FILTER_TYPE)((data[0] >> 2) & 0x03);
dec->pre_processing_ = (data[0] >> 4) & 0x03;
rsrv = (data[0] >> 6) & 0x03;
if (dec->method_ < ALPHA_NO_COMPRESSION ||
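The only functional change in this hunk is the explicit WEBP_FILTER_TYPE cast on the two-bit filter field, which keeps the assignment warning-free when the file is compiled as C++. The hunk also shows how the first byte of an ALPH chunk header decomposes; a minimal standalone sketch (the example byte value is made up, not taken from a real bitstream):

```c
/* Sketch: unpacking the first ALPH header byte, mirroring the fields read
 * in the hunk above. The header byte is an illustrative value. */
#include <stdint.h>
#include <stdio.h>

int main(void) {
  const uint8_t hdr = 0x09;  /* example: 0b00001001 */
  const int method         = (hdr >> 0) & 0x03;  /* compression method */
  const int filter         = (hdr >> 2) & 0x03;  /* WEBP_FILTER_TYPE   */
  const int pre_processing = (hdr >> 4) & 0x03;
  const int reserved       = (hdr >> 6) & 0x03;  /* must be zero */
  printf("method=%d filter=%d pre_processing=%d reserved=%d\n",
         method, filter, pre_processing, reserved);
  return 0;
}
```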
diff --git a/src/3rdparty/libwebp/src/dec/alphai.h b/src/3rdparty/libwebp/src/dec/alphai_dec.h
index 69dd7c0..561e815 100644
--- a/src/3rdparty/libwebp/src/dec/alphai.h
+++ b/src/3rdparty/libwebp/src/dec/alphai_dec.h
@@ -14,8 +14,8 @@
#ifndef WEBP_DEC_ALPHAI_H_
#define WEBP_DEC_ALPHAI_H_
-#include "./webpi.h"
-#include "../utils/filters.h"
+#include "./webpi_dec.h"
+#include "../utils/filters_utils.h"
#ifdef __cplusplus
extern "C" {
diff --git a/src/3rdparty/libwebp/src/dec/buffer.c b/src/3rdparty/libwebp/src/dec/buffer_dec.c
index 547e69b..c685fd5 100644
--- a/src/3rdparty/libwebp/src/dec/buffer.c
+++ b/src/3rdparty/libwebp/src/dec/buffer_dec.c
@@ -13,8 +13,8 @@
#include <stdlib.h>
-#include "./vp8i.h"
-#include "./webpi.h"
+#include "./vp8i_dec.h"
+#include "./webpi_dec.h"
#include "../utils/utils.h"
//------------------------------------------------------------------------------
diff --git a/src/3rdparty/libwebp/src/dec/common.h b/src/3rdparty/libwebp/src/dec/common_dec.h
index 6961e22..6961e22 100644
--- a/src/3rdparty/libwebp/src/dec/common.h
+++ b/src/3rdparty/libwebp/src/dec/common_dec.h
diff --git a/src/3rdparty/libwebp/src/dec/frame.c b/src/3rdparty/libwebp/src/dec/frame_dec.c
index 22d291d..f91e27f 100644
--- a/src/3rdparty/libwebp/src/dec/frame.c
+++ b/src/3rdparty/libwebp/src/dec/frame_dec.c
@@ -12,7 +12,7 @@
// Author: Skal (pascal.massimino@gmail.com)
#include <stdlib.h>
-#include "./vp8i.h"
+#include "./vp8i_dec.h"
#include "../utils/utils.h"
//------------------------------------------------------------------------------
@@ -723,7 +723,7 @@ static int AllocateMemory(VP8Decoder* const dec) {
return VP8SetError(dec, VP8_STATUS_OUT_OF_MEMORY,
"no memory during frame initialization.");
}
- // down-cast is ok, thanks to WebPSafeAlloc() above.
+ // down-cast is ok, thanks to WebPSafeMalloc() above.
dec->mem_size_ = (size_t)needed;
}
diff --git a/src/3rdparty/libwebp/src/dec/idec.c b/src/3rdparty/libwebp/src/dec/idec_dec.c
index 8de1319..78fb2e7 100644
--- a/src/3rdparty/libwebp/src/dec/idec.c
+++ b/src/3rdparty/libwebp/src/dec/idec_dec.c
@@ -15,9 +15,9 @@
#include <string.h>
#include <stdlib.h>
-#include "./alphai.h"
-#include "./webpi.h"
-#include "./vp8i.h"
+#include "./alphai_dec.h"
+#include "./webpi_dec.h"
+#include "./vp8i_dec.h"
#include "../utils/utils.h"
// In append mode, buffer allocations increase as multiples of this value.
diff --git a/src/3rdparty/libwebp/src/dec/io.c b/src/3rdparty/libwebp/src/dec/io_dec.c
index 8d5c43f..8bfab86 100644
--- a/src/3rdparty/libwebp/src/dec/io.c
+++ b/src/3rdparty/libwebp/src/dec/io_dec.c
@@ -13,8 +13,8 @@
#include <assert.h>
#include <stdlib.h>
-#include "../dec/vp8i.h"
-#include "./webpi.h"
+#include "../dec/vp8i_dec.h"
+#include "./webpi_dec.h"
#include "../dsp/dsp.h"
#include "../dsp/yuv.h"
#include "../utils/utils.h"
@@ -256,7 +256,7 @@ static int Rescale(const uint8_t* src, int src_stride,
static int EmitRescaledYUV(const VP8Io* const io, WebPDecParams* const p) {
const int mb_h = io->mb_h;
const int uv_mb_h = (mb_h + 1) >> 1;
- WebPRescaler* const scaler = &p->scaler_y;
+ WebPRescaler* const scaler = p->scaler_y;
int num_lines_out = 0;
if (WebPIsAlphaMode(p->output->colorspace) && io->a != NULL) {
// Before rescaling, we premultiply the luma directly into the io->y
@@ -267,29 +267,28 @@ static int EmitRescaledYUV(const VP8Io* const io, WebPDecParams* const p) {
io->a, io->width, io->mb_w, mb_h, 0);
}
num_lines_out = Rescale(io->y, io->y_stride, mb_h, scaler);
- Rescale(io->u, io->uv_stride, uv_mb_h, &p->scaler_u);
- Rescale(io->v, io->uv_stride, uv_mb_h, &p->scaler_v);
+ Rescale(io->u, io->uv_stride, uv_mb_h, p->scaler_u);
+ Rescale(io->v, io->uv_stride, uv_mb_h, p->scaler_v);
return num_lines_out;
}
static int EmitRescaledAlphaYUV(const VP8Io* const io, WebPDecParams* const p,
int expected_num_lines_out) {
const WebPYUVABuffer* const buf = &p->output->u.YUVA;
+ uint8_t* const dst_a = buf->a + p->last_y * buf->a_stride;
if (io->a != NULL) {
- uint8_t* dst_y = buf->y + p->last_y * buf->y_stride;
- const uint8_t* src_a = buf->a + p->last_y * buf->a_stride;
- const int num_lines_out = Rescale(io->a, io->width, io->mb_h, &p->scaler_a);
- (void)expected_num_lines_out;
+ uint8_t* const dst_y = buf->y + p->last_y * buf->y_stride;
+ const int num_lines_out = Rescale(io->a, io->width, io->mb_h, p->scaler_a);
assert(expected_num_lines_out == num_lines_out);
if (num_lines_out > 0) { // unmultiply the Y
- WebPMultRows(dst_y, buf->y_stride, src_a, buf->a_stride,
- p->scaler_a.dst_width, num_lines_out, 1);
+ WebPMultRows(dst_y, buf->y_stride, dst_a, buf->a_stride,
+ p->scaler_a->dst_width, num_lines_out, 1);
}
} else if (buf->a != NULL) {
// the user requested alpha, but there is none, set it to opaque.
assert(p->last_y + expected_num_lines_out <= io->scaled_height);
- FillAlphaPlane(buf->a + p->last_y * buf->a_stride,
- io->scaled_width, expected_num_lines_out, buf->a_stride);
+ FillAlphaPlane(dst_a, io->scaled_width, expected_num_lines_out,
+ buf->a_stride);
}
return 0;
}
@@ -305,31 +304,42 @@ static int InitYUVRescaler(const VP8Io* const io, WebPDecParams* const p) {
const int uv_in_height = (io->mb_h + 1) >> 1;
const size_t work_size = 2 * out_width; // scratch memory for luma rescaler
const size_t uv_work_size = 2 * uv_out_width; // and for each u/v ones
- size_t tmp_size;
+ size_t tmp_size, rescaler_size;
rescaler_t* work;
+ WebPRescaler* scalers;
+ const int num_rescalers = has_alpha ? 4 : 3;
tmp_size = (work_size + 2 * uv_work_size) * sizeof(*work);
if (has_alpha) {
tmp_size += work_size * sizeof(*work);
}
- p->memory = WebPSafeMalloc(1ULL, tmp_size);
+ rescaler_size = num_rescalers * sizeof(*p->scaler_y) + WEBP_ALIGN_CST;
+
+ p->memory = WebPSafeMalloc(1ULL, tmp_size + rescaler_size);
if (p->memory == NULL) {
return 0; // memory error
}
work = (rescaler_t*)p->memory;
- WebPRescalerInit(&p->scaler_y, io->mb_w, io->mb_h,
+
+ scalers = (WebPRescaler*)WEBP_ALIGN((const uint8_t*)work + tmp_size);
+ p->scaler_y = &scalers[0];
+ p->scaler_u = &scalers[1];
+ p->scaler_v = &scalers[2];
+ p->scaler_a = has_alpha ? &scalers[3] : NULL;
+
+ WebPRescalerInit(p->scaler_y, io->mb_w, io->mb_h,
buf->y, out_width, out_height, buf->y_stride, 1,
work);
- WebPRescalerInit(&p->scaler_u, uv_in_width, uv_in_height,
+ WebPRescalerInit(p->scaler_u, uv_in_width, uv_in_height,
buf->u, uv_out_width, uv_out_height, buf->u_stride, 1,
work + work_size);
- WebPRescalerInit(&p->scaler_v, uv_in_width, uv_in_height,
+ WebPRescalerInit(p->scaler_v, uv_in_width, uv_in_height,
buf->v, uv_out_width, uv_out_height, buf->v_stride, 1,
work + work_size + uv_work_size);
p->emit = EmitRescaledYUV;
if (has_alpha) {
- WebPRescalerInit(&p->scaler_a, io->mb_w, io->mb_h,
+ WebPRescalerInit(p->scaler_a, io->mb_w, io->mb_h,
buf->a, out_width, out_height, buf->a_stride, 1,
work + work_size + 2 * uv_work_size);
p->emit_alpha = EmitRescaledAlphaYUV;
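This hunk is part of a change that runs through all of io_dec.c: the four rescalers stop being by-value members of WebPDecParams and are instead carved out of the same WebPSafeMalloc() block as the scratch rows, with WEBP_ALIGN_CST of extra padding so that WEBP_ALIGN can round the sub-pointer up to a proper boundary. A minimal sketch of that single-allocation pattern, using stand-in names (ALIGN_CST, align_up, Rescaler) rather than libwebp's own macros:

```c
/* Sketch: one allocation holding scratch rows followed by aligned structs.
 * ALIGN_CST/align_up are stand-ins for libwebp's WEBP_ALIGN_CST/WEBP_ALIGN. */
#include <stdint.h>
#include <stdlib.h>

#define ALIGN_CST 31u  /* align to a 32-byte boundary */

static void* align_up(void* ptr) {
  return (void*)(((uintptr_t)ptr + ALIGN_CST) & ~(uintptr_t)ALIGN_CST);
}

typedef struct { int fields[8]; } Rescaler;  /* placeholder for WebPRescaler */

int main(void) {
  const size_t scratch_size = 1024;  /* rescaler work rows */
  const int num_rescalers = 4;       /* y, u, v + optional alpha */
  const size_t rescaler_size = num_rescalers * sizeof(Rescaler) + ALIGN_CST;
  uint8_t* const memory = (uint8_t*)malloc(scratch_size + rescaler_size);
  if (memory == NULL) return 1;
  /* Over-allocating by ALIGN_CST guarantees the rounded-up pointer still
   * lands inside the block; a single free() releases everything. */
  Rescaler* const scalers = (Rescaler*)align_up(memory + scratch_size);
  (void)scalers;  /* scalers[0..num_rescalers-1] are now usable */
  free(memory);
  return 0;
}
```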
@@ -349,15 +359,15 @@ static int ExportRGB(WebPDecParams* const p, int y_pos) {
int num_lines_out = 0;
// For RGB rescaling, because of the YUV420, current scan position
// U/V can be +1/-1 line from the Y one. Hence the double test.
- while (WebPRescalerHasPendingOutput(&p->scaler_y) &&
- WebPRescalerHasPendingOutput(&p->scaler_u)) {
+ while (WebPRescalerHasPendingOutput(p->scaler_y) &&
+ WebPRescalerHasPendingOutput(p->scaler_u)) {
assert(y_pos + num_lines_out < p->output->height);
- assert(p->scaler_u.y_accum == p->scaler_v.y_accum);
- WebPRescalerExportRow(&p->scaler_y);
- WebPRescalerExportRow(&p->scaler_u);
- WebPRescalerExportRow(&p->scaler_v);
- convert(p->scaler_y.dst, p->scaler_u.dst, p->scaler_v.dst,
- dst, p->scaler_y.dst_width);
+ assert(p->scaler_u->y_accum == p->scaler_v->y_accum);
+ WebPRescalerExportRow(p->scaler_y);
+ WebPRescalerExportRow(p->scaler_u);
+ WebPRescalerExportRow(p->scaler_v);
+ convert(p->scaler_y->dst, p->scaler_u->dst, p->scaler_v->dst,
+ dst, p->scaler_y->dst_width);
dst += buf->stride;
++num_lines_out;
}
@@ -371,15 +381,15 @@ static int EmitRescaledRGB(const VP8Io* const io, WebPDecParams* const p) {
int num_lines_out = 0;
while (j < mb_h) {
const int y_lines_in =
- WebPRescalerImport(&p->scaler_y, mb_h - j,
+ WebPRescalerImport(p->scaler_y, mb_h - j,
io->y + j * io->y_stride, io->y_stride);
j += y_lines_in;
- if (WebPRescaleNeededLines(&p->scaler_u, uv_mb_h - uv_j)) {
+ if (WebPRescaleNeededLines(p->scaler_u, uv_mb_h - uv_j)) {
const int u_lines_in =
- WebPRescalerImport(&p->scaler_u, uv_mb_h - uv_j,
+ WebPRescalerImport(p->scaler_u, uv_mb_h - uv_j,
io->u + uv_j * io->uv_stride, io->uv_stride);
const int v_lines_in =
- WebPRescalerImport(&p->scaler_v, uv_mb_h - uv_j,
+ WebPRescalerImport(p->scaler_v, uv_mb_h - uv_j,
io->v + uv_j * io->uv_stride, io->uv_stride);
(void)v_lines_in; // remove a gcc warning
assert(u_lines_in == v_lines_in);
@@ -400,13 +410,13 @@ static int ExportAlpha(WebPDecParams* const p, int y_pos, int max_lines_out) {
int num_lines_out = 0;
const int is_premult_alpha = WebPIsPremultipliedMode(colorspace);
uint32_t non_opaque = 0;
- const int width = p->scaler_a.dst_width;
+ const int width = p->scaler_a->dst_width;
- while (WebPRescalerHasPendingOutput(&p->scaler_a) &&
+ while (WebPRescalerHasPendingOutput(p->scaler_a) &&
num_lines_out < max_lines_out) {
assert(y_pos + num_lines_out < p->output->height);
- WebPRescalerExportRow(&p->scaler_a);
- non_opaque |= WebPDispatchAlpha(p->scaler_a.dst, 0, width, 1, dst, 0);
+ WebPRescalerExportRow(p->scaler_a);
+ non_opaque |= WebPDispatchAlpha(p->scaler_a->dst, 0, width, 1, dst, 0);
dst += buf->stride;
++num_lines_out;
}
@@ -428,18 +438,18 @@ static int ExportAlphaRGBA4444(WebPDecParams* const p, int y_pos,
#endif
int num_lines_out = 0;
const WEBP_CSP_MODE colorspace = p->output->colorspace;
- const int width = p->scaler_a.dst_width;
+ const int width = p->scaler_a->dst_width;
const int is_premult_alpha = WebPIsPremultipliedMode(colorspace);
uint32_t alpha_mask = 0x0f;
- while (WebPRescalerHasPendingOutput(&p->scaler_a) &&
+ while (WebPRescalerHasPendingOutput(p->scaler_a) &&
num_lines_out < max_lines_out) {
int i;
assert(y_pos + num_lines_out < p->output->height);
- WebPRescalerExportRow(&p->scaler_a);
+ WebPRescalerExportRow(p->scaler_a);
for (i = 0; i < width; ++i) {
// Fill in the alpha value (converted to 4 bits).
- const uint32_t alpha_value = p->scaler_a.dst[i] >> 4;
+ const uint32_t alpha_value = p->scaler_a->dst[i] >> 4;
alpha_dst[2 * i] = (alpha_dst[2 * i] & 0xf0) | alpha_value;
alpha_mask &= alpha_value;
}
@@ -455,7 +465,7 @@ static int ExportAlphaRGBA4444(WebPDecParams* const p, int y_pos,
static int EmitRescaledAlphaRGB(const VP8Io* const io, WebPDecParams* const p,
int expected_num_out_lines) {
if (io->a != NULL) {
- WebPRescaler* const scaler = &p->scaler_a;
+ WebPRescaler* const scaler = p->scaler_a;
int lines_left = expected_num_out_lines;
const int y_end = p->last_y + lines_left;
while (lines_left > 0) {
@@ -477,7 +487,9 @@ static int InitRGBRescaler(const VP8Io* const io, WebPDecParams* const p) {
const size_t work_size = 2 * out_width; // scratch memory for one rescaler
rescaler_t* work; // rescalers work area
uint8_t* tmp; // tmp storage for scaled YUV444 samples before RGB conversion
- size_t tmp_size1, tmp_size2, total_size;
+ size_t tmp_size1, tmp_size2, total_size, rescaler_size;
+ WebPRescaler* scalers;
+ const int num_rescalers = has_alpha ? 4 : 3;
tmp_size1 = 3 * work_size;
tmp_size2 = 3 * out_width;
@@ -486,26 +498,35 @@ static int InitRGBRescaler(const VP8Io* const io, WebPDecParams* const p) {
tmp_size2 += out_width;
}
total_size = tmp_size1 * sizeof(*work) + tmp_size2 * sizeof(*tmp);
- p->memory = WebPSafeMalloc(1ULL, total_size);
+ rescaler_size = num_rescalers * sizeof(*p->scaler_y) + WEBP_ALIGN_CST;
+
+ p->memory = WebPSafeMalloc(1ULL, total_size + rescaler_size);
if (p->memory == NULL) {
return 0; // memory error
}
work = (rescaler_t*)p->memory;
tmp = (uint8_t*)(work + tmp_size1);
- WebPRescalerInit(&p->scaler_y, io->mb_w, io->mb_h,
+
+ scalers = (WebPRescaler*)WEBP_ALIGN((const uint8_t*)work + total_size);
+ p->scaler_y = &scalers[0];
+ p->scaler_u = &scalers[1];
+ p->scaler_v = &scalers[2];
+ p->scaler_a = has_alpha ? &scalers[3] : NULL;
+
+ WebPRescalerInit(p->scaler_y, io->mb_w, io->mb_h,
tmp + 0 * out_width, out_width, out_height, 0, 1,
work + 0 * work_size);
- WebPRescalerInit(&p->scaler_u, uv_in_width, uv_in_height,
+ WebPRescalerInit(p->scaler_u, uv_in_width, uv_in_height,
tmp + 1 * out_width, out_width, out_height, 0, 1,
work + 1 * work_size);
- WebPRescalerInit(&p->scaler_v, uv_in_width, uv_in_height,
+ WebPRescalerInit(p->scaler_v, uv_in_width, uv_in_height,
tmp + 2 * out_width, out_width, out_height, 0, 1,
work + 2 * work_size);
p->emit = EmitRescaledRGB;
WebPInitYUV444Converters();
if (has_alpha) {
- WebPRescalerInit(&p->scaler_a, io->mb_w, io->mb_h,
+ WebPRescalerInit(p->scaler_a, io->mb_w, io->mb_h,
tmp + 3 * out_width, out_width, out_height, 0, 1,
work + 3 * work_size);
p->emit_alpha = EmitRescaledAlphaRGB;
diff --git a/src/3rdparty/libwebp/src/dec/quant.c b/src/3rdparty/libwebp/src/dec/quant_dec.c
index 5b648f9..14e3198 100644
--- a/src/3rdparty/libwebp/src/dec/quant.c
+++ b/src/3rdparty/libwebp/src/dec/quant_dec.c
@@ -11,7 +11,7 @@
//
// Author: Skal (pascal.massimino@gmail.com)
-#include "./vp8i.h"
+#include "./vp8i_dec.h"
static WEBP_INLINE int clip(int v, int M) {
return v < 0 ? 0 : v > M ? M : v;
diff --git a/src/3rdparty/libwebp/src/dec/tree.c b/src/3rdparty/libwebp/src/dec/tree_dec.c
index c2007ea..9e805f6 100644
--- a/src/3rdparty/libwebp/src/dec/tree.c
+++ b/src/3rdparty/libwebp/src/dec/tree_dec.c
@@ -11,10 +11,13 @@
//
// Author: Skal (pascal.massimino@gmail.com)
-#include "./vp8i.h"
-#include "../utils/bit_reader_inl.h"
+#include "./vp8i_dec.h"
+#include "../utils/bit_reader_inl_utils.h"
+#if !defined(__arm__) && !defined(_M_ARM) && !defined(__aarch64__)
+// using a table is ~1-2% slower on ARM. Prefer the coded-tree approach then.
#define USE_GENERIC_TREE
+#endif
#ifdef USE_GENERIC_TREE
static const int8_t kYModesIntra4[18] = {
diff --git a/src/3rdparty/libwebp/src/dec/vp8.c b/src/3rdparty/libwebp/src/dec/vp8_dec.c
index 336680c..fad8d9c 100644
--- a/src/3rdparty/libwebp/src/dec/vp8.c
+++ b/src/3rdparty/libwebp/src/dec/vp8_dec.c
@@ -13,11 +13,11 @@
#include <stdlib.h>
-#include "./alphai.h"
-#include "./vp8i.h"
-#include "./vp8li.h"
-#include "./webpi.h"
-#include "../utils/bit_reader_inl.h"
+#include "./alphai_dec.h"
+#include "./vp8i_dec.h"
+#include "./vp8li_dec.h"
+#include "./webpi_dec.h"
+#include "../utils/bit_reader_inl_utils.h"
#include "../utils/utils.h"
//------------------------------------------------------------------------------
@@ -27,6 +27,16 @@ int WebPGetDecoderVersion(void) {
}
//------------------------------------------------------------------------------
+// Signature and pointer-to-function for GetCoeffs() variants below.
+
+typedef int (*GetCoeffsFunc)(VP8BitReader* const br,
+ const VP8BandProbas* const prob[],
+ int ctx, const quant_t dq, int n, int16_t* out);
+static volatile GetCoeffsFunc GetCoeffs = NULL;
+
+static void InitGetCoeffs(void);
+
+//------------------------------------------------------------------------------
// VP8Decoder
static void SetOk(VP8Decoder* const dec) {
@@ -51,6 +61,7 @@ VP8Decoder* VP8New(void) {
WebPGetWorkerInterface()->Init(&dec->worker_);
dec->ready_ = 0;
dec->num_parts_minus_one_ = 0;
+ InitGetCoeffs();
}
return dec;
}
@@ -273,12 +284,14 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {
frm_hdr->profile_ = (bits >> 1) & 7;
frm_hdr->show_ = (bits >> 4) & 1;
frm_hdr->partition_length_ = (bits >> 5);
- if (frm_hdr->profile_ > 3)
+ if (frm_hdr->profile_ > 3) {
return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
"Incorrect keyframe parameters.");
- if (!frm_hdr->show_)
+ }
+ if (!frm_hdr->show_) {
return VP8SetError(dec, VP8_STATUS_UNSUPPORTED_FEATURE,
"Frame not displayable.");
+ }
buf += 3;
buf_size -= 3;
}
@@ -420,8 +433,9 @@ static int GetLargeValue(VP8BitReader* const br, const uint8_t* const p) {
}
// Returns the position of the last non-zero coeff plus one
-static int GetCoeffs(VP8BitReader* const br, const VP8BandProbas* const prob[],
- int ctx, const quant_t dq, int n, int16_t* out) {
+static int GetCoeffsFast(VP8BitReader* const br,
+ const VP8BandProbas* const prob[],
+ int ctx, const quant_t dq, int n, int16_t* out) {
const uint8_t* p = prob[n]->probas_[ctx];
for (; n < 16; ++n) {
if (!VP8GetBit(br, p[0])) {
@@ -447,6 +461,46 @@ static int GetCoeffs(VP8BitReader* const br, const VP8BandProbas* const prob[],
return 16;
}
+// This version of GetCoeffs() uses VP8GetBitAlt(), an alternate version of
+// VP8GetBit() targeting specific platforms.
+static int GetCoeffsAlt(VP8BitReader* const br,
+ const VP8BandProbas* const prob[],
+ int ctx, const quant_t dq, int n, int16_t* out) {
+ const uint8_t* p = prob[n]->probas_[ctx];
+ for (; n < 16; ++n) {
+ if (!VP8GetBitAlt(br, p[0])) {
+ return n; // previous coeff was last non-zero coeff
+ }
+ while (!VP8GetBitAlt(br, p[1])) { // sequence of zero coeffs
+ p = prob[++n]->probas_[0];
+ if (n == 16) return 16;
+ }
+ { // non zero coeff
+ const VP8ProbaArray* const p_ctx = &prob[n + 1]->probas_[0];
+ int v;
+ if (!VP8GetBitAlt(br, p[2])) {
+ v = 1;
+ p = p_ctx[1];
+ } else {
+ v = GetLargeValue(br, p);
+ p = p_ctx[2];
+ }
+ out[kZigzag[n]] = VP8GetSigned(br, v) * dq[n > 0];
+ }
+ }
+ return 16;
+}
+
+WEBP_TSAN_IGNORE_FUNCTION static void InitGetCoeffs(void) {
+ if (GetCoeffs == NULL) {
+ if (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kSlowSSSE3)) {
+ GetCoeffs = GetCoeffsAlt;
+ } else {
+ GetCoeffs = GetCoeffsFast;
+ }
+ }
+}
+
static WEBP_INLINE uint32_t NzCodeBits(uint32_t nz_coeffs, int nz, int dc_nz) {
nz_coeffs <<= 2;
nz_coeffs |= (nz > 3) ? 3 : (nz > 1) ? 2 : dc_nz;
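The additions above split GetCoeffs() into two variants, identical except that GetCoeffsAlt() reads bits through VP8GetBitAlt(), and select one at VP8New() time through a function pointer filled in exactly once by InitGetCoeffs(); the volatile qualifier keeps the lazy write benign under libwebp's WEBP_TSAN_IGNORE_FUNCTION convention. A minimal sketch of the same one-time dispatch idiom, with a hypothetical has_slow_ssse3() probe standing in for VP8GetCPUInfo(kSlowSSSE3):

```c
/* Sketch: one-time runtime selection between two function variants.
 * has_slow_ssse3() is a hypothetical stand-in for a real CPU probe. */
#include <stddef.h>
#include <stdio.h>

typedef int (*CoeffsFunc)(int n);

static int CoeffsFast(int n) { return n + 1; }  /* generic variant */
static int CoeffsAlt(int n)  { return n + 2; }  /* platform-tuned variant */

static int has_slow_ssse3(void) { return 0; }   /* assumed CPU probe */

static CoeffsFunc g_get_coeffs = NULL;

static void InitGetCoeffs(void) {
  if (g_get_coeffs == NULL) {  /* idempotent: safe to call per decoder */
    g_get_coeffs = has_slow_ssse3() ? CoeffsAlt : CoeffsFast;
  }
}

int main(void) {
  InitGetCoeffs();
  printf("coeffs(3) = %d\n", g_get_coeffs(3));
  return 0;
}
```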
diff --git a/src/3rdparty/libwebp/src/dec/decode_vp8.h b/src/3rdparty/libwebp/src/dec/vp8_dec.h
index b9337bb..b9337bb 100644
--- a/src/3rdparty/libwebp/src/dec/decode_vp8.h
+++ b/src/3rdparty/libwebp/src/dec/vp8_dec.h
diff --git a/src/3rdparty/libwebp/src/dec/vp8i.h b/src/3rdparty/libwebp/src/dec/vp8i_dec.h
index 00da02b..555853e 100644
--- a/src/3rdparty/libwebp/src/dec/vp8i.h
+++ b/src/3rdparty/libwebp/src/dec/vp8i_dec.h
@@ -15,11 +15,11 @@
#define WEBP_DEC_VP8I_H_
#include <string.h> // for memcpy()
-#include "./common.h"
-#include "./vp8li.h"
-#include "../utils/bit_reader.h"
-#include "../utils/random.h"
-#include "../utils/thread.h"
+#include "./common_dec.h"
+#include "./vp8li_dec.h"
+#include "../utils/bit_reader_utils.h"
+#include "../utils/random_utils.h"
+#include "../utils/thread_utils.h"
#include "../dsp/dsp.h"
#ifdef __cplusplus
@@ -31,8 +31,8 @@ extern "C" {
// version numbers
#define DEC_MAJ_VERSION 0
-#define DEC_MIN_VERSION 5
-#define DEC_REV_VERSION 1
+#define DEC_MIN_VERSION 6
+#define DEC_REV_VERSION 0
// YUV-cache parameters. Cache is 32-bytes wide (= one cacheline).
// Constraints are: We need to store one 16x16 block of luma samples (y),
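The DEC_MIN_VERSION/DEC_REV_VERSION bump here is what WebPGetDecoderVersion() (visible in the vp8_dec.c hunk above) reports at runtime; libwebp packs the three components one byte each. A small sketch of that encoding, with the macros inlined for illustration:

```c
/* Sketch: how the bumped version macros surface through the decoder's
 * version query, packed as 0x00MMmmrr (major, minor, revision). */
#include <stdio.h>

#define DEC_MAJ_VERSION 0
#define DEC_MIN_VERSION 6
#define DEC_REV_VERSION 0

static int GetDecoderVersion(void) {
  return (DEC_MAJ_VERSION << 16) | (DEC_MIN_VERSION << 8) | DEC_REV_VERSION;
}

int main(void) {
  const int v = GetDecoderVersion();
  printf("decoder %d.%d.%d (0x%06x)\n",
         (v >> 16) & 0xff, (v >> 8) & 0xff, v & 0xff, (unsigned)v);
  return 0;  /* prints: decoder 0.6.0 (0x000600) */
}
```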
diff --git a/src/3rdparty/libwebp/src/dec/vp8l.c b/src/3rdparty/libwebp/src/dec/vp8l_dec.c
index cb2e317..ef359a9 100644
--- a/src/3rdparty/libwebp/src/dec/vp8l.c
+++ b/src/3rdparty/libwebp/src/dec/vp8l_dec.c
@@ -14,13 +14,14 @@
#include <stdlib.h>
-#include "./alphai.h"
-#include "./vp8li.h"
+#include "./alphai_dec.h"
+#include "./vp8li_dec.h"
#include "../dsp/dsp.h"
#include "../dsp/lossless.h"
+#include "../dsp/lossless_common.h"
#include "../dsp/yuv.h"
-#include "../utils/endian_inl.h"
-#include "../utils/huffman.h"
+#include "../utils/endian_inl_utils.h"
+#include "../utils/huffman_utils.h"
#include "../utils/utils.h"
#define NUM_ARGB_CACHE_ROWS 16
@@ -547,11 +548,14 @@ static int EmitRescaledRowsRGBA(const VP8LDecoder* const dec,
uint8_t* const row_out = out + num_lines_out * out_stride;
const int lines_left = mb_h - num_lines_in;
const int needed_lines = WebPRescaleNeededLines(dec->rescaler, lines_left);
+ int lines_imported;
assert(needed_lines > 0 && needed_lines <= lines_left);
WebPMultARGBRows(row_in, in_stride,
dec->rescaler->src_width, needed_lines, 0);
- WebPRescalerImport(dec->rescaler, lines_left, row_in, in_stride);
- num_lines_in += needed_lines;
+ lines_imported =
+ WebPRescalerImport(dec->rescaler, lines_left, row_in, in_stride);
+ assert(lines_imported == needed_lines);
+ num_lines_in += lines_imported;
num_lines_out += Export(dec->rescaler, colorspace, out_stride, row_out);
}
return num_lines_out;
@@ -623,9 +627,12 @@ static int EmitRescaledRowsYUVA(const VP8LDecoder* const dec,
while (num_lines_in < mb_h) {
const int lines_left = mb_h - num_lines_in;
const int needed_lines = WebPRescaleNeededLines(dec->rescaler, lines_left);
+ int lines_imported;
WebPMultARGBRows(in, in_stride, dec->rescaler->src_width, needed_lines, 0);
- WebPRescalerImport(dec->rescaler, lines_left, in, in_stride);
- num_lines_in += needed_lines;
+ lines_imported =
+ WebPRescalerImport(dec->rescaler, lines_left, in, in_stride);
+ assert(lines_imported == needed_lines);
+ num_lines_in += lines_imported;
in += needed_lines * in_stride;
y_pos += ExportYUVA(dec, y_pos);
}
@@ -705,13 +712,15 @@ static void ApplyInverseTransforms(VP8LDecoder* const dec, int num_rows,
uint32_t* const rows_out = dec->argb_cache_;
// Inverse transforms.
- // TODO: most transforms only need to operate on the cropped region only.
- memcpy(rows_out, rows_in, cache_pixs * sizeof(*rows_out));
while (n-- > 0) {
VP8LTransform* const transform = &dec->transforms_[n];
VP8LInverseTransform(transform, start_row, end_row, rows_in, rows_out);
rows_in = rows_out;
}
+ if (rows_in != rows_out) {
+ // No transform called, hence just copy.
+ memcpy(rows_out, rows_in, cache_pixs * sizeof(*rows_out));
+ }
}
// Processes (transforms, scales & color-converts) the rows decoded after the
@@ -1210,8 +1219,9 @@ static int ExpandColorMap(int num_colors, VP8LTransform* const transform) {
// Equivalent to AddPixelEq(), on a byte-basis.
new_data[i] = (data[i] + new_data[i - 4]) & 0xff;
}
- for (; i < 4 * final_num_colors; ++i)
+ for (; i < 4 * final_num_colors; ++i) {
new_data[i] = 0; // black tail.
+ }
WebPSafeFree(transform->data_);
transform->data_ = new_color_map;
}
@@ -1482,9 +1492,8 @@ static void ExtractAlphaRows(VP8LDecoder* const dec, int last_row) {
const int cache_pixs = width * num_rows_to_process;
uint8_t* const dst = output + width * cur_row;
const uint32_t* const src = dec->argb_cache_;
- int i;
ApplyInverseTransforms(dec, num_rows_to_process, in);
- for (i = 0; i < cache_pixs; ++i) dst[i] = (src[i] >> 8) & 0xff;
+ WebPExtractGreen(src, dst, cache_pixs);
AlphaApplyFilter(alph_dec,
cur_row, cur_row + num_rows_to_process, dst, width);
num_rows -= num_rows_to_process;
@@ -1552,6 +1561,8 @@ int VP8LDecodeAlphaImageStream(ALPHDecoder* const alph_dec, int last_row) {
return 1; // done
}
+ if (!alph_dec->use_8b_decode_) WebPInitAlphaProcessing();
+
// Decode (with special row processing).
return alph_dec->use_8b_decode_ ?
DecodeAlphaData(dec, (uint8_t*)dec->pixels_, dec->width_, dec->height_,
diff --git a/src/3rdparty/libwebp/src/dec/vp8li.h b/src/3rdparty/libwebp/src/dec/vp8li_dec.h
index 9313bdc..097a9d0 100644
--- a/src/3rdparty/libwebp/src/dec/vp8li.h
+++ b/src/3rdparty/libwebp/src/dec/vp8li_dec.h
@@ -16,10 +16,10 @@
#define WEBP_DEC_VP8LI_H_
#include <string.h> // for memcpy()
-#include "./webpi.h"
-#include "../utils/bit_reader.h"
-#include "../utils/color_cache.h"
-#include "../utils/huffman.h"
+#include "./webpi_dec.h"
+#include "../utils/bit_reader_utils.h"
+#include "../utils/color_cache_utils.h"
+#include "../utils/huffman_utils.h"
#ifdef __cplusplus
extern "C" {
diff --git a/src/3rdparty/libwebp/src/dec/webp.c b/src/3rdparty/libwebp/src/dec/webp_dec.c
index d0b912f..a8e9c2c 100644
--- a/src/3rdparty/libwebp/src/dec/webp.c
+++ b/src/3rdparty/libwebp/src/dec/webp_dec.c
@@ -13,9 +13,9 @@
#include <stdlib.h>
-#include "./vp8i.h"
-#include "./vp8li.h"
-#include "./webpi.h"
+#include "./vp8i_dec.h"
+#include "./vp8li_dec.h"
+#include "./webpi_dec.h"
#include "../utils/utils.h"
#include "../webp/mux_types.h" // ALPHA_FLAG
@@ -39,8 +39,8 @@
// 20..23 VP8X flags bit-map corresponding to the chunk-types present.
// 24..26 Width of the Canvas Image.
// 27..29 Height of the Canvas Image.
-// There can be extra chunks after the "VP8X" chunk (ICCP, FRGM, ANMF, VP8,
-// VP8L, XMP, EXIF ...)
+// There can be extra chunks after the "VP8X" chunk (ICCP, ANMF, VP8, VP8L,
+// XMP, EXIF ...)
// All sizes are in little-endian order.
// Note: chunk data size must be padded to multiple of 2 when written.
@@ -289,7 +289,6 @@ static VP8StatusCode ParseHeadersInternal(const uint8_t* data,
int found_riff = 0;
int found_vp8x = 0;
int animation_present = 0;
- int fragments_present = 0;
const int have_all_data = (headers != NULL) ? headers->have_all_data : 0;
VP8StatusCode status;
@@ -318,7 +317,6 @@ static VP8StatusCode ParseHeadersInternal(const uint8_t* data,
return status; // Wrong VP8X / insufficient data.
}
animation_present = !!(flags & ANIMATION_FLAG);
- fragments_present = !!(flags & FRAGMENTS_FLAG);
if (!found_riff && found_vp8x) {
// Note: This restriction may be removed in the future, if it becomes
// necessary to send VP8X chunk to the decoder.
@@ -330,8 +328,7 @@ static VP8StatusCode ParseHeadersInternal(const uint8_t* data,
image_width = canvas_width;
image_height = canvas_height;
- if (found_vp8x && (animation_present || fragments_present) &&
- headers == NULL) {
+ if (found_vp8x && animation_present && headers == NULL) {
status = VP8_STATUS_OK;
goto ReturnWidthHeight; // Just return features from VP8X header.
}
@@ -362,7 +359,7 @@ static VP8StatusCode ParseHeadersInternal(const uint8_t* data,
return VP8_STATUS_BITSTREAM_ERROR;
}
- if (format != NULL && !(animation_present || fragments_present)) {
+ if (format != NULL && !animation_present) {
*format = hdrs.is_lossless ? 2 : 1;
}
diff --git a/src/3rdparty/libwebp/src/dec/webpi.h b/src/3rdparty/libwebp/src/dec/webpi_dec.h
index 991b194..696abc1 100644
--- a/src/3rdparty/libwebp/src/dec/webpi.h
+++ b/src/3rdparty/libwebp/src/dec/webpi_dec.h
@@ -18,8 +18,8 @@
extern "C" {
#endif
-#include "../utils/rescaler.h"
-#include "./decode_vp8.h"
+#include "../utils/rescaler_utils.h"
+#include "./vp8_dec.h"
//------------------------------------------------------------------------------
// WebPDecParams: Decoding output parameters. Transient internal object.
@@ -38,27 +38,18 @@ struct WebPDecParams {
int last_y; // coordinate of the line that was last output
const WebPDecoderOptions* options; // if not NULL, use alt decoding features
- // rescalers
- WebPRescaler scaler_y, scaler_u, scaler_v, scaler_a;
+
+ WebPRescaler* scaler_y, *scaler_u, *scaler_v, *scaler_a; // rescalers
void* memory; // overall scratch memory for the output work.
OutputFunc emit; // output RGB or YUV samples
OutputAlphaFunc emit_alpha; // output alpha channel
OutputRowFunc emit_alpha_row; // output one line of rescaled alpha values
-
- WebPDecBuffer* final_output; // In case the user supplied a slow-memory
- // output, we decode image in temporary buffer
- // (this::output) and copy it here.
- WebPDecBuffer tmp_buffer; // this::output will point to this one in case
- // of slow memory.
};
// Should be called first, before any use of the WebPDecParams object.
void WebPResetDecParams(WebPDecParams* const params);
-// Delete all memory (after an error occurred, for instance)
-void WebPFreeDecParams(WebPDecParams* const params);
-
//------------------------------------------------------------------------------
// Header parsing helpers
diff --git a/src/3rdparty/libwebp/src/demux/anim_decode.c b/src/3rdparty/libwebp/src/demux/anim_decode.c
index 1989eb4..f1cf176 100644
--- a/src/3rdparty/libwebp/src/demux/anim_decode.c
+++ b/src/3rdparty/libwebp/src/demux/anim_decode.c
@@ -112,18 +112,15 @@ WebPAnimDecoder* WebPAnimDecoderNewInternal(
dec->info_.bgcolor = WebPDemuxGetI(dec->demux_, WEBP_FF_BACKGROUND_COLOR);
dec->info_.frame_count = WebPDemuxGetI(dec->demux_, WEBP_FF_FRAME_COUNT);
- {
- const int canvas_bytes =
- dec->info_.canvas_width * NUM_CHANNELS * dec->info_.canvas_height;
- // Note: calloc() because we fill frame with zeroes as well.
- dec->curr_frame_ = WebPSafeCalloc(1ULL, canvas_bytes);
- if (dec->curr_frame_ == NULL) goto Error;
- dec->prev_frame_disposed_ = WebPSafeCalloc(1ULL, canvas_bytes);
- if (dec->prev_frame_disposed_ == NULL) goto Error;
- }
+ // Note: calloc() because we fill frame with zeroes as well.
+ dec->curr_frame_ = (uint8_t*)WebPSafeCalloc(
+ dec->info_.canvas_width * NUM_CHANNELS, dec->info_.canvas_height);
+ if (dec->curr_frame_ == NULL) goto Error;
+ dec->prev_frame_disposed_ = (uint8_t*)WebPSafeCalloc(
+ dec->info_.canvas_width * NUM_CHANNELS, dec->info_.canvas_height);
+ if (dec->prev_frame_disposed_ == NULL) goto Error;
WebPAnimDecoderReset(dec);
-
return dec;
Error:
@@ -144,9 +141,13 @@ static int IsFullFrame(int width, int height, int canvas_width,
}
// Clear the canvas to transparent.
-static void ZeroFillCanvas(uint8_t* buf, uint32_t canvas_width,
- uint32_t canvas_height) {
- memset(buf, 0, canvas_width * NUM_CHANNELS * canvas_height);
+static int ZeroFillCanvas(uint8_t* buf, uint32_t canvas_width,
+ uint32_t canvas_height) {
+ const uint64_t size =
+ (uint64_t)canvas_width * canvas_height * NUM_CHANNELS * sizeof(*buf);
+ if (size != (size_t)size) return 0;
+ memset(buf, 0, (size_t)size);
+ return 1;
}
// Clear given frame rectangle to transparent.
@@ -162,10 +163,13 @@ static void ZeroFillFrameRect(uint8_t* buf, int buf_stride, int x_offset,
}
// Copy width * height pixels from 'src' to 'dst'.
-static void CopyCanvas(const uint8_t* src, uint8_t* dst,
- uint32_t width, uint32_t height) {
+static int CopyCanvas(const uint8_t* src, uint8_t* dst,
+ uint32_t width, uint32_t height) {
+ const uint64_t size = (uint64_t)width * height * NUM_CHANNELS;
+ if (size != (size_t)size) return 0;
assert(src != NULL && dst != NULL);
- memcpy(dst, src, width * NUM_CHANNELS * height);
+ memcpy(dst, src, (size_t)size);
+ return 1;
}
// Returns true if the current frame is a key-frame.
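ZeroFillCanvas() and CopyCanvas() now return a status instead of void: the byte count is computed in 64 bits and rejected when it does not survive a round-trip through size_t, so oversized canvases fail cleanly on 32-bit targets instead of wrapping around inside memset()/memcpy(). The switch to the two-argument WebPSafeCalloc() call earlier in this file has the same motive. A standalone sketch of the guard, with arbitrary example dimensions:

```c
/* Sketch: the 64-bit overflow guard used by the new ZeroFillCanvas() and
 * CopyCanvas(). The dimensions below are arbitrary examples. */
#include <stdint.h>
#include <stdio.h>

#define NUM_CHANNELS 4  /* RGBA */

static int checked_canvas_bytes(uint32_t w, uint32_t h, size_t* out) {
  const uint64_t size = (uint64_t)w * h * NUM_CHANNELS;
  if (size != (size_t)size) return 0;  /* would overflow a 32-bit size_t */
  *out = (size_t)size;
  return 1;
}

int main(void) {
  size_t n;
  /* 16384 x 16384 x 4 = 1 GiB: fits even a 32-bit size_t. */
  printf("16k x 16k: %s\n",
         checked_canvas_bytes(16384, 16384, &n) ? "ok" : "overflow");
  /* 2^20 x 2^20 x 4 = 2^42 bytes: rejected wherever size_t is 32-bit. */
  printf("1M x 1M:   %s\n",
         checked_canvas_bytes(1u << 20, 1u << 20, &n) ? "ok" : "overflow");
  return 0;
}
```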
@@ -328,9 +332,14 @@ int WebPAnimDecoderGetNext(WebPAnimDecoder* dec,
is_key_frame = IsKeyFrame(&iter, &dec->prev_iter_,
dec->prev_frame_was_keyframe_, width, height);
if (is_key_frame) {
- ZeroFillCanvas(dec->curr_frame_, width, height);
+ if (!ZeroFillCanvas(dec->curr_frame_, width, height)) {
+ goto Error;
+ }
} else {
- CopyCanvas(dec->prev_frame_disposed_, dec->curr_frame_, width, height);
+ if (!CopyCanvas(dec->prev_frame_disposed_, dec->curr_frame_,
+ width, height)) {
+ goto Error;
+ }
}
// Decode.
@@ -393,6 +402,7 @@ int WebPAnimDecoderGetNext(WebPAnimDecoder* dec,
// Update info of the previous frame and dispose it for the next iteration.
dec->prev_frame_timestamp_ = timestamp;
+ WebPDemuxReleaseIterator(&dec->prev_iter_);
dec->prev_iter_ = iter;
dec->prev_frame_was_keyframe_ = is_key_frame;
CopyCanvas(dec->curr_frame_, dec->prev_frame_disposed_, width, height);
@@ -421,6 +431,7 @@ int WebPAnimDecoderHasMoreFrames(const WebPAnimDecoder* dec) {
void WebPAnimDecoderReset(WebPAnimDecoder* dec) {
if (dec != NULL) {
dec->prev_frame_timestamp_ = 0;
+ WebPDemuxReleaseIterator(&dec->prev_iter_);
memset(&dec->prev_iter_, 0, sizeof(dec->prev_iter_));
dec->prev_frame_was_keyframe_ = 0;
dec->next_frame_ = 1;
@@ -434,6 +445,7 @@ const WebPDemuxer* WebPAnimDecoderGetDemuxer(const WebPAnimDecoder* dec) {
void WebPAnimDecoderDelete(WebPAnimDecoder* dec) {
if (dec != NULL) {
+ WebPDemuxReleaseIterator(&dec->prev_iter_);
WebPDemuxDelete(dec->demux_);
WebPSafeFree(dec->curr_frame_);
WebPSafeFree(dec->prev_frame_disposed_);
diff --git a/src/3rdparty/libwebp/src/demux/demux.c b/src/3rdparty/libwebp/src/demux/demux.c
index 0d2989f..100eab8 100644
--- a/src/3rdparty/libwebp/src/demux/demux.c
+++ b/src/3rdparty/libwebp/src/demux/demux.c
@@ -25,7 +25,7 @@
#define DMUX_MAJ_VERSION 0
#define DMUX_MIN_VERSION 3
-#define DMUX_REV_VERSION 0
+#define DMUX_REV_VERSION 2
typedef struct {
size_t start_; // start location of the data
@@ -590,7 +590,6 @@ static int CheckFrameBounds(const Frame* const frame, int exact,
static int IsValidExtendedFormat(const WebPDemuxer* const dmux) {
const int is_animation = !!(dmux->feature_flags_ & ANIMATION_FLAG);
- const int is_fragmented = !!(dmux->feature_flags_ & FRAGMENTS_FLAG);
const Frame* f = dmux->frames_;
if (dmux->state_ == WEBP_DEMUX_PARSING_HEADER) return 1;
@@ -598,7 +597,7 @@ static int IsValidExtendedFormat(const WebPDemuxer* const dmux) {
if (dmux->canvas_width_ <= 0 || dmux->canvas_height_ <= 0) return 0;
if (dmux->loop_count_ < 0) return 0;
if (dmux->state_ == WEBP_DEMUX_DONE && dmux->frames_ == NULL) return 0;
- if (is_fragmented) return 0;
+ if (dmux->feature_flags_ & ~ALL_VALID_FLAGS) return 0; // invalid bitstream
while (f != NULL) {
const int cur_frame_set = f->frame_num_;
diff --git a/src/3rdparty/libwebp/src/dsp/alpha_processing.c b/src/3rdparty/libwebp/src/dsp/alpha_processing.c
index 1716cac..4b60e09 100644
--- a/src/3rdparty/libwebp/src/dsp/alpha_processing.c
+++ b/src/3rdparty/libwebp/src/dsp/alpha_processing.c
@@ -284,9 +284,9 @@ static void ApplyAlphaMultiply_16b(uint8_t* rgba4444,
#endif
}
-static int DispatchAlpha(const uint8_t* alpha, int alpha_stride,
- int width, int height,
- uint8_t* dst, int dst_stride) {
+static int DispatchAlpha_C(const uint8_t* alpha, int alpha_stride,
+ int width, int height,
+ uint8_t* dst, int dst_stride) {
uint32_t alpha_mask = 0xff;
int i, j;
@@ -303,9 +303,9 @@ static int DispatchAlpha(const uint8_t* alpha, int alpha_stride,
return (alpha_mask != 0xff);
}
-static void DispatchAlphaToGreen(const uint8_t* alpha, int alpha_stride,
- int width, int height,
- uint32_t* dst, int dst_stride) {
+static void DispatchAlphaToGreen_C(const uint8_t* alpha, int alpha_stride,
+ int width, int height,
+ uint32_t* dst, int dst_stride) {
int i, j;
for (j = 0; j < height; ++j) {
for (i = 0; i < width; ++i) {
@@ -316,9 +316,9 @@ static void DispatchAlphaToGreen(const uint8_t* alpha, int alpha_stride,
}
}
-static int ExtractAlpha(const uint8_t* argb, int argb_stride,
- int width, int height,
- uint8_t* alpha, int alpha_stride) {
+static int ExtractAlpha_C(const uint8_t* argb, int argb_stride,
+ int width, int height,
+ uint8_t* alpha, int alpha_stride) {
uint8_t alpha_mask = 0xff;
int i, j;
@@ -334,11 +334,17 @@ static int ExtractAlpha(const uint8_t* argb, int argb_stride,
return (alpha_mask == 0xff);
}
+static void ExtractGreen_C(const uint32_t* argb, uint8_t* alpha, int size) {
+ int i;
+ for (i = 0; i < size; ++i) alpha[i] = argb[i] >> 8;
+}
+
void (*WebPApplyAlphaMultiply)(uint8_t*, int, int, int, int);
void (*WebPApplyAlphaMultiply4444)(uint8_t*, int, int, int);
int (*WebPDispatchAlpha)(const uint8_t*, int, int, int, uint8_t*, int);
void (*WebPDispatchAlphaToGreen)(const uint8_t*, int, int, int, uint32_t*, int);
int (*WebPExtractAlpha)(const uint8_t*, int, int, int, uint8_t*, int);
+void (*WebPExtractGreen)(const uint32_t* argb, uint8_t* alpha, int size);
//------------------------------------------------------------------------------
// Init function
@@ -346,6 +352,7 @@ int (*WebPExtractAlpha)(const uint8_t*, int, int, int, uint8_t*, int);
extern void WebPInitAlphaProcessingMIPSdspR2(void);
extern void WebPInitAlphaProcessingSSE2(void);
extern void WebPInitAlphaProcessingSSE41(void);
+extern void WebPInitAlphaProcessingNEON(void);
static volatile VP8CPUInfo alpha_processing_last_cpuinfo_used =
(VP8CPUInfo)&alpha_processing_last_cpuinfo_used;
@@ -357,9 +364,11 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessing(void) {
WebPMultRow = WebPMultRowC;
WebPApplyAlphaMultiply = ApplyAlphaMultiply;
WebPApplyAlphaMultiply4444 = ApplyAlphaMultiply_16b;
- WebPDispatchAlpha = DispatchAlpha;
- WebPDispatchAlphaToGreen = DispatchAlphaToGreen;
- WebPExtractAlpha = ExtractAlpha;
+
+ WebPDispatchAlpha = DispatchAlpha_C;
+ WebPDispatchAlphaToGreen = DispatchAlphaToGreen_C;
+ WebPExtractAlpha = ExtractAlpha_C;
+ WebPExtractGreen = ExtractGreen_C;
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
if (VP8GetCPUInfo != NULL) {
@@ -373,6 +382,11 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessing(void) {
#endif
}
#endif
+#if defined(WEBP_USE_NEON)
+ if (VP8GetCPUInfo(kNEON)) {
+ WebPInitAlphaProcessingNEON();
+ }
+#endif
#if defined(WEBP_USE_MIPS_DSP_R2)
if (VP8GetCPUInfo(kMIPSdspR2)) {
WebPInitAlphaProcessingMIPSdspR2();
diff --git a/src/3rdparty/libwebp/src/dsp/alpha_processing_neon.c b/src/3rdparty/libwebp/src/dsp/alpha_processing_neon.c
new file mode 100644
index 0000000..606a401
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/alpha_processing_neon.c
@@ -0,0 +1,191 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Utilities for processing transparent channel, NEON version.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_NEON)
+
+#include "./neon.h"
+
+//------------------------------------------------------------------------------
+
+#define MULTIPLIER(a) ((a) * 0x8081)
+#define PREMULTIPLY(x, m) (((x) * (m)) >> 23)
+
+#define MULTIPLY_BY_ALPHA(V, ALPHA, OTHER) do { \
+ const uint8x8_t alpha = (V).val[(ALPHA)]; \
+ const uint16x8_t r1 = vmull_u8((V).val[1], alpha); \
+ const uint16x8_t g1 = vmull_u8((V).val[2], alpha); \
+ const uint16x8_t b1 = vmull_u8((V).val[(OTHER)], alpha); \
+ /* we use: v / 255 = (v + 1 + (v >> 8)) >> 8 */ \
+ const uint16x8_t r2 = vsraq_n_u16(r1, r1, 8); \
+ const uint16x8_t g2 = vsraq_n_u16(g1, g1, 8); \
+ const uint16x8_t b2 = vsraq_n_u16(b1, b1, 8); \
+ const uint16x8_t r3 = vaddq_u16(r2, kOne); \
+ const uint16x8_t g3 = vaddq_u16(g2, kOne); \
+ const uint16x8_t b3 = vaddq_u16(b2, kOne); \
+ (V).val[1] = vshrn_n_u16(r3, 8); \
+ (V).val[2] = vshrn_n_u16(g3, 8); \
+ (V).val[(OTHER)] = vshrn_n_u16(b3, 8); \
+} while (0)
+
+static void ApplyAlphaMultiply_NEON(uint8_t* rgba, int alpha_first,
+ int w, int h, int stride) {
+ const uint16x8_t kOne = vdupq_n_u16(1u);
+ while (h-- > 0) {
+ uint32_t* const rgbx = (uint32_t*)rgba;
+ int i = 0;
+ if (alpha_first) {
+ for (; i + 8 <= w; i += 8) {
+ // load aaaa...|rrrr...|gggg...|bbbb...
+ uint8x8x4_t RGBX = vld4_u8((const uint8_t*)(rgbx + i));
+ MULTIPLY_BY_ALPHA(RGBX, 0, 3);
+ vst4_u8((uint8_t*)(rgbx + i), RGBX);
+ }
+ } else {
+ for (; i + 8 <= w; i += 8) {
+ uint8x8x4_t RGBX = vld4_u8((const uint8_t*)(rgbx + i));
+ MULTIPLY_BY_ALPHA(RGBX, 3, 0);
+ vst4_u8((uint8_t*)(rgbx + i), RGBX);
+ }
+ }
+ // Finish with left-overs.
+ for (; i < w; ++i) {
+ uint8_t* const rgb = rgba + (alpha_first ? 1 : 0);
+ const uint8_t* const alpha = rgba + (alpha_first ? 0 : 3);
+ const uint32_t a = alpha[4 * i];
+ if (a != 0xff) {
+ const uint32_t mult = MULTIPLIER(a);
+ rgb[4 * i + 0] = PREMULTIPLY(rgb[4 * i + 0], mult);
+ rgb[4 * i + 1] = PREMULTIPLY(rgb[4 * i + 1], mult);
+ rgb[4 * i + 2] = PREMULTIPLY(rgb[4 * i + 2], mult);
+ }
+ }
+ rgba += stride;
+ }
+}
+#undef MULTIPLY_BY_ALPHA
+#undef MULTIPLIER
+#undef PREMULTIPLY
+
+//------------------------------------------------------------------------------
+
+static int DispatchAlpha_NEON(const uint8_t* alpha, int alpha_stride,
+ int width, int height,
+ uint8_t* dst, int dst_stride) {
+ uint32_t alpha_mask = 0xffffffffu;
+ uint8x8_t mask8 = vdup_n_u8(0xff);
+ uint32_t tmp[2];
+ int i, j;
+ for (j = 0; j < height; ++j) {
+ // We don't know if alpha is first or last in dst[] (depending on rgbA/Argb
+ // mode). So we must be sure dst[4*i + 8 - 1] is writable for the store.
+ // Hence the test with 'width - 1' instead of just 'width'.
+ for (i = 0; i + 8 <= width - 1; i += 8) {
+ uint8x8x4_t rgbX = vld4_u8((const uint8_t*)(dst + 4 * i));
+ const uint8x8_t alphas = vld1_u8(alpha + i);
+ rgbX.val[0] = alphas;
+ vst4_u8((uint8_t*)(dst + 4 * i), rgbX);
+ mask8 = vand_u8(mask8, alphas);
+ }
+ for (; i < width; ++i) {
+ const uint32_t alpha_value = alpha[i];
+ dst[4 * i] = alpha_value;
+ alpha_mask &= alpha_value;
+ }
+ alpha += alpha_stride;
+ dst += dst_stride;
+ }
+ vst1_u8((uint8_t*)tmp, mask8);
+ alpha_mask &= tmp[0];
+ alpha_mask &= tmp[1];
+ return (alpha_mask != 0xffffffffu);
+}
+
+static void DispatchAlphaToGreen_NEON(const uint8_t* alpha, int alpha_stride,
+ int width, int height,
+ uint32_t* dst, int dst_stride) {
+ int i, j;
+ uint8x8x4_t greens; // leave A/R/B channels zero'd.
+ greens.val[0] = vdup_n_u8(0);
+ greens.val[2] = vdup_n_u8(0);
+ greens.val[3] = vdup_n_u8(0);
+ for (j = 0; j < height; ++j) {
+ for (i = 0; i + 8 <= width; i += 8) {
+ greens.val[1] = vld1_u8(alpha + i);
+ vst4_u8((uint8_t*)(dst + i), greens);
+ }
+ for (; i < width; ++i) dst[i] = alpha[i] << 8;
+ alpha += alpha_stride;
+ dst += dst_stride;
+ }
+}
+
+static int ExtractAlpha_NEON(const uint8_t* argb, int argb_stride,
+ int width, int height,
+ uint8_t* alpha, int alpha_stride) {
+ uint32_t alpha_mask = 0xffffffffu;
+ uint8x8_t mask8 = vdup_n_u8(0xff);
+ uint32_t tmp[2];
+ int i, j;
+ for (j = 0; j < height; ++j) {
+ // We don't know if alpha is first or last in dst[] (depending on rgbA/Argb
+ // mode). So we must be sure dst[4*i + 8 - 1] is writable for the store.
+ // Hence the test with 'width - 1' instead of just 'width'.
+ for (i = 0; i + 8 <= width - 1; i += 8) {
+ const uint8x8x4_t rgbX = vld4_u8((const uint8_t*)(argb + 4 * i));
+ const uint8x8_t alphas = rgbX.val[0];
+ vst1_u8((uint8_t*)(alpha + i), alphas);
+ mask8 = vand_u8(mask8, alphas);
+ }
+ for (; i < width; ++i) {
+ alpha[i] = argb[4 * i];
+ alpha_mask &= alpha[i];
+ }
+ argb += argb_stride;
+ alpha += alpha_stride;
+ }
+ vst1_u8((uint8_t*)tmp, mask8);
+ alpha_mask &= tmp[0];
+ alpha_mask &= tmp[1];
+ return (alpha_mask == 0xffffffffu);
+}
+
+static void ExtractGreen_NEON(const uint32_t* argb,
+ uint8_t* alpha, int size) {
+ int i;
+ for (i = 0; i + 16 <= size; i += 16) {
+ const uint8x16x4_t rgbX = vld4q_u8((const uint8_t*)(argb + i));
+ const uint8x16_t greens = rgbX.val[1];
+ vst1q_u8(alpha + i, greens);
+ }
+ for (; i < size; ++i) alpha[i] = (argb[i] >> 8) & 0xff;
+}
+
+//------------------------------------------------------------------------------
+
+extern void WebPInitAlphaProcessingNEON(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingNEON(void) {
+ WebPApplyAlphaMultiply = ApplyAlphaMultiply_NEON;
+ WebPDispatchAlpha = DispatchAlpha_NEON;
+ WebPDispatchAlphaToGreen = DispatchAlphaToGreen_NEON;
+ WebPExtractAlpha = ExtractAlpha_NEON;
+ WebPExtractGreen = ExtractGreen_NEON;
+}
+
+#else // !WEBP_USE_NEON
+
+WEBP_DSP_INIT_STUB(WebPInitAlphaProcessingNEON)
+
+#endif // WEBP_USE_NEON
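The new NEON file leans on two exact division-by-255 identities: the vector path uses v / 255 == (v + 1 + (v >> 8)) >> 8 (noted in the MULTIPLY_BY_ALPHA comment), and the scalar left-over path uses the MULTIPLIER/PREMULTIPLY pair, i.e. (x * a) / 255 == (x * (a * 0x8081)) >> 23. Both hold exactly for any product of two bytes, which a brute-force check confirms:

```c
/* Sketch: verify the two division-by-255 identities relied on above,
 * for every product v = x * a of two 8-bit values. */
#include <stdint.h>
#include <stdio.h>

int main(void) {
  int x, a;
  for (x = 0; x < 256; ++x) {
    for (a = 0; a < 256; ++a) {
      const uint32_t v = (uint32_t)x * a;
      const uint32_t exact = v / 255;
      const uint32_t shift_form = (v + 1 + (v >> 8)) >> 8;           /* NEON  */
      const uint32_t mult_form = (x * (uint32_t)(a * 0x8081)) >> 23; /* scalar */
      if (exact != shift_form || exact != mult_form) {
        printf("mismatch at x=%d a=%d\n", x, a);
        return 1;
      }
    }
  }
  printf("both identities exact for all byte products\n");
  return 0;
}
```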
diff --git a/src/3rdparty/libwebp/src/dsp/alpha_processing_sse2.c b/src/3rdparty/libwebp/src/dsp/alpha_processing_sse2.c
index 5acb481..83dc559 100644
--- a/src/3rdparty/libwebp/src/dsp/alpha_processing_sse2.c
+++ b/src/3rdparty/libwebp/src/dsp/alpha_processing_sse2.c
@@ -150,46 +150,46 @@ static int ExtractAlpha(const uint8_t* argb, int argb_stride,
#define PREMULTIPLY(x, m) (((x) * (m)) >> 23)
// We can't use a 'const int' for the SHUFFLE value, because it has to be an
-// immediate in the _mm_shufflexx_epi16() instruction. We really a macro here.
-#define APPLY_ALPHA(RGBX, SHUFFLE, MASK, MULT) do { \
- const __m128i argb0 = _mm_loadl_epi64((__m128i*)&(RGBX)); \
- const __m128i argb1 = _mm_unpacklo_epi8(argb0, zero); \
- const __m128i alpha0 = _mm_and_si128(argb1, MASK); \
- const __m128i alpha1 = _mm_shufflelo_epi16(alpha0, SHUFFLE); \
- const __m128i alpha2 = _mm_shufflehi_epi16(alpha1, SHUFFLE); \
- /* alpha2 = [0 a0 a0 a0][0 a1 a1 a1] */ \
- const __m128i scale0 = _mm_mullo_epi16(alpha2, MULT); \
- const __m128i scale1 = _mm_mulhi_epu16(alpha2, MULT); \
- const __m128i argb2 = _mm_mulhi_epu16(argb1, scale0); \
- const __m128i argb3 = _mm_mullo_epi16(argb1, scale1); \
- const __m128i argb4 = _mm_adds_epu16(argb2, argb3); \
- const __m128i argb5 = _mm_srli_epi16(argb4, 7); \
- const __m128i argb6 = _mm_or_si128(argb5, alpha0); \
- const __m128i argb7 = _mm_packus_epi16(argb6, zero); \
- _mm_storel_epi64((__m128i*)&(RGBX), argb7); \
+// immediate in the _mm_shufflexx_epi16() instruction. We really need a macro.
+// We use: v / 255 = (v * 0x8081) >> 23, where v = alpha * {r,g,b} is a 16bit
+// value.
+#define APPLY_ALPHA(RGBX, SHUFFLE) do { \
+ const __m128i argb0 = _mm_loadu_si128((const __m128i*)&(RGBX)); \
+ const __m128i argb1_lo = _mm_unpacklo_epi8(argb0, zero); \
+ const __m128i argb1_hi = _mm_unpackhi_epi8(argb0, zero); \
+ const __m128i alpha0_lo = _mm_or_si128(argb1_lo, kMask); \
+ const __m128i alpha0_hi = _mm_or_si128(argb1_hi, kMask); \
+ const __m128i alpha1_lo = _mm_shufflelo_epi16(alpha0_lo, SHUFFLE); \
+ const __m128i alpha1_hi = _mm_shufflelo_epi16(alpha0_hi, SHUFFLE); \
+ const __m128i alpha2_lo = _mm_shufflehi_epi16(alpha1_lo, SHUFFLE); \
+ const __m128i alpha2_hi = _mm_shufflehi_epi16(alpha1_hi, SHUFFLE); \
+ /* alpha2 = [ff a0 a0 a0][ff a1 a1 a1] */ \
+ const __m128i A0_lo = _mm_mullo_epi16(alpha2_lo, argb1_lo); \
+ const __m128i A0_hi = _mm_mullo_epi16(alpha2_hi, argb1_hi); \
+ const __m128i A1_lo = _mm_mulhi_epu16(A0_lo, kMult); \
+ const __m128i A1_hi = _mm_mulhi_epu16(A0_hi, kMult); \
+ const __m128i A2_lo = _mm_srli_epi16(A1_lo, 7); \
+ const __m128i A2_hi = _mm_srli_epi16(A1_hi, 7); \
+ const __m128i A3 = _mm_packus_epi16(A2_lo, A2_hi); \
+ _mm_storeu_si128((__m128i*)&(RGBX), A3); \
} while (0)
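
The 0x8081 constant implements an exact division by 255: since 255 * 0x8081 = 2^23 + 127,
the identity (v * 0x8081) >> 23 == v / 255 holds for every 16-bit v, and the macro realizes
the >> 23 as _mm_mulhi_epu16 (a >> 16) followed by the explicit >> 7. A brute-force
self-check of the identity (illustrative):

    #include <assert.h>
    #include <stdint.h>

    // Exhaustively verify the fixed-point division used by APPLY_ALPHA,
    // where v = alpha * {r,g,b} is a 16-bit value.
    static void CheckDiv255Trick(void) {
      uint32_t v;
      for (v = 0; v <= 0xffffu; ++v) {
        assert(((v * 0x8081u) >> 23) == v / 255u);
      }
    }
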
-static void ApplyAlphaMultiply(uint8_t* rgba, int alpha_first,
- int w, int h, int stride) {
+static void ApplyAlphaMultiply_SSE2(uint8_t* rgba, int alpha_first,
+ int w, int h, int stride) {
const __m128i zero = _mm_setzero_si128();
- const int kSpan = 2;
- const int w2 = w & ~(kSpan - 1);
+ const __m128i kMult = _mm_set1_epi16(0x8081u);
+ const __m128i kMask = _mm_set_epi16(0, 0xff, 0xff, 0, 0, 0xff, 0xff, 0);
+ const int kSpan = 4;
while (h-- > 0) {
uint32_t* const rgbx = (uint32_t*)rgba;
int i;
if (!alpha_first) {
- const __m128i kMask = _mm_set_epi16(0xff, 0, 0, 0, 0xff, 0, 0, 0);
- const __m128i kMult =
- _mm_set_epi16(0, 0x8081, 0x8081, 0x8081, 0, 0x8081, 0x8081, 0x8081);
- for (i = 0; i < w2; i += kSpan) {
- APPLY_ALPHA(rgbx[i], _MM_SHUFFLE(0, 3, 3, 3), kMask, kMult);
+ for (i = 0; i + kSpan <= w; i += kSpan) {
+ APPLY_ALPHA(rgbx[i], _MM_SHUFFLE(2, 3, 3, 3));
}
} else {
- const __m128i kMask = _mm_set_epi16(0, 0, 0, 0xff, 0, 0, 0, 0xff);
- const __m128i kMult =
- _mm_set_epi16(0x8081, 0x8081, 0x8081, 0, 0x8081, 0x8081, 0x8081, 0);
- for (i = 0; i < w2; i += kSpan) {
- APPLY_ALPHA(rgbx[i], _MM_SHUFFLE(0, 0, 0, 3), kMask, kMult);
+ for (i = 0; i + kSpan <= w; i += kSpan) {
+ APPLY_ALPHA(rgbx[i], _MM_SHUFFLE(0, 0, 0, 1));
}
}
// Finish with left-overs.
@@ -213,64 +213,51 @@ static void ApplyAlphaMultiply(uint8_t* rgba, int alpha_first,
// -----------------------------------------------------------------------------
// Apply alpha value to rows
-// We use: kINV255 = (1 << 24) / 255 = 0x010101
-// So: a * kINV255 = (a << 16) | [(a << 8) | a]
-// -> _mm_mulhi_epu16() takes care of the (a<<16) part,
-// and _mm_mullo_epu16(a * 0x0101,...) takes care of the "(a << 8) | a" one.
-
-static void MultARGBRow(uint32_t* const ptr, int width, int inverse) {
+static void MultARGBRow_SSE2(uint32_t* const ptr, int width, int inverse) {
int x = 0;
if (!inverse) {
const int kSpan = 2;
const __m128i zero = _mm_setzero_si128();
- const __m128i kRound =
- _mm_set_epi16(0, 1 << 7, 1 << 7, 1 << 7, 0, 1 << 7, 1 << 7, 1 << 7);
- const __m128i kMult =
- _mm_set_epi16(0, 0x0101, 0x0101, 0x0101, 0, 0x0101, 0x0101, 0x0101);
- const __m128i kOne64 = _mm_set_epi16(1u << 8, 0, 0, 0, 1u << 8, 0, 0, 0);
- const int w2 = width & ~(kSpan - 1);
- for (x = 0; x < w2; x += kSpan) {
- const __m128i argb0 = _mm_loadl_epi64((__m128i*)&ptr[x]);
- const __m128i argb1 = _mm_unpacklo_epi8(argb0, zero);
- const __m128i tmp0 = _mm_shufflelo_epi16(argb1, _MM_SHUFFLE(3, 3, 3, 3));
- const __m128i tmp1 = _mm_shufflehi_epi16(tmp0, _MM_SHUFFLE(3, 3, 3, 3));
- const __m128i tmp2 = _mm_srli_epi64(tmp1, 16);
- const __m128i scale0 = _mm_mullo_epi16(tmp1, kMult);
- const __m128i scale1 = _mm_or_si128(tmp2, kOne64);
- const __m128i argb2 = _mm_mulhi_epu16(argb1, scale0);
- const __m128i argb3 = _mm_mullo_epi16(argb1, scale1);
- const __m128i argb4 = _mm_adds_epu16(argb2, argb3);
- const __m128i argb5 = _mm_adds_epu16(argb4, kRound);
- const __m128i argb6 = _mm_srli_epi16(argb5, 8);
- const __m128i argb7 = _mm_packus_epi16(argb6, zero);
- _mm_storel_epi64((__m128i*)&ptr[x], argb7);
+ const __m128i k128 = _mm_set1_epi16(128);
+ const __m128i kMult = _mm_set1_epi16(0x0101);
+ const __m128i kMask = _mm_set_epi16(0, 0xff, 0, 0, 0, 0xff, 0, 0);
+ for (x = 0; x + kSpan <= width; x += kSpan) {
+ // To compute 'result = (int)(a * x / 255. + .5)', we use:
+ // tmp = a * v + 128, result = (tmp * 0x0101u) >> 16
+ const __m128i A0 = _mm_loadl_epi64((const __m128i*)&ptr[x]);
+ const __m128i A1 = _mm_unpacklo_epi8(A0, zero);
+ const __m128i A2 = _mm_or_si128(A1, kMask);
+ const __m128i A3 = _mm_shufflelo_epi16(A2, _MM_SHUFFLE(2, 3, 3, 3));
+ const __m128i A4 = _mm_shufflehi_epi16(A3, _MM_SHUFFLE(2, 3, 3, 3));
+ // here, A4 = [ff a0 a0 a0][ff a1 a1 a1]
+ const __m128i A5 = _mm_mullo_epi16(A4, A1);
+ const __m128i A6 = _mm_add_epi16(A5, k128);
+ const __m128i A7 = _mm_mulhi_epu16(A6, kMult);
+ const __m128i A10 = _mm_packus_epi16(A7, zero);
+ _mm_storel_epi64((__m128i*)&ptr[x], A10);
}
}
width -= x;
if (width > 0) WebPMultARGBRowC(ptr + x, width, inverse);
}
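
The in-loop comment compresses a classic rounding identity: for v = a * x with v in
[0, 255 * 255], (int)(v / 255. + .5) == ((v + 128) * 0x0101) >> 16, which is why one
mullo, one add and one mulhi now replace the old mulhi/mullo/adds/srli chain. A
brute-force check (illustrative):

    #include <assert.h>
    #include <stdint.h>

    // Verify the rounded division by 255 used in MultARGBRow_SSE2 and
    // MultRow_SSE2: tmp = v + 128, result = (tmp * 0x0101) >> 16.
    static void CheckRoundedDiv255(void) {
      uint32_t v;
      for (v = 0; v <= 255u * 255u; ++v) {
        const uint32_t rounded = (uint32_t)(v / 255. + .5);
        assert((((v + 128u) * 0x0101u) >> 16) == rounded);
      }
    }
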
-static void MultRow(uint8_t* const ptr, const uint8_t* const alpha,
- int width, int inverse) {
+static void MultRow_SSE2(uint8_t* const ptr, const uint8_t* const alpha,
+ int width, int inverse) {
int x = 0;
if (!inverse) {
- const int kSpan = 8;
const __m128i zero = _mm_setzero_si128();
- const __m128i kRound = _mm_set1_epi16(1 << 7);
- const int w2 = width & ~(kSpan - 1);
- for (x = 0; x < w2; x += kSpan) {
+ const __m128i k128 = _mm_set1_epi16(128);
+ const __m128i kMult = _mm_set1_epi16(0x0101);
+ for (x = 0; x + 8 <= width; x += 8) {
const __m128i v0 = _mm_loadl_epi64((__m128i*)&ptr[x]);
+ const __m128i a0 = _mm_loadl_epi64((const __m128i*)&alpha[x]);
const __m128i v1 = _mm_unpacklo_epi8(v0, zero);
- const __m128i alpha0 = _mm_loadl_epi64((const __m128i*)&alpha[x]);
- const __m128i alpha1 = _mm_unpacklo_epi8(alpha0, zero);
- const __m128i alpha2 = _mm_unpacklo_epi8(alpha0, alpha0);
- const __m128i v2 = _mm_mulhi_epu16(v1, alpha2);
- const __m128i v3 = _mm_mullo_epi16(v1, alpha1);
- const __m128i v4 = _mm_adds_epu16(v2, v3);
- const __m128i v5 = _mm_adds_epu16(v4, kRound);
- const __m128i v6 = _mm_srli_epi16(v5, 8);
- const __m128i v7 = _mm_packus_epi16(v6, zero);
- _mm_storel_epi64((__m128i*)&ptr[x], v7);
+ const __m128i a1 = _mm_unpacklo_epi8(a0, zero);
+ const __m128i v2 = _mm_mullo_epi16(v1, a1);
+ const __m128i v3 = _mm_add_epi16(v2, k128);
+ const __m128i v4 = _mm_mulhi_epu16(v3, kMult);
+ const __m128i v5 = _mm_packus_epi16(v4, zero);
+ _mm_storel_epi64((__m128i*)&ptr[x], v5);
}
}
width -= x;
@@ -283,9 +270,9 @@ static void MultRow(uint8_t* const ptr, const uint8_t* const alpha,
extern void WebPInitAlphaProcessingSSE2(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingSSE2(void) {
- WebPMultARGBRow = MultARGBRow;
- WebPMultRow = MultRow;
- WebPApplyAlphaMultiply = ApplyAlphaMultiply;
+ WebPMultARGBRow = MultARGBRow_SSE2;
+ WebPMultRow = MultRow_SSE2;
+ WebPApplyAlphaMultiply = ApplyAlphaMultiply_SSE2;
WebPDispatchAlpha = DispatchAlpha;
WebPDispatchAlphaToGreen = DispatchAlphaToGreen;
WebPExtractAlpha = ExtractAlpha;
diff --git a/src/3rdparty/libwebp/src/dsp/common_sse2.h b/src/3rdparty/libwebp/src/dsp/common_sse2.h
index 7cea13f..995d7cf 100644
--- a/src/3rdparty/libwebp/src/dsp/common_sse2.h
+++ b/src/3rdparty/libwebp/src/dsp/common_sse2.h
@@ -100,6 +100,91 @@ static WEBP_INLINE void VP8Transpose_2_4x4_16b(
// a03 a13 a23 a33 b03 b13 b23 b33
}
+//------------------------------------------------------------------------------
+// Channel mixing.
+
+// Function used several times in VP8PlanarTo24b.
+// It samples the input buffer as follows: every other unsigned char is stored
+// at the beginning of the output, while the other half is stored at the end.
+#define VP8PlanarTo24bHelper(IN, OUT) \
+ do { \
+ const __m128i v_mask = _mm_set1_epi16(0x00ff); \
+ /* Take one every two upper 8b values.*/ \
+ (OUT##0) = _mm_packus_epi16(_mm_and_si128((IN##0), v_mask), \
+ _mm_and_si128((IN##1), v_mask)); \
+ (OUT##1) = _mm_packus_epi16(_mm_and_si128((IN##2), v_mask), \
+ _mm_and_si128((IN##3), v_mask)); \
+ (OUT##2) = _mm_packus_epi16(_mm_and_si128((IN##4), v_mask), \
+ _mm_and_si128((IN##5), v_mask)); \
+ /* Take one every two lower 8b values.*/ \
+ (OUT##3) = _mm_packus_epi16(_mm_srli_epi16((IN##0), 8), \
+ _mm_srli_epi16((IN##1), 8)); \
+ (OUT##4) = _mm_packus_epi16(_mm_srli_epi16((IN##2), 8), \
+ _mm_srli_epi16((IN##3), 8)); \
+ (OUT##5) = _mm_packus_epi16(_mm_srli_epi16((IN##4), 8), \
+ _mm_srli_epi16((IN##5), 8)); \
+ } while (0)
+
+// Pack the planar buffers
+// rrrr... rrrr... gggg... gggg... bbbb... bbbb....
+// triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ...
+static WEBP_INLINE void VP8PlanarTo24b(__m128i* const in0, __m128i* const in1,
+ __m128i* const in2, __m128i* const in3,
+ __m128i* const in4, __m128i* const in5) {
+  // The input is 6 registers of sixteen 8b values but, for the sake of
+  // explanation, let's take 6 registers of four 8b values each.
+  // To pack, we keep taking every other 8b integer and moving it around as
+  // follows:
+  // Input:
+  // r0r1r2r3 | r4r5r6r7 | g0g1g2g3 | g4g5g6g7 | b0b1b2b3 | b4b5b6b7
+  // Split the 6 registers into two sets of 3: the first set holds the
+  // even-indexed bytes, the second the odd-indexed ones:
+ // r0r2r4r6 | g0g2g4g6 | b0b2b4b6 | r1r3r5r7 | g1g3g5g7 | b1b3b5b7
+ // Repeat the same permutations twice more:
+ // r0r4g0g4 | b0b4r1r5 | g1g5b1b5 | r2r6g2g6 | b2b6r3r7 | g3g7b3b7
+ // r0g0b0r1 | g1b1r2g2 | b2r3g3b3 | r4g4b4r5 | g5b5r6g6 | b6r7g7b7
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+ VP8PlanarTo24bHelper(*in, tmp);
+ VP8PlanarTo24bHelper(tmp, *in);
+ VP8PlanarTo24bHelper(*in, tmp);
+ // We need to do it two more times than the example as we have sixteen bytes.
+ {
+ __m128i out0, out1, out2, out3, out4, out5;
+ VP8PlanarTo24bHelper(tmp, out);
+ VP8PlanarTo24bHelper(out, *in);
+ }
+}
+
+#undef VP8PlanarTo24bHelper
+
+// Convert four packed four-channel buffers like argbargbargbargb... into the
+// split channels aaaaa ... rrrr ... gggg .... bbbbb ......
+static WEBP_INLINE void VP8L32bToPlanar(__m128i* const in0,
+ __m128i* const in1,
+ __m128i* const in2,
+ __m128i* const in3) {
+ // Column-wise transpose.
+ const __m128i A0 = _mm_unpacklo_epi8(*in0, *in1);
+ const __m128i A1 = _mm_unpackhi_epi8(*in0, *in1);
+ const __m128i A2 = _mm_unpacklo_epi8(*in2, *in3);
+ const __m128i A3 = _mm_unpackhi_epi8(*in2, *in3);
+ const __m128i B0 = _mm_unpacklo_epi8(A0, A1);
+ const __m128i B1 = _mm_unpackhi_epi8(A0, A1);
+ const __m128i B2 = _mm_unpacklo_epi8(A2, A3);
+ const __m128i B3 = _mm_unpackhi_epi8(A2, A3);
+ // C0 = g7 g6 ... g1 g0 | b7 b6 ... b1 b0
+ // C1 = a7 a6 ... a1 a0 | r7 r6 ... r1 r0
+ const __m128i C0 = _mm_unpacklo_epi8(B0, B1);
+ const __m128i C1 = _mm_unpackhi_epi8(B0, B1);
+ const __m128i C2 = _mm_unpacklo_epi8(B2, B3);
+ const __m128i C3 = _mm_unpackhi_epi8(B2, B3);
+ // Gather the channels.
+ *in0 = _mm_unpackhi_epi64(C1, C3);
+ *in1 = _mm_unpacklo_epi64(C1, C3);
+ *in2 = _mm_unpackhi_epi64(C0, C2);
+ *in3 = _mm_unpacklo_epi64(C0, C2);
+}
+
#endif // WEBP_USE_SSE2
#ifdef __cplusplus
diff --git a/src/3rdparty/libwebp/src/dsp/cost.c b/src/3rdparty/libwebp/src/dsp/cost.c
index fe72d26..58ddea7 100644
--- a/src/3rdparty/libwebp/src/dsp/cost.c
+++ b/src/3rdparty/libwebp/src/dsp/cost.c
@@ -10,7 +10,7 @@
// Author: Skal (pascal.massimino@gmail.com)
#include "./dsp.h"
-#include "../enc/cost.h"
+#include "../enc/cost_enc.h"
//------------------------------------------------------------------------------
// Boolean-cost cost table
diff --git a/src/3rdparty/libwebp/src/dsp/cost_mips32.c b/src/3rdparty/libwebp/src/dsp/cost_mips32.c
index d1e240e..3102da8 100644
--- a/src/3rdparty/libwebp/src/dsp/cost_mips32.c
+++ b/src/3rdparty/libwebp/src/dsp/cost_mips32.c
@@ -13,7 +13,7 @@
#if defined(WEBP_USE_MIPS32)
-#include "../enc/cost.h"
+#include "../enc/cost_enc.h"
static int GetResidualCost(int ctx0, const VP8Residual* const res) {
int temp0, temp1;
diff --git a/src/3rdparty/libwebp/src/dsp/cost_mips_dsp_r2.c b/src/3rdparty/libwebp/src/dsp/cost_mips_dsp_r2.c
index ce64067..6ec8aeb 100644
--- a/src/3rdparty/libwebp/src/dsp/cost_mips_dsp_r2.c
+++ b/src/3rdparty/libwebp/src/dsp/cost_mips_dsp_r2.c
@@ -13,7 +13,7 @@
#if defined(WEBP_USE_MIPS_DSP_R2)
-#include "../enc/cost.h"
+#include "../enc/cost_enc.h"
static int GetResidualCost(int ctx0, const VP8Residual* const res) {
int temp0, temp1;
diff --git a/src/3rdparty/libwebp/src/dsp/cost_sse2.c b/src/3rdparty/libwebp/src/dsp/cost_sse2.c
index 0cb1c1f..421d51f 100644
--- a/src/3rdparty/libwebp/src/dsp/cost_sse2.c
+++ b/src/3rdparty/libwebp/src/dsp/cost_sse2.c
@@ -16,8 +16,8 @@
#if defined(WEBP_USE_SSE2)
#include <emmintrin.h>
-#include "../enc/cost.h"
-#include "../enc/vp8enci.h"
+#include "../enc/cost_enc.h"
+#include "../enc/vp8i_enc.h"
#include "../utils/utils.h"
//------------------------------------------------------------------------------
diff --git a/src/3rdparty/libwebp/src/dsp/cpu.c b/src/3rdparty/libwebp/src/dsp/cpu.c
index cbb08db..b5583b6 100644
--- a/src/3rdparty/libwebp/src/dsp/cpu.c
+++ b/src/3rdparty/libwebp/src/dsp/cpu.c
@@ -95,26 +95,62 @@ static WEBP_INLINE uint64_t xgetbv(void) {
#endif
#if defined(__i386__) || defined(__x86_64__) || defined(WEBP_MSC_SSE2)
+
+// helper function for run-time detection of slow SSSE3 platforms
+static int CheckSlowModel(int info) {
+  // Table listing display models with longer latencies for the bsr instruction
+  // (i.e. 10/16 cycles on these models instead of the usual 2) and for some
+  // SSSE3 instructions like pshufb.
+ // Refer to Intel 64 and IA-32 Architectures Optimization Reference Manual.
+ static const uint8_t kSlowModels[] = {
+ 0x37, 0x4a, 0x4d, // Silvermont Microarchitecture
+ 0x1c, 0x26, 0x27 // Atom Microarchitecture
+ };
+ const uint32_t model = ((info & 0xf0000) >> 12) | ((info >> 4) & 0xf);
+ const uint32_t family = (info >> 8) & 0xf;
+ if (family == 0x06) {
+ size_t i;
+ for (i = 0; i < sizeof(kSlowModels) / sizeof(kSlowModels[0]); ++i) {
+ if (model == kSlowModels[i]) return 1;
+ }
+ }
+ return 0;
+}
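
CheckSlowModel() reassembles Intel's DisplayFamily / DisplayModel encoding from the
leaf-1 EAX value: the extended-model nibble (bits 19:16) becomes the high nibble of
'model' and the base model (bits 7:4) the low one. A worked decode, using a hypothetical
but Silvermont-shaped signature:

    #include <assert.h>
    #include <stdint.h>

    // Worked example of the bit extraction above (the EAX value is made up).
    static void CheckModelDecode(void) {
      const uint32_t info = 0x000406d0u;
      const uint32_t family = (info >> 8) & 0xfu;                // 0x6
      const uint32_t model =
          ((info & 0xf0000u) >> 12) | ((info >> 4) & 0xfu);      // 0x40 | 0x0d
      assert(family == 0x06);
      assert(model == 0x4d);   // matches a Silvermont entry in the table above
    }
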
+
static int x86CPUInfo(CPUFeature feature) {
int max_cpuid_value;
int cpu_info[4];
+ int is_intel = 0;
// get the highest feature value cpuid supports
GetCPUInfo(cpu_info, 0);
max_cpuid_value = cpu_info[0];
if (max_cpuid_value < 1) {
return 0;
+ } else {
+ const int VENDOR_ID_INTEL_EBX = 0x756e6547; // uneG
+ const int VENDOR_ID_INTEL_EDX = 0x49656e69; // Ieni
+ const int VENDOR_ID_INTEL_ECX = 0x6c65746e; // letn
+ is_intel = (cpu_info[1] == VENDOR_ID_INTEL_EBX &&
+ cpu_info[2] == VENDOR_ID_INTEL_ECX &&
+ cpu_info[3] == VENDOR_ID_INTEL_EDX); // genuine Intel?
}
GetCPUInfo(cpu_info, 1);
if (feature == kSSE2) {
- return 0 != (cpu_info[3] & 0x04000000);
+ return !!(cpu_info[3] & (1 << 26));
}
if (feature == kSSE3) {
- return 0 != (cpu_info[2] & 0x00000001);
+ return !!(cpu_info[2] & (1 << 0));
+ }
+ if (feature == kSlowSSSE3) {
+ if (is_intel && (cpu_info[2] & (1 << 0))) { // SSSE3?
+ return CheckSlowModel(cpu_info[0]);
+ }
+ return 0;
}
+
if (feature == kSSE4_1) {
- return 0 != (cpu_info[2] & 0x00080000);
+ return !!(cpu_info[2] & (1 << 19));
}
if (feature == kAVX) {
// bits 27 (OSXSAVE) & 28 (256-bit AVX)
@@ -126,7 +162,7 @@ static int x86CPUInfo(CPUFeature feature) {
if (feature == kAVX2) {
if (x86CPUInfo(kAVX) && max_cpuid_value >= 7) {
GetCPUInfo(cpu_info, 7);
- return ((cpu_info[1] & 0x00000020) == 0x00000020);
+ return !!(cpu_info[1] & (1 << 5));
}
}
return 0;
@@ -184,4 +220,3 @@ VP8CPUInfo VP8GetCPUInfo = mipsCPUInfo;
#else
VP8CPUInfo VP8GetCPUInfo = NULL;
#endif
-
diff --git a/src/3rdparty/libwebp/src/dsp/dec.c b/src/3rdparty/libwebp/src/dsp/dec.c
index e92d693..007e985 100644
--- a/src/3rdparty/libwebp/src/dsp/dec.c
+++ b/src/3rdparty/libwebp/src/dsp/dec.c
@@ -12,7 +12,7 @@
// Author: Skal (pascal.massimino@gmail.com)
#include "./dsp.h"
-#include "../dec/vp8i.h"
+#include "../dec/vp8i_dec.h"
#include "../utils/utils.h"
//------------------------------------------------------------------------------
@@ -239,7 +239,7 @@ VP8PredFunc VP8PredLuma16[NUM_B_DC_MODES];
//------------------------------------------------------------------------------
// 4x4
-#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
+#define AVG3(a, b, c) ((uint8_t)(((a) + 2 * (b) + (c) + 2) >> 2))
#define AVG2(a, b) (((a) + (b) + 1) >> 1)
static void VE4(uint8_t* dst) { // vertical
diff --git a/src/3rdparty/libwebp/src/dsp/dec_clip_tables.c b/src/3rdparty/libwebp/src/dsp/dec_clip_tables.c
index 3b6dde8..74ba34c 100644
--- a/src/3rdparty/libwebp/src/dsp/dec_clip_tables.c
+++ b/src/3rdparty/libwebp/src/dsp/dec_clip_tables.c
@@ -63,7 +63,7 @@ static const uint8_t abs0[255 + 255 + 1] = {
0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff
};
-static const int8_t sclip1[1020 + 1020 + 1] = {
+static const uint8_t sclip1[1020 + 1020 + 1] = {
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
@@ -236,7 +236,7 @@ static const int8_t sclip1[1020 + 1020 + 1] = {
0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f
};
-static const int8_t sclip2[112 + 112 + 1] = {
+static const uint8_t sclip2[112 + 112 + 1] = {
0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
@@ -339,8 +339,8 @@ static volatile int tables_ok = 0;
#endif
-const int8_t* const VP8ksclip1 = &sclip1[1020];
-const int8_t* const VP8ksclip2 = &sclip2[112];
+const int8_t* const VP8ksclip1 = (const int8_t*)&sclip1[1020];
+const int8_t* const VP8ksclip2 = (const int8_t*)&sclip2[112];
const uint8_t* const VP8kclip1 = &clip1[255];
const uint8_t* const VP8kabs0 = &abs0[255];
diff --git a/src/3rdparty/libwebp/src/dsp/dec_msa.c b/src/3rdparty/libwebp/src/dsp/dec_msa.c
index f76055c..8d9c98c 100644
--- a/src/3rdparty/libwebp/src/dsp/dec_msa.c
+++ b/src/3rdparty/libwebp/src/dsp/dec_msa.c
@@ -154,6 +154,820 @@ static void TransformAC3(const int16_t* in, uint8_t* dst) {
}
//------------------------------------------------------------------------------
+// Edge filtering functions
+
+#define FLIP_SIGN2(in0, in1, out0, out1) { \
+ out0 = (v16i8)__msa_xori_b(in0, 0x80); \
+ out1 = (v16i8)__msa_xori_b(in1, 0x80); \
+}
+
+#define FLIP_SIGN4(in0, in1, in2, in3, out0, out1, out2, out3) { \
+ FLIP_SIGN2(in0, in1, out0, out1); \
+ FLIP_SIGN2(in2, in3, out2, out3); \
+}
+
+#define FILT_VAL(q0_m, p0_m, mask, filt) do { \
+ v16i8 q0_sub_p0; \
+ q0_sub_p0 = __msa_subs_s_b(q0_m, p0_m); \
+ filt = __msa_adds_s_b(filt, q0_sub_p0); \
+ filt = __msa_adds_s_b(filt, q0_sub_p0); \
+ filt = __msa_adds_s_b(filt, q0_sub_p0); \
+ filt = filt & mask; \
+} while (0)
+
+#define FILT2(q_m, p_m, q, p) do { \
+ u_r = SRAI_H(temp1, 7); \
+ u_r = __msa_sat_s_h(u_r, 7); \
+ u_l = SRAI_H(temp3, 7); \
+ u_l = __msa_sat_s_h(u_l, 7); \
+ u = __msa_pckev_b((v16i8)u_l, (v16i8)u_r); \
+ q_m = __msa_subs_s_b(q_m, u); \
+ p_m = __msa_adds_s_b(p_m, u); \
+ q = __msa_xori_b((v16u8)q_m, 0x80); \
+ p = __msa_xori_b((v16u8)p_m, 0x80); \
+} while (0)
+
+#define LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev) do { \
+ v16i8 p1_m, p0_m, q0_m, q1_m; \
+ v16i8 filt, t1, t2; \
+ const v16i8 cnst4b = __msa_ldi_b(4); \
+ const v16i8 cnst3b = __msa_ldi_b(3); \
+ \
+ FLIP_SIGN4(p1, p0, q0, q1, p1_m, p0_m, q0_m, q1_m); \
+ filt = __msa_subs_s_b(p1_m, q1_m); \
+ filt = filt & hev; \
+ FILT_VAL(q0_m, p0_m, mask, filt); \
+ t1 = __msa_adds_s_b(filt, cnst4b); \
+ t1 = SRAI_B(t1, 3); \
+ t2 = __msa_adds_s_b(filt, cnst3b); \
+ t2 = SRAI_B(t2, 3); \
+ q0_m = __msa_subs_s_b(q0_m, t1); \
+ q0 = __msa_xori_b((v16u8)q0_m, 0x80); \
+ p0_m = __msa_adds_s_b(p0_m, t2); \
+ p0 = __msa_xori_b((v16u8)p0_m, 0x80); \
+ filt = __msa_srari_b(t1, 1); \
+ hev = __msa_xori_b(hev, 0xff); \
+ filt = filt & hev; \
+ q1_m = __msa_subs_s_b(q1_m, filt); \
+ q1 = __msa_xori_b((v16u8)q1_m, 0x80); \
+ p1_m = __msa_adds_s_b(p1_m, filt); \
+ p1 = __msa_xori_b((v16u8)p1_m, 0x80); \
+} while (0)
+
+#define LPF_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev) do { \
+ v16i8 p2_m, p1_m, p0_m, q2_m, q1_m, q0_m; \
+ v16i8 u, filt, t1, t2, filt_sign; \
+ v8i16 filt_r, filt_l, u_r, u_l; \
+ v8i16 temp0, temp1, temp2, temp3; \
+ const v16i8 cnst4b = __msa_ldi_b(4); \
+ const v16i8 cnst3b = __msa_ldi_b(3); \
+ const v8i16 cnst9h = __msa_ldi_h(9); \
+ \
+ FLIP_SIGN4(p1, p0, q0, q1, p1_m, p0_m, q0_m, q1_m); \
+ filt = __msa_subs_s_b(p1_m, q1_m); \
+ FILT_VAL(q0_m, p0_m, mask, filt); \
+ FLIP_SIGN2(p2, q2, p2_m, q2_m); \
+ t2 = filt & hev; \
+ /* filt_val &= ~hev */ \
+ hev = __msa_xori_b(hev, 0xff); \
+ filt = filt & hev; \
+ t1 = __msa_adds_s_b(t2, cnst4b); \
+ t1 = SRAI_B(t1, 3); \
+ t2 = __msa_adds_s_b(t2, cnst3b); \
+ t2 = SRAI_B(t2, 3); \
+ q0_m = __msa_subs_s_b(q0_m, t1); \
+ p0_m = __msa_adds_s_b(p0_m, t2); \
+ filt_sign = __msa_clti_s_b(filt, 0); \
+ ILVRL_B2_SH(filt_sign, filt, filt_r, filt_l); \
+ /* update q2/p2 */ \
+ temp0 = filt_r * cnst9h; \
+ temp1 = ADDVI_H(temp0, 63); \
+ temp2 = filt_l * cnst9h; \
+ temp3 = ADDVI_H(temp2, 63); \
+ FILT2(q2_m, p2_m, q2, p2); \
+ /* update q1/p1 */ \
+ temp1 = temp1 + temp0; \
+ temp3 = temp3 + temp2; \
+ FILT2(q1_m, p1_m, q1, p1); \
+ /* update q0/p0 */ \
+ temp1 = temp1 + temp0; \
+ temp3 = temp3 + temp2; \
+ FILT2(q0_m, p0_m, q0, p0); \
+} while (0)
+
+#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, \
+ q0_in, q1_in, q2_in, q3_in, \
+ limit_in, b_limit_in, thresh_in, \
+ hev_out, mask_out) do { \
+ v16u8 p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m; \
+ v16u8 p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m; \
+ v16u8 flat_out; \
+ \
+ /* absolute subtraction of pixel values */ \
+ p3_asub_p2_m = __msa_asub_u_b(p3_in, p2_in); \
+ p2_asub_p1_m = __msa_asub_u_b(p2_in, p1_in); \
+ p1_asub_p0_m = __msa_asub_u_b(p1_in, p0_in); \
+ q1_asub_q0_m = __msa_asub_u_b(q1_in, q0_in); \
+ q2_asub_q1_m = __msa_asub_u_b(q2_in, q1_in); \
+ q3_asub_q2_m = __msa_asub_u_b(q3_in, q2_in); \
+ p0_asub_q0_m = __msa_asub_u_b(p0_in, q0_in); \
+ p1_asub_q1_m = __msa_asub_u_b(p1_in, q1_in); \
+ /* calculation of hev */ \
+ flat_out = __msa_max_u_b(p1_asub_p0_m, q1_asub_q0_m); \
+ hev_out = (thresh_in < flat_out); \
+ /* calculation of mask */ \
+ p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p0_asub_q0_m); \
+ p1_asub_q1_m = SRAI_B(p1_asub_q1_m, 1); \
+ p0_asub_q0_m = __msa_adds_u_b(p0_asub_q0_m, p1_asub_q1_m); \
+ mask_out = (b_limit_in < p0_asub_q0_m); \
+ mask_out = __msa_max_u_b(flat_out, mask_out); \
+ p3_asub_p2_m = __msa_max_u_b(p3_asub_p2_m, p2_asub_p1_m); \
+ mask_out = __msa_max_u_b(p3_asub_p2_m, mask_out); \
+ q2_asub_q1_m = __msa_max_u_b(q2_asub_q1_m, q3_asub_q2_m); \
+ mask_out = __msa_max_u_b(q2_asub_q1_m, mask_out); \
+ mask_out = (limit_in < mask_out); \
+ mask_out = __msa_xori_b(mask_out, 0xff); \
+} while (0)
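
Read column-by-column in scalar form (and ignoring the byte saturation the vector adds
perform), LPF_MASK_HEV computes the standard VP8 normal-filter gates; a sketch with a
hypothetical helper:

    #include <stdlib.h>

    // Scalar reading of LPF_MASK_HEV for one pixel column (illustrative).
    // 'hev' flags high edge variance, 'mask' selects the columns to filter.
    static void MaskHev(int p3, int p2, int p1, int p0,
                        int q0, int q1, int q2, int q3,
                        int limit, int b_limit, int thresh,
                        int* const hev, int* const mask) {
      const int a_p1_p0 = abs(p1 - p0), a_q1_q0 = abs(q1 - q0);
      int m = abs(p3 - p2);                       // max neighbor difference
      if (abs(p2 - p1) > m) m = abs(p2 - p1);
      if (a_p1_p0 > m) m = a_p1_p0;
      if (a_q1_q0 > m) m = a_q1_q0;
      if (abs(q2 - q1) > m) m = abs(q2 - q1);
      if (abs(q3 - q2) > m) m = abs(q3 - q2);
      *hev = (a_p1_p0 > thresh) || (a_q1_q0 > thresh);
      *mask = (2 * abs(p0 - q0) + (abs(p1 - q1) >> 1) <= b_limit) &&
              (m <= limit);
    }
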
+
+#define ST6x1_UB(in0, in0_idx, in1, in1_idx, pdst, stride) do { \
+ const uint16_t tmp0_h = __msa_copy_s_h((v8i16)in1, in1_idx); \
+ const uint32_t tmp0_w = __msa_copy_s_w((v4i32)in0, in0_idx); \
+ SW(tmp0_w, pdst); \
+ SH(tmp0_h, pdst + stride); \
+} while (0)
+
+#define ST6x4_UB(in0, start_in0_idx, in1, start_in1_idx, pdst, stride) do { \
+ uint8_t* ptmp1 = (uint8_t*)pdst; \
+ ST6x1_UB(in0, start_in0_idx, in1, start_in1_idx, ptmp1, 4); \
+ ptmp1 += stride; \
+ ST6x1_UB(in0, start_in0_idx + 1, in1, start_in1_idx + 1, ptmp1, 4); \
+ ptmp1 += stride; \
+ ST6x1_UB(in0, start_in0_idx + 2, in1, start_in1_idx + 2, ptmp1, 4); \
+ ptmp1 += stride; \
+ ST6x1_UB(in0, start_in0_idx + 3, in1, start_in1_idx + 3, ptmp1, 4); \
+} while (0)
+
+#define LPF_SIMPLE_FILT(p1_in, p0_in, q0_in, q1_in, mask) do { \
+ v16i8 p1_m, p0_m, q0_m, q1_m, filt, filt1, filt2; \
+ const v16i8 cnst4b = __msa_ldi_b(4); \
+ const v16i8 cnst3b = __msa_ldi_b(3); \
+ \
+ FLIP_SIGN4(p1_in, p0_in, q0_in, q1_in, p1_m, p0_m, q0_m, q1_m); \
+ filt = __msa_subs_s_b(p1_m, q1_m); \
+ FILT_VAL(q0_m, p0_m, mask, filt); \
+ filt1 = __msa_adds_s_b(filt, cnst4b); \
+ filt1 = SRAI_B(filt1, 3); \
+ filt2 = __msa_adds_s_b(filt, cnst3b); \
+ filt2 = SRAI_B(filt2, 3); \
+ q0_m = __msa_subs_s_b(q0_m, filt1); \
+ p0_m = __msa_adds_s_b(p0_m, filt2); \
+ q0_in = __msa_xori_b((v16u8)q0_m, 0x80); \
+ p0_in = __msa_xori_b((v16u8)p0_m, 0x80); \
+} while (0)
+
+#define LPF_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask) do { \
+ v16u8 p1_a_sub_q1, p0_a_sub_q0; \
+ \
+ p0_a_sub_q0 = __msa_asub_u_b(p0, q0); \
+ p1_a_sub_q1 = __msa_asub_u_b(p1, q1); \
+ p1_a_sub_q1 = (v16u8)__msa_srli_b((v16i8)p1_a_sub_q1, 1); \
+ p0_a_sub_q0 = __msa_adds_u_b(p0_a_sub_q0, p0_a_sub_q0); \
+ mask = __msa_adds_u_b(p0_a_sub_q0, p1_a_sub_q1); \
+ mask = (mask <= b_limit); \
+} while (0)
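
LPF_SIMPLE_MASK is the vector form of the simple loop-filter threshold: again up to byte
saturation, a column passes when the combined edge difference stays within the boundary
limit. Scalar sketch (illustrative):

    #include <stdlib.h>

    // Scalar form of LPF_SIMPLE_MASK: filter the column only when the
    // weighted p0/q0 and p1/q1 differences are small enough.
    static int SimpleFilterMask(int p1, int p0, int q0, int q1, int b_limit) {
      return (2 * abs(p0 - q0) + (abs(p1 - q1) >> 1)) <= b_limit;
    }
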
+
+static void VFilter16(uint8_t* src, int stride,
+ int b_limit_in, int limit_in, int thresh_in) {
+ uint8_t* ptemp = src - 4 * stride;
+ v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+ v16u8 mask, hev;
+ const v16u8 thresh = (v16u8)__msa_fill_b(thresh_in);
+ const v16u8 limit = (v16u8)__msa_fill_b(limit_in);
+ const v16u8 b_limit = (v16u8)__msa_fill_b(b_limit_in);
+
+ LD_UB8(ptemp, stride, p3, p2, p1, p0, q0, q1, q2, q3);
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+ hev, mask);
+ LPF_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
+ ptemp = src - 3 * stride;
+ ST_UB4(p2, p1, p0, q0, ptemp, stride);
+ ptemp += (4 * stride);
+ ST_UB2(q1, q2, ptemp, stride);
+}
+
+static void HFilter16(uint8_t* src, int stride,
+ int b_limit_in, int limit_in, int thresh_in) {
+ uint8_t* ptmp = src - 4;
+ v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+ v16u8 mask, hev;
+ v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
+ v16u8 row9, row10, row11, row12, row13, row14, row15;
+ v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ const v16u8 b_limit = (v16u8)__msa_fill_b(b_limit_in);
+ const v16u8 limit = (v16u8)__msa_fill_b(limit_in);
+ const v16u8 thresh = (v16u8)__msa_fill_b(thresh_in);
+
+ LD_UB8(ptmp, stride, row0, row1, row2, row3, row4, row5, row6, row7);
+ ptmp += (8 * stride);
+ LD_UB8(ptmp, stride, row8, row9, row10, row11, row12, row13, row14, row15);
+ TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+ row8, row9, row10, row11, row12, row13, row14, row15,
+ p3, p2, p1, p0, q0, q1, q2, q3);
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+ hev, mask);
+ LPF_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
+ ILVR_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
+ ILVRL_H2_SH(tmp1, tmp0, tmp3, tmp4);
+ ILVL_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
+ ILVRL_H2_SH(tmp1, tmp0, tmp6, tmp7);
+ ILVRL_B2_SH(q2, q1, tmp2, tmp5);
+ ptmp = src - 3;
+ ST6x1_UB(tmp3, 0, tmp2, 0, ptmp, 4);
+ ptmp += stride;
+ ST6x1_UB(tmp3, 1, tmp2, 1, ptmp, 4);
+ ptmp += stride;
+ ST6x1_UB(tmp3, 2, tmp2, 2, ptmp, 4);
+ ptmp += stride;
+ ST6x1_UB(tmp3, 3, tmp2, 3, ptmp, 4);
+ ptmp += stride;
+ ST6x1_UB(tmp4, 0, tmp2, 4, ptmp, 4);
+ ptmp += stride;
+ ST6x1_UB(tmp4, 1, tmp2, 5, ptmp, 4);
+ ptmp += stride;
+ ST6x1_UB(tmp4, 2, tmp2, 6, ptmp, 4);
+ ptmp += stride;
+ ST6x1_UB(tmp4, 3, tmp2, 7, ptmp, 4);
+ ptmp += stride;
+ ST6x1_UB(tmp6, 0, tmp5, 0, ptmp, 4);
+ ptmp += stride;
+ ST6x1_UB(tmp6, 1, tmp5, 1, ptmp, 4);
+ ptmp += stride;
+ ST6x1_UB(tmp6, 2, tmp5, 2, ptmp, 4);
+ ptmp += stride;
+ ST6x1_UB(tmp6, 3, tmp5, 3, ptmp, 4);
+ ptmp += stride;
+ ST6x1_UB(tmp7, 0, tmp5, 4, ptmp, 4);
+ ptmp += stride;
+ ST6x1_UB(tmp7, 1, tmp5, 5, ptmp, 4);
+ ptmp += stride;
+ ST6x1_UB(tmp7, 2, tmp5, 6, ptmp, 4);
+ ptmp += stride;
+ ST6x1_UB(tmp7, 3, tmp5, 7, ptmp, 4);
+}
+
+// on three inner edges
+static void VFilterHorEdge16i(uint8_t* src, int stride,
+ int b_limit, int limit, int thresh) {
+ v16u8 mask, hev;
+ v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+ const v16u8 thresh0 = (v16u8)__msa_fill_b(thresh);
+ const v16u8 b_limit0 = (v16u8)__msa_fill_b(b_limit);
+ const v16u8 limit0 = (v16u8)__msa_fill_b(limit);
+
+ LD_UB8((src - 4 * stride), stride, p3, p2, p1, p0, q0, q1, q2, q3);
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
+ hev, mask);
+ LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
+ ST_UB4(p1, p0, q0, q1, (src - 2 * stride), stride);
+}
+
+static void VFilter16i(uint8_t* src_y, int stride,
+ int b_limit, int limit, int thresh) {
+ VFilterHorEdge16i(src_y + 4 * stride, stride, b_limit, limit, thresh);
+ VFilterHorEdge16i(src_y + 8 * stride, stride, b_limit, limit, thresh);
+ VFilterHorEdge16i(src_y + 12 * stride, stride, b_limit, limit, thresh);
+}
+
+static void HFilterVertEdge16i(uint8_t* src, int stride,
+ int b_limit, int limit, int thresh) {
+ v16u8 mask, hev;
+ v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
+ v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
+ v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
+ v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+ const v16u8 thresh0 = (v16u8)__msa_fill_b(thresh);
+ const v16u8 b_limit0 = (v16u8)__msa_fill_b(b_limit);
+ const v16u8 limit0 = (v16u8)__msa_fill_b(limit);
+
+ LD_UB8(src - 4, stride, row0, row1, row2, row3, row4, row5, row6, row7);
+ LD_UB8(src - 4 + (8 * stride), stride,
+ row8, row9, row10, row11, row12, row13, row14, row15);
+ TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+ row8, row9, row10, row11, row12, row13, row14, row15,
+ p3, p2, p1, p0, q0, q1, q2, q3);
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0,
+ hev, mask);
+ LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
+ ILVR_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
+ ILVRL_H2_SH(tmp1, tmp0, tmp2, tmp3);
+ ILVL_B2_SH(p0, p1, q1, q0, tmp0, tmp1);
+ ILVRL_H2_SH(tmp1, tmp0, tmp4, tmp5);
+ src -= 2;
+ ST4x8_UB(tmp2, tmp3, src, stride);
+ src += (8 * stride);
+ ST4x8_UB(tmp4, tmp5, src, stride);
+}
+
+static void HFilter16i(uint8_t* src_y, int stride,
+ int b_limit, int limit, int thresh) {
+ HFilterVertEdge16i(src_y + 4, stride, b_limit, limit, thresh);
+ HFilterVertEdge16i(src_y + 8, stride, b_limit, limit, thresh);
+ HFilterVertEdge16i(src_y + 12, stride, b_limit, limit, thresh);
+}
+
+// 8-pixels wide variants, for chroma filtering
+static void VFilter8(uint8_t* src_u, uint8_t* src_v, int stride,
+ int b_limit_in, int limit_in, int thresh_in) {
+ uint8_t* ptmp_src_u = src_u - 4 * stride;
+ uint8_t* ptmp_src_v = src_v - 4 * stride;
+ uint64_t p2_d, p1_d, p0_d, q0_d, q1_d, q2_d;
+ v16u8 p3, p2, p1, p0, q3, q2, q1, q0, mask, hev;
+ v16u8 p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u;
+ v16u8 p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v;
+ const v16u8 b_limit = (v16u8)__msa_fill_b(b_limit_in);
+ const v16u8 limit = (v16u8)__msa_fill_b(limit_in);
+ const v16u8 thresh = (v16u8)__msa_fill_b(thresh_in);
+
+ LD_UB8(ptmp_src_u, stride, p3_u, p2_u, p1_u, p0_u, q0_u, q1_u, q2_u, q3_u);
+ LD_UB8(ptmp_src_v, stride, p3_v, p2_v, p1_v, p0_v, q0_v, q1_v, q2_v, q3_v);
+ ILVR_D4_UB(p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3, p2, p1, p0);
+ ILVR_D4_UB(q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0, q1, q2, q3);
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+ hev, mask);
+ LPF_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
+ p2_d = __msa_copy_s_d((v2i64)p2, 0);
+ p1_d = __msa_copy_s_d((v2i64)p1, 0);
+ p0_d = __msa_copy_s_d((v2i64)p0, 0);
+ q0_d = __msa_copy_s_d((v2i64)q0, 0);
+ q1_d = __msa_copy_s_d((v2i64)q1, 0);
+ q2_d = __msa_copy_s_d((v2i64)q2, 0);
+ ptmp_src_u += stride;
+ SD4(p2_d, p1_d, p0_d, q0_d, ptmp_src_u, stride);
+ ptmp_src_u += (4 * stride);
+ SD(q1_d, ptmp_src_u);
+ ptmp_src_u += stride;
+ SD(q2_d, ptmp_src_u);
+ p2_d = __msa_copy_s_d((v2i64)p2, 1);
+ p1_d = __msa_copy_s_d((v2i64)p1, 1);
+ p0_d = __msa_copy_s_d((v2i64)p0, 1);
+ q0_d = __msa_copy_s_d((v2i64)q0, 1);
+ q1_d = __msa_copy_s_d((v2i64)q1, 1);
+ q2_d = __msa_copy_s_d((v2i64)q2, 1);
+ ptmp_src_v += stride;
+ SD4(p2_d, p1_d, p0_d, q0_d, ptmp_src_v, stride);
+ ptmp_src_v += (4 * stride);
+ SD(q1_d, ptmp_src_v);
+ ptmp_src_v += stride;
+ SD(q2_d, ptmp_src_v);
+}
+
+static void HFilter8(uint8_t* src_u, uint8_t* src_v, int stride,
+ int b_limit_in, int limit_in, int thresh_in) {
+ uint8_t* ptmp_src_u = src_u - 4;
+ uint8_t* ptmp_src_v = src_v - 4;
+ v16u8 p3, p2, p1, p0, q3, q2, q1, q0, mask, hev;
+ v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
+ v16u8 row9, row10, row11, row12, row13, row14, row15;
+ v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ const v16u8 b_limit = (v16u8)__msa_fill_b(b_limit_in);
+ const v16u8 limit = (v16u8)__msa_fill_b(limit_in);
+ const v16u8 thresh = (v16u8)__msa_fill_b(thresh_in);
+
+ LD_UB8(ptmp_src_u, stride, row0, row1, row2, row3, row4, row5, row6, row7);
+ LD_UB8(ptmp_src_v, stride,
+ row8, row9, row10, row11, row12, row13, row14, row15);
+ TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+ row8, row9, row10, row11, row12, row13, row14, row15,
+ p3, p2, p1, p0, q0, q1, q2, q3);
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+ hev, mask);
+ LPF_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
+ ILVR_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
+ ILVRL_H2_SH(tmp1, tmp0, tmp3, tmp4);
+ ILVL_B2_SH(p1, p2, q0, p0, tmp0, tmp1);
+ ILVRL_H2_SH(tmp1, tmp0, tmp6, tmp7);
+ ILVRL_B2_SH(q2, q1, tmp2, tmp5);
+ ptmp_src_u += 1;
+ ST6x4_UB(tmp3, 0, tmp2, 0, ptmp_src_u, stride);
+ ptmp_src_u += 4 * stride;
+ ST6x4_UB(tmp4, 0, tmp2, 4, ptmp_src_u, stride);
+ ptmp_src_v += 1;
+ ST6x4_UB(tmp6, 0, tmp5, 0, ptmp_src_v, stride);
+ ptmp_src_v += 4 * stride;
+ ST6x4_UB(tmp7, 0, tmp5, 4, ptmp_src_v, stride);
+}
+
+static void VFilter8i(uint8_t* src_u, uint8_t* src_v, int stride,
+ int b_limit_in, int limit_in, int thresh_in) {
+ uint64_t p1_d, p0_d, q0_d, q1_d;
+ v16u8 p3, p2, p1, p0, q3, q2, q1, q0, mask, hev;
+ v16u8 p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u;
+ v16u8 p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v;
+ const v16u8 thresh = (v16u8)__msa_fill_b(thresh_in);
+ const v16u8 limit = (v16u8)__msa_fill_b(limit_in);
+ const v16u8 b_limit = (v16u8)__msa_fill_b(b_limit_in);
+
+ LD_UB8(src_u, stride, p3_u, p2_u, p1_u, p0_u, q0_u, q1_u, q2_u, q3_u);
+ src_u += (5 * stride);
+ LD_UB8(src_v, stride, p3_v, p2_v, p1_v, p0_v, q0_v, q1_v, q2_v, q3_v);
+ src_v += (5 * stride);
+ ILVR_D4_UB(p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3, p2, p1, p0);
+ ILVR_D4_UB(q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0, q1, q2, q3);
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+ hev, mask);
+ LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
+ p1_d = __msa_copy_s_d((v2i64)p1, 0);
+ p0_d = __msa_copy_s_d((v2i64)p0, 0);
+ q0_d = __msa_copy_s_d((v2i64)q0, 0);
+ q1_d = __msa_copy_s_d((v2i64)q1, 0);
+ SD4(q1_d, q0_d, p0_d, p1_d, src_u, -stride);
+ p1_d = __msa_copy_s_d((v2i64)p1, 1);
+ p0_d = __msa_copy_s_d((v2i64)p0, 1);
+ q0_d = __msa_copy_s_d((v2i64)q0, 1);
+ q1_d = __msa_copy_s_d((v2i64)q1, 1);
+ SD4(q1_d, q0_d, p0_d, p1_d, src_v, -stride);
+}
+
+static void HFilter8i(uint8_t* src_u, uint8_t* src_v, int stride,
+ int b_limit_in, int limit_in, int thresh_in) {
+ v16u8 p3, p2, p1, p0, q3, q2, q1, q0, mask, hev;
+ v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
+ v16u8 row9, row10, row11, row12, row13, row14, row15;
+ v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+ const v16u8 thresh = (v16u8)__msa_fill_b(thresh_in);
+ const v16u8 limit = (v16u8)__msa_fill_b(limit_in);
+ const v16u8 b_limit = (v16u8)__msa_fill_b(b_limit_in);
+
+ LD_UB8(src_u, stride, row0, row1, row2, row3, row4, row5, row6, row7);
+ LD_UB8(src_v, stride,
+ row8, row9, row10, row11, row12, row13, row14, row15);
+ TRANSPOSE16x8_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+ row8, row9, row10, row11, row12, row13, row14, row15,
+ p3, p2, p1, p0, q0, q1, q2, q3);
+ LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh,
+ hev, mask);
+ LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
+ ILVR_B2_SW(p0, p1, q1, q0, tmp0, tmp1);
+ ILVRL_H2_SW(tmp1, tmp0, tmp2, tmp3);
+ ILVL_B2_SW(p0, p1, q1, q0, tmp0, tmp1);
+ ILVRL_H2_SW(tmp1, tmp0, tmp4, tmp5);
+ src_u += 2;
+ ST4x4_UB(tmp2, tmp2, 0, 1, 2, 3, src_u, stride);
+ src_u += 4 * stride;
+ ST4x4_UB(tmp3, tmp3, 0, 1, 2, 3, src_u, stride);
+ src_v += 2;
+ ST4x4_UB(tmp4, tmp4, 0, 1, 2, 3, src_v, stride);
+ src_v += 4 * stride;
+ ST4x4_UB(tmp5, tmp5, 0, 1, 2, 3, src_v, stride);
+}
+
+static void SimpleVFilter16(uint8_t* src, int stride, int b_limit_in) {
+ v16u8 p1, p0, q1, q0, mask;
+ const v16u8 b_limit = (v16u8)__msa_fill_b(b_limit_in);
+
+ LD_UB4(src - 2 * stride, stride, p1, p0, q0, q1);
+ LPF_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask);
+ LPF_SIMPLE_FILT(p1, p0, q0, q1, mask);
+ ST_UB2(p0, q0, src - stride, stride);
+}
+
+static void SimpleHFilter16(uint8_t* src, int stride, int b_limit_in) {
+ v16u8 p1, p0, q1, q0, mask, row0, row1, row2, row3, row4, row5, row6, row7;
+ v16u8 row8, row9, row10, row11, row12, row13, row14, row15;
+ v8i16 tmp0, tmp1;
+ const v16u8 b_limit = (v16u8)__msa_fill_b(b_limit_in);
+ uint8_t* ptemp_src = src - 2;
+
+ LD_UB8(ptemp_src, stride, row0, row1, row2, row3, row4, row5, row6, row7);
+ LD_UB8(ptemp_src + 8 * stride, stride,
+ row8, row9, row10, row11, row12, row13, row14, row15);
+ TRANSPOSE16x4_UB_UB(row0, row1, row2, row3, row4, row5, row6, row7,
+ row8, row9, row10, row11, row12, row13, row14, row15,
+ p1, p0, q0, q1);
+ LPF_SIMPLE_MASK(p1, p0, q0, q1, b_limit, mask);
+ LPF_SIMPLE_FILT(p1, p0, q0, q1, mask);
+ ILVRL_B2_SH(q0, p0, tmp1, tmp0);
+ ptemp_src += 1;
+ ST2x4_UB(tmp1, 0, ptemp_src, stride);
+ ptemp_src += 4 * stride;
+ ST2x4_UB(tmp1, 4, ptemp_src, stride);
+ ptemp_src += 4 * stride;
+ ST2x4_UB(tmp0, 0, ptemp_src, stride);
+ ptemp_src += 4 * stride;
+ ST2x4_UB(tmp0, 4, ptemp_src, stride);
+ ptemp_src += 4 * stride;
+}
+
+static void SimpleVFilter16i(uint8_t* src_y, int stride, int b_limit_in) {
+ SimpleVFilter16(src_y + 4 * stride, stride, b_limit_in);
+ SimpleVFilter16(src_y + 8 * stride, stride, b_limit_in);
+ SimpleVFilter16(src_y + 12 * stride, stride, b_limit_in);
+}
+
+static void SimpleHFilter16i(uint8_t* src_y, int stride, int b_limit_in) {
+ SimpleHFilter16(src_y + 4, stride, b_limit_in);
+ SimpleHFilter16(src_y + 8, stride, b_limit_in);
+ SimpleHFilter16(src_y + 12, stride, b_limit_in);
+}
+
+//------------------------------------------------------------------------------
+// Intra predictions
+//------------------------------------------------------------------------------
+
+// 4x4
+
+static void DC4(uint8_t* dst) { // DC
+ uint32_t dc = 4;
+ int i;
+ for (i = 0; i < 4; ++i) dc += dst[i - BPS] + dst[-1 + i * BPS];
+ dc >>= 3;
+ dc = dc | (dc << 8) | (dc << 16) | (dc << 24);
+ SW4(dc, dc, dc, dc, dst, BPS);
+}
+
+static void TM4(uint8_t* dst) {
+ const uint8_t* const ptemp = dst - BPS - 1;
+ v8i16 T, d, r0, r1, r2, r3;
+ const v16i8 zero = { 0 };
+ const v8i16 TL = (v8i16)__msa_fill_h(ptemp[0 * BPS]);
+ const v8i16 L0 = (v8i16)__msa_fill_h(ptemp[1 * BPS]);
+ const v8i16 L1 = (v8i16)__msa_fill_h(ptemp[2 * BPS]);
+ const v8i16 L2 = (v8i16)__msa_fill_h(ptemp[3 * BPS]);
+ const v8i16 L3 = (v8i16)__msa_fill_h(ptemp[4 * BPS]);
+ const v16u8 T1 = LD_UB(ptemp + 1);
+
+ T = (v8i16)__msa_ilvr_b(zero, (v16i8)T1);
+ d = T - TL;
+ ADD4(d, L0, d, L1, d, L2, d, L3, r0, r1, r2, r3);
+ CLIP_SH4_0_255(r0, r1, r2, r3);
+ PCKEV_ST4x4_UB(r0, r1, r2, r3, dst, BPS);
+}
+
+static void VE4(uint8_t* dst) { // vertical
+ const uint8_t* const ptop = dst - BPS - 1;
+ const uint32_t val0 = LW(ptop + 0);
+ const uint32_t val1 = LW(ptop + 4);
+ uint32_t out;
+ v16u8 A, B, C, AC, B2, R;
+
+ INSERT_W2_UB(val0, val1, A);
+ B = SLDI_UB(A, A, 1);
+ C = SLDI_UB(A, A, 2);
+ AC = __msa_ave_u_b(A, C);
+ B2 = __msa_ave_u_b(B, B);
+ R = __msa_aver_u_b(AC, B2);
+ out = __msa_copy_s_w((v4i32)R, 0);
+ SW4(out, out, out, out, dst, BPS);
+}
+
+static void RD4(uint8_t* dst) { // Down-right
+ const uint8_t* const ptop = dst - 1 - BPS;
+ uint32_t val0 = LW(ptop + 0);
+ uint32_t val1 = LW(ptop + 4);
+ uint32_t val2, val3;
+ v16u8 A, B, C, AC, B2, R, A1;
+
+ INSERT_W2_UB(val0, val1, A1);
+ A = SLDI_UB(A1, A1, 12);
+ A = (v16u8)__msa_insert_b((v16i8)A, 3, ptop[1 * BPS]);
+ A = (v16u8)__msa_insert_b((v16i8)A, 2, ptop[2 * BPS]);
+ A = (v16u8)__msa_insert_b((v16i8)A, 1, ptop[3 * BPS]);
+ A = (v16u8)__msa_insert_b((v16i8)A, 0, ptop[4 * BPS]);
+ B = SLDI_UB(A, A, 1);
+ C = SLDI_UB(A, A, 2);
+ AC = __msa_ave_u_b(A, C);
+ B2 = __msa_ave_u_b(B, B);
+ R = __msa_aver_u_b(AC, B2);
+ val3 = __msa_copy_s_w((v4i32)R, 0);
+ R = SLDI_UB(R, R, 1);
+ val2 = __msa_copy_s_w((v4i32)R, 0);
+ R = SLDI_UB(R, R, 1);
+ val1 = __msa_copy_s_w((v4i32)R, 0);
+ R = SLDI_UB(R, R, 1);
+ val0 = __msa_copy_s_w((v4i32)R, 0);
+ SW4(val0, val1, val2, val3, dst, BPS);
+}
+
+static void LD4(uint8_t* dst) { // Down-Left
+ const uint8_t* const ptop = dst - BPS;
+ uint32_t val0 = LW(ptop + 0);
+ uint32_t val1 = LW(ptop + 4);
+ uint32_t val2, val3;
+ v16u8 A, B, C, AC, B2, R;
+
+ INSERT_W2_UB(val0, val1, A);
+ B = SLDI_UB(A, A, 1);
+ C = SLDI_UB(A, A, 2);
+ C = (v16u8)__msa_insert_b((v16i8)C, 6, ptop[7]);
+ AC = __msa_ave_u_b(A, C);
+ B2 = __msa_ave_u_b(B, B);
+ R = __msa_aver_u_b(AC, B2);
+ val0 = __msa_copy_s_w((v4i32)R, 0);
+ R = SLDI_UB(R, R, 1);
+ val1 = __msa_copy_s_w((v4i32)R, 0);
+ R = SLDI_UB(R, R, 1);
+ val2 = __msa_copy_s_w((v4i32)R, 0);
+ R = SLDI_UB(R, R, 1);
+ val3 = __msa_copy_s_w((v4i32)R, 0);
+ SW4(val0, val1, val2, val3, dst, BPS);
+}
+
+// 16x16
+
+static void DC16(uint8_t* dst) { // DC
+ uint32_t dc = 16;
+ int i;
+ const v16u8 rtop = LD_UB(dst - BPS);
+ const v8u16 dctop = __msa_hadd_u_h(rtop, rtop);
+ v16u8 out;
+
+ for (i = 0; i < 16; ++i) {
+ dc += dst[-1 + i * BPS];
+ }
+ dc += HADD_UH_U32(dctop);
+ out = (v16u8)__msa_fill_b(dc >> 5);
+ ST_UB8(out, out, out, out, out, out, out, out, dst, BPS);
+ ST_UB8(out, out, out, out, out, out, out, out, dst + 8 * BPS, BPS);
+}
+
+static void TM16(uint8_t* dst) {
+ int j;
+ v8i16 d1, d2;
+ const v16i8 zero = { 0 };
+ const v8i16 TL = (v8i16)__msa_fill_h(dst[-1 - BPS]);
+ const v16i8 T = LD_SB(dst - BPS);
+
+ ILVRL_B2_SH(zero, T, d1, d2);
+ SUB2(d1, TL, d2, TL, d1, d2);
+ for (j = 0; j < 16; j += 4) {
+ v16i8 t0, t1, t2, t3;
+ v8i16 r0, r1, r2, r3, r4, r5, r6, r7;
+ const v8i16 L0 = (v8i16)__msa_fill_h(dst[-1 + 0 * BPS]);
+ const v8i16 L1 = (v8i16)__msa_fill_h(dst[-1 + 1 * BPS]);
+ const v8i16 L2 = (v8i16)__msa_fill_h(dst[-1 + 2 * BPS]);
+ const v8i16 L3 = (v8i16)__msa_fill_h(dst[-1 + 3 * BPS]);
+ ADD4(d1, L0, d1, L1, d1, L2, d1, L3, r0, r1, r2, r3);
+ ADD4(d2, L0, d2, L1, d2, L2, d2, L3, r4, r5, r6, r7);
+ CLIP_SH4_0_255(r0, r1, r2, r3);
+ CLIP_SH4_0_255(r4, r5, r6, r7);
+ PCKEV_B4_SB(r4, r0, r5, r1, r6, r2, r7, r3, t0, t1, t2, t3);
+ ST_SB4(t0, t1, t2, t3, dst, BPS);
+ dst += 4 * BPS;
+ }
+}
+
+static void VE16(uint8_t* dst) { // vertical
+ const v16u8 rtop = LD_UB(dst - BPS);
+ ST_UB8(rtop, rtop, rtop, rtop, rtop, rtop, rtop, rtop, dst, BPS);
+ ST_UB8(rtop, rtop, rtop, rtop, rtop, rtop, rtop, rtop, dst + 8 * BPS, BPS);
+}
+
+static void HE16(uint8_t* dst) { // horizontal
+ int j;
+ for (j = 16; j > 0; j -= 4) {
+ const v16u8 L0 = (v16u8)__msa_fill_b(dst[-1 + 0 * BPS]);
+ const v16u8 L1 = (v16u8)__msa_fill_b(dst[-1 + 1 * BPS]);
+ const v16u8 L2 = (v16u8)__msa_fill_b(dst[-1 + 2 * BPS]);
+ const v16u8 L3 = (v16u8)__msa_fill_b(dst[-1 + 3 * BPS]);
+ ST_UB4(L0, L1, L2, L3, dst, BPS);
+ dst += 4 * BPS;
+ }
+}
+
+static void DC16NoTop(uint8_t* dst) { // DC with top samples not available
+ int j;
+ uint32_t dc = 8;
+ v16u8 out;
+
+ for (j = 0; j < 16; ++j) {
+ dc += dst[-1 + j * BPS];
+ }
+ out = (v16u8)__msa_fill_b(dc >> 4);
+ ST_UB8(out, out, out, out, out, out, out, out, dst, BPS);
+ ST_UB8(out, out, out, out, out, out, out, out, dst + 8 * BPS, BPS);
+}
+
+static void DC16NoLeft(uint8_t* dst) { // DC with left samples not available
+ uint32_t dc = 8;
+ const v16u8 rtop = LD_UB(dst - BPS);
+ const v8u16 dctop = __msa_hadd_u_h(rtop, rtop);
+ v16u8 out;
+
+ dc += HADD_UH_U32(dctop);
+ out = (v16u8)__msa_fill_b(dc >> 4);
+ ST_UB8(out, out, out, out, out, out, out, out, dst, BPS);
+ ST_UB8(out, out, out, out, out, out, out, out, dst + 8 * BPS, BPS);
+}
+
+static void DC16NoTopLeft(uint8_t* dst) { // DC with nothing
+ const v16u8 out = (v16u8)__msa_fill_b(0x80);
+ ST_UB8(out, out, out, out, out, out, out, out, dst, BPS);
+ ST_UB8(out, out, out, out, out, out, out, out, dst + 8 * BPS, BPS);
+}
+
+// Chroma
+
+#define STORE8x8(out, dst) do { \
+ SD4(out, out, out, out, dst + 0 * BPS, BPS); \
+ SD4(out, out, out, out, dst + 4 * BPS, BPS); \
+} while (0)
+
+static void DC8uv(uint8_t* dst) { // DC
+ uint32_t dc = 8;
+ int i;
+ uint64_t out;
+ const v16u8 rtop = LD_UB(dst - BPS);
+ const v8u16 temp0 = __msa_hadd_u_h(rtop, rtop);
+ const v4u32 temp1 = __msa_hadd_u_w(temp0, temp0);
+ const v2u64 temp2 = __msa_hadd_u_d(temp1, temp1);
+ v16u8 dctemp;
+
+ for (i = 0; i < 8; ++i) {
+ dc += dst[-1 + i * BPS];
+ }
+ dc += __msa_copy_s_w((v4i32)temp2, 0);
+ dctemp = (v16u8)__msa_fill_b(dc >> 4);
+ out = __msa_copy_s_d((v2i64)dctemp, 0);
+ STORE8x8(out, dst);
+}
+
+static void TM8uv(uint8_t* dst) {
+ int j;
+ const v16i8 T1 = LD_SB(dst - BPS);
+ const v16i8 zero = { 0 };
+ const v8i16 T = (v8i16)__msa_ilvr_b(zero, T1);
+ const v8i16 TL = (v8i16)__msa_fill_h(dst[-1 - BPS]);
+ const v8i16 d = T - TL;
+
+ for (j = 0; j < 8; j += 4) {
+ v16i8 t0, t1;
+ v8i16 r0 = (v8i16)__msa_fill_h(dst[-1 + 0 * BPS]);
+ v8i16 r1 = (v8i16)__msa_fill_h(dst[-1 + 1 * BPS]);
+ v8i16 r2 = (v8i16)__msa_fill_h(dst[-1 + 2 * BPS]);
+ v8i16 r3 = (v8i16)__msa_fill_h(dst[-1 + 3 * BPS]);
+ ADD4(d, r0, d, r1, d, r2, d, r3, r0, r1, r2, r3);
+ CLIP_SH4_0_255(r0, r1, r2, r3);
+ PCKEV_B2_SB(r1, r0, r3, r2, t0, t1);
+ ST4x4_UB(t0, t1, 0, 2, 0, 2, dst, BPS);
+ ST4x4_UB(t0, t1, 1, 3, 1, 3, dst + 4, BPS);
+ dst += 4 * BPS;
+ }
+}
+
+static void VE8uv(uint8_t* dst) { // vertical
+ const v16u8 rtop = LD_UB(dst - BPS);
+ const uint64_t out = __msa_copy_s_d((v2i64)rtop, 0);
+ STORE8x8(out, dst);
+}
+
+static void HE8uv(uint8_t* dst) { // horizontal
+ int j;
+ for (j = 0; j < 8; j += 4) {
+ const v16u8 L0 = (v16u8)__msa_fill_b(dst[-1 + 0 * BPS]);
+ const v16u8 L1 = (v16u8)__msa_fill_b(dst[-1 + 1 * BPS]);
+ const v16u8 L2 = (v16u8)__msa_fill_b(dst[-1 + 2 * BPS]);
+ const v16u8 L3 = (v16u8)__msa_fill_b(dst[-1 + 3 * BPS]);
+ const uint64_t out0 = __msa_copy_s_d((v2i64)L0, 0);
+ const uint64_t out1 = __msa_copy_s_d((v2i64)L1, 0);
+ const uint64_t out2 = __msa_copy_s_d((v2i64)L2, 0);
+ const uint64_t out3 = __msa_copy_s_d((v2i64)L3, 0);
+ SD4(out0, out1, out2, out3, dst, BPS);
+ dst += 4 * BPS;
+ }
+}
+
+static void DC8uvNoLeft(uint8_t* dst) { // DC with no left samples
+ const uint32_t dc = 4;
+ const v16u8 rtop = LD_UB(dst - BPS);
+ const v8u16 temp0 = __msa_hadd_u_h(rtop, rtop);
+ const v4u32 temp1 = __msa_hadd_u_w(temp0, temp0);
+ const v2u64 temp2 = __msa_hadd_u_d(temp1, temp1);
+ const uint32_t sum_m = __msa_copy_s_w((v4i32)temp2, 0);
+ const v16u8 dcval = (v16u8)__msa_fill_b((dc + sum_m) >> 3);
+ const uint64_t out = __msa_copy_s_d((v2i64)dcval, 0);
+ STORE8x8(out, dst);
+}
+
+static void DC8uvNoTop(uint8_t* dst) { // DC with no top samples
+ uint32_t dc = 4;
+ int i;
+ uint64_t out;
+ v16u8 dctemp;
+
+ for (i = 0; i < 8; ++i) {
+ dc += dst[-1 + i * BPS];
+ }
+ dctemp = (v16u8)__msa_fill_b(dc >> 3);
+ out = __msa_copy_s_d((v2i64)dctemp, 0);
+ STORE8x8(out, dst);
+}
+
+static void DC8uvNoTopLeft(uint8_t* dst) { // DC with nothing
+ const uint64_t out = 0x8080808080808080ULL;
+ STORE8x8(out, dst);
+}
+
+//------------------------------------------------------------------------------
// Entry point
extern void VP8DspInitMSA(void);
@@ -163,6 +977,39 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8DspInitMSA(void) {
VP8Transform = TransformTwo;
VP8TransformDC = TransformDC;
VP8TransformAC3 = TransformAC3;
+
+ VP8VFilter16 = VFilter16;
+ VP8HFilter16 = HFilter16;
+ VP8VFilter16i = VFilter16i;
+ VP8HFilter16i = HFilter16i;
+ VP8VFilter8 = VFilter8;
+ VP8HFilter8 = HFilter8;
+ VP8VFilter8i = VFilter8i;
+ VP8HFilter8i = HFilter8i;
+ VP8SimpleVFilter16 = SimpleVFilter16;
+ VP8SimpleHFilter16 = SimpleHFilter16;
+ VP8SimpleVFilter16i = SimpleVFilter16i;
+ VP8SimpleHFilter16i = SimpleHFilter16i;
+
+ VP8PredLuma4[0] = DC4;
+ VP8PredLuma4[1] = TM4;
+ VP8PredLuma4[2] = VE4;
+ VP8PredLuma4[4] = RD4;
+ VP8PredLuma4[6] = LD4;
+ VP8PredLuma16[0] = DC16;
+ VP8PredLuma16[1] = TM16;
+ VP8PredLuma16[2] = VE16;
+ VP8PredLuma16[3] = HE16;
+ VP8PredLuma16[4] = DC16NoTop;
+ VP8PredLuma16[5] = DC16NoLeft;
+ VP8PredLuma16[6] = DC16NoTopLeft;
+ VP8PredChroma8[0] = DC8uv;
+ VP8PredChroma8[1] = TM8uv;
+ VP8PredChroma8[2] = VE8uv;
+ VP8PredChroma8[3] = HE8uv;
+ VP8PredChroma8[4] = DC8uvNoTop;
+ VP8PredChroma8[5] = DC8uvNoLeft;
+ VP8PredChroma8[6] = DC8uvNoTopLeft;
}
#else // !WEBP_USE_MSA
diff --git a/src/3rdparty/libwebp/src/dsp/dec_neon.c b/src/3rdparty/libwebp/src/dsp/dec_neon.c
index a63f43f..34796cf 100644
--- a/src/3rdparty/libwebp/src/dsp/dec_neon.c
+++ b/src/3rdparty/libwebp/src/dsp/dec_neon.c
@@ -17,7 +17,7 @@
#if defined(WEBP_USE_NEON)
#include "./neon.h"
-#include "../dec/vp8i.h"
+#include "../dec/vp8i_dec.h"
//------------------------------------------------------------------------------
// NxM Loading functions
@@ -666,9 +666,8 @@ static uint8x16_t NeedsHev(const uint8x16_t p1, const uint8x16_t p0,
const uint8x16_t hev_thresh_v = vdupq_n_u8((uint8_t)hev_thresh);
const uint8x16_t a_p1_p0 = vabdq_u8(p1, p0); // abs(p1 - p0)
const uint8x16_t a_q1_q0 = vabdq_u8(q1, q0); // abs(q1 - q0)
- const uint8x16_t mask1 = vcgtq_u8(a_p1_p0, hev_thresh_v);
- const uint8x16_t mask2 = vcgtq_u8(a_q1_q0, hev_thresh_v);
- const uint8x16_t mask = vorrq_u8(mask1, mask2);
+ const uint8x16_t a_max = vmaxq_u8(a_p1_p0, a_q1_q0);
+ const uint8x16_t mask = vcgtq_u8(a_max, hev_thresh_v);
return mask;
}
@@ -756,24 +755,25 @@ static void ApplyFilter6(
const int8x16_t delta,
uint8x16_t* const op2, uint8x16_t* const op1, uint8x16_t* const op0,
uint8x16_t* const oq0, uint8x16_t* const oq1, uint8x16_t* const oq2) {
- const int16x8_t kCst63 = vdupq_n_s16(63);
- const int8x8_t kCst27 = vdup_n_s8(27);
- const int8x8_t kCst18 = vdup_n_s8(18);
- const int8x8_t kCst9 = vdup_n_s8(9);
+  // We have to compute: X = (9 * a + 63) >> 7, Y = (18 * a + 63) >> 7,
+  // Z = (27 * a + 63) >> 7.
+  // It turns out there's a common sub-expression S = 9 * a - 1 that can be
+  // used with the special vqrshrn_n_s16 rounding-shift-and-narrow instruction:
+  // X = (S + 64) >> 7, Y = (S + 32) >> 6, Z = (18 * a + S + 64) >> 7
const int8x8_t delta_lo = vget_low_s8(delta);
const int8x8_t delta_hi = vget_high_s8(delta);
- const int16x8_t s1_lo = vmlal_s8(kCst63, kCst27, delta_lo); // 63 + 27 * a
- const int16x8_t s1_hi = vmlal_s8(kCst63, kCst27, delta_hi); // 63 + 27 * a
- const int16x8_t s2_lo = vmlal_s8(kCst63, kCst18, delta_lo); // 63 + 18 * a
- const int16x8_t s2_hi = vmlal_s8(kCst63, kCst18, delta_hi); // 63 + 18 * a
- const int16x8_t s3_lo = vmlal_s8(kCst63, kCst9, delta_lo); // 63 + 9 * a
- const int16x8_t s3_hi = vmlal_s8(kCst63, kCst9, delta_hi); // 63 + 9 * a
- const int8x8_t a1_lo = vqshrn_n_s16(s1_lo, 7);
- const int8x8_t a1_hi = vqshrn_n_s16(s1_hi, 7);
- const int8x8_t a2_lo = vqshrn_n_s16(s2_lo, 7);
- const int8x8_t a2_hi = vqshrn_n_s16(s2_hi, 7);
- const int8x8_t a3_lo = vqshrn_n_s16(s3_lo, 7);
- const int8x8_t a3_hi = vqshrn_n_s16(s3_hi, 7);
+ const int8x8_t kCst9 = vdup_n_s8(9);
+ const int16x8_t kCstm1 = vdupq_n_s16(-1);
+ const int8x8_t kCst18 = vdup_n_s8(18);
+ const int16x8_t S_lo = vmlal_s8(kCstm1, kCst9, delta_lo); // S = 9 * a - 1
+ const int16x8_t S_hi = vmlal_s8(kCstm1, kCst9, delta_hi);
+ const int16x8_t Z_lo = vmlal_s8(S_lo, kCst18, delta_lo); // S + 18 * a
+ const int16x8_t Z_hi = vmlal_s8(S_hi, kCst18, delta_hi);
+ const int8x8_t a3_lo = vqrshrn_n_s16(S_lo, 7); // (9 * a + 63) >> 7
+ const int8x8_t a3_hi = vqrshrn_n_s16(S_hi, 7);
+ const int8x8_t a2_lo = vqrshrn_n_s16(S_lo, 6); // (9 * a + 31) >> 6
+ const int8x8_t a2_hi = vqrshrn_n_s16(S_hi, 6);
+ const int8x8_t a1_lo = vqrshrn_n_s16(Z_lo, 7); // (27 * a + 63) >> 7
+ const int8x8_t a1_hi = vqrshrn_n_s16(Z_hi, 7);
const int8x16_t a1 = vcombine_s8(a1_lo, a1_hi);
const int8x16_t a2 = vcombine_s8(a2_lo, a2_hi);
const int8x16_t a3 = vcombine_s8(a3_lo, a3_hi);
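
The rewrite relies on vqrshrn_n_s16(x, n) computing the rounded shift
(x + (1 << (n - 1))) >> n while narrowing, so with S = 9 * a - 1 both remaining
multiplies are shared. A check of the three identities (illustrative; it assumes
arithmetic right shift for negative operands and ignores the final int8 saturation,
which old and new code share):

    #include <assert.h>

    static void CheckFilter6Identities(void) {
      int a;
      for (a = -128; a < 128; ++a) {
        const int S = 9 * a - 1;
        assert(((S + 64) >> 7) == ((9 * a + 63) >> 7));            // X
        // Y: 2 * (9a + 31) = 18a + 62, and 18a + 63 is odd, hence never a
        // multiple of 128, so the two floors agree.
        assert(((S + 32) >> 6) == ((18 * a + 63) >> 7));
        assert(((S + 18 * a + 64) >> 7) == ((27 * a + 63) >> 7));  // Z
      }
    }
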
diff --git a/src/3rdparty/libwebp/src/dsp/dec_sse2.c b/src/3rdparty/libwebp/src/dsp/dec_sse2.c
index f0a8ddc..411fb02 100644
--- a/src/3rdparty/libwebp/src/dsp/dec_sse2.c
+++ b/src/3rdparty/libwebp/src/dsp/dec_sse2.c
@@ -22,7 +22,7 @@
#include <emmintrin.h>
#include "./common_sse2.h"
-#include "../dec/vp8i.h"
+#include "../dec/vp8i_dec.h"
#include "../utils/utils.h"
//------------------------------------------------------------------------------
@@ -140,7 +140,7 @@ static void Transform(const int16_t* in, uint8_t* dst, int do_two) {
// Transpose the two 4x4.
VP8Transpose_2_4x4_16b(&shifted0, &shifted1, &shifted2, &shifted3, &T0, &T1,
- &T2, &T3);
+ &T2, &T3);
}
// Add inverse transform to 'dst' and store.
diff --git a/src/3rdparty/libwebp/src/dsp/dec_sse41.c b/src/3rdparty/libwebp/src/dsp/dec_sse41.c
index 8d6aed1..4e81ec4 100644
--- a/src/3rdparty/libwebp/src/dsp/dec_sse41.c
+++ b/src/3rdparty/libwebp/src/dsp/dec_sse41.c
@@ -16,7 +16,7 @@
#if defined(WEBP_USE_SSE41)
#include <smmintrin.h>
-#include "../dec/vp8i.h"
+#include "../dec/vp8i_dec.h"
#include "../utils/utils.h"
static void HE16(uint8_t* dst) { // horizontal
diff --git a/src/3rdparty/libwebp/src/dsp/dsp.h b/src/3rdparty/libwebp/src/dsp/dsp.h
index 1faac27..813fed4 100644
--- a/src/3rdparty/libwebp/src/dsp/dsp.h
+++ b/src/3rdparty/libwebp/src/dsp/dsp.h
@@ -111,8 +111,7 @@ extern "C" {
#define WEBP_UBSAN_IGNORE_UNDEF
#define WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW
-#if !defined(WEBP_FORCE_ALIGNED) && defined(__clang__) && \
- defined(__has_attribute)
+#if defined(__clang__) && defined(__has_attribute)
#if __has_attribute(no_sanitize)
// This macro prevents the undefined behavior sanitizer from reporting
// failures. This is only meant to silence unaligned loads on platforms that
@@ -133,6 +132,7 @@ extern "C" {
typedef enum {
kSSE2,
kSSE3,
+ kSlowSSSE3, // special feature for slow SSSE3 architectures
kSSE4_1,
kAVX,
kAVX2,
@@ -185,6 +185,11 @@ typedef int (*VP8WMetric)(const uint8_t* pix, const uint8_t* ref,
// 4 by 4 symmetric matrix.
extern VP8WMetric VP8TDisto4x4, VP8TDisto16x16;
+// Compute the sums (for the DC average) of four 4x4 blocks.
+// The sum of the i-th sub-4x4 block is stored in dc[i].
+typedef void (*VP8MeanMetric)(const uint8_t* ref, uint32_t dc[4]);
+extern VP8MeanMetric VP8Mean16x4;
+
typedef void (*VP8BlockCopy)(const uint8_t* src, uint8_t* dst);
extern VP8BlockCopy VP8Copy4x4;
extern VP8BlockCopy VP8Copy16x8;
@@ -246,30 +251,37 @@ extern VP8GetResidualCostFunc VP8GetResidualCost;
void VP8EncDspCostInit(void);
//------------------------------------------------------------------------------
-// SSIM utils
+// SSIM / PSNR utils
// struct for accumulating statistical moments
typedef struct {
- double w; // sum(w_i) : sum of weights
- double xm, ym; // sum(w_i * x_i), sum(w_i * y_i)
- double xxm, xym, yym; // sum(w_i * x_i * x_i), etc.
+ uint32_t w; // sum(w_i) : sum of weights
+ uint32_t xm, ym; // sum(w_i * x_i), sum(w_i * y_i)
+ uint32_t xxm, xym, yym; // sum(w_i * x_i * x_i), etc.
} VP8DistoStats;
+// Compute the final SSIM value
+// The non-clipped version assumes stats->w = (2 * VP8_SSIM_KERNEL + 1)^2.
+double VP8SSIMFromStats(const VP8DistoStats* const stats);
+double VP8SSIMFromStatsClipped(const VP8DistoStats* const stats);
+
#define VP8_SSIM_KERNEL 3 // total size of the kernel: 2 * VP8_SSIM_KERNEL + 1
-typedef void (*VP8SSIMAccumulateClippedFunc)(const uint8_t* src1, int stride1,
- const uint8_t* src2, int stride2,
- int xo, int yo, // center position
- int W, int H, // plane dimension
- VP8DistoStats* const stats);
+typedef double (*VP8SSIMGetClippedFunc)(const uint8_t* src1, int stride1,
+ const uint8_t* src2, int stride2,
+ int xo, int yo, // center position
+ int W, int H); // plane dimension
// This version is called with the guarantee that you can load 8 bytes and
// 8 rows starting at src1 and src2.
-typedef void (*VP8SSIMAccumulateFunc)(const uint8_t* src1, int stride1,
- const uint8_t* src2, int stride2,
- VP8DistoStats* const stats);
+typedef double (*VP8SSIMGetFunc)(const uint8_t* src1, int stride1,
+ const uint8_t* src2, int stride2);
+
+extern VP8SSIMGetFunc VP8SSIMGet; // unclipped / unchecked
+extern VP8SSIMGetClippedFunc VP8SSIMGetClipped; // with clipping
-extern VP8SSIMAccumulateFunc VP8SSIMAccumulate; // unclipped / unchecked
-extern VP8SSIMAccumulateClippedFunc VP8SSIMAccumulateClipped; // with clipping
+typedef uint32_t (*VP8AccumulateSSEFunc)(const uint8_t* src1,
+ const uint8_t* src2, int len);
+extern VP8AccumulateSSEFunc VP8AccumulateSSE;
// must be called before using any of the above directly
void VP8SSIMDspInit(void);
@@ -416,6 +428,15 @@ extern void WebPConvertARGBToUV_C(const uint32_t* argb, uint8_t* u, uint8_t* v,
extern void WebPConvertRGBA32ToUV_C(const uint16_t* rgb,
uint8_t* u, uint8_t* v, int width);
+// utilities for accurate RGB->YUV conversion
+extern uint64_t (*WebPSharpYUVUpdateY)(const uint16_t* src, const uint16_t* ref,
+ uint16_t* dst, int len);
+extern void (*WebPSharpYUVUpdateRGB)(const int16_t* src, const int16_t* ref,
+ int16_t* dst, int len);
+extern void (*WebPSharpYUVFilterRow)(const int16_t* A, const int16_t* B,
+ int len,
+ const uint16_t* best_y, uint16_t* out);
+
// Must be called before using the above.
void WebPInitConvertARGBToYUV(void);
@@ -488,6 +509,10 @@ extern int (*WebPExtractAlpha)(const uint8_t* argb, int argb_stride,
int width, int height,
uint8_t* alpha, int alpha_stride);
+// Extract the green values from 32b values in argb[] and pack them into alpha[]
+// (this is the opposite of WebPDispatchAlphaToGreen).
+extern void (*WebPExtractGreen)(const uint32_t* argb, uint8_t* alpha, int size);
+
// Pre-Multiply operation transforms x into x * A / 255 (where x=Y,R,G or B).
// Un-Multiply operation transforms x into x * 255 / A.
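A plausible scalar reference for the WebPExtractGreen hook added above, assuming the usual 0xAARRGGBB pixel layout in which green occupies bits 8..15 (a sketch, not the library's dispatched implementation):

    #include <stdint.h>

    static void ExtractGreenRef(const uint32_t* argb, uint8_t* alpha, int size) {
      int i;
      for (i = 0; i < size; ++i) {
        alpha[i] = (uint8_t)(argb[i] >> 8);   // keep only the green byte
      }
    }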
diff --git a/src/3rdparty/libwebp/src/dsp/enc.c b/src/3rdparty/libwebp/src/dsp/enc.c
index f639f55..f31bc6d 100644
--- a/src/3rdparty/libwebp/src/dsp/enc.c
+++ b/src/3rdparty/libwebp/src/dsp/enc.c
@@ -15,7 +15,7 @@
#include <stdlib.h> // for abs()
#include "./dsp.h"
-#include "../enc/vp8enci.h"
+#include "../enc/vp8i_enc.h"
static WEBP_INLINE uint8_t clip_8b(int v) {
return (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255;
@@ -335,7 +335,7 @@ static void Intra16Preds(uint8_t* dst,
// luma 4x4 prediction
#define DST(x, y) dst[(x) + (y) * BPS]
-#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
+#define AVG3(a, b, c) ((uint8_t)(((a) + 2 * (b) + (c) + 2) >> 2))
#define AVG2(a, b) (((a) + (b) + 1) >> 1)
static void VE4(uint8_t* dst, const uint8_t* top) { // vertical
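The uint8_t cast added to AVG3 only truncates values that already fit in a byte for valid pixel inputs; a standalone worked check of both rounding macros:

    #include <assert.h>
    #include <stdint.h>

    #define AVG3(a, b, c) ((uint8_t)(((a) + 2 * (b) + (c) + 2) >> 2))
    #define AVG2(a, b) (((a) + (b) + 1) >> 1)

    static void AvgDemo(void) {
      assert(AVG3(10, 20, 30) == 20);   // (10 + 40 + 30 + 2) >> 2 == 82 >> 2
      assert(AVG2(10, 21) == 16);       // (10 + 21 + 1) >> 1 == 32 >> 1
    }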
@@ -551,6 +551,20 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) {
return GetSSE(a, b, 4, 4);
}
+static void Mean16x4(const uint8_t* ref, uint32_t dc[4]) {
+ int k, x, y;
+ for (k = 0; k < 4; ++k) {
+ uint32_t avg = 0;
+ for (y = 0; y < 4; ++y) {
+ for (x = 0; x < 4; ++x) {
+ avg += ref[x + y * BPS];
+ }
+ }
+ dc[k] = avg;
+ ref += 4; // go to next 4x4 block.
+ }
+}
+
//------------------------------------------------------------------------------
// Texture distortion
//
@@ -656,32 +670,6 @@ static int Quantize2Blocks(int16_t in[32], int16_t out[32],
return nz;
}
-static int QuantizeBlockWHT(int16_t in[16], int16_t out[16],
- const VP8Matrix* const mtx) {
- int n, last = -1;
- for (n = 0; n < 16; ++n) {
- const int j = kZigzag[n];
- const int sign = (in[j] < 0);
- const uint32_t coeff = sign ? -in[j] : in[j];
- assert(mtx->sharpen_[j] == 0);
- if (coeff > mtx->zthresh_[j]) {
- const uint32_t Q = mtx->q_[j];
- const uint32_t iQ = mtx->iq_[j];
- const uint32_t B = mtx->bias_[j];
- int level = QUANTDIV(coeff, iQ, B);
- if (level > MAX_LEVEL) level = MAX_LEVEL;
- if (sign) level = -level;
- in[j] = level * (int)Q;
- out[n] = level;
- if (level) last = n;
- } else {
- out[n] = 0;
- in[j] = 0;
- }
- }
- return (last >= 0);
-}
-
//------------------------------------------------------------------------------
// Block copy
@@ -703,11 +691,51 @@ static void Copy16x8(const uint8_t* src, uint8_t* dst) {
}
//------------------------------------------------------------------------------
+// SSIM / PSNR
-static void SSIMAccumulateClipped(const uint8_t* src1, int stride1,
- const uint8_t* src2, int stride2,
- int xo, int yo, int W, int H,
- VP8DistoStats* const stats) {
+// Hat-shaped filter. The sum of its coefficients is 16.
+static const uint32_t kWeight[2 * VP8_SSIM_KERNEL + 1] = {
+ 1, 2, 3, 4, 3, 2, 1
+};
+static const uint32_t kWeightSum = 16 * 16;  // (sum of kWeight)^2
+
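Worked check of the normalization: the 1-D taps sum to 1 + 2 + 3 + 4 + 3 + 2 + 1 = 16, and because the 2-D window weight is kWeight[x] * kWeight[y], a full (unclipped) 7x7 window carries a total weight of 16 * 16 = 256, exactly kWeightSum.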
+static WEBP_INLINE double SSIMCalculation(
+ const VP8DistoStats* const stats, uint32_t N /*num samples*/) {
+ const uint32_t w2 = N * N;
+ const uint32_t C1 = 20 * w2;
+ const uint32_t C2 = 60 * w2;
+ const uint32_t C3 = 8 * 8 * w2; // 'dark' limit ~= 6
+ const uint64_t xmxm = (uint64_t)stats->xm * stats->xm;
+ const uint64_t ymym = (uint64_t)stats->ym * stats->ym;
+ if (xmxm + ymym >= C3) {
+ const int64_t xmym = (int64_t)stats->xm * stats->ym;
+ const int64_t sxy = (int64_t)stats->xym * N - xmym; // can be negative
+ const uint64_t sxx = (uint64_t)stats->xxm * N - xmxm;
+ const uint64_t syy = (uint64_t)stats->yym * N - ymym;
+    // We descale by 8 bits to prevent overflow during the fnum/fden multiply.
+ const uint64_t num_S = (2 * (uint64_t)(sxy < 0 ? 0 : sxy) + C2) >> 8;
+ const uint64_t den_S = (sxx + syy + C2) >> 8;
+ const uint64_t fnum = (2 * xmym + C1) * num_S;
+ const uint64_t fden = (xmxm + ymym + C1) * den_S;
+ const double r = (double)fnum / fden;
+ assert(r >= 0. && r <= 1.0);
+ return r;
+ }
+ return 1.; // area is too dark to contribute meaningfully
+}
+
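For cross-checking the fixed-point arithmetic, a floating-point sketch of the same ratio (an assumption-laden reference, not library code: it mirrors SSIMCalculation above except for the dark-area early-out, and the >> 8 descaling cancels in the quotient):

    #include <stdint.h>

    static double SSIMReference(const VP8DistoStats* const s, uint32_t N) {
      const double w2 = (double)N * N;
      const double C1 = 20. * w2, C2 = 60. * w2;
      const double sxy = (double)s->xym * N - (double)s->xm * s->ym;
      const double sxx = (double)s->xxm * N - (double)s->xm * s->xm;
      const double syy = (double)s->yym * N - (double)s->ym * s->ym;
      const double num = (2. * s->xm * s->ym + C1) *
                         (2. * (sxy < 0. ? 0. : sxy) + C2);
      const double den = ((double)s->xm * s->xm + (double)s->ym * s->ym + C1) *
                         (sxx + syy + C2);
      return num / den;   // in [0, 1] for well-formed inputs
    }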
+double VP8SSIMFromStats(const VP8DistoStats* const stats) {
+ return SSIMCalculation(stats, kWeightSum);
+}
+
+double VP8SSIMFromStatsClipped(const VP8DistoStats* const stats) {
+ return SSIMCalculation(stats, stats->w);
+}
+
+static double SSIMGetClipped_C(const uint8_t* src1, int stride1,
+ const uint8_t* src2, int stride2,
+ int xo, int yo, int W, int H) {
+ VP8DistoStats stats = { 0, 0, 0, 0, 0, 0 };
const int ymin = (yo - VP8_SSIM_KERNEL < 0) ? 0 : yo - VP8_SSIM_KERNEL;
const int ymax = (yo + VP8_SSIM_KERNEL > H - 1) ? H - 1
: yo + VP8_SSIM_KERNEL;
@@ -719,38 +747,61 @@ static void SSIMAccumulateClipped(const uint8_t* src1, int stride1,
src2 += ymin * stride2;
for (y = ymin; y <= ymax; ++y, src1 += stride1, src2 += stride2) {
for (x = xmin; x <= xmax; ++x) {
- const int s1 = src1[x];
- const int s2 = src2[x];
- stats->w += 1;
- stats->xm += s1;
- stats->ym += s2;
- stats->xxm += s1 * s1;
- stats->xym += s1 * s2;
- stats->yym += s2 * s2;
+ const uint32_t w = kWeight[VP8_SSIM_KERNEL + x - xo]
+ * kWeight[VP8_SSIM_KERNEL + y - yo];
+ const uint32_t s1 = src1[x];
+ const uint32_t s2 = src2[x];
+ stats.w += w;
+ stats.xm += w * s1;
+ stats.ym += w * s2;
+ stats.xxm += w * s1 * s1;
+ stats.xym += w * s1 * s2;
+ stats.yym += w * s2 * s2;
}
}
+ return VP8SSIMFromStatsClipped(&stats);
}
-static void SSIMAccumulate(const uint8_t* src1, int stride1,
- const uint8_t* src2, int stride2,
- VP8DistoStats* const stats) {
+static double SSIMGet_C(const uint8_t* src1, int stride1,
+ const uint8_t* src2, int stride2) {
+ VP8DistoStats stats = { 0, 0, 0, 0, 0, 0 };
int x, y;
for (y = 0; y <= 2 * VP8_SSIM_KERNEL; ++y, src1 += stride1, src2 += stride2) {
for (x = 0; x <= 2 * VP8_SSIM_KERNEL; ++x) {
- const int s1 = src1[x];
- const int s2 = src2[x];
- stats->w += 1;
- stats->xm += s1;
- stats->ym += s2;
- stats->xxm += s1 * s1;
- stats->xym += s1 * s2;
- stats->yym += s2 * s2;
+ const uint32_t w = kWeight[x] * kWeight[y];
+ const uint32_t s1 = src1[x];
+ const uint32_t s2 = src2[x];
+ stats.xm += w * s1;
+ stats.ym += w * s2;
+ stats.xxm += w * s1 * s1;
+ stats.xym += w * s1 * s2;
+ stats.yym += w * s2 * s2;
}
}
+ return VP8SSIMFromStats(&stats);
+}
+
+//------------------------------------------------------------------------------
+
+static uint32_t AccumulateSSE(const uint8_t* src1,
+ const uint8_t* src2, int len) {
+ int i;
+ uint32_t sse2 = 0;
+ assert(len <= 65535); // to ensure that accumulation fits within uint32_t
+ for (i = 0; i < len; ++i) {
+ const int32_t diff = src1[i] - src2[i];
+ sse2 += diff * diff;
+ }
+ return sse2;
}
-VP8SSIMAccumulateFunc VP8SSIMAccumulate;
-VP8SSIMAccumulateClippedFunc VP8SSIMAccumulateClipped;
+//------------------------------------------------------------------------------
+
+VP8SSIMGetFunc VP8SSIMGet;
+VP8SSIMGetClippedFunc VP8SSIMGetClipped;
+VP8AccumulateSSEFunc VP8AccumulateSSE;
+
+extern void VP8SSIMDspInitSSE2(void);
static volatile VP8CPUInfo ssim_last_cpuinfo_used =
(VP8CPUInfo)&ssim_last_cpuinfo_used;
@@ -758,8 +809,17 @@ static volatile VP8CPUInfo ssim_last_cpuinfo_used =
WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInit(void) {
if (ssim_last_cpuinfo_used == VP8GetCPUInfo) return;
- VP8SSIMAccumulate = SSIMAccumulate;
- VP8SSIMAccumulateClipped = SSIMAccumulateClipped;
+ VP8SSIMGetClipped = SSIMGetClipped_C;
+ VP8SSIMGet = SSIMGet_C;
+
+ VP8AccumulateSSE = AccumulateSSE;
+ if (VP8GetCPUInfo != NULL) {
+#if defined(WEBP_USE_SSE2)
+ if (VP8GetCPUInfo(kSSE2)) {
+ VP8SSIMDspInitSSE2();
+ }
+#endif
+ }
ssim_last_cpuinfo_used = VP8GetCPUInfo;
}
@@ -783,6 +843,7 @@ VP8Metric VP8SSE16x8;
VP8Metric VP8SSE4x4;
VP8WMetric VP8TDisto4x4;
VP8WMetric VP8TDisto16x16;
+VP8MeanMetric VP8Mean16x4;
VP8QuantizeBlock VP8EncQuantizeBlock;
VP8Quantize2Blocks VP8EncQuantize2Blocks;
VP8QuantizeBlockWHT VP8EncQuantizeBlockWHT;
@@ -795,6 +856,7 @@ extern void VP8EncDspInitAVX2(void);
extern void VP8EncDspInitNEON(void);
extern void VP8EncDspInitMIPS32(void);
extern void VP8EncDspInitMIPSdspR2(void);
+extern void VP8EncDspInitMSA(void);
static volatile VP8CPUInfo enc_last_cpuinfo_used =
(VP8CPUInfo)&enc_last_cpuinfo_used;
@@ -820,9 +882,10 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInit(void) {
VP8SSE4x4 = SSE4x4;
VP8TDisto4x4 = Disto4x4;
VP8TDisto16x16 = Disto16x16;
+ VP8Mean16x4 = Mean16x4;
VP8EncQuantizeBlock = QuantizeBlock;
VP8EncQuantize2Blocks = Quantize2Blocks;
- VP8EncQuantizeBlockWHT = QuantizeBlockWHT;
+ VP8EncQuantizeBlockWHT = QuantizeBlock;
VP8Copy4x4 = Copy4x4;
VP8Copy16x8 = Copy16x8;
@@ -858,6 +921,11 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInit(void) {
VP8EncDspInitMIPSdspR2();
}
#endif
+#if defined(WEBP_USE_MSA)
+ if (VP8GetCPUInfo(kMSA)) {
+ VP8EncDspInitMSA();
+ }
+#endif
}
enc_last_cpuinfo_used = VP8GetCPUInfo;
}
diff --git a/src/3rdparty/libwebp/src/dsp/enc_mips32.c b/src/3rdparty/libwebp/src/dsp/enc_mips32.c
index fd10143..752b14d 100644
--- a/src/3rdparty/libwebp/src/dsp/enc_mips32.c
+++ b/src/3rdparty/libwebp/src/dsp/enc_mips32.c
@@ -18,8 +18,8 @@
#if defined(WEBP_USE_MIPS32)
#include "./mips_macro.h"
-#include "../enc/vp8enci.h"
-#include "../enc/cost.h"
+#include "../enc/vp8i_enc.h"
+#include "../enc/cost_enc.h"
static const int kC1 = 20091 + (1 << 16);
static const int kC2 = 35468;
diff --git a/src/3rdparty/libwebp/src/dsp/enc_mips_dsp_r2.c b/src/3rdparty/libwebp/src/dsp/enc_mips_dsp_r2.c
index 7ab96f6..6c8c1c6 100644
--- a/src/3rdparty/libwebp/src/dsp/enc_mips_dsp_r2.c
+++ b/src/3rdparty/libwebp/src/dsp/enc_mips_dsp_r2.c
@@ -17,8 +17,8 @@
#if defined(WEBP_USE_MIPS_DSP_R2)
#include "./mips_macro.h"
-#include "../enc/cost.h"
-#include "../enc/vp8enci.h"
+#include "../enc/cost_enc.h"
+#include "../enc/vp8i_enc.h"
static const int kC1 = 20091 + (1 << 16);
static const int kC2 = 35468;
diff --git a/src/3rdparty/libwebp/src/dsp/enc_msa.c b/src/3rdparty/libwebp/src/dsp/enc_msa.c
new file mode 100644
index 0000000..909b46d
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/enc_msa.c
@@ -0,0 +1,892 @@
+// Copyright 2016 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MSA version of encoder dsp functions.
+//
+// Author: Prashant Patil (prashant.patil@imgtec.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_MSA)
+
+#include <stdlib.h>
+#include "./msa_macro.h"
+#include "../enc/vp8i_enc.h"
+
+//------------------------------------------------------------------------------
+// Transforms
+
+#define IDCT_1D_W(in0, in1, in2, in3, out0, out1, out2, out3) do { \
+ v4i32 a1_m, b1_m, c1_m, d1_m; \
+ const v4i32 cospi8sqrt2minus1 = __msa_fill_w(20091); \
+ const v4i32 sinpi8sqrt2 = __msa_fill_w(35468); \
+ v4i32 c_tmp1_m = in1 * sinpi8sqrt2; \
+ v4i32 c_tmp2_m = in3 * cospi8sqrt2minus1; \
+ v4i32 d_tmp1_m = in1 * cospi8sqrt2minus1; \
+ v4i32 d_tmp2_m = in3 * sinpi8sqrt2; \
+ \
+ ADDSUB2(in0, in2, a1_m, b1_m); \
+ SRAI_W2_SW(c_tmp1_m, c_tmp2_m, 16); \
+ c_tmp2_m = c_tmp2_m + in3; \
+ c1_m = c_tmp1_m - c_tmp2_m; \
+ SRAI_W2_SW(d_tmp1_m, d_tmp2_m, 16); \
+ d_tmp1_m = d_tmp1_m + in1; \
+ d1_m = d_tmp1_m + d_tmp2_m; \
+ BUTTERFLY_4(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3); \
+} while (0)
+
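For reference, the two fixed-point constants are the standard VP8 inverse-DCT factors in Q16: 35468 ≈ sqrt(2) * sin(pi/8) * 65536 (0.541196 * 65536) and 20091 ≈ (sqrt(2) * cos(pi/8) - 1) * 65536 (0.306563 * 65536), matching the macro's variable names.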
+static WEBP_INLINE void ITransformOne(const uint8_t* ref, const int16_t* in,
+ uint8_t* dst) {
+ v8i16 input0, input1;
+ v4i32 in0, in1, in2, in3, hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3;
+ v4i32 res0, res1, res2, res3;
+ v16i8 dest0, dest1, dest2, dest3;
+ const v16i8 zero = { 0 };
+
+ LD_SH2(in, 8, input0, input1);
+ UNPCK_SH_SW(input0, in0, in1);
+ UNPCK_SH_SW(input1, in2, in3);
+ IDCT_1D_W(in0, in1, in2, in3, hz0, hz1, hz2, hz3);
+ TRANSPOSE4x4_SW_SW(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3);
+ IDCT_1D_W(hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3);
+ SRARI_W4_SW(vt0, vt1, vt2, vt3, 3);
+ TRANSPOSE4x4_SW_SW(vt0, vt1, vt2, vt3, vt0, vt1, vt2, vt3);
+ LD_SB4(ref, BPS, dest0, dest1, dest2, dest3);
+ ILVR_B4_SW(zero, dest0, zero, dest1, zero, dest2, zero, dest3,
+ res0, res1, res2, res3);
+ ILVR_H4_SW(zero, res0, zero, res1, zero, res2, zero, res3,
+ res0, res1, res2, res3);
+ ADD4(res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0, res1, res2, res3);
+ CLIP_SW4_0_255(res0, res1, res2, res3);
+ PCKEV_B2_SW(res0, res1, res2, res3, vt0, vt1);
+ res0 = (v4i32)__msa_pckev_b((v16i8)vt0, (v16i8)vt1);
+ ST4x4_UB(res0, res0, 3, 2, 1, 0, dst, BPS);
+}
+
+static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
+ int do_two) {
+ ITransformOne(ref, in, dst);
+ if (do_two) {
+ ITransformOne(ref + 4, in + 16, dst + 4);
+ }
+}
+
+static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
+ uint64_t out0, out1, out2, out3;
+ uint32_t in0, in1, in2, in3;
+ v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+ v8i16 t0, t1, t2, t3;
+ v16u8 srcl0, srcl1, src0, src1;
+ const v8i16 mask0 = { 0, 4, 8, 12, 1, 5, 9, 13 };
+ const v8i16 mask1 = { 3, 7, 11, 15, 2, 6, 10, 14 };
+ const v8i16 mask2 = { 4, 0, 5, 1, 6, 2, 7, 3 };
+ const v8i16 mask3 = { 0, 4, 1, 5, 2, 6, 3, 7 };
+ const v8i16 cnst0 = { 2217, -5352, 2217, -5352, 2217, -5352, 2217, -5352 };
+ const v8i16 cnst1 = { 5352, 2217, 5352, 2217, 5352, 2217, 5352, 2217 };
+
+ LW4(src, BPS, in0, in1, in2, in3);
+ INSERT_W4_UB(in0, in1, in2, in3, src0);
+ LW4(ref, BPS, in0, in1, in2, in3);
+ INSERT_W4_UB(in0, in1, in2, in3, src1);
+ ILVRL_B2_UB(src0, src1, srcl0, srcl1);
+ HSUB_UB2_SH(srcl0, srcl1, t0, t1);
+ VSHF_H2_SH(t0, t1, t0, t1, mask0, mask1, t2, t3);
+ ADDSUB2(t2, t3, t0, t1);
+ t0 = SRLI_H(t0, 3);
+ VSHF_H2_SH(t0, t0, t1, t1, mask2, mask3, t3, t2);
+ tmp0 = __msa_hadd_s_w(t3, t3);
+ tmp2 = __msa_hsub_s_w(t3, t3);
+ FILL_W2_SW(1812, 937, tmp1, tmp3);
+ DPADD_SH2_SW(t2, t2, cnst0, cnst1, tmp3, tmp1);
+ SRAI_W2_SW(tmp1, tmp3, 9);
+ PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, t0, t1);
+ VSHF_H2_SH(t0, t1, t0, t1, mask0, mask1, t2, t3);
+ ADDSUB2(t2, t3, t0, t1);
+ VSHF_H2_SH(t0, t0, t1, t1, mask2, mask3, t3, t2);
+ tmp0 = __msa_hadd_s_w(t3, t3);
+ tmp2 = __msa_hsub_s_w(t3, t3);
+ ADDVI_W2_SW(tmp0, 7, tmp2, 7, tmp0, tmp2);
+ SRAI_W2_SW(tmp0, tmp2, 4);
+ FILL_W2_SW(12000, 51000, tmp1, tmp3);
+ DPADD_SH2_SW(t2, t2, cnst0, cnst1, tmp3, tmp1);
+ SRAI_W2_SW(tmp1, tmp3, 16);
+ UNPCK_R_SH_SW(t1, tmp4);
+ tmp5 = __msa_ceqi_w(tmp4, 0);
+ tmp4 = (v4i32)__msa_nor_v((v16u8)tmp5, (v16u8)tmp5);
+ tmp5 = __msa_fill_w(1);
+ tmp5 = (v4i32)__msa_and_v((v16u8)tmp5, (v16u8)tmp4);
+ tmp1 += tmp5;
+ PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, t0, t1);
+ out0 = __msa_copy_s_d((v2i64)t0, 0);
+ out1 = __msa_copy_s_d((v2i64)t0, 1);
+ out2 = __msa_copy_s_d((v2i64)t1, 0);
+ out3 = __msa_copy_s_d((v2i64)t1, 1);
+ SD4(out0, out1, out2, out3, out, 8);
+}
+
+static void FTransformWHT(const int16_t* in, int16_t* out) {
+ v8i16 in0 = { 0 };
+ v8i16 in1 = { 0 };
+ v8i16 tmp0, tmp1, tmp2, tmp3;
+ v8i16 out0, out1;
+ const v8i16 mask0 = { 0, 1, 2, 3, 8, 9, 10, 11 };
+ const v8i16 mask1 = { 4, 5, 6, 7, 12, 13, 14, 15 };
+ const v8i16 mask2 = { 0, 4, 8, 12, 1, 5, 9, 13 };
+ const v8i16 mask3 = { 3, 7, 11, 15, 2, 6, 10, 14 };
+
+ in0 = __msa_insert_h(in0, 0, in[ 0]);
+ in0 = __msa_insert_h(in0, 1, in[ 64]);
+ in0 = __msa_insert_h(in0, 2, in[128]);
+ in0 = __msa_insert_h(in0, 3, in[192]);
+ in0 = __msa_insert_h(in0, 4, in[ 16]);
+ in0 = __msa_insert_h(in0, 5, in[ 80]);
+ in0 = __msa_insert_h(in0, 6, in[144]);
+ in0 = __msa_insert_h(in0, 7, in[208]);
+ in1 = __msa_insert_h(in1, 0, in[ 48]);
+ in1 = __msa_insert_h(in1, 1, in[112]);
+ in1 = __msa_insert_h(in1, 2, in[176]);
+ in1 = __msa_insert_h(in1, 3, in[240]);
+ in1 = __msa_insert_h(in1, 4, in[ 32]);
+ in1 = __msa_insert_h(in1, 5, in[ 96]);
+ in1 = __msa_insert_h(in1, 6, in[160]);
+ in1 = __msa_insert_h(in1, 7, in[224]);
+ ADDSUB2(in0, in1, tmp0, tmp1);
+ VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
+ ADDSUB2(tmp2, tmp3, tmp0, tmp1);
+ VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask2, mask3, in0, in1);
+ ADDSUB2(in0, in1, tmp0, tmp1);
+ VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
+ ADDSUB2(tmp2, tmp3, out0, out1);
+ SRAI_H2_SH(out0, out1, 1);
+ ST_SH2(out0, out1, out, 8);
+}
+
+static int TTransform(const uint8_t* in, const uint16_t* w) {
+ int sum;
+ uint32_t in0_m, in1_m, in2_m, in3_m;
+ v16i8 src0;
+ v8i16 in0, in1, tmp0, tmp1, tmp2, tmp3;
+ v4i32 dst0, dst1;
+ const v16i8 zero = { 0 };
+ const v8i16 mask0 = { 0, 1, 2, 3, 8, 9, 10, 11 };
+ const v8i16 mask1 = { 4, 5, 6, 7, 12, 13, 14, 15 };
+ const v8i16 mask2 = { 0, 4, 8, 12, 1, 5, 9, 13 };
+ const v8i16 mask3 = { 3, 7, 11, 15, 2, 6, 10, 14 };
+
+ LW4(in, BPS, in0_m, in1_m, in2_m, in3_m);
+ INSERT_W4_SB(in0_m, in1_m, in2_m, in3_m, src0);
+ ILVRL_B2_SH(zero, src0, tmp0, tmp1);
+ VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask2, mask3, in0, in1);
+ ADDSUB2(in0, in1, tmp0, tmp1);
+ VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
+ ADDSUB2(tmp2, tmp3, tmp0, tmp1);
+ VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask2, mask3, in0, in1);
+ ADDSUB2(in0, in1, tmp0, tmp1);
+ VSHF_H2_SH(tmp0, tmp1, tmp0, tmp1, mask0, mask1, tmp2, tmp3);
+ ADDSUB2(tmp2, tmp3, tmp0, tmp1);
+ tmp0 = __msa_add_a_h(tmp0, (v8i16)zero);
+ tmp1 = __msa_add_a_h(tmp1, (v8i16)zero);
+ LD_SH2(w, 8, tmp2, tmp3);
+ DOTP_SH2_SW(tmp0, tmp1, tmp2, tmp3, dst0, dst1);
+ dst0 = dst0 + dst1;
+ sum = HADD_SW_S32(dst0);
+ return sum;
+}
+
+static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
+ const uint16_t* const w) {
+ const int sum1 = TTransform(a, w);
+ const int sum2 = TTransform(b, w);
+ return abs(sum2 - sum1) >> 5;
+}
+
+static int Disto16x16(const uint8_t* const a, const uint8_t* const b,
+ const uint16_t* const w) {
+ int D = 0;
+ int x, y;
+ for (y = 0; y < 16 * BPS; y += 4 * BPS) {
+ for (x = 0; x < 16; x += 4) {
+ D += Disto4x4(a + x + y, b + x + y, w);
+ }
+ }
+ return D;
+}
+
+//------------------------------------------------------------------------------
+// Histogram
+
+static void CollectHistogram(const uint8_t* ref, const uint8_t* pred,
+ int start_block, int end_block,
+ VP8Histogram* const histo) {
+ int j;
+ int distribution[MAX_COEFF_THRESH + 1] = { 0 };
+ for (j = start_block; j < end_block; ++j) {
+ int16_t out[16];
+ VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out);
+ {
+ int k;
+ v8i16 coeff0, coeff1;
+ const v8i16 zero = { 0 };
+ const v8i16 max_coeff_thr = __msa_ldi_h(MAX_COEFF_THRESH);
+ LD_SH2(&out[0], 8, coeff0, coeff1);
+ coeff0 = __msa_add_a_h(coeff0, zero);
+ coeff1 = __msa_add_a_h(coeff1, zero);
+ SRAI_H2_SH(coeff0, coeff1, 3);
+ coeff0 = __msa_min_s_h(coeff0, max_coeff_thr);
+ coeff1 = __msa_min_s_h(coeff1, max_coeff_thr);
+ ST_SH2(coeff0, coeff1, &out[0], 8);
+ for (k = 0; k < 16; ++k) {
+ ++distribution[out[k]];
+ }
+ }
+ }
+ VP8SetHistogramData(distribution, histo);
+}
+
+//------------------------------------------------------------------------------
+// Intra predictions
+
+// luma 4x4 prediction
+
+#define DST(x, y) dst[(x) + (y) * BPS]
+#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
+#define AVG2(a, b) (((a) + (b) + 1) >> 1)
+
+static WEBP_INLINE void VE4(uint8_t* dst, const uint8_t* top) { // vertical
+ const uint64_t val_m = LD(top - 1);
+ const v16u8 A = (v16u8)__msa_insert_d((v2i64)A, 0, val_m);
+ const v16u8 B = SLDI_UB(A, A, 1);
+ const v16u8 C = SLDI_UB(A, A, 2);
+ const v16u8 AC = __msa_ave_u_b(A, C);
+ const v16u8 B2 = __msa_ave_u_b(B, B);
+ const v16u8 R = __msa_aver_u_b(AC, B2);
+ const uint32_t out = __msa_copy_s_w((v4i32)R, 0);
+ SW4(out, out, out, out, dst, BPS);
+}
+
+static WEBP_INLINE void HE4(uint8_t* dst, const uint8_t* top) { // horizontal
+ const int X = top[-1];
+ const int I = top[-2];
+ const int J = top[-3];
+ const int K = top[-4];
+ const int L = top[-5];
+ WebPUint32ToMem(dst + 0 * BPS, 0x01010101U * AVG3(X, I, J));
+ WebPUint32ToMem(dst + 1 * BPS, 0x01010101U * AVG3(I, J, K));
+ WebPUint32ToMem(dst + 2 * BPS, 0x01010101U * AVG3(J, K, L));
+ WebPUint32ToMem(dst + 3 * BPS, 0x01010101U * AVG3(K, L, L));
+}
+
+static WEBP_INLINE void DC4(uint8_t* dst, const uint8_t* top) {
+ uint32_t dc = 4;
+ int i;
+ for (i = 0; i < 4; ++i) dc += top[i] + top[-5 + i];
+ dc >>= 3;
+ dc = dc | (dc << 8) | (dc << 16) | (dc << 24);
+ SW4(dc, dc, dc, dc, dst, BPS);
+}
+
+static WEBP_INLINE void RD4(uint8_t* dst, const uint8_t* top) {
+ const uint64_t val_m = LD(top - 5);
+ const v16u8 A1 = (v16u8)__msa_insert_d((v2i64)A1, 0, val_m);
+ const v16u8 A = (v16u8)__msa_insert_b((v16i8)A1, 8, top[3]);
+ const v16u8 B = SLDI_UB(A, A, 1);
+ const v16u8 C = SLDI_UB(A, A, 2);
+ const v16u8 AC = __msa_ave_u_b(A, C);
+ const v16u8 B2 = __msa_ave_u_b(B, B);
+ const v16u8 R0 = __msa_aver_u_b(AC, B2);
+ const v16u8 R1 = SLDI_UB(R0, R0, 1);
+ const v16u8 R2 = SLDI_UB(R1, R1, 1);
+ const v16u8 R3 = SLDI_UB(R2, R2, 1);
+ const uint32_t val0 = __msa_copy_s_w((v4i32)R0, 0);
+ const uint32_t val1 = __msa_copy_s_w((v4i32)R1, 0);
+ const uint32_t val2 = __msa_copy_s_w((v4i32)R2, 0);
+ const uint32_t val3 = __msa_copy_s_w((v4i32)R3, 0);
+ SW4(val3, val2, val1, val0, dst, BPS);
+}
+
+static WEBP_INLINE void LD4(uint8_t* dst, const uint8_t* top) {
+ const uint64_t val_m = LD(top);
+ const v16u8 A = (v16u8)__msa_insert_d((v2i64)A, 0, val_m);
+ const v16u8 B = SLDI_UB(A, A, 1);
+ const v16u8 C1 = SLDI_UB(A, A, 2);
+ const v16u8 C = (v16u8)__msa_insert_b((v16i8)C1, 6, top[7]);
+ const v16u8 AC = __msa_ave_u_b(A, C);
+ const v16u8 B2 = __msa_ave_u_b(B, B);
+ const v16u8 R0 = __msa_aver_u_b(AC, B2);
+ const v16u8 R1 = SLDI_UB(R0, R0, 1);
+ const v16u8 R2 = SLDI_UB(R1, R1, 1);
+ const v16u8 R3 = SLDI_UB(R2, R2, 1);
+ const uint32_t val0 = __msa_copy_s_w((v4i32)R0, 0);
+ const uint32_t val1 = __msa_copy_s_w((v4i32)R1, 0);
+ const uint32_t val2 = __msa_copy_s_w((v4i32)R2, 0);
+ const uint32_t val3 = __msa_copy_s_w((v4i32)R3, 0);
+ SW4(val0, val1, val2, val3, dst, BPS);
+}
+
+static WEBP_INLINE void VR4(uint8_t* dst, const uint8_t* top) {
+ const int X = top[-1];
+ const int I = top[-2];
+ const int J = top[-3];
+ const int K = top[-4];
+ const int A = top[0];
+ const int B = top[1];
+ const int C = top[2];
+ const int D = top[3];
+ DST(0, 0) = DST(1, 2) = AVG2(X, A);
+ DST(1, 0) = DST(2, 2) = AVG2(A, B);
+ DST(2, 0) = DST(3, 2) = AVG2(B, C);
+ DST(3, 0) = AVG2(C, D);
+ DST(0, 3) = AVG3(K, J, I);
+ DST(0, 2) = AVG3(J, I, X);
+ DST(0, 1) = DST(1, 3) = AVG3(I, X, A);
+ DST(1, 1) = DST(2, 3) = AVG3(X, A, B);
+ DST(2, 1) = DST(3, 3) = AVG3(A, B, C);
+ DST(3, 1) = AVG3(B, C, D);
+}
+
+static WEBP_INLINE void VL4(uint8_t* dst, const uint8_t* top) {
+ const int A = top[0];
+ const int B = top[1];
+ const int C = top[2];
+ const int D = top[3];
+ const int E = top[4];
+ const int F = top[5];
+ const int G = top[6];
+ const int H = top[7];
+ DST(0, 0) = AVG2(A, B);
+ DST(1, 0) = DST(0, 2) = AVG2(B, C);
+ DST(2, 0) = DST(1, 2) = AVG2(C, D);
+ DST(3, 0) = DST(2, 2) = AVG2(D, E);
+ DST(0, 1) = AVG3(A, B, C);
+ DST(1, 1) = DST(0, 3) = AVG3(B, C, D);
+ DST(2, 1) = DST(1, 3) = AVG3(C, D, E);
+ DST(3, 1) = DST(2, 3) = AVG3(D, E, F);
+ DST(3, 2) = AVG3(E, F, G);
+ DST(3, 3) = AVG3(F, G, H);
+}
+
+static WEBP_INLINE void HU4(uint8_t* dst, const uint8_t* top) {
+ const int I = top[-2];
+ const int J = top[-3];
+ const int K = top[-4];
+ const int L = top[-5];
+ DST(0, 0) = AVG2(I, J);
+ DST(2, 0) = DST(0, 1) = AVG2(J, K);
+ DST(2, 1) = DST(0, 2) = AVG2(K, L);
+ DST(1, 0) = AVG3(I, J, K);
+ DST(3, 0) = DST(1, 1) = AVG3(J, K, L);
+ DST(3, 1) = DST(1, 2) = AVG3(K, L, L);
+ DST(3, 2) = DST(2, 2) =
+ DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L;
+}
+
+static WEBP_INLINE void HD4(uint8_t* dst, const uint8_t* top) {
+ const int X = top[-1];
+ const int I = top[-2];
+ const int J = top[-3];
+ const int K = top[-4];
+ const int L = top[-5];
+ const int A = top[0];
+ const int B = top[1];
+ const int C = top[2];
+ DST(0, 0) = DST(2, 1) = AVG2(I, X);
+ DST(0, 1) = DST(2, 2) = AVG2(J, I);
+ DST(0, 2) = DST(2, 3) = AVG2(K, J);
+ DST(0, 3) = AVG2(L, K);
+ DST(3, 0) = AVG3(A, B, C);
+ DST(2, 0) = AVG3(X, A, B);
+ DST(1, 0) = DST(3, 1) = AVG3(I, X, A);
+ DST(1, 1) = DST(3, 2) = AVG3(J, I, X);
+ DST(1, 2) = DST(3, 3) = AVG3(K, J, I);
+ DST(1, 3) = AVG3(L, K, J);
+}
+
+static WEBP_INLINE void TM4(uint8_t* dst, const uint8_t* top) {
+ const v16i8 zero = { 0 };
+ const v8i16 TL = (v8i16)__msa_fill_h(top[-1]);
+ const v8i16 L0 = (v8i16)__msa_fill_h(top[-2]);
+ const v8i16 L1 = (v8i16)__msa_fill_h(top[-3]);
+ const v8i16 L2 = (v8i16)__msa_fill_h(top[-4]);
+ const v8i16 L3 = (v8i16)__msa_fill_h(top[-5]);
+ const v16u8 T1 = LD_UB(top);
+ const v8i16 T = (v8i16)__msa_ilvr_b(zero, (v16i8)T1);
+ const v8i16 d = T - TL;
+ v8i16 r0, r1, r2, r3;
+ ADD4(d, L0, d, L1, d, L2, d, L3, r0, r1, r2, r3);
+ CLIP_SH4_0_255(r0, r1, r2, r3);
+ PCKEV_ST4x4_UB(r0, r1, r2, r3, dst, BPS);
+}
+
+#undef DST
+#undef AVG3
+#undef AVG2
+
+static void Intra4Preds(uint8_t* dst, const uint8_t* top) {
+ DC4(I4DC4 + dst, top);
+ TM4(I4TM4 + dst, top);
+ VE4(I4VE4 + dst, top);
+ HE4(I4HE4 + dst, top);
+ RD4(I4RD4 + dst, top);
+ VR4(I4VR4 + dst, top);
+ LD4(I4LD4 + dst, top);
+ VL4(I4VL4 + dst, top);
+ HD4(I4HD4 + dst, top);
+ HU4(I4HU4 + dst, top);
+}
+
+// luma 16x16 prediction
+
+#define STORE16x16(out, dst) do { \
+ ST_UB8(out, out, out, out, out, out, out, out, dst + 0 * BPS, BPS); \
+ ST_UB8(out, out, out, out, out, out, out, out, dst + 8 * BPS, BPS); \
+} while (0)
+
+static WEBP_INLINE void VerticalPred16x16(uint8_t* dst, const uint8_t* top) {
+ if (top != NULL) {
+ const v16u8 out = LD_UB(top);
+ STORE16x16(out, dst);
+ } else {
+ const v16u8 out = (v16u8)__msa_fill_b(0x7f);
+ STORE16x16(out, dst);
+ }
+}
+
+static WEBP_INLINE void HorizontalPred16x16(uint8_t* dst,
+ const uint8_t* left) {
+ if (left != NULL) {
+ int j;
+ for (j = 0; j < 16; j += 4) {
+ const v16u8 L0 = (v16u8)__msa_fill_b(left[0]);
+ const v16u8 L1 = (v16u8)__msa_fill_b(left[1]);
+ const v16u8 L2 = (v16u8)__msa_fill_b(left[2]);
+ const v16u8 L3 = (v16u8)__msa_fill_b(left[3]);
+ ST_UB4(L0, L1, L2, L3, dst, BPS);
+ dst += 4 * BPS;
+ left += 4;
+ }
+ } else {
+ const v16u8 out = (v16u8)__msa_fill_b(0x81);
+ STORE16x16(out, dst);
+ }
+}
+
+static WEBP_INLINE void TrueMotion16x16(uint8_t* dst, const uint8_t* left,
+ const uint8_t* top) {
+ if (left != NULL) {
+ if (top != NULL) {
+ int j;
+ v8i16 d1, d2;
+ const v16i8 zero = { 0 };
+ const v8i16 TL = (v8i16)__msa_fill_h(left[-1]);
+ const v16u8 T = LD_UB(top);
+ ILVRL_B2_SH(zero, T, d1, d2);
+ SUB2(d1, TL, d2, TL, d1, d2);
+ for (j = 0; j < 16; j += 4) {
+ v16i8 t0, t1, t2, t3;
+ v8i16 r0, r1, r2, r3, r4, r5, r6, r7;
+ const v8i16 L0 = (v8i16)__msa_fill_h(left[j + 0]);
+ const v8i16 L1 = (v8i16)__msa_fill_h(left[j + 1]);
+ const v8i16 L2 = (v8i16)__msa_fill_h(left[j + 2]);
+ const v8i16 L3 = (v8i16)__msa_fill_h(left[j + 3]);
+ ADD4(d1, L0, d1, L1, d1, L2, d1, L3, r0, r1, r2, r3);
+ ADD4(d2, L0, d2, L1, d2, L2, d2, L3, r4, r5, r6, r7);
+ CLIP_SH4_0_255(r0, r1, r2, r3);
+ CLIP_SH4_0_255(r4, r5, r6, r7);
+ PCKEV_B4_SB(r4, r0, r5, r1, r6, r2, r7, r3, t0, t1, t2, t3);
+ ST_SB4(t0, t1, t2, t3, dst, BPS);
+ dst += 4 * BPS;
+ }
+ } else {
+ HorizontalPred16x16(dst, left);
+ }
+ } else {
+ if (top != NULL) {
+ VerticalPred16x16(dst, top);
+ } else {
+ const v16u8 out = (v16u8)__msa_fill_b(0x81);
+ STORE16x16(out, dst);
+ }
+ }
+}
+
+static WEBP_INLINE void DCMode16x16(uint8_t* dst, const uint8_t* left,
+ const uint8_t* top) {
+ int DC;
+ v16u8 out;
+ if (top != NULL && left != NULL) {
+ const v16u8 rtop = LD_UB(top);
+ const v8u16 dctop = __msa_hadd_u_h(rtop, rtop);
+ const v16u8 rleft = LD_UB(left);
+ const v8u16 dcleft = __msa_hadd_u_h(rleft, rleft);
+ const v8u16 dctemp = dctop + dcleft;
+ DC = HADD_UH_U32(dctemp);
+ DC = (DC + 16) >> 5;
+ } else if (left != NULL) { // left but no top
+ const v16u8 rleft = LD_UB(left);
+ const v8u16 dcleft = __msa_hadd_u_h(rleft, rleft);
+ DC = HADD_UH_U32(dcleft);
+ DC = (DC + DC + 16) >> 5;
+ } else if (top != NULL) { // top but no left
+ const v16u8 rtop = LD_UB(top);
+ const v8u16 dctop = __msa_hadd_u_h(rtop, rtop);
+ DC = HADD_UH_U32(dctop);
+ DC = (DC + DC + 16) >> 5;
+ } else { // no top, no left, nothing.
+ DC = 0x80;
+ }
+ out = (v16u8)__msa_fill_b(DC);
+ STORE16x16(out, dst);
+}
+
+static void Intra16Preds(uint8_t* dst,
+ const uint8_t* left, const uint8_t* top) {
+ DCMode16x16(I16DC16 + dst, left, top);
+ VerticalPred16x16(I16VE16 + dst, top);
+ HorizontalPred16x16(I16HE16 + dst, left);
+ TrueMotion16x16(I16TM16 + dst, left, top);
+}
+
+// Chroma 8x8 prediction
+
+#define CALC_DC8(in, out) do { \
+ const v8u16 temp0 = __msa_hadd_u_h(in, in); \
+ const v4u32 temp1 = __msa_hadd_u_w(temp0, temp0); \
+ const v2i64 temp2 = (v2i64)__msa_hadd_u_d(temp1, temp1); \
+ const v2i64 temp3 = __msa_splati_d(temp2, 1); \
+ const v2i64 temp4 = temp3 + temp2; \
+ const v16i8 temp5 = (v16i8)__msa_srari_d(temp4, 4); \
+ const v2i64 temp6 = (v2i64)__msa_splati_b(temp5, 0); \
+ out = __msa_copy_s_d(temp6, 0); \
+} while (0)
+
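In scalar terms, CALC_DC8 reduces the 16 loaded bytes to DC = (sum + 8) >> 4 (the srari shift rounds to nearest) and broadcasts that byte to all eight output lanes.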
+#define STORE8x8(out, dst) do { \
+ SD4(out, out, out, out, dst + 0 * BPS, BPS); \
+ SD4(out, out, out, out, dst + 4 * BPS, BPS); \
+} while (0)
+
+static WEBP_INLINE void VerticalPred8x8(uint8_t* dst, const uint8_t* top) {
+ if (top != NULL) {
+ const uint64_t out = LD(top);
+ STORE8x8(out, dst);
+ } else {
+ const uint64_t out = 0x7f7f7f7f7f7f7f7fULL;
+ STORE8x8(out, dst);
+ }
+}
+
+static WEBP_INLINE void HorizontalPred8x8(uint8_t* dst, const uint8_t* left) {
+ if (left != NULL) {
+ int j;
+ for (j = 0; j < 8; j += 4) {
+ const v16u8 L0 = (v16u8)__msa_fill_b(left[0]);
+ const v16u8 L1 = (v16u8)__msa_fill_b(left[1]);
+ const v16u8 L2 = (v16u8)__msa_fill_b(left[2]);
+ const v16u8 L3 = (v16u8)__msa_fill_b(left[3]);
+ const uint64_t out0 = __msa_copy_s_d((v2i64)L0, 0);
+ const uint64_t out1 = __msa_copy_s_d((v2i64)L1, 0);
+ const uint64_t out2 = __msa_copy_s_d((v2i64)L2, 0);
+ const uint64_t out3 = __msa_copy_s_d((v2i64)L3, 0);
+ SD4(out0, out1, out2, out3, dst, BPS);
+ dst += 4 * BPS;
+ left += 4;
+ }
+ } else {
+ const uint64_t out = 0x8181818181818181ULL;
+ STORE8x8(out, dst);
+ }
+}
+
+static WEBP_INLINE void TrueMotion8x8(uint8_t* dst, const uint8_t* left,
+ const uint8_t* top) {
+ if (left != NULL) {
+ if (top != NULL) {
+ int j;
+ const v8i16 TL = (v8i16)__msa_fill_h(left[-1]);
+ const v16u8 T1 = LD_UB(top);
+ const v16i8 zero = { 0 };
+ const v8i16 T = (v8i16)__msa_ilvr_b(zero, (v16i8)T1);
+ const v8i16 d = T - TL;
+ for (j = 0; j < 8; j += 4) {
+ uint64_t out0, out1, out2, out3;
+ v16i8 t0, t1;
+ v8i16 r0 = (v8i16)__msa_fill_h(left[j + 0]);
+ v8i16 r1 = (v8i16)__msa_fill_h(left[j + 1]);
+ v8i16 r2 = (v8i16)__msa_fill_h(left[j + 2]);
+ v8i16 r3 = (v8i16)__msa_fill_h(left[j + 3]);
+ ADD4(d, r0, d, r1, d, r2, d, r3, r0, r1, r2, r3);
+ CLIP_SH4_0_255(r0, r1, r2, r3);
+ PCKEV_B2_SB(r1, r0, r3, r2, t0, t1);
+ out0 = __msa_copy_s_d((v2i64)t0, 0);
+ out1 = __msa_copy_s_d((v2i64)t0, 1);
+ out2 = __msa_copy_s_d((v2i64)t1, 0);
+ out3 = __msa_copy_s_d((v2i64)t1, 1);
+ SD4(out0, out1, out2, out3, dst, BPS);
+ dst += 4 * BPS;
+ }
+ } else {
+ HorizontalPred8x8(dst, left);
+ }
+ } else {
+ if (top != NULL) {
+ VerticalPred8x8(dst, top);
+ } else {
+ const uint64_t out = 0x8181818181818181ULL;
+ STORE8x8(out, dst);
+ }
+ }
+}
+
+static WEBP_INLINE void DCMode8x8(uint8_t* dst, const uint8_t* left,
+ const uint8_t* top) {
+ uint64_t out;
+ v16u8 src;
+ if (top != NULL && left != NULL) {
+ const uint64_t left_m = LD(left);
+ const uint64_t top_m = LD(top);
+ INSERT_D2_UB(left_m, top_m, src);
+ CALC_DC8(src, out);
+ } else if (left != NULL) { // left but no top
+ const uint64_t left_m = LD(left);
+ INSERT_D2_UB(left_m, left_m, src);
+ CALC_DC8(src, out);
+ } else if (top != NULL) { // top but no left
+ const uint64_t top_m = LD(top);
+ INSERT_D2_UB(top_m, top_m, src);
+ CALC_DC8(src, out);
+ } else { // no top, no left, nothing.
+ src = (v16u8)__msa_fill_b(0x80);
+ out = __msa_copy_s_d((v2i64)src, 0);
+ }
+ STORE8x8(out, dst);
+}
+
+static void IntraChromaPreds(uint8_t* dst, const uint8_t* left,
+ const uint8_t* top) {
+ // U block
+ DCMode8x8(C8DC8 + dst, left, top);
+ VerticalPred8x8(C8VE8 + dst, top);
+ HorizontalPred8x8(C8HE8 + dst, left);
+ TrueMotion8x8(C8TM8 + dst, left, top);
+ // V block
+ dst += 8;
+ if (top != NULL) top += 8;
+ if (left != NULL) left += 16;
+ DCMode8x8(C8DC8 + dst, left, top);
+ VerticalPred8x8(C8VE8 + dst, top);
+ HorizontalPred8x8(C8HE8 + dst, left);
+ TrueMotion8x8(C8TM8 + dst, left, top);
+}
+
+//------------------------------------------------------------------------------
+// Metric
+
+#define PACK_DOTP_UB4_SW(in0, in1, in2, in3, out0, out1, out2, out3) do { \
+ v16u8 tmp0, tmp1; \
+ v8i16 tmp2, tmp3; \
+ ILVRL_B2_UB(in0, in1, tmp0, tmp1); \
+ HSUB_UB2_SH(tmp0, tmp1, tmp2, tmp3); \
+ DOTP_SH2_SW(tmp2, tmp3, tmp2, tmp3, out0, out1); \
+ ILVRL_B2_UB(in2, in3, tmp0, tmp1); \
+ HSUB_UB2_SH(tmp0, tmp1, tmp2, tmp3); \
+ DOTP_SH2_SW(tmp2, tmp3, tmp2, tmp3, out2, out3); \
+} while (0)
+
+#define PACK_DPADD_UB4_SW(in0, in1, in2, in3, out0, out1, out2, out3) do { \
+ v16u8 tmp0, tmp1; \
+ v8i16 tmp2, tmp3; \
+ ILVRL_B2_UB(in0, in1, tmp0, tmp1); \
+ HSUB_UB2_SH(tmp0, tmp1, tmp2, tmp3); \
+ DPADD_SH2_SW(tmp2, tmp3, tmp2, tmp3, out0, out1); \
+ ILVRL_B2_UB(in2, in3, tmp0, tmp1); \
+ HSUB_UB2_SH(tmp0, tmp1, tmp2, tmp3); \
+ DPADD_SH2_SW(tmp2, tmp3, tmp2, tmp3, out2, out3); \
+} while (0)
+
+static int SSE16x16(const uint8_t* a, const uint8_t* b) {
+ uint32_t sum;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
+ v4i32 out0, out1, out2, out3;
+
+ LD_UB8(a, BPS, src0, src1, src2, src3, src4, src5, src6, src7);
+ LD_UB8(b, BPS, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
+ PACK_DOTP_UB4_SW(src0, ref0, src1, ref1, out0, out1, out2, out3);
+ PACK_DPADD_UB4_SW(src2, ref2, src3, ref3, out0, out1, out2, out3);
+ PACK_DPADD_UB4_SW(src4, ref4, src5, ref5, out0, out1, out2, out3);
+ PACK_DPADD_UB4_SW(src6, ref6, src7, ref7, out0, out1, out2, out3);
+ a += 8 * BPS;
+ b += 8 * BPS;
+ LD_UB8(a, BPS, src0, src1, src2, src3, src4, src5, src6, src7);
+ LD_UB8(b, BPS, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
+ PACK_DPADD_UB4_SW(src0, ref0, src1, ref1, out0, out1, out2, out3);
+ PACK_DPADD_UB4_SW(src2, ref2, src3, ref3, out0, out1, out2, out3);
+ PACK_DPADD_UB4_SW(src4, ref4, src5, ref5, out0, out1, out2, out3);
+ PACK_DPADD_UB4_SW(src6, ref6, src7, ref7, out0, out1, out2, out3);
+ out0 += out1;
+ out2 += out3;
+ out0 += out2;
+ sum = HADD_SW_S32(out0);
+ return sum;
+}
+
+static int SSE16x8(const uint8_t* a, const uint8_t* b) {
+ uint32_t sum;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
+ v4i32 out0, out1, out2, out3;
+
+ LD_UB8(a, BPS, src0, src1, src2, src3, src4, src5, src6, src7);
+ LD_UB8(b, BPS, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
+ PACK_DOTP_UB4_SW(src0, ref0, src1, ref1, out0, out1, out2, out3);
+ PACK_DPADD_UB4_SW(src2, ref2, src3, ref3, out0, out1, out2, out3);
+ PACK_DPADD_UB4_SW(src4, ref4, src5, ref5, out0, out1, out2, out3);
+ PACK_DPADD_UB4_SW(src6, ref6, src7, ref7, out0, out1, out2, out3);
+ out0 += out1;
+ out2 += out3;
+ out0 += out2;
+ sum = HADD_SW_S32(out0);
+ return sum;
+}
+
+static int SSE8x8(const uint8_t* a, const uint8_t* b) {
+ uint32_t sum;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
+ v16u8 t0, t1, t2, t3;
+ v4i32 out0, out1, out2, out3;
+
+ LD_UB8(a, BPS, src0, src1, src2, src3, src4, src5, src6, src7);
+ LD_UB8(b, BPS, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7);
+ ILVR_B4_UB(src0, src1, src2, src3, ref0, ref1, ref2, ref3, t0, t1, t2, t3);
+ PACK_DOTP_UB4_SW(t0, t2, t1, t3, out0, out1, out2, out3);
+ ILVR_B4_UB(src4, src5, src6, src7, ref4, ref5, ref6, ref7, t0, t1, t2, t3);
+ PACK_DPADD_UB4_SW(t0, t2, t1, t3, out0, out1, out2, out3);
+ out0 += out1;
+ out2 += out3;
+ out0 += out2;
+ sum = HADD_SW_S32(out0);
+ return sum;
+}
+
+static int SSE4x4(const uint8_t* a, const uint8_t* b) {
+ uint32_t sum = 0;
+ uint32_t src0, src1, src2, src3, ref0, ref1, ref2, ref3;
+ v16u8 src, ref, tmp0, tmp1;
+ v8i16 diff0, diff1;
+ v4i32 out0, out1;
+
+ LW4(a, BPS, src0, src1, src2, src3);
+ LW4(b, BPS, ref0, ref1, ref2, ref3);
+ INSERT_W4_UB(src0, src1, src2, src3, src);
+ INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
+ ILVRL_B2_UB(src, ref, tmp0, tmp1);
+ HSUB_UB2_SH(tmp0, tmp1, diff0, diff1);
+ DOTP_SH2_SW(diff0, diff1, diff0, diff1, out0, out1);
+ out0 += out1;
+ sum = HADD_SW_S32(out0);
+ return sum;
+}
+
+//------------------------------------------------------------------------------
+// Quantization
+
+static int QuantizeBlock(int16_t in[16], int16_t out[16],
+ const VP8Matrix* const mtx) {
+ int sum;
+ v8i16 in0, in1, sh0, sh1, out0, out1;
+ v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, sign0, sign1;
+ v4i32 s0, s1, s2, s3, b0, b1, b2, b3, t0, t1, t2, t3;
+ const v8i16 zero = { 0 };
+ const v8i16 zigzag0 = { 0, 1, 4, 8, 5, 2, 3, 6 };
+ const v8i16 zigzag1 = { 9, 12, 13, 10, 7, 11, 14, 15 };
+ const v8i16 maxlevel = __msa_fill_h(MAX_LEVEL);
+
+ LD_SH2(&in[0], 8, in0, in1);
+ LD_SH2(&mtx->sharpen_[0], 8, sh0, sh1);
+ tmp4 = __msa_add_a_h(in0, zero);
+ tmp5 = __msa_add_a_h(in1, zero);
+ ILVRL_H2_SH(sh0, tmp4, tmp0, tmp1);
+ ILVRL_H2_SH(sh1, tmp5, tmp2, tmp3);
+ HADD_SH4_SW(tmp0, tmp1, tmp2, tmp3, s0, s1, s2, s3);
+ sign0 = (in0 < zero);
+ sign1 = (in1 < zero); // sign
+ LD_SH2(&mtx->iq_[0], 8, tmp0, tmp1); // iq
+ ILVRL_H2_SW(zero, tmp0, t0, t1);
+ ILVRL_H2_SW(zero, tmp1, t2, t3);
+ LD_SW4(&mtx->bias_[0], 4, b0, b1, b2, b3); // bias
+ MUL4(t0, s0, t1, s1, t2, s2, t3, s3, t0, t1, t2, t3);
+ ADD4(b0, t0, b1, t1, b2, t2, b3, t3, b0, b1, b2, b3);
+ SRAI_W4_SW(b0, b1, b2, b3, 17);
+ PCKEV_H2_SH(b1, b0, b3, b2, tmp2, tmp3);
+ tmp0 = (tmp2 > maxlevel);
+ tmp1 = (tmp3 > maxlevel);
+ tmp2 = (v8i16)__msa_bmnz_v((v16u8)tmp2, (v16u8)maxlevel, (v16u8)tmp0);
+ tmp3 = (v8i16)__msa_bmnz_v((v16u8)tmp3, (v16u8)maxlevel, (v16u8)tmp1);
+ SUB2(0, tmp2, 0, tmp3, tmp0, tmp1);
+ tmp2 = (v8i16)__msa_bmnz_v((v16u8)tmp2, (v16u8)tmp0, (v16u8)sign0);
+ tmp3 = (v8i16)__msa_bmnz_v((v16u8)tmp3, (v16u8)tmp1, (v16u8)sign1);
+ LD_SW4(&mtx->zthresh_[0], 4, t0, t1, t2, t3); // zthresh
+ t0 = (s0 > t0);
+ t1 = (s1 > t1);
+ t2 = (s2 > t2);
+ t3 = (s3 > t3);
+ PCKEV_H2_SH(t1, t0, t3, t2, tmp0, tmp1);
+ tmp4 = (v8i16)__msa_bmnz_v((v16u8)zero, (v16u8)tmp2, (v16u8)tmp0);
+ tmp5 = (v8i16)__msa_bmnz_v((v16u8)zero, (v16u8)tmp3, (v16u8)tmp1);
+ LD_SH2(&mtx->q_[0], 8, tmp0, tmp1);
+ MUL2(tmp4, tmp0, tmp5, tmp1, in0, in1);
+ VSHF_H2_SH(tmp4, tmp5, tmp4, tmp5, zigzag0, zigzag1, out0, out1);
+ ST_SH2(in0, in1, &in[0], 8);
+ ST_SH2(out0, out1, &out[0], 8);
+ out0 = __msa_add_a_h(out0, out1);
+ sum = HADD_SH_S32(out0);
+ return (sum > 0);
+}
+
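A scalar sketch of the per-coefficient quantizer the vector code above implements (assumptions: QFIX == 17, matching the 17-bit shift, and MAX_LEVEL == 2047 as in vp8i_enc.h; the zthresh test and zigzag reordering are omitted for brevity):

    #include <stdint.h>

    static int QuantizeCoeffRef(int16_t* const in, uint16_t sharpen,
                                uint16_t iq, uint32_t bias, uint16_t q) {
      const int sign = (*in < 0);
      const uint32_t coeff = (uint32_t)(sign ? -*in : *in) + sharpen;
      int level = (int)((coeff * iq + bias) >> 17);   // QFIX == 17 assumed
      if (level > 2047) level = 2047;                 // MAX_LEVEL assumed
      if (sign) level = -level;
      *in = (int16_t)(level * q);                     // reconstructed value
      return level;                                   // quantized output
    }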
+static int Quantize2Blocks(int16_t in[32], int16_t out[32],
+ const VP8Matrix* const mtx) {
+ int nz;
+ nz = VP8EncQuantizeBlock(in + 0 * 16, out + 0 * 16, mtx) << 0;
+ nz |= VP8EncQuantizeBlock(in + 1 * 16, out + 1 * 16, mtx) << 1;
+ return nz;
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8EncDspInitMSA(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMSA(void) {
+ VP8ITransform = ITransform;
+ VP8FTransform = FTransform;
+ VP8FTransformWHT = FTransformWHT;
+
+ VP8TDisto4x4 = Disto4x4;
+ VP8TDisto16x16 = Disto16x16;
+ VP8CollectHistogram = CollectHistogram;
+
+ VP8EncPredLuma4 = Intra4Preds;
+ VP8EncPredLuma16 = Intra16Preds;
+ VP8EncPredChroma8 = IntraChromaPreds;
+
+ VP8SSE16x16 = SSE16x16;
+ VP8SSE16x8 = SSE16x8;
+ VP8SSE8x8 = SSE8x8;
+ VP8SSE4x4 = SSE4x4;
+
+ VP8EncQuantizeBlock = QuantizeBlock;
+ VP8EncQuantize2Blocks = Quantize2Blocks;
+ VP8EncQuantizeBlockWHT = QuantizeBlock;
+}
+
+#else // !WEBP_USE_MSA
+
+WEBP_DSP_INIT_STUB(VP8EncDspInitMSA)
+
+#endif // WEBP_USE_MSA
diff --git a/src/3rdparty/libwebp/src/dsp/enc_neon.c b/src/3rdparty/libwebp/src/dsp/enc_neon.c
index 46f6bf9..6a078d6 100644
--- a/src/3rdparty/libwebp/src/dsp/enc_neon.c
+++ b/src/3rdparty/libwebp/src/dsp/enc_neon.c
@@ -18,7 +18,7 @@
#include <assert.h>
#include "./neon.h"
-#include "../enc/vp8enci.h"
+#include "../enc/vp8i_enc.h"
//------------------------------------------------------------------------------
// Transforms (Paragraph 14.4)
@@ -746,9 +746,14 @@ static WEBP_INLINE void AccumulateSSE16(const uint8_t* const a,
const uint8x16_t a0 = vld1q_u8(a);
const uint8x16_t b0 = vld1q_u8(b);
const uint8x16_t abs_diff = vabdq_u8(a0, b0);
- uint16x8_t prod = vmull_u8(vget_low_u8(abs_diff), vget_low_u8(abs_diff));
- prod = vmlal_u8(prod, vget_high_u8(abs_diff), vget_high_u8(abs_diff));
- *sum = vpadalq_u16(*sum, prod); // pair-wise add and accumulate
+ const uint16x8_t prod1 = vmull_u8(vget_low_u8(abs_diff),
+ vget_low_u8(abs_diff));
+ const uint16x8_t prod2 = vmull_u8(vget_high_u8(abs_diff),
+ vget_high_u8(abs_diff));
+ /* pair-wise adds and widen */
+ const uint32x4_t sum1 = vpaddlq_u16(prod1);
+ const uint32x4_t sum2 = vpaddlq_u16(prod2);
+ *sum = vaddq_u32(*sum, vaddq_u32(sum1, sum2));
}
// Horizontal sum of all four uint32_t values in 'sum'.
@@ -758,7 +763,7 @@ static int SumToInt(uint32x4_t sum) {
return (int)sum3;
}
-static int SSE16x16(const uint8_t* a, const uint8_t* b) {
+static int SSE16x16_NEON(const uint8_t* a, const uint8_t* b) {
uint32x4_t sum = vdupq_n_u32(0);
int y;
for (y = 0; y < 16; ++y) {
@@ -767,7 +772,7 @@ static int SSE16x16(const uint8_t* a, const uint8_t* b) {
return SumToInt(sum);
}
-static int SSE16x8(const uint8_t* a, const uint8_t* b) {
+static int SSE16x8_NEON(const uint8_t* a, const uint8_t* b) {
uint32x4_t sum = vdupq_n_u32(0);
int y;
for (y = 0; y < 8; ++y) {
@@ -776,7 +781,7 @@ static int SSE16x8(const uint8_t* a, const uint8_t* b) {
return SumToInt(sum);
}
-static int SSE8x8(const uint8_t* a, const uint8_t* b) {
+static int SSE8x8_NEON(const uint8_t* a, const uint8_t* b) {
uint32x4_t sum = vdupq_n_u32(0);
int y;
for (y = 0; y < 8; ++y) {
@@ -789,13 +794,18 @@ static int SSE8x8(const uint8_t* a, const uint8_t* b) {
return SumToInt(sum);
}
-static int SSE4x4(const uint8_t* a, const uint8_t* b) {
+static int SSE4x4_NEON(const uint8_t* a, const uint8_t* b) {
const uint8x16_t a0 = Load4x4(a);
const uint8x16_t b0 = Load4x4(b);
const uint8x16_t abs_diff = vabdq_u8(a0, b0);
- uint16x8_t prod = vmull_u8(vget_low_u8(abs_diff), vget_low_u8(abs_diff));
- prod = vmlal_u8(prod, vget_high_u8(abs_diff), vget_high_u8(abs_diff));
- return SumToInt(vpaddlq_u16(prod));
+ const uint16x8_t prod1 = vmull_u8(vget_low_u8(abs_diff),
+ vget_low_u8(abs_diff));
+ const uint16x8_t prod2 = vmull_u8(vget_high_u8(abs_diff),
+ vget_high_u8(abs_diff));
+ /* pair-wise adds and widen */
+ const uint32x4_t sum1 = vpaddlq_u16(prod1);
+ const uint32x4_t sum2 = vpaddlq_u16(prod2);
+ return SumToInt(vaddq_u32(sum1, sum2));
}
//------------------------------------------------------------------------------
@@ -903,10 +913,12 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitNEON(void) {
VP8TDisto4x4 = Disto4x4;
VP8TDisto16x16 = Disto16x16;
VP8CollectHistogram = CollectHistogram;
- VP8SSE16x16 = SSE16x16;
- VP8SSE16x8 = SSE16x8;
- VP8SSE8x8 = SSE8x8;
- VP8SSE4x4 = SSE4x4;
+
+ VP8SSE16x16 = SSE16x16_NEON;
+ VP8SSE16x8 = SSE16x8_NEON;
+ VP8SSE8x8 = SSE8x8_NEON;
+ VP8SSE4x4 = SSE4x4_NEON;
+
#if !defined(WORK_AROUND_GCC)
VP8EncQuantizeBlock = QuantizeBlock;
VP8EncQuantize2Blocks = Quantize2Blocks;
diff --git a/src/3rdparty/libwebp/src/dsp/enc_sse2.c b/src/3rdparty/libwebp/src/dsp/enc_sse2.c
index 4a2e3ce..2026a74 100644
--- a/src/3rdparty/libwebp/src/dsp/enc_sse2.c
+++ b/src/3rdparty/libwebp/src/dsp/enc_sse2.c
@@ -14,12 +14,13 @@
#include "./dsp.h"
#if defined(WEBP_USE_SSE2)
+#include <assert.h>
#include <stdlib.h> // for abs()
#include <emmintrin.h>
#include "./common_sse2.h"
-#include "../enc/cost.h"
-#include "../enc/vp8enci.h"
+#include "../enc/cost_enc.h"
+#include "../enc/vp8i_enc.h"
//------------------------------------------------------------------------------
// Transforms (Paragraph 14.4)
@@ -139,7 +140,7 @@ static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
// Transpose the two 4x4.
VP8Transpose_2_4x4_16b(&shifted0, &shifted1, &shifted2, &shifted3, &T0, &T1,
- &T2, &T3);
+ &T2, &T3);
}
// Add inverse transform to 'ref' and store.
@@ -250,25 +251,11 @@ static void FTransformPass2(const __m128i* const v01, const __m128i* const v32,
const __m128i k51000 = _mm_set1_epi32(51000);
// Same operations are done on the (0,3) and (1,2) pairs.
- // a0 = v0 + v3
- // a1 = v1 + v2
// a3 = v0 - v3
// a2 = v1 - v2
- const __m128i a01 = _mm_add_epi16(*v01, *v32);
const __m128i a32 = _mm_sub_epi16(*v01, *v32);
- const __m128i a11 = _mm_unpackhi_epi64(a01, a01);
const __m128i a22 = _mm_unpackhi_epi64(a32, a32);
- const __m128i a01_plus_7 = _mm_add_epi16(a01, seven);
- // d0 = (a0 + a1 + 7) >> 4;
- // d2 = (a0 - a1 + 7) >> 4;
- const __m128i c0 = _mm_add_epi16(a01_plus_7, a11);
- const __m128i c2 = _mm_sub_epi16(a01_plus_7, a11);
- const __m128i d0 = _mm_srai_epi16(c0, 4);
- const __m128i d2 = _mm_srai_epi16(c2, 4);
-
- // f1 = ((b3 * 5352 + b2 * 2217 + 12000) >> 16)
- // f3 = ((b3 * 2217 - b2 * 5352 + 51000) >> 16)
const __m128i b23 = _mm_unpacklo_epi16(a22, a32);
const __m128i c1 = _mm_madd_epi16(b23, k5352_2217);
const __m128i c3 = _mm_madd_epi16(b23, k2217_5352);
@@ -276,14 +263,28 @@ static void FTransformPass2(const __m128i* const v01, const __m128i* const v32,
const __m128i d3 = _mm_add_epi32(c3, k51000);
const __m128i e1 = _mm_srai_epi32(d1, 16);
const __m128i e3 = _mm_srai_epi32(d3, 16);
+ // f1 = ((b3 * 5352 + b2 * 2217 + 12000) >> 16)
+ // f3 = ((b3 * 2217 - b2 * 5352 + 51000) >> 16)
const __m128i f1 = _mm_packs_epi32(e1, e1);
const __m128i f3 = _mm_packs_epi32(e3, e3);
- // f1 = f1 + (a3 != 0);
+ // g1 = f1 + (a3 != 0);
// The compare will return (0xffff, 0) for (==0, !=0). To turn that into the
// desired (0, 1), we add one earlier through k12000_plus_one.
- // -> f1 = f1 + 1 - (a3 == 0)
+ // -> g1 = f1 + 1 - (a3 == 0)
const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero));
+ // a0 = v0 + v3
+ // a1 = v1 + v2
+ const __m128i a01 = _mm_add_epi16(*v01, *v32);
+ const __m128i a01_plus_7 = _mm_add_epi16(a01, seven);
+ const __m128i a11 = _mm_unpackhi_epi64(a01, a01);
+ const __m128i c0 = _mm_add_epi16(a01_plus_7, a11);
+ const __m128i c2 = _mm_sub_epi16(a01_plus_7, a11);
+ // d0 = (a0 + a1 + 7) >> 4;
+ // d2 = (a0 - a1 + 7) >> 4;
+ const __m128i d0 = _mm_srai_epi16(c0, 4);
+ const __m128i d2 = _mm_srai_epi16(c2, 4);
+
const __m128i d0_g1 = _mm_unpacklo_epi64(d0, g1);
const __m128i d2_f3 = _mm_unpacklo_epi64(d2, f3);
_mm_storeu_si128((__m128i*)&out[0], d0_g1);
@@ -1046,6 +1047,37 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) {
}
//------------------------------------------------------------------------------
+
+static void Mean16x4(const uint8_t* ref, uint32_t dc[4]) {
+ const __m128i mask = _mm_set1_epi16(0x00ff);
+ const __m128i a0 = _mm_loadu_si128((const __m128i*)&ref[BPS * 0]);
+ const __m128i a1 = _mm_loadu_si128((const __m128i*)&ref[BPS * 1]);
+ const __m128i a2 = _mm_loadu_si128((const __m128i*)&ref[BPS * 2]);
+ const __m128i a3 = _mm_loadu_si128((const __m128i*)&ref[BPS * 3]);
+ const __m128i b0 = _mm_srli_epi16(a0, 8); // hi byte
+ const __m128i b1 = _mm_srli_epi16(a1, 8);
+ const __m128i b2 = _mm_srli_epi16(a2, 8);
+ const __m128i b3 = _mm_srli_epi16(a3, 8);
+ const __m128i c0 = _mm_and_si128(a0, mask); // lo byte
+ const __m128i c1 = _mm_and_si128(a1, mask);
+ const __m128i c2 = _mm_and_si128(a2, mask);
+ const __m128i c3 = _mm_and_si128(a3, mask);
+ const __m128i d0 = _mm_add_epi32(b0, c0);
+ const __m128i d1 = _mm_add_epi32(b1, c1);
+ const __m128i d2 = _mm_add_epi32(b2, c2);
+ const __m128i d3 = _mm_add_epi32(b3, c3);
+ const __m128i e0 = _mm_add_epi32(d0, d1);
+ const __m128i e1 = _mm_add_epi32(d2, d3);
+ const __m128i f0 = _mm_add_epi32(e0, e1);
+ uint16_t tmp[8];
+ _mm_storeu_si128((__m128i*)tmp, f0);
+ dc[0] = tmp[0] + tmp[1];
+ dc[1] = tmp[2] + tmp[3];
+ dc[2] = tmp[4] + tmp[5];
+ dc[3] = tmp[6] + tmp[7];
+}
+
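Note on the epi32 additions over 16-bit packed data: each 16-bit lane accumulates at most 4 rows * 2 bytes = 8 pixels, i.e. at most 8 * 255 = 2040, so no lane ever carries into its neighbor and the 32-bit adds are equivalent to 16-bit ones; tmp[2i] + tmp[2i+1] then recovers the 16-pixel sum of 4x4 block #i.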
+//------------------------------------------------------------------------------
// Texture distortion
//
// We try to match the spectral content (weighted) between source and
@@ -1331,10 +1363,122 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitSSE2(void) {
VP8SSE4x4 = SSE4x4;
VP8TDisto4x4 = Disto4x4;
VP8TDisto16x16 = Disto16x16;
+ VP8Mean16x4 = Mean16x4;
+}
+
+//------------------------------------------------------------------------------
+// SSIM / PSNR entry point (TODO(skal): move to its own file later)
+
+static uint32_t AccumulateSSE_SSE2(const uint8_t* src1,
+ const uint8_t* src2, int len) {
+ int i = 0;
+ uint32_t sse2 = 0;
+ if (len >= 16) {
+ const int limit = len - 32;
+ int32_t tmp[4];
+ __m128i sum1;
+ __m128i sum = _mm_setzero_si128();
+ __m128i a0 = _mm_loadu_si128((const __m128i*)&src1[i]);
+ __m128i b0 = _mm_loadu_si128((const __m128i*)&src2[i]);
+ i += 16;
+ while (i <= limit) {
+ const __m128i a1 = _mm_loadu_si128((const __m128i*)&src1[i]);
+ const __m128i b1 = _mm_loadu_si128((const __m128i*)&src2[i]);
+ __m128i sum2;
+ i += 16;
+ SubtractAndAccumulate(a0, b0, &sum1);
+ sum = _mm_add_epi32(sum, sum1);
+ a0 = _mm_loadu_si128((const __m128i*)&src1[i]);
+ b0 = _mm_loadu_si128((const __m128i*)&src2[i]);
+ i += 16;
+ SubtractAndAccumulate(a1, b1, &sum2);
+ sum = _mm_add_epi32(sum, sum2);
+ }
+ SubtractAndAccumulate(a0, b0, &sum1);
+ sum = _mm_add_epi32(sum, sum1);
+ _mm_storeu_si128((__m128i*)tmp, sum);
+ sse2 += (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
+ }
+
+ for (; i < len; ++i) {
+ const int32_t diff = src1[i] - src2[i];
+ sse2 += diff * diff;
+ }
+ return sse2;
+}
+
+static uint32_t HorizontalAdd16b(const __m128i* const m) {
+ uint16_t tmp[8];
+ const __m128i a = _mm_srli_si128(*m, 8);
+ const __m128i b = _mm_add_epi16(*m, a);
+ _mm_storeu_si128((__m128i*)tmp, b);
+ return (uint32_t)tmp[3] + tmp[2] + tmp[1] + tmp[0];
+}
+
+static uint32_t HorizontalAdd32b(const __m128i* const m) {
+ const __m128i a = _mm_srli_si128(*m, 8);
+ const __m128i b = _mm_add_epi32(*m, a);
+ const __m128i c = _mm_add_epi32(b, _mm_srli_si128(b, 4));
+ return (uint32_t)_mm_cvtsi128_si32(c);
+}
+
+static const uint16_t kWeight[] = { 1, 2, 3, 4, 3, 2, 1, 0 };
+
+#define ACCUMULATE_ROW(WEIGHT) do { \
+ /* compute row weight (Wx * Wy) */ \
+ const __m128i Wy = _mm_set1_epi16((WEIGHT)); \
+ const __m128i W = _mm_mullo_epi16(Wx, Wy); \
+  /* each row loads 8 bytes, but kWeight[7] == 0 zeroes the 8th lane */      \
+ const __m128i a0 = _mm_loadl_epi64((const __m128i*)src1); \
+ const __m128i b0 = _mm_loadl_epi64((const __m128i*)src2); \
+ /* convert to 16b and multiply by weight */ \
+ const __m128i a1 = _mm_unpacklo_epi8(a0, zero); \
+ const __m128i b1 = _mm_unpacklo_epi8(b0, zero); \
+ const __m128i wa1 = _mm_mullo_epi16(a1, W); \
+ const __m128i wb1 = _mm_mullo_epi16(b1, W); \
+ /* accumulate */ \
+ xm = _mm_add_epi16(xm, wa1); \
+ ym = _mm_add_epi16(ym, wb1); \
+ xxm = _mm_add_epi32(xxm, _mm_madd_epi16(a1, wa1)); \
+ xym = _mm_add_epi32(xym, _mm_madd_epi16(a1, wb1)); \
+ yym = _mm_add_epi32(yym, _mm_madd_epi16(b1, wb1)); \
+ src1 += stride1; \
+ src2 += stride2; \
+} while (0)
+
+static double SSIMGet_SSE2(const uint8_t* src1, int stride1,
+ const uint8_t* src2, int stride2) {
+ VP8DistoStats stats;
+ const __m128i zero = _mm_setzero_si128();
+ __m128i xm = zero, ym = zero; // 16b accums
+ __m128i xxm = zero, yym = zero, xym = zero; // 32b accum
+ const __m128i Wx = _mm_loadu_si128((const __m128i*)kWeight);
+ assert(2 * VP8_SSIM_KERNEL + 1 == 7);
+ ACCUMULATE_ROW(1);
+ ACCUMULATE_ROW(2);
+ ACCUMULATE_ROW(3);
+ ACCUMULATE_ROW(4);
+ ACCUMULATE_ROW(3);
+ ACCUMULATE_ROW(2);
+ ACCUMULATE_ROW(1);
+ stats.xm = HorizontalAdd16b(&xm);
+ stats.ym = HorizontalAdd16b(&ym);
+ stats.xxm = HorizontalAdd32b(&xxm);
+ stats.xym = HorizontalAdd32b(&xym);
+ stats.yym = HorizontalAdd32b(&yym);
+ return VP8SSIMFromStats(&stats);
+}
+
+extern void VP8SSIMDspInitSSE2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInitSSE2(void) {
+ VP8AccumulateSSE = AccumulateSSE_SSE2;
+ VP8SSIMGet = SSIMGet_SSE2;
}
#else // !WEBP_USE_SSE2
WEBP_DSP_INIT_STUB(VP8EncDspInitSSE2)
+WEBP_DSP_INIT_STUB(VP8SSIMDspInitSSE2)
#endif // WEBP_USE_SSE2
diff --git a/src/3rdparty/libwebp/src/dsp/enc_sse41.c b/src/3rdparty/libwebp/src/dsp/enc_sse41.c
index a178390..e32086d 100644
--- a/src/3rdparty/libwebp/src/dsp/enc_sse41.c
+++ b/src/3rdparty/libwebp/src/dsp/enc_sse41.c
@@ -18,7 +18,7 @@
#include <stdlib.h> // for abs()
#include "./common_sse2.h"
-#include "../enc/vp8enci.h"
+#include "../enc/vp8i_enc.h"
//------------------------------------------------------------------------------
// Compute susceptibility based on DCT-coeff histograms.
diff --git a/src/3rdparty/libwebp/src/dsp/filters.c b/src/3rdparty/libwebp/src/dsp/filters.c
index 9f04faf..65f34aa 100644
--- a/src/3rdparty/libwebp/src/dsp/filters.c
+++ b/src/3rdparty/libwebp/src/dsp/filters.c
@@ -227,6 +227,8 @@ WebPFilterFunc WebPFilters[WEBP_FILTER_LAST];
WebPUnfilterFunc WebPUnfilters[WEBP_FILTER_LAST];
extern void VP8FiltersInitMIPSdspR2(void);
+extern void VP8FiltersInitMSA(void);
+extern void VP8FiltersInitNEON(void);
extern void VP8FiltersInitSSE2(void);
static volatile VP8CPUInfo filters_last_cpuinfo_used =
@@ -251,11 +253,21 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInit(void) {
VP8FiltersInitSSE2();
}
#endif
+#if defined(WEBP_USE_NEON)
+ if (VP8GetCPUInfo(kNEON)) {
+ VP8FiltersInitNEON();
+ }
+#endif
#if defined(WEBP_USE_MIPS_DSP_R2)
if (VP8GetCPUInfo(kMIPSdspR2)) {
VP8FiltersInitMIPSdspR2();
}
#endif
+#if defined(WEBP_USE_MSA)
+ if (VP8GetCPUInfo(kMSA)) {
+ VP8FiltersInitMSA();
+ }
+#endif
}
filters_last_cpuinfo_used = VP8GetCPUInfo;
}
diff --git a/src/3rdparty/libwebp/src/dsp/filters_msa.c b/src/3rdparty/libwebp/src/dsp/filters_msa.c
new file mode 100644
index 0000000..4b8922d
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/filters_msa.c
@@ -0,0 +1,202 @@
+// Copyright 2016 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MSA variant of alpha filters
+//
+// Author: Prashant Patil (prashant.patil@imgtec.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_MSA)
+
+#include "./msa_macro.h"
+
+#include <assert.h>
+
+static WEBP_INLINE void PredictLineInverse0(const uint8_t* src,
+ const uint8_t* pred,
+ uint8_t* dst, int length) {
+ v16u8 src0, pred0, dst0;
+ assert(length >= 0);
+ while (length >= 32) {
+ v16u8 src1, pred1, dst1;
+ LD_UB2(src, 16, src0, src1);
+ LD_UB2(pred, 16, pred0, pred1);
+ SUB2(src0, pred0, src1, pred1, dst0, dst1);
+ ST_UB2(dst0, dst1, dst, 16);
+ src += 32;
+ pred += 32;
+ dst += 32;
+ length -= 32;
+ }
+ if (length > 0) {
+ int i;
+ if (length >= 16) {
+ src0 = LD_UB(src);
+ pred0 = LD_UB(pred);
+ dst0 = src0 - pred0;
+ ST_UB(dst0, dst);
+ src += 16;
+ pred += 16;
+ dst += 16;
+ length -= 16;
+ }
+ for (i = 0; i < length; i++) {
+ dst[i] = src[i] - pred[i];
+ }
+ }
+}
+
+//------------------------------------------------------------------------------
+// Helpful macro.
+
+#define SANITY_CHECK(in, out) \
+ assert(in != NULL); \
+ assert(out != NULL); \
+ assert(width > 0); \
+ assert(height > 0); \
+ assert(stride >= width);
+
+//------------------------------------------------------------------------------
+// Horizontal filter
+
+static void HorizontalFilter(const uint8_t* data, int width, int height,
+ int stride, uint8_t* filtered_data) {
+ const uint8_t* preds = data;
+ const uint8_t* in = data;
+ uint8_t* out = filtered_data;
+ int row = 1;
+ SANITY_CHECK(in, out);
+
+ // Leftmost pixel is the same as input for topmost scanline.
+ out[0] = in[0];
+ PredictLineInverse0(in + 1, preds, out + 1, width - 1);
+ preds += stride;
+ in += stride;
+ out += stride;
+ // Filter line-by-line.
+ while (row < height) {
+ // Leftmost pixel is predicted from above.
+ PredictLineInverse0(in, preds - stride, out, 1);
+ PredictLineInverse0(in + 1, preds, out + 1, width - 1);
+ ++row;
+ preds += stride;
+ in += stride;
+ out += stride;
+ }
+}
+
+//------------------------------------------------------------------------------
+// Gradient filter
+
+static WEBP_INLINE void PredictLineGradient(const uint8_t* pinput,
+ const uint8_t* ppred,
+ uint8_t* poutput, int stride,
+ int size) {
+ int w;
+ const v16i8 zero = { 0 };
+ while (size >= 16) {
+ v16u8 pred0, dst0;
+ v8i16 a0, a1, b0, b1, c0, c1;
+ const v16u8 tmp0 = LD_UB(ppred - 1);
+ const v16u8 tmp1 = LD_UB(ppred - stride);
+ const v16u8 tmp2 = LD_UB(ppred - stride - 1);
+ const v16u8 src0 = LD_UB(pinput);
+ ILVRL_B2_SH(zero, tmp0, a0, a1);
+ ILVRL_B2_SH(zero, tmp1, b0, b1);
+ ILVRL_B2_SH(zero, tmp2, c0, c1);
+ ADD2(a0, b0, a1, b1, a0, a1);
+ SUB2(a0, c0, a1, c1, a0, a1);
+ CLIP_SH2_0_255(a0, a1);
+ pred0 = (v16u8)__msa_pckev_b((v16i8)a1, (v16i8)a0);
+ dst0 = src0 - pred0;
+ ST_UB(dst0, poutput);
+ ppred += 16;
+ pinput += 16;
+ poutput += 16;
+ size -= 16;
+ }
+ for (w = 0; w < size; ++w) {
+ const int pred = ppred[w - 1] + ppred[w - stride] - ppred[w - stride - 1];
+ poutput[w] = pinput[w] - (pred < 0 ? 0 : pred > 255 ? 255 : pred);
+ }
+}
+
+
+static void GradientFilter(const uint8_t* data, int width, int height,
+ int stride, uint8_t* filtered_data) {
+ const uint8_t* in = data;
+ const uint8_t* preds = data;
+ uint8_t* out = filtered_data;
+ int row = 1;
+ SANITY_CHECK(in, out);
+
+ // left prediction for top scan-line
+ out[0] = in[0];
+ PredictLineInverse0(in + 1, preds, out + 1, width - 1);
+ preds += stride;
+ in += stride;
+ out += stride;
+ // Filter line-by-line.
+ while (row < height) {
+    out[0] = in[0] - preds[-stride];
+ PredictLineGradient(preds + 1, in + 1, out + 1, stride, width - 1);
+ ++row;
+ preds += stride;
+ in += stride;
+ out += stride;
+ }
+}
+
+//------------------------------------------------------------------------------
+// Vertical filter
+
+static void VerticalFilter(const uint8_t* data, int width, int height,
+ int stride, uint8_t* filtered_data) {
+ const uint8_t* in = data;
+ const uint8_t* preds = data;
+ uint8_t* out = filtered_data;
+ int row = 1;
+ SANITY_CHECK(in, out);
+
+ // Very first top-left pixel is copied.
+ out[0] = in[0];
+ // Rest of top scan-line is left-predicted.
+ PredictLineInverse0(in + 1, preds, out + 1, width - 1);
+ in += stride;
+ out += stride;
+
+ // Filter line-by-line.
+ while (row < height) {
+ PredictLineInverse0(in, preds, out, width);
+ ++row;
+ preds += stride;
+ in += stride;
+ out += stride;
+ }
+}
+
+#undef SANITY_CHECK
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8FiltersInitMSA(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitMSA(void) {
+ WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter;
+ WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter;
+ WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter;
+}
+
+#else // !WEBP_USE_MSA
+
+WEBP_DSP_INIT_STUB(VP8FiltersInitMSA)
+
+#endif // WEBP_USE_MSA
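
The scalar tail of PredictLineGradient above spells out what the MSA vector
path computes per pixel: predict clip(left + top - top_left, 0, 255) and
store the residual current - prediction (mod 256). A small runnable model of
one interior pixel, including the decode-side inverse; GradientResidual is a
hypothetical helper distilled from that tail loop, not a libwebp function.

    // One-pixel model of the gradient filter and its inverse.
    #include <stdint.h>
    #include <stdio.h>

    static int Clip255(int v) { return v < 0 ? 0 : v > 255 ? 255 : v; }

    static uint8_t GradientResidual(uint8_t cur, uint8_t left, uint8_t top,
                                    uint8_t top_left) {
      const int pred = Clip255(left + top - top_left);
      return (uint8_t)(cur - pred);  // wraps mod 256; undone on decode
    }

    int main(void) {
      const uint8_t left = 100, top = 110, top_left = 90, cur = 123;
      const uint8_t res = GradientResidual(cur, left, top, top_left);
      // The unfilter step adds the same prediction back.
      const uint8_t rec = (uint8_t)(res + Clip255(left + top - top_left));
      printf("residual=%u recovered=%u\n", res, rec);  // residual=3, 123
      return 0;
    }
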
diff --git a/src/3rdparty/libwebp/src/dsp/filters_neon.c b/src/3rdparty/libwebp/src/dsp/filters_neon.c
new file mode 100644
index 0000000..4d6e50c
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/filters_neon.c
@@ -0,0 +1,327 @@
+// Copyright 2017 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// NEON variant of alpha filters
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_NEON)
+
+#include <assert.h>
+#include "./neon.h"
+
+//------------------------------------------------------------------------------
+// Helpful macros.
+
+# define SANITY_CHECK(in, out) \
+ assert(in != NULL); \
+ assert(out != NULL); \
+ assert(width > 0); \
+ assert(height > 0); \
+ assert(stride >= width); \
+ assert(row >= 0 && num_rows > 0 && row + num_rows <= height); \
+ (void)height; // Silence unused warning.
+
+// load eight u8 and widen to s16
+#define U8_TO_S16(A) vreinterpretq_s16_u16(vmovl_u8(A))
+#define LOAD_U8_TO_S16(A) U8_TO_S16(vld1_u8(A))
+
+// shift left or right by N byte, inserting zeros
+#define SHIFT_RIGHT_N_Q(A, N) vextq_u8((A), zero, (N))
+#define SHIFT_LEFT_N_Q(A, N) vextq_u8(zero, (A), (16 - (N)) % 16)
+
+// rotate left by N bytes
+#define ROTATE_LEFT_N(A, N) vext_u8((A), (A), (N))
+// rotate right by N bytes
+#define ROTATE_RIGHT_N(A, N) vext_u8((A), (A), (8 - (N)) % 8)
+
+static void PredictLine_NEON(const uint8_t* src, const uint8_t* pred,
+ uint8_t* dst, int length) {
+ int i;
+ assert(length >= 0);
+ for (i = 0; i + 16 <= length; i += 16) {
+ const uint8x16_t A = vld1q_u8(&src[i]);
+ const uint8x16_t B = vld1q_u8(&pred[i]);
+ const uint8x16_t C = vsubq_u8(A, B);
+ vst1q_u8(&dst[i], C);
+ }
+ for (; i < length; ++i) dst[i] = src[i] - pred[i];
+}
+
+// Special case for left-based prediction (when preds==dst-1 or preds==src-1).
+static void PredictLineLeft_NEON(const uint8_t* src, uint8_t* dst, int length) {
+ PredictLine_NEON(src, src - 1, dst, length);
+}
+
+//------------------------------------------------------------------------------
+// Horizontal filter.
+
+static WEBP_INLINE void DoHorizontalFilter_NEON(const uint8_t* in,
+ int width, int height,
+ int stride,
+ int row, int num_rows,
+ uint8_t* out) {
+ const size_t start_offset = row * stride;
+ const int last_row = row + num_rows;
+ SANITY_CHECK(in, out);
+ in += start_offset;
+ out += start_offset;
+
+ if (row == 0) {
+ // Leftmost pixel is the same as input for topmost scanline.
+ out[0] = in[0];
+ PredictLineLeft_NEON(in + 1, out + 1, width - 1);
+ row = 1;
+ in += stride;
+ out += stride;
+ }
+
+ // Filter line-by-line.
+ while (row < last_row) {
+ // Leftmost pixel is predicted from above.
+ out[0] = in[0] - in[-stride];
+ PredictLineLeft_NEON(in + 1, out + 1, width - 1);
+ ++row;
+ in += stride;
+ out += stride;
+ }
+}
+
+static void HorizontalFilter_NEON(const uint8_t* data, int width, int height,
+ int stride, uint8_t* filtered_data) {
+ DoHorizontalFilter_NEON(data, width, height, stride, 0, height,
+ filtered_data);
+}
+
+//------------------------------------------------------------------------------
+// Vertical filter.
+
+static WEBP_INLINE void DoVerticalFilter_NEON(const uint8_t* in,
+ int width, int height, int stride,
+ int row, int num_rows,
+ uint8_t* out) {
+ const size_t start_offset = row * stride;
+ const int last_row = row + num_rows;
+ SANITY_CHECK(in, out);
+ in += start_offset;
+ out += start_offset;
+
+ if (row == 0) {
+ // Very first top-left pixel is copied.
+ out[0] = in[0];
+ // Rest of top scan-line is left-predicted.
+ PredictLineLeft_NEON(in + 1, out + 1, width - 1);
+ row = 1;
+ in += stride;
+ out += stride;
+ }
+
+ // Filter line-by-line.
+ while (row < last_row) {
+ PredictLine_NEON(in, in - stride, out, width);
+ ++row;
+ in += stride;
+ out += stride;
+ }
+}
+
+static void VerticalFilter_NEON(const uint8_t* data, int width, int height,
+ int stride, uint8_t* filtered_data) {
+ DoVerticalFilter_NEON(data, width, height, stride, 0, height,
+ filtered_data);
+}
+
+//------------------------------------------------------------------------------
+// Gradient filter.
+
+static WEBP_INLINE int GradientPredictor_C(uint8_t a, uint8_t b, uint8_t c) {
+ const int g = a + b - c;
+ return ((g & ~0xff) == 0) ? g : (g < 0) ? 0 : 255; // clip to 8bit
+}
+
+static void GradientPredictDirect_NEON(const uint8_t* const row,
+ const uint8_t* const top,
+ uint8_t* const out, int length) {
+ int i;
+ for (i = 0; i + 8 <= length; i += 8) {
+ const uint8x8_t A = vld1_u8(&row[i - 1]);
+ const uint8x8_t B = vld1_u8(&top[i + 0]);
+ const int16x8_t C = vreinterpretq_s16_u16(vaddl_u8(A, B));
+ const int16x8_t D = LOAD_U8_TO_S16(&top[i - 1]);
+ const uint8x8_t E = vqmovun_s16(vsubq_s16(C, D));
+ const uint8x8_t F = vld1_u8(&row[i + 0]);
+ vst1_u8(&out[i], vsub_u8(F, E));
+ }
+ for (; i < length; ++i) {
+ out[i] = row[i] - GradientPredictor_C(row[i - 1], top[i], top[i - 1]);
+ }
+}
+
+static WEBP_INLINE void DoGradientFilter_NEON(const uint8_t* in,
+ int width, int height,
+ int stride,
+ int row, int num_rows,
+ uint8_t* out) {
+ const size_t start_offset = row * stride;
+ const int last_row = row + num_rows;
+ SANITY_CHECK(in, out);
+ in += start_offset;
+ out += start_offset;
+
+ // left prediction for top scan-line
+ if (row == 0) {
+ out[0] = in[0];
+ PredictLineLeft_NEON(in + 1, out + 1, width - 1);
+ row = 1;
+ in += stride;
+ out += stride;
+ }
+
+ // Filter line-by-line.
+ while (row < last_row) {
+ out[0] = in[0] - in[-stride];
+ GradientPredictDirect_NEON(in + 1, in + 1 - stride, out + 1, width - 1);
+ ++row;
+ in += stride;
+ out += stride;
+ }
+}
+
+static void GradientFilter_NEON(const uint8_t* data, int width, int height,
+ int stride, uint8_t* filtered_data) {
+ DoGradientFilter_NEON(data, width, height, stride, 0, height,
+ filtered_data);
+}
+
+#undef SANITY_CHECK
+
+//------------------------------------------------------------------------------
+// Inverse transforms
+
+static void HorizontalUnfilter_NEON(const uint8_t* prev, const uint8_t* in,
+ uint8_t* out, int width) {
+ int i;
+ const uint8x16_t zero = vdupq_n_u8(0);
+ uint8x16_t last;
+ out[0] = in[0] + (prev == NULL ? 0 : prev[0]);
+ if (width <= 1) return;
+ last = vsetq_lane_u8(out[0], zero, 0);
+ for (i = 1; i + 16 <= width; i += 16) {
+ const uint8x16_t A0 = vld1q_u8(&in[i]);
+ const uint8x16_t A1 = vaddq_u8(A0, last);
+ const uint8x16_t A2 = SHIFT_LEFT_N_Q(A1, 1);
+ const uint8x16_t A3 = vaddq_u8(A1, A2);
+ const uint8x16_t A4 = SHIFT_LEFT_N_Q(A3, 2);
+ const uint8x16_t A5 = vaddq_u8(A3, A4);
+ const uint8x16_t A6 = SHIFT_LEFT_N_Q(A5, 4);
+ const uint8x16_t A7 = vaddq_u8(A5, A6);
+ const uint8x16_t A8 = SHIFT_LEFT_N_Q(A7, 8);
+ const uint8x16_t A9 = vaddq_u8(A7, A8);
+ vst1q_u8(&out[i], A9);
+ last = SHIFT_RIGHT_N_Q(A9, 15);
+ }
+ for (; i < width; ++i) out[i] = in[i] + out[i - 1];
+}
+
+static void VerticalUnfilter_NEON(const uint8_t* prev, const uint8_t* in,
+ uint8_t* out, int width) {
+ if (prev == NULL) {
+ HorizontalUnfilter_NEON(NULL, in, out, width);
+ } else {
+ int i;
+ assert(width >= 0);
+ for (i = 0; i + 16 <= width; i += 16) {
+ const uint8x16_t A = vld1q_u8(&in[i]);
+ const uint8x16_t B = vld1q_u8(&prev[i]);
+ const uint8x16_t C = vaddq_u8(A, B);
+ vst1q_u8(&out[i], C);
+ }
+ for (; i < width; ++i) out[i] = in[i] + prev[i];
+ }
+}
+
+// GradientUnfilter_NEON is correct but slower than the C-version,
+// at least on ARM64. For armv7, it's a wash.
+// So best is to disable it for now, but keep the idea around...
+// #define USE_GRADIENT_UNFILTER
+
+#if defined(USE_GRADIENT_UNFILTER)
+#define GRAD_PROCESS_LANE(L) do { \
+ const uint8x8_t tmp1 = ROTATE_RIGHT_N(pred, 1); /* rotate predictor in */ \
+ const int16x8_t tmp2 = vaddq_s16(BC, U8_TO_S16(tmp1)); \
+ const uint8x8_t delta = vqmovun_s16(tmp2); \
+ pred = vadd_u8(D, delta); \
+ out = vext_u8(out, ROTATE_LEFT_N(pred, (L)), 1); \
+} while (0)
+
+static void GradientPredictInverse_NEON(const uint8_t* const in,
+ const uint8_t* const top,
+ uint8_t* const row, int length) {
+ if (length > 0) {
+ int i;
+ uint8x8_t pred = vdup_n_u8(row[-1]); // left sample
+ uint8x8_t out = vdup_n_u8(0);
+ for (i = 0; i + 8 <= length; i += 8) {
+ const int16x8_t B = LOAD_U8_TO_S16(&top[i + 0]);
+ const int16x8_t C = LOAD_U8_TO_S16(&top[i - 1]);
+ const int16x8_t BC = vsubq_s16(B, C); // unclipped gradient basis B - C
+ const uint8x8_t D = vld1_u8(&in[i]); // base input
+ GRAD_PROCESS_LANE(0);
+ GRAD_PROCESS_LANE(1);
+ GRAD_PROCESS_LANE(2);
+ GRAD_PROCESS_LANE(3);
+ GRAD_PROCESS_LANE(4);
+ GRAD_PROCESS_LANE(5);
+ GRAD_PROCESS_LANE(6);
+ GRAD_PROCESS_LANE(7);
+ vst1_u8(&row[i], out);
+ }
+ for (; i < length; ++i) {
+ row[i] = in[i] + GradientPredictor_C(row[i - 1], top[i], top[i - 1]);
+ }
+ }
+}
+#undef GRAD_PROCESS_LANE
+
+static void GradientUnfilter_NEON(const uint8_t* prev, const uint8_t* in,
+ uint8_t* out, int width) {
+ if (prev == NULL) {
+ HorizontalUnfilter_NEON(NULL, in, out, width);
+ } else {
+ out[0] = in[0] + prev[0]; // predict from above
+ GradientPredictInverse_NEON(in + 1, prev + 1, out + 1, width - 1);
+ }
+}
+
+#endif // USE_GRADIENT_UNFILTER
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8FiltersInitNEON(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInitNEON(void) {
+ WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter_NEON;
+ WebPUnfilters[WEBP_FILTER_VERTICAL] = VerticalUnfilter_NEON;
+#if defined(USE_GRADIENT_UNFILTER)
+ WebPUnfilters[WEBP_FILTER_GRADIENT] = GradientUnfilter_NEON;
+#endif
+
+ WebPFilters[WEBP_FILTER_HORIZONTAL] = HorizontalFilter_NEON;
+ WebPFilters[WEBP_FILTER_VERTICAL] = VerticalFilter_NEON;
+ WebPFilters[WEBP_FILTER_GRADIENT] = GradientFilter_NEON;
+}
+
+#else // !WEBP_USE_NEON
+
+WEBP_DSP_INIT_STUB(VP8FiltersInitNEON)
+
+#endif // WEBP_USE_NEON
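
HorizontalUnfilter_NEON above is the notable addition here: the recurrence
out[i] = in[i] + out[i - 1] is inherently serial, but the code resolves 16
pixels per iteration by turning it into a log-step (Hillis-Steele) prefix
sum: add the vector shifted left by 1 byte, then 2, then 4, then 8. A scalar
model of that doubling ladder on one 16-byte block, for illustration only:

    // Scalar model of the SHIFT_LEFT_N_Q / vaddq_u8 prefix-sum ladder:
    // after rounds of shift 1, 2, 4 and 8, lane i holds the sum of
    // lanes 0..i (mod 256), matching what A9 contains in the NEON code.
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    static void PrefixSum16(uint8_t v[16]) {
      int shift, i;
      for (shift = 1; shift < 16; shift <<= 1) {  // 4 rounds: 1, 2, 4, 8
        uint8_t shifted[16] = { 0 };
        memcpy(shifted + shift, v, 16 - shift);   // shift bytes, zero-fill
        for (i = 0; i < 16; ++i) v[i] = (uint8_t)(v[i] + shifted[i]);
      }
    }

    int main(void) {
      uint8_t v[16];
      int i;
      for (i = 0; i < 16; ++i) v[i] = 1;
      PrefixSum16(v);
      for (i = 0; i < 16; ++i) printf("%d ", v[i]);  // 1 2 3 ... 16
      printf("\n");
      return 0;
    }
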
diff --git a/src/3rdparty/libwebp/src/dsp/lossless.c b/src/3rdparty/libwebp/src/dsp/lossless.c
index af913ef..20d18f6 100644
--- a/src/3rdparty/libwebp/src/dsp/lossless.c
+++ b/src/3rdparty/libwebp/src/dsp/lossless.c
@@ -17,20 +17,16 @@
#include <math.h>
#include <stdlib.h>
-#include "../dec/vp8li.h"
-#include "../utils/endian_inl.h"
+#include "../dec/vp8li_dec.h"
+#include "../utils/endian_inl_utils.h"
#include "./lossless.h"
+#include "./lossless_common.h"
#define MAX_DIFF_COST (1e30f)
//------------------------------------------------------------------------------
// Image transforms.
-// In-place sum of each component with mod 256.
-static WEBP_INLINE void AddPixelsEq(uint32_t* a, uint32_t b) {
- *a = VP8LAddPixels(*a, b);
-}
-
static WEBP_INLINE uint32_t Average2(uint32_t a0, uint32_t a1) {
return (((a0 ^ a1) & 0xfefefefeu) >> 1) + (a0 & a1);
}
@@ -171,21 +167,41 @@ static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
return pred;
}
+GENERATE_PREDICTOR_ADD(Predictor0, PredictorAdd0)
+static void PredictorAdd1(const uint32_t* in, const uint32_t* upper,
+ int num_pixels, uint32_t* out) {
+ int i;
+ uint32_t left = out[-1];
+ for (i = 0; i < num_pixels; ++i) {
+ out[i] = left = VP8LAddPixels(in[i], left);
+ }
+ (void)upper;
+}
+GENERATE_PREDICTOR_ADD(Predictor2, PredictorAdd2)
+GENERATE_PREDICTOR_ADD(Predictor3, PredictorAdd3)
+GENERATE_PREDICTOR_ADD(Predictor4, PredictorAdd4)
+GENERATE_PREDICTOR_ADD(Predictor5, PredictorAdd5)
+GENERATE_PREDICTOR_ADD(Predictor6, PredictorAdd6)
+GENERATE_PREDICTOR_ADD(Predictor7, PredictorAdd7)
+GENERATE_PREDICTOR_ADD(Predictor8, PredictorAdd8)
+GENERATE_PREDICTOR_ADD(Predictor9, PredictorAdd9)
+GENERATE_PREDICTOR_ADD(Predictor10, PredictorAdd10)
+GENERATE_PREDICTOR_ADD(Predictor11, PredictorAdd11)
+GENERATE_PREDICTOR_ADD(Predictor12, PredictorAdd12)
+GENERATE_PREDICTOR_ADD(Predictor13, PredictorAdd13)
+
//------------------------------------------------------------------------------
// Inverse prediction.
static void PredictorInverseTransform(const VP8LTransform* const transform,
- int y_start, int y_end, uint32_t* data) {
+ int y_start, int y_end,
+ const uint32_t* in, uint32_t* out) {
const int width = transform->xsize_;
if (y_start == 0) { // First Row follows the L (mode=1) mode.
- int x;
- const uint32_t pred0 = Predictor0(data[-1], NULL);
- AddPixelsEq(data, pred0);
- for (x = 1; x < width; ++x) {
- const uint32_t pred1 = Predictor1(data[x - 1], NULL);
- AddPixelsEq(data + x, pred1);
- }
- data += width;
+ PredictorAdd0(in, NULL, 1, out);
+ PredictorAdd1(in + 1, NULL, width - 1, out + 1);
+ in += width;
+ out += width;
++y_start;
}
@@ -193,36 +209,26 @@ static void PredictorInverseTransform(const VP8LTransform* const transform,
int y = y_start;
const int tile_width = 1 << transform->bits_;
const int mask = tile_width - 1;
- const int safe_width = width & ~mask;
const int tiles_per_row = VP8LSubSampleSize(width, transform->bits_);
const uint32_t* pred_mode_base =
transform->data_ + (y >> transform->bits_) * tiles_per_row;
while (y < y_end) {
- const uint32_t pred2 = Predictor2(data[-1], data - width);
const uint32_t* pred_mode_src = pred_mode_base;
- VP8LPredictorFunc pred_func;
int x = 1;
- int t = 1;
// First pixel follows the T (mode=2) mode.
- AddPixelsEq(data, pred2);
+ PredictorAdd2(in, out - width, 1, out);
// .. the rest:
- while (x < safe_width) {
- pred_func = VP8LPredictors[((*pred_mode_src++) >> 8) & 0xf];
- for (; t < tile_width; ++t, ++x) {
- const uint32_t pred = pred_func(data[x - 1], data + x - width);
- AddPixelsEq(data + x, pred);
- }
- t = 0;
- }
- if (x < width) {
- pred_func = VP8LPredictors[((*pred_mode_src++) >> 8) & 0xf];
- for (; x < width; ++x) {
- const uint32_t pred = pred_func(data[x - 1], data + x - width);
- AddPixelsEq(data + x, pred);
- }
+ while (x < width) {
+ const VP8LPredictorAddSubFunc pred_func =
+ VP8LPredictorsAdd[((*pred_mode_src++) >> 8) & 0xf];
+ int x_end = (x & ~mask) + tile_width;
+ if (x_end > width) x_end = width;
+ pred_func(in + x, out + x - width, x_end - x, out + x);
+ x = x_end;
}
- data += width;
+ in += width;
+ out += width;
++y;
if ((y & mask) == 0) { // Use the same mask, since tiles are squares.
pred_mode_base += tiles_per_row;
@@ -233,21 +239,22 @@ static void PredictorInverseTransform(const VP8LTransform* const transform,
// Add green to blue and red channels (i.e. perform the inverse transform of
// 'subtract green').
-void VP8LAddGreenToBlueAndRed_C(uint32_t* data, int num_pixels) {
+void VP8LAddGreenToBlueAndRed_C(const uint32_t* src, int num_pixels,
+ uint32_t* dst) {
int i;
for (i = 0; i < num_pixels; ++i) {
- const uint32_t argb = data[i];
+ const uint32_t argb = src[i];
const uint32_t green = ((argb >> 8) & 0xff);
uint32_t red_blue = (argb & 0x00ff00ffu);
red_blue += (green << 16) | green;
red_blue &= 0x00ff00ffu;
- data[i] = (argb & 0xff00ff00u) | red_blue;
+ dst[i] = (argb & 0xff00ff00u) | red_blue;
}
}
-static WEBP_INLINE uint32_t ColorTransformDelta(int8_t color_pred,
- int8_t color) {
- return (uint32_t)((int)(color_pred) * color) >> 5;
+static WEBP_INLINE int ColorTransformDelta(int8_t color_pred,
+ int8_t color) {
+ return ((int)color_pred * color) >> 5;
}
static WEBP_INLINE void ColorCodeToMultipliers(uint32_t color_code,
@@ -257,27 +264,29 @@ static WEBP_INLINE void ColorCodeToMultipliers(uint32_t color_code,
m->red_to_blue_ = (color_code >> 16) & 0xff;
}
-void VP8LTransformColorInverse_C(const VP8LMultipliers* const m, uint32_t* data,
- int num_pixels) {
+void VP8LTransformColorInverse_C(const VP8LMultipliers* const m,
+ const uint32_t* src, int num_pixels,
+ uint32_t* dst) {
int i;
for (i = 0; i < num_pixels; ++i) {
- const uint32_t argb = data[i];
+ const uint32_t argb = src[i];
const uint32_t green = argb >> 8;
const uint32_t red = argb >> 16;
- uint32_t new_red = red;
- uint32_t new_blue = argb;
+ int new_red = red;
+ int new_blue = argb;
new_red += ColorTransformDelta(m->green_to_red_, green);
new_red &= 0xff;
new_blue += ColorTransformDelta(m->green_to_blue_, green);
new_blue += ColorTransformDelta(m->red_to_blue_, new_red);
new_blue &= 0xff;
- data[i] = (argb & 0xff00ff00u) | (new_red << 16) | (new_blue);
+ dst[i] = (argb & 0xff00ff00u) | (new_red << 16) | (new_blue);
}
}
// Color space inverse transform.
static void ColorSpaceInverseTransform(const VP8LTransform* const transform,
- int y_start, int y_end, uint32_t* data) {
+ int y_start, int y_end,
+ const uint32_t* src, uint32_t* dst) {
const int width = transform->xsize_;
const int tile_width = 1 << transform->bits_;
const int mask = tile_width - 1;
@@ -291,17 +300,19 @@ static void ColorSpaceInverseTransform(const VP8LTransform* const transform,
while (y < y_end) {
const uint32_t* pred = pred_row;
VP8LMultipliers m = { 0, 0, 0 };
- const uint32_t* const data_safe_end = data + safe_width;
- const uint32_t* const data_end = data + width;
- while (data < data_safe_end) {
+ const uint32_t* const src_safe_end = src + safe_width;
+ const uint32_t* const src_end = src + width;
+ while (src < src_safe_end) {
ColorCodeToMultipliers(*pred++, &m);
- VP8LTransformColorInverse(&m, data, tile_width);
- data += tile_width;
+ VP8LTransformColorInverse(&m, src, tile_width, dst);
+ src += tile_width;
+ dst += tile_width;
}
- if (data < data_end) { // Left-overs using C-version.
+ if (src < src_end) { // Left-overs using C-version.
ColorCodeToMultipliers(*pred++, &m);
- VP8LTransformColorInverse(&m, data, remaining_width);
- data += remaining_width;
+ VP8LTransformColorInverse(&m, src, remaining_width, dst);
+ src += remaining_width;
+ dst += remaining_width;
}
++y;
if ((y & mask) == 0) pred_row += tiles_per_row;
@@ -366,10 +377,10 @@ void VP8LInverseTransform(const VP8LTransform* const transform,
assert(row_end <= transform->ysize_);
switch (transform->type_) {
case SUBTRACT_GREEN:
- VP8LAddGreenToBlueAndRed(out, (row_end - row_start) * width);
+ VP8LAddGreenToBlueAndRed(in, (row_end - row_start) * width, out);
break;
case PREDICTOR_TRANSFORM:
- PredictorInverseTransform(transform, row_start, row_end, out);
+ PredictorInverseTransform(transform, row_start, row_end, in, out);
if (row_end != transform->ysize_) {
// The last predicted row in this iteration will be the top-pred row
// for the first row in next iteration.
@@ -378,7 +389,7 @@ void VP8LInverseTransform(const VP8LTransform* const transform,
}
break;
case CROSS_COLOR_TRANSFORM:
- ColorSpaceInverseTransform(transform, row_start, row_end, out);
+ ColorSpaceInverseTransform(transform, row_start, row_end, in, out);
break;
case COLOR_INDEXING_TRANSFORM:
if (in == out && transform->bits_ > 0) {
@@ -555,10 +566,15 @@ void VP8LConvertFromBGRA(const uint32_t* const in_data, int num_pixels,
//------------------------------------------------------------------------------
-VP8LProcessBlueAndRedFunc VP8LAddGreenToBlueAndRed;
+VP8LProcessDecBlueAndRedFunc VP8LAddGreenToBlueAndRed;
+VP8LPredictorAddSubFunc VP8LPredictorsAdd[16];
VP8LPredictorFunc VP8LPredictors[16];
-VP8LTransformColorFunc VP8LTransformColorInverse;
+// exposed plain-C implementations
+VP8LPredictorAddSubFunc VP8LPredictorsAdd_C[16];
+VP8LPredictorFunc VP8LPredictors_C[16];
+
+VP8LTransformColorInverseFunc VP8LTransformColorInverse;
VP8LConvertFunc VP8LConvertBGRAToRGB;
VP8LConvertFunc VP8LConvertBGRAToRGBA;
@@ -572,29 +588,37 @@ VP8LMapAlphaFunc VP8LMapColor8b;
extern void VP8LDspInitSSE2(void);
extern void VP8LDspInitNEON(void);
extern void VP8LDspInitMIPSdspR2(void);
+extern void VP8LDspInitMSA(void);
static volatile VP8CPUInfo lossless_last_cpuinfo_used =
(VP8CPUInfo)&lossless_last_cpuinfo_used;
+#define COPY_PREDICTOR_ARRAY(IN, OUT) do { \
+ (OUT)[0] = IN##0; \
+ (OUT)[1] = IN##1; \
+ (OUT)[2] = IN##2; \
+ (OUT)[3] = IN##3; \
+ (OUT)[4] = IN##4; \
+ (OUT)[5] = IN##5; \
+ (OUT)[6] = IN##6; \
+ (OUT)[7] = IN##7; \
+ (OUT)[8] = IN##8; \
+ (OUT)[9] = IN##9; \
+ (OUT)[10] = IN##10; \
+ (OUT)[11] = IN##11; \
+ (OUT)[12] = IN##12; \
+ (OUT)[13] = IN##13; \
+  (OUT)[14] = IN##0;  /* <- padding security sentinels */ \
+ (OUT)[15] = IN##0; \
+} while (0);
+
WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInit(void) {
if (lossless_last_cpuinfo_used == VP8GetCPUInfo) return;
- VP8LPredictors[0] = Predictor0;
- VP8LPredictors[1] = Predictor1;
- VP8LPredictors[2] = Predictor2;
- VP8LPredictors[3] = Predictor3;
- VP8LPredictors[4] = Predictor4;
- VP8LPredictors[5] = Predictor5;
- VP8LPredictors[6] = Predictor6;
- VP8LPredictors[7] = Predictor7;
- VP8LPredictors[8] = Predictor8;
- VP8LPredictors[9] = Predictor9;
- VP8LPredictors[10] = Predictor10;
- VP8LPredictors[11] = Predictor11;
- VP8LPredictors[12] = Predictor12;
- VP8LPredictors[13] = Predictor13;
- VP8LPredictors[14] = Predictor0; // <- padding security sentinels
- VP8LPredictors[15] = Predictor0;
+ COPY_PREDICTOR_ARRAY(Predictor, VP8LPredictors)
+ COPY_PREDICTOR_ARRAY(Predictor, VP8LPredictors_C)
+ COPY_PREDICTOR_ARRAY(PredictorAdd, VP8LPredictorsAdd)
+ COPY_PREDICTOR_ARRAY(PredictorAdd, VP8LPredictorsAdd_C)
VP8LAddGreenToBlueAndRed = VP8LAddGreenToBlueAndRed_C;
@@ -626,8 +650,14 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInit(void) {
VP8LDspInitMIPSdspR2();
}
#endif
+#if defined(WEBP_USE_MSA)
+ if (VP8GetCPUInfo(kMSA)) {
+ VP8LDspInitMSA();
+ }
+#endif
}
lossless_last_cpuinfo_used = VP8GetCPUInfo;
}
+#undef COPY_PREDICTOR_ARRAY
//------------------------------------------------------------------------------
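
Two things change in lossless.c above: the inverse transforms move from
in-place (uint32_t* data) to separate src/dst buffers, and the per-pixel
predictor loop is replaced by per-tile batch calls through the new
VP8LPredictorsAdd[] table. The 'add green' math itself is unchanged; as a
one-pixel worked example of what VP8LAddGreenToBlueAndRed_C computes, with
AddGreenToBlueAndRed1 as a hypothetical single-pixel distillation:

    // One ARGB pixel: green is added (mod 256) into red and blue.
    #include <stdint.h>
    #include <stdio.h>

    static uint32_t AddGreenToBlueAndRed1(uint32_t argb) {
      const uint32_t green = (argb >> 8) & 0xff;
      uint32_t red_blue = argb & 0x00ff00ffu;
      red_blue += (green << 16) | green;  // add green to red and to blue
      red_blue &= 0x00ff00ffu;            // keep each channel mod 256
      return (argb & 0xff00ff00u) | red_blue;
    }

    int main(void) {
      // A=ff R=10 G=20 B=f0: R becomes 30; B becomes (f0+20) mod 256 = 10.
      printf("%08x\n", AddGreenToBlueAndRed1(0xff1020f0u));  // ff302010
      return 0;
    }
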
diff --git a/src/3rdparty/libwebp/src/dsp/lossless.h b/src/3rdparty/libwebp/src/dsp/lossless.h
index 9f0d7a2..352a54e 100644
--- a/src/3rdparty/libwebp/src/dsp/lossless.h
+++ b/src/3rdparty/libwebp/src/dsp/lossless.h
@@ -18,7 +18,7 @@
#include "../webp/types.h"
#include "../webp/decode.h"
-#include "../enc/histogram.h"
+#include "../enc/histogram_enc.h"
#include "../utils/utils.h"
#ifdef __cplusplus
@@ -26,7 +26,7 @@ extern "C" {
#endif
#ifdef WEBP_EXPERIMENTAL_FEATURES
-#include "../enc/delta_palettization.h"
+#include "../enc/delta_palettization_enc.h"
#endif // WEBP_EXPERIMENTAL_FEATURES
//------------------------------------------------------------------------------
@@ -34,9 +34,17 @@ extern "C" {
typedef uint32_t (*VP8LPredictorFunc)(uint32_t left, const uint32_t* const top);
extern VP8LPredictorFunc VP8LPredictors[16];
-
-typedef void (*VP8LProcessBlueAndRedFunc)(uint32_t* argb_data, int num_pixels);
-extern VP8LProcessBlueAndRedFunc VP8LAddGreenToBlueAndRed;
+extern VP8LPredictorFunc VP8LPredictors_C[16];
+// These Add/Sub functions expect upper[-1] and out[-1] to be readable.
+typedef void (*VP8LPredictorAddSubFunc)(const uint32_t* in,
+ const uint32_t* upper, int num_pixels,
+ uint32_t* out);
+extern VP8LPredictorAddSubFunc VP8LPredictorsAdd[16];
+extern VP8LPredictorAddSubFunc VP8LPredictorsAdd_C[16];
+
+typedef void (*VP8LProcessDecBlueAndRedFunc)(const uint32_t* src,
+ int num_pixels, uint32_t* dst);
+extern VP8LProcessDecBlueAndRedFunc VP8LAddGreenToBlueAndRed;
typedef struct {
// Note: the members are uint8_t, so that any negative values are
@@ -45,9 +53,10 @@ typedef struct {
uint8_t green_to_blue_;
uint8_t red_to_blue_;
} VP8LMultipliers;
-typedef void (*VP8LTransformColorFunc)(const VP8LMultipliers* const m,
- uint32_t* argb_data, int num_pixels);
-extern VP8LTransformColorFunc VP8LTransformColorInverse;
+typedef void (*VP8LTransformColorInverseFunc)(const VP8LMultipliers* const m,
+ const uint32_t* src,
+ int num_pixels, uint32_t* dst);
+extern VP8LTransformColorInverseFunc VP8LTransformColorInverse;
struct VP8LTransform; // Defined in dec/vp8li.h.
@@ -72,23 +81,6 @@ extern VP8LConvertFunc VP8LConvertBGRAToBGR;
void VP8LConvertFromBGRA(const uint32_t* const in_data, int num_pixels,
WEBP_CSP_MODE out_colorspace, uint8_t* const rgba);
-// color mapping related functions.
-static WEBP_INLINE uint32_t VP8GetARGBIndex(uint32_t idx) {
- return (idx >> 8) & 0xff;
-}
-
-static WEBP_INLINE uint8_t VP8GetAlphaIndex(uint8_t idx) {
- return idx;
-}
-
-static WEBP_INLINE uint32_t VP8GetARGBValue(uint32_t val) {
- return val;
-}
-
-static WEBP_INLINE uint8_t VP8GetAlphaValue(uint32_t val) {
- return (val >> 8) & 0xff;
-}
-
typedef void (*VP8LMapARGBFunc)(const uint32_t* src,
const uint32_t* const color_map,
uint32_t* dst, int y_start,
@@ -110,7 +102,8 @@ void VP8LColorIndexInverseTransformAlpha(
// Expose some C-only fallback functions
void VP8LTransformColorInverse_C(const VP8LMultipliers* const m,
- uint32_t* data, int num_pixels);
+ const uint32_t* src, int num_pixels,
+ uint32_t* dst);
void VP8LConvertBGRAToRGB_C(const uint32_t* src, int num_pixels, uint8_t* dst);
void VP8LConvertBGRAToRGBA_C(const uint32_t* src, int num_pixels, uint8_t* dst);
@@ -119,7 +112,8 @@ void VP8LConvertBGRAToRGBA4444_C(const uint32_t* src,
void VP8LConvertBGRAToRGB565_C(const uint32_t* src,
int num_pixels, uint8_t* dst);
void VP8LConvertBGRAToBGR_C(const uint32_t* src, int num_pixels, uint8_t* dst);
-void VP8LAddGreenToBlueAndRed_C(uint32_t* data, int num_pixels);
+void VP8LAddGreenToBlueAndRed_C(const uint32_t* src, int num_pixels,
+ uint32_t* dst);
// Must be called before calling any of the above methods.
void VP8LDspInit(void);
@@ -127,7 +121,10 @@ void VP8LDspInit(void);
//------------------------------------------------------------------------------
// Encoding
-extern VP8LProcessBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed;
+typedef void (*VP8LProcessEncBlueAndRedFunc)(uint32_t* dst, int num_pixels);
+extern VP8LProcessEncBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed;
+typedef void (*VP8LTransformColorFunc)(const VP8LMultipliers* const m,
+ uint32_t* const dst, int num_pixels);
extern VP8LTransformColorFunc VP8LTransformColor;
typedef void (*VP8LCollectColorBlueTransformsFunc)(
const uint32_t* argb, int stride,
@@ -153,62 +150,8 @@ void VP8LCollectColorBlueTransforms_C(const uint32_t* argb, int stride,
int green_to_blue, int red_to_blue,
int histo[]);
-//------------------------------------------------------------------------------
-// Image transforms.
-
-void VP8LResidualImage(int width, int height, int bits, int low_effort,
- uint32_t* const argb, uint32_t* const argb_scratch,
- uint32_t* const image, int near_lossless, int exact,
- int used_subtract_green);
-
-void VP8LColorSpaceTransform(int width, int height, int bits, int quality,
- uint32_t* const argb, uint32_t* image);
-
-//------------------------------------------------------------------------------
-// Misc methods.
-
-// Computes sampled size of 'size' when sampling using 'sampling bits'.
-static WEBP_INLINE uint32_t VP8LSubSampleSize(uint32_t size,
- uint32_t sampling_bits) {
- return (size + (1 << sampling_bits) - 1) >> sampling_bits;
-}
-
-// Converts near lossless quality into max number of bits shaved off.
-static WEBP_INLINE int VP8LNearLosslessBits(int near_lossless_quality) {
- // 100 -> 0
- // 80..99 -> 1
- // 60..79 -> 2
- // 40..59 -> 3
- // 20..39 -> 4
- // 0..19 -> 5
- return 5 - near_lossless_quality / 20;
-}
-
-// -----------------------------------------------------------------------------
-// Faster logarithm for integers. Small values use a look-up table.
-
-// The threshold till approximate version of log_2 can be used.
-// Practically, we can get rid of the call to log() as the two values match to
-// very high degree (the ratio of these two is 0.99999x).
-// Keeping a high threshold for now.
-#define APPROX_LOG_WITH_CORRECTION_MAX 65536
-#define APPROX_LOG_MAX 4096
-#define LOG_2_RECIPROCAL 1.44269504088896338700465094007086
-#define LOG_LOOKUP_IDX_MAX 256
-extern const float kLog2Table[LOG_LOOKUP_IDX_MAX];
-extern const float kSLog2Table[LOG_LOOKUP_IDX_MAX];
-typedef float (*VP8LFastLog2SlowFunc)(uint32_t v);
-
-extern VP8LFastLog2SlowFunc VP8LFastLog2Slow;
-extern VP8LFastLog2SlowFunc VP8LFastSLog2Slow;
-
-static WEBP_INLINE float VP8LFastLog2(uint32_t v) {
- return (v < LOG_LOOKUP_IDX_MAX) ? kLog2Table[v] : VP8LFastLog2Slow(v);
-}
-// Fast calculation of v * log2(v) for integer input.
-static WEBP_INLINE float VP8LFastSLog2(uint32_t v) {
- return (v < LOG_LOOKUP_IDX_MAX) ? kSLog2Table[v] : VP8LFastSLog2Slow(v);
-}
+extern VP8LPredictorAddSubFunc VP8LPredictorsSub[16];
+extern VP8LPredictorAddSubFunc VP8LPredictorsSub_C[16];
// -----------------------------------------------------------------------------
// Huffman-cost related functions.
@@ -228,11 +171,6 @@ typedef struct { // small struct to hold counters
int streaks[2][2]; // [zero/non-zero][streak<3 / streak>=3]
} VP8LStreaks;
-typedef VP8LStreaks (*VP8LCostCombinedCountFunc)(const uint32_t* X,
- const uint32_t* Y, int length);
-
-extern VP8LCostCombinedCountFunc VP8LHuffmanCostCombinedCount;
-
typedef struct { // small struct to hold bit entropy results
double entropy; // entropy
uint32_t sum; // sum of the population
@@ -246,26 +184,20 @@ void VP8LBitEntropyInit(VP8LBitEntropy* const entropy);
// Get the combined symbol bit entropy and Huffman cost stats for the
// distributions 'X' and 'Y'. Those results can then be refined according to
// codec specific heuristics.
-void VP8LGetCombinedEntropyUnrefined(const uint32_t* const X,
- const uint32_t* const Y, int length,
- VP8LBitEntropy* const bit_entropy,
- VP8LStreaks* const stats);
+typedef void (*VP8LGetCombinedEntropyUnrefinedFunc)(
+ const uint32_t X[], const uint32_t Y[], int length,
+ VP8LBitEntropy* const bit_entropy, VP8LStreaks* const stats);
+extern VP8LGetCombinedEntropyUnrefinedFunc VP8LGetCombinedEntropyUnrefined;
+
// Get the entropy for the distribution 'X'.
-void VP8LGetEntropyUnrefined(const uint32_t* const X, int length,
- VP8LBitEntropy* const bit_entropy,
- VP8LStreaks* const stats);
+typedef void (*VP8LGetEntropyUnrefinedFunc)(const uint32_t X[], int length,
+ VP8LBitEntropy* const bit_entropy,
+ VP8LStreaks* const stats);
+extern VP8LGetEntropyUnrefinedFunc VP8LGetEntropyUnrefined;
void VP8LBitsEntropyUnrefined(const uint32_t* const array, int n,
VP8LBitEntropy* const entropy);
-typedef void (*GetEntropyUnrefinedHelperFunc)(uint32_t val, int i,
- uint32_t* const val_prev,
- int* const i_prev,
- VP8LBitEntropy* const bit_entropy,
- VP8LStreaks* const stats);
-// Internal function used by VP8LGet*EntropyUnrefined.
-extern GetEntropyUnrefinedHelperFunc VP8LGetEntropyUnrefinedHelper;
-
typedef void (*VP8LHistogramAddFunc)(const VP8LHistogram* const a,
const VP8LHistogram* const b,
VP8LHistogram* const out);
@@ -279,86 +211,11 @@ typedef int (*VP8LVectorMismatchFunc)(const uint32_t* const array1,
// Returns the first index where array1 and array2 are different.
extern VP8LVectorMismatchFunc VP8LVectorMismatch;
-static WEBP_INLINE int VP8LBitsLog2Ceiling(uint32_t n) {
- const int log_floor = BitsLog2Floor(n);
- if (n == (n & ~(n - 1))) // zero or a power of two.
- return log_floor;
- else
- return log_floor + 1;
-}
-
-// Splitting of distance and length codes into prefixes and
-// extra bits. The prefixes are encoded with an entropy code
-// while the extra bits are stored just as normal bits.
-static WEBP_INLINE void VP8LPrefixEncodeBitsNoLUT(int distance, int* const code,
- int* const extra_bits) {
- const int highest_bit = BitsLog2Floor(--distance);
- const int second_highest_bit = (distance >> (highest_bit - 1)) & 1;
- *extra_bits = highest_bit - 1;
- *code = 2 * highest_bit + second_highest_bit;
-}
-
-static WEBP_INLINE void VP8LPrefixEncodeNoLUT(int distance, int* const code,
- int* const extra_bits,
- int* const extra_bits_value) {
- const int highest_bit = BitsLog2Floor(--distance);
- const int second_highest_bit = (distance >> (highest_bit - 1)) & 1;
- *extra_bits = highest_bit - 1;
- *extra_bits_value = distance & ((1 << *extra_bits) - 1);
- *code = 2 * highest_bit + second_highest_bit;
-}
-
-#define PREFIX_LOOKUP_IDX_MAX 512
-typedef struct {
- int8_t code_;
- int8_t extra_bits_;
-} VP8LPrefixCode;
-
-// These tables are derived using VP8LPrefixEncodeNoLUT.
-extern const VP8LPrefixCode kPrefixEncodeCode[PREFIX_LOOKUP_IDX_MAX];
-extern const uint8_t kPrefixEncodeExtraBitsValue[PREFIX_LOOKUP_IDX_MAX];
-static WEBP_INLINE void VP8LPrefixEncodeBits(int distance, int* const code,
- int* const extra_bits) {
- if (distance < PREFIX_LOOKUP_IDX_MAX) {
- const VP8LPrefixCode prefix_code = kPrefixEncodeCode[distance];
- *code = prefix_code.code_;
- *extra_bits = prefix_code.extra_bits_;
- } else {
- VP8LPrefixEncodeBitsNoLUT(distance, code, extra_bits);
- }
-}
-
-static WEBP_INLINE void VP8LPrefixEncode(int distance, int* const code,
- int* const extra_bits,
- int* const extra_bits_value) {
- if (distance < PREFIX_LOOKUP_IDX_MAX) {
- const VP8LPrefixCode prefix_code = kPrefixEncodeCode[distance];
- *code = prefix_code.code_;
- *extra_bits = prefix_code.extra_bits_;
- *extra_bits_value = kPrefixEncodeExtraBitsValue[distance];
- } else {
- VP8LPrefixEncodeNoLUT(distance, code, extra_bits, extra_bits_value);
- }
-}
-
-// Sum of each component, mod 256.
-static WEBP_INLINE uint32_t VP8LAddPixels(uint32_t a, uint32_t b) {
- const uint32_t alpha_and_green = (a & 0xff00ff00u) + (b & 0xff00ff00u);
- const uint32_t red_and_blue = (a & 0x00ff00ffu) + (b & 0x00ff00ffu);
- return (alpha_and_green & 0xff00ff00u) | (red_and_blue & 0x00ff00ffu);
-}
-
-// Difference of each component, mod 256.
-static WEBP_INLINE uint32_t VP8LSubPixels(uint32_t a, uint32_t b) {
- const uint32_t alpha_and_green =
- 0x00ff00ffu + (a & 0xff00ff00u) - (b & 0xff00ff00u);
- const uint32_t red_and_blue =
- 0xff00ff00u + (a & 0x00ff00ffu) - (b & 0x00ff00ffu);
- return (alpha_and_green & 0xff00ff00u) | (red_and_blue & 0x00ff00ffu);
-}
-
-void VP8LBundleColorMap(const uint8_t* const row, int width,
- int xbits, uint32_t* const dst);
+typedef void (*VP8LBundleColorMapFunc)(const uint8_t* const row, int width,
+ int xbits, uint32_t* dst);
+extern VP8LBundleColorMapFunc VP8LBundleColorMap;
+void VP8LBundleColorMap_C(const uint8_t* const row, int width, int xbits,
+ uint32_t* dst);
// Must be called before calling any of the above methods.
void VP8LEncDspInit(void);
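
The new VP8LPredictorAddSubFunc contract above ("expect upper[-1] and
out[-1] to be readable") deserves emphasis: callers must lay rows out with
one pixel of left context so that index -1 stays in bounds. A minimal sketch
of a conforming call, assuming the left (mode=1) predictor; the names and
the plain 32-bit add (standing in for the channel-wise mod-256
VP8LAddPixels) are illustrative only.

    // Sketch: invoking a VP8LPredictorAddSubFunc-shaped function safely.
    // The row buffer carries one extra leading pixel so out[-1] is valid.
    #include <stdint.h>
    #include <stdio.h>

    static void LeftPredictorAdd(const uint32_t* in, const uint32_t* upper,
                                 int num_pixels, uint32_t* out) {
      int i;
      uint32_t left = out[-1];        // contract: out[-1] must be readable
      for (i = 0; i < num_pixels; ++i) {
        out[i] = left = in[i] + left;
      }
      (void)upper;                    // the left predictor ignores the row above
    }

    int main(void) {
      enum { W = 4 };
      const uint32_t residuals[W] = { 1, 1, 1, 1 };
      uint32_t storage[1 + W] = { 10 };  // storage[0] is the left-context pixel
      uint32_t* const out = storage + 1; // hence out[-1] == 10 stays in bounds
      LeftPredictorAdd(residuals, NULL, W, out);
      printf("%u %u %u %u\n", out[0], out[1], out[2], out[3]);  // 11 12 13 14
      return 0;
    }
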
diff --git a/src/3rdparty/libwebp/src/dsp/lossless_common.h b/src/3rdparty/libwebp/src/dsp/lossless_common.h
new file mode 100644
index 0000000..c40f711
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/lossless_common.h
@@ -0,0 +1,210 @@
+// Copyright 2012 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Image transforms and color space conversion methods for lossless decoder.
+//
+// Authors: Vikas Arora (vikaas.arora@gmail.com)
+// Jyrki Alakuijala (jyrki@google.com)
+// Vincent Rabaud (vrabaud@google.com)
+
+#ifndef WEBP_DSP_LOSSLESS_COMMON_H_
+#define WEBP_DSP_LOSSLESS_COMMON_H_
+
+#include "../webp/types.h"
+
+#include "../utils/utils.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//------------------------------------------------------------------------------
+// Decoding
+
+// color mapping related functions.
+static WEBP_INLINE uint32_t VP8GetARGBIndex(uint32_t idx) {
+ return (idx >> 8) & 0xff;
+}
+
+static WEBP_INLINE uint8_t VP8GetAlphaIndex(uint8_t idx) {
+ return idx;
+}
+
+static WEBP_INLINE uint32_t VP8GetARGBValue(uint32_t val) {
+ return val;
+}
+
+static WEBP_INLINE uint8_t VP8GetAlphaValue(uint32_t val) {
+ return (val >> 8) & 0xff;
+}
+
+//------------------------------------------------------------------------------
+// Misc methods.
+
+// Computes sampled size of 'size' when sampling using 'sampling bits'.
+static WEBP_INLINE uint32_t VP8LSubSampleSize(uint32_t size,
+ uint32_t sampling_bits) {
+ return (size + (1 << sampling_bits) - 1) >> sampling_bits;
+}
+
+// Converts near lossless quality into max number of bits shaved off.
+static WEBP_INLINE int VP8LNearLosslessBits(int near_lossless_quality) {
+ // 100 -> 0
+ // 80..99 -> 1
+ // 60..79 -> 2
+ // 40..59 -> 3
+ // 20..39 -> 4
+ // 0..19 -> 5
+ return 5 - near_lossless_quality / 20;
+}
+
+// -----------------------------------------------------------------------------
+// Faster logarithm for integers. Small values use a look-up table.
+
+// The threshold up to which the approximate version of log_2 can be used.
+// Practically, we can get rid of the call to log(), as the two values match
+// to a very high degree (their ratio is 0.99999x).
+// Keeping a high threshold for now.
+#define APPROX_LOG_WITH_CORRECTION_MAX 65536
+#define APPROX_LOG_MAX 4096
+#define LOG_2_RECIPROCAL 1.44269504088896338700465094007086
+#define LOG_LOOKUP_IDX_MAX 256
+extern const float kLog2Table[LOG_LOOKUP_IDX_MAX];
+extern const float kSLog2Table[LOG_LOOKUP_IDX_MAX];
+typedef float (*VP8LFastLog2SlowFunc)(uint32_t v);
+
+extern VP8LFastLog2SlowFunc VP8LFastLog2Slow;
+extern VP8LFastLog2SlowFunc VP8LFastSLog2Slow;
+
+static WEBP_INLINE float VP8LFastLog2(uint32_t v) {
+ return (v < LOG_LOOKUP_IDX_MAX) ? kLog2Table[v] : VP8LFastLog2Slow(v);
+}
+// Fast calculation of v * log2(v) for integer input.
+static WEBP_INLINE float VP8LFastSLog2(uint32_t v) {
+ return (v < LOG_LOOKUP_IDX_MAX) ? kSLog2Table[v] : VP8LFastSLog2Slow(v);
+}
+
+// -----------------------------------------------------------------------------
+// PrefixEncode()
+
+static WEBP_INLINE int VP8LBitsLog2Ceiling(uint32_t n) {
+ const int log_floor = BitsLog2Floor(n);
+ if (n == (n & ~(n - 1))) { // zero or a power of two.
+ return log_floor;
+ }
+ return log_floor + 1;
+}
+
+// Splitting of distance and length codes into prefixes and
+// extra bits. The prefixes are encoded with an entropy code
+// while the extra bits are stored just as normal bits.
+static WEBP_INLINE void VP8LPrefixEncodeBitsNoLUT(int distance, int* const code,
+ int* const extra_bits) {
+ const int highest_bit = BitsLog2Floor(--distance);
+ const int second_highest_bit = (distance >> (highest_bit - 1)) & 1;
+ *extra_bits = highest_bit - 1;
+ *code = 2 * highest_bit + second_highest_bit;
+}
+
+static WEBP_INLINE void VP8LPrefixEncodeNoLUT(int distance, int* const code,
+ int* const extra_bits,
+ int* const extra_bits_value) {
+ const int highest_bit = BitsLog2Floor(--distance);
+ const int second_highest_bit = (distance >> (highest_bit - 1)) & 1;
+ *extra_bits = highest_bit - 1;
+ *extra_bits_value = distance & ((1 << *extra_bits) - 1);
+ *code = 2 * highest_bit + second_highest_bit;
+}
+
+#define PREFIX_LOOKUP_IDX_MAX 512
+typedef struct {
+ int8_t code_;
+ int8_t extra_bits_;
+} VP8LPrefixCode;
+
+// These tables are derived using VP8LPrefixEncodeNoLUT.
+extern const VP8LPrefixCode kPrefixEncodeCode[PREFIX_LOOKUP_IDX_MAX];
+extern const uint8_t kPrefixEncodeExtraBitsValue[PREFIX_LOOKUP_IDX_MAX];
+static WEBP_INLINE void VP8LPrefixEncodeBits(int distance, int* const code,
+ int* const extra_bits) {
+ if (distance < PREFIX_LOOKUP_IDX_MAX) {
+ const VP8LPrefixCode prefix_code = kPrefixEncodeCode[distance];
+ *code = prefix_code.code_;
+ *extra_bits = prefix_code.extra_bits_;
+ } else {
+ VP8LPrefixEncodeBitsNoLUT(distance, code, extra_bits);
+ }
+}
+
+static WEBP_INLINE void VP8LPrefixEncode(int distance, int* const code,
+ int* const extra_bits,
+ int* const extra_bits_value) {
+ if (distance < PREFIX_LOOKUP_IDX_MAX) {
+ const VP8LPrefixCode prefix_code = kPrefixEncodeCode[distance];
+ *code = prefix_code.code_;
+ *extra_bits = prefix_code.extra_bits_;
+ *extra_bits_value = kPrefixEncodeExtraBitsValue[distance];
+ } else {
+ VP8LPrefixEncodeNoLUT(distance, code, extra_bits, extra_bits_value);
+ }
+}
+
+// Sum of each component, mod 256.
+static WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW WEBP_INLINE
+uint32_t VP8LAddPixels(uint32_t a, uint32_t b) {
+ const uint32_t alpha_and_green = (a & 0xff00ff00u) + (b & 0xff00ff00u);
+ const uint32_t red_and_blue = (a & 0x00ff00ffu) + (b & 0x00ff00ffu);
+ return (alpha_and_green & 0xff00ff00u) | (red_and_blue & 0x00ff00ffu);
+}
+
+// Difference of each component, mod 256.
+static WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW WEBP_INLINE
+uint32_t VP8LSubPixels(uint32_t a, uint32_t b) {
+ const uint32_t alpha_and_green =
+ 0x00ff00ffu + (a & 0xff00ff00u) - (b & 0xff00ff00u);
+ const uint32_t red_and_blue =
+ 0xff00ff00u + (a & 0x00ff00ffu) - (b & 0x00ff00ffu);
+ return (alpha_and_green & 0xff00ff00u) | (red_and_blue & 0x00ff00ffu);
+}
+
+//------------------------------------------------------------------------------
+// Transform-related functions used in both encoding and decoding.
+
+// Macros used to create a batch predictor that iteratively uses a
+// one-pixel predictor.
+
+// GENERATE_PREDICTOR_ADD: the predictor is added to the input pixel
+// (the residual) to reconstruct the final output pixel.
+#define GENERATE_PREDICTOR_ADD(PREDICTOR, PREDICTOR_ADD) \
+static void PREDICTOR_ADD(const uint32_t* in, const uint32_t* upper, \
+ int num_pixels, uint32_t* out) { \
+ int x; \
+ for (x = 0; x < num_pixels; ++x) { \
+ const uint32_t pred = (PREDICTOR)(out[x - 1], upper + x); \
+ out[x] = VP8LAddPixels(in[x], pred); \
+ } \
+}
+
+// It subtracts the prediction from the input pixel and stores the residual
+// in the output pixel.
+#define GENERATE_PREDICTOR_SUB(PREDICTOR, PREDICTOR_SUB) \
+static void PREDICTOR_SUB(const uint32_t* in, const uint32_t* upper, \
+ int num_pixels, uint32_t* out) { \
+ int x; \
+ for (x = 0; x < num_pixels; ++x) { \
+ const uint32_t pred = (PREDICTOR)(in[x - 1], upper + x); \
+ out[x] = VP8LSubPixels(in[x], pred); \
+ } \
+}
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // WEBP_DSP_LOSSLESS_COMMON_H_
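
As a worked example of the prefix-coding split defined above: for distance
78, the decrement yields 77 (binary 1001101), whose highest bit is 6 and
second-highest bit is 0, so code = 2*6 + 0 = 12, extra_bits = 5 and
extra_bits_value = 77 & 31 = 13. A self-contained check of the same
arithmetic; BitsLog2Floor is re-implemented locally since the utils header
is not part of this diff.

    // Self-contained check of the VP8LPrefixEncodeNoLUT arithmetic above.
    #include <stdio.h>

    static int BitsLog2Floor(unsigned n) {  // local stand-in for the helper
      int log = 0;
      while (n >>= 1) ++log;
      return log;
    }

    static void PrefixEncodeNoLUT(int distance, int* code, int* extra_bits,
                                  int* extra_bits_value) {
      const int highest_bit = BitsLog2Floor(--distance);
      const int second_highest_bit = (distance >> (highest_bit - 1)) & 1;
      *extra_bits = highest_bit - 1;
      *extra_bits_value = distance & ((1 << *extra_bits) - 1);
      *code = 2 * highest_bit + second_highest_bit;
    }

    int main(void) {
      int code, bits, value;
      PrefixEncodeNoLUT(78, &code, &bits, &value);
      printf("code=%d extra_bits=%d value=%d\n", code, bits, value);
      // Expected: code=12 extra_bits=5 value=13
      return 0;
    }
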
diff --git a/src/3rdparty/libwebp/src/dsp/lossless_enc.c b/src/3rdparty/libwebp/src/dsp/lossless_enc.c
index 256f6f5..4e46fba 100644
--- a/src/3rdparty/libwebp/src/dsp/lossless_enc.c
+++ b/src/3rdparty/libwebp/src/dsp/lossless_enc.c
@@ -17,16 +17,12 @@
#include <math.h>
#include <stdlib.h>
-#include "../dec/vp8li.h"
-#include "../utils/endian_inl.h"
+#include "../dec/vp8li_dec.h"
+#include "../utils/endian_inl_utils.h"
#include "./lossless.h"
+#include "./lossless_common.h"
#include "./yuv.h"
-#define MAX_DIFF_COST (1e30f)
-
-static const int kPredLowEffort = 11;
-static const uint32_t kMaskAlpha = 0xff000000;
-
// lookup table for small values of log2(int)
const float kLog2Table[LOG_LOOKUP_IDX_MAX] = {
0.0000000000000000f, 0.0000000000000000f,
@@ -380,26 +376,9 @@ static float FastLog2Slow(uint32_t v) {
}
}
-// Mostly used to reduce code size + readability
-static WEBP_INLINE int GetMin(int a, int b) { return (a > b) ? b : a; }
-static WEBP_INLINE int GetMax(int a, int b) { return (a < b) ? b : a; }
-
//------------------------------------------------------------------------------
// Methods to calculate Entropy (Shannon).
-static float PredictionCostSpatial(const int counts[256], int weight_0,
- double exp_val) {
- const int significant_symbols = 256 >> 4;
- const double exp_decay_factor = 0.6;
- double bits = weight_0 * counts[0];
- int i;
- for (i = 1; i < significant_symbols; ++i) {
- bits += exp_val * (counts[i] + counts[256 - i]);
- exp_val *= exp_decay_factor;
- }
- return (float)(-0.1 * bits);
-}
-
// Compute the combined Shannon's entropy for distributions {X} and {X+Y}
static float CombinedShannonEntropy(const int X[256], const int Y[256]) {
int i;
@@ -422,18 +401,6 @@ static float CombinedShannonEntropy(const int X[256], const int Y[256]) {
return (float)retval;
}
-static float PredictionCostSpatialHistogram(const int accumulated[4][256],
- const int tile[4][256]) {
- int i;
- double retval = 0;
- for (i = 0; i < 4; ++i) {
- const double kExpValue = 0.94;
- retval += PredictionCostSpatial(tile[i], 1, kExpValue);
- retval += VP8LCombinedShannonEntropy(tile[i], accumulated[i]);
- }
- return (float)retval;
-}
-
void VP8LBitEntropyInit(VP8LBitEntropy* const entropy) {
entropy->entropy = 0.;
entropy->sum = 0;
@@ -486,9 +453,9 @@ static WEBP_INLINE void GetEntropyUnrefinedHelper(
*i_prev = i;
}
-void VP8LGetEntropyUnrefined(const uint32_t* const X, int length,
- VP8LBitEntropy* const bit_entropy,
- VP8LStreaks* const stats) {
+static void GetEntropyUnrefined(const uint32_t X[], int length,
+ VP8LBitEntropy* const bit_entropy,
+ VP8LStreaks* const stats) {
int i;
int i_prev = 0;
uint32_t x_prev = X[0];
@@ -499,18 +466,18 @@ void VP8LGetEntropyUnrefined(const uint32_t* const X, int length,
for (i = 1; i < length; ++i) {
const uint32_t x = X[i];
if (x != x_prev) {
- VP8LGetEntropyUnrefinedHelper(x, i, &x_prev, &i_prev, bit_entropy, stats);
+ GetEntropyUnrefinedHelper(x, i, &x_prev, &i_prev, bit_entropy, stats);
}
}
- VP8LGetEntropyUnrefinedHelper(0, i, &x_prev, &i_prev, bit_entropy, stats);
+ GetEntropyUnrefinedHelper(0, i, &x_prev, &i_prev, bit_entropy, stats);
bit_entropy->entropy += VP8LFastSLog2(bit_entropy->sum);
}
-void VP8LGetCombinedEntropyUnrefined(const uint32_t* const X,
- const uint32_t* const Y, int length,
- VP8LBitEntropy* const bit_entropy,
- VP8LStreaks* const stats) {
+static void GetCombinedEntropyUnrefined(const uint32_t X[], const uint32_t Y[],
+ int length,
+ VP8LBitEntropy* const bit_entropy,
+ VP8LStreaks* const stats) {
int i = 1;
int i_prev = 0;
uint32_t xy_prev = X[0] + Y[0];
@@ -521,439 +488,29 @@ void VP8LGetCombinedEntropyUnrefined(const uint32_t* const X,
for (i = 1; i < length; ++i) {
const uint32_t xy = X[i] + Y[i];
if (xy != xy_prev) {
- VP8LGetEntropyUnrefinedHelper(xy, i, &xy_prev, &i_prev, bit_entropy,
- stats);
+ GetEntropyUnrefinedHelper(xy, i, &xy_prev, &i_prev, bit_entropy, stats);
}
}
- VP8LGetEntropyUnrefinedHelper(0, i, &xy_prev, &i_prev, bit_entropy, stats);
+ GetEntropyUnrefinedHelper(0, i, &xy_prev, &i_prev, bit_entropy, stats);
bit_entropy->entropy += VP8LFastSLog2(bit_entropy->sum);
}
-static WEBP_INLINE void UpdateHisto(int histo_argb[4][256], uint32_t argb) {
- ++histo_argb[0][argb >> 24];
- ++histo_argb[1][(argb >> 16) & 0xff];
- ++histo_argb[2][(argb >> 8) & 0xff];
- ++histo_argb[3][argb & 0xff];
-}
-
//------------------------------------------------------------------------------
-static WEBP_INLINE uint32_t Predict(VP8LPredictorFunc pred_func,
- int x, int y,
- const uint32_t* current_row,
- const uint32_t* upper_row) {
- if (y == 0) {
- return (x == 0) ? ARGB_BLACK : current_row[x - 1]; // Left.
- } else if (x == 0) {
- return upper_row[x]; // Top.
- } else {
- return pred_func(current_row[x - 1], upper_row + x);
- }
-}
-
-static int MaxDiffBetweenPixels(uint32_t p1, uint32_t p2) {
- const int diff_a = abs((int)(p1 >> 24) - (int)(p2 >> 24));
- const int diff_r = abs((int)((p1 >> 16) & 0xff) - (int)((p2 >> 16) & 0xff));
- const int diff_g = abs((int)((p1 >> 8) & 0xff) - (int)((p2 >> 8) & 0xff));
- const int diff_b = abs((int)(p1 & 0xff) - (int)(p2 & 0xff));
- return GetMax(GetMax(diff_a, diff_r), GetMax(diff_g, diff_b));
-}
-
-static int MaxDiffAroundPixel(uint32_t current, uint32_t up, uint32_t down,
- uint32_t left, uint32_t right) {
- const int diff_up = MaxDiffBetweenPixels(current, up);
- const int diff_down = MaxDiffBetweenPixels(current, down);
- const int diff_left = MaxDiffBetweenPixels(current, left);
- const int diff_right = MaxDiffBetweenPixels(current, right);
- return GetMax(GetMax(diff_up, diff_down), GetMax(diff_left, diff_right));
-}
-
-static uint32_t AddGreenToBlueAndRed(uint32_t argb) {
- const uint32_t green = (argb >> 8) & 0xff;
- uint32_t red_blue = argb & 0x00ff00ffu;
- red_blue += (green << 16) | green;
- red_blue &= 0x00ff00ffu;
- return (argb & 0xff00ff00u) | red_blue;
-}
-
-static void MaxDiffsForRow(int width, int stride, const uint32_t* const argb,
- uint8_t* const max_diffs, int used_subtract_green) {
- uint32_t current, up, down, left, right;
- int x;
- if (width <= 2) return;
- current = argb[0];
- right = argb[1];
- if (used_subtract_green) {
- current = AddGreenToBlueAndRed(current);
- right = AddGreenToBlueAndRed(right);
- }
- // max_diffs[0] and max_diffs[width - 1] are never used.
- for (x = 1; x < width - 1; ++x) {
- up = argb[-stride + x];
- down = argb[stride + x];
- left = current;
- current = right;
- right = argb[x + 1];
- if (used_subtract_green) {
- up = AddGreenToBlueAndRed(up);
- down = AddGreenToBlueAndRed(down);
- right = AddGreenToBlueAndRed(right);
- }
- max_diffs[x] = MaxDiffAroundPixel(current, up, down, left, right);
- }
-}
-
-// Quantize the difference between the actual component value and its prediction
-// to a multiple of quantization, working modulo 256, taking care not to cross
-// a boundary (inclusive upper limit).
-static uint8_t NearLosslessComponent(uint8_t value, uint8_t predict,
- uint8_t boundary, int quantization) {
- const int residual = (value - predict) & 0xff;
- const int boundary_residual = (boundary - predict) & 0xff;
- const int lower = residual & ~(quantization - 1);
- const int upper = lower + quantization;
- // Resolve ties towards a value closer to the prediction (i.e. towards lower
- // if value comes after prediction and towards upper otherwise).
- const int bias = ((boundary - value) & 0xff) < boundary_residual;
- if (residual - lower < upper - residual + bias) {
- // lower is closer to residual than upper.
- if (residual > boundary_residual && lower <= boundary_residual) {
- // Halve quantization step to avoid crossing boundary. This midpoint is
- // on the same side of boundary as residual because midpoint >= residual
- // (since lower is closer than upper) and residual is above the boundary.
- return lower + (quantization >> 1);
- }
- return lower;
- } else {
- // upper is closer to residual than lower.
- if (residual <= boundary_residual && upper > boundary_residual) {
- // Halve quantization step to avoid crossing boundary. This midpoint is
- // on the same side of boundary as residual because midpoint <= residual
- // (since upper is closer than lower) and residual is below the boundary.
- return lower + (quantization >> 1);
- }
- return upper & 0xff;
- }
-}
-
-// Quantize every component of the difference between the actual pixel value and
-// its prediction to a multiple of a quantization (a power of 2, not larger than
-// max_quantization which is a power of 2, smaller than max_diff). Take care if
-// value and predict have undergone subtract green, which means that red and
-// blue are represented as offsets from green.
-static uint32_t NearLossless(uint32_t value, uint32_t predict,
- int max_quantization, int max_diff,
- int used_subtract_green) {
- int quantization;
- uint8_t new_green = 0;
- uint8_t green_diff = 0;
- uint8_t a, r, g, b;
- if (max_diff <= 2) {
- return VP8LSubPixels(value, predict);
- }
- quantization = max_quantization;
- while (quantization >= max_diff) {
- quantization >>= 1;
- }
- if ((value >> 24) == 0 || (value >> 24) == 0xff) {
- // Preserve transparency of fully transparent or fully opaque pixels.
- a = ((value >> 24) - (predict >> 24)) & 0xff;
- } else {
- a = NearLosslessComponent(value >> 24, predict >> 24, 0xff, quantization);
- }
- g = NearLosslessComponent((value >> 8) & 0xff, (predict >> 8) & 0xff, 0xff,
- quantization);
- if (used_subtract_green) {
- // The green offset will be added to red and blue components during decoding
- // to obtain the actual red and blue values.
- new_green = ((predict >> 8) + g) & 0xff;
- // The amount by which green has been adjusted during quantization. It is
- // subtracted from red and blue for compensation, to avoid accumulating two
- // quantization errors in them.
- green_diff = (new_green - (value >> 8)) & 0xff;
- }
- r = NearLosslessComponent(((value >> 16) - green_diff) & 0xff,
- (predict >> 16) & 0xff, 0xff - new_green,
- quantization);
- b = NearLosslessComponent((value - green_diff) & 0xff, predict & 0xff,
- 0xff - new_green, quantization);
- return ((uint32_t)a << 24) | ((uint32_t)r << 16) | ((uint32_t)g << 8) | b;
-}
-
-// Returns the difference between the pixel and its prediction. In case of a
-// lossy encoding, updates the source image to avoid propagating the deviation
-// further to pixels which depend on the current pixel for their predictions.
-static WEBP_INLINE uint32_t GetResidual(int width, int height,
- uint32_t* const upper_row,
- uint32_t* const current_row,
- const uint8_t* const max_diffs,
- int mode, VP8LPredictorFunc pred_func,
- int x, int y, int max_quantization,
- int exact, int used_subtract_green) {
- const uint32_t predict = Predict(pred_func, x, y, current_row, upper_row);
- uint32_t residual;
- if (max_quantization == 1 || mode == 0 || y == 0 || y == height - 1 ||
- x == 0 || x == width - 1) {
- residual = VP8LSubPixels(current_row[x], predict);
- } else {
- residual = NearLossless(current_row[x], predict, max_quantization,
- max_diffs[x], used_subtract_green);
- // Update the source image.
- current_row[x] = VP8LAddPixels(predict, residual);
-    // x is never 0 here, so upper_row does not need the update done below.
- }
- if (!exact && (current_row[x] & kMaskAlpha) == 0) {
- // If alpha is 0, cleanup RGB. We can choose the RGB values of the residual
- // for best compression. The prediction of alpha itself can be non-zero and
- // must be kept though. We choose RGB of the residual to be 0.
- residual &= kMaskAlpha;
- // Update the source image.
- current_row[x] = predict & ~kMaskAlpha;
- // The prediction for the rightmost pixel in a row uses the leftmost pixel
- // in that row as its top-right context pixel. Hence if we change the
- // leftmost pixel of current_row, the corresponding change must be applied
- // to upper_row as well where top-right context is being read from.
- if (x == 0 && y != 0) upper_row[width] = current_row[0];
- }
- return residual;
-}
-
-// Returns best predictor and updates the accumulated histogram.
-// If max_quantization > 1, assumes that near lossless processing will be
-// applied, quantizing residuals to multiples of quantization levels up to
-// max_quantization (the actual quantization level depends on smoothness near
-// the given pixel).
-static int GetBestPredictorForTile(int width, int height,
- int tile_x, int tile_y, int bits,
- int accumulated[4][256],
- uint32_t* const argb_scratch,
- const uint32_t* const argb,
- int max_quantization,
- int exact, int used_subtract_green) {
- const int kNumPredModes = 14;
- const int start_x = tile_x << bits;
- const int start_y = tile_y << bits;
- const int tile_size = 1 << bits;
- const int max_y = GetMin(tile_size, height - start_y);
- const int max_x = GetMin(tile_size, width - start_x);
- // Whether there exist columns just outside the tile.
- const int have_left = (start_x > 0);
- const int have_right = (max_x < width - start_x);
- // Position and size of the strip covering the tile and adjacent columns if
- // they exist.
- const int context_start_x = start_x - have_left;
- const int context_width = max_x + have_left + have_right;
- // The width of upper_row and current_row is one pixel larger than image width
- // to allow the top right pixel to point to the leftmost pixel of the next row
- // when at the right edge.
- uint32_t* upper_row = argb_scratch;
- uint32_t* current_row = upper_row + width + 1;
- uint8_t* const max_diffs = (uint8_t*)(current_row + width + 1);
- float best_diff = MAX_DIFF_COST;
- int best_mode = 0;
- int mode;
- int histo_stack_1[4][256];
- int histo_stack_2[4][256];
- // Need pointers to be able to swap arrays.
- int (*histo_argb)[256] = histo_stack_1;
- int (*best_histo)[256] = histo_stack_2;
- int i, j;
-
- for (mode = 0; mode < kNumPredModes; ++mode) {
- const VP8LPredictorFunc pred_func = VP8LPredictors[mode];
- float cur_diff;
- int relative_y;
- memset(histo_argb, 0, sizeof(histo_stack_1));
- if (start_y > 0) {
- // Read the row above the tile which will become the first upper_row.
- // Include a pixel to the left if it exists; include a pixel to the right
- // in all cases (wrapping to the leftmost pixel of the next row if it does
- // not exist).
- memcpy(current_row + context_start_x,
- argb + (start_y - 1) * width + context_start_x,
- sizeof(*argb) * (max_x + have_left + 1));
- }
- for (relative_y = 0; relative_y < max_y; ++relative_y) {
- const int y = start_y + relative_y;
- int relative_x;
- uint32_t* tmp = upper_row;
- upper_row = current_row;
- current_row = tmp;
- // Read current_row. Include a pixel to the left if it exists; include a
- // pixel to the right in all cases except at the bottom right corner of
- // the image (wrapping to the leftmost pixel of the next row if it does
- // not exist in the current row).
- memcpy(current_row + context_start_x,
- argb + y * width + context_start_x,
- sizeof(*argb) * (max_x + have_left + (y + 1 < height)));
- if (max_quantization > 1 && y >= 1 && y + 1 < height) {
- MaxDiffsForRow(context_width, width, argb + y * width + context_start_x,
- max_diffs + context_start_x, used_subtract_green);
- }
-
- for (relative_x = 0; relative_x < max_x; ++relative_x) {
- const int x = start_x + relative_x;
- UpdateHisto(histo_argb,
- GetResidual(width, height, upper_row, current_row,
- max_diffs, mode, pred_func, x, y,
- max_quantization, exact, used_subtract_green));
- }
- }
- cur_diff = PredictionCostSpatialHistogram(
- (const int (*)[256])accumulated, (const int (*)[256])histo_argb);
- if (cur_diff < best_diff) {
- int (*tmp)[256] = histo_argb;
- histo_argb = best_histo;
- best_histo = tmp;
- best_diff = cur_diff;
- best_mode = mode;
- }
- }
-
- for (i = 0; i < 4; i++) {
- for (j = 0; j < 256; j++) {
- accumulated[i][j] += best_histo[i][j];
- }
- }
-
- return best_mode;
-}
-
-// Converts pixels of the image to residuals with respect to predictions.
-// If max_quantization > 1, applies near lossless processing, quantizing
-// residuals to multiples of quantization levels up to max_quantization
-// (the actual quantization level depends on smoothness near the given pixel).
-static void CopyImageWithPrediction(int width, int height,
- int bits, uint32_t* const modes,
- uint32_t* const argb_scratch,
- uint32_t* const argb,
- int low_effort, int max_quantization,
- int exact, int used_subtract_green) {
- const int tiles_per_row = VP8LSubSampleSize(width, bits);
- const int mask = (1 << bits) - 1;
- // The width of upper_row and current_row is one pixel larger than image width
- // to allow the top right pixel to point to the leftmost pixel of the next row
- // when at the right edge.
- uint32_t* upper_row = argb_scratch;
- uint32_t* current_row = upper_row + width + 1;
- uint8_t* current_max_diffs = (uint8_t*)(current_row + width + 1);
- uint8_t* lower_max_diffs = current_max_diffs + width;
- int y;
- int mode = 0;
- VP8LPredictorFunc pred_func = NULL;
-
- for (y = 0; y < height; ++y) {
- int x;
- uint32_t* const tmp32 = upper_row;
- upper_row = current_row;
- current_row = tmp32;
- memcpy(current_row, argb + y * width,
- sizeof(*argb) * (width + (y + 1 < height)));
-
- if (low_effort) {
- for (x = 0; x < width; ++x) {
- const uint32_t predict = Predict(VP8LPredictors[kPredLowEffort], x, y,
- current_row, upper_row);
- argb[y * width + x] = VP8LSubPixels(current_row[x], predict);
- }
- } else {
- if (max_quantization > 1) {
- // Compute max_diffs for the lower row now, because that needs the
- // contents of argb for the current row, which we will overwrite with
- // residuals before proceeding with the next row.
- uint8_t* const tmp8 = current_max_diffs;
- current_max_diffs = lower_max_diffs;
- lower_max_diffs = tmp8;
- if (y + 2 < height) {
- MaxDiffsForRow(width, width, argb + (y + 1) * width, lower_max_diffs,
- used_subtract_green);
- }
- }
- for (x = 0; x < width; ++x) {
- if ((x & mask) == 0) {
- mode = (modes[(y >> bits) * tiles_per_row + (x >> bits)] >> 8) & 0xff;
- pred_func = VP8LPredictors[mode];
- }
- argb[y * width + x] = GetResidual(
- width, height, upper_row, current_row, current_max_diffs, mode,
- pred_func, x, y, max_quantization, exact, used_subtract_green);
- }
- }
- }
-}
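// Indexing example for the per-tile mode lookup above: with bits = 4
// (16x16 tiles), pixel (x, y) = (37, 21) falls in tile (37 >> 4, 21 >> 4)
// = (2, 1), i.e. modes[1 * tiles_per_row + 2]. The predictor mode is stored
// in the green channel of the mode image, hence the (>> 8) & 0xff.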
-
-// Finds the best predictor for each tile, and converts the image to residuals
-// with respect to predictions. If near_lossless_quality < 100, applies
-// near lossless processing, shaving off more bits of residuals for lower
-// qualities.
-void VP8LResidualImage(int width, int height, int bits, int low_effort,
- uint32_t* const argb, uint32_t* const argb_scratch,
- uint32_t* const image, int near_lossless_quality,
- int exact, int used_subtract_green) {
- const int tiles_per_row = VP8LSubSampleSize(width, bits);
- const int tiles_per_col = VP8LSubSampleSize(height, bits);
- int tile_y;
- int histo[4][256];
- const int max_quantization = 1 << VP8LNearLosslessBits(near_lossless_quality);
- if (low_effort) {
- int i;
- for (i = 0; i < tiles_per_row * tiles_per_col; ++i) {
- image[i] = ARGB_BLACK | (kPredLowEffort << 8);
- }
- } else {
- memset(histo, 0, sizeof(histo));
- for (tile_y = 0; tile_y < tiles_per_col; ++tile_y) {
- int tile_x;
- for (tile_x = 0; tile_x < tiles_per_row; ++tile_x) {
- const int pred = GetBestPredictorForTile(width, height, tile_x, tile_y,
- bits, histo, argb_scratch, argb, max_quantization, exact,
- used_subtract_green);
- image[tile_y * tiles_per_row + tile_x] = ARGB_BLACK | (pred << 8);
- }
- }
- }
-
- CopyImageWithPrediction(width, height, bits, image, argb_scratch, argb,
- low_effort, max_quantization, exact,
- used_subtract_green);
-}
-
void VP8LSubtractGreenFromBlueAndRed_C(uint32_t* argb_data, int num_pixels) {
int i;
for (i = 0; i < num_pixels; ++i) {
- const uint32_t argb = argb_data[i];
- const uint32_t green = (argb >> 8) & 0xff;
+ const int argb = argb_data[i];
+ const int green = (argb >> 8) & 0xff;
const uint32_t new_r = (((argb >> 16) & 0xff) - green) & 0xff;
- const uint32_t new_b = ((argb & 0xff) - green) & 0xff;
- argb_data[i] = (argb & 0xff00ff00) | (new_r << 16) | new_b;
+ const uint32_t new_b = (((argb >> 0) & 0xff) - green) & 0xff;
+ argb_data[i] = (argb & 0xff00ff00u) | (new_r << 16) | new_b;
}
}
-static WEBP_INLINE void MultipliersClear(VP8LMultipliers* const m) {
- m->green_to_red_ = 0;
- m->green_to_blue_ = 0;
- m->red_to_blue_ = 0;
-}
-
-static WEBP_INLINE uint32_t ColorTransformDelta(int8_t color_pred,
- int8_t color) {
- return (uint32_t)((int)(color_pred) * color) >> 5;
-}
-
-static WEBP_INLINE void ColorCodeToMultipliers(uint32_t color_code,
- VP8LMultipliers* const m) {
- m->green_to_red_ = (color_code >> 0) & 0xff;
- m->green_to_blue_ = (color_code >> 8) & 0xff;
- m->red_to_blue_ = (color_code >> 16) & 0xff;
-}
-
-static WEBP_INLINE uint32_t MultipliersToColorCode(
- const VP8LMultipliers* const m) {
- return 0xff000000u |
- ((uint32_t)(m->red_to_blue_) << 16) |
- ((uint32_t)(m->green_to_blue_) << 8) |
- m->green_to_red_;
+static WEBP_INLINE int ColorTransformDelta(int8_t color_pred, int8_t color) {
+ return ((int)color_pred * color) >> 5;
}
void VP8LTransformColor_C(const VP8LMultipliers* const m, uint32_t* data,
@@ -963,8 +520,8 @@ void VP8LTransformColor_C(const VP8LMultipliers* const m, uint32_t* data,
const uint32_t argb = data[i];
const uint32_t green = argb >> 8;
const uint32_t red = argb >> 16;
- uint32_t new_red = red;
- uint32_t new_blue = argb;
+ int new_red = red;
+ int new_blue = argb;
new_red -= ColorTransformDelta(m->green_to_red_, green);
new_red &= 0xff;
new_blue -= ColorTransformDelta(m->green_to_blue_, green);
@@ -977,7 +534,7 @@ void VP8LTransformColor_C(const VP8LMultipliers* const m, uint32_t* data,
static WEBP_INLINE uint8_t TransformColorRed(uint8_t green_to_red,
uint32_t argb) {
const uint32_t green = argb >> 8;
- uint32_t new_red = argb >> 16;
+ int new_red = argb >> 16;
new_red -= ColorTransformDelta(green_to_red, green);
return (new_red & 0xff);
}
@@ -993,15 +550,6 @@ static WEBP_INLINE uint8_t TransformColorBlue(uint8_t green_to_blue,
return (new_blue & 0xff);
}
-static float PredictionCostCrossColor(const int accumulated[256],
- const int counts[256]) {
-  // Favor low entropy, locally and globally.
-  // PredictionCostSpatial additionally favors small absolute values.
- static const double kExpValue = 2.4;
- return VP8LCombinedShannonEntropy(counts, accumulated) +
- PredictionCostSpatial(counts, 3, kExpValue);
-}
-
void VP8LCollectColorRedTransforms_C(const uint32_t* argb, int stride,
int tile_width, int tile_height,
int green_to_red, int histo[]) {
@@ -1014,59 +562,6 @@ void VP8LCollectColorRedTransforms_C(const uint32_t* argb, int stride,
}
}
-static float GetPredictionCostCrossColorRed(
- const uint32_t* argb, int stride, int tile_width, int tile_height,
- VP8LMultipliers prev_x, VP8LMultipliers prev_y, int green_to_red,
- const int accumulated_red_histo[256]) {
- int histo[256] = { 0 };
- float cur_diff;
-
- VP8LCollectColorRedTransforms(argb, stride, tile_width, tile_height,
- green_to_red, histo);
-
- cur_diff = PredictionCostCrossColor(accumulated_red_histo, histo);
- if ((uint8_t)green_to_red == prev_x.green_to_red_) {
- cur_diff -= 3; // favor keeping the areas locally similar
- }
- if ((uint8_t)green_to_red == prev_y.green_to_red_) {
- cur_diff -= 3; // favor keeping the areas locally similar
- }
- if (green_to_red == 0) {
- cur_diff -= 3;
- }
- return cur_diff;
-}
-
-static void GetBestGreenToRed(
- const uint32_t* argb, int stride, int tile_width, int tile_height,
- VP8LMultipliers prev_x, VP8LMultipliers prev_y, int quality,
- const int accumulated_red_histo[256], VP8LMultipliers* const best_tx) {
- const int kMaxIters = 4 + ((7 * quality) >> 8); // in range [4..6]
- int green_to_red_best = 0;
- int iter, offset;
- float best_diff = GetPredictionCostCrossColorRed(
- argb, stride, tile_width, tile_height, prev_x, prev_y,
- green_to_red_best, accumulated_red_histo);
- for (iter = 0; iter < kMaxIters; ++iter) {
-    // ColorTransformDelta uses 3.5 fixed-point precision, so 32 equals one
-    // in the color computation. An initial delta of 1.0 (i.e. 32) here is
-    // sufficient to explore the range of (-2, 2).
- const int delta = 32 >> iter;
- // Try a negative and a positive delta from the best known value.
- for (offset = -delta; offset <= delta; offset += 2 * delta) {
- const int green_to_red_cur = offset + green_to_red_best;
- const float cur_diff = GetPredictionCostCrossColorRed(
- argb, stride, tile_width, tile_height, prev_x, prev_y,
- green_to_red_cur, accumulated_red_histo);
- if (cur_diff < best_diff) {
- best_diff = cur_diff;
- green_to_red_best = green_to_red_cur;
- }
- }
- }
- best_tx->green_to_red_ = green_to_red_best;
-}
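// The loop above is a 1-D refinement search: the probe distance is halved
// every iteration (32, 16, 8, ...) around the best value found so far. A
// generic sketch of the pattern, assuming a caller-provided cost function
// (all names here are illustrative):
typedef float (*CostFn)(int value, const void* ctx);
static int RefineValue(const CostFn cost, const void* ctx, int iters) {
  int iter, best = 0;
  float best_cost = cost(best, ctx);
  for (iter = 0; iter < iters; ++iter) {
    const int delta = 32 >> iter;  // 32 == 1.0 in 3.5 fixed point
    int offset;
    for (offset = -delta; offset <= delta; offset += 2 * delta) {
      const int cur = best + offset;
      const float cur_cost = cost(cur, ctx);
      if (cur_cost < best_cost) {
        best_cost = cur_cost;
        best = cur;
      }
    }
  }
  return best;
}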
-
void VP8LCollectColorBlueTransforms_C(const uint32_t* argb, int stride,
int tile_width, int tile_height,
int green_to_blue, int red_to_blue,
@@ -1080,187 +575,6 @@ void VP8LCollectColorBlueTransforms_C(const uint32_t* argb, int stride,
}
}
-static float GetPredictionCostCrossColorBlue(
- const uint32_t* argb, int stride, int tile_width, int tile_height,
- VP8LMultipliers prev_x, VP8LMultipliers prev_y,
- int green_to_blue, int red_to_blue, const int accumulated_blue_histo[256]) {
- int histo[256] = { 0 };
- float cur_diff;
-
- VP8LCollectColorBlueTransforms(argb, stride, tile_width, tile_height,
- green_to_blue, red_to_blue, histo);
-
- cur_diff = PredictionCostCrossColor(accumulated_blue_histo, histo);
- if ((uint8_t)green_to_blue == prev_x.green_to_blue_) {
- cur_diff -= 3; // favor keeping the areas locally similar
- }
- if ((uint8_t)green_to_blue == prev_y.green_to_blue_) {
- cur_diff -= 3; // favor keeping the areas locally similar
- }
- if ((uint8_t)red_to_blue == prev_x.red_to_blue_) {
- cur_diff -= 3; // favor keeping the areas locally similar
- }
- if ((uint8_t)red_to_blue == prev_y.red_to_blue_) {
- cur_diff -= 3; // favor keeping the areas locally similar
- }
- if (green_to_blue == 0) {
- cur_diff -= 3;
- }
- if (red_to_blue == 0) {
- cur_diff -= 3;
- }
- return cur_diff;
-}
-
-#define kGreenRedToBlueNumAxis 8
-#define kGreenRedToBlueMaxIters 7
-static void GetBestGreenRedToBlue(
- const uint32_t* argb, int stride, int tile_width, int tile_height,
- VP8LMultipliers prev_x, VP8LMultipliers prev_y, int quality,
- const int accumulated_blue_histo[256],
- VP8LMultipliers* const best_tx) {
- const int8_t offset[kGreenRedToBlueNumAxis][2] =
- {{0, -1}, {0, 1}, {-1, 0}, {1, 0}, {-1, -1}, {-1, 1}, {1, -1}, {1, 1}};
- const int8_t delta_lut[kGreenRedToBlueMaxIters] = { 16, 16, 8, 4, 2, 2, 2 };
- const int iters =
- (quality < 25) ? 1 : (quality > 50) ? kGreenRedToBlueMaxIters : 4;
- int green_to_blue_best = 0;
- int red_to_blue_best = 0;
- int iter;
- // Initial value at origin:
- float best_diff = GetPredictionCostCrossColorBlue(
- argb, stride, tile_width, tile_height, prev_x, prev_y,
- green_to_blue_best, red_to_blue_best, accumulated_blue_histo);
- for (iter = 0; iter < iters; ++iter) {
- const int delta = delta_lut[iter];
- int axis;
- for (axis = 0; axis < kGreenRedToBlueNumAxis; ++axis) {
- const int green_to_blue_cur =
- offset[axis][0] * delta + green_to_blue_best;
- const int red_to_blue_cur = offset[axis][1] * delta + red_to_blue_best;
- const float cur_diff = GetPredictionCostCrossColorBlue(
- argb, stride, tile_width, tile_height, prev_x, prev_y,
- green_to_blue_cur, red_to_blue_cur, accumulated_blue_histo);
- if (cur_diff < best_diff) {
- best_diff = cur_diff;
- green_to_blue_best = green_to_blue_cur;
- red_to_blue_best = red_to_blue_cur;
- }
- if (quality < 25 && iter == 4) {
-        // Only axis-aligned diffs for lower quality.
- break; // next iter.
- }
- }
- if (delta == 2 && green_to_blue_best == 0 && red_to_blue_best == 0) {
- // Further iterations would not help.
- break; // out of iter-loop.
- }
- }
- best_tx->green_to_blue_ = green_to_blue_best;
- best_tx->red_to_blue_ = red_to_blue_best;
-}
-#undef kGreenRedToBlueMaxIters
-#undef kGreenRedToBlueNumAxis
-
-static VP8LMultipliers GetBestColorTransformForTile(
- int tile_x, int tile_y, int bits,
- VP8LMultipliers prev_x,
- VP8LMultipliers prev_y,
- int quality, int xsize, int ysize,
- const int accumulated_red_histo[256],
- const int accumulated_blue_histo[256],
- const uint32_t* const argb) {
- const int max_tile_size = 1 << bits;
- const int tile_y_offset = tile_y * max_tile_size;
- const int tile_x_offset = tile_x * max_tile_size;
- const int all_x_max = GetMin(tile_x_offset + max_tile_size, xsize);
- const int all_y_max = GetMin(tile_y_offset + max_tile_size, ysize);
- const int tile_width = all_x_max - tile_x_offset;
- const int tile_height = all_y_max - tile_y_offset;
- const uint32_t* const tile_argb = argb + tile_y_offset * xsize
- + tile_x_offset;
- VP8LMultipliers best_tx;
- MultipliersClear(&best_tx);
-
- GetBestGreenToRed(tile_argb, xsize, tile_width, tile_height,
- prev_x, prev_y, quality, accumulated_red_histo, &best_tx);
- GetBestGreenRedToBlue(tile_argb, xsize, tile_width, tile_height,
- prev_x, prev_y, quality, accumulated_blue_histo,
- &best_tx);
- return best_tx;
-}
-
-static void CopyTileWithColorTransform(int xsize, int ysize,
- int tile_x, int tile_y,
- int max_tile_size,
- VP8LMultipliers color_transform,
- uint32_t* argb) {
- const int xscan = GetMin(max_tile_size, xsize - tile_x);
- int yscan = GetMin(max_tile_size, ysize - tile_y);
- argb += tile_y * xsize + tile_x;
- while (yscan-- > 0) {
- VP8LTransformColor(&color_transform, argb, xscan);
- argb += xsize;
- }
-}
-
-void VP8LColorSpaceTransform(int width, int height, int bits, int quality,
- uint32_t* const argb, uint32_t* image) {
- const int max_tile_size = 1 << bits;
- const int tile_xsize = VP8LSubSampleSize(width, bits);
- const int tile_ysize = VP8LSubSampleSize(height, bits);
- int accumulated_red_histo[256] = { 0 };
- int accumulated_blue_histo[256] = { 0 };
- int tile_x, tile_y;
- VP8LMultipliers prev_x, prev_y;
- MultipliersClear(&prev_y);
- MultipliersClear(&prev_x);
- for (tile_y = 0; tile_y < tile_ysize; ++tile_y) {
- for (tile_x = 0; tile_x < tile_xsize; ++tile_x) {
- int y;
- const int tile_x_offset = tile_x * max_tile_size;
- const int tile_y_offset = tile_y * max_tile_size;
- const int all_x_max = GetMin(tile_x_offset + max_tile_size, width);
- const int all_y_max = GetMin(tile_y_offset + max_tile_size, height);
- const int offset = tile_y * tile_xsize + tile_x;
- if (tile_y != 0) {
- ColorCodeToMultipliers(image[offset - tile_xsize], &prev_y);
- }
- prev_x = GetBestColorTransformForTile(tile_x, tile_y, bits,
- prev_x, prev_y,
- quality, width, height,
- accumulated_red_histo,
- accumulated_blue_histo,
- argb);
- image[offset] = MultipliersToColorCode(&prev_x);
- CopyTileWithColorTransform(width, height, tile_x_offset, tile_y_offset,
- max_tile_size, prev_x, argb);
-
- // Gather accumulated histogram data.
- for (y = tile_y_offset; y < all_y_max; ++y) {
- int ix = y * width + tile_x_offset;
- const int ix_end = ix + all_x_max - tile_x_offset;
- for (; ix < ix_end; ++ix) {
- const uint32_t pix = argb[ix];
- if (ix >= 2 &&
- pix == argb[ix - 2] &&
- pix == argb[ix - 1]) {
- continue; // repeated pixels are handled by backward references
- }
- if (ix >= width + 2 &&
- argb[ix - 2] == argb[ix - width - 2] &&
- argb[ix - 1] == argb[ix - width - 1] &&
- pix == argb[ix - width]) {
- continue; // repeated pixels are handled by backward references
- }
- ++accumulated_red_histo[(pix >> 16) & 0xff];
- ++accumulated_blue_histo[(pix >> 0) & 0xff];
- }
- }
- }
- }
-}
-
//------------------------------------------------------------------------------
static int VectorMismatch(const uint32_t* const array1,
@@ -1274,8 +588,8 @@ static int VectorMismatch(const uint32_t* const array1,
}
// Bundles multiple (1, 2, 4 or 8) pixels into a single pixel.
-void VP8LBundleColorMap(const uint8_t* const row, int width,
- int xbits, uint32_t* const dst) {
+void VP8LBundleColorMap_C(const uint8_t* const row, int width, int xbits,
+ uint32_t* dst) {
int x;
if (xbits > 0) {
const int bit_depth = 1 << (3 - xbits);
@@ -1350,8 +664,172 @@ static void HistogramAdd(const VP8LHistogram* const a,
}
//------------------------------------------------------------------------------
+// Image transforms.
-VP8LProcessBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed;
+static WEBP_INLINE uint32_t Average2(uint32_t a0, uint32_t a1) {
+ return (((a0 ^ a1) & 0xfefefefeu) >> 1) + (a0 & a1);
+}
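// Why Average2() needs no widening: for any bytes a and b,
//   a + b == (a ^ b) + 2 * (a & b)
// so (a + b) >> 1 == ((a ^ b) >> 1) + (a & b). Masking the XOR with
// 0xfefefefe before shifting keeps bits from leaking across byte lanes.
// Spot check: a0 = 0x01ff01ff, a1 = 0x01010101 gives
//   ((a0 ^ a1) & 0xfefefefe) >> 1 == 0x007f007f
//   (a0 & a1)                     == 0x01010101
//   sum                           == 0x01800180   (per byte: 1 and 0x80)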
+
+static WEBP_INLINE uint32_t Average3(uint32_t a0, uint32_t a1, uint32_t a2) {
+ return Average2(Average2(a0, a2), a1);
+}
+
+static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1,
+ uint32_t a2, uint32_t a3) {
+ return Average2(Average2(a0, a1), Average2(a2, a3));
+}
+
+static WEBP_INLINE uint32_t Clip255(uint32_t a) {
+ if (a < 256) {
+ return a;
+ }
+  // Return 0 when a is a (wrapped-around) negative integer and 255 when it
+  // is positive; callers only pass values within [-255, 510].
+ return ~a >> 24;
+}
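// Spot check for Clip255(), with values as produced by the callers below
// (an int sum converted to uint32_t, always within [-255, 510]):
//   a = 300 (0x0000012c): not < 256, ~a >> 24 == 0xff  -> 255
//   a = -5  (0xfffffffb): not < 256, ~a >> 24 == 0x00  -> 0
//   a = 200: < 256, returned unchanged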
+
+static WEBP_INLINE int AddSubtractComponentFull(int a, int b, int c) {
+ return Clip255(a + b - c);
+}
+
+static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1,
+ uint32_t c2) {
+ const int a = AddSubtractComponentFull(c0 >> 24, c1 >> 24, c2 >> 24);
+ const int r = AddSubtractComponentFull((c0 >> 16) & 0xff,
+ (c1 >> 16) & 0xff,
+ (c2 >> 16) & 0xff);
+ const int g = AddSubtractComponentFull((c0 >> 8) & 0xff,
+ (c1 >> 8) & 0xff,
+ (c2 >> 8) & 0xff);
+ const int b = AddSubtractComponentFull(c0 & 0xff, c1 & 0xff, c2 & 0xff);
+ return ((uint32_t)a << 24) | (r << 16) | (g << 8) | b;
+}
+
+static WEBP_INLINE int AddSubtractComponentHalf(int a, int b) {
+ return Clip255(a + (a - b) / 2);
+}
+
+static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1,
+ uint32_t c2) {
+ const uint32_t ave = Average2(c0, c1);
+ const int a = AddSubtractComponentHalf(ave >> 24, c2 >> 24);
+ const int r = AddSubtractComponentHalf((ave >> 16) & 0xff, (c2 >> 16) & 0xff);
+ const int g = AddSubtractComponentHalf((ave >> 8) & 0xff, (c2 >> 8) & 0xff);
+ const int b = AddSubtractComponentHalf((ave >> 0) & 0xff, (c2 >> 0) & 0xff);
+ return ((uint32_t)a << 24) | (r << 16) | (g << 8) | b;
+}
+
+// gcc-4.8 and gcc-4.9 on ARM generate incorrect code in Select() when
+// Sub3() is inlined (hence the 0x408/0x409 checks and noinline below).
+#if defined(__arm__) && \
+ (LOCAL_GCC_VERSION == 0x409 || LOCAL_GCC_VERSION == 0x408)
+# define LOCAL_INLINE __attribute__ ((noinline))
+#else
+# define LOCAL_INLINE WEBP_INLINE
+#endif
+
+static LOCAL_INLINE int Sub3(int a, int b, int c) {
+ const int pb = b - c;
+ const int pa = a - c;
+ return abs(pb) - abs(pa);
+}
+
+#undef LOCAL_INLINE
+
+static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
+ const int pa_minus_pb =
+ Sub3((a >> 24) , (b >> 24) , (c >> 24) ) +
+ Sub3((a >> 16) & 0xff, (b >> 16) & 0xff, (c >> 16) & 0xff) +
+ Sub3((a >> 8) & 0xff, (b >> 8) & 0xff, (c >> 8) & 0xff) +
+ Sub3((a ) & 0xff, (b ) & 0xff, (c ) & 0xff);
+ return (pa_minus_pb <= 0) ? a : b;
+}
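// Select() is a Paeth-style chooser: with the implicit estimate
// p = b + a - c (left + top - topleft), |p - a| == |b - c| and
// |p - b| == |a - c| per channel, so the Sub3() sums pick whichever of
// a and b is closer to p. One-channel example: a = 0x50, b = 0x10,
// c = 0x12 gives |b - c| - |a - c| = 2 - 62 < 0, so a is selected.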
+
+//------------------------------------------------------------------------------
+// Predictors
+
+static uint32_t Predictor2(uint32_t left, const uint32_t* const top) {
+ (void)left;
+ return top[0];
+}
+static uint32_t Predictor3(uint32_t left, const uint32_t* const top) {
+ (void)left;
+ return top[1];
+}
+static uint32_t Predictor4(uint32_t left, const uint32_t* const top) {
+ (void)left;
+ return top[-1];
+}
+static uint32_t Predictor5(uint32_t left, const uint32_t* const top) {
+ const uint32_t pred = Average3(left, top[0], top[1]);
+ return pred;
+}
+static uint32_t Predictor6(uint32_t left, const uint32_t* const top) {
+ const uint32_t pred = Average2(left, top[-1]);
+ return pred;
+}
+static uint32_t Predictor7(uint32_t left, const uint32_t* const top) {
+ const uint32_t pred = Average2(left, top[0]);
+ return pred;
+}
+static uint32_t Predictor8(uint32_t left, const uint32_t* const top) {
+ const uint32_t pred = Average2(top[-1], top[0]);
+ (void)left;
+ return pred;
+}
+static uint32_t Predictor9(uint32_t left, const uint32_t* const top) {
+ const uint32_t pred = Average2(top[0], top[1]);
+ (void)left;
+ return pred;
+}
+static uint32_t Predictor10(uint32_t left, const uint32_t* const top) {
+ const uint32_t pred = Average4(left, top[-1], top[0], top[1]);
+ return pred;
+}
+static uint32_t Predictor11(uint32_t left, const uint32_t* const top) {
+ const uint32_t pred = Select(top[0], left, top[-1]);
+ return pred;
+}
+static uint32_t Predictor12(uint32_t left, const uint32_t* const top) {
+ const uint32_t pred = ClampedAddSubtractFull(left, top[0], top[-1]);
+ return pred;
+}
+static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
+ const uint32_t pred = ClampedAddSubtractHalf(left, top[0], top[-1]);
+ return pred;
+}
+
+//------------------------------------------------------------------------------
+
+static void PredictorSub0_C(const uint32_t* in, const uint32_t* upper,
+ int num_pixels, uint32_t* out) {
+ int i;
+ for (i = 0; i < num_pixels; ++i) out[i] = VP8LSubPixels(in[i], ARGB_BLACK);
+ (void)upper;
+}
+
+static void PredictorSub1_C(const uint32_t* in, const uint32_t* upper,
+ int num_pixels, uint32_t* out) {
+ int i;
+ for (i = 0; i < num_pixels; ++i) out[i] = VP8LSubPixels(in[i], in[i - 1]);
+ (void)upper;
+}
+
+GENERATE_PREDICTOR_SUB(Predictor2, PredictorSub2_C)
+GENERATE_PREDICTOR_SUB(Predictor3, PredictorSub3_C)
+GENERATE_PREDICTOR_SUB(Predictor4, PredictorSub4_C)
+GENERATE_PREDICTOR_SUB(Predictor5, PredictorSub5_C)
+GENERATE_PREDICTOR_SUB(Predictor6, PredictorSub6_C)
+GENERATE_PREDICTOR_SUB(Predictor7, PredictorSub7_C)
+GENERATE_PREDICTOR_SUB(Predictor8, PredictorSub8_C)
+GENERATE_PREDICTOR_SUB(Predictor9, PredictorSub9_C)
+GENERATE_PREDICTOR_SUB(Predictor10, PredictorSub10_C)
+GENERATE_PREDICTOR_SUB(Predictor11, PredictorSub11_C)
+GENERATE_PREDICTOR_SUB(Predictor12, PredictorSub12_C)
+GENERATE_PREDICTOR_SUB(Predictor13, PredictorSub13_C)
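// GENERATE_PREDICTOR_SUB is defined earlier in this file (outside this
// hunk); a plausible expansion, shown here only as an assumption about its
// shape, wraps a scalar predictor into the batch-subtract signature:
//
//   #define GENERATE_PREDICTOR_SUB(PREDICTOR, PREDICTOR_SUB)             \
//   static void PREDICTOR_SUB(const uint32_t* in, const uint32_t* upper, \
//                             int num_pixels, uint32_t* out) {           \
//     int x;                                                             \
//     for (x = 0; x < num_pixels; ++x) {                                 \
//       const uint32_t pred = (PREDICTOR)(in[x - 1], upper + x);         \
//       out[x] = VP8LSubPixels(in[x], pred);                             \
//     }                                                                  \
//   }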
+
+//------------------------------------------------------------------------------
+
+VP8LProcessEncBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed;
VP8LTransformColorFunc VP8LTransformColor;
@@ -1365,17 +843,23 @@ VP8LCostFunc VP8LExtraCost;
VP8LCostCombinedFunc VP8LExtraCostCombined;
VP8LCombinedShannonEntropyFunc VP8LCombinedShannonEntropy;
-GetEntropyUnrefinedHelperFunc VP8LGetEntropyUnrefinedHelper;
+VP8LGetEntropyUnrefinedFunc VP8LGetEntropyUnrefined;
+VP8LGetCombinedEntropyUnrefinedFunc VP8LGetCombinedEntropyUnrefined;
VP8LHistogramAddFunc VP8LHistogramAdd;
VP8LVectorMismatchFunc VP8LVectorMismatch;
+VP8LBundleColorMapFunc VP8LBundleColorMap;
+
+VP8LPredictorAddSubFunc VP8LPredictorsSub[16];
+VP8LPredictorAddSubFunc VP8LPredictorsSub_C[16];
extern void VP8LEncDspInitSSE2(void);
extern void VP8LEncDspInitSSE41(void);
extern void VP8LEncDspInitNEON(void);
extern void VP8LEncDspInitMIPS32(void);
extern void VP8LEncDspInitMIPSdspR2(void);
+extern void VP8LEncDspInitMSA(void);
static volatile VP8CPUInfo lossless_enc_last_cpuinfo_used =
(VP8CPUInfo)&lossless_enc_last_cpuinfo_used;
@@ -1399,11 +883,47 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInit(void) {
VP8LExtraCostCombined = ExtraCostCombined;
VP8LCombinedShannonEntropy = CombinedShannonEntropy;
- VP8LGetEntropyUnrefinedHelper = GetEntropyUnrefinedHelper;
+ VP8LGetEntropyUnrefined = GetEntropyUnrefined;
+ VP8LGetCombinedEntropyUnrefined = GetCombinedEntropyUnrefined;
VP8LHistogramAdd = HistogramAdd;
VP8LVectorMismatch = VectorMismatch;
+ VP8LBundleColorMap = VP8LBundleColorMap_C;
+
+ VP8LPredictorsSub[0] = PredictorSub0_C;
+ VP8LPredictorsSub[1] = PredictorSub1_C;
+ VP8LPredictorsSub[2] = PredictorSub2_C;
+ VP8LPredictorsSub[3] = PredictorSub3_C;
+ VP8LPredictorsSub[4] = PredictorSub4_C;
+ VP8LPredictorsSub[5] = PredictorSub5_C;
+ VP8LPredictorsSub[6] = PredictorSub6_C;
+ VP8LPredictorsSub[7] = PredictorSub7_C;
+ VP8LPredictorsSub[8] = PredictorSub8_C;
+ VP8LPredictorsSub[9] = PredictorSub9_C;
+ VP8LPredictorsSub[10] = PredictorSub10_C;
+ VP8LPredictorsSub[11] = PredictorSub11_C;
+ VP8LPredictorsSub[12] = PredictorSub12_C;
+ VP8LPredictorsSub[13] = PredictorSub13_C;
+ VP8LPredictorsSub[14] = PredictorSub0_C; // <- padding security sentinels
+ VP8LPredictorsSub[15] = PredictorSub0_C;
+
+ VP8LPredictorsSub_C[0] = PredictorSub0_C;
+ VP8LPredictorsSub_C[1] = PredictorSub1_C;
+ VP8LPredictorsSub_C[2] = PredictorSub2_C;
+ VP8LPredictorsSub_C[3] = PredictorSub3_C;
+ VP8LPredictorsSub_C[4] = PredictorSub4_C;
+ VP8LPredictorsSub_C[5] = PredictorSub5_C;
+ VP8LPredictorsSub_C[6] = PredictorSub6_C;
+ VP8LPredictorsSub_C[7] = PredictorSub7_C;
+ VP8LPredictorsSub_C[8] = PredictorSub8_C;
+ VP8LPredictorsSub_C[9] = PredictorSub9_C;
+ VP8LPredictorsSub_C[10] = PredictorSub10_C;
+ VP8LPredictorsSub_C[11] = PredictorSub11_C;
+ VP8LPredictorsSub_C[12] = PredictorSub12_C;
+ VP8LPredictorsSub_C[13] = PredictorSub13_C;
+ VP8LPredictorsSub_C[14] = PredictorSub0_C; // <- padding security sentinels
+ VP8LPredictorsSub_C[15] = PredictorSub0_C;
// If defined, use CPUInfo() to overwrite some pointers with faster versions.
if (VP8GetCPUInfo != NULL) {
@@ -1432,6 +952,11 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInit(void) {
VP8LEncDspInitMIPSdspR2();
}
#endif
+#if defined(WEBP_USE_MSA)
+ if (VP8GetCPUInfo(kMSA)) {
+ VP8LEncDspInitMSA();
+ }
+#endif
}
lossless_enc_last_cpuinfo_used = VP8GetCPUInfo;
}
diff --git a/src/3rdparty/libwebp/src/dsp/lossless_enc_mips32.c b/src/3rdparty/libwebp/src/dsp/lossless_enc_mips32.c
index 49c666d..4186b9f 100644
--- a/src/3rdparty/libwebp/src/dsp/lossless_enc_mips32.c
+++ b/src/3rdparty/libwebp/src/dsp/lossless_enc_mips32.c
@@ -14,6 +14,7 @@
#include "./dsp.h"
#include "./lossless.h"
+#include "./lossless_common.h"
#if defined(WEBP_USE_MIPS32)
@@ -240,6 +241,49 @@ static WEBP_INLINE void GetEntropyUnrefinedHelper(
*i_prev = i;
}
+static void GetEntropyUnrefined(const uint32_t X[], int length,
+ VP8LBitEntropy* const bit_entropy,
+ VP8LStreaks* const stats) {
+ int i;
+ int i_prev = 0;
+ uint32_t x_prev = X[0];
+
+ memset(stats, 0, sizeof(*stats));
+ VP8LBitEntropyInit(bit_entropy);
+
+ for (i = 1; i < length; ++i) {
+ const uint32_t x = X[i];
+ if (x != x_prev) {
+ GetEntropyUnrefinedHelper(x, i, &x_prev, &i_prev, bit_entropy, stats);
+ }
+ }
+ GetEntropyUnrefinedHelper(0, i, &x_prev, &i_prev, bit_entropy, stats);
+
+ bit_entropy->entropy += VP8LFastSLog2(bit_entropy->sum);
+}
+
+static void GetCombinedEntropyUnrefined(const uint32_t X[], const uint32_t Y[],
+ int length,
+ VP8LBitEntropy* const bit_entropy,
+ VP8LStreaks* const stats) {
+  int i;
+ int i_prev = 0;
+ uint32_t xy_prev = X[0] + Y[0];
+
+ memset(stats, 0, sizeof(*stats));
+ VP8LBitEntropyInit(bit_entropy);
+
+ for (i = 1; i < length; ++i) {
+ const uint32_t xy = X[i] + Y[i];
+ if (xy != xy_prev) {
+ GetEntropyUnrefinedHelper(xy, i, &xy_prev, &i_prev, bit_entropy, stats);
+ }
+ }
+ GetEntropyUnrefinedHelper(0, i, &xy_prev, &i_prev, bit_entropy, stats);
+
+ bit_entropy->entropy += VP8LFastSLog2(bit_entropy->sum);
+}
+
#define ASM_START \
__asm__ volatile( \
".set push \n\t" \
@@ -375,7 +419,8 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitMIPS32(void) {
VP8LFastLog2Slow = FastLog2Slow;
VP8LExtraCost = ExtraCost;
VP8LExtraCostCombined = ExtraCostCombined;
- VP8LGetEntropyUnrefinedHelper = GetEntropyUnrefinedHelper;
+ VP8LGetEntropyUnrefined = GetEntropyUnrefined;
+ VP8LGetCombinedEntropyUnrefined = GetCombinedEntropyUnrefined;
VP8LHistogramAdd = HistogramAdd;
}
diff --git a/src/3rdparty/libwebp/src/dsp/lossless_enc_msa.c b/src/3rdparty/libwebp/src/dsp/lossless_enc_msa.c
new file mode 100644
index 0000000..2f69ba3
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/lossless_enc_msa.c
@@ -0,0 +1,147 @@
+// Copyright 2016 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MSA variant of Image transform methods for lossless encoder.
+//
+// Authors: Prashant Patil (Prashant.Patil@imgtec.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_MSA)
+
+#include "./lossless.h"
+#include "./msa_macro.h"
+
+#define TRANSFORM_COLOR_8(src0, src1, dst0, dst1, c0, c1, mask0, mask1) do { \
+ v8i16 g0, g1, t0, t1, t2, t3; \
+ v4i32 t4, t5; \
+ VSHF_B2_SH(src0, src0, src1, src1, mask0, mask0, g0, g1); \
+ DOTP_SB2_SH(g0, g1, c0, c0, t0, t1); \
+ SRAI_H2_SH(t0, t1, 5); \
+ t0 = __msa_subv_h((v8i16)src0, t0); \
+ t1 = __msa_subv_h((v8i16)src1, t1); \
+ t4 = __msa_srli_w((v4i32)src0, 16); \
+ t5 = __msa_srli_w((v4i32)src1, 16); \
+ DOTP_SB2_SH(t4, t5, c1, c1, t2, t3); \
+ SRAI_H2_SH(t2, t3, 5); \
+ SUB2(t0, t2, t1, t3, t0, t1); \
+ VSHF_B2_UB(src0, t0, src1, t1, mask1, mask1, dst0, dst1); \
+} while (0)
+
+#define TRANSFORM_COLOR_4(src, dst, c0, c1, mask0, mask1) do { \
+ const v16i8 g0 = VSHF_SB(src, src, mask0); \
+ v8i16 t0 = __msa_dotp_s_h(c0, g0); \
+ v8i16 t1; \
+ v4i32 t2; \
+ t0 = SRAI_H(t0, 5); \
+ t0 = __msa_subv_h((v8i16)src, t0); \
+ t2 = __msa_srli_w((v4i32)src, 16); \
+ t1 = __msa_dotp_s_h(c1, (v16i8)t2); \
+ t1 = SRAI_H(t1, 5); \
+ t0 = t0 - t1; \
+ dst = VSHF_UB(src, t0, mask1); \
+} while (0)
+
+static void TransformColor(const VP8LMultipliers* const m, uint32_t* data,
+ int num_pixels) {
+ v16u8 src0, dst0;
+ const v16i8 g2br = (v16i8)__msa_fill_w(m->green_to_blue_ |
+ (m->green_to_red_ << 16));
+ const v16i8 r2b = (v16i8)__msa_fill_w(m->red_to_blue_);
+ const v16u8 mask0 = { 1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255,
+ 13, 255, 13, 255 };
+ const v16u8 mask1 = { 16, 1, 18, 3, 20, 5, 22, 7, 24, 9, 26, 11,
+ 28, 13, 30, 15 };
+
+ while (num_pixels >= 8) {
+ v16u8 src1, dst1;
+ LD_UB2(data, 4, src0, src1);
+ TRANSFORM_COLOR_8(src0, src1, dst0, dst1, g2br, r2b, mask0, mask1);
+ ST_UB2(dst0, dst1, data, 4);
+ data += 8;
+ num_pixels -= 8;
+ }
+ if (num_pixels > 0) {
+ if (num_pixels >= 4) {
+ src0 = LD_UB(data);
+ TRANSFORM_COLOR_4(src0, dst0, g2br, r2b, mask0, mask1);
+ ST_UB(dst0, data);
+ data += 4;
+ num_pixels -= 4;
+ }
+ if (num_pixels > 0) {
+ src0 = LD_UB(data);
+ TRANSFORM_COLOR_4(src0, dst0, g2br, r2b, mask0, mask1);
+ if (num_pixels == 3) {
+ const uint64_t pix_d = __msa_copy_s_d((v2i64)dst0, 0);
+ const uint32_t pix_w = __msa_copy_s_w((v4i32)dst0, 2);
+ SD(pix_d, data + 0);
+ SW(pix_w, data + 2);
+ } else if (num_pixels == 2) {
+ const uint64_t pix_d = __msa_copy_s_d((v2i64)dst0, 0);
+ SD(pix_d, data);
+ } else {
+ const uint32_t pix_w = __msa_copy_s_w((v4i32)dst0, 0);
+ SW(pix_w, data);
+ }
+ }
+ }
+}
+
+static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
+ int i;
+ uint8_t* ptemp_data = (uint8_t*)argb_data;
+ v16u8 src0, dst0, tmp0;
+ const v16u8 mask = { 1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255,
+ 13, 255, 13, 255 };
+
+ while (num_pixels >= 8) {
+ v16u8 src1, dst1, tmp1;
+ LD_UB2(ptemp_data, 16, src0, src1);
+ VSHF_B2_UB(src0, src1, src1, src0, mask, mask, tmp0, tmp1);
+ SUB2(src0, tmp0, src1, tmp1, dst0, dst1);
+ ST_UB2(dst0, dst1, ptemp_data, 16);
+ ptemp_data += 8 * 4;
+ num_pixels -= 8;
+ }
+ if (num_pixels > 0) {
+ if (num_pixels >= 4) {
+ src0 = LD_UB(ptemp_data);
+ tmp0 = VSHF_UB(src0, src0, mask);
+ dst0 = src0 - tmp0;
+ ST_UB(dst0, ptemp_data);
+ ptemp_data += 4 * 4;
+ num_pixels -= 4;
+ }
+ for (i = 0; i < num_pixels; i++) {
+ const uint8_t b = ptemp_data[0];
+ const uint8_t g = ptemp_data[1];
+ const uint8_t r = ptemp_data[2];
+ ptemp_data[0] = (b - g) & 0xff;
+ ptemp_data[2] = (r - g) & 0xff;
+ ptemp_data += 4;
+ }
+ }
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8LEncDspInitMSA(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitMSA(void) {
+ VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
+ VP8LTransformColor = TransformColor;
+}
+
+#else // !WEBP_USE_MSA
+
+WEBP_DSP_INIT_STUB(VP8LEncDspInitMSA)
+
+#endif // WEBP_USE_MSA
diff --git a/src/3rdparty/libwebp/src/dsp/lossless_enc_sse2.c b/src/3rdparty/libwebp/src/dsp/lossless_enc_sse2.c
index 7c894e7..8ad85d9 100644
--- a/src/3rdparty/libwebp/src/dsp/lossless_enc_sse2.c
+++ b/src/3rdparty/libwebp/src/dsp/lossless_enc_sse2.c
@@ -17,6 +17,8 @@
#include <assert.h>
#include <emmintrin.h>
#include "./lossless.h"
+#include "./common_sse2.h"
+#include "./lossless_common.h"
// For sign-extended multiplying constants, pre-shifted by 5:
#define CST_5b(X) (((int16_t)((uint16_t)X << 8)) >> 5)
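// CST_5b(X) sign-extends the low byte of X and scales it by 8:
//   ((int16_t)((uint16_t)X << 8)) >> 5  ==  (int8_t)X * 256 >> 5
//                                       ==  (int8_t)X * 8.
// Paired with _mm_mulhi_epi16 against a channel value kept in the high byte
// of a 16-bit lane (v << 8), the ">> 16" of mulhi then yields
//   (v << 8) * ((int8_t)X * 8) >> 16  ==  (v * (int8_t)X) >> 5,
// i.e. exactly ColorTransformDelta. (The lane layout is inferred from the
// surrounding code, which is outside this hunk.)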
@@ -35,7 +37,9 @@ static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
_mm_storeu_si128((__m128i*)&argb_data[i], out);
}
// fallthrough and finish off with plain-C
- VP8LSubtractGreenFromBlueAndRed_C(argb_data + i, num_pixels - i);
+ if (i != num_pixels) {
+ VP8LSubtractGreenFromBlueAndRed_C(argb_data + i, num_pixels - i);
+ }
}
//------------------------------------------------------------------------------
@@ -69,7 +73,9 @@ static void TransformColor(const VP8LMultipliers* const m,
_mm_storeu_si128((__m128i*)&argb_data[i], out);
}
// fallthrough and finish off with plain-C
- VP8LTransformColor_C(m, argb_data + i, num_pixels - i);
+ if (i != num_pixels) {
+ VP8LTransformColor_C(m, argb_data + i, num_pixels - i);
+ }
}
//------------------------------------------------------------------------------
@@ -364,8 +370,9 @@ static int VectorMismatch(const uint32_t* const array1,
if (length >= 8 &&
_mm_movemask_epi8(_mm_cmpeq_epi32(
_mm_loadu_si128((const __m128i*)&array1[4]),
- _mm_loadu_si128((const __m128i*)&array2[4]))) == 0xffff)
+ _mm_loadu_si128((const __m128i*)&array2[4]))) == 0xffff) {
match_len = 8;
+ }
}
}
@@ -375,6 +382,295 @@ static int VectorMismatch(const uint32_t* const array1,
return match_len;
}
+// Bundles multiple (1, 2, 4 or 8) pixels into a single pixel.
+static void BundleColorMap_SSE2(const uint8_t* const row, int width, int xbits,
+ uint32_t* dst) {
+ int x;
+ assert(xbits >= 0);
+ assert(xbits <= 3);
+ switch (xbits) {
+ case 0: {
+ const __m128i ff = _mm_set1_epi16(0xff00);
+ const __m128i zero = _mm_setzero_si128();
+ // Store 0xff000000 | (row[x] << 8).
+ for (x = 0; x + 16 <= width; x += 16, dst += 16) {
+ const __m128i in = _mm_loadu_si128((const __m128i*)&row[x]);
+ const __m128i in_lo = _mm_unpacklo_epi8(zero, in);
+ const __m128i dst0 = _mm_unpacklo_epi16(in_lo, ff);
+ const __m128i dst1 = _mm_unpackhi_epi16(in_lo, ff);
+ const __m128i in_hi = _mm_unpackhi_epi8(zero, in);
+ const __m128i dst2 = _mm_unpacklo_epi16(in_hi, ff);
+ const __m128i dst3 = _mm_unpackhi_epi16(in_hi, ff);
+ _mm_storeu_si128((__m128i*)&dst[0], dst0);
+ _mm_storeu_si128((__m128i*)&dst[4], dst1);
+ _mm_storeu_si128((__m128i*)&dst[8], dst2);
+ _mm_storeu_si128((__m128i*)&dst[12], dst3);
+ }
+ break;
+ }
+ case 1: {
+ const __m128i ff = _mm_set1_epi16(0xff00);
+ const __m128i mul = _mm_set1_epi16(0x110);
+ for (x = 0; x + 16 <= width; x += 16, dst += 8) {
+ // 0a0b | (where a/b are 4 bits).
+ const __m128i in = _mm_loadu_si128((const __m128i*)&row[x]);
+ const __m128i tmp = _mm_mullo_epi16(in, mul); // aba0
+ const __m128i pack = _mm_and_si128(tmp, ff); // ab00
+ const __m128i dst0 = _mm_unpacklo_epi16(pack, ff);
+ const __m128i dst1 = _mm_unpackhi_epi16(pack, ff);
+ _mm_storeu_si128((__m128i*)&dst[0], dst0);
+ _mm_storeu_si128((__m128i*)&dst[4], dst1);
+ }
+ break;
+ }
+ case 2: {
+ const __m128i mask_or = _mm_set1_epi32(0xff000000);
+ const __m128i mul_cst = _mm_set1_epi16(0x0104);
+ const __m128i mask_mul = _mm_set1_epi16(0x0f00);
+ for (x = 0; x + 16 <= width; x += 16, dst += 4) {
+ // 000a000b000c000d | (where a/b/c/d are 2 bits).
+ const __m128i in = _mm_loadu_si128((const __m128i*)&row[x]);
+ const __m128i mul = _mm_mullo_epi16(in, mul_cst); // 00ab00b000cd00d0
+ const __m128i tmp = _mm_and_si128(mul, mask_mul); // 00ab000000cd0000
+ const __m128i shift = _mm_srli_epi32(tmp, 12); // 00000000ab000000
+ const __m128i pack = _mm_or_si128(shift, tmp); // 00000000abcd0000
+ // Convert to 0xff00**00.
+ const __m128i res = _mm_or_si128(pack, mask_or);
+ _mm_storeu_si128((__m128i*)dst, res);
+ }
+ break;
+ }
+ default: {
+ assert(xbits == 3);
+ for (x = 0; x + 16 <= width; x += 16, dst += 2) {
+ // 0000000a00000000b... | (where a/b are 1 bit).
+ const __m128i in = _mm_loadu_si128((const __m128i*)&row[x]);
+ const __m128i shift = _mm_slli_epi64(in, 7);
+ const uint32_t move = _mm_movemask_epi8(shift);
+ dst[0] = 0xff000000 | ((move & 0xff) << 8);
+ dst[1] = 0xff000000 | (move & 0xff00);
+ }
+ break;
+ }
+ }
+ if (x != width) {
+ VP8LBundleColorMap_C(row + x, width - x, xbits, dst);
+ }
+}
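// Scalar view of the bundling performed above, assuming the layout of the
// plain-C version: with xbits = b, 1 << b consecutive palette indices of
// (8 >> b) bits each are packed into the green channel of one output pixel,
// with alpha forced to 0xff. E.g. xbits = 2 packs four 2-bit indices
// i0..i3 (i0 in the low bits) as
//   0xff000000 | ((i0 | i1 << 2 | i2 << 4 | i3 << 6) << 8).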
+
+//------------------------------------------------------------------------------
+// Batch version of Predictor Transform subtraction
+
+static WEBP_INLINE void Average2_m128i(const __m128i* const a0,
+ const __m128i* const a1,
+ __m128i* const avg) {
+ // (a + b) >> 1 = ((a + b + 1) >> 1) - ((a ^ b) & 1)
+ const __m128i ones = _mm_set1_epi8(1);
+ const __m128i avg1 = _mm_avg_epu8(*a0, *a1);
+ const __m128i one = _mm_and_si128(_mm_xor_si128(*a0, *a1), ones);
+ *avg = _mm_sub_epi8(avg1, one);
+}
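// Scalar check of the identity above: _mm_avg_epu8 computes the rounded-up
// average (a + b + 1) >> 1; subtracting ((a ^ b) & 1), the parity of a + b,
// turns it into the truncating (a + b) >> 1 that the predictors use.
// E.g. a = 5, b = 8: avg_epu8 gives 7, (5 ^ 8) & 1 == 1, and 7 - 1 == 6
// == (5 + 8) >> 1.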
+
+// Predictor0: ARGB_BLACK.
+static void PredictorSub0_SSE2(const uint32_t* in, const uint32_t* upper,
+ int num_pixels, uint32_t* out) {
+ int i;
+ const __m128i black = _mm_set1_epi32(ARGB_BLACK);
+ for (i = 0; i + 4 <= num_pixels; i += 4) {
+ const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
+ const __m128i res = _mm_sub_epi8(src, black);
+ _mm_storeu_si128((__m128i*)&out[i], res);
+ }
+ if (i != num_pixels) {
+ VP8LPredictorsSub_C[0](in + i, upper + i, num_pixels - i, out + i);
+ }
+}
+
+#define GENERATE_PREDICTOR_1(X, IN) \
+static void PredictorSub##X##_SSE2(const uint32_t* in, const uint32_t* upper, \
+ int num_pixels, uint32_t* out) { \
+ int i; \
+ for (i = 0; i + 4 <= num_pixels; i += 4) { \
+ const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); \
+ const __m128i pred = _mm_loadu_si128((const __m128i*)&(IN)); \
+ const __m128i res = _mm_sub_epi8(src, pred); \
+ _mm_storeu_si128((__m128i*)&out[i], res); \
+ } \
+ if (i != num_pixels) { \
+ VP8LPredictorsSub_C[(X)](in + i, upper + i, num_pixels - i, out + i); \
+ } \
+}
+
+GENERATE_PREDICTOR_1(1, in[i - 1]) // Predictor1: L
+GENERATE_PREDICTOR_1(2, upper[i]) // Predictor2: T
+GENERATE_PREDICTOR_1(3, upper[i + 1]) // Predictor3: TR
+GENERATE_PREDICTOR_1(4, upper[i - 1]) // Predictor4: TL
+#undef GENERATE_PREDICTOR_1
+
+// Predictor5: avg2(avg2(L, TR), T)
+static void PredictorSub5_SSE2(const uint32_t* in, const uint32_t* upper,
+ int num_pixels, uint32_t* out) {
+ int i;
+ for (i = 0; i + 4 <= num_pixels; i += 4) {
+ const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]);
+ const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
+ const __m128i TR = _mm_loadu_si128((const __m128i*)&upper[i + 1]);
+ const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
+ __m128i avg, pred, res;
+ Average2_m128i(&L, &TR, &avg);
+ Average2_m128i(&avg, &T, &pred);
+ res = _mm_sub_epi8(src, pred);
+ _mm_storeu_si128((__m128i*)&out[i], res);
+ }
+ if (i != num_pixels) {
+ VP8LPredictorsSub_C[5](in + i, upper + i, num_pixels - i, out + i);
+ }
+}
+
+#define GENERATE_PREDICTOR_2(X, A, B) \
+static void PredictorSub##X##_SSE2(const uint32_t* in, const uint32_t* upper, \
+ int num_pixels, uint32_t* out) { \
+ int i; \
+ for (i = 0; i + 4 <= num_pixels; i += 4) { \
+ const __m128i tA = _mm_loadu_si128((const __m128i*)&(A)); \
+ const __m128i tB = _mm_loadu_si128((const __m128i*)&(B)); \
+ const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); \
+ __m128i pred, res; \
+ Average2_m128i(&tA, &tB, &pred); \
+ res = _mm_sub_epi8(src, pred); \
+ _mm_storeu_si128((__m128i*)&out[i], res); \
+ } \
+ if (i != num_pixels) { \
+ VP8LPredictorsSub_C[(X)](in + i, upper + i, num_pixels - i, out + i); \
+ } \
+}
+
+GENERATE_PREDICTOR_2(6, in[i - 1], upper[i - 1]) // Predictor6: avg(L, TL)
+GENERATE_PREDICTOR_2(7, in[i - 1], upper[i]) // Predictor7: avg(L, T)
+GENERATE_PREDICTOR_2(8, upper[i - 1], upper[i]) // Predictor8: avg(TL, T)
+GENERATE_PREDICTOR_2(9, upper[i], upper[i + 1])    // Predictor9: avg(T, TR)
+#undef GENERATE_PREDICTOR_2
+
+// Predictor10: avg(avg(L,TL), avg(T, TR)).
+static void PredictorSub10_SSE2(const uint32_t* in, const uint32_t* upper,
+ int num_pixels, uint32_t* out) {
+ int i;
+ for (i = 0; i + 4 <= num_pixels; i += 4) {
+ const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]);
+ const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
+ const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
+ const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
+ const __m128i TR = _mm_loadu_si128((const __m128i*)&upper[i + 1]);
+ __m128i avgTTR, avgLTL, avg, res;
+ Average2_m128i(&T, &TR, &avgTTR);
+ Average2_m128i(&L, &TL, &avgLTL);
+ Average2_m128i(&avgTTR, &avgLTL, &avg);
+ res = _mm_sub_epi8(src, avg);
+ _mm_storeu_si128((__m128i*)&out[i], res);
+ }
+ if (i != num_pixels) {
+ VP8LPredictorsSub_C[10](in + i, upper + i, num_pixels - i, out + i);
+ }
+}
+
+// Predictor11: select.
+static void GetSumAbsDiff32(const __m128i* const A, const __m128i* const B,
+ __m128i* const out) {
+ // We can unpack with any value on the upper 32 bits, provided it's the same
+  // on both operands (so that their sum of abs diffs is zero). Here we use *A.
+ const __m128i A_lo = _mm_unpacklo_epi32(*A, *A);
+ const __m128i B_lo = _mm_unpacklo_epi32(*B, *A);
+ const __m128i A_hi = _mm_unpackhi_epi32(*A, *A);
+ const __m128i B_hi = _mm_unpackhi_epi32(*B, *A);
+ const __m128i s_lo = _mm_sad_epu8(A_lo, B_lo);
+ const __m128i s_hi = _mm_sad_epu8(A_hi, B_hi);
+ *out = _mm_packs_epi32(s_lo, s_hi);
+}
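// Per 32-bit pixel, GetSumAbsDiff32() computes the sum of absolute byte
// differences over the four channels:
//   pa = |a3 - b3| + |a2 - b2| + |a1 - b1| + |a0 - b0|
// _mm_sad_epu8 sums eight byte differences per 64-bit half, so each pixel
// is first widened to 64 bits, padded with identical words that cancel out.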
+
+static void PredictorSub11_SSE2(const uint32_t* in, const uint32_t* upper,
+ int num_pixels, uint32_t* out) {
+ int i;
+ for (i = 0; i + 4 <= num_pixels; i += 4) {
+ const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]);
+ const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
+ const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
+ const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
+ __m128i pa, pb;
+ GetSumAbsDiff32(&T, &TL, &pa); // pa = sum |T-TL|
+ GetSumAbsDiff32(&L, &TL, &pb); // pb = sum |L-TL|
+ {
+ const __m128i mask = _mm_cmpgt_epi32(pb, pa);
+ const __m128i A = _mm_and_si128(mask, L);
+ const __m128i B = _mm_andnot_si128(mask, T);
+      const __m128i pred = _mm_or_si128(A, B);  // pred = (pb > pa) ? L : T
+ const __m128i res = _mm_sub_epi8(src, pred);
+ _mm_storeu_si128((__m128i*)&out[i], res);
+ }
+ }
+ if (i != num_pixels) {
+ VP8LPredictorsSub_C[11](in + i, upper + i, num_pixels - i, out + i);
+ }
+}
+
+// Predictor12: ClampedSubSubtractFull.
+static void PredictorSub12_SSE2(const uint32_t* in, const uint32_t* upper,
+ int num_pixels, uint32_t* out) {
+ int i;
+ const __m128i zero = _mm_setzero_si128();
+ for (i = 0; i + 4 <= num_pixels; i += 4) {
+ const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
+ const __m128i L = _mm_loadu_si128((const __m128i*)&in[i - 1]);
+ const __m128i L_lo = _mm_unpacklo_epi8(L, zero);
+ const __m128i L_hi = _mm_unpackhi_epi8(L, zero);
+ const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
+ const __m128i T_lo = _mm_unpacklo_epi8(T, zero);
+ const __m128i T_hi = _mm_unpackhi_epi8(T, zero);
+ const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
+ const __m128i TL_lo = _mm_unpacklo_epi8(TL, zero);
+ const __m128i TL_hi = _mm_unpackhi_epi8(TL, zero);
+ const __m128i diff_lo = _mm_sub_epi16(T_lo, TL_lo);
+ const __m128i diff_hi = _mm_sub_epi16(T_hi, TL_hi);
+ const __m128i pred_lo = _mm_add_epi16(L_lo, diff_lo);
+ const __m128i pred_hi = _mm_add_epi16(L_hi, diff_hi);
+ const __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
+ const __m128i res = _mm_sub_epi8(src, pred);
+ _mm_storeu_si128((__m128i*)&out[i], res);
+ }
+ if (i != num_pixels) {
+ VP8LPredictorsSub_C[12](in + i, upper + i, num_pixels - i, out + i);
+ }
+}
+
+// Predictor13: ClampedAddSubtractHalf
+static void PredictorSub13_SSE2(const uint32_t* in, const uint32_t* upper,
+ int num_pixels, uint32_t* out) {
+ int i;
+ const __m128i zero = _mm_setzero_si128();
+ for (i = 0; i + 2 <= num_pixels; i += 2) {
+    // We can only process two pixels at a time.
+ const __m128i L = _mm_loadl_epi64((const __m128i*)&in[i - 1]);
+ const __m128i src = _mm_loadl_epi64((const __m128i*)&in[i]);
+ const __m128i T = _mm_loadl_epi64((const __m128i*)&upper[i]);
+ const __m128i TL = _mm_loadl_epi64((const __m128i*)&upper[i - 1]);
+ const __m128i L_lo = _mm_unpacklo_epi8(L, zero);
+ const __m128i T_lo = _mm_unpacklo_epi8(T, zero);
+ const __m128i TL_lo = _mm_unpacklo_epi8(TL, zero);
+ const __m128i sum = _mm_add_epi16(T_lo, L_lo);
+ const __m128i avg = _mm_srli_epi16(sum, 1);
+ const __m128i A1 = _mm_sub_epi16(avg, TL_lo);
+ const __m128i bit_fix = _mm_cmpgt_epi16(TL_lo, avg);
+ const __m128i A2 = _mm_sub_epi16(A1, bit_fix);
+ const __m128i A3 = _mm_srai_epi16(A2, 1);
+ const __m128i A4 = _mm_add_epi16(avg, A3);
+ const __m128i pred = _mm_packus_epi16(A4, A4);
+ const __m128i res = _mm_sub_epi8(src, pred);
+ _mm_storel_epi64((__m128i*)&out[i], res);
+ }
+ if (i != num_pixels) {
+ VP8LPredictorsSub_C[13](in + i, upper + i, num_pixels - i, out + i);
+ }
+}
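// Scalar equivalent of the lane arithmetic above (one channel of
// ClampedAddSubtractHalf): with ave = (L + T) >> 1,
//   pred = clip255(ave + (ave - TL) / 2)
// where bit_fix reproduces C's truncating division for negative (ave - TL):
// srai alone floors, and adding 1 first when TL > ave restores truncation.
// Example: L = 100, T = 20, TL = 80 -> ave = 60, (60 - 80) / 2 = -10,
// pred = 50.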
+
//------------------------------------------------------------------------------
// Entry point
@@ -388,6 +684,24 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE2(void) {
VP8LHistogramAdd = HistogramAdd;
VP8LCombinedShannonEntropy = CombinedShannonEntropy;
VP8LVectorMismatch = VectorMismatch;
+ VP8LBundleColorMap = BundleColorMap_SSE2;
+
+ VP8LPredictorsSub[0] = PredictorSub0_SSE2;
+ VP8LPredictorsSub[1] = PredictorSub1_SSE2;
+ VP8LPredictorsSub[2] = PredictorSub2_SSE2;
+ VP8LPredictorsSub[3] = PredictorSub3_SSE2;
+ VP8LPredictorsSub[4] = PredictorSub4_SSE2;
+ VP8LPredictorsSub[5] = PredictorSub5_SSE2;
+ VP8LPredictorsSub[6] = PredictorSub6_SSE2;
+ VP8LPredictorsSub[7] = PredictorSub7_SSE2;
+ VP8LPredictorsSub[8] = PredictorSub8_SSE2;
+ VP8LPredictorsSub[9] = PredictorSub9_SSE2;
+ VP8LPredictorsSub[10] = PredictorSub10_SSE2;
+ VP8LPredictorsSub[11] = PredictorSub11_SSE2;
+ VP8LPredictorsSub[12] = PredictorSub12_SSE2;
+ VP8LPredictorsSub[13] = PredictorSub13_SSE2;
+ VP8LPredictorsSub[14] = PredictorSub0_SSE2; // <- padding security sentinels
+ VP8LPredictorsSub[15] = PredictorSub0_SSE2;
}
#else // !WEBP_USE_SSE2
diff --git a/src/3rdparty/libwebp/src/dsp/lossless_enc_sse41.c b/src/3rdparty/libwebp/src/dsp/lossless_enc_sse41.c
index 3e49319..821057c 100644
--- a/src/3rdparty/libwebp/src/dsp/lossless_enc_sse41.c
+++ b/src/3rdparty/libwebp/src/dsp/lossless_enc_sse41.c
@@ -32,7 +32,9 @@ static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
_mm_storeu_si128((__m128i*)&argb_data[i], out);
}
// fallthrough and finish off with plain-C
- VP8LSubtractGreenFromBlueAndRed_C(argb_data + i, num_pixels - i);
+ if (i != num_pixels) {
+ VP8LSubtractGreenFromBlueAndRed_C(argb_data + i, num_pixels - i);
+ }
}
//------------------------------------------------------------------------------
diff --git a/src/3rdparty/libwebp/src/dsp/lossless_mips_dsp_r2.c b/src/3rdparty/libwebp/src/dsp/lossless_mips_dsp_r2.c
index 90aed7f..2984ce8 100644
--- a/src/3rdparty/libwebp/src/dsp/lossless_mips_dsp_r2.c
+++ b/src/3rdparty/libwebp/src/dsp/lossless_mips_dsp_r2.c
@@ -17,6 +17,7 @@
#if defined(WEBP_USE_MIPS_DSP_R2)
#include "./lossless.h"
+#include "./lossless_common.h"
#define MAP_COLOR_FUNCS(FUNC_NAME, TYPE, GET_INDEX, GET_VALUE) \
static void FUNC_NAME(const TYPE* src, \
@@ -227,25 +228,27 @@ static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
// Add green to blue and red channels (i.e. perform the inverse transform of
// 'subtract green').
-static void AddGreenToBlueAndRed(uint32_t* data, int num_pixels) {
+static void AddGreenToBlueAndRed(const uint32_t* src, int num_pixels,
+ uint32_t* dst) {
uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
- uint32_t* const p_loop1_end = data + (num_pixels & ~3);
- uint32_t* const p_loop2_end = data + num_pixels;
+ const uint32_t* const p_loop1_end = src + (num_pixels & ~3);
+ const uint32_t* const p_loop2_end = src + num_pixels;
__asm__ volatile (
".set push \n\t"
".set noreorder \n\t"
- "beq %[data], %[p_loop1_end], 3f \n\t"
+ "beq %[src], %[p_loop1_end], 3f \n\t"
" nop \n\t"
"0: \n\t"
- "lw %[temp0], 0(%[data]) \n\t"
- "lw %[temp1], 4(%[data]) \n\t"
- "lw %[temp2], 8(%[data]) \n\t"
- "lw %[temp3], 12(%[data]) \n\t"
+ "lw %[temp0], 0(%[src]) \n\t"
+ "lw %[temp1], 4(%[src]) \n\t"
+ "lw %[temp2], 8(%[src]) \n\t"
+ "lw %[temp3], 12(%[src]) \n\t"
"ext %[temp4], %[temp0], 8, 8 \n\t"
"ext %[temp5], %[temp1], 8, 8 \n\t"
"ext %[temp6], %[temp2], 8, 8 \n\t"
"ext %[temp7], %[temp3], 8, 8 \n\t"
- "addiu %[data], %[data], 16 \n\t"
+ "addiu %[src], %[src], 16 \n\t"
+ "addiu %[dst], %[dst], 16 \n\t"
"replv.ph %[temp4], %[temp4] \n\t"
"replv.ph %[temp5], %[temp5] \n\t"
"replv.ph %[temp6], %[temp6] \n\t"
@@ -254,44 +257,47 @@ static void AddGreenToBlueAndRed(uint32_t* data, int num_pixels) {
"addu.qb %[temp1], %[temp1], %[temp5] \n\t"
"addu.qb %[temp2], %[temp2], %[temp6] \n\t"
"addu.qb %[temp3], %[temp3], %[temp7] \n\t"
- "sw %[temp0], -16(%[data]) \n\t"
- "sw %[temp1], -12(%[data]) \n\t"
- "sw %[temp2], -8(%[data]) \n\t"
- "bne %[data], %[p_loop1_end], 0b \n\t"
- " sw %[temp3], -4(%[data]) \n\t"
+ "sw %[temp0], -16(%[dst]) \n\t"
+ "sw %[temp1], -12(%[dst]) \n\t"
+ "sw %[temp2], -8(%[dst]) \n\t"
+ "bne %[src], %[p_loop1_end], 0b \n\t"
+ " sw %[temp3], -4(%[dst]) \n\t"
"3: \n\t"
- "beq %[data], %[p_loop2_end], 2f \n\t"
+ "beq %[src], %[p_loop2_end], 2f \n\t"
" nop \n\t"
"1: \n\t"
- "lw %[temp0], 0(%[data]) \n\t"
- "addiu %[data], %[data], 4 \n\t"
+ "lw %[temp0], 0(%[src]) \n\t"
+ "addiu %[src], %[src], 4 \n\t"
+ "addiu %[dst], %[dst], 4 \n\t"
"ext %[temp4], %[temp0], 8, 8 \n\t"
"replv.ph %[temp4], %[temp4] \n\t"
"addu.qb %[temp0], %[temp0], %[temp4] \n\t"
- "bne %[data], %[p_loop2_end], 1b \n\t"
- " sw %[temp0], -4(%[data]) \n\t"
+ "bne %[src], %[p_loop2_end], 1b \n\t"
+ " sw %[temp0], -4(%[dst]) \n\t"
"2: \n\t"
".set pop \n\t"
- : [data]"+&r"(data), [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
- [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), [temp4]"=&r"(temp4),
- [temp5]"=&r"(temp5), [temp6]"=&r"(temp6), [temp7]"=&r"(temp7)
+ : [dst]"+&r"(dst), [src]"+&r"(src), [temp0]"=&r"(temp0),
+ [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
+ [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), [temp6]"=&r"(temp6),
+ [temp7]"=&r"(temp7)
: [p_loop1_end]"r"(p_loop1_end), [p_loop2_end]"r"(p_loop2_end)
: "memory"
);
}
static void TransformColorInverse(const VP8LMultipliers* const m,
- uint32_t* data, int num_pixels) {
+ const uint32_t* src, int num_pixels,
+ uint32_t* dst) {
int temp0, temp1, temp2, temp3, temp4, temp5;
uint32_t argb, argb1, new_red;
const uint32_t G_to_R = m->green_to_red_;
const uint32_t G_to_B = m->green_to_blue_;
const uint32_t R_to_B = m->red_to_blue_;
- uint32_t* const p_loop_end = data + (num_pixels & ~1);
+ const uint32_t* const p_loop_end = src + (num_pixels & ~1);
__asm__ volatile (
".set push \n\t"
".set noreorder \n\t"
- "beq %[data], %[p_loop_end], 1f \n\t"
+ "beq %[src], %[p_loop_end], 1f \n\t"
" nop \n\t"
"replv.ph %[temp0], %[G_to_R] \n\t"
"replv.ph %[temp1], %[G_to_B] \n\t"
@@ -303,9 +309,12 @@ static void TransformColorInverse(const VP8LMultipliers* const m,
"shra.ph %[temp1], %[temp1], 8 \n\t"
"shra.ph %[temp2], %[temp2], 8 \n\t"
"0: \n\t"
- "lw %[argb], 0(%[data]) \n\t"
- "lw %[argb1], 4(%[data]) \n\t"
- "addiu %[data], %[data], 8 \n\t"
+ "lw %[argb], 0(%[src]) \n\t"
+ "lw %[argb1], 4(%[src]) \n\t"
+ "sw %[argb], 0(%[dst]) \n\t"
+ "sw %[argb1], 4(%[dst]) \n\t"
+ "addiu %[src], %[src], 8 \n\t"
+ "addiu %[dst], %[dst], 8 \n\t"
"precrq.qb.ph %[temp3], %[argb], %[argb1] \n\t"
"preceu.ph.qbra %[temp3], %[temp3] \n\t"
"shll.ph %[temp3], %[temp3], 8 \n\t"
@@ -322,29 +331,29 @@ static void TransformColorInverse(const VP8LMultipliers* const m,
"shll.ph %[temp4], %[temp5], 8 \n\t"
"shra.ph %[temp4], %[temp4], 8 \n\t"
"mul.ph %[temp4], %[temp4], %[temp2] \n\t"
- "sb %[temp5], -2(%[data]) \n\t"
+ "sb %[temp5], -2(%[dst]) \n\t"
"sra %[temp5], %[temp5], 16 \n\t"
"shra.ph %[temp4], %[temp4], 5 \n\t"
"addu.ph %[argb1], %[argb1], %[temp4] \n\t"
"preceu.ph.qbra %[temp3], %[argb1] \n\t"
- "sb %[temp5], -6(%[data]) \n\t"
- "sb %[temp3], -4(%[data]) \n\t"
+ "sb %[temp5], -6(%[dst]) \n\t"
+ "sb %[temp3], -4(%[dst]) \n\t"
"sra %[temp3], %[temp3], 16 \n\t"
- "bne %[data], %[p_loop_end], 0b \n\t"
- " sb %[temp3], -8(%[data]) \n\t"
+ "bne %[src], %[p_loop_end], 0b \n\t"
+ " sb %[temp3], -8(%[dst]) \n\t"
"1: \n\t"
".set pop \n\t"
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
[temp3]"=&r"(temp3), [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),
[new_red]"=&r"(new_red), [argb]"=&r"(argb),
- [argb1]"=&r"(argb1), [data]"+&r"(data)
+ [argb1]"=&r"(argb1), [dst]"+&r"(dst), [src]"+&r"(src)
: [G_to_R]"r"(G_to_R), [R_to_B]"r"(R_to_B),
[G_to_B]"r"(G_to_B), [p_loop_end]"r"(p_loop_end)
: "memory", "hi", "lo"
);
// Fall-back to C-version for left-overs.
- if (num_pixels & 1) VP8LTransformColorInverse_C(m, data, 1);
+ if (num_pixels & 1) VP8LTransformColorInverse_C(m, src, 1, dst);
}
static void ConvertBGRAToRGB(const uint32_t* src,
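
The per-pixel math vectorized by TransformColorInverse, in scalar form (a
sketch of the C fallback; the (int8_t) casts supply the sign extension the asm
gets from 'replv.ph'/'shra.ph', and the multipliers are the VP8LMultipliers
fields used above):

    // Green drives red; green plus the reconstructed red drive blue.
    // Each delta is a signed 8-bit product, arithmetically shifted by 5.
    static int ColorTransformDelta(int8_t color_pred, int8_t color) {
      return ((int)color_pred * color) >> 5;
    }

    static void TransformColorInverse_Scalar(const VP8LMultipliers* const m,
                                             const uint32_t* src,
                                             int num_pixels, uint32_t* dst) {
      int i;
      for (i = 0; i < num_pixels; ++i) {
        const uint32_t argb = src[i];
        const int8_t green = (int8_t)(argb >> 8);
        int new_red = (argb >> 16) & 0xff;
        int new_blue = (int)(argb & 0xff);
        new_red += ColorTransformDelta((int8_t)m->green_to_red_, green);
        new_red &= 0xff;
        new_blue += ColorTransformDelta((int8_t)m->green_to_blue_, green);
        new_blue += ColorTransformDelta((int8_t)m->red_to_blue_,
                                        (int8_t)new_red);
        new_blue &= 0xff;
        dst[i] = (argb & 0xff00ff00u) | ((uint32_t)new_red << 16) |
                 (uint32_t)new_blue;
      }
    }
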
diff --git a/src/3rdparty/libwebp/src/dsp/lossless_msa.c b/src/3rdparty/libwebp/src/dsp/lossless_msa.c
new file mode 100644
index 0000000..f6dd564
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/lossless_msa.c
@@ -0,0 +1,355 @@
+// Copyright 2016 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MSA variant of methods for lossless decoder
+//
+// Author: Prashant Patil (prashant.patil@imgtec.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_MSA)
+
+#include "./lossless.h"
+#include "./msa_macro.h"
+
+//------------------------------------------------------------------------------
+// Colorspace conversion functions
+
+#define CONVERT16_BGRA_XXX(psrc, pdst, m0, m1, m2) do { \
+ v16u8 src0, src1, src2, src3, dst0, dst1, dst2; \
+ LD_UB4(psrc, 16, src0, src1, src2, src3); \
+ VSHF_B2_UB(src0, src1, src1, src2, m0, m1, dst0, dst1); \
+ dst2 = VSHF_UB(src2, src3, m2); \
+ ST_UB2(dst0, dst1, pdst, 16); \
+ ST_UB(dst2, pdst + 32); \
+} while (0)
+
+#define CONVERT12_BGRA_XXX(psrc, pdst, m0, m1, m2) do { \
+ uint32_t pix_w; \
+ v16u8 src0, src1, src2, dst0, dst1, dst2; \
+ LD_UB3(psrc, 16, src0, src1, src2); \
+ VSHF_B2_UB(src0, src1, src1, src2, m0, m1, dst0, dst1); \
+ dst2 = VSHF_UB(src2, src2, m2); \
+ ST_UB2(dst0, dst1, pdst, 16); \
+ pix_w = __msa_copy_s_w((v4i32)dst2, 0); \
+ SW(pix_w, pdst + 32); \
+} while (0)
+
+#define CONVERT8_BGRA_XXX(psrc, pdst, m0, m1) do { \
+ uint64_t pix_d; \
+  v16u8 src0, src1, src2 = { 0 }, dst0, dst1; \
+ LD_UB2(psrc, 16, src0, src1); \
+ VSHF_B2_UB(src0, src1, src1, src2, m0, m1, dst0, dst1); \
+ ST_UB(dst0, pdst); \
+ pix_d = __msa_copy_s_d((v2i64)dst1, 0); \
+ SD(pix_d, pdst + 16); \
+} while (0)
+
+#define CONVERT4_BGRA_XXX(psrc, pdst, m) do { \
+ const v16u8 src0 = LD_UB(psrc); \
+ const v16u8 dst0 = VSHF_UB(src0, src0, m); \
+ uint64_t pix_d = __msa_copy_s_d((v2i64)dst0, 0); \
+ uint32_t pix_w = __msa_copy_s_w((v4i32)dst0, 2); \
+ SD(pix_d, pdst + 0); \
+ SW(pix_w, pdst + 8); \
+} while (0)
+
+#define CONVERT1_BGRA_BGR(psrc, pdst) do { \
+ const int32_t b = (psrc)[0]; \
+ const int32_t g = (psrc)[1]; \
+ const int32_t r = (psrc)[2]; \
+ (pdst)[0] = b; \
+ (pdst)[1] = g; \
+ (pdst)[2] = r; \
+} while (0)
+
+#define CONVERT1_BGRA_RGB(psrc, pdst) do { \
+ const int32_t b = (psrc)[0]; \
+ const int32_t g = (psrc)[1]; \
+ const int32_t r = (psrc)[2]; \
+ (pdst)[0] = r; \
+ (pdst)[1] = g; \
+ (pdst)[2] = b; \
+} while (0)
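
The shuffle masks in these macros simply drop the alpha byte of each 4-byte
BGRA pixel: no alpha index (3, 7, 11, ..., counted across the concatenated
source vectors) ever appears in mask0/mask1/mask2. The scalar equivalent of
the BGR path, one pixel at a time (function name illustrative):

    // BGRA -> BGR, scalar view (swap bytes 0 and 2 of each pixel for RGB).
    static void BGRAToBGR_Scalar(const uint8_t* src, int num_pixels,
                                 uint8_t* dst) {
      int i;
      for (i = 0; i < num_pixels; ++i) {
        dst[3 * i + 0] = src[4 * i + 0];  // B
        dst[3 * i + 1] = src[4 * i + 1];  // G
        dst[3 * i + 2] = src[4 * i + 2];  // R; byte 3 (alpha) is dropped
      }
    }
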
+
+#define TRANSFORM_COLOR_INVERSE_8(src0, src1, dst0, dst1, \
+ c0, c1, mask0, mask1) do { \
+ v8i16 g0, g1, t0, t1, t2, t3; \
+ v4i32 t4, t5; \
+ VSHF_B2_SH(src0, src0, src1, src1, mask0, mask0, g0, g1); \
+ DOTP_SB2_SH(g0, g1, c0, c0, t0, t1); \
+ SRAI_H2_SH(t0, t1, 5); \
+ t0 = __msa_addv_h(t0, (v8i16)src0); \
+ t1 = __msa_addv_h(t1, (v8i16)src1); \
+ t4 = __msa_srli_w((v4i32)t0, 16); \
+ t5 = __msa_srli_w((v4i32)t1, 16); \
+ DOTP_SB2_SH(t4, t5, c1, c1, t2, t3); \
+ SRAI_H2_SH(t2, t3, 5); \
+ ADD2(t0, t2, t1, t3, t0, t1); \
+ VSHF_B2_UB(src0, t0, src1, t1, mask1, mask1, dst0, dst1); \
+} while (0)
+
+#define TRANSFORM_COLOR_INVERSE_4(src, dst, c0, c1, mask0, mask1) do { \
+ const v16i8 g0 = VSHF_SB(src, src, mask0); \
+ v8i16 t0 = __msa_dotp_s_h(c0, g0); \
+ v8i16 t1; \
+ v4i32 t2; \
+ t0 = SRAI_H(t0, 5); \
+ t0 = __msa_addv_h(t0, (v8i16)src); \
+ t2 = __msa_srli_w((v4i32)t0, 16); \
+ t1 = __msa_dotp_s_h(c1, (v16i8)t2); \
+ t1 = SRAI_H(t1, 5); \
+ t0 = t0 + t1; \
+ dst = VSHF_UB(src, t0, mask1); \
+} while (0)
+
+static void ConvertBGRAToRGBA(const uint32_t* src,
+ int num_pixels, uint8_t* dst) {
+ int i;
+ const uint8_t* ptemp_src = (const uint8_t*)src;
+ uint8_t* ptemp_dst = (uint8_t*)dst;
+ v16u8 src0, dst0;
+ const v16u8 mask = { 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15 };
+
+ while (num_pixels >= 8) {
+ v16u8 src1, dst1;
+ LD_UB2(ptemp_src, 16, src0, src1);
+ VSHF_B2_UB(src0, src0, src1, src1, mask, mask, dst0, dst1);
+ ST_UB2(dst0, dst1, ptemp_dst, 16);
+ ptemp_src += 32;
+ ptemp_dst += 32;
+ num_pixels -= 8;
+ }
+ if (num_pixels > 0) {
+ if (num_pixels >= 4) {
+ src0 = LD_UB(ptemp_src);
+ dst0 = VSHF_UB(src0, src0, mask);
+ ST_UB(dst0, ptemp_dst);
+ ptemp_src += 16;
+ ptemp_dst += 16;
+ num_pixels -= 4;
+ }
+ for (i = 0; i < num_pixels; i++) {
+      const uint8_t r = ptemp_src[2];
+      const uint8_t g = ptemp_src[1];
+      const uint8_t b = ptemp_src[0];
+      const uint8_t a = ptemp_src[3];
+      ptemp_dst[0] = r;
+      ptemp_dst[1] = g;
+      ptemp_dst[2] = b;
+      ptemp_dst[3] = a;
+ ptemp_src += 4;
+ ptemp_dst += 4;
+ }
+ }
+}
+
+static void ConvertBGRAToBGR(const uint32_t* src,
+ int num_pixels, uint8_t* dst) {
+ const uint8_t* ptemp_src = (const uint8_t*)src;
+ uint8_t* ptemp_dst = (uint8_t*)dst;
+ const v16u8 mask0 = { 0, 1, 2, 4, 5, 6, 8, 9, 10, 12, 13, 14,
+ 16, 17, 18, 20 };
+ const v16u8 mask1 = { 5, 6, 8, 9, 10, 12, 13, 14, 16, 17, 18, 20,
+ 21, 22, 24, 25 };
+ const v16u8 mask2 = { 10, 12, 13, 14, 16, 17, 18, 20, 21, 22, 24, 25,
+ 26, 28, 29, 30 };
+
+ while (num_pixels >= 16) {
+ CONVERT16_BGRA_XXX(ptemp_src, ptemp_dst, mask0, mask1, mask2);
+ ptemp_src += 64;
+ ptemp_dst += 48;
+ num_pixels -= 16;
+ }
+ if (num_pixels > 0) {
+ if (num_pixels >= 12) {
+ CONVERT12_BGRA_XXX(ptemp_src, ptemp_dst, mask0, mask1, mask2);
+ ptemp_src += 48;
+ ptemp_dst += 36;
+ num_pixels -= 12;
+ } else if (num_pixels >= 8) {
+ CONVERT8_BGRA_XXX(ptemp_src, ptemp_dst, mask0, mask1);
+ ptemp_src += 32;
+ ptemp_dst += 24;
+ num_pixels -= 8;
+ } else if (num_pixels >= 4) {
+ CONVERT4_BGRA_XXX(ptemp_src, ptemp_dst, mask0);
+ ptemp_src += 16;
+ ptemp_dst += 12;
+ num_pixels -= 4;
+ }
+ if (num_pixels == 3) {
+ CONVERT1_BGRA_BGR(ptemp_src + 0, ptemp_dst + 0);
+ CONVERT1_BGRA_BGR(ptemp_src + 4, ptemp_dst + 3);
+ CONVERT1_BGRA_BGR(ptemp_src + 8, ptemp_dst + 6);
+ } else if (num_pixels == 2) {
+ CONVERT1_BGRA_BGR(ptemp_src + 0, ptemp_dst + 0);
+ CONVERT1_BGRA_BGR(ptemp_src + 4, ptemp_dst + 3);
+ } else if (num_pixels == 1) {
+ CONVERT1_BGRA_BGR(ptemp_src, ptemp_dst);
+ }
+ }
+}
+
+static void ConvertBGRAToRGB(const uint32_t* src,
+ int num_pixels, uint8_t* dst) {
+ const uint8_t* ptemp_src = (const uint8_t*)src;
+ uint8_t* ptemp_dst = (uint8_t*)dst;
+ const v16u8 mask0 = { 2, 1, 0, 6, 5, 4, 10, 9, 8, 14, 13, 12,
+ 18, 17, 16, 22 };
+ const v16u8 mask1 = { 5, 4, 10, 9, 8, 14, 13, 12, 18, 17, 16, 22,
+ 21, 20, 26, 25 };
+ const v16u8 mask2 = { 8, 14, 13, 12, 18, 17, 16, 22, 21, 20, 26, 25,
+ 24, 30, 29, 28 };
+
+ while (num_pixels >= 16) {
+ CONVERT16_BGRA_XXX(ptemp_src, ptemp_dst, mask0, mask1, mask2);
+ ptemp_src += 64;
+ ptemp_dst += 48;
+ num_pixels -= 16;
+ }
+ if (num_pixels) {
+ if (num_pixels >= 12) {
+ CONVERT12_BGRA_XXX(ptemp_src, ptemp_dst, mask0, mask1, mask2);
+ ptemp_src += 48;
+ ptemp_dst += 36;
+ num_pixels -= 12;
+ } else if (num_pixels >= 8) {
+ CONVERT8_BGRA_XXX(ptemp_src, ptemp_dst, mask0, mask1);
+ ptemp_src += 32;
+ ptemp_dst += 24;
+ num_pixels -= 8;
+ } else if (num_pixels >= 4) {
+ CONVERT4_BGRA_XXX(ptemp_src, ptemp_dst, mask0);
+ ptemp_src += 16;
+ ptemp_dst += 12;
+ num_pixels -= 4;
+ }
+ if (num_pixels == 3) {
+ CONVERT1_BGRA_RGB(ptemp_src + 0, ptemp_dst + 0);
+ CONVERT1_BGRA_RGB(ptemp_src + 4, ptemp_dst + 3);
+ CONVERT1_BGRA_RGB(ptemp_src + 8, ptemp_dst + 6);
+ } else if (num_pixels == 2) {
+ CONVERT1_BGRA_RGB(ptemp_src + 0, ptemp_dst + 0);
+ CONVERT1_BGRA_RGB(ptemp_src + 4, ptemp_dst + 3);
+ } else if (num_pixels == 1) {
+ CONVERT1_BGRA_RGB(ptemp_src, ptemp_dst);
+ }
+ }
+}
+
+static void AddGreenToBlueAndRed(const uint32_t* const src, int num_pixels,
+ uint32_t* dst) {
+ int i;
+ const uint8_t* in = (const uint8_t*)src;
+ uint8_t* out = (uint8_t*)dst;
+ v16u8 src0, dst0, tmp0;
+ const v16u8 mask = { 1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255,
+ 13, 255, 13, 255 };
+
+ while (num_pixels >= 8) {
+ v16u8 src1, dst1, tmp1;
+ LD_UB2(in, 16, src0, src1);
+ VSHF_B2_UB(src0, src1, src1, src0, mask, mask, tmp0, tmp1);
+ ADD2(src0, tmp0, src1, tmp1, dst0, dst1);
+ ST_UB2(dst0, dst1, out, 16);
+ in += 32;
+ out += 32;
+ num_pixels -= 8;
+ }
+ if (num_pixels > 0) {
+ if (num_pixels >= 4) {
+ src0 = LD_UB(in);
+ tmp0 = VSHF_UB(src0, src0, mask);
+ dst0 = src0 + tmp0;
+ ST_UB(dst0, out);
+ in += 16;
+ out += 16;
+ num_pixels -= 4;
+ }
+ for (i = 0; i < num_pixels; i++) {
+ const uint8_t b = in[0];
+ const uint8_t g = in[1];
+ const uint8_t r = in[2];
+ out[0] = (b + g) & 0xff;
+ out[1] = g;
+ out[2] = (r + g) & 0xff;
+      out[3] = in[3];
+      in += 4;
+      out += 4;
+ }
+ }
+}
+
+static void TransformColorInverse(const VP8LMultipliers* const m,
+ const uint32_t* src, int num_pixels,
+ uint32_t* dst) {
+ v16u8 src0, dst0;
+ const v16i8 g2br = (v16i8)__msa_fill_w(m->green_to_blue_ |
+ (m->green_to_red_ << 16));
+ const v16i8 r2b = (v16i8)__msa_fill_w(m->red_to_blue_);
+ const v16u8 mask0 = { 1, 255, 1, 255, 5, 255, 5, 255, 9, 255, 9, 255,
+ 13, 255, 13, 255 };
+ const v16u8 mask1 = { 16, 1, 18, 3, 20, 5, 22, 7, 24, 9, 26, 11,
+ 28, 13, 30, 15 };
+
+ while (num_pixels >= 8) {
+ v16u8 src1, dst1;
+ LD_UB2(src, 4, src0, src1);
+ TRANSFORM_COLOR_INVERSE_8(src0, src1, dst0, dst1, g2br, r2b, mask0, mask1);
+ ST_UB2(dst0, dst1, dst, 4);
+ src += 8;
+ dst += 8;
+ num_pixels -= 8;
+ }
+ if (num_pixels > 0) {
+ if (num_pixels >= 4) {
+ src0 = LD_UB(src);
+ TRANSFORM_COLOR_INVERSE_4(src0, dst0, g2br, r2b, mask0, mask1);
+ ST_UB(dst0, dst);
+ src += 4;
+ dst += 4;
+ num_pixels -= 4;
+ }
+ if (num_pixels > 0) {
+ src0 = LD_UB(src);
+ TRANSFORM_COLOR_INVERSE_4(src0, dst0, g2br, r2b, mask0, mask1);
+ if (num_pixels == 3) {
+ const uint64_t pix_d = __msa_copy_s_d((v2i64)dst0, 0);
+ const uint32_t pix_w = __msa_copy_s_w((v4i32)dst0, 2);
+ SD(pix_d, dst + 0);
+ SW(pix_w, dst + 2);
+ } else if (num_pixels == 2) {
+ const uint64_t pix_d = __msa_copy_s_d((v2i64)dst0, 0);
+ SD(pix_d, dst);
+ } else {
+ const uint32_t pix_w = __msa_copy_s_w((v4i32)dst0, 0);
+ SW(pix_w, dst);
+ }
+ }
+ }
+}
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void VP8LDspInitMSA(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitMSA(void) {
+ VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA;
+ VP8LConvertBGRAToBGR = ConvertBGRAToBGR;
+ VP8LConvertBGRAToRGB = ConvertBGRAToRGB;
+ VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed;
+ VP8LTransformColorInverse = TransformColorInverse;
+}
+
+#else // !WEBP_USE_MSA
+
+WEBP_DSP_INIT_STUB(VP8LDspInitMSA)
+
+#endif // WEBP_USE_MSA
diff --git a/src/3rdparty/libwebp/src/dsp/lossless_neon.c b/src/3rdparty/libwebp/src/dsp/lossless_neon.c
index 6faccb8..1145d5f 100644
--- a/src/3rdparty/libwebp/src/dsp/lossless_neon.c
+++ b/src/3rdparty/libwebp/src/dsp/lossless_neon.c
@@ -139,6 +139,357 @@ static void ConvertBGRAToRGB(const uint32_t* src,
#endif // !WORK_AROUND_GCC
+
+//------------------------------------------------------------------------------
+// Predictor Transform
+
+#define LOAD_U32_AS_U8(IN) vreinterpret_u8_u32(vdup_n_u32((IN)))
+#define LOAD_U32P_AS_U8(IN) vreinterpret_u8_u32(vld1_u32((IN)))
+#define LOADQ_U32_AS_U8(IN) vreinterpretq_u8_u32(vdupq_n_u32((IN)))
+#define LOADQ_U32P_AS_U8(IN) vreinterpretq_u8_u32(vld1q_u32((IN)))
+#define GET_U8_AS_U32(IN) vget_lane_u32(vreinterpret_u32_u8((IN)), 0);
+#define GETQ_U8_AS_U32(IN) vgetq_lane_u32(vreinterpretq_u32_u8((IN)), 0);
+#define STOREQ_U8_AS_U32P(OUT, IN) vst1q_u32((OUT), vreinterpretq_u32_u8((IN)));
+#define ROTATE32_LEFT(L) vextq_u8((L), (L), 12) // D|C|B|A -> C|B|A|D
+
+static WEBP_INLINE uint8x8_t Average2_u8_NEON(uint32_t a0, uint32_t a1) {
+ const uint8x8_t A0 = LOAD_U32_AS_U8(a0);
+ const uint8x8_t A1 = LOAD_U32_AS_U8(a1);
+ return vhadd_u8(A0, A1);
+}
+
+static WEBP_INLINE uint32_t ClampedAddSubtractHalf_NEON(uint32_t c0,
+ uint32_t c1,
+ uint32_t c2) {
+ const uint8x8_t avg = Average2_u8_NEON(c0, c1);
+  // Subtract one from c2 where it is bigger than avg.
+ const uint8x8_t C2 = LOAD_U32_AS_U8(c2);
+ const uint8x8_t cmp = vcgt_u8(C2, avg);
+ const uint8x8_t C2_1 = vadd_u8(C2, cmp);
+ // Compute half of the difference between avg and c2.
+ const int8x8_t diff_avg = vreinterpret_s8_u8(vhsub_u8(avg, C2_1));
+ // Compute the sum with avg and saturate.
+ const int16x8_t avg_16 = vreinterpretq_s16_u16(vmovl_u8(avg));
+ const uint8x8_t res = vqmovun_s16(vaddw_s8(avg_16, diff_avg));
+ const uint32_t output = GET_U8_AS_U32(res);
+ return output;
+}
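
Per 8-bit channel this predictor computes avg = (c0 + c1) >> 1 and then moves
the result half the distance away from c2, saturating to [0, 255]; the
vcgt/vadd pair only adjusts the halving so it rounds like C's truncating
division. A scalar sketch:

    // ClampedAddSubtractHalf, one channel at a time (sketch).
    static uint8_t ClampedAddSubtractHalf_Scalar(uint8_t c0, uint8_t c1,
                                                 uint8_t c2) {
      const int avg = (c0 + c1) >> 1;
      const int v = avg + (avg - c2) / 2;  // truncating division, as above
      return (v < 0) ? 0 : (v > 255) ? 255 : (uint8_t)v;
    }
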
+
+static WEBP_INLINE uint32_t Average2_NEON(uint32_t a0, uint32_t a1) {
+ const uint8x8_t avg_u8x8 = Average2_u8_NEON(a0, a1);
+ const uint32_t avg = GET_U8_AS_U32(avg_u8x8);
+ return avg;
+}
+
+static WEBP_INLINE uint32_t Average3_NEON(uint32_t a0, uint32_t a1,
+ uint32_t a2) {
+ const uint8x8_t avg0 = Average2_u8_NEON(a0, a2);
+ const uint8x8_t A1 = LOAD_U32_AS_U8(a1);
+ const uint32_t avg = GET_U8_AS_U32(vhadd_u8(avg0, A1));
+ return avg;
+}
+
+static uint32_t Predictor5_NEON(uint32_t left, const uint32_t* const top) {
+ return Average3_NEON(left, top[0], top[1]);
+}
+static uint32_t Predictor6_NEON(uint32_t left, const uint32_t* const top) {
+ return Average2_NEON(left, top[-1]);
+}
+static uint32_t Predictor7_NEON(uint32_t left, const uint32_t* const top) {
+ return Average2_NEON(left, top[0]);
+}
+static uint32_t Predictor13_NEON(uint32_t left, const uint32_t* const top) {
+ return ClampedAddSubtractHalf_NEON(left, top[0], top[-1]);
+}
+
+// Batch versions of those functions.
+
+// Predictor0: ARGB_BLACK.
+static void PredictorAdd0_NEON(const uint32_t* in, const uint32_t* upper,
+ int num_pixels, uint32_t* out) {
+ int i;
+ const uint8x16_t black = vreinterpretq_u8_u32(vdupq_n_u32(ARGB_BLACK));
+ for (i = 0; i + 4 <= num_pixels; i += 4) {
+ const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]);
+ const uint8x16_t res = vaddq_u8(src, black);
+ STOREQ_U8_AS_U32P(&out[i], res);
+ }
+ VP8LPredictorsAdd_C[0](in + i, upper + i, num_pixels - i, out + i);
+}
+
+// Predictor1: left.
+static void PredictorAdd1_NEON(const uint32_t* in, const uint32_t* upper,
+ int num_pixels, uint32_t* out) {
+ int i;
+ const uint8x16_t zero = LOADQ_U32_AS_U8(0);
+ for (i = 0; i + 4 <= num_pixels; i += 4) {
+ // a | b | c | d
+ const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]);
+ // 0 | a | b | c
+ const uint8x16_t shift0 = vextq_u8(zero, src, 12);
+ // a | a + b | b + c | c + d
+ const uint8x16_t sum0 = vaddq_u8(src, shift0);
+ // 0 | 0 | a | a + b
+ const uint8x16_t shift1 = vextq_u8(zero, sum0, 8);
+ // a | a + b | a + b + c | a + b + c + d
+ const uint8x16_t sum1 = vaddq_u8(sum0, shift1);
+ const uint8x16_t prev = LOADQ_U32_AS_U8(out[i - 1]);
+ const uint8x16_t res = vaddq_u8(sum1, prev);
+ STOREQ_U8_AS_U32P(&out[i], res);
+ }
+ VP8LPredictorsAdd_C[1](in + i, upper + i, num_pixels - i, out + i);
+}
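
The two shift-and-add steps above form a prefix sum: after them the four lanes
hold a, a+b, a+b+c and a+b+c+d per byte (mod 256), so one final add of the
previous output pixel completes four left-predictions at once. The serial
scalar form being replaced is roughly this (names illustrative):

    // Per-channel mod-256 pixel addition (sketch of the C helper).
    static uint32_t AddPixels(uint32_t a, uint32_t b) {
      const uint32_t ag = (a & 0xff00ff00u) + (b & 0xff00ff00u);
      const uint32_t rb = (a & 0x00ff00ffu) + (b & 0x00ff00ffu);
      return (ag & 0xff00ff00u) | (rb & 0x00ff00ffu);
    }

    // out[-1] holds the left-border pixel, exactly as in the callers above.
    static void PredictorAdd1_Scalar(const uint32_t* in, int num_pixels,
                                     uint32_t* out) {
      int i;
      for (i = 0; i < num_pixels; ++i) out[i] = AddPixels(in[i], out[i - 1]);
    }
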
+
+// Macro that adds 32-bit integers from IN using mod 256 arithmetic
+// per 8 bit channel.
+#define GENERATE_PREDICTOR_1(X, IN) \
+static void PredictorAdd##X##_NEON(const uint32_t* in, \
+ const uint32_t* upper, int num_pixels, \
+ uint32_t* out) { \
+ int i; \
+ for (i = 0; i + 4 <= num_pixels; i += 4) { \
+ const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]); \
+ const uint8x16_t other = LOADQ_U32P_AS_U8(&(IN)); \
+ const uint8x16_t res = vaddq_u8(src, other); \
+ STOREQ_U8_AS_U32P(&out[i], res); \
+ } \
+ VP8LPredictorsAdd_C[(X)](in + i, upper + i, num_pixels - i, out + i); \
+}
+// Predictor2: Top.
+GENERATE_PREDICTOR_1(2, upper[i])
+// Predictor3: Top-right.
+GENERATE_PREDICTOR_1(3, upper[i + 1])
+// Predictor4: Top-left.
+GENERATE_PREDICTOR_1(4, upper[i - 1])
+#undef GENERATE_PREDICTOR_1
+
+// Predictor5: average(average(left, TR), T)
+#define DO_PRED5(LANE) do { \
+ const uint8x16_t avgLTR = vhaddq_u8(L, TR); \
+ const uint8x16_t avg = vhaddq_u8(avgLTR, T); \
+ const uint8x16_t res = vaddq_u8(avg, src); \
+ vst1q_lane_u32(&out[i + (LANE)], vreinterpretq_u32_u8(res), (LANE)); \
+ L = ROTATE32_LEFT(res); \
+} while (0)
+
+static void PredictorAdd5_NEON(const uint32_t* in, const uint32_t* upper,
+ int num_pixels, uint32_t* out) {
+ int i;
+ uint8x16_t L = LOADQ_U32_AS_U8(out[-1]);
+ for (i = 0; i + 4 <= num_pixels; i += 4) {
+ const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]);
+ const uint8x16_t T = LOADQ_U32P_AS_U8(&upper[i + 0]);
+ const uint8x16_t TR = LOADQ_U32P_AS_U8(&upper[i + 1]);
+ DO_PRED5(0);
+ DO_PRED5(1);
+ DO_PRED5(2);
+ DO_PRED5(3);
+ }
+ VP8LPredictorsAdd_C[5](in + i, upper + i, num_pixels - i, out + i);
+}
+#undef DO_PRED5
+
+#define DO_PRED67(LANE) do { \
+ const uint8x16_t avg = vhaddq_u8(L, top); \
+ const uint8x16_t res = vaddq_u8(avg, src); \
+ vst1q_lane_u32(&out[i + (LANE)], vreinterpretq_u32_u8(res), (LANE)); \
+ L = ROTATE32_LEFT(res); \
+} while (0)
+
+// Predictor6: average(left, TL)
+static void PredictorAdd6_NEON(const uint32_t* in, const uint32_t* upper,
+ int num_pixels, uint32_t* out) {
+ int i;
+ uint8x16_t L = LOADQ_U32_AS_U8(out[-1]);
+ for (i = 0; i + 4 <= num_pixels; i += 4) {
+ const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]);
+ const uint8x16_t top = LOADQ_U32P_AS_U8(&upper[i - 1]);
+ DO_PRED67(0);
+ DO_PRED67(1);
+ DO_PRED67(2);
+ DO_PRED67(3);
+ }
+ VP8LPredictorsAdd_C[6](in + i, upper + i, num_pixels - i, out + i);
+}
+
+// Predictor7: average(left, T)
+static void PredictorAdd7_NEON(const uint32_t* in, const uint32_t* upper,
+ int num_pixels, uint32_t* out) {
+ int i;
+ uint8x16_t L = LOADQ_U32_AS_U8(out[-1]);
+ for (i = 0; i + 4 <= num_pixels; i += 4) {
+ const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]);
+ const uint8x16_t top = LOADQ_U32P_AS_U8(&upper[i]);
+ DO_PRED67(0);
+ DO_PRED67(1);
+ DO_PRED67(2);
+ DO_PRED67(3);
+ }
+ VP8LPredictorsAdd_C[7](in + i, upper + i, num_pixels - i, out + i);
+}
+#undef DO_PRED67
+
+#define GENERATE_PREDICTOR_2(X, IN) \
+static void PredictorAdd##X##_NEON(const uint32_t* in, \
+ const uint32_t* upper, int num_pixels, \
+ uint32_t* out) { \
+ int i; \
+ for (i = 0; i + 4 <= num_pixels; i += 4) { \
+ const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]); \
+ const uint8x16_t Tother = LOADQ_U32P_AS_U8(&(IN)); \
+ const uint8x16_t T = LOADQ_U32P_AS_U8(&upper[i]); \
+ const uint8x16_t avg = vhaddq_u8(T, Tother); \
+ const uint8x16_t res = vaddq_u8(avg, src); \
+ STOREQ_U8_AS_U32P(&out[i], res); \
+ } \
+ VP8LPredictorsAdd_C[(X)](in + i, upper + i, num_pixels - i, out + i); \
+}
+// Predictor8: average TL T.
+GENERATE_PREDICTOR_2(8, upper[i - 1])
+// Predictor9: average T TR.
+GENERATE_PREDICTOR_2(9, upper[i + 1])
+#undef GENERATE_PREDICTOR_2
+
+// Predictor10: average of (average of (L,TL), average of (T, TR)).
+#define DO_PRED10(LANE) do { \
+ const uint8x16_t avgLTL = vhaddq_u8(L, TL); \
+ const uint8x16_t avg = vhaddq_u8(avgTTR, avgLTL); \
+ const uint8x16_t res = vaddq_u8(avg, src); \
+ vst1q_lane_u32(&out[i + (LANE)], vreinterpretq_u32_u8(res), (LANE)); \
+ L = ROTATE32_LEFT(res); \
+} while (0)
+
+static void PredictorAdd10_NEON(const uint32_t* in, const uint32_t* upper,
+ int num_pixels, uint32_t* out) {
+ int i;
+ uint8x16_t L = LOADQ_U32_AS_U8(out[-1]);
+ for (i = 0; i + 4 <= num_pixels; i += 4) {
+ const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]);
+ const uint8x16_t TL = LOADQ_U32P_AS_U8(&upper[i - 1]);
+ const uint8x16_t T = LOADQ_U32P_AS_U8(&upper[i]);
+ const uint8x16_t TR = LOADQ_U32P_AS_U8(&upper[i + 1]);
+ const uint8x16_t avgTTR = vhaddq_u8(T, TR);
+ DO_PRED10(0);
+ DO_PRED10(1);
+ DO_PRED10(2);
+ DO_PRED10(3);
+ }
+ VP8LPredictorsAdd_C[10](in + i, upper + i, num_pixels - i, out + i);
+}
+#undef DO_PRED10
+
+// Predictor11: select.
+#define DO_PRED11(LANE) do { \
+ const uint8x16_t sumLin = vaddq_u8(L, src); /* in + L */ \
+ const uint8x16_t pLTL = vabdq_u8(L, TL); /* |L - TL| */ \
+ const uint16x8_t sum_LTL = vpaddlq_u8(pLTL); \
+ const uint32x4_t pa = vpaddlq_u16(sum_LTL); \
+ const uint32x4_t mask = vcleq_u32(pa, pb); \
+ const uint8x16_t res = vbslq_u8(vreinterpretq_u8_u32(mask), sumTin, sumLin); \
+ vst1q_lane_u32(&out[i + (LANE)], vreinterpretq_u32_u8(res), (LANE)); \
+ L = ROTATE32_LEFT(res); \
+} while (0)
+
+static void PredictorAdd11_NEON(const uint32_t* in, const uint32_t* upper,
+ int num_pixels, uint32_t* out) {
+ int i;
+ uint8x16_t L = LOADQ_U32_AS_U8(out[-1]);
+ for (i = 0; i + 4 <= num_pixels; i += 4) {
+ const uint8x16_t T = LOADQ_U32P_AS_U8(&upper[i]);
+ const uint8x16_t TL = LOADQ_U32P_AS_U8(&upper[i - 1]);
+ const uint8x16_t pTTL = vabdq_u8(T, TL); // |T - TL|
+ const uint16x8_t sum_TTL = vpaddlq_u8(pTTL);
+ const uint32x4_t pb = vpaddlq_u16(sum_TTL);
+ const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]);
+ const uint8x16_t sumTin = vaddq_u8(T, src); // in + T
+ DO_PRED11(0);
+ DO_PRED11(1);
+ DO_PRED11(2);
+ DO_PRED11(3);
+ }
+ VP8LPredictorsAdd_C[11](in + i, upper + i, num_pixels - i, out + i);
+}
+#undef DO_PRED11
+
+// Predictor12: ClampedAddSubtractFull.
+#define DO_PRED12(DIFF, LANE) do { \
+ const uint8x8_t pred = \
+ vqmovun_s16(vaddq_s16(vreinterpretq_s16_u16(L), (DIFF))); \
+ const uint8x8_t res = \
+ vadd_u8(pred, (LANE <= 1) ? vget_low_u8(src) : vget_high_u8(src)); \
+ const uint16x8_t res16 = vmovl_u8(res); \
+ vst1_lane_u32(&out[i + (LANE)], vreinterpret_u32_u8(res), (LANE) & 1); \
+ /* rotate in the left predictor for next iteration */ \
+ L = vextq_u16(res16, res16, 4); \
+} while (0)
+
+static void PredictorAdd12_NEON(const uint32_t* in, const uint32_t* upper,
+ int num_pixels, uint32_t* out) {
+ int i;
+ uint16x8_t L = vmovl_u8(LOAD_U32_AS_U8(out[-1]));
+ for (i = 0; i + 4 <= num_pixels; i += 4) {
+ // load four pixels of source
+ const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]);
+ // precompute the difference T - TL once for all, stored as s16
+ const uint8x16_t TL = LOADQ_U32P_AS_U8(&upper[i - 1]);
+ const uint8x16_t T = LOADQ_U32P_AS_U8(&upper[i]);
+ const int16x8_t diff_lo =
+ vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(T), vget_low_u8(TL)));
+ const int16x8_t diff_hi =
+ vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(T), vget_high_u8(TL)));
+ // loop over the four reconstructed pixels
+ DO_PRED12(diff_lo, 0);
+ DO_PRED12(diff_lo, 1);
+ DO_PRED12(diff_hi, 2);
+ DO_PRED12(diff_hi, 3);
+ }
+ VP8LPredictorsAdd_C[12](in + i, upper + i, num_pixels - i, out + i);
+}
+#undef DO_PRED12
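
Per channel, predictor 12 is the gradient predictor clamp(L + T - TL); the
saturating narrow (vqmovun_s16) supplies the clamp. A scalar sketch:

    // ClampedAddSubtractFull, one channel at a time (sketch).
    static uint8_t ClampedAddSubtractFull_Scalar(uint8_t L, uint8_t T,
                                                 uint8_t TL) {
      const int v = (int)L + T - TL;
      return (v < 0) ? 0 : (v > 255) ? 255 : (uint8_t)v;
    }
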
+
+// Predictor13: ClampedAddSubtractHalf
+#define DO_PRED13(LANE, LOW_OR_HI) do { \
+ const uint8x16_t avg = vhaddq_u8(L, T); \
+ const uint8x16_t cmp = vcgtq_u8(TL, avg); \
+ const uint8x16_t TL_1 = vaddq_u8(TL, cmp); \
+ /* Compute half of the difference between avg and TL'. */ \
+ const int8x8_t diff_avg = \
+ vreinterpret_s8_u8(LOW_OR_HI(vhsubq_u8(avg, TL_1))); \
+ /* Compute the sum with avg and saturate. */ \
+ const int16x8_t avg_16 = vreinterpretq_s16_u16(vmovl_u8(LOW_OR_HI(avg))); \
+ const uint8x8_t delta = vqmovun_s16(vaddw_s8(avg_16, diff_avg)); \
+ const uint8x8_t res = vadd_u8(LOW_OR_HI(src), delta); \
+ const uint8x16_t res2 = vcombine_u8(res, res); \
+ vst1_lane_u32(&out[i + (LANE)], vreinterpret_u32_u8(res), (LANE) & 1); \
+ L = ROTATE32_LEFT(res2); \
+} while (0)
+
+static void PredictorAdd13_NEON(const uint32_t* in, const uint32_t* upper,
+ int num_pixels, uint32_t* out) {
+ int i;
+ uint8x16_t L = LOADQ_U32_AS_U8(out[-1]);
+ for (i = 0; i + 4 <= num_pixels; i += 4) {
+ const uint8x16_t src = LOADQ_U32P_AS_U8(&in[i]);
+ const uint8x16_t T = LOADQ_U32P_AS_U8(&upper[i]);
+ const uint8x16_t TL = LOADQ_U32P_AS_U8(&upper[i - 1]);
+ DO_PRED13(0, vget_low_u8);
+ DO_PRED13(1, vget_low_u8);
+ DO_PRED13(2, vget_high_u8);
+ DO_PRED13(3, vget_high_u8);
+ }
+ VP8LPredictorsAdd_C[13](in + i, upper + i, num_pixels - i, out + i);
+}
+#undef DO_PRED13
+
+#undef LOAD_U32_AS_U8
+#undef LOAD_U32P_AS_U8
+#undef LOADQ_U32_AS_U8
+#undef LOADQ_U32P_AS_U8
+#undef GET_U8_AS_U32
+#undef GETQ_U8_AS_U32
+#undef STOREQ_U8_AS_U32P
+#undef ROTATE32_LEFT
+
//------------------------------------------------------------------------------
// Subtract-Green Transform
@@ -171,28 +522,30 @@ static WEBP_INLINE uint8x16_t DoGreenShuffle(const uint8x16_t argb,
}
#endif // USE_VTBLQ
-static void AddGreenToBlueAndRed(uint32_t* argb_data, int num_pixels) {
- const uint32_t* const end = argb_data + (num_pixels & ~3);
+static void AddGreenToBlueAndRed(const uint32_t* src, int num_pixels,
+ uint32_t* dst) {
+ const uint32_t* const end = src + (num_pixels & ~3);
#ifdef USE_VTBLQ
const uint8x16_t shuffle = vld1q_u8(kGreenShuffle);
#else
const uint8x8_t shuffle = vld1_u8(kGreenShuffle);
#endif
- for (; argb_data < end; argb_data += 4) {
- const uint8x16_t argb = vld1q_u8((uint8_t*)argb_data);
+ for (; src < end; src += 4, dst += 4) {
+ const uint8x16_t argb = vld1q_u8((const uint8_t*)src);
const uint8x16_t greens = DoGreenShuffle(argb, shuffle);
- vst1q_u8((uint8_t*)argb_data, vaddq_u8(argb, greens));
+ vst1q_u8((uint8_t*)dst, vaddq_u8(argb, greens));
}
// fallthrough and finish off with plain-C
- VP8LAddGreenToBlueAndRed_C(argb_data, num_pixels & 3);
+ VP8LAddGreenToBlueAndRed_C(src, num_pixels & 3, dst);
}
//------------------------------------------------------------------------------
// Color Transform
static void TransformColorInverse(const VP8LMultipliers* const m,
- uint32_t* argb_data, int num_pixels) {
- // sign-extended multiplying constants, pre-shifted by 6.
+ const uint32_t* const src, int num_pixels,
+ uint32_t* dst) {
+// sign-extended multiplying constants, pre-shifted by 6.
#define CST(X) (((int16_t)(m->X << 8)) >> 6)
const int16_t rb[8] = {
CST(green_to_blue_), CST(green_to_red_),
@@ -219,7 +572,7 @@ static void TransformColorInverse(const VP8LMultipliers* const m,
const uint32x4_t mask_ag = vdupq_n_u32(0xff00ff00u);
int i;
for (i = 0; i + 4 <= num_pixels; i += 4) {
- const uint8x16_t in = vld1q_u8((uint8_t*)(argb_data + i));
+ const uint8x16_t in = vld1q_u8((const uint8_t*)(src + i));
const uint32x4_t a0g0 = vandq_u32(vreinterpretq_u32_u8(in), mask_ag);
// 0 g 0 g
const uint8x16_t greens = DoGreenShuffle(in, shuffle);
@@ -240,10 +593,10 @@ static void TransformColorInverse(const VP8LMultipliers* const m,
// 0 r' 0 b''
const uint16x8_t G = vshrq_n_u16(vreinterpretq_u16_s8(F), 8);
const uint32x4_t out = vorrq_u32(vreinterpretq_u32_u16(G), a0g0);
- vst1q_u32(argb_data + i, out);
+ vst1q_u32(dst + i, out);
}
// Fall-back to C-version for left-overs.
- VP8LTransformColorInverse_C(m, argb_data + i, num_pixels - i);
+ VP8LTransformColorInverse_C(m, src + i, num_pixels - i, dst + i);
}
#undef USE_VTBLQ
@@ -254,6 +607,26 @@ static void TransformColorInverse(const VP8LMultipliers* const m,
extern void VP8LDspInitNEON(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitNEON(void) {
+ VP8LPredictors[5] = Predictor5_NEON;
+ VP8LPredictors[6] = Predictor6_NEON;
+ VP8LPredictors[7] = Predictor7_NEON;
+ VP8LPredictors[13] = Predictor13_NEON;
+
+ VP8LPredictorsAdd[0] = PredictorAdd0_NEON;
+ VP8LPredictorsAdd[1] = PredictorAdd1_NEON;
+ VP8LPredictorsAdd[2] = PredictorAdd2_NEON;
+ VP8LPredictorsAdd[3] = PredictorAdd3_NEON;
+ VP8LPredictorsAdd[4] = PredictorAdd4_NEON;
+ VP8LPredictorsAdd[5] = PredictorAdd5_NEON;
+ VP8LPredictorsAdd[6] = PredictorAdd6_NEON;
+ VP8LPredictorsAdd[7] = PredictorAdd7_NEON;
+ VP8LPredictorsAdd[8] = PredictorAdd8_NEON;
+ VP8LPredictorsAdd[9] = PredictorAdd9_NEON;
+ VP8LPredictorsAdd[10] = PredictorAdd10_NEON;
+ VP8LPredictorsAdd[11] = PredictorAdd11_NEON;
+ VP8LPredictorsAdd[12] = PredictorAdd12_NEON;
+ VP8LPredictorsAdd[13] = PredictorAdd13_NEON;
+
VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA;
VP8LConvertBGRAToBGR = ConvertBGRAToBGR;
VP8LConvertBGRAToRGB = ConvertBGRAToRGB;
diff --git a/src/3rdparty/libwebp/src/dsp/lossless_sse2.c b/src/3rdparty/libwebp/src/dsp/lossless_sse2.c
index 2d016c2..15aae93 100644
--- a/src/3rdparty/libwebp/src/dsp/lossless_sse2.c
+++ b/src/3rdparty/libwebp/src/dsp/lossless_sse2.c
@@ -14,9 +14,12 @@
#include "./dsp.h"
#if defined(WEBP_USE_SSE2)
+
+#include "./common_sse2.h"
+#include "./lossless.h"
+#include "./lossless_common.h"
#include <assert.h>
#include <emmintrin.h>
-#include "./lossless.h"
//------------------------------------------------------------------------------
// Predictor Transform
@@ -75,25 +78,44 @@ static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
return (pa_minus_pb <= 0) ? a : b;
}
-static WEBP_INLINE __m128i Average2_128i(uint32_t a0, uint32_t a1) {
+static WEBP_INLINE void Average2_m128i(const __m128i* const a0,
+ const __m128i* const a1,
+ __m128i* const avg) {
+ // (a + b) >> 1 = ((a + b + 1) >> 1) - ((a ^ b) & 1)
+ const __m128i ones = _mm_set1_epi8(1);
+ const __m128i avg1 = _mm_avg_epu8(*a0, *a1);
+ const __m128i one = _mm_and_si128(_mm_xor_si128(*a0, *a1), ones);
+ *avg = _mm_sub_epi8(avg1, one);
+}
+
+static WEBP_INLINE void Average2_uint32(const uint32_t a0, const uint32_t a1,
+ __m128i* const avg) {
+ // (a + b) >> 1 = ((a + b + 1) >> 1) - ((a ^ b) & 1)
+ const __m128i ones = _mm_set1_epi8(1);
+ const __m128i A0 = _mm_cvtsi32_si128(a0);
+ const __m128i A1 = _mm_cvtsi32_si128(a1);
+ const __m128i avg1 = _mm_avg_epu8(A0, A1);
+ const __m128i one = _mm_and_si128(_mm_xor_si128(A0, A1), ones);
+ *avg = _mm_sub_epi8(avg1, one);
+}
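
SSE2 only provides a rounding-up byte average (_mm_avg_epu8), so the
rounding-down average is recovered via the identity in the comment. A scalar
check of that identity:

    // ((a + b + 1) >> 1) over-counts (a + b) >> 1 by one exactly when
    // a + b is odd, i.e. when the low bits of a and b differ.
    static uint8_t Average2_Scalar(uint8_t a, uint8_t b) {
      const uint8_t avg_up = (uint8_t)((a + b + 1) >> 1);  // _mm_avg_epu8
      return (uint8_t)(avg_up - ((a ^ b) & 1));            // (a + b) >> 1
    }
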
+
+static WEBP_INLINE __m128i Average2_uint32_16(uint32_t a0, uint32_t a1) {
const __m128i zero = _mm_setzero_si128();
const __m128i A0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a0), zero);
const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a1), zero);
const __m128i sum = _mm_add_epi16(A1, A0);
- const __m128i avg = _mm_srli_epi16(sum, 1);
- return avg;
+ return _mm_srli_epi16(sum, 1);
}
static WEBP_INLINE uint32_t Average2(uint32_t a0, uint32_t a1) {
- const __m128i avg = Average2_128i(a0, a1);
- const __m128i A2 = _mm_packus_epi16(avg, avg);
- const uint32_t output = _mm_cvtsi128_si32(A2);
- return output;
+ __m128i output;
+ Average2_uint32(a0, a1, &output);
+ return _mm_cvtsi128_si32(output);
}
static WEBP_INLINE uint32_t Average3(uint32_t a0, uint32_t a1, uint32_t a2) {
const __m128i zero = _mm_setzero_si128();
- const __m128i avg1 = Average2_128i(a0, a2);
+ const __m128i avg1 = Average2_uint32_16(a0, a2);
const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a1), zero);
const __m128i sum = _mm_add_epi16(avg1, A1);
const __m128i avg2 = _mm_srli_epi16(sum, 1);
@@ -104,8 +126,8 @@ static WEBP_INLINE uint32_t Average3(uint32_t a0, uint32_t a1, uint32_t a2) {
static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1,
uint32_t a2, uint32_t a3) {
- const __m128i avg1 = Average2_128i(a0, a1);
- const __m128i avg2 = Average2_128i(a2, a3);
+ const __m128i avg1 = Average2_uint32_16(a0, a1);
+ const __m128i avg2 = Average2_uint32_16(a2, a3);
const __m128i sum = _mm_add_epi16(avg2, avg1);
const __m128i avg3 = _mm_srli_epi16(sum, 1);
const __m128i A0 = _mm_packus_epi16(avg3, avg3);
@@ -113,68 +135,289 @@ static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1,
return output;
}
-static uint32_t Predictor5(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor5_SSE2(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Average3(left, top[0], top[1]);
return pred;
}
-static uint32_t Predictor6(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor6_SSE2(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Average2(left, top[-1]);
return pred;
}
-static uint32_t Predictor7(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor7_SSE2(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Average2(left, top[0]);
return pred;
}
-static uint32_t Predictor8(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor8_SSE2(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Average2(top[-1], top[0]);
(void)left;
return pred;
}
-static uint32_t Predictor9(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor9_SSE2(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Average2(top[0], top[1]);
(void)left;
return pred;
}
-static uint32_t Predictor10(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor10_SSE2(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Average4(left, top[-1], top[0], top[1]);
return pred;
}
-static uint32_t Predictor11(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor11_SSE2(uint32_t left, const uint32_t* const top) {
const uint32_t pred = Select(top[0], left, top[-1]);
return pred;
}
-static uint32_t Predictor12(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor12_SSE2(uint32_t left, const uint32_t* const top) {
const uint32_t pred = ClampedAddSubtractFull(left, top[0], top[-1]);
return pred;
}
-static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
+static uint32_t Predictor13_SSE2(uint32_t left, const uint32_t* const top) {
const uint32_t pred = ClampedAddSubtractHalf(left, top[0], top[-1]);
return pred;
}
+// Batch versions of those functions.
+
+// Predictor0: ARGB_BLACK.
+static void PredictorAdd0_SSE2(const uint32_t* in, const uint32_t* upper,
+ int num_pixels, uint32_t* out) {
+ int i;
+ const __m128i black = _mm_set1_epi32(ARGB_BLACK);
+ for (i = 0; i + 4 <= num_pixels; i += 4) {
+ const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
+ const __m128i res = _mm_add_epi8(src, black);
+ _mm_storeu_si128((__m128i*)&out[i], res);
+ }
+ if (i != num_pixels) {
+ VP8LPredictorsAdd_C[0](in + i, upper + i, num_pixels - i, out + i);
+ }
+}
+
+// Predictor1: left.
+static void PredictorAdd1_SSE2(const uint32_t* in, const uint32_t* upper,
+ int num_pixels, uint32_t* out) {
+ int i;
+ __m128i prev = _mm_set1_epi32(out[-1]);
+ for (i = 0; i + 4 <= num_pixels; i += 4) {
+ // a | b | c | d
+ const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
+ // 0 | a | b | c
+ const __m128i shift0 = _mm_slli_si128(src, 4);
+ // a | a + b | b + c | c + d
+ const __m128i sum0 = _mm_add_epi8(src, shift0);
+ // 0 | 0 | a | a + b
+ const __m128i shift1 = _mm_slli_si128(sum0, 8);
+ // a | a + b | a + b + c | a + b + c + d
+ const __m128i sum1 = _mm_add_epi8(sum0, shift1);
+ const __m128i res = _mm_add_epi8(sum1, prev);
+ _mm_storeu_si128((__m128i*)&out[i], res);
+ // replicate prev output on the four lanes
+ prev = _mm_shuffle_epi32(res, (3 << 0) | (3 << 2) | (3 << 4) | (3 << 6));
+ }
+ if (i != num_pixels) {
+ VP8LPredictorsAdd_C[1](in + i, upper + i, num_pixels - i, out + i);
+ }
+}
+
+// Macro that adds 32-bit integers from IN using mod 256 arithmetic
+// per 8 bit channel.
+#define GENERATE_PREDICTOR_1(X, IN) \
+static void PredictorAdd##X##_SSE2(const uint32_t* in, const uint32_t* upper, \
+ int num_pixels, uint32_t* out) { \
+ int i; \
+ for (i = 0; i + 4 <= num_pixels; i += 4) { \
+ const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); \
+ const __m128i other = _mm_loadu_si128((const __m128i*)&(IN)); \
+ const __m128i res = _mm_add_epi8(src, other); \
+ _mm_storeu_si128((__m128i*)&out[i], res); \
+ } \
+ if (i != num_pixels) { \
+ VP8LPredictorsAdd_C[(X)](in + i, upper + i, num_pixels - i, out + i); \
+ } \
+}
+
+// Predictor2: Top.
+GENERATE_PREDICTOR_1(2, upper[i])
+// Predictor3: Top-right.
+GENERATE_PREDICTOR_1(3, upper[i + 1])
+// Predictor4: Top-left.
+GENERATE_PREDICTOR_1(4, upper[i - 1])
+#undef GENERATE_PREDICTOR_1
+
+// Because the averages use integer arithmetic, the values cannot be
+// accumulated in parallel for predictors 5 to 7: each decoded pixel is the
+// 'left' input of the next prediction, so they use the generic scalar form.
+GENERATE_PREDICTOR_ADD(Predictor5_SSE2, PredictorAdd5_SSE2)
+GENERATE_PREDICTOR_ADD(Predictor6_SSE2, PredictorAdd6_SSE2)
+GENERATE_PREDICTOR_ADD(Predictor7_SSE2, PredictorAdd7_SSE2)
+
+#define GENERATE_PREDICTOR_2(X, IN) \
+static void PredictorAdd##X##_SSE2(const uint32_t* in, const uint32_t* upper, \
+ int num_pixels, uint32_t* out) { \
+ int i; \
+ for (i = 0; i + 4 <= num_pixels; i += 4) { \
+ const __m128i Tother = _mm_loadu_si128((const __m128i*)&(IN)); \
+ const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]); \
+ const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]); \
+ __m128i avg, res; \
+ Average2_m128i(&T, &Tother, &avg); \
+ res = _mm_add_epi8(avg, src); \
+ _mm_storeu_si128((__m128i*)&out[i], res); \
+ } \
+ if (i != num_pixels) { \
+ VP8LPredictorsAdd_C[(X)](in + i, upper + i, num_pixels - i, out + i); \
+ } \
+}
+// Predictor8: average TL T.
+GENERATE_PREDICTOR_2(8, upper[i - 1])
+// Predictor9: average T TR.
+GENERATE_PREDICTOR_2(9, upper[i + 1])
+#undef GENERATE_PREDICTOR_2
+
+// Predictor10: average of (average of (L,TL), average of (T, TR)).
+static void PredictorAdd10_SSE2(const uint32_t* in, const uint32_t* upper,
+ int num_pixels, uint32_t* out) {
+ int i, j;
+ __m128i L = _mm_cvtsi32_si128(out[-1]);
+ for (i = 0; i + 4 <= num_pixels; i += 4) {
+ __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
+ __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
+ const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
+ const __m128i TR = _mm_loadu_si128((const __m128i*)&upper[i + 1]);
+ __m128i avgTTR;
+ Average2_m128i(&T, &TR, &avgTTR);
+ for (j = 0; j < 4; ++j) {
+ __m128i avgLTL, avg;
+ Average2_m128i(&L, &TL, &avgLTL);
+ Average2_m128i(&avgTTR, &avgLTL, &avg);
+ L = _mm_add_epi8(avg, src);
+ out[i + j] = _mm_cvtsi128_si32(L);
+ // Rotate the pre-computed values for the next iteration.
+ avgTTR = _mm_srli_si128(avgTTR, 4);
+ TL = _mm_srli_si128(TL, 4);
+ src = _mm_srli_si128(src, 4);
+ }
+ }
+ if (i != num_pixels) {
+ VP8LPredictorsAdd_C[10](in + i, upper + i, num_pixels - i, out + i);
+ }
+}
+
+// Predictor11: select.
+static void GetSumAbsDiff32(const __m128i* const A, const __m128i* const B,
+ __m128i* const out) {
+  // We can unpack with any value in the upper 32 bits, provided it is the same
+  // on both operands (so that their sum of abs diffs is zero). Here we use *A.
+ const __m128i A_lo = _mm_unpacklo_epi32(*A, *A);
+ const __m128i B_lo = _mm_unpacklo_epi32(*B, *A);
+ const __m128i A_hi = _mm_unpackhi_epi32(*A, *A);
+ const __m128i B_hi = _mm_unpackhi_epi32(*B, *A);
+ const __m128i s_lo = _mm_sad_epu8(A_lo, B_lo);
+ const __m128i s_hi = _mm_sad_epu8(A_hi, B_hi);
+ *out = _mm_packs_epi32(s_lo, s_hi);
+}
+
+static void PredictorAdd11_SSE2(const uint32_t* in, const uint32_t* upper,
+ int num_pixels, uint32_t* out) {
+ int i, j;
+ __m128i L = _mm_cvtsi32_si128(out[-1]);
+ for (i = 0; i + 4 <= num_pixels; i += 4) {
+ __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
+ __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
+ __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
+ __m128i pa;
+ GetSumAbsDiff32(&T, &TL, &pa); // pa = sum |T-TL|
+ for (j = 0; j < 4; ++j) {
+ const __m128i L_lo = _mm_unpacklo_epi32(L, L);
+ const __m128i TL_lo = _mm_unpacklo_epi32(TL, L);
+ const __m128i pb = _mm_sad_epu8(L_lo, TL_lo); // pb = sum |L-TL|
+ const __m128i mask = _mm_cmpgt_epi32(pb, pa);
+ const __m128i A = _mm_and_si128(mask, L);
+ const __m128i B = _mm_andnot_si128(mask, T);
+      const __m128i pred = _mm_or_si128(A, B);  // pred = (pb > pa) ? L : T
+ L = _mm_add_epi8(src, pred);
+ out[i + j] = _mm_cvtsi128_si32(L);
+ // Shift the pre-computed value for the next iteration.
+ T = _mm_srli_si128(T, 4);
+ TL = _mm_srli_si128(TL, 4);
+ src = _mm_srli_si128(src, 4);
+ pa = _mm_srli_si128(pa, 4);
+ }
+ }
+ if (i != num_pixels) {
+ VP8LPredictorsAdd_C[11](in + i, upper + i, num_pixels - i, out + i);
+ }
+}
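
In scalar terms the select predictor compares which neighbour tracks TL more
closely: a small |L - TL| means a weak vertical gradient, so T is the better
predictor, and vice versa. A sketch, with the abs diffs summed over the four
byte channels as _mm_sad_epu8 does above (function name illustrative):

    static uint32_t Select_Scalar(uint32_t T, uint32_t L, uint32_t TL) {
      int pa = 0, pb = 0, k;
      for (k = 0; k < 32; k += 8) {
        const int dt = ((T >> k) & 0xff) - ((TL >> k) & 0xff);
        const int dl = ((L >> k) & 0xff) - ((TL >> k) & 0xff);
        pa += (dt < 0) ? -dt : dt;  // sum |T - TL|
        pb += (dl < 0) ? -dl : dl;  // sum |L - TL|
      }
      return (pb <= pa) ? T : L;  // matches the cmpgt/and/andnot selection
    }
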
+
+// Predictor12: ClampedAddSubtractFull.
+#define DO_PRED12(DIFF, LANE, OUT) \
+do { \
+ const __m128i all = _mm_add_epi16(L, (DIFF)); \
+ const __m128i alls = _mm_packus_epi16(all, all); \
+ const __m128i res = _mm_add_epi8(src, alls); \
+ out[i + (OUT)] = _mm_cvtsi128_si32(res); \
+ L = _mm_unpacklo_epi8(res, zero); \
+ /* Shift the pre-computed value for the next iteration.*/ \
+ if (LANE == 0) (DIFF) = _mm_srli_si128((DIFF), 8); \
+ src = _mm_srli_si128(src, 4); \
+} while (0)
+
+static void PredictorAdd12_SSE2(const uint32_t* in, const uint32_t* upper,
+ int num_pixels, uint32_t* out) {
+ int i;
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i L8 = _mm_cvtsi32_si128(out[-1]);
+ __m128i L = _mm_unpacklo_epi8(L8, zero);
+ for (i = 0; i + 4 <= num_pixels; i += 4) {
+ // Load 4 pixels at a time.
+ __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
+ const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
+ const __m128i T_lo = _mm_unpacklo_epi8(T, zero);
+ const __m128i T_hi = _mm_unpackhi_epi8(T, zero);
+ const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
+ const __m128i TL_lo = _mm_unpacklo_epi8(TL, zero);
+ const __m128i TL_hi = _mm_unpackhi_epi8(TL, zero);
+ __m128i diff_lo = _mm_sub_epi16(T_lo, TL_lo);
+ __m128i diff_hi = _mm_sub_epi16(T_hi, TL_hi);
+ DO_PRED12(diff_lo, 0, 0);
+ DO_PRED12(diff_lo, 1, 1);
+ DO_PRED12(diff_hi, 0, 2);
+ DO_PRED12(diff_hi, 1, 3);
+ }
+ if (i != num_pixels) {
+ VP8LPredictorsAdd_C[12](in + i, upper + i, num_pixels - i, out + i);
+ }
+}
+#undef DO_PRED12
+
+// For the same reason (integer averages with a serial dependency on 'left'),
+// predictor 13 cannot be accumulated in parallel either.
+GENERATE_PREDICTOR_ADD(Predictor13_SSE2, PredictorAdd13_SSE2)
+
//------------------------------------------------------------------------------
// Subtract-Green Transform
-static void AddGreenToBlueAndRed(uint32_t* argb_data, int num_pixels) {
+static void AddGreenToBlueAndRed(const uint32_t* const src, int num_pixels,
+ uint32_t* dst) {
int i;
for (i = 0; i + 4 <= num_pixels; i += 4) {
- const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]); // argb
+ const __m128i in = _mm_loadu_si128((const __m128i*)&src[i]); // argb
const __m128i A = _mm_srli_epi16(in, 8); // 0 a 0 g
const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0)); // 0g0g
const __m128i out = _mm_add_epi8(in, C);
- _mm_storeu_si128((__m128i*)&argb_data[i], out);
+ _mm_storeu_si128((__m128i*)&dst[i], out);
}
// fallthrough and finish off with plain-C
- VP8LAddGreenToBlueAndRed_C(argb_data + i, num_pixels - i);
+ if (i != num_pixels) {
+ VP8LAddGreenToBlueAndRed_C(src + i, num_pixels - i, dst + i);
+ }
}
//------------------------------------------------------------------------------
// Color Transform
static void TransformColorInverse(const VP8LMultipliers* const m,
- uint32_t* argb_data, int num_pixels) {
- // sign-extended multiplying constants, pre-shifted by 5.
+ const uint32_t* const src, int num_pixels,
+ uint32_t* dst) {
+// sign-extended multiplying constants, pre-shifted by 5.
#define CST(X) (((int16_t)(m->X << 8)) >> 5) // sign-extend
const __m128i mults_rb = _mm_set_epi16(
CST(green_to_red_), CST(green_to_blue_),
@@ -188,7 +431,7 @@ static void TransformColorInverse(const VP8LMultipliers* const m,
const __m128i mask_ag = _mm_set1_epi32(0xff00ff00); // alpha-green masks
int i;
for (i = 0; i + 4 <= num_pixels; i += 4) {
- const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]); // argb
+ const __m128i in = _mm_loadu_si128((const __m128i*)&src[i]); // argb
const __m128i A = _mm_and_si128(in, mask_ag); // a 0 g 0
const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0)); // g0g0
@@ -200,15 +443,53 @@ static void TransformColorInverse(const VP8LMultipliers* const m,
const __m128i I = _mm_add_epi8(H, F); // r' x b'' 0
const __m128i J = _mm_srli_epi16(I, 8); // 0 r' 0 b''
const __m128i out = _mm_or_si128(J, A);
- _mm_storeu_si128((__m128i*)&argb_data[i], out);
+ _mm_storeu_si128((__m128i*)&dst[i], out);
}
// Fall-back to C-version for left-overs.
- VP8LTransformColorInverse_C(m, argb_data + i, num_pixels - i);
+ if (i != num_pixels) {
+ VP8LTransformColorInverse_C(m, src + i, num_pixels - i, dst + i);
+ }
}
//------------------------------------------------------------------------------
// Color-space conversion functions
+static void ConvertBGRAToRGB(const uint32_t* src, int num_pixels,
+ uint8_t* dst) {
+ const __m128i* in = (const __m128i*)src;
+ __m128i* out = (__m128i*)dst;
+
+ while (num_pixels >= 32) {
+ // Load the BGRA buffers.
+ __m128i in0 = _mm_loadu_si128(in + 0);
+ __m128i in1 = _mm_loadu_si128(in + 1);
+ __m128i in2 = _mm_loadu_si128(in + 2);
+ __m128i in3 = _mm_loadu_si128(in + 3);
+ __m128i in4 = _mm_loadu_si128(in + 4);
+ __m128i in5 = _mm_loadu_si128(in + 5);
+ __m128i in6 = _mm_loadu_si128(in + 6);
+ __m128i in7 = _mm_loadu_si128(in + 7);
+ VP8L32bToPlanar(&in0, &in1, &in2, &in3);
+ VP8L32bToPlanar(&in4, &in5, &in6, &in7);
+    // At this point, in1/in5 contain red only, in2/in6 green only ...
+ // Pack the colors in 24b RGB.
+ VP8PlanarTo24b(&in1, &in5, &in2, &in6, &in3, &in7);
+ _mm_storeu_si128(out + 0, in1);
+ _mm_storeu_si128(out + 1, in5);
+ _mm_storeu_si128(out + 2, in2);
+ _mm_storeu_si128(out + 3, in6);
+ _mm_storeu_si128(out + 4, in3);
+ _mm_storeu_si128(out + 5, in7);
+ in += 8;
+ out += 6;
+ num_pixels -= 32;
+ }
+ // left-overs
+ if (num_pixels > 0) {
+ VP8LConvertBGRAToRGB_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
+ }
+}
+
static void ConvertBGRAToRGBA(const uint32_t* src,
int num_pixels, uint8_t* dst) {
const __m128i* in = (const __m128i*)src;
@@ -233,7 +514,9 @@ static void ConvertBGRAToRGBA(const uint32_t* src,
num_pixels -= 8;
}
// left-overs
- VP8LConvertBGRAToRGBA_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
+ if (num_pixels > 0) {
+ VP8LConvertBGRAToRGBA_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
+ }
}
static void ConvertBGRAToRGBA4444(const uint32_t* src,
@@ -267,7 +550,9 @@ static void ConvertBGRAToRGBA4444(const uint32_t* src,
num_pixels -= 8;
}
// left-overs
- VP8LConvertBGRAToRGBA4444_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
+ if (num_pixels > 0) {
+ VP8LConvertBGRAToRGBA4444_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
+ }
}
static void ConvertBGRAToRGB565(const uint32_t* src,
@@ -306,7 +591,9 @@ static void ConvertBGRAToRGB565(const uint32_t* src,
num_pixels -= 8;
}
// left-overs
- VP8LConvertBGRAToRGB565_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
+ if (num_pixels > 0) {
+ VP8LConvertBGRAToRGB565_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
+ }
}
static void ConvertBGRAToBGR(const uint32_t* src,
@@ -337,7 +624,9 @@ static void ConvertBGRAToBGR(const uint32_t* src,
num_pixels -= 8;
}
// left-overs
- VP8LConvertBGRAToBGR_C((const uint32_t*)in, num_pixels, dst);
+ if (num_pixels > 0) {
+ VP8LConvertBGRAToBGR_C((const uint32_t*)in, num_pixels, dst);
+ }
}
//------------------------------------------------------------------------------
@@ -346,19 +635,35 @@ static void ConvertBGRAToBGR(const uint32_t* src,
extern void VP8LDspInitSSE2(void);
WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitSSE2(void) {
- VP8LPredictors[5] = Predictor5;
- VP8LPredictors[6] = Predictor6;
- VP8LPredictors[7] = Predictor7;
- VP8LPredictors[8] = Predictor8;
- VP8LPredictors[9] = Predictor9;
- VP8LPredictors[10] = Predictor10;
- VP8LPredictors[11] = Predictor11;
- VP8LPredictors[12] = Predictor12;
- VP8LPredictors[13] = Predictor13;
+ VP8LPredictors[5] = Predictor5_SSE2;
+ VP8LPredictors[6] = Predictor6_SSE2;
+ VP8LPredictors[7] = Predictor7_SSE2;
+ VP8LPredictors[8] = Predictor8_SSE2;
+ VP8LPredictors[9] = Predictor9_SSE2;
+ VP8LPredictors[10] = Predictor10_SSE2;
+ VP8LPredictors[11] = Predictor11_SSE2;
+ VP8LPredictors[12] = Predictor12_SSE2;
+ VP8LPredictors[13] = Predictor13_SSE2;
+
+ VP8LPredictorsAdd[0] = PredictorAdd0_SSE2;
+ VP8LPredictorsAdd[1] = PredictorAdd1_SSE2;
+ VP8LPredictorsAdd[2] = PredictorAdd2_SSE2;
+ VP8LPredictorsAdd[3] = PredictorAdd3_SSE2;
+ VP8LPredictorsAdd[4] = PredictorAdd4_SSE2;
+ VP8LPredictorsAdd[5] = PredictorAdd5_SSE2;
+ VP8LPredictorsAdd[6] = PredictorAdd6_SSE2;
+ VP8LPredictorsAdd[7] = PredictorAdd7_SSE2;
+ VP8LPredictorsAdd[8] = PredictorAdd8_SSE2;
+ VP8LPredictorsAdd[9] = PredictorAdd9_SSE2;
+ VP8LPredictorsAdd[10] = PredictorAdd10_SSE2;
+ VP8LPredictorsAdd[11] = PredictorAdd11_SSE2;
+ VP8LPredictorsAdd[12] = PredictorAdd12_SSE2;
+ VP8LPredictorsAdd[13] = PredictorAdd13_SSE2;
VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed;
VP8LTransformColorInverse = TransformColorInverse;
+ VP8LConvertBGRAToRGB = ConvertBGRAToRGB;
VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA;
VP8LConvertBGRAToRGBA4444 = ConvertBGRAToRGBA4444;
VP8LConvertBGRAToRGB565 = ConvertBGRAToRGB565;
diff --git a/src/3rdparty/libwebp/src/dsp/msa_macro.h b/src/3rdparty/libwebp/src/dsp/msa_macro.h
index 5c707f4..d0e5f45 100644
--- a/src/3rdparty/libwebp/src/dsp/msa_macro.h
+++ b/src/3rdparty/libwebp/src/dsp/msa_macro.h
@@ -23,12 +23,24 @@
#ifdef CLANG_BUILD
#define ADDVI_H(a, b) __msa_addvi_h((v8i16)a, b)
+ #define ADDVI_W(a, b) __msa_addvi_w((v4i32)a, b)
+ #define SRAI_B(a, b) __msa_srai_b((v16i8)a, b)
#define SRAI_H(a, b) __msa_srai_h((v8i16)a, b)
#define SRAI_W(a, b) __msa_srai_w((v4i32)a, b)
+ #define SRLI_H(a, b) __msa_srli_h((v8i16)a, b)
+ #define SLLI_B(a, b) __msa_slli_b((v4i32)a, b)
+ #define ANDI_B(a, b) __msa_andi_b((v16u8)a, b)
+ #define ORI_B(a, b) __msa_ori_b((v16u8)a, b)
#else
#define ADDVI_H(a, b) (a + b)
+ #define ADDVI_W(a, b) (a + b)
+ #define SRAI_B(a, b) (a >> b)
#define SRAI_H(a, b) (a >> b)
#define SRAI_W(a, b) (a >> b)
+ #define SRLI_H(a, b) (a >> b)
+ #define SLLI_B(a, b) (a << b)
+ #define ANDI_B(a, b) (a & b)
+ #define ORI_B(a, b) (a | b)
#endif
#define LD_B(RTYPE, psrc) *((RTYPE*)(psrc))
@@ -116,13 +128,13 @@
#define SH(val, pdst) MSA_STORE(val, pdst, msa_ush)
MSA_STORE_FUNC(uint32_t, usw, msa_usw);
#define SW(val, pdst) MSA_STORE(val, pdst, msa_usw)
- #define SD(val, pdst) { \
+ #define SD(val, pdst) do { \
uint8_t* const pdst_sd_m = (uint8_t*)(pdst); \
const uint32_t val0_m = (uint32_t)(val & 0x00000000FFFFFFFF); \
const uint32_t val1_m = (uint32_t)((val >> 32) & 0x00000000FFFFFFFF); \
SW(val0_m, pdst_sd_m); \
SW(val1_m, pdst_sd_m + 4); \
- }
+ } while (0)
#endif // (__mips_isa_rev >= 6)
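
The switch from a bare '{ ... }' to 'do { ... } while (0)' in these macros is
the standard macro-hygiene fix: it turns a multi-statement macro into a single
statement, so the semicolon written at the call site no longer breaks if/else
chains. A hypothetical usage that only parses with the wrapper ('use_fast_path'
and the fallback branch are illustrative):

    if (use_fast_path)
      SD(val, pdst);     // one statement, thanks to do/while (0); with the
    else                 // old '{ ... };' expansion this 'else' would not
      SW(val_lo, pdst);  // attach to the 'if' and compilation would fail
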
/* Description : Load 4 words with stride
@@ -133,34 +145,68 @@
* Load word in 'out2' from (psrc + 2 * stride)
* Load word in 'out3' from (psrc + 3 * stride)
*/
-#define LW4(psrc, stride, out0, out1, out2, out3) { \
- const uint8_t* ptmp = (const uint8_t*)psrc; \
- out0 = LW(ptmp); \
- ptmp += stride; \
- out1 = LW(ptmp); \
- ptmp += stride; \
- out2 = LW(ptmp); \
- ptmp += stride; \
- out3 = LW(ptmp); \
-}
+#define LW4(psrc, stride, out0, out1, out2, out3) do { \
+ const uint8_t* ptmp = (const uint8_t*)psrc; \
+ out0 = LW(ptmp); \
+ ptmp += stride; \
+ out1 = LW(ptmp); \
+ ptmp += stride; \
+ out2 = LW(ptmp); \
+ ptmp += stride; \
+ out3 = LW(ptmp); \
+} while (0)
-/* Description : Store 4 words with stride
+/* Description : Store words with stride
* Arguments : Inputs - in0, in1, in2, in3, pdst, stride
* Details : Store word from 'in0' to (pdst)
* Store word from 'in1' to (pdst + stride)
* Store word from 'in2' to (pdst + 2 * stride)
* Store word from 'in3' to (pdst + 3 * stride)
*/
-#define SW4(in0, in1, in2, in3, pdst, stride) { \
- uint8_t* ptmp = (uint8_t*)pdst; \
- SW(in0, ptmp); \
- ptmp += stride; \
- SW(in1, ptmp); \
- ptmp += stride; \
- SW(in2, ptmp); \
- ptmp += stride; \
- SW(in3, ptmp); \
-}
+#define SW4(in0, in1, in2, in3, pdst, stride) do { \
+ uint8_t* ptmp = (uint8_t*)pdst; \
+ SW(in0, ptmp); \
+ ptmp += stride; \
+ SW(in1, ptmp); \
+ ptmp += stride; \
+ SW(in2, ptmp); \
+ ptmp += stride; \
+ SW(in3, ptmp); \
+} while (0)
+
+#define SW3(in0, in1, in2, pdst, stride) do { \
+ uint8_t* ptmp = (uint8_t*)pdst; \
+ SW(in0, ptmp); \
+ ptmp += stride; \
+ SW(in1, ptmp); \
+ ptmp += stride; \
+ SW(in2, ptmp); \
+} while (0)
+
+#define SW2(in0, in1, pdst, stride) do { \
+ uint8_t* ptmp = (uint8_t*)pdst; \
+ SW(in0, ptmp); \
+ ptmp += stride; \
+ SW(in1, ptmp); \
+} while (0)
+
+/* Description : Store 4 double words with stride
+ * Arguments : Inputs - in0, in1, in2, in3, pdst, stride
+ * Details : Store double word from 'in0' to (pdst)
+ * Store double word from 'in1' to (pdst + stride)
+ * Store double word from 'in2' to (pdst + 2 * stride)
+ * Store double word from 'in3' to (pdst + 3 * stride)
+ */
+#define SD4(in0, in1, in2, in3, pdst, stride) do { \
+ uint8_t* ptmp = (uint8_t*)pdst; \
+ SD(in0, ptmp); \
+ ptmp += stride; \
+ SD(in1, ptmp); \
+ ptmp += stride; \
+ SD(in2, ptmp); \
+ ptmp += stride; \
+ SD(in3, ptmp); \
+} while (0)
/* Description : Load vectors with 16 byte elements with stride
* Arguments : Inputs - psrc, stride
@@ -169,33 +215,169 @@
* Details : Load 16 byte elements in 'out0' from (psrc)
* Load 16 byte elements in 'out1' from (psrc + stride)
*/
-#define LD_B2(RTYPE, psrc, stride, out0, out1) { \
- out0 = LD_B(RTYPE, psrc); \
- out1 = LD_B(RTYPE, psrc + stride); \
-}
+#define LD_B2(RTYPE, psrc, stride, out0, out1) do { \
+ out0 = LD_B(RTYPE, psrc); \
+ out1 = LD_B(RTYPE, psrc + stride); \
+} while (0)
#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)
#define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__)
-#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) { \
- LD_B2(RTYPE, psrc, stride, out0, out1); \
- LD_B2(RTYPE, psrc + 2 * stride , stride, out2, out3); \
-}
+#define LD_B3(RTYPE, psrc, stride, out0, out1, out2) do { \
+ LD_B2(RTYPE, psrc, stride, out0, out1); \
+ out2 = LD_B(RTYPE, psrc + 2 * stride); \
+} while (0)
+#define LD_UB3(...) LD_B3(v16u8, __VA_ARGS__)
+#define LD_SB3(...) LD_B3(v16i8, __VA_ARGS__)
+
+#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) do { \
+ LD_B2(RTYPE, psrc, stride, out0, out1); \
+  LD_B2(RTYPE, psrc + 2 * stride, stride, out2, out3); \
+} while (0)
#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)
#define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__)
+#define LD_B8(RTYPE, psrc, stride, \
+ out0, out1, out2, out3, out4, out5, out6, out7) do { \
+ LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3); \
+ LD_B4(RTYPE, psrc + 4 * stride, stride, out4, out5, out6, out7); \
+} while (0)
+#define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__)
+#define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__)
+
/* Description : Load vectors with 8 halfword elements with stride
* Arguments : Inputs - psrc, stride
* Outputs - out0, out1
* Details : Load 8 halfword elements in 'out0' from (psrc)
* Load 8 halfword elements in 'out1' from (psrc + stride)
*/
-#define LD_H2(RTYPE, psrc, stride, out0, out1) { \
- out0 = LD_H(RTYPE, psrc); \
- out1 = LD_H(RTYPE, psrc + stride); \
-}
+#define LD_H2(RTYPE, psrc, stride, out0, out1) do { \
+ out0 = LD_H(RTYPE, psrc); \
+ out1 = LD_H(RTYPE, psrc + stride); \
+} while (0)
#define LD_UH2(...) LD_H2(v8u16, __VA_ARGS__)
#define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__)
+/* Description : Load vectors with 4 word elements with stride
+ * Arguments : Inputs - psrc, stride
+ * Outputs - out0, out1, out2, out3
+ * Details : Load 4 word elements in 'out0' from (psrc + 0 * stride)
+ * Load 4 word elements in 'out1' from (psrc + 1 * stride)
+ * Load 4 word elements in 'out2' from (psrc + 2 * stride)
+ * Load 4 word elements in 'out3' from (psrc + 3 * stride)
+ */
+#define LD_W2(RTYPE, psrc, stride, out0, out1) do { \
+ out0 = LD_W(RTYPE, psrc); \
+ out1 = LD_W(RTYPE, psrc + stride); \
+} while (0)
+#define LD_UW2(...) LD_W2(v4u32, __VA_ARGS__)
+#define LD_SW2(...) LD_W2(v4i32, __VA_ARGS__)
+
+#define LD_W3(RTYPE, psrc, stride, out0, out1, out2) do { \
+ LD_W2(RTYPE, psrc, stride, out0, out1); \
+ out2 = LD_W(RTYPE, psrc + 2 * stride); \
+} while (0)
+#define LD_UW3(...) LD_W3(v4u32, __VA_ARGS__)
+#define LD_SW3(...) LD_W3(v4i32, __VA_ARGS__)
+
+#define LD_W4(RTYPE, psrc, stride, out0, out1, out2, out3) do { \
+ LD_W2(RTYPE, psrc, stride, out0, out1); \
+ LD_W2(RTYPE, psrc + 2 * stride, stride, out2, out3); \
+} while (0)
+#define LD_UW4(...) LD_W4(v4u32, __VA_ARGS__)
+#define LD_SW4(...) LD_W4(v4i32, __VA_ARGS__)
+
+/* Description : Store vectors of 16 byte elements with stride
+ * Arguments : Inputs - in0, in1, pdst, stride
+ * Details : Store 16 byte elements from 'in0' to (pdst)
+ * Store 16 byte elements from 'in1' to (pdst + stride)
+ */
+#define ST_B2(RTYPE, in0, in1, pdst, stride) do { \
+ ST_B(RTYPE, in0, pdst); \
+ ST_B(RTYPE, in1, pdst + stride); \
+} while (0)
+#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)
+#define ST_SB2(...) ST_B2(v16i8, __VA_ARGS__)
+
+#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) do { \
+ ST_B2(RTYPE, in0, in1, pdst, stride); \
+ ST_B2(RTYPE, in2, in3, pdst + 2 * stride, stride); \
+} while (0)
+#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
+#define ST_SB4(...) ST_B4(v16i8, __VA_ARGS__)
+
+#define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
+ pdst, stride) do { \
+ ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride); \
+ ST_B4(RTYPE, in4, in5, in6, in7, pdst + 4 * stride, stride); \
+} while (0)
+#define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__)
+
+/* Description : Store vectors of 4 word elements with stride
+ * Arguments : Inputs - in0, in1, in2, in3, pdst, stride
+ * Details : Store 4 word elements from 'in0' to (pdst + 0 * stride)
+ * Store 4 word elements from 'in1' to (pdst + 1 * stride)
+ * Store 4 word elements from 'in2' to (pdst + 2 * stride)
+ * Store 4 word elements from 'in3' to (pdst + 3 * stride)
+ */
+#define ST_W2(RTYPE, in0, in1, pdst, stride) do { \
+ ST_W(RTYPE, in0, pdst); \
+ ST_W(RTYPE, in1, pdst + stride); \
+} while (0)
+#define ST_UW2(...) ST_W2(v4u32, __VA_ARGS__)
+#define ST_SW2(...) ST_W2(v4i32, __VA_ARGS__)
+
+#define ST_W3(RTYPE, in0, in1, in2, pdst, stride) do { \
+ ST_W2(RTYPE, in0, in1, pdst, stride); \
+ ST_W(RTYPE, in2, pdst + 2 * stride); \
+} while (0)
+#define ST_UW3(...) ST_W3(v4u32, __VA_ARGS__)
+#define ST_SW3(...) ST_W3(v4i32, __VA_ARGS__)
+
+#define ST_W4(RTYPE, in0, in1, in2, in3, pdst, stride) do { \
+ ST_W2(RTYPE, in0, in1, pdst, stride); \
+ ST_W2(RTYPE, in2, in3, pdst + 2 * stride, stride); \
+} while (0)
+#define ST_UW4(...) ST_W4(v4u32, __VA_ARGS__)
+#define ST_SW4(...) ST_W4(v4i32, __VA_ARGS__)
+
+/* Description : Store vectors of 8 halfword elements with stride
+ * Arguments : Inputs - in0, in1, pdst, stride
+ * Details : Store 8 halfword elements from 'in0' to (pdst)
+ * Store 8 halfword elements from 'in1' to (pdst + stride)
+ */
+#define ST_H2(RTYPE, in0, in1, pdst, stride) do { \
+ ST_H(RTYPE, in0, pdst); \
+ ST_H(RTYPE, in1, pdst + stride); \
+} while (0)
+#define ST_UH2(...) ST_H2(v8u16, __VA_ARGS__)
+#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__)
+
+/* Description : Store 2x4 byte block to destination memory from input vector
+ * Arguments : Inputs - in, stidx, pdst, stride
+ * Details : Index 'stidx' halfword element from 'in' vector is copied to
+ * the GP register and stored to (pdst)
+ * Index 'stidx+1' halfword element from 'in' vector is copied to
+ * the GP register and stored to (pdst + stride)
+ * Index 'stidx+2' halfword element from 'in' vector is copied to
+ * the GP register and stored to (pdst + 2 * stride)
+ * Index 'stidx+3' halfword element from 'in' vector is copied to
+ * the GP register and stored to (pdst + 3 * stride)
+ */
+#define ST2x4_UB(in, stidx, pdst, stride) do { \
+ uint8_t* pblk_2x4_m = (uint8_t*)pdst; \
+ const uint16_t out0_m = __msa_copy_s_h((v8i16)in, stidx); \
+ const uint16_t out1_m = __msa_copy_s_h((v8i16)in, stidx + 1); \
+ const uint16_t out2_m = __msa_copy_s_h((v8i16)in, stidx + 2); \
+ const uint16_t out3_m = __msa_copy_s_h((v8i16)in, stidx + 3); \
+ SH(out0_m, pblk_2x4_m); \
+ pblk_2x4_m += stride; \
+ SH(out1_m, pblk_2x4_m); \
+ pblk_2x4_m += stride; \
+ SH(out2_m, pblk_2x4_m); \
+ pblk_2x4_m += stride; \
+ SH(out3_m, pblk_2x4_m); \
+} while (0)
+
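A scalar sketch of ST2x4_UB (hypothetical helper; the vector is modelled as a plain lane array): four consecutive 16-bit lanes starting at 'stidx' are stored one per output row:

#include <stdint.h>
#include <string.h>

/* Store four consecutive 16-bit lanes, one per row, as ST2x4_UB does
 * via __msa_copy_s_h() and SH(). */
static void st2x4_sketch(const uint16_t lanes[8], int stidx,
                         uint8_t* pdst, int stride) {
  int i;
  for (i = 0; i < 4; ++i) {
    memcpy(pdst + i * stride, &lanes[stidx + i], sizeof(uint16_t));
  }
}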
/* Description : Store 4x4 byte block to destination memory from input vector
* Arguments : Inputs - in0, in1, pdst, stride
* Details : 'Idx0' word element from input vector 'in0' is copied to the
@@ -207,14 +389,20 @@
* 'Idx3' word element from input vector 'in0' is copied to the
* GP register and stored to (pdst + 3 * stride)
*/
-#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) { \
- uint8_t* const pblk_4x4_m = (uint8_t*)pdst; \
- const uint32_t out0_m = __msa_copy_s_w((v4i32)in0, idx0); \
- const uint32_t out1_m = __msa_copy_s_w((v4i32)in0, idx1); \
- const uint32_t out2_m = __msa_copy_s_w((v4i32)in1, idx2); \
- const uint32_t out3_m = __msa_copy_s_w((v4i32)in1, idx3); \
- SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride); \
-}
+#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) do { \
+ uint8_t* const pblk_4x4_m = (uint8_t*)pdst; \
+ const uint32_t out0_m = __msa_copy_s_w((v4i32)in0, idx0); \
+ const uint32_t out1_m = __msa_copy_s_w((v4i32)in0, idx1); \
+ const uint32_t out2_m = __msa_copy_s_w((v4i32)in1, idx2); \
+ const uint32_t out3_m = __msa_copy_s_w((v4i32)in1, idx3); \
+ SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride); \
+} while (0)
+
+#define ST4x8_UB(in0, in1, pdst, stride) do { \
+ uint8_t* const pblk_4x8 = (uint8_t*)pdst; \
+ ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride); \
+ ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride); \
+} while (0)
/* Description : Immediate number of elements to slide
* Arguments : Inputs - in0, in1, slide_val
@@ -230,6 +418,30 @@
#define SLDI_SB(...) SLDI_B(v16i8, __VA_ARGS__)
#define SLDI_SH(...) SLDI_B(v8i16, __VA_ARGS__)
+/* Description : Shuffle byte vector elements as per mask vector
+ * Arguments : Inputs - in0, in1, in2, in3, mask0, mask1
+ * Outputs - out0, out1
+ * Return Type - as per RTYPE
+ * Details : Byte elements from 'in0' & 'in1' are copied selectively to
+ * 'out0' as per control vector 'mask0'
+ */
+#define VSHF_B(RTYPE, in0, in1, mask) \
+ (RTYPE)__msa_vshf_b((v16i8)mask, (v16i8)in1, (v16i8)in0)
+
+#define VSHF_UB(...) VSHF_B(v16u8, __VA_ARGS__)
+#define VSHF_SB(...) VSHF_B(v16i8, __VA_ARGS__)
+#define VSHF_UH(...) VSHF_B(v8u16, __VA_ARGS__)
+#define VSHF_SH(...) VSHF_B(v8i16, __VA_ARGS__)
+
+#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) do { \
+ out0 = VSHF_B(RTYPE, in0, in1, mask0); \
+ out1 = VSHF_B(RTYPE, in2, in3, mask1); \
+} while (0)
+#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
+#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
+#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)
+#define VSHF_B2_SH(...) VSHF_B2(v8i16, __VA_ARGS__)
+
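As a hypothetical scalar sketch of VSHF_B, ignoring the zeroing behaviour for mask bytes with the high control bits set: each mask byte selects one byte from the 32-byte concatenation of the two inputs, with 'in0' supplying indices 0..15:

#include <stdint.h>

/* Each mask byte picks one byte from the 32-byte pool {in1, in0}. */
static void vshf_b_sketch(const uint8_t in0[16], const uint8_t in1[16],
                          const uint8_t mask[16], uint8_t out[16]) {
  int i;
  for (i = 0; i < 16; ++i) {
    const int m = mask[i] & 31;
    out[i] = (m < 16) ? in0[m] : in1[m - 16];
  }
}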
/* Description : Shuffle halfword vector elements as per mask vector
* Arguments : Inputs - in0, in1, in2, in3, mask0, mask1
* Outputs - out0, out1
@@ -237,44 +449,219 @@
* Details : halfword elements from 'in0' & 'in1' are copied selectively to
* 'out0' as per control vector 'mask0'
*/
-#define VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) { \
- out0 = (RTYPE)__msa_vshf_h((v8i16)mask0, (v8i16)in1, (v8i16)in0); \
- out1 = (RTYPE)__msa_vshf_h((v8i16)mask1, (v8i16)in3, (v8i16)in2); \
-}
+#define VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) do { \
+ out0 = (RTYPE)__msa_vshf_h((v8i16)mask0, (v8i16)in1, (v8i16)in0); \
+ out1 = (RTYPE)__msa_vshf_h((v8i16)mask1, (v8i16)in3, (v8i16)in2); \
+} while (0)
#define VSHF_H2_UH(...) VSHF_H2(v8u16, __VA_ARGS__)
#define VSHF_H2_SH(...) VSHF_H2(v8i16, __VA_ARGS__)
+/* Description : Dot product of byte vector elements
+ * Arguments : Inputs - mult0, mult1, cnst0, cnst1
+ * Outputs - out0, out1
+ * Return Type - as per RTYPE
+ * Details : Signed byte elements from 'mult0' are multiplied with
+ * signed byte elements from 'cnst0' producing a result
+ *               twice the size of the input, i.e. a signed halfword.
+ *               The multiplication results of adjacent odd-even elements
+ * are added together and written to the 'out0' vector
+ */
+#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) do { \
+ out0 = (RTYPE)__msa_dotp_s_h((v16i8)mult0, (v16i8)cnst0); \
+ out1 = (RTYPE)__msa_dotp_s_h((v16i8)mult1, (v16i8)cnst1); \
+} while (0)
+#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)
+
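A scalar sketch of the byte dot product (hypothetical helper): products of adjacent even/odd byte pairs are summed into halfword lanes:

#include <stdint.h>

/* Pairwise multiply-and-add of adjacent bytes into halfword lanes. */
static void dotp_sb_sketch(const int8_t mult[16], const int8_t cnst[16],
                           int16_t out[8]) {
  int i;
  for (i = 0; i < 8; ++i) {
    out[i] = (int16_t)(mult[2 * i]     * cnst[2 * i] +
                       mult[2 * i + 1] * cnst[2 * i + 1]);
  }
}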
+/* Description : Dot product of halfword vector elements
+ * Arguments : Inputs - mult0, mult1, cnst0, cnst1
+ * Outputs - out0, out1
+ * Return Type - as per RTYPE
+ * Details : Signed halfword elements from 'mult0' are multiplied with
+ * signed halfword elements from 'cnst0' producing a result
+ *               twice the size of the input, i.e. a signed word.
+ *               The multiplication results of adjacent odd-even elements
+ * are added together and written to the 'out0' vector
+ */
+#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) do { \
+ out0 = (RTYPE)__msa_dotp_s_w((v8i16)mult0, (v8i16)cnst0); \
+ out1 = (RTYPE)__msa_dotp_s_w((v8i16)mult1, (v8i16)cnst1); \
+} while (0)
+#define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__)
+
+/* Description : Dot product of unsigned word vector elements
+ * Arguments : Inputs - mult0, mult1, cnst0, cnst1
+ * Outputs - out0, out1
+ * Return Type - as per RTYPE
+ * Details : Unsigned word elements from 'mult0' are multiplied with
+ * unsigned word elements from 'cnst0' producing a result
+ *               twice the size of the input, i.e. an unsigned double word.
+ *               The multiplication results of adjacent odd-even elements
+ * are added together and written to the 'out0' vector
+ */
+#define DOTP_UW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) do { \
+ out0 = (RTYPE)__msa_dotp_u_d((v4u32)mult0, (v4u32)cnst0); \
+ out1 = (RTYPE)__msa_dotp_u_d((v4u32)mult1, (v4u32)cnst1); \
+} while (0)
+#define DOTP_UW2_UD(...) DOTP_UW2(v2u64, __VA_ARGS__)
+
+/* Description : Dot product & addition of halfword vector elements
+ * Arguments : Inputs - mult0, mult1, cnst0, cnst1
+ * Outputs - out0, out1
+ * Return Type - as per RTYPE
+ * Details : Signed halfword elements from 'mult0' are multiplied with
+ * signed halfword elements from 'cnst0' producing a result
+ *               twice the size of the input, i.e. a signed word.
+ *               The multiplication results of adjacent odd-even elements
+ * are added to the 'out0' vector
+ */
+#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) do { \
+ out0 = (RTYPE)__msa_dpadd_s_w((v4i32)out0, (v8i16)mult0, (v8i16)cnst0); \
+ out1 = (RTYPE)__msa_dpadd_s_w((v4i32)out1, (v8i16)mult1, (v8i16)cnst1); \
+} while (0)
+#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)
+
/* Description : Clips all signed halfword elements of input vector
* between 0 & 255
* Arguments : Input/output - val
* Return Type - signed halfword
*/
-#define CLIP_SH_0_255(val) { \
+#define CLIP_SH_0_255(val) do { \
const v8i16 max_m = __msa_ldi_h(255); \
val = __msa_maxi_s_h((v8i16)val, 0); \
val = __msa_min_s_h(max_m, (v8i16)val); \
-}
-#define CLIP_SH2_0_255(in0, in1) { \
- CLIP_SH_0_255(in0); \
- CLIP_SH_0_255(in1); \
-}
+} while (0)
+
+#define CLIP_SH2_0_255(in0, in1) do { \
+ CLIP_SH_0_255(in0); \
+ CLIP_SH_0_255(in1); \
+} while (0)
+
+#define CLIP_SH4_0_255(in0, in1, in2, in3) do { \
+ CLIP_SH2_0_255(in0, in1); \
+ CLIP_SH2_0_255(in2, in3); \
+} while (0)
+
+/* Description : Clips all unsigned halfword elements of input vector
+ * between 0 & 255
+ * Arguments : Input - in
+ * Output - out_m
+ * Return Type - unsigned halfword
+ */
+#define CLIP_UH_0_255(in) do { \
+ const v8u16 max_m = (v8u16)__msa_ldi_h(255); \
+  in = __msa_maxi_u_h((v8u16)in, 0);             \
+  in = __msa_min_u_h(max_m, (v8u16)in);          \
+} while (0)
+
+#define CLIP_UH2_0_255(in0, in1) do { \
+ CLIP_UH_0_255(in0); \
+ CLIP_UH_0_255(in1); \
+} while (0)
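A scalar sketch of the clipping step (hypothetical helper): each lane is clamped to [0, 255] before being packed down to bytes:

#include <stdint.h>

/* Clamp one lane to [0, 255], as CLIP_SH_0_255 does for all 8 lanes. */
static int16_t clip_0_255_sketch(int16_t v) {
  if (v < 0) return 0;
  return (v > 255) ? 255 : v;
}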
/* Description : Clips all signed word elements of input vector
* between 0 & 255
* Arguments : Input/output - val
* Return Type - signed word
*/
-#define CLIP_SW_0_255(val) { \
+#define CLIP_SW_0_255(val) do { \
const v4i32 max_m = __msa_ldi_w(255); \
val = __msa_maxi_s_w((v4i32)val, 0); \
val = __msa_min_s_w(max_m, (v4i32)val); \
+} while (0)
+
+#define CLIP_SW4_0_255(in0, in1, in2, in3) do { \
+ CLIP_SW_0_255(in0); \
+ CLIP_SW_0_255(in1); \
+ CLIP_SW_0_255(in2); \
+ CLIP_SW_0_255(in3); \
+} while (0)
+
+/* Description : Horizontal addition of 4 signed word elements of input vector
+ * Arguments : Input - in (signed word vector)
+ * Output - sum_m (i32 sum)
+ * Return Type - signed word (GP)
+ * Details : 4 signed word elements of 'in' vector are added together and
+ * the resulting integer sum is returned
+ */
+static WEBP_INLINE int32_t func_hadd_sw_s32(v4i32 in) {
+ const v2i64 res0_m = __msa_hadd_s_d((v4i32)in, (v4i32)in);
+ const v2i64 res1_m = __msa_splati_d(res0_m, 1);
+ const v2i64 out = res0_m + res1_m;
+ int32_t sum_m = __msa_copy_s_w((v4i32)out, 0);
+ return sum_m;
}
-#define CLIP_SW4_0_255(in0, in1, in2, in3) { \
- CLIP_SW_0_255(in0); \
- CLIP_SW_0_255(in1); \
- CLIP_SW_0_255(in2); \
- CLIP_SW_0_255(in3); \
+#define HADD_SW_S32(in) func_hadd_sw_s32(in)
+
+/* Description : Horizontal addition of 8 signed halfword elements
+ * Arguments : Input - in (signed halfword vector)
+ * Output - sum_m (s32 sum)
+ * Return Type - signed word
+ * Details : 8 signed halfword elements of input vector are added
+ * together and the resulting integer sum is returned
+ */
+static WEBP_INLINE int32_t func_hadd_sh_s32(v8i16 in) {
+ const v4i32 res = __msa_hadd_s_w(in, in);
+ const v2i64 res0 = __msa_hadd_s_d(res, res);
+ const v2i64 res1 = __msa_splati_d(res0, 1);
+ const v2i64 res2 = res0 + res1;
+ const int32_t sum_m = __msa_copy_s_w((v4i32)res2, 0);
+ return sum_m;
+}
+#define HADD_SH_S32(in) func_hadd_sh_s32(in)
+
+/* Description : Horizontal addition of 8 unsigned halfword elements
+ * Arguments : Input - in (unsigned halfword vector)
+ * Output - sum_m (u32 sum)
+ * Return Type - unsigned word
+ * Details : 8 unsigned halfword elements of input vector are added
+ * together and the resulting integer sum is returned
+ */
+static WEBP_INLINE uint32_t func_hadd_uh_u32(v8u16 in) {
+ uint32_t sum_m;
+ const v4u32 res_m = __msa_hadd_u_w(in, in);
+ v2u64 res0_m = __msa_hadd_u_d(res_m, res_m);
+ v2u64 res1_m = (v2u64)__msa_splati_d((v2i64)res0_m, 1);
+ res0_m = res0_m + res1_m;
+ sum_m = __msa_copy_s_w((v4i32)res0_m, 0);
+ return sum_m;
}
+#define HADD_UH_U32(in) func_hadd_uh_u32(in)
+
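A scalar sketch of HADD_SH_S32 (hypothetical helper): the hadd/splat/copy sequence above reduces to summing all eight widened lanes:

#include <stdint.h>

/* Sum all 8 signed halfword lanes into one 32-bit total. */
static int32_t hadd_sh_s32_sketch(const int16_t lanes[8]) {
  int32_t sum = 0;
  int i;
  for (i = 0; i < 8; ++i) sum += lanes[i];
  return sum;
}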
+/* Description : Horizontal addition of signed halfword vector elements
+ * Arguments   : Inputs  - in0, in1
+ *               Outputs - out0, out1
+ * Return Type - as per RTYPE
+ * Details     : Each signed odd halfword element from 'in0' is added to the
+ *               even signed halfword element from 'in0' (pairwise) and the
+ *               word result is written to 'out0'
+ */
+#define HADD_SH2(RTYPE, in0, in1, out0, out1) do { \
+ out0 = (RTYPE)__msa_hadd_s_w((v8i16)in0, (v8i16)in0); \
+ out1 = (RTYPE)__msa_hadd_s_w((v8i16)in1, (v8i16)in1); \
+} while (0)
+#define HADD_SH2_SW(...) HADD_SH2(v4i32, __VA_ARGS__)
+
+#define HADD_SH4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) do { \
+ HADD_SH2(RTYPE, in0, in1, out0, out1); \
+ HADD_SH2(RTYPE, in2, in3, out2, out3); \
+} while (0)
+#define HADD_SH4_SW(...) HADD_SH4(v4i32, __VA_ARGS__)
+
+/* Description : Horizontal subtraction of unsigned byte vector elements
+ * Arguments : Inputs - in0, in1
+ * Outputs - out0, out1
+ * Return Type - as per RTYPE
+ * Details     : Each unsigned even byte element from 'in0' is subtracted
+ *               from the odd unsigned byte element from 'in0' (pairwise)
+ *               and the halfword result is written to 'out0'
+ */
+#define HSUB_UB2(RTYPE, in0, in1, out0, out1) do { \
+ out0 = (RTYPE)__msa_hsub_u_h((v16u8)in0, (v16u8)in0); \
+ out1 = (RTYPE)__msa_hsub_u_h((v16u8)in1, (v16u8)in1); \
+} while (0)
+#define HSUB_UB2_UH(...) HSUB_UB2(v8u16, __VA_ARGS__)
+#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)
+#define HSUB_UB2_SW(...) HSUB_UB2(v4i32, __VA_ARGS__)
/* Description : Set element n of input vector to GPR value
* Arguments : Inputs - in0, in1, in2, in3
@@ -282,23 +669,188 @@
* Return Type - as per RTYPE
* Details : Set element 0 in vector 'out' to value specified in 'in0'
*/
-#define INSERT_W2(RTYPE, in0, in1, out) { \
+#define INSERT_W2(RTYPE, in0, in1, out) do { \
out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \
out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \
-}
+} while (0)
#define INSERT_W2_UB(...) INSERT_W2(v16u8, __VA_ARGS__)
#define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__)
-#define INSERT_W4(RTYPE, in0, in1, in2, in3, out) { \
- out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \
- out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \
- out = (RTYPE)__msa_insert_w((v4i32)out, 2, in2); \
- out = (RTYPE)__msa_insert_w((v4i32)out, 3, in3); \
-}
+#define INSERT_W4(RTYPE, in0, in1, in2, in3, out) do { \
+ out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0); \
+ out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1); \
+ out = (RTYPE)__msa_insert_w((v4i32)out, 2, in2); \
+ out = (RTYPE)__msa_insert_w((v4i32)out, 3, in3); \
+} while (0)
#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__)
#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__)
#define INSERT_W4_SW(...) INSERT_W4(v4i32, __VA_ARGS__)
+/* Description : Set element n of double word input vector to GPR value
+ * Arguments : Inputs - in0, in1
+ * Output - out
+ * Return Type - as per RTYPE
+ * Details : Set element 0 in vector 'out' to GPR value specified in 'in0'
+ * Set element 1 in vector 'out' to GPR value specified in 'in1'
+ */
+#define INSERT_D2(RTYPE, in0, in1, out) do { \
+ out = (RTYPE)__msa_insert_d((v2i64)out, 0, in0); \
+ out = (RTYPE)__msa_insert_d((v2i64)out, 1, in1); \
+} while (0)
+#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)
+#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
+
+/* Description : Interleave even byte elements from vectors
+ * Arguments : Inputs - in0, in1, in2, in3
+ * Outputs - out0, out1
+ * Return Type - as per RTYPE
+ * Details : Even byte elements of 'in0' and 'in1' are interleaved
+ * and written to 'out0'
+ */
+#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) do { \
+ out0 = (RTYPE)__msa_ilvev_b((v16i8)in1, (v16i8)in0); \
+ out1 = (RTYPE)__msa_ilvev_b((v16i8)in3, (v16i8)in2); \
+} while (0)
+#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
+#define ILVEV_B2_SB(...) ILVEV_B2(v16i8, __VA_ARGS__)
+#define ILVEV_B2_UH(...) ILVEV_B2(v8u16, __VA_ARGS__)
+#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)
+#define ILVEV_B2_SD(...) ILVEV_B2(v2i64, __VA_ARGS__)
+
+/* Description : Interleave odd byte elements from vectors
+ * Arguments : Inputs - in0, in1, in2, in3
+ * Outputs - out0, out1
+ * Return Type - as per RTYPE
+ * Details : Odd byte elements of 'in0' and 'in1' are interleaved
+ * and written to 'out0'
+ */
+#define ILVOD_B2(RTYPE, in0, in1, in2, in3, out0, out1) do { \
+ out0 = (RTYPE)__msa_ilvod_b((v16i8)in1, (v16i8)in0); \
+ out1 = (RTYPE)__msa_ilvod_b((v16i8)in3, (v16i8)in2); \
+} while (0)
+#define ILVOD_B2_UB(...) ILVOD_B2(v16u8, __VA_ARGS__)
+#define ILVOD_B2_SB(...) ILVOD_B2(v16i8, __VA_ARGS__)
+#define ILVOD_B2_UH(...) ILVOD_B2(v8u16, __VA_ARGS__)
+#define ILVOD_B2_SH(...) ILVOD_B2(v8i16, __VA_ARGS__)
+#define ILVOD_B2_SD(...) ILVOD_B2(v2i64, __VA_ARGS__)
+
+/* Description : Interleave even halfword elements from vectors
+ * Arguments : Inputs - in0, in1, in2, in3
+ * Outputs - out0, out1
+ * Return Type - as per RTYPE
+ * Details : Even halfword elements of 'in0' and 'in1' are interleaved
+ * and written to 'out0'
+ */
+#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) do { \
+ out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0); \
+ out1 = (RTYPE)__msa_ilvev_h((v8i16)in3, (v8i16)in2); \
+} while (0)
+#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
+#define ILVEV_H2_UH(...) ILVEV_H2(v8u16, __VA_ARGS__)
+#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)
+#define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__)
+
+/* Description : Interleave odd halfword elements from vectors
+ * Arguments : Inputs - in0, in1, in2, in3
+ * Outputs - out0, out1
+ * Return Type - as per RTYPE
+ * Details : Odd halfword elements of 'in0' and 'in1' are interleaved
+ * and written to 'out0'
+ */
+#define ILVOD_H2(RTYPE, in0, in1, in2, in3, out0, out1) do { \
+ out0 = (RTYPE)__msa_ilvod_h((v8i16)in1, (v8i16)in0); \
+ out1 = (RTYPE)__msa_ilvod_h((v8i16)in3, (v8i16)in2); \
+} while (0)
+#define ILVOD_H2_UB(...) ILVOD_H2(v16u8, __VA_ARGS__)
+#define ILVOD_H2_UH(...) ILVOD_H2(v8u16, __VA_ARGS__)
+#define ILVOD_H2_SH(...) ILVOD_H2(v8i16, __VA_ARGS__)
+#define ILVOD_H2_SW(...) ILVOD_H2(v4i32, __VA_ARGS__)
+
+/* Description : Interleave even word elements from vectors
+ * Arguments : Inputs - in0, in1, in2, in3
+ * Outputs - out0, out1
+ * Return Type - as per RTYPE
+ * Details : Even word elements of 'in0' and 'in1' are interleaved
+ * and written to 'out0'
+ */
+#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) do { \
+ out0 = (RTYPE)__msa_ilvev_w((v4i32)in1, (v4i32)in0); \
+ out1 = (RTYPE)__msa_ilvev_w((v4i32)in3, (v4i32)in2); \
+} while (0)
+#define ILVEV_W2_UB(...) ILVEV_W2(v16u8, __VA_ARGS__)
+#define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__)
+#define ILVEV_W2_UH(...) ILVEV_W2(v8u16, __VA_ARGS__)
+#define ILVEV_W2_SD(...) ILVEV_W2(v2i64, __VA_ARGS__)
+
+/* Description : Interleave even-odd word elements from vectors
+ * Arguments : Inputs - in0, in1, in2, in3
+ * Outputs - out0, out1
+ * Return Type - as per RTYPE
+ * Details : Even word elements of 'in0' and 'in1' are interleaved
+ * and written to 'out0'
+ * Odd word elements of 'in2' and 'in3' are interleaved
+ * and written to 'out1'
+ */
+#define ILVEVOD_W2(RTYPE, in0, in1, in2, in3, out0, out1) do { \
+ out0 = (RTYPE)__msa_ilvev_w((v4i32)in1, (v4i32)in0); \
+ out1 = (RTYPE)__msa_ilvod_w((v4i32)in3, (v4i32)in2); \
+} while (0)
+#define ILVEVOD_W2_UB(...) ILVEVOD_W2(v16u8, __VA_ARGS__)
+#define ILVEVOD_W2_UH(...) ILVEVOD_W2(v8u16, __VA_ARGS__)
+#define ILVEVOD_W2_SH(...) ILVEVOD_W2(v8i16, __VA_ARGS__)
+#define ILVEVOD_W2_SW(...) ILVEVOD_W2(v4i32, __VA_ARGS__)
+
+/* Description : Interleave even-odd half-word elements from vectors
+ * Arguments : Inputs - in0, in1, in2, in3
+ * Outputs - out0, out1
+ * Return Type - as per RTYPE
+ * Details : Even half-word elements of 'in0' and 'in1' are interleaved
+ * and written to 'out0'
+ * Odd half-word elements of 'in2' and 'in3' are interleaved
+ * and written to 'out1'
+ */
+#define ILVEVOD_H2(RTYPE, in0, in1, in2, in3, out0, out1) do { \
+ out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0); \
+ out1 = (RTYPE)__msa_ilvod_h((v8i16)in3, (v8i16)in2); \
+} while (0)
+#define ILVEVOD_H2_UB(...) ILVEVOD_H2(v16u8, __VA_ARGS__)
+#define ILVEVOD_H2_UH(...) ILVEVOD_H2(v8u16, __VA_ARGS__)
+#define ILVEVOD_H2_SH(...) ILVEVOD_H2(v8i16, __VA_ARGS__)
+#define ILVEVOD_H2_SW(...) ILVEVOD_H2(v4i32, __VA_ARGS__)
+
+/* Description : Interleave even double word elements from vectors
+ * Arguments : Inputs - in0, in1, in2, in3
+ * Outputs - out0, out1
+ * Return Type - as per RTYPE
+ * Details : Even double word elements of 'in0' and 'in1' are interleaved
+ * and written to 'out0'
+ */
+#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) do { \
+ out0 = (RTYPE)__msa_ilvev_d((v2i64)in1, (v2i64)in0); \
+ out1 = (RTYPE)__msa_ilvev_d((v2i64)in3, (v2i64)in2); \
+} while (0)
+#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
+#define ILVEV_D2_SB(...) ILVEV_D2(v16i8, __VA_ARGS__)
+#define ILVEV_D2_SW(...) ILVEV_D2(v4i32, __VA_ARGS__)
+#define ILVEV_D2_SD(...) ILVEV_D2(v2i64, __VA_ARGS__)
+
+/* Description : Interleave left half of byte elements from vectors
+ * Arguments : Inputs - in0, in1, in2, in3
+ * Outputs - out0, out1
+ * Return Type - as per RTYPE
+ * Details : Left half of byte elements of 'in0' and 'in1' are interleaved
+ * and written to 'out0'.
+ */
+#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) do { \
+ out0 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \
+ out1 = (RTYPE)__msa_ilvl_b((v16i8)in2, (v16i8)in3); \
+} while (0)
+#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__)
+#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
+#define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__)
+#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)
+#define ILVL_B2_SW(...) ILVL_B2(v4i32, __VA_ARGS__)
+
/* Description : Interleave right half of byte elements from vectors
* Arguments : Inputs - in0, in1, in2, in3
* Outputs - out0, out1
@@ -306,10 +858,10 @@
* Details : Right half of byte elements of 'in0' and 'in1' are interleaved
* and written to out0.
*/
-#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \
- out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \
- out1 = (RTYPE)__msa_ilvr_b((v16i8)in2, (v16i8)in3); \
-}
+#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) do { \
+ out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \
+ out1 = (RTYPE)__msa_ilvr_b((v16i8)in2, (v16i8)in3); \
+} while (0)
#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
@@ -317,10 +869,10 @@
#define ILVR_B2_SW(...) ILVR_B2(v4i32, __VA_ARGS__)
#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3) { \
+ out0, out1, out2, out3) do { \
ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
-}
+} while (0)
#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__)
#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
@@ -334,19 +886,19 @@
* Details : Right half of halfword elements of 'in0' and 'in1' are
* interleaved and written to 'out0'.
*/
-#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \
- out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \
- out1 = (RTYPE)__msa_ilvr_h((v8i16)in2, (v8i16)in3); \
-}
+#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) do { \
+ out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \
+ out1 = (RTYPE)__msa_ilvr_h((v8i16)in2, (v8i16)in3); \
+} while (0)
#define ILVR_H2_UB(...) ILVR_H2(v16u8, __VA_ARGS__)
#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
#define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__)
#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3) { \
+ out0, out1, out2, out3) do { \
ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1); \
ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3); \
-}
+} while (0)
#define ILVR_H4_UB(...) ILVR_H4(v16u8, __VA_ARGS__)
#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
#define ILVR_H4_SW(...) ILVR_H4(v4i32, __VA_ARGS__)
@@ -358,31 +910,57 @@
* Details : Right half of double word elements of 'in0' and 'in1' are
* interleaved and written to 'out0'.
*/
-#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) { \
- out0 = (RTYPE)__msa_ilvr_d((v2i64)in0, (v2i64)in1); \
- out1 = (RTYPE)__msa_ilvr_d((v2i64)in2, (v2i64)in3); \
-}
+#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) do { \
+ out0 = (RTYPE)__msa_ilvr_d((v2i64)in0, (v2i64)in1); \
+ out1 = (RTYPE)__msa_ilvr_d((v2i64)in2, (v2i64)in3); \
+} while (0)
#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)
-#define ILVRL_H2(RTYPE, in0, in1, out0, out1) { \
+#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
+ out0, out1, out2, out3) do { \
+ ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \
+ ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3); \
+} while (0)
+#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
+#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)
+
+/* Description : Interleave both left and right half of input vectors
+ * Arguments : Inputs - in0, in1
+ * Outputs - out0, out1
+ * Return Type - as per RTYPE
+ * Details     : Right half of byte elements from 'in0' and 'in1' are
+ *               interleaved and written to 'out0'; the left half is
+ *               interleaved and written to 'out1'
+ */
+#define ILVRL_B2(RTYPE, in0, in1, out0, out1) do { \
+ out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1); \
+ out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1); \
+} while (0)
+#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
+#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
+#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
+#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
+#define ILVRL_B2_SW(...) ILVRL_B2(v4i32, __VA_ARGS__)
+
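A scalar sketch of ILVRL_B2 (hypothetical helper), following the operand order of the ilvr/ilvl intrinsics above: 'out0' interleaves the low (right) byte halves of the inputs and 'out1' the high (left) halves, with 'in1' landing in the even positions:

#include <stdint.h>

/* out0 = interleave of the low halves, out1 = interleave of the high
 * halves; 'in1' supplies even positions, 'in0' odd positions. */
static void ilvrl_b2_sketch(const uint8_t in0[16], const uint8_t in1[16],
                            uint8_t out0[16], uint8_t out1[16]) {
  int i;
  for (i = 0; i < 8; ++i) {
    out0[2 * i]     = in1[i];
    out0[2 * i + 1] = in0[i];
    out1[2 * i]     = in1[8 + i];
    out1[2 * i + 1] = in0[8 + i];
  }
}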
+#define ILVRL_H2(RTYPE, in0, in1, out0, out1) do { \
out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1); \
out1 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1); \
-}
+} while (0)
#define ILVRL_H2_UB(...) ILVRL_H2(v16u8, __VA_ARGS__)
#define ILVRL_H2_SB(...) ILVRL_H2(v16i8, __VA_ARGS__)
#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)
#define ILVRL_H2_UW(...) ILVRL_H2(v4u32, __VA_ARGS__)
-#define ILVRL_W2(RTYPE, in0, in1, out0, out1) { \
+#define ILVRL_W2(RTYPE, in0, in1, out0, out1) do { \
out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1); \
out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1); \
-}
+} while (0)
#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
+#define ILVRL_W2_UW(...) ILVRL_W2(v4u32, __VA_ARGS__)
/* Description : Pack even byte elements of vector pairs
* Arguments : Inputs - in0, in1, in2, in3
@@ -392,15 +970,76 @@
* 'out0' & even byte elements of 'in1' are copied to the right
* half of 'out0'.
*/
-#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) { \
- out0 = (RTYPE)__msa_pckev_b((v16i8)in0, (v16i8)in1); \
- out1 = (RTYPE)__msa_pckev_b((v16i8)in2, (v16i8)in3); \
-}
+#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) do { \
+ out0 = (RTYPE)__msa_pckev_b((v16i8)in0, (v16i8)in1); \
+ out1 = (RTYPE)__msa_pckev_b((v16i8)in2, (v16i8)in3); \
+} while (0)
#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)
#define PCKEV_B2_SW(...) PCKEV_B2(v4i32, __VA_ARGS__)
+#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
+ out0, out1, out2, out3) do { \
+ PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
+ PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
+} while (0)
+#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
+#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
+#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
+#define PCKEV_B4_SW(...) PCKEV_B4(v4i32, __VA_ARGS__)
+
+/* Description : Pack even halfword elements of vector pairs
+ * Arguments : Inputs - in0, in1, in2, in3
+ * Outputs - out0, out1
+ * Return Type - as per RTYPE
+ * Details : Even halfword elements of 'in0' are copied to the left half of
+ * 'out0' & even halfword elements of 'in1' are copied to the
+ * right half of 'out0'.
+ */
+#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) do { \
+ out0 = (RTYPE)__msa_pckev_h((v8i16)in0, (v8i16)in1); \
+ out1 = (RTYPE)__msa_pckev_h((v8i16)in2, (v8i16)in3); \
+} while (0)
+#define PCKEV_H2_UH(...) PCKEV_H2(v8u16, __VA_ARGS__)
+#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
+#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)
+#define PCKEV_H2_UW(...) PCKEV_H2(v4u32, __VA_ARGS__)
+
+/* Description : Pack even word elements of vector pairs
+ * Arguments : Inputs - in0, in1, in2, in3
+ * Outputs - out0, out1
+ * Return Type - as per RTYPE
+ * Details : Even word elements of 'in0' are copied to the left half of
+ * 'out0' & even word elements of 'in1' are copied to the
+ * right half of 'out0'.
+ */
+#define PCKEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) do { \
+ out0 = (RTYPE)__msa_pckev_w((v4i32)in0, (v4i32)in1); \
+ out1 = (RTYPE)__msa_pckev_w((v4i32)in2, (v4i32)in3); \
+} while (0)
+#define PCKEV_W2_UH(...) PCKEV_W2(v8u16, __VA_ARGS__)
+#define PCKEV_W2_SH(...) PCKEV_W2(v8i16, __VA_ARGS__)
+#define PCKEV_W2_SW(...) PCKEV_W2(v4i32, __VA_ARGS__)
+#define PCKEV_W2_UW(...) PCKEV_W2(v4u32, __VA_ARGS__)
+
+/* Description : Pack odd halfword elements of vector pairs
+ * Arguments : Inputs - in0, in1, in2, in3
+ * Outputs - out0, out1
+ * Return Type - as per RTYPE
+ * Details : Odd halfword elements of 'in0' are copied to the left half of
+ * 'out0' & odd halfword elements of 'in1' are copied to the
+ * right half of 'out0'.
+ */
+#define PCKOD_H2(RTYPE, in0, in1, in2, in3, out0, out1) do { \
+ out0 = (RTYPE)__msa_pckod_h((v8i16)in0, (v8i16)in1); \
+ out1 = (RTYPE)__msa_pckod_h((v8i16)in2, (v8i16)in3); \
+} while (0)
+#define PCKOD_H2_UH(...) PCKOD_H2(v8u16, __VA_ARGS__)
+#define PCKOD_H2_SH(...) PCKOD_H2(v8i16, __VA_ARGS__)
+#define PCKOD_H2_SW(...) PCKOD_H2(v4i32, __VA_ARGS__)
+#define PCKOD_H2_UW(...) PCKOD_H2(v4u32, __VA_ARGS__)
+
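A scalar sketch of a single pckev_b result (hypothetical helper): even-indexed bytes of 'in1' fill the low half of the output and even-indexed bytes of 'in0' the high half:

#include <stdint.h>

/* Keep only the even-indexed bytes of each input. */
static void pckev_b_sketch(const uint8_t in0[16], const uint8_t in1[16],
                           uint8_t out[16]) {
  int i;
  for (i = 0; i < 8; ++i) {
    out[i]     = in1[2 * i];  /* low half: even lanes of in1 */
    out[8 + i] = in0[2 * i];  /* high half: even lanes of in0 */
  }
}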
/* Description : Arithmetic immediate shift right all elements of word vector
* Arguments : Inputs - in0, in1, shift
* Outputs - in place operation
@@ -408,17 +1047,17 @@
* Details : Each element of vector 'in0' is right shifted by 'shift' and
* the result is written in-place. 'shift' is a GP variable.
*/
-#define SRAI_W2(RTYPE, in0, in1, shift_val) { \
- in0 = (RTYPE)SRAI_W(in0, shift_val); \
- in1 = (RTYPE)SRAI_W(in1, shift_val); \
-}
+#define SRAI_W2(RTYPE, in0, in1, shift_val) do { \
+ in0 = (RTYPE)SRAI_W(in0, shift_val); \
+ in1 = (RTYPE)SRAI_W(in1, shift_val); \
+} while (0)
#define SRAI_W2_SW(...) SRAI_W2(v4i32, __VA_ARGS__)
#define SRAI_W2_UW(...) SRAI_W2(v4u32, __VA_ARGS__)
-#define SRAI_W4(RTYPE, in0, in1, in2, in3, shift_val) { \
- SRAI_W2(RTYPE, in0, in1, shift_val); \
- SRAI_W2(RTYPE, in2, in3, shift_val); \
-}
+#define SRAI_W4(RTYPE, in0, in1, in2, in3, shift_val) do { \
+ SRAI_W2(RTYPE, in0, in1, shift_val); \
+ SRAI_W2(RTYPE, in2, in3, shift_val); \
+} while (0)
#define SRAI_W4_SW(...) SRAI_W4(v4i32, __VA_ARGS__)
#define SRAI_W4_UW(...) SRAI_W4(v4u32, __VA_ARGS__)
@@ -429,10 +1068,10 @@
* Details : Each element of vector 'in0' is right shifted by 'shift' and
* the result is written in-place. 'shift' is a GP variable.
*/
-#define SRAI_H2(RTYPE, in0, in1, shift_val) { \
- in0 = (RTYPE)SRAI_H(in0, shift_val); \
- in1 = (RTYPE)SRAI_H(in1, shift_val); \
-}
+#define SRAI_H2(RTYPE, in0, in1, shift_val) do { \
+ in0 = (RTYPE)SRAI_H(in0, shift_val); \
+ in1 = (RTYPE)SRAI_H(in1, shift_val); \
+} while (0)
#define SRAI_H2_SH(...) SRAI_H2(v8i16, __VA_ARGS__)
#define SRAI_H2_UH(...) SRAI_H2(v8u16, __VA_ARGS__)
@@ -443,48 +1082,166 @@
* Details : Each element of vector 'in0' is right shifted by 'shift' and
* the result is written in-place. 'shift' is a GP variable.
*/
-#define SRARI_W2(RTYPE, in0, in1, shift) { \
+#define SRARI_W2(RTYPE, in0, in1, shift) do { \
in0 = (RTYPE)__msa_srari_w((v4i32)in0, shift); \
in1 = (RTYPE)__msa_srari_w((v4i32)in1, shift); \
-}
+} while (0)
#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)
-#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) { \
- SRARI_W2(RTYPE, in0, in1, shift); \
- SRARI_W2(RTYPE, in2, in3, shift); \
-}
+#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) do { \
+ SRARI_W2(RTYPE, in0, in1, shift); \
+ SRARI_W2(RTYPE, in2, in3, shift); \
+} while (0)
#define SRARI_W4_SH(...) SRARI_W4(v8i16, __VA_ARGS__)
#define SRARI_W4_UW(...) SRARI_W4(v4u32, __VA_ARGS__)
#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
+/* Description : Shift right arithmetic rounded double words
+ * Arguments : Inputs - in0, in1, shift
+ * Outputs - in place operation
+ * Return Type - as per RTYPE
+ * Details : Each element of vector 'in0' is shifted right arithmetically by
+ * the number of bits in the corresponding element in the vector
+ * 'shift'. The last discarded bit is added to shifted value for
+ * rounding and the result is written in-place.
+ * 'shift' is a vector.
+ */
+#define SRAR_D2(RTYPE, in0, in1, shift) do { \
+ in0 = (RTYPE)__msa_srar_d((v2i64)in0, (v2i64)shift); \
+ in1 = (RTYPE)__msa_srar_d((v2i64)in1, (v2i64)shift); \
+} while (0)
+#define SRAR_D2_SW(...) SRAR_D2(v4i32, __VA_ARGS__)
+#define SRAR_D2_SD(...) SRAR_D2(v2i64, __VA_ARGS__)
+#define SRAR_D2_UD(...) SRAR_D2(v2u64, __VA_ARGS__)
+
+#define SRAR_D4(RTYPE, in0, in1, in2, in3, shift) do { \
+ SRAR_D2(RTYPE, in0, in1, shift); \
+ SRAR_D2(RTYPE, in2, in3, shift); \
+} while (0)
+#define SRAR_D4_SD(...) SRAR_D4(v2i64, __VA_ARGS__)
+#define SRAR_D4_UD(...) SRAR_D4(v2u64, __VA_ARGS__)
+
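A scalar sketch of the rounded arithmetic shift (hypothetical helper, assuming 0 < shift < 64; the vector form takes the shift per element): adding back the last discarded bit is the same as adding half of the divisor before shifting:

#include <stdint.h>

/* Round-to-nearest arithmetic right shift. */
static int64_t srar_sketch(int64_t v, int shift) {
  return (v + ((int64_t)1 << (shift - 1))) >> shift;
}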
/* Description : Addition of 2 pairs of half-word vectors
* Arguments : Inputs - in0, in1, in2, in3
* Outputs - out0, out1
* Details : Each element in 'in0' is added to 'in1' and result is written
* to 'out0'.
*/
-#define ADDVI_H2(RTYPE, in0, in1, in2, in3, out0, out1) { \
- out0 = (RTYPE)ADDVI_H(in0, in1); \
- out1 = (RTYPE)ADDVI_H(in2, in3); \
-}
+#define ADDVI_H2(RTYPE, in0, in1, in2, in3, out0, out1) do { \
+ out0 = (RTYPE)ADDVI_H(in0, in1); \
+ out1 = (RTYPE)ADDVI_H(in2, in3); \
+} while (0)
#define ADDVI_H2_SH(...) ADDVI_H2(v8i16, __VA_ARGS__)
#define ADDVI_H2_UH(...) ADDVI_H2(v8u16, __VA_ARGS__)
+/* Description : Addition of 2 pairs of word vectors
+ * Arguments : Inputs - in0, in1, in2, in3
+ * Outputs - out0, out1
+ * Details : Each element in 'in0' is added to 'in1' and result is written
+ * to 'out0'.
+ */
+#define ADDVI_W2(RTYPE, in0, in1, in2, in3, out0, out1) do { \
+ out0 = (RTYPE)ADDVI_W(in0, in1); \
+ out1 = (RTYPE)ADDVI_W(in2, in3); \
+} while (0)
+#define ADDVI_W2_SW(...) ADDVI_W2(v4i32, __VA_ARGS__)
+
+/* Description : Fill 2 pairs of word vectors with GP registers
+ * Arguments : Inputs - in0, in1
+ * Outputs - out0, out1
+ * Details : GP register in0 is replicated in each word element of out0
+ * GP register in1 is replicated in each word element of out1
+ */
+#define FILL_W2(RTYPE, in0, in1, out0, out1) do { \
+ out0 = (RTYPE)__msa_fill_w(in0); \
+ out1 = (RTYPE)__msa_fill_w(in1); \
+} while (0)
+#define FILL_W2_SW(...) FILL_W2(v4i32, __VA_ARGS__)
+
/* Description : Addition of 2 pairs of vectors
* Arguments : Inputs - in0, in1, in2, in3
* Outputs - out0, out1
* Details : Each element in 'in0' is added to 'in1' and result is written
* to 'out0'.
*/
-#define ADD2(in0, in1, in2, in3, out0, out1) { \
- out0 = in0 + in1; \
- out1 = in2 + in3; \
-}
+#define ADD2(in0, in1, in2, in3, out0, out1) do { \
+ out0 = in0 + in1; \
+ out1 = in2 + in3; \
+} while (0)
+
#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3) { \
+ out0, out1, out2, out3) do { \
ADD2(in0, in1, in2, in3, out0, out1); \
ADD2(in4, in5, in6, in7, out2, out3); \
-}
+} while (0)
+
+/* Description : Subtraction of 2 pairs of vectors
+ * Arguments : Inputs - in0, in1, in2, in3
+ * Outputs - out0, out1
+ * Details : Each element in 'in1' is subtracted from 'in0' and result is
+ * written to 'out0'.
+ */
+#define SUB2(in0, in1, in2, in3, out0, out1) do { \
+ out0 = in0 - in1; \
+ out1 = in2 - in3; \
+} while (0)
+
+#define SUB3(in0, in1, in2, in3, in4, in5, out0, out1, out2) do { \
+ out0 = in0 - in1; \
+ out1 = in2 - in3; \
+ out2 = in4 - in5; \
+} while (0)
+
+#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, \
+ out0, out1, out2, out3) do { \
+ out0 = in0 - in1; \
+ out1 = in2 - in3; \
+ out2 = in4 - in5; \
+ out3 = in6 - in7; \
+} while (0)
+
+/* Description : Addition - Subtraction of input vectors
+ * Arguments : Inputs - in0, in1
+ * Outputs - out0, out1
+ * Details : Each element in 'in1' is added to 'in0' and result is
+ * written to 'out0'.
+ * Each element in 'in1' is subtracted from 'in0' and result is
+ * written to 'out1'.
+ */
+#define ADDSUB2(in0, in1, out0, out1) do { \
+ out0 = in0 + in1; \
+ out1 = in0 - in1; \
+} while (0)
+
+/* Description : Multiplication of pairs of vectors
+ * Arguments : Inputs - in0, in1, in2, in3
+ * Outputs - out0, out1
+ * Details : Each element from 'in0' is multiplied with elements from 'in1'
+ * and the result is written to 'out0'
+ */
+#define MUL2(in0, in1, in2, in3, out0, out1) do { \
+ out0 = in0 * in1; \
+ out1 = in2 * in3; \
+} while (0)
+
+#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, \
+ out0, out1, out2, out3) do { \
+ MUL2(in0, in1, in2, in3, out0, out1); \
+ MUL2(in4, in5, in6, in7, out2, out3); \
+} while (0)
+
+/* Description : Sign extend halfword elements from right half of the vector
+ * Arguments : Input - in (halfword vector)
+ * Output - out (sign extended word vector)
+ * Return Type - signed word
+ * Details : Sign bit of halfword elements from input vector 'in' is
+ *               extracted and interleaved with the same vector 'in' to
+ *               generate 4 word elements keeping the sign intact
+ */
+#define UNPCK_R_SH_SW(in, out) do { \
+ const v8i16 sign_m = __msa_clti_s_h((v8i16)in, 0); \
+ out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)in); \
+} while (0)
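A scalar sketch (hypothetical helper): interleaving each lane with its sign mask, where clti_s_h(in, 0) yields all-ones for negative lanes, is simply a 16-to-32-bit sign extension of the four right-most lanes:

#include <stdint.h>

/* Sign-extend the four right-most halfword lanes to words. */
static void unpck_r_sh_sw_sketch(const int16_t in[8], int32_t out[4]) {
  int i;
  for (i = 0; i < 4; ++i) out[i] = (int32_t)in[i];
}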
/* Description : Sign extend halfword elements from input vector and return
* the result in pair of vectors
@@ -497,29 +1254,82 @@
* Then interleaved left with same vector 'in0' to
* generate 4 signed word elements in 'out1'
*/
-#define UNPCK_SH_SW(in, out0, out1) { \
+#define UNPCK_SH_SW(in, out0, out1) do { \
const v8i16 tmp_m = __msa_clti_s_h((v8i16)in, 0); \
ILVRL_H2_SW(tmp_m, in, out0, out1); \
-}
+} while (0)
/* Description : Butterfly of 4 input vectors
* Arguments : Inputs - in0, in1, in2, in3
* Outputs - out0, out1, out2, out3
* Details : Butterfly operation
*/
-#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) { \
- out0 = in0 + in3; \
- out1 = in1 + in2; \
- out2 = in1 - in2; \
- out3 = in0 - in3; \
-}
+#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) do { \
+ out0 = in0 + in3; \
+ out1 = in1 + in2; \
+ out2 = in1 - in2; \
+ out3 = in0 - in3; \
+} while (0)
+
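For example, applied lane-wise to inputs 1, 2, 3, 4, BUTTERFLY_4 produces out0..out3 = 5, 5, -1, -3: the sum and difference pairs on which 4-point transforms are built.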
+/* Description : Transpose 16x4 block into 4x16 with byte elements in vectors
+ * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7,
+ * in8, in9, in10, in11, in12, in13, in14, in15
+ * Outputs - out0, out1, out2, out3
+ * Return Type - unsigned byte
+ */
+#define TRANSPOSE16x4_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
+ in8, in9, in10, in11, in12, in13, in14, in15, \
+ out0, out1, out2, out3) do { \
+ v2i64 tmp0_m, tmp1_m, tmp2_m, tmp3_m, tmp4_m, tmp5_m; \
+ ILVEV_W2_SD(in0, in4, in8, in12, tmp2_m, tmp3_m); \
+ ILVEV_W2_SD(in1, in5, in9, in13, tmp0_m, tmp1_m); \
+ ILVEV_D2_UB(tmp2_m, tmp3_m, tmp0_m, tmp1_m, out1, out3); \
+ ILVEV_W2_SD(in2, in6, in10, in14, tmp4_m, tmp5_m); \
+ ILVEV_W2_SD(in3, in7, in11, in15, tmp0_m, tmp1_m); \
+ ILVEV_D2_SD(tmp4_m, tmp5_m, tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
+ ILVEV_B2_SD(out1, out3, tmp2_m, tmp3_m, tmp0_m, tmp1_m); \
+ ILVEVOD_H2_UB(tmp0_m, tmp1_m, tmp0_m, tmp1_m, out0, out2); \
+ ILVOD_B2_SD(out1, out3, tmp2_m, tmp3_m, tmp0_m, tmp1_m); \
+ ILVEVOD_H2_UB(tmp0_m, tmp1_m, tmp0_m, tmp1_m, out1, out3); \
+} while (0)
+
+/* Description : Transpose 16x8 block into 8x16 with byte elements in vectors
+ * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7,
+ * in8, in9, in10, in11, in12, in13, in14, in15
+ * Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+ * Return Type - unsigned byte
+ */
+#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
+ in8, in9, in10, in11, in12, in13, in14, in15, \
+ out0, out1, out2, out3, out4, out5, \
+ out6, out7) do { \
+ v8i16 tmp0_m, tmp1_m, tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
+ v4i32 tmp2_m, tmp3_m; \
+ ILVEV_D2_UB(in0, in8, in1, in9, out7, out6); \
+ ILVEV_D2_UB(in2, in10, in3, in11, out5, out4); \
+ ILVEV_D2_UB(in4, in12, in5, in13, out3, out2); \
+ ILVEV_D2_UB(in6, in14, in7, in15, out1, out0); \
+ ILVEV_B2_SH(out7, out6, out5, out4, tmp0_m, tmp1_m); \
+ ILVOD_B2_SH(out7, out6, out5, out4, tmp4_m, tmp5_m); \
+ ILVEV_B2_UB(out3, out2, out1, out0, out5, out7); \
+ ILVOD_B2_SH(out3, out2, out1, out0, tmp6_m, tmp7_m); \
+ ILVEV_H2_SW(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m); \
+ ILVEVOD_W2_UB(tmp2_m, tmp3_m, tmp2_m, tmp3_m, out0, out4); \
+ ILVOD_H2_SW(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m); \
+ ILVEVOD_W2_UB(tmp2_m, tmp3_m, tmp2_m, tmp3_m, out2, out6); \
+ ILVEV_H2_SW(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m); \
+ ILVEVOD_W2_UB(tmp2_m, tmp3_m, tmp2_m, tmp3_m, out1, out5); \
+ ILVOD_H2_SW(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m); \
+ ILVEVOD_W2_UB(tmp2_m, tmp3_m, tmp2_m, tmp3_m, out3, out7); \
+} while (0)
/* Description : Transpose 4x4 block with word elements in vectors
* Arguments : Inputs - in0, in1, in2, in3
* Outputs - out0, out1, out2, out3
* Return Type - as per RTYPE
*/
-#define TRANSPOSE4x4_W(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) { \
+#define TRANSPOSE4x4_W(RTYPE, in0, in1, in2, in3, \
+ out0, out1, out2, out3) do { \
v4i32 s0_m, s1_m, s2_m, s3_m; \
ILVRL_W2_SW(in1, in0, s0_m, s1_m); \
ILVRL_W2_SW(in3, in2, s2_m, s3_m); \
@@ -527,7 +1337,7 @@
out1 = (RTYPE)__msa_ilvl_d((v2i64)s2_m, (v2i64)s0_m); \
out2 = (RTYPE)__msa_ilvr_d((v2i64)s3_m, (v2i64)s1_m); \
out3 = (RTYPE)__msa_ilvl_d((v2i64)s3_m, (v2i64)s1_m); \
-}
+} while (0)
#define TRANSPOSE4x4_SW_SW(...) TRANSPOSE4x4_W(v4i32, __VA_ARGS__)
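A scalar sketch of the 4x4 word transpose (hypothetical helper): the interleave sequence above amounts to rows becoming columns:

#include <stdint.h>

/* Rows become columns. */
static void transpose4x4_w_sketch(const int32_t in[4][4],
                                  int32_t out[4][4]) {
  int r, c;
  for (r = 0; r < 4; ++r) {
    for (c = 0; c < 4; ++c) out[c][r] = in[r][c];
  }
}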
/* Description : Add block 4x4
@@ -535,7 +1345,7 @@
* Details : Least significant 4 bytes from each input vector are added to
* the destination bytes, clipped between 0-255 and stored.
*/
-#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride) { \
+#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride) do { \
uint32_t src0_m, src1_m, src2_m, src3_m; \
v8i16 inp0_m, inp1_m, res0_m, res1_m; \
v16i8 dst0_m = { 0 }; \
@@ -550,6 +1360,31 @@
CLIP_SH2_0_255(res0_m, res1_m); \
PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m); \
ST4x4_UB(dst0_m, dst1_m, 0, 1, 0, 1, pdst, stride); \
-}
+} while (0)
+
+/* Description : Pack even byte elements, extract 0 & 2 index words from pair
+ * of results and store 4 words in destination memory as per
+ * stride
+ * Arguments : Inputs - in0, in1, in2, in3, pdst, stride
+ */
+#define PCKEV_ST4x4_UB(in0, in1, in2, in3, pdst, stride) do { \
+ v16i8 tmp0_m, tmp1_m; \
+ PCKEV_B2_SB(in1, in0, in3, in2, tmp0_m, tmp1_m); \
+ ST4x4_UB(tmp0_m, tmp1_m, 0, 2, 0, 2, pdst, stride); \
+} while (0)
+
+/* Description : Average with rounding: (in0 + in1 + 1) / 2.
+ * Arguments   : Inputs  - in0, in1, in2, in3
+ * Outputs - out0, out1
+ * Return Type - as per RTYPE
+ * Details : Each unsigned byte element from 'in0' vector is added with
+ * each unsigned byte element from 'in1' vector. Then the average
+ * with rounding is calculated and written to 'out0'
+ */
+#define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) do { \
+ out0 = (RTYPE)__msa_aver_u_b((v16u8)in0, (v16u8)in1); \
+ out1 = (RTYPE)__msa_aver_u_b((v16u8)in2, (v16u8)in3); \
+} while (0)
+#define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__)
#endif /* WEBP_DSP_MSA_MACRO_H_ */
diff --git a/src/3rdparty/libwebp/src/dsp/neon.h b/src/3rdparty/libwebp/src/dsp/neon.h
index 0a06266..3b548a6 100644
--- a/src/3rdparty/libwebp/src/dsp/neon.h
+++ b/src/3rdparty/libwebp/src/dsp/neon.h
@@ -79,4 +79,22 @@ static WEBP_INLINE int32x4x4_t Transpose4x4(const int32x4x4_t rows) {
}
}
+#if 0 // Useful debug macro.
+#include <stdio.h>
+#define PRINT_REG(REG, SIZE) do { \
+ int i; \
+ printf("%s \t[%d]: 0x", #REG, SIZE); \
+ if (SIZE == 8) { \
+ uint8_t _tmp[8]; \
+ vst1_u8(_tmp, (REG)); \
+ for (i = 0; i < 8; ++i) printf("%.2x ", _tmp[i]); \
+ } else if (SIZE == 16) { \
+ uint16_t _tmp[4]; \
+ vst1_u16(_tmp, (REG)); \
+ for (i = 0; i < 4; ++i) printf("%.4x ", _tmp[i]); \
+ } \
+ printf("\n"); \
+} while (0)
+#endif
+
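A hypothetical usage of the debug macro, assuming the block is enabled by flipping '#if 0' to '#if 1':

/* Example (NEON builds only, with the block above enabled):
 *   const uint8x8_t v = vdup_n_u8(0x7f);
 *   PRINT_REG(v, 8);  // prints roughly: v  [8]: 0x7f 7f 7f 7f 7f 7f 7f 7f
 */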
#endif // WEBP_DSP_NEON_H_
diff --git a/src/3rdparty/libwebp/src/dsp/rescaler.c b/src/3rdparty/libwebp/src/dsp/rescaler.c
index bc743d5..0f54502 100644
--- a/src/3rdparty/libwebp/src/dsp/rescaler.c
+++ b/src/3rdparty/libwebp/src/dsp/rescaler.c
@@ -14,7 +14,7 @@
#include <assert.h>
#include "./dsp.h"
-#include "../utils/rescaler.h"
+#include "../utils/rescaler_utils.h"
//------------------------------------------------------------------------------
// Implementations of critical functions ImportRow / ExportRow
@@ -173,10 +173,10 @@ void WebPRescalerExportRow(WebPRescaler* const wrk) {
WebPRescalerExportRowExpand(wrk);
} else if (wrk->fxy_scale) {
WebPRescalerExportRowShrink(wrk);
- } else { // very special case for src = dst = 1x1
+ } else { // special case
int i;
+ assert(wrk->src_height == wrk->dst_height && wrk->x_add == 1);
assert(wrk->src_width == 1 && wrk->dst_width <= 2);
- assert(wrk->src_height == 1 && wrk->dst_height == 1);
for (i = 0; i < wrk->num_channels * wrk->dst_width; ++i) {
wrk->dst[i] = wrk->irow[i];
wrk->irow[i] = 0;
@@ -199,6 +199,7 @@ WebPRescalerExportRowFunc WebPRescalerExportRowShrink;
extern void WebPRescalerDspInitSSE2(void);
extern void WebPRescalerDspInitMIPS32(void);
extern void WebPRescalerDspInitMIPSdspR2(void);
+extern void WebPRescalerDspInitMSA(void);
extern void WebPRescalerDspInitNEON(void);
static volatile VP8CPUInfo rescaler_last_cpuinfo_used =
@@ -233,6 +234,11 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInit(void) {
WebPRescalerDspInitMIPSdspR2();
}
#endif
+#if defined(WEBP_USE_MSA)
+ if (VP8GetCPUInfo(kMSA)) {
+ WebPRescalerDspInitMSA();
+ }
+#endif
}
rescaler_last_cpuinfo_used = VP8GetCPUInfo;
}
diff --git a/src/3rdparty/libwebp/src/dsp/rescaler_mips32.c b/src/3rdparty/libwebp/src/dsp/rescaler_mips32.c
index ddaa391..e09ad5d 100644
--- a/src/3rdparty/libwebp/src/dsp/rescaler_mips32.c
+++ b/src/3rdparty/libwebp/src/dsp/rescaler_mips32.c
@@ -16,7 +16,7 @@
#if defined(WEBP_USE_MIPS32)
#include <assert.h>
-#include "../utils/rescaler.h"
+#include "../utils/rescaler_utils.h"
//------------------------------------------------------------------------------
// Row import
diff --git a/src/3rdparty/libwebp/src/dsp/rescaler_mips_dsp_r2.c b/src/3rdparty/libwebp/src/dsp/rescaler_mips_dsp_r2.c
index b457d0a..2308d64 100644
--- a/src/3rdparty/libwebp/src/dsp/rescaler_mips_dsp_r2.c
+++ b/src/3rdparty/libwebp/src/dsp/rescaler_mips_dsp_r2.c
@@ -16,7 +16,7 @@
#if defined(WEBP_USE_MIPS_DSP_R2)
#include <assert.h>
-#include "../utils/rescaler.h"
+#include "../utils/rescaler_utils.h"
#define ROUNDER (WEBP_RESCALER_ONE >> 1)
#define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
diff --git a/src/3rdparty/libwebp/src/dsp/rescaler_msa.c b/src/3rdparty/libwebp/src/dsp/rescaler_msa.c
new file mode 100644
index 0000000..2c10e55
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/rescaler_msa.c
@@ -0,0 +1,444 @@
+// Copyright 2016 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MSA version of rescaling functions
+//
+// Author: Prashant Patil (prashant.patil@imgtec.com)
+
+#include "./dsp.h"
+
+#if defined(WEBP_USE_MSA)
+
+#include <assert.h>
+
+#include "../utils/rescaler_utils.h"
+#include "./msa_macro.h"
+
+#define ROUNDER (WEBP_RESCALER_ONE >> 1)
+#define MULT_FIX(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
+
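A scalar sketch of the fixed-point convention used throughout this file, assuming WEBP_RESCALER_RFIX is 32 as defined in rescaler_utils.h: multiply into 64 bits, add half (ROUNDER) for rounding, then shift back down:

#include <stdint.h>

static uint32_t mult_fix_sketch(uint32_t x, uint32_t y) {
  /* (x * y + ROUNDER) >> WEBP_RESCALER_RFIX, with RFIX == 32 */
  return (uint32_t)(((uint64_t)x * y + (1ull << 31)) >> 32);
}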
+#define CALC_MULT_FIX_16(in0, in1, in2, in3, scale, shift, dst) do { \
+ v4u32 tmp0, tmp1, tmp2, tmp3; \
+ v16u8 t0, t1, t2, t3, t4, t5; \
+ v2u64 out0, out1, out2, out3; \
+ ILVRL_W2_UW(zero, in0, tmp0, tmp1); \
+ ILVRL_W2_UW(zero, in1, tmp2, tmp3); \
+ DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1); \
+ DOTP_UW2_UD(tmp2, tmp3, scale, scale, out2, out3); \
+ SRAR_D4_UD(out0, out1, out2, out3, shift); \
+ PCKEV_B2_UB(out1, out0, out3, out2, t0, t1); \
+ ILVRL_W2_UW(zero, in2, tmp0, tmp1); \
+ ILVRL_W2_UW(zero, in3, tmp2, tmp3); \
+ DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1); \
+ DOTP_UW2_UD(tmp2, tmp3, scale, scale, out2, out3); \
+ SRAR_D4_UD(out0, out1, out2, out3, shift); \
+ PCKEV_B2_UB(out1, out0, out3, out2, t2, t3); \
+ PCKEV_B2_UB(t1, t0, t3, t2, t4, t5); \
+ dst = (v16u8)__msa_pckev_b((v16i8)t5, (v16i8)t4); \
+} while (0)
+
+#define CALC_MULT_FIX_4(in0, scale, shift, dst) do { \
+ v4u32 tmp0, tmp1; \
+ v16i8 t0, t1; \
+ v2u64 out0, out1; \
+ ILVRL_W2_UW(zero, in0, tmp0, tmp1); \
+ DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1); \
+ SRAR_D2_UD(out0, out1, shift); \
+ t0 = __msa_pckev_b((v16i8)out1, (v16i8)out0); \
+ t1 = __msa_pckev_b(t0, t0); \
+ t0 = __msa_pckev_b(t1, t1); \
+ dst = __msa_copy_s_w((v4i32)t0, 0); \
+} while (0)
+
+#define CALC_MULT_FIX1_16(in0, in1, in2, in3, fyscale, shift, \
+ dst0, dst1, dst2, dst3) do { \
+ v4u32 tmp0, tmp1, tmp2, tmp3; \
+ v2u64 out0, out1, out2, out3; \
+ ILVRL_W2_UW(zero, in0, tmp0, tmp1); \
+ ILVRL_W2_UW(zero, in1, tmp2, tmp3); \
+ DOTP_UW2_UD(tmp0, tmp1, fyscale, fyscale, out0, out1); \
+ DOTP_UW2_UD(tmp2, tmp3, fyscale, fyscale, out2, out3); \
+ SRAR_D4_UD(out0, out1, out2, out3, shift); \
+ PCKEV_W2_UW(out1, out0, out3, out2, dst0, dst1); \
+ ILVRL_W2_UW(zero, in2, tmp0, tmp1); \
+ ILVRL_W2_UW(zero, in3, tmp2, tmp3); \
+ DOTP_UW2_UD(tmp0, tmp1, fyscale, fyscale, out0, out1); \
+ DOTP_UW2_UD(tmp2, tmp3, fyscale, fyscale, out2, out3); \
+ SRAR_D4_UD(out0, out1, out2, out3, shift); \
+ PCKEV_W2_UW(out1, out0, out3, out2, dst2, dst3); \
+} while (0)
+
+#define CALC_MULT_FIX1_4(in0, scale, shift, dst) do { \
+ v4u32 tmp0, tmp1; \
+ v2u64 out0, out1; \
+ ILVRL_W2_UW(zero, in0, tmp0, tmp1); \
+ DOTP_UW2_UD(tmp0, tmp1, scale, scale, out0, out1); \
+ SRAR_D2_UD(out0, out1, shift); \
+ dst = (v4u32)__msa_pckev_w((v4i32)out1, (v4i32)out0); \
+} while (0)
+
+#define CALC_MULT_FIX2_16(in0, in1, in2, in3, mult, scale, shift, \
+ dst0, dst1) do { \
+ v4u32 tmp0, tmp1, tmp2, tmp3; \
+ v2u64 out0, out1, out2, out3; \
+ ILVRL_W2_UW(in0, in2, tmp0, tmp1); \
+ ILVRL_W2_UW(in1, in3, tmp2, tmp3); \
+ DOTP_UW2_UD(tmp0, tmp1, mult, mult, out0, out1); \
+ DOTP_UW2_UD(tmp2, tmp3, mult, mult, out2, out3); \
+ SRAR_D4_UD(out0, out1, out2, out3, shift); \
+ DOTP_UW2_UD(out0, out1, scale, scale, out0, out1); \
+ DOTP_UW2_UD(out2, out3, scale, scale, out2, out3); \
+ SRAR_D4_UD(out0, out1, out2, out3, shift); \
+ PCKEV_B2_UB(out1, out0, out3, out2, dst0, dst1); \
+} while (0)
+
+#define CALC_MULT_FIX2_4(in0, in1, mult, scale, shift, dst) do { \
+ v4u32 tmp0, tmp1; \
+ v2u64 out0, out1; \
+ v16i8 t0, t1; \
+ ILVRL_W2_UW(in0, in1, tmp0, tmp1); \
+ DOTP_UW2_UD(tmp0, tmp1, mult, mult, out0, out1); \
+ SRAR_D2_UD(out0, out1, shift); \
+ DOTP_UW2_UD(out0, out1, scale, scale, out0, out1); \
+ SRAR_D2_UD(out0, out1, shift); \
+ t0 = __msa_pckev_b((v16i8)out1, (v16i8)out0); \
+ t1 = __msa_pckev_b(t0, t0); \
+ t0 = __msa_pckev_b(t1, t1); \
+ dst = __msa_copy_s_w((v4i32)t0, 0); \
+} while (0)
+
+static WEBP_INLINE void ExportRowExpand_0(const uint32_t* frow, uint8_t* dst,
+ int length,
+ WebPRescaler* const wrk) {
+ const v4u32 scale = (v4u32)__msa_fill_w(wrk->fy_scale);
+ const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
+ const v4i32 zero = { 0 };
+
+ while (length >= 16) {
+ v4u32 src0, src1, src2, src3;
+ v16u8 out;
+ LD_UW4(frow, 4, src0, src1, src2, src3);
+ CALC_MULT_FIX_16(src0, src1, src2, src3, scale, shift, out);
+ ST_UB(out, dst);
+ length -= 16;
+ frow += 16;
+ dst += 16;
+ }
+ if (length > 0) {
+ int x_out;
+ if (length >= 12) {
+ uint32_t val0_m, val1_m, val2_m;
+ v4u32 src0, src1, src2;
+ LD_UW3(frow, 4, src0, src1, src2);
+ CALC_MULT_FIX_4(src0, scale, shift, val0_m);
+ CALC_MULT_FIX_4(src1, scale, shift, val1_m);
+ CALC_MULT_FIX_4(src2, scale, shift, val2_m);
+ SW3(val0_m, val1_m, val2_m, dst, 4);
+ length -= 12;
+ frow += 12;
+ dst += 12;
+ } else if (length >= 8) {
+ uint32_t val0_m, val1_m;
+ v4u32 src0, src1;
+ LD_UW2(frow, 4, src0, src1);
+ CALC_MULT_FIX_4(src0, scale, shift, val0_m);
+ CALC_MULT_FIX_4(src1, scale, shift, val1_m);
+ SW2(val0_m, val1_m, dst, 4);
+ length -= 8;
+ frow += 8;
+ dst += 8;
+ } else if (length >= 4) {
+ uint32_t val0_m;
+ const v4u32 src0 = LD_UW(frow);
+ CALC_MULT_FIX_4(src0, scale, shift, val0_m);
+ SW(val0_m, dst);
+ length -= 4;
+ frow += 4;
+ dst += 4;
+ }
+ for (x_out = 0; x_out < length; ++x_out) {
+ const uint32_t J = frow[x_out];
+ const int v = (int)MULT_FIX(J, wrk->fy_scale);
+ assert(v >= 0 && v <= 255);
+ dst[x_out] = v;
+ }
+ }
+}
+
+static WEBP_INLINE void ExportRowExpand_1(const uint32_t* frow, uint32_t* irow,
+ uint8_t* dst, int length,
+ WebPRescaler* const wrk) {
+ const uint32_t B = WEBP_RESCALER_FRAC(-wrk->y_accum, wrk->y_sub);
+ const uint32_t A = (uint32_t)(WEBP_RESCALER_ONE - B);
+ const v4i32 B1 = __msa_fill_w(B);
+ const v4i32 A1 = __msa_fill_w(A);
+ const v4i32 AB = __msa_ilvr_w(A1, B1);
+ const v4u32 scale = (v4u32)__msa_fill_w(wrk->fy_scale);
+ const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
+
+ while (length >= 16) {
+ v4u32 frow0, frow1, frow2, frow3, irow0, irow1, irow2, irow3;
+ v16u8 t0, t1, t2, t3, t4, t5;
+ LD_UW4(frow, 4, frow0, frow1, frow2, frow3);
+ LD_UW4(irow, 4, irow0, irow1, irow2, irow3);
+ CALC_MULT_FIX2_16(frow0, frow1, irow0, irow1, AB, scale, shift, t0, t1);
+ CALC_MULT_FIX2_16(frow2, frow3, irow2, irow3, AB, scale, shift, t2, t3);
+ PCKEV_B2_UB(t1, t0, t3, t2, t4, t5);
+ t0 = (v16u8)__msa_pckev_b((v16i8)t5, (v16i8)t4);
+ ST_UB(t0, dst);
+ frow += 16;
+ irow += 16;
+ dst += 16;
+ length -= 16;
+ }
+ if (length > 0) {
+ int x_out;
+ if (length >= 12) {
+ uint32_t val0_m, val1_m, val2_m;
+ v4u32 frow0, frow1, frow2, irow0, irow1, irow2;
+ LD_UW3(frow, 4, frow0, frow1, frow2);
+ LD_UW3(irow, 4, irow0, irow1, irow2);
+ CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m);
+ CALC_MULT_FIX2_4(frow1, irow1, AB, scale, shift, val1_m);
+ CALC_MULT_FIX2_4(frow2, irow2, AB, scale, shift, val2_m);
+ SW3(val0_m, val1_m, val2_m, dst, 4);
+ frow += 12;
+ irow += 12;
+ dst += 12;
+ length -= 12;
+ } else if (length >= 8) {
+ uint32_t val0_m, val1_m;
+ v4u32 frow0, frow1, irow0, irow1;
+ LD_UW2(frow, 4, frow0, frow1);
+ LD_UW2(irow, 4, irow0, irow1);
+ CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m);
+ CALC_MULT_FIX2_4(frow1, irow1, AB, scale, shift, val1_m);
+ SW2(val0_m, val1_m, dst, 4);
+ frow += 8;
+ irow += 8;
+ dst += 8;
+ length -= 8;
+ } else if (length >= 4) {
+ uint32_t val0_m;
+ const v4u32 frow0 = LD_UW(frow + 0);
+ const v4u32 irow0 = LD_UW(irow + 0);
+ CALC_MULT_FIX2_4(frow0, irow0, AB, scale, shift, val0_m);
+ SW(val0_m, dst);
+ frow += 4;
+ irow += 4;
+ dst += 4;
+ length -= 4;
+ }
+ for (x_out = 0; x_out < length; ++x_out) {
+ const uint64_t I = (uint64_t)A * frow[x_out]
+ + (uint64_t)B * irow[x_out];
+ const uint32_t J = (uint32_t)((I + ROUNDER) >> WEBP_RESCALER_RFIX);
+ const int v = (int)MULT_FIX(J, wrk->fy_scale);
+ assert(v >= 0 && v <= 255);
+ dst[x_out] = v;
+ }
+ }
+}
+
+static void RescalerExportRowExpand(WebPRescaler* const wrk) {
+ uint8_t* dst = wrk->dst;
+ rescaler_t* irow = wrk->irow;
+ const int x_out_max = wrk->dst_width * wrk->num_channels;
+ const rescaler_t* frow = wrk->frow;
+ assert(!WebPRescalerOutputDone(wrk));
+ assert(wrk->y_accum <= 0);
+ assert(wrk->y_expand);
+ assert(wrk->y_sub != 0);
+ if (wrk->y_accum == 0) {
+ ExportRowExpand_0(frow, dst, x_out_max, wrk);
+ } else {
+ ExportRowExpand_1(frow, irow, dst, x_out_max, wrk);
+ }
+}
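+// Note on the dispatch above: y_accum == 0 means the output row coincides
+// with a source row, so ExportRowExpand_0() only rescales frow[]; otherwise
+// the row blends frow[] and irow[] with the fixed-point weights
+// B = WEBP_RESCALER_FRAC(-y_accum, y_sub) and A = WEBP_RESCALER_ONE - B,
+// exactly as in the scalar tail loop of ExportRowExpand_1().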
+
+static WEBP_INLINE void ExportRowShrink_0(const uint32_t* frow, uint32_t* irow,
+ uint8_t* dst, int length,
+ const uint32_t yscale,
+ WebPRescaler* const wrk) {
+ const v4u32 y_scale = (v4u32)__msa_fill_w(yscale);
+ const v4u32 fxyscale = (v4u32)__msa_fill_w(wrk->fxy_scale);
+ const v4u32 shiftval = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
+ const v4i32 zero = { 0 };
+
+ while (length >= 16) {
+ v4u32 src0, src1, src2, src3, frac0, frac1, frac2, frac3;
+ v16u8 out;
+ LD_UW4(frow, 4, src0, src1, src2, src3);
+ CALC_MULT_FIX1_16(src0, src1, src2, src3, y_scale, shiftval,
+ frac0, frac1, frac2, frac3);
+ LD_UW4(irow, 4, src0, src1, src2, src3);
+ SUB4(src0, frac0, src1, frac1, src2, frac2, src3, frac3,
+ src0, src1, src2, src3);
+ CALC_MULT_FIX_16(src0, src1, src2, src3, fxyscale, shiftval, out);
+ ST_UB(out, dst);
+ ST_UW4(frac0, frac1, frac2, frac3, irow, 4);
+ frow += 16;
+ irow += 16;
+ dst += 16;
+ length -= 16;
+ }
+ if (length > 0) {
+ int x_out;
+ if (length >= 12) {
+ uint32_t val0_m, val1_m, val2_m;
+ v4u32 src0, src1, src2, frac0, frac1, frac2;
+ LD_UW3(frow, 4, src0, src1, src2);
+ CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0);
+ CALC_MULT_FIX1_4(src1, y_scale, shiftval, frac1);
+ CALC_MULT_FIX1_4(src2, y_scale, shiftval, frac2);
+ LD_UW3(irow, 4, src0, src1, src2);
+ SUB3(src0, frac0, src1, frac1, src2, frac2, src0, src1, src2);
+ CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m);
+ CALC_MULT_FIX_4(src1, fxyscale, shiftval, val1_m);
+ CALC_MULT_FIX_4(src2, fxyscale, shiftval, val2_m);
+ SW3(val0_m, val1_m, val2_m, dst, 4);
+ ST_UW3(frac0, frac1, frac2, irow, 4);
+ frow += 12;
+ irow += 12;
+ dst += 12;
+ length -= 12;
+ } else if (length >= 8) {
+ uint32_t val0_m, val1_m;
+ v4u32 src0, src1, frac0, frac1;
+ LD_UW2(frow, 4, src0, src1);
+ CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0);
+ CALC_MULT_FIX1_4(src1, y_scale, shiftval, frac1);
+ LD_UW2(irow, 4, src0, src1);
+ SUB2(src0, frac0, src1, frac1, src0, src1);
+ CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m);
+ CALC_MULT_FIX_4(src1, fxyscale, shiftval, val1_m);
+ SW2(val0_m, val1_m, dst, 4);
+ ST_UW2(frac0, frac1, irow, 4);
+ frow += 8;
+ irow += 8;
+ dst += 8;
+ length -= 8;
+ } else if (length >= 4) {
+ uint32_t val0_m;
+ v4u32 frac0;
+ v4u32 src0 = LD_UW(frow);
+ CALC_MULT_FIX1_4(src0, y_scale, shiftval, frac0);
+ src0 = LD_UW(irow);
+ src0 = src0 - frac0;
+ CALC_MULT_FIX_4(src0, fxyscale, shiftval, val0_m);
+ SW(val0_m, dst);
+ ST_UW(frac0, irow);
+ frow += 4;
+ irow += 4;
+ dst += 4;
+ length -= 4;
+ }
+ for (x_out = 0; x_out < length; ++x_out) {
+ const uint32_t frac = (uint32_t)MULT_FIX(frow[x_out], yscale);
+ const int v = (int)MULT_FIX(irow[x_out] - frac, wrk->fxy_scale);
+ assert(v >= 0 && v <= 255);
+ dst[x_out] = v;
+ irow[x_out] = frac;
+ }
+ }
+}
+
+static WEBP_INLINE void ExportRowShrink_1(uint32_t* irow, uint8_t* dst,
+ int length,
+ WebPRescaler* const wrk) {
+ const v4u32 scale = (v4u32)__msa_fill_w(wrk->fxy_scale);
+ const v4u32 shift = (v4u32)__msa_fill_w(WEBP_RESCALER_RFIX);
+ const v4i32 zero = { 0 };
+
+ while (length >= 16) {
+ v4u32 src0, src1, src2, src3;
+ v16u8 dst0;
+ LD_UW4(irow, 4, src0, src1, src2, src3);
+ CALC_MULT_FIX_16(src0, src1, src2, src3, scale, shift, dst0);
+ ST_UB(dst0, dst);
+ ST_SW4(zero, zero, zero, zero, irow, 4);
+ length -= 16;
+ irow += 16;
+ dst += 16;
+ }
+ if (length > 0) {
+ int x_out;
+ if (length >= 12) {
+ uint32_t val0_m, val1_m, val2_m;
+ v4u32 src0, src1, src2;
+ LD_UW3(irow, 4, src0, src1, src2);
+ CALC_MULT_FIX_4(src0, scale, shift, val0_m);
+ CALC_MULT_FIX_4(src1, scale, shift, val1_m);
+ CALC_MULT_FIX_4(src2, scale, shift, val2_m);
+ SW3(val0_m, val1_m, val2_m, dst, 4);
+ ST_SW3(zero, zero, zero, irow, 4);
+ length -= 12;
+ irow += 12;
+ dst += 12;
+ } else if (length >= 8) {
+ uint32_t val0_m, val1_m;
+ v4u32 src0, src1;
+ LD_UW2(irow, 4, src0, src1);
+ CALC_MULT_FIX_4(src0, scale, shift, val0_m);
+ CALC_MULT_FIX_4(src1, scale, shift, val1_m);
+ SW2(val0_m, val1_m, dst, 4);
+ ST_SW2(zero, zero, irow, 4);
+ length -= 8;
+ irow += 8;
+ dst += 8;
+ } else if (length >= 4) {
+ uint32_t val0_m;
+ const v4u32 src0 = LD_UW(irow + 0);
+ CALC_MULT_FIX_4(src0, scale, shift, val0_m);
+ SW(val0_m, dst);
+ ST_SW(zero, irow);
+ length -= 4;
+ irow += 4;
+ dst += 4;
+ }
+ for (x_out = 0; x_out < length; ++x_out) {
+ const int v = (int)MULT_FIX(irow[x_out], wrk->fxy_scale);
+ assert(v >= 0 && v <= 255);
+ dst[x_out] = v;
+ irow[x_out] = 0;
+ }
+ }
+}
+
+static void RescalerExportRowShrink(WebPRescaler* const wrk) {
+ uint8_t* dst = wrk->dst;
+ rescaler_t* irow = wrk->irow;
+ const int x_out_max = wrk->dst_width * wrk->num_channels;
+ const rescaler_t* frow = wrk->frow;
+ const uint32_t yscale = wrk->fy_scale * (-wrk->y_accum);
+ assert(!WebPRescalerOutputDone(wrk));
+ assert(wrk->y_accum <= 0);
+ assert(!wrk->y_expand);
+ if (yscale) {
+ ExportRowShrink_0(frow, irow, dst, x_out_max, yscale, wrk);
+ } else {
+ ExportRowShrink_1(irow, dst, x_out_max, wrk);
+ }
+}
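+// Similarly for the shrink path: yscale == 0 (i.e. y_accum == 0) means the
+// accumulator row irow[] already holds the full sum for this output row, so
+// ExportRowShrink_1() only scales it by fxy_scale and clears it. Otherwise
+// ExportRowShrink_0() keeps frac = frow[] * yscale in irow[] as the
+// carry-over for the next output row.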
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern void WebPRescalerDspInitMSA(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInitMSA(void) {
+ WebPRescalerExportRowExpand = RescalerExportRowExpand;
+ WebPRescalerExportRowShrink = RescalerExportRowShrink;
+}
+
+#else // !WEBP_USE_MSA
+
+WEBP_DSP_INIT_STUB(WebPRescalerDspInitMSA)
+
+#endif // WEBP_USE_MSA
diff --git a/src/3rdparty/libwebp/src/dsp/rescaler_neon.c b/src/3rdparty/libwebp/src/dsp/rescaler_neon.c
index 16fd450..b2dd8f3 100644
--- a/src/3rdparty/libwebp/src/dsp/rescaler_neon.c
+++ b/src/3rdparty/libwebp/src/dsp/rescaler_neon.c
@@ -18,7 +18,7 @@
#include <arm_neon.h>
#include <assert.h>
#include "./neon.h"
-#include "../utils/rescaler.h"
+#include "../utils/rescaler_utils.h"
#define ROUNDER (WEBP_RESCALER_ONE >> 1)
#define MULT_FIX_C(x, y) (((uint64_t)(x) * (y) + ROUNDER) >> WEBP_RESCALER_RFIX)
diff --git a/src/3rdparty/libwebp/src/dsp/rescaler_sse2.c b/src/3rdparty/libwebp/src/dsp/rescaler_sse2.c
index 5b97028..8271c22 100644
--- a/src/3rdparty/libwebp/src/dsp/rescaler_sse2.c
+++ b/src/3rdparty/libwebp/src/dsp/rescaler_sse2.c
@@ -17,7 +17,7 @@
#include <emmintrin.h>
#include <assert.h>
-#include "../utils/rescaler.h"
+#include "../utils/rescaler_utils.h"
#include "../utils/utils.h"
//------------------------------------------------------------------------------
diff --git a/src/3rdparty/libwebp/src/dsp/upsampling.c b/src/3rdparty/libwebp/src/dsp/upsampling.c
index 651274f..265e722 100644
--- a/src/3rdparty/libwebp/src/dsp/upsampling.c
+++ b/src/3rdparty/libwebp/src/dsp/upsampling.c
@@ -215,6 +215,7 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitYUV444Converters(void) {
extern void WebPInitUpsamplersSSE2(void);
extern void WebPInitUpsamplersNEON(void);
extern void WebPInitUpsamplersMIPSdspR2(void);
+extern void WebPInitUpsamplersMSA(void);
static volatile VP8CPUInfo upsampling_last_cpuinfo_used2 =
(VP8CPUInfo)&upsampling_last_cpuinfo_used2;
@@ -252,6 +253,11 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplers(void) {
WebPInitUpsamplersMIPSdspR2();
}
#endif
+#if defined(WEBP_USE_MSA)
+ if (VP8GetCPUInfo(kMSA)) {
+ WebPInitUpsamplersMSA();
+ }
+#endif
}
#endif // FANCY_UPSAMPLING
upsampling_last_cpuinfo_used2 = VP8GetCPUInfo;
diff --git a/src/3rdparty/libwebp/src/dsp/upsampling_msa.c b/src/3rdparty/libwebp/src/dsp/upsampling_msa.c
new file mode 100644
index 0000000..f24926f
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/upsampling_msa.c
@@ -0,0 +1,678 @@
+// Copyright 2016 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// MSA version of YUV to RGB upsampling functions.
+//
+// Author: Prashant Patil (prashant.patil@imgtec.com)
+
+#include <string.h>
+#include "./dsp.h"
+
+#if defined(WEBP_USE_MSA)
+
+#include "./msa_macro.h"
+#include "./yuv.h"
+
+#ifdef FANCY_UPSAMPLING
+
+#define ILVR_UW2(in, out0, out1) do { \
+ const v8i16 t0 = (v8i16)__msa_ilvr_b((v16i8)zero, (v16i8)in); \
+ out0 = (v4u32)__msa_ilvr_h((v8i16)zero, t0); \
+ out1 = (v4u32)__msa_ilvl_h((v8i16)zero, t0); \
+} while (0)
+
+#define ILVRL_UW4(in, out0, out1, out2, out3) do { \
+ v16u8 t0, t1; \
+ ILVRL_B2_UB(zero, in, t0, t1); \
+ ILVRL_H2_UW(zero, t0, out0, out1); \
+ ILVRL_H2_UW(zero, t1, out2, out3); \
+} while (0)
+
+#define MULTHI_16(in0, in1, in2, in3, cnst, out0, out1) do { \
+ const v4i32 const0 = (v4i32)__msa_fill_w(cnst * 256); \
+ v4u32 temp0, temp1, temp2, temp3; \
+ MUL4(in0, const0, in1, const0, in2, const0, in3, const0, \
+ temp0, temp1, temp2, temp3); \
+ PCKOD_H2_UH(temp1, temp0, temp3, temp2, out0, out1); \
+} while (0)
+
+#define MULTHI_8(in0, in1, cnst, out0) do { \
+ const v4i32 const0 = (v4i32)__msa_fill_w(cnst * 256); \
+ v4u32 temp0, temp1; \
+ MUL2(in0, const0, in1, const0, temp0, temp1); \
+ out0 = (v8u16)__msa_pckod_h((v8i16)temp1, (v8i16)temp0); \
+} while (0)
+
+#define CALC_R16(y0, y1, v0, v1, dst) do { \
+ const v8i16 const_a = (v8i16)__msa_fill_h(14234); \
+ const v8i16 a0 = __msa_adds_s_h((v8i16)y0, (v8i16)v0); \
+ const v8i16 a1 = __msa_adds_s_h((v8i16)y1, (v8i16)v1); \
+ v8i16 b0 = __msa_subs_s_h(a0, const_a); \
+ v8i16 b1 = __msa_subs_s_h(a1, const_a); \
+ SRAI_H2_SH(b0, b1, 6); \
+ CLIP_SH2_0_255(b0, b1); \
+ dst = (v16u8)__msa_pckev_b((v16i8)b1, (v16i8)b0); \
+} while (0)
+
+#define CALC_R8(y0, v0, dst) do { \
+ const v8i16 const_a = (v8i16)__msa_fill_h(14234); \
+ const v8i16 a0 = __msa_adds_s_h((v8i16)y0, (v8i16)v0); \
+ v8i16 b0 = __msa_subs_s_h(a0, const_a); \
+ b0 = SRAI_H(b0, 6); \
+ CLIP_SH_0_255(b0); \
+ dst = (v16u8)__msa_pckev_b((v16i8)b0, (v16i8)b0); \
+} while (0)
+
+#define CALC_G16(y0, y1, u0, u1, v0, v1, dst) do { \
+ const v8i16 const_a = (v8i16)__msa_fill_h(8708); \
+ v8i16 a0 = __msa_subs_s_h((v8i16)y0, (v8i16)u0); \
+ v8i16 a1 = __msa_subs_s_h((v8i16)y1, (v8i16)u1); \
+ const v8i16 b0 = __msa_subs_s_h(a0, (v8i16)v0); \
+ const v8i16 b1 = __msa_subs_s_h(a1, (v8i16)v1); \
+ a0 = __msa_adds_s_h(b0, const_a); \
+ a1 = __msa_adds_s_h(b1, const_a); \
+ SRAI_H2_SH(a0, a1, 6); \
+ CLIP_SH2_0_255(a0, a1); \
+ dst = (v16u8)__msa_pckev_b((v16i8)a1, (v16i8)a0); \
+} while (0)
+
+#define CALC_G8(y0, u0, v0, dst) do { \
+ const v8i16 const_a = (v8i16)__msa_fill_h(8708); \
+ v8i16 a0 = __msa_subs_s_h((v8i16)y0, (v8i16)u0); \
+ const v8i16 b0 = __msa_subs_s_h(a0, (v8i16)v0); \
+ a0 = __msa_adds_s_h(b0, const_a); \
+ a0 = SRAI_H(a0, 6); \
+ CLIP_SH_0_255(a0); \
+ dst = (v16u8)__msa_pckev_b((v16i8)a0, (v16i8)a0); \
+} while (0)
+
+#define CALC_B16(y0, y1, u0, u1, dst) do { \
+ const v8u16 const_a = (v8u16)__msa_fill_h(17685); \
+ const v8u16 a0 = __msa_adds_u_h((v8u16)y0, u0); \
+ const v8u16 a1 = __msa_adds_u_h((v8u16)y1, u1); \
+ v8u16 b0 = __msa_subs_u_h(a0, const_a); \
+ v8u16 b1 = __msa_subs_u_h(a1, const_a); \
+ SRAI_H2_UH(b0, b1, 6); \
+ CLIP_UH2_0_255(b0, b1); \
+ dst = (v16u8)__msa_pckev_b((v16i8)b1, (v16i8)b0); \
+} while (0)
+
+#define CALC_B8(y0, u0, dst) do { \
+ const v8u16 const_a = (v8u16)__msa_fill_h(17685); \
+ const v8u16 a0 = __msa_adds_u_h((v8u16)y0, u0); \
+ v8u16 b0 = __msa_subs_u_h(a0, const_a); \
+ b0 = SRAI_H(b0, 6); \
+ CLIP_UH_0_255(b0); \
+ dst = (v16u8)__msa_pckev_b((v16i8)b0, (v16i8)b0); \
+} while (0)
+
+#define CALC_RGB16(y, u, v, R, G, B) do { \
+ const v16u8 zero = { 0 }; \
+ v8u16 y0, y1, u0, u1, v0, v1; \
+ v4u32 p0, p1, p2, p3; \
+ const v16u8 in_y = LD_UB(y); \
+ const v16u8 in_u = LD_UB(u); \
+ const v16u8 in_v = LD_UB(v); \
+ ILVRL_UW4(in_y, p0, p1, p2, p3); \
+ MULTHI_16(p0, p1, p2, p3, 19077, y0, y1); \
+ ILVRL_UW4(in_v, p0, p1, p2, p3); \
+ MULTHI_16(p0, p1, p2, p3, 26149, v0, v1); \
+ CALC_R16(y0, y1, v0, v1, R); \
+ MULTHI_16(p0, p1, p2, p3, 13320, v0, v1); \
+ ILVRL_UW4(in_u, p0, p1, p2, p3); \
+ MULTHI_16(p0, p1, p2, p3, 6419, u0, u1); \
+ CALC_G16(y0, y1, u0, u1, v0, v1, G); \
+ MULTHI_16(p0, p1, p2, p3, 33050, u0, u1); \
+ CALC_B16(y0, y1, u0, u1, B); \
+} while (0)
+
+#define CALC_RGB8(y, u, v, R, G, B) do { \
+ const v16u8 zero = { 0 }; \
+ v8u16 y0, u0, v0; \
+ v4u32 p0, p1; \
+ const v16u8 in_y = LD_UB(y); \
+ const v16u8 in_u = LD_UB(u); \
+ const v16u8 in_v = LD_UB(v); \
+ ILVR_UW2(in_y, p0, p1); \
+ MULTHI_8(p0, p1, 19077, y0); \
+ ILVR_UW2(in_v, p0, p1); \
+ MULTHI_8(p0, p1, 26149, v0); \
+ CALC_R8(y0, v0, R); \
+ MULTHI_8(p0, p1, 13320, v0); \
+ ILVR_UW2(in_u, p0, p1); \
+ MULTHI_8(p0, p1, 6419, u0); \
+ CALC_G8(y0, u0, v0, G); \
+ MULTHI_8(p0, p1, 33050, u0); \
+ CALC_B8(y0, u0, B); \
+} while (0)
+
+#define STORE16_3(a0, a1, a2, dst) do { \
+ const v16u8 mask0 = { 0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19, \
+ 8, 9, 20, 10 }; \
+ const v16u8 mask1 = { 0, 21, 1, 2, 22, 3, 4, 23, 5, 6, 24, 7, \
+ 8, 25, 9, 10 }; \
+ const v16u8 mask2 = { 26, 0, 1, 27, 2, 3, 28, 4, 5, 29, 6, 7, \
+ 30, 8, 9, 31 }; \
+ v16u8 out0, out1, out2, tmp0, tmp1, tmp2; \
+ ILVRL_B2_UB(a1, a0, tmp0, tmp1); \
+ out0 = VSHF_UB(tmp0, a2, mask0); \
+ tmp2 = SLDI_UB(tmp1, tmp0, 11); \
+ out1 = VSHF_UB(tmp2, a2, mask1); \
+ tmp2 = SLDI_UB(tmp1, tmp1, 6); \
+ out2 = VSHF_UB(tmp2, a2, mask2); \
+ ST_UB(out0, dst + 0); \
+ ST_UB(out1, dst + 16); \
+ ST_UB(out2, dst + 32); \
+} while (0)
+
+#define STORE8_3(a0, a1, a2, dst) do { \
+ int64_t out_m; \
+ const v16u8 mask0 = { 0, 1, 16, 2, 3, 17, 4, 5, 18, 6, 7, 19, \
+ 8, 9, 20, 10 }; \
+ const v16u8 mask1 = { 11, 21, 12, 13, 22, 14, 15, 23, \
+ 255, 255, 255, 255, 255, 255, 255, 255 }; \
+ const v16u8 tmp0 = (v16u8)__msa_ilvr_b((v16i8)a1, (v16i8)a0); \
+ v16u8 out0, out1; \
+ VSHF_B2_UB(tmp0, a2, tmp0, a2, mask0, mask1, out0, out1); \
+ ST_UB(out0, dst); \
+ out_m = __msa_copy_s_d((v2i64)out1, 0); \
+ SD(out_m, dst + 16); \
+} while (0)
+
+#define STORE16_4(a0, a1, a2, a3, dst) do { \
+ v16u8 tmp0, tmp1, tmp2, tmp3; \
+ v16u8 out0, out1, out2, out3; \
+ ILVRL_B2_UB(a1, a0, tmp0, tmp1); \
+ ILVRL_B2_UB(a3, a2, tmp2, tmp3); \
+ ILVRL_H2_UB(tmp2, tmp0, out0, out1); \
+ ILVRL_H2_UB(tmp3, tmp1, out2, out3); \
+ ST_UB(out0, dst + 0); \
+ ST_UB(out1, dst + 16); \
+ ST_UB(out2, dst + 32); \
+ ST_UB(out3, dst + 48); \
+} while (0)
+
+#define STORE8_4(a0, a1, a2, a3, dst) do { \
+ v16u8 tmp0, tmp1, tmp2, tmp3; \
+ ILVR_B2_UB(a1, a0, a3, a2, tmp0, tmp1); \
+ ILVRL_H2_UB(tmp1, tmp0, tmp2, tmp3); \
+ ST_UB(tmp2, dst + 0); \
+ ST_UB(tmp3, dst + 16); \
+} while (0)
+
+#define STORE2_16(a0, a1, dst) do { \
+ v16u8 out0, out1; \
+ ILVRL_B2_UB(a1, a0, out0, out1); \
+ ST_UB(out0, dst + 0); \
+ ST_UB(out1, dst + 16); \
+} while (0)
+
+#define STORE2_8(a0, a1, dst) do { \
+ const v16u8 out0 = (v16u8)__msa_ilvr_b((v16i8)a1, (v16i8)a0); \
+ ST_UB(out0, dst); \
+} while (0)
+
+#define CALC_RGBA4444(y, u, v, out0, out1, N, dst) do { \
+ CALC_RGB##N(y, u, v, R, G, B); \
+ tmp0 = ANDI_B(R, 0xf0); \
+ tmp1 = SRAI_B(G, 4); \
+ RG = tmp0 | tmp1; \
+ tmp0 = ANDI_B(B, 0xf0); \
+ BA = ORI_B(tmp0, 0x0f); \
+ STORE2_##N(out0, out1, dst); \
+} while (0)
+
+#define CALC_RGB565(y, u, v, out0, out1, N, dst) do { \
+ CALC_RGB##N(y, u, v, R, G, B); \
+ tmp0 = ANDI_B(R, 0xf8); \
+ tmp1 = SRAI_B(G, 5); \
+ RG = tmp0 | tmp1; \
+ tmp0 = SLLI_B(G, 3); \
+ tmp1 = ANDI_B(tmp0, 0xe0); \
+ tmp0 = SRAI_B(B, 3); \
+ GB = tmp0 | tmp1; \
+ STORE2_##N(out0, out1, dst); \
+} while (0)
+
+static WEBP_INLINE int Clip8(int v) {
+ return v < 0 ? 0 : v > 255 ? 255 : v;
+}
+
+static void YuvToRgb(int y, int u, int v, uint8_t* const rgb) {
+ const int y1 = MultHi(y, 19077);
+ const int r1 = y1 + MultHi(v, 26149) - 14234;
+ const int g1 = y1 - MultHi(u, 6419) - MultHi(v, 13320) + 8708;
+ const int b1 = y1 + MultHi(u, 33050) - 17685;
+ rgb[0] = Clip8(r1 >> 6);
+ rgb[1] = Clip8(g1 >> 6);
+ rgb[2] = Clip8(b1 >> 6);
+}
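+// Quick sanity check of the fixed-point math above, assuming
+// MultHi(v, c) == (v * c) >> 8 (see yuv.h): for mid-gray y = u = v = 128,
+// y1 = (128 * 19077) >> 8 = 9538 and r1 = 9538 + 13074 - 14234 = 8378, so
+// rgb[0] = Clip8(8378 >> 6) = 130, matching 1.164 * (128 - 16) ~= 130.4.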
+
+static void YuvToBgr(int y, int u, int v, uint8_t* const bgr) {
+ const int y1 = MultHi(y, 19077);
+ const int r1 = y1 + MultHi(v, 26149) - 14234;
+ const int g1 = y1 - MultHi(u, 6419) - MultHi(v, 13320) + 8708;
+ const int b1 = y1 + MultHi(u, 33050) - 17685;
+ bgr[0] = Clip8(b1 >> 6);
+ bgr[1] = Clip8(g1 >> 6);
+ bgr[2] = Clip8(r1 >> 6);
+}
+
+static void YuvToRgb565(int y, int u, int v, uint8_t* const rgb) {
+ const int y1 = MultHi(y, 19077);
+ const int r1 = y1 + MultHi(v, 26149) - 14234;
+ const int g1 = y1 - MultHi(u, 6419) - MultHi(v, 13320) + 8708;
+ const int b1 = y1 + MultHi(u, 33050) - 17685;
+ const int r = Clip8(r1 >> 6);
+ const int g = Clip8(g1 >> 6);
+ const int b = Clip8(b1 >> 6);
+ const int rg = (r & 0xf8) | (g >> 5);
+ const int gb = ((g << 3) & 0xe0) | (b >> 3);
+#ifdef WEBP_SWAP_16BIT_CSP
+ rgb[0] = gb;
+ rgb[1] = rg;
+#else
+ rgb[0] = rg;
+ rgb[1] = gb;
+#endif
+}
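+// Packing example for the scalar path above: r = g = b = 130 (0x82) gives
+// rg = (0x82 & 0xf8) | (0x82 >> 5) = 0x84 and
+// gb = ((0x82 << 3) & 0xe0) | (0x82 >> 3) = 0x10, i.e. the RGB565
+// components R5 = 16, G6 = 32, B5 = 16.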
+
+static void YuvToRgba4444(int y, int u, int v, uint8_t* const argb) {
+ const int y1 = MultHi(y, 19077);
+ const int r1 = y1 + MultHi(v, 26149) - 14234;
+ const int g1 = y1 - MultHi(u, 6419) - MultHi(v, 13320) + 8708;
+ const int b1 = y1 + MultHi(u, 33050) - 17685;
+ const int r = Clip8(r1 >> 6);
+ const int g = Clip8(g1 >> 6);
+ const int b = Clip8(b1 >> 6);
+ const int rg = (r & 0xf0) | (g >> 4);
+ const int ba = (b & 0xf0) | 0x0f; // overwrite the lower 4 bits
+#ifdef WEBP_SWAP_16BIT_CSP
+ argb[0] = ba;
+ argb[1] = rg;
+#else
+ argb[0] = rg;
+ argb[1] = ba;
+#endif
+}
+
+static void YuvToArgb(uint8_t y, uint8_t u, uint8_t v, uint8_t* const argb) {
+ argb[0] = 0xff;
+ YuvToRgb(y, u, v, argb + 1);
+}
+
+static void YuvToBgra(uint8_t y, uint8_t u, uint8_t v, uint8_t* const bgra) {
+ YuvToBgr(y, u, v, bgra);
+ bgra[3] = 0xff;
+}
+
+static void YuvToRgba(uint8_t y, uint8_t u, uint8_t v, uint8_t* const rgba) {
+ YuvToRgb(y, u, v, rgba);
+ rgba[3] = 0xff;
+}
+
+static void YuvToRgbLine(const uint8_t* y, const uint8_t* u,
+ const uint8_t* v, uint8_t* dst, int length) {
+ v16u8 R, G, B;
+ while (length >= 16) {
+ CALC_RGB16(y, u, v, R, G, B);
+ STORE16_3(R, G, B, dst);
+ y += 16;
+ u += 16;
+ v += 16;
+ dst += 16 * 3;
+ length -= 16;
+ }
+ if (length > 8) {
+ uint8_t temp[3 * 16] = { 0 };
+ memcpy(temp, y, length * sizeof(*temp));
+ CALC_RGB16(temp, u, v, R, G, B);
+ STORE16_3(R, G, B, temp);
+ memcpy(dst, temp, length * 3 * sizeof(*dst));
+ } else if (length > 0) {
+ uint8_t temp[3 * 8] = { 0 };
+ memcpy(temp, y, length * sizeof(*temp));
+ CALC_RGB8(temp, u, v, R, G, B);
+ STORE8_3(R, G, B, temp);
+ memcpy(dst, temp, length * 3 * sizeof(*dst));
+ }
+}
+
+static void YuvToBgrLine(const uint8_t* y, const uint8_t* u,
+ const uint8_t* v, uint8_t* dst, int length) {
+ v16u8 R, G, B;
+ while (length >= 16) {
+ CALC_RGB16(y, u, v, R, G, B);
+ STORE16_3(B, G, R, dst);
+ y += 16;
+ u += 16;
+ v += 16;
+ dst += 16 * 3;
+ length -= 16;
+ }
+ if (length > 8) {
+ uint8_t temp[3 * 16] = { 0 };
+ memcpy(temp, y, length * sizeof(*temp));
+ CALC_RGB16(temp, u, v, R, G, B);
+ STORE16_3(B, G, R, temp);
+ memcpy(dst, temp, length * 3 * sizeof(*dst));
+ } else if (length > 0) {
+ uint8_t temp[3 * 8] = { 0 };
+ memcpy(temp, y, length * sizeof(*temp));
+ CALC_RGB8(temp, u, v, R, G, B);
+ STORE8_3(B, G, R, temp);
+ memcpy(dst, temp, length * 3 * sizeof(*dst));
+ }
+}
+
+static void YuvToRgbaLine(const uint8_t* y, const uint8_t* u,
+ const uint8_t* v, uint8_t* dst, int length) {
+ v16u8 R, G, B;
+ const v16u8 A = (v16u8)__msa_ldi_b(0xff);
+ while (length >= 16) {
+ CALC_RGB16(y, u, v, R, G, B);
+ STORE16_4(R, G, B, A, dst);
+ y += 16;
+ u += 16;
+ v += 16;
+ dst += 16 * 4;
+ length -= 16;
+ }
+ if (length > 8) {
+ uint8_t temp[4 * 16] = { 0 };
+ memcpy(temp, y, length * sizeof(*temp));
+ CALC_RGB16(&temp[0], u, v, R, G, B);
+ STORE16_4(R, G, B, A, temp);
+ memcpy(dst, temp, length * 4 * sizeof(*dst));
+ } else if (length > 0) {
+ uint8_t temp[4 * 8] = { 0 };
+ memcpy(temp, y, length * sizeof(*temp));
+ CALC_RGB8(temp, u, v, R, G, B);
+ STORE8_4(R, G, B, A, temp);
+ memcpy(dst, temp, length * 4 * sizeof(*dst));
+ }
+}
+
+static void YuvToBgraLine(const uint8_t* y, const uint8_t* u,
+ const uint8_t* v, uint8_t* dst, int length) {
+ v16u8 R, G, B;
+ const v16u8 A = (v16u8)__msa_ldi_b(0xff);
+ while (length >= 16) {
+ CALC_RGB16(y, u, v, R, G, B);
+ STORE16_4(B, G, R, A, dst);
+ y += 16;
+ u += 16;
+ v += 16;
+ dst += 16 * 4;
+ length -= 16;
+ }
+ if (length > 8) {
+ uint8_t temp[4 * 16] = { 0 };
+ memcpy(temp, y, length * sizeof(*temp));
+ CALC_RGB16(temp, u, v, R, G, B);
+ STORE16_4(B, G, R, A, temp);
+ memcpy(dst, temp, length * 4 * sizeof(*dst));
+ } else if (length > 0) {
+ uint8_t temp[4 * 8] = { 0 };
+ memcpy(temp, y, length * sizeof(*temp));
+ CALC_RGB8(temp, u, v, R, G, B);
+ STORE8_4(B, G, R, A, temp);
+ memcpy(dst, temp, length * 4 * sizeof(*dst));
+ }
+}
+
+static void YuvToArgbLine(const uint8_t* y, const uint8_t* u,
+ const uint8_t* v, uint8_t* dst, int length) {
+ v16u8 R, G, B;
+ const v16u8 A = (v16u8)__msa_ldi_b(0xff);
+ while (length >= 16) {
+ CALC_RGB16(y, u, v, R, G, B);
+ STORE16_4(A, R, G, B, dst);
+ y += 16;
+ u += 16;
+ v += 16;
+ dst += 16 * 4;
+ length -= 16;
+ }
+ if (length > 8) {
+ uint8_t temp[4 * 16] = { 0 };
+ memcpy(temp, y, length * sizeof(*temp));
+ CALC_RGB16(temp, u, v, R, G, B);
+ STORE16_4(A, R, G, B, temp);
+ memcpy(dst, temp, length * 4 * sizeof(*dst));
+ } else if (length > 0) {
+ uint8_t temp[4 * 8] = { 0 };
+ memcpy(temp, y, length * sizeof(*temp));
+ CALC_RGB8(temp, u, v, R, G, B);
+ STORE8_4(A, R, G, B, temp);
+ memcpy(dst, temp, length * 4 * sizeof(*dst));
+ }
+}
+
+static void YuvToRgba4444Line(const uint8_t* y, const uint8_t* u,
+ const uint8_t* v, uint8_t* dst, int length) {
+ v16u8 R, G, B, RG, BA, tmp0, tmp1;
+ while (length >= 16) {
+#ifdef WEBP_SWAP_16BIT_CSP
+ CALC_RGBA4444(y, u, v, BA, RG, 16, dst);
+#else
+ CALC_RGBA4444(y, u, v, RG, BA, 16, dst);
+#endif
+ y += 16;
+ u += 16;
+ v += 16;
+ dst += 16 * 2;
+ length -= 16;
+ }
+ if (length > 8) {
+ uint8_t temp[2 * 16] = { 0 };
+ memcpy(temp, y, length * sizeof(*temp));
+#ifdef WEBP_SWAP_16BIT_CSP
+ CALC_RGBA4444(temp, u, v, BA, RG, 16, temp);
+#else
+ CALC_RGBA4444(temp, u, v, RG, BA, 16, temp);
+#endif
+ memcpy(dst, temp, length * 2 * sizeof(*dst));
+ } else if (length > 0) {
+ uint8_t temp[2 * 8] = { 0 };
+ memcpy(temp, y, length * sizeof(*temp));
+#ifdef WEBP_SWAP_16BIT_CSP
+ CALC_RGBA4444(temp, u, v, BA, RG, 8, temp);
+#else
+ CALC_RGBA4444(temp, u, v, RG, BA, 8, temp);
+#endif
+ memcpy(dst, temp, length * 2 * sizeof(*dst));
+ }
+}
+
+static void YuvToRgb565Line(const uint8_t* y, const uint8_t* u,
+ const uint8_t* v, uint8_t* dst, int length) {
+ v16u8 R, G, B, RG, GB, tmp0, tmp1;
+ while (length >= 16) {
+#ifdef WEBP_SWAP_16BIT_CSP
+ CALC_RGB565(y, u, v, GB, RG, 16, dst);
+#else
+ CALC_RGB565(y, u, v, RG, GB, 16, dst);
+#endif
+ y += 16;
+ u += 16;
+ v += 16;
+ dst += 16 * 2;
+ length -= 16;
+ }
+ if (length > 8) {
+ uint8_t temp[2 * 16] = { 0 };
+ memcpy(temp, y, length * sizeof(*temp));
+#ifdef WEBP_SWAP_16BIT_CSP
+ CALC_RGB565(temp, u, v, GB, RG, 16, temp);
+#else
+ CALC_RGB565(temp, u, v, RG, GB, 16, temp);
+#endif
+ memcpy(dst, temp, length * 2 * sizeof(*dst));
+ } else if (length > 0) {
+ uint8_t temp[2 * 8] = { 0 };
+ memcpy(temp, y, length * sizeof(*temp));
+#ifdef WEBP_SWAP_16BIT_CSP
+ CALC_RGB565(temp, u, v, GB, RG, 8, temp);
+#else
+ CALC_RGB565(temp, u, v, RG, GB, 8, temp);
+#endif
+ memcpy(dst, temp, length * 2 * sizeof(*dst));
+ }
+}
+
+#define UPSAMPLE_32PIXELS(a, b, c, d) do { \
+ v16u8 s = __msa_aver_u_b(a, d); \
+ v16u8 t = __msa_aver_u_b(b, c); \
+ const v16u8 st = s ^ t; \
+ v16u8 ad = a ^ d; \
+ v16u8 bc = b ^ c; \
+ v16u8 t0 = ad | bc; \
+ v16u8 t1 = t0 | st; \
+ v16u8 t2 = ANDI_B(t1, 1); \
+ v16u8 t3 = __msa_aver_u_b(s, t); \
+ const v16u8 k = t3 - t2; \
+ v16u8 diag1, diag2; \
+ AVER_UB2_UB(t, k, s, k, t0, t1); \
+ bc = bc & st; \
+ ad = ad & st; \
+ t = t ^ k; \
+ s = s ^ k; \
+ t2 = bc | t; \
+ t3 = ad | s; \
+ t2 = ANDI_B(t2, 1); \
+ t3 = ANDI_B(t3, 1); \
+ SUB2(t0, t2, t1, t3, diag1, diag2); \
+ AVER_UB2_UB(a, diag1, b, diag2, t0, t1); \
+ ILVRL_B2_UB(t1, t0, a, b); \
+ if (pbot_y != NULL) { \
+ AVER_UB2_UB(c, diag2, d, diag1, t0, t1); \
+ ILVRL_B2_UB(t1, t0, c, d); \
+ } \
+} while (0)
+
+#define UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \
+static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bot_y, \
+ const uint8_t* top_u, const uint8_t* top_v, \
+ const uint8_t* cur_u, const uint8_t* cur_v, \
+ uint8_t* top_dst, uint8_t* bot_dst, int len) \
+{ \
+ int size = (len - 1) >> 1; \
+ uint8_t temp_u[64]; \
+ uint8_t temp_v[64]; \
+ const uint32_t tl_uv = ((top_u[0]) | ((top_v[0]) << 16)); \
+ const uint32_t l_uv = ((cur_u[0]) | ((cur_v[0]) << 16)); \
+ const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2; \
+ const uint8_t* ptop_y = &top_y[1]; \
+ uint8_t *ptop_dst = top_dst + XSTEP; \
+ const uint8_t* pbot_y = &bot_y[1]; \
+ uint8_t *pbot_dst = bot_dst + XSTEP; \
+ \
+ FUNC(top_y[0], uv0 & 0xff, (uv0 >> 16), top_dst); \
+ if (bot_y != NULL) { \
+ const uint32_t uv1 = (3 * l_uv + tl_uv + 0x00020002u) >> 2; \
+ FUNC(bot_y[0], uv1 & 0xff, (uv1 >> 16), bot_dst); \
+ } \
+ while (size >= 16) { \
+ v16u8 tu0, tu1, tv0, tv1, cu0, cu1, cv0, cv1; \
+ LD_UB2(top_u, 1, tu0, tu1); \
+ LD_UB2(cur_u, 1, cu0, cu1); \
+ LD_UB2(top_v, 1, tv0, tv1); \
+ LD_UB2(cur_v, 1, cv0, cv1); \
+ UPSAMPLE_32PIXELS(tu0, tu1, cu0, cu1); \
+ UPSAMPLE_32PIXELS(tv0, tv1, cv0, cv1); \
+ ST_UB4(tu0, tu1, cu0, cu1, &temp_u[0], 16); \
+ ST_UB4(tv0, tv1, cv0, cv1, &temp_v[0], 16); \
+ FUNC##Line(ptop_y, &temp_u[ 0], &temp_v[0], ptop_dst, 32); \
+ if (bot_y != NULL) { \
+ FUNC##Line(pbot_y, &temp_u[32], &temp_v[32], pbot_dst, 32); \
+ } \
+ ptop_y += 32; \
+ pbot_y += 32; \
+ ptop_dst += XSTEP * 32; \
+ pbot_dst += XSTEP * 32; \
+ top_u += 16; \
+ top_v += 16; \
+ cur_u += 16; \
+ cur_v += 16; \
+ size -= 16; \
+ } \
+ if (size > 0) { \
+ v16u8 tu0, tu1, tv0, tv1, cu0, cu1, cv0, cv1; \
+ memcpy(&temp_u[ 0], top_u, 17 * sizeof(uint8_t)); \
+ memcpy(&temp_u[32], cur_u, 17 * sizeof(uint8_t)); \
+ memcpy(&temp_v[ 0], top_v, 17 * sizeof(uint8_t)); \
+ memcpy(&temp_v[32], cur_v, 17 * sizeof(uint8_t)); \
+ LD_UB2(&temp_u[ 0], 1, tu0, tu1); \
+ LD_UB2(&temp_u[32], 1, cu0, cu1); \
+ LD_UB2(&temp_v[ 0], 1, tv0, tv1); \
+ LD_UB2(&temp_v[32], 1, cv0, cv1); \
+ UPSAMPLE_32PIXELS(tu0, tu1, cu0, cu1); \
+ UPSAMPLE_32PIXELS(tv0, tv1, cv0, cv1); \
+ ST_UB4(tu0, tu1, cu0, cu1, &temp_u[0], 16); \
+ ST_UB4(tv0, tv1, cv0, cv1, &temp_v[0], 16); \
+ FUNC##Line(ptop_y, &temp_u[ 0], &temp_v[0], ptop_dst, size * 2); \
+ if (bot_y != NULL) { \
+ FUNC##Line(pbot_y, &temp_u[32], &temp_v[32], pbot_dst, size * 2); \
+ } \
+ top_u += size; \
+ top_v += size; \
+ cur_u += size; \
+ cur_v += size; \
+ } \
+ if (!(len & 1)) { \
+ const uint32_t t0 = ((top_u[0]) | ((top_v[0]) << 16)); \
+ const uint32_t c0 = ((cur_u[0]) | ((cur_v[0]) << 16)); \
+ const uint32_t tmp0 = (3 * t0 + c0 + 0x00020002u) >> 2; \
+ FUNC(top_y[len - 1], tmp0 & 0xff, (tmp0 >> 16), \
+ top_dst + (len - 1) * XSTEP); \
+ if (bot_y != NULL) { \
+ const uint32_t tmp1 = (3 * c0 + t0 + 0x00020002u) >> 2; \
+ FUNC(bot_y[len - 1], tmp1 & 0xff, (tmp1 >> 16), \
+ bot_dst + (len - 1) * XSTEP); \
+ } \
+ } \
+}
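+// In the macro above, the left/right border pixels use 2-tap (3, 1) / 4
+// chroma weights. u and v travel together in one 32-bit word (u in the low
+// half, v in the high half), so 0x00020002u adds the '+2' rounding term to
+// both lanes at once; 3 * 255 + 255 + 2 = 1022 fits in 16 bits, so the u
+// lane never carries into the v lane.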
+
+UPSAMPLE_FUNC(UpsampleRgbLinePair, YuvToRgb, 3)
+UPSAMPLE_FUNC(UpsampleBgrLinePair, YuvToBgr, 3)
+UPSAMPLE_FUNC(UpsampleRgbaLinePair, YuvToRgba, 4)
+UPSAMPLE_FUNC(UpsampleBgraLinePair, YuvToBgra, 4)
+UPSAMPLE_FUNC(UpsampleArgbLinePair, YuvToArgb, 4)
+UPSAMPLE_FUNC(UpsampleRgba4444LinePair, YuvToRgba4444, 2)
+UPSAMPLE_FUNC(UpsampleRgb565LinePair, YuvToRgb565, 2)
+
+//------------------------------------------------------------------------------
+// Entry point
+
+extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
+
+extern void WebPInitUpsamplersMSA(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersMSA(void) {
+ WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair;
+ WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair;
+ WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair;
+ WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair;
+ WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair;
+ WebPUpsamplers[MODE_rgbA] = UpsampleRgbaLinePair;
+ WebPUpsamplers[MODE_bgrA] = UpsampleBgraLinePair;
+ WebPUpsamplers[MODE_Argb] = UpsampleArgbLinePair;
+ WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair;
+ WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair;
+ WebPUpsamplers[MODE_rgbA_4444] = UpsampleRgba4444LinePair;
+}
+
+#endif // FANCY_UPSAMPLING
+
+#endif // WEBP_USE_MSA
+
+#if !(defined(FANCY_UPSAMPLING) && defined(WEBP_USE_MSA))
+WEBP_DSP_INIT_STUB(WebPInitUpsamplersMSA)
+#endif
diff --git a/src/3rdparty/libwebp/src/dsp/upsampling_neon.c b/src/3rdparty/libwebp/src/dsp/upsampling_neon.c
index 2b0c99b..d371a83 100644
--- a/src/3rdparty/libwebp/src/dsp/upsampling_neon.c
+++ b/src/3rdparty/libwebp/src/dsp/upsampling_neon.c
@@ -28,47 +28,34 @@
// U/V upsampling
// Loads 9 pixels each from rows r1 and r2 and generates 16 pixels.
-#define UPSAMPLE_16PIXELS(r1, r2, out) { \
- uint8x8_t a = vld1_u8(r1); \
- uint8x8_t b = vld1_u8(r1 + 1); \
- uint8x8_t c = vld1_u8(r2); \
- uint8x8_t d = vld1_u8(r2 + 1); \
- \
- uint16x8_t al = vshll_n_u8(a, 1); \
- uint16x8_t bl = vshll_n_u8(b, 1); \
- uint16x8_t cl = vshll_n_u8(c, 1); \
- uint16x8_t dl = vshll_n_u8(d, 1); \
- \
- uint8x8_t diag1, diag2; \
- uint16x8_t sl; \
- \
+#define UPSAMPLE_16PIXELS(r1, r2, out) do { \
+ const uint8x8_t a = vld1_u8(r1 + 0); \
+ const uint8x8_t b = vld1_u8(r1 + 1); \
+ const uint8x8_t c = vld1_u8(r2 + 0); \
+ const uint8x8_t d = vld1_u8(r2 + 1); \
/* a + b + c + d */ \
- sl = vaddl_u8(a, b); \
- sl = vaddw_u8(sl, c); \
- sl = vaddw_u8(sl, d); \
- \
- al = vaddq_u16(sl, al); /* 3a + b + c + d */ \
- bl = vaddq_u16(sl, bl); /* a + 3b + c + d */ \
- \
- al = vaddq_u16(al, dl); /* 3a + b + c + 3d */ \
- bl = vaddq_u16(bl, cl); /* a + 3b + 3c + d */ \
+ const uint16x8_t ad = vaddl_u8(a, d); \
+ const uint16x8_t bc = vaddl_u8(b, c); \
+ const uint16x8_t abcd = vaddq_u16(ad, bc); \
+ /* 3a + b + c + 3d */ \
+ const uint16x8_t al = vaddq_u16(abcd, vshlq_n_u16(ad, 1)); \
+ /* a + 3b + 3c + d */ \
+ const uint16x8_t bl = vaddq_u16(abcd, vshlq_n_u16(bc, 1)); \
\
- diag2 = vshrn_n_u16(al, 3); \
- diag1 = vshrn_n_u16(bl, 3); \
+ const uint8x8_t diag2 = vshrn_n_u16(al, 3); \
+ const uint8x8_t diag1 = vshrn_n_u16(bl, 3); \
\
- a = vrhadd_u8(a, diag1); \
- b = vrhadd_u8(b, diag2); \
- c = vrhadd_u8(c, diag2); \
- d = vrhadd_u8(d, diag1); \
+ const uint8x8_t A = vrhadd_u8(a, diag1); \
+ const uint8x8_t B = vrhadd_u8(b, diag2); \
+ const uint8x8_t C = vrhadd_u8(c, diag2); \
+ const uint8x8_t D = vrhadd_u8(d, diag1); \
\
- { \
- uint8x8x2_t a_b, c_d; \
- INIT_VECTOR2(a_b, a, b); \
- INIT_VECTOR2(c_d, c, d); \
- vst2_u8(out, a_b); \
- vst2_u8(out + 32, c_d); \
- } \
-}
+ uint8x8x2_t A_B, C_D; \
+ INIT_VECTOR2(A_B, A, B); \
+ INIT_VECTOR2(C_D, C, D); \
+ vst2_u8(out + 0, A_B); \
+ vst2_u8(out + 32, C_D); \
+} while (0)
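+// Net effect: diag1 == (a + 3b + 3c + d) >> 3 and the final vrhadd() halves
+// with rounding, so each output is the usual fancy-upsampling kernel,
+// approximately (9a + 3b + 3c + d + 8) >> 4 (up to truncation in the
+// intermediate shift).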
// Turn the macro into a function for reducing code-size when non-critical
static void Upsample16Pixels(const uint8_t *r1, const uint8_t *r2,
@@ -93,7 +80,6 @@ static void Upsample16Pixels(const uint8_t *r1, const uint8_t *r2,
static const int16_t kCoeffs1[4] = { 19077, 26149, 6419, 13320 };
#define v255 vdup_n_u8(255)
-#define v_0x0f vdup_n_u8(15)
#define STORE_Rgb(out, r, g, b) do { \
uint8x8x3_t r_g_b; \
@@ -132,21 +118,16 @@ static const int16_t kCoeffs1[4] = { 19077, 26149, 6419, 13320 };
#endif
#define STORE_Rgba4444(out, r, g, b) do { \
- const uint8x8_t r1 = vshl_n_u8(vshr_n_u8(r, 4), 4); /* 4bits */ \
- const uint8x8_t g1 = vshr_n_u8(g, 4); \
- const uint8x8_t ba = vorr_u8(b, v_0x0f); \
- const uint8x8_t rg = vorr_u8(r1, g1); \
+ const uint8x8_t rg = vsri_n_u8(r, g, 4); /* shift g, insert r */ \
+ const uint8x8_t ba = vsri_n_u8(b, v255, 4); /* shift a, insert b */ \
const uint8x8x2_t rgba4444 = ZIP_U8(rg, ba); \
vst1q_u8(out, vcombine_u8(rgba4444.val[0], rgba4444.val[1])); \
} while (0)
#define STORE_Rgb565(out, r, g, b) do { \
- const uint8x8_t r1 = vshl_n_u8(vshr_n_u8(r, 3), 3); /* 5bits */ \
- const uint8x8_t g1 = vshr_n_u8(g, 5); /* upper 3bits */\
- const uint8x8_t g2 = vshl_n_u8(vshr_n_u8(g, 2), 5); /* lower 3bits */\
- const uint8x8_t b1 = vshr_n_u8(b, 3); /* 5bits */ \
- const uint8x8_t rg = vorr_u8(r1, g1); \
- const uint8x8_t gb = vorr_u8(g2, b1); \
+ const uint8x8_t rg = vsri_n_u8(r, g, 5); /* shift g and insert r */ \
+ const uint8x8_t g1 = vshl_n_u8(g, 3); /* pre-shift g: 3bits */ \
+ const uint8x8_t gb = vsri_n_u8(g1, b, 3); /* shift b and insert g */ \
const uint8x8x2_t rgb565 = ZIP_U8(rg, gb); \
vst1q_u8(out, vcombine_u8(rgb565.val[0], rgb565.val[1])); \
} while (0)
diff --git a/src/3rdparty/libwebp/src/dsp/yuv.c b/src/3rdparty/libwebp/src/dsp/yuv.c
index f50a253..dd7d9de 100644
--- a/src/3rdparty/libwebp/src/dsp/yuv.c
+++ b/src/3rdparty/libwebp/src/dsp/yuv.c
@@ -13,6 +13,8 @@
#include "./yuv.h"
+#include <stdlib.h>
+
#if defined(WEBP_YUV_USE_TABLE)
static int done = 0;
@@ -244,6 +246,48 @@ void WebPConvertRGBA32ToUV_C(const uint16_t* rgb,
//-----------------------------------------------------------------------------
+#define MAX_Y ((1 << 10) - 1) // 10b precision over 16b-arithmetic
+static uint16_t clip_y(int v) {
+ return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (uint16_t)v;
+}
+
+static uint64_t SharpYUVUpdateY_C(const uint16_t* ref, const uint16_t* src,
+ uint16_t* dst, int len) {
+ uint64_t diff = 0;
+ int i;
+ for (i = 0; i < len; ++i) {
+ const int diff_y = ref[i] - src[i];
+ const int new_y = (int)dst[i] + diff_y;
+ dst[i] = clip_y(new_y);
+ diff += (uint64_t)abs(diff_y);
+ }
+ return diff;
+}
+
+static void SharpYUVUpdateRGB_C(const int16_t* ref, const int16_t* src,
+ int16_t* dst, int len) {
+ int i;
+ for (i = 0; i < len; ++i) {
+ const int diff_uv = ref[i] - src[i];
+ dst[i] += diff_uv;
+ }
+}
+
+static void SharpYUVFilterRow_C(const int16_t* A, const int16_t* B, int len,
+ const uint16_t* best_y, uint16_t* out) {
+ int i;
+ for (i = 0; i < len; ++i, ++A, ++B) {
+ const int v0 = (A[0] * 9 + A[1] * 3 + B[0] * 3 + B[1] + 8) >> 4;
+ const int v1 = (A[1] * 9 + A[0] * 3 + B[1] * 3 + B[0] + 8) >> 4;
+ out[2 * i + 0] = clip_y(best_y[2 * i + 0] + v0);
+ out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1);
+ }
+}
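+// The (9, 3, 3, 1) / 16 weights above are the bilinear taps at the quarter
+// positions between rows A and B. As a quick check, a flat patch with
+// A[0] == A[1] == B[0] == B[1] == 100 gives
+// v0 = (900 + 300 + 300 + 100 + 8) >> 4 == 100: the filter preserves DC.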
+
+#undef MAX_Y
+
+//-----------------------------------------------------------------------------
+
void (*WebPConvertRGB24ToY)(const uint8_t* rgb, uint8_t* y, int width);
void (*WebPConvertBGR24ToY)(const uint8_t* bgr, uint8_t* y, int width);
void (*WebPConvertRGBA32ToUV)(const uint16_t* rgb,
@@ -253,10 +297,18 @@ void (*WebPConvertARGBToY)(const uint32_t* argb, uint8_t* y, int width);
void (*WebPConvertARGBToUV)(const uint32_t* argb, uint8_t* u, uint8_t* v,
int src_width, int do_store);
+uint64_t (*WebPSharpYUVUpdateY)(const uint16_t* ref, const uint16_t* src,
+ uint16_t* dst, int len);
+void (*WebPSharpYUVUpdateRGB)(const int16_t* ref, const int16_t* src,
+ int16_t* dst, int len);
+void (*WebPSharpYUVFilterRow)(const int16_t* A, const int16_t* B, int len,
+ const uint16_t* best_y, uint16_t* out);
+
static volatile VP8CPUInfo rgba_to_yuv_last_cpuinfo_used =
(VP8CPUInfo)&rgba_to_yuv_last_cpuinfo_used;
extern void WebPInitConvertARGBToYUVSSE2(void);
+extern void WebPInitSharpYUVSSE2(void);
WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUV(void) {
if (rgba_to_yuv_last_cpuinfo_used == VP8GetCPUInfo) return;
@@ -269,10 +321,15 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUV(void) {
WebPConvertRGBA32ToUV = WebPConvertRGBA32ToUV_C;
+ WebPSharpYUVUpdateY = SharpYUVUpdateY_C;
+ WebPSharpYUVUpdateRGB = SharpYUVUpdateRGB_C;
+ WebPSharpYUVFilterRow = SharpYUVFilterRow_C;
+
if (VP8GetCPUInfo != NULL) {
#if defined(WEBP_USE_SSE2)
if (VP8GetCPUInfo(kSSE2)) {
WebPInitConvertARGBToYUVSSE2();
+ WebPInitSharpYUVSSE2();
}
#endif // WEBP_USE_SSE2
}
diff --git a/src/3rdparty/libwebp/src/dsp/yuv.h b/src/3rdparty/libwebp/src/dsp/yuv.h
index 01c40fc..1d33b58 100644
--- a/src/3rdparty/libwebp/src/dsp/yuv.h
+++ b/src/3rdparty/libwebp/src/dsp/yuv.h
@@ -36,7 +36,7 @@
#define WEBP_DSP_YUV_H_
#include "./dsp.h"
-#include "../dec/decode_vp8.h"
+#include "../dec/vp8_dec.h"
#if defined(WEBP_EXPERIMENTAL_FEATURES)
// Do NOT activate this feature for real compression. This is only experimental!
diff --git a/src/3rdparty/libwebp/src/dsp/yuv_sse2.c b/src/3rdparty/libwebp/src/dsp/yuv_sse2.c
index e19bddf..e33c2bb 100644
--- a/src/3rdparty/libwebp/src/dsp/yuv_sse2.c
+++ b/src/3rdparty/libwebp/src/dsp/yuv_sse2.c
@@ -15,6 +15,8 @@
#if defined(WEBP_USE_SSE2)
+#include "./common_sse2.h"
+#include <stdlib.h>
#include <emmintrin.h>
//-----------------------------------------------------------------------------
@@ -155,30 +157,13 @@ static WEBP_INLINE void PackAndStore565(const __m128i* const R,
_mm_storeu_si128((__m128i*)dst, rgb565);
}
-// Function used several times in PlanarTo24b.
-// It samples the in buffer as follows: one every two unsigned char is stored
-// at the beginning of the buffer, while the other half is stored at the end.
-static WEBP_INLINE void PlanarTo24bHelper(const __m128i* const in /*in[6]*/,
- __m128i* const out /*out[6]*/) {
- const __m128i v_mask = _mm_set1_epi16(0x00ff);
-
- // Take one every two upper 8b values.
- out[0] = _mm_packus_epi16(_mm_and_si128(in[0], v_mask),
- _mm_and_si128(in[1], v_mask));
- out[1] = _mm_packus_epi16(_mm_and_si128(in[2], v_mask),
- _mm_and_si128(in[3], v_mask));
- out[2] = _mm_packus_epi16(_mm_and_si128(in[4], v_mask),
- _mm_and_si128(in[5], v_mask));
- // Take one every two lower 8b values.
- out[3] = _mm_packus_epi16(_mm_srli_epi16(in[0], 8), _mm_srli_epi16(in[1], 8));
- out[4] = _mm_packus_epi16(_mm_srli_epi16(in[2], 8), _mm_srli_epi16(in[3], 8));
- out[5] = _mm_packus_epi16(_mm_srli_epi16(in[4], 8), _mm_srli_epi16(in[5], 8));
-}
-
// Pack the planar buffers
// rrrr... rrrr... gggg... gggg... bbbb... bbbb....
// triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ...
-static WEBP_INLINE void PlanarTo24b(__m128i* const in /*in[6]*/, uint8_t* rgb) {
+static WEBP_INLINE void PlanarTo24b(__m128i* const in0, __m128i* const in1,
+ __m128i* const in2, __m128i* const in3,
+ __m128i* const in4, __m128i* const in5,
+ uint8_t* const rgb) {
// The input is 6 registers of sixteen 8b but for the sake of explanation,
// let's take 6 registers of four 8b values.
// To pack, we will keep taking one every two 8b integer and move it
@@ -191,22 +176,15 @@ static WEBP_INLINE void PlanarTo24b(__m128i* const in /*in[6]*/, uint8_t* rgb) {
// Repeat the same permutations twice more:
// r0r4g0g4 | b0b4r1r5 | g1g5b1b5 | r2r6g2g6 | b2b6r3r7 | g3g7b3b7
// r0g0b0r1 | g1b1r2g2 | b2r3g3b3 | r4g4b4r5 | g5b5r6g6 | b6r7g7b7
- __m128i tmp[6];
- PlanarTo24bHelper(in, tmp);
- PlanarTo24bHelper(tmp, in);
- PlanarTo24bHelper(in, tmp);
- // We need to do it two more times than the example as we have sixteen bytes.
- PlanarTo24bHelper(tmp, in);
- PlanarTo24bHelper(in, tmp);
-
- _mm_storeu_si128((__m128i*)(rgb + 0), tmp[0]);
- _mm_storeu_si128((__m128i*)(rgb + 16), tmp[1]);
- _mm_storeu_si128((__m128i*)(rgb + 32), tmp[2]);
- _mm_storeu_si128((__m128i*)(rgb + 48), tmp[3]);
- _mm_storeu_si128((__m128i*)(rgb + 64), tmp[4]);
- _mm_storeu_si128((__m128i*)(rgb + 80), tmp[5]);
-}
-#undef MK_UINT32
+ VP8PlanarTo24b(in0, in1, in2, in3, in4, in5);
+
+ _mm_storeu_si128((__m128i*)(rgb + 0), *in0);
+ _mm_storeu_si128((__m128i*)(rgb + 16), *in1);
+ _mm_storeu_si128((__m128i*)(rgb + 32), *in2);
+ _mm_storeu_si128((__m128i*)(rgb + 48), *in3);
+ _mm_storeu_si128((__m128i*)(rgb + 64), *in4);
+ _mm_storeu_si128((__m128i*)(rgb + 80), *in5);
+}
void VP8YuvToRgba32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst) {
@@ -265,29 +243,29 @@ void VP8YuvToRgb56532(const uint8_t* y, const uint8_t* u, const uint8_t* v,
void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst) {
__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
- __m128i rgb[6];
+ __m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;
- YUV444ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0);
- YUV444ToRGB(y + 8, u + 8, v + 8, &R1, &G1, &B1);
+ YUV444ToRGB(y +  0, u +  0, v +  0, &R0, &G0, &B0);
+ YUV444ToRGB(y +  8, u +  8, v +  8, &R1, &G1, &B1);
YUV444ToRGB(y + 16, u + 16, v + 16, &R2, &G2, &B2);
YUV444ToRGB(y + 24, u + 24, v + 24, &R3, &G3, &B3);
// Cast to 8b and store as RRRRGGGGBBBB.
- rgb[0] = _mm_packus_epi16(R0, R1);
- rgb[1] = _mm_packus_epi16(R2, R3);
- rgb[2] = _mm_packus_epi16(G0, G1);
- rgb[3] = _mm_packus_epi16(G2, G3);
- rgb[4] = _mm_packus_epi16(B0, B1);
- rgb[5] = _mm_packus_epi16(B2, B3);
+ rgb0 = _mm_packus_epi16(R0, R1);
+ rgb1 = _mm_packus_epi16(R2, R3);
+ rgb2 = _mm_packus_epi16(G0, G1);
+ rgb3 = _mm_packus_epi16(G2, G3);
+ rgb4 = _mm_packus_epi16(B0, B1);
+ rgb5 = _mm_packus_epi16(B2, B3);
// Pack as RGBRGBRGBRGB.
- PlanarTo24b(rgb, dst);
+ PlanarTo24b(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
}
void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
uint8_t* dst) {
__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
- __m128i bgr[6];
+ __m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;
YUV444ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0);
YUV444ToRGB(y + 8, u + 8, v + 8, &R1, &G1, &B1);
@@ -295,15 +273,15 @@ void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
YUV444ToRGB(y + 24, u + 24, v + 24, &R3, &G3, &B3);
// Cast to 8b and store as BBBBGGGGRRRR.
- bgr[0] = _mm_packus_epi16(B0, B1);
- bgr[1] = _mm_packus_epi16(B2, B3);
- bgr[2] = _mm_packus_epi16(G0, G1);
- bgr[3] = _mm_packus_epi16(G2, G3);
- bgr[4] = _mm_packus_epi16(R0, R1);
- bgr[5] = _mm_packus_epi16(R2, R3);
+ bgr0 = _mm_packus_epi16(B0, B1);
+ bgr1 = _mm_packus_epi16(B2, B3);
+ bgr2 = _mm_packus_epi16(G0, G1);
+ bgr3 = _mm_packus_epi16(G2, G3);
+ bgr4 = _mm_packus_epi16(R0, R1);
+ bgr5 = _mm_packus_epi16(R2, R3);
// Pack as BGRBGRBGRBGR.
- PlanarTo24b(bgr, dst);
+ PlanarTo24b(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst);
}
//-----------------------------------------------------------------------------
@@ -377,7 +355,7 @@ static void YuvToRgbRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
int n;
for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
- __m128i rgb[6];
+ __m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;
YUV420ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0);
YUV420ToRGB(y + 8, u + 4, v + 4, &R1, &G1, &B1);
@@ -385,15 +363,15 @@ static void YuvToRgbRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
YUV420ToRGB(y + 24, u + 12, v + 12, &R3, &G3, &B3);
// Cast to 8b and store as RRRRGGGGBBBB.
- rgb[0] = _mm_packus_epi16(R0, R1);
- rgb[1] = _mm_packus_epi16(R2, R3);
- rgb[2] = _mm_packus_epi16(G0, G1);
- rgb[3] = _mm_packus_epi16(G2, G3);
- rgb[4] = _mm_packus_epi16(B0, B1);
- rgb[5] = _mm_packus_epi16(B2, B3);
+ rgb0 = _mm_packus_epi16(R0, R1);
+ rgb1 = _mm_packus_epi16(R2, R3);
+ rgb2 = _mm_packus_epi16(G0, G1);
+ rgb3 = _mm_packus_epi16(G2, G3);
+ rgb4 = _mm_packus_epi16(B0, B1);
+ rgb5 = _mm_packus_epi16(B2, B3);
// Pack as RGBRGBRGBRGB.
- PlanarTo24b(rgb, dst);
+ PlanarTo24b(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
y += 32;
u += 16;
@@ -413,7 +391,7 @@ static void YuvToBgrRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
int n;
for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
__m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
- __m128i bgr[6];
+ __m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;
YUV420ToRGB(y + 0, u + 0, v + 0, &R0, &G0, &B0);
YUV420ToRGB(y + 8, u + 4, v + 4, &R1, &G1, &B1);
@@ -421,15 +399,15 @@ static void YuvToBgrRow(const uint8_t* y, const uint8_t* u, const uint8_t* v,
YUV420ToRGB(y + 24, u + 12, v + 12, &R3, &G3, &B3);
// Cast to 8b and store as BBBBGGGGRRRR.
- bgr[0] = _mm_packus_epi16(B0, B1);
- bgr[1] = _mm_packus_epi16(B2, B3);
- bgr[2] = _mm_packus_epi16(G0, G1);
- bgr[3] = _mm_packus_epi16(G2, G3);
- bgr[4] = _mm_packus_epi16(R0, R1);
- bgr[5] = _mm_packus_epi16(R2, R3);
+ bgr0 = _mm_packus_epi16(B0, B1);
+ bgr1 = _mm_packus_epi16(B2, B3);
+ bgr2 = _mm_packus_epi16(G0, G1);
+ bgr3 = _mm_packus_epi16(G2, G3);
+ bgr4 = _mm_packus_epi16(R0, R1);
+ bgr5 = _mm_packus_epi16(R2, R3);
// Pack as BGRBGRBGRBGR.
- PlanarTo24b(bgr, dst);
+ PlanarTo24b(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst);
y += 32;
u += 16;
@@ -499,25 +477,19 @@ static WEBP_INLINE void RGB24PackedToPlanar(const uint8_t* const rgb,
// Convert 8 packed ARGB to r[], g[], b[]
static WEBP_INLINE void RGB32PackedToPlanar(const uint32_t* const argb,
- __m128i* const r,
- __m128i* const g,
- __m128i* const b) {
+ __m128i* const rgb /*in[6]*/) {
const __m128i zero = _mm_setzero_si128();
- const __m128i in0 = LOAD_16(argb + 0); // argb3 | argb2 | argb1 | argb0
- const __m128i in1 = LOAD_16(argb + 4); // argb7 | argb6 | argb5 | argb4
- // column-wise transpose
- const __m128i A0 = _mm_unpacklo_epi8(in0, in1);
- const __m128i A1 = _mm_unpackhi_epi8(in0, in1);
- const __m128i B0 = _mm_unpacklo_epi8(A0, A1);
- const __m128i B1 = _mm_unpackhi_epi8(A0, A1);
- // C0 = g7 g6 ... g1 g0 | b7 b6 ... b1 b0
- // C1 = a7 a6 ... a1 a0 | r7 r6 ... r1 r0
- const __m128i C0 = _mm_unpacklo_epi8(B0, B1);
- const __m128i C1 = _mm_unpackhi_epi8(B0, B1);
- // store 16b
- *r = _mm_unpacklo_epi8(C1, zero);
- *g = _mm_unpackhi_epi8(C0, zero);
- *b = _mm_unpacklo_epi8(C0, zero);
+ __m128i a0 = LOAD_16(argb + 0);
+ __m128i a1 = LOAD_16(argb + 4);
+ __m128i a2 = LOAD_16(argb + 8);
+ __m128i a3 = LOAD_16(argb + 12);
+ VP8L32bToPlanar(&a0, &a1, &a2, &a3);
+ rgb[0] = _mm_unpacklo_epi8(a1, zero);
+ rgb[1] = _mm_unpackhi_epi8(a1, zero);
+ rgb[2] = _mm_unpacklo_epi8(a2, zero);
+ rgb[3] = _mm_unpackhi_epi8(a2, zero);
+ rgb[4] = _mm_unpacklo_epi8(a3, zero);
+ rgb[5] = _mm_unpackhi_epi8(a3, zero);
}
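+// (After VP8L32bToPlanar() the planes land as a1 = R, a2 = G, a3 = B, so
+// rgb[0..5] holds the sixteen pixels as R-lo, R-hi, G-lo, G-hi, B-lo, B-hi
+// 16-bit halves, matching the (r, g, b) argument order used by the callers
+// below.)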
// This macro computes (RG * MULT_RG + GB * MULT_GB + ROUNDER) >> DESCALE_FIX
@@ -649,11 +621,10 @@ static void ConvertARGBToY(const uint32_t* argb, uint8_t* y, int width) {
const int max_width = width & ~15;
int i;
for (i = 0; i < max_width; i += 16) {
- __m128i r, g, b, Y0, Y1;
- RGB32PackedToPlanar(&argb[i + 0], &r, &g, &b);
- ConvertRGBToY(&r, &g, &b, &Y0);
- RGB32PackedToPlanar(&argb[i + 8], &r, &g, &b);
- ConvertRGBToY(&r, &g, &b, &Y1);
+ __m128i Y0, Y1, rgb[6];
+ RGB32PackedToPlanar(&argb[i], rgb);
+ ConvertRGBToY(&rgb[0], &rgb[2], &rgb[4], &Y0);
+ ConvertRGBToY(&rgb[1], &rgb[3], &rgb[5], &Y1);
STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
}
for (; i < width; ++i) { // left-over
@@ -678,20 +649,18 @@ static void ConvertARGBToUV(const uint32_t* argb, uint8_t* u, uint8_t* v,
const int max_width = src_width & ~31;
int i;
for (i = 0; i < max_width; i += 32, u += 16, v += 16) {
- __m128i r0, g0, b0, r1, g1, b1, U0, V0, U1, V1;
- RGB32PackedToPlanar(&argb[i + 0], &r0, &g0, &b0);
- RGB32PackedToPlanar(&argb[i + 8], &r1, &g1, &b1);
- HorizontalAddPack(&r0, &r1, &r0);
- HorizontalAddPack(&g0, &g1, &g0);
- HorizontalAddPack(&b0, &b1, &b0);
- ConvertRGBToUV(&r0, &g0, &b0, &U0, &V0);
-
- RGB32PackedToPlanar(&argb[i + 16], &r0, &g0, &b0);
- RGB32PackedToPlanar(&argb[i + 24], &r1, &g1, &b1);
- HorizontalAddPack(&r0, &r1, &r0);
- HorizontalAddPack(&g0, &g1, &g0);
- HorizontalAddPack(&b0, &b1, &b0);
- ConvertRGBToUV(&r0, &g0, &b0, &U1, &V1);
+ __m128i rgb[6], U0, V0, U1, V1;
+ RGB32PackedToPlanar(&argb[i], rgb);
+ HorizontalAddPack(&rgb[0], &rgb[1], &rgb[0]);
+ HorizontalAddPack(&rgb[2], &rgb[3], &rgb[2]);
+ HorizontalAddPack(&rgb[4], &rgb[5], &rgb[4]);
+ ConvertRGBToUV(&rgb[0], &rgb[2], &rgb[4], &U0, &V0);
+
+ RGB32PackedToPlanar(&argb[i + 16], rgb);
+ HorizontalAddPack(&rgb[0], &rgb[1], &rgb[0]);
+ HorizontalAddPack(&rgb[2], &rgb[3], &rgb[2]);
+ HorizontalAddPack(&rgb[4], &rgb[5], &rgb[4]);
+ ConvertRGBToUV(&rgb[0], &rgb[2], &rgb[4], &U1, &V1);
U0 = _mm_packus_epi16(U0, U1);
V0 = _mm_packus_epi16(V0, V1);
@@ -767,9 +736,128 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVSSE2(void) {
WebPConvertRGBA32ToUV = ConvertRGBA32ToUV;
}
+//------------------------------------------------------------------------------
+
+#define MAX_Y ((1 << 10) - 1) // 10b precision over 16b-arithmetic
+static uint16_t clip_y(int v) {
+ return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (uint16_t)v;
+}
+
+static uint64_t SharpYUVUpdateY_SSE2(const uint16_t* ref, const uint16_t* src,
+ uint16_t* dst, int len) {
+ uint64_t diff = 0;
+ uint32_t tmp[4];
+ int i;
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i max = _mm_set1_epi16(MAX_Y);
+ const __m128i one = _mm_set1_epi16(1);
+ __m128i sum = zero;
+
+ for (i = 0; i + 8 <= len; i += 8) {
+ const __m128i A = _mm_loadu_si128((const __m128i*)(ref + i));
+ const __m128i B = _mm_loadu_si128((const __m128i*)(src + i));
+ const __m128i C = _mm_loadu_si128((const __m128i*)(dst + i));
+ const __m128i D = _mm_sub_epi16(A, B); // diff_y
+ const __m128i E = _mm_cmpgt_epi16(zero, D); // sign (-1 or 0)
+ const __m128i F = _mm_add_epi16(C, D); // new_y
+ const __m128i G = _mm_or_si128(E, one); // -1 or 1
+ const __m128i H = _mm_max_epi16(_mm_min_epi16(F, max), zero);
+ const __m128i I = _mm_madd_epi16(D, G); // sum(abs(...))
+ _mm_storeu_si128((__m128i*)(dst + i), H);
+ sum = _mm_add_epi32(sum, I);
+ }
+ _mm_storeu_si128((__m128i*)tmp, sum);
+ diff = tmp[3] + tmp[2] + tmp[1] + tmp[0];
+ for (; i < len; ++i) {
+ const int diff_y = ref[i] - src[i];
+ const int new_y = (int)dst[i] + diff_y;
+ dst[i] = clip_y(new_y);
+ diff += (uint64_t)abs(diff_y);
+ }
+ return diff;
+}
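+// The G = sign(D) | 1 trick above lets _mm_madd_epi16(D, G) act as a packed
+// absolute value with pairwise 32-bit accumulation (SSE2 has no 16-bit abs):
+// e.g. D = {-3, 5, ...} gives G = {-1, 1, ...} and (-3)*(-1) + 5*1 = 8.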
+
+static void SharpYUVUpdateRGB_SSE2(const int16_t* ref, const int16_t* src,
+ int16_t* dst, int len) {
+ int i = 0;
+ for (i = 0; i + 8 <= len; i += 8) {
+ const __m128i A = _mm_loadu_si128((const __m128i*)(ref + i));
+ const __m128i B = _mm_loadu_si128((const __m128i*)(src + i));
+ const __m128i C = _mm_loadu_si128((const __m128i*)(dst + i));
+ const __m128i D = _mm_sub_epi16(A, B); // diff_uv
+ const __m128i E = _mm_add_epi16(C, D); // new_uv
+ _mm_storeu_si128((__m128i*)(dst + i), E);
+ }
+ for (; i < len; ++i) {
+ const int diff_uv = ref[i] - src[i];
+ dst[i] += diff_uv;
+ }
+}
+
+static void SharpYUVFilterRow_SSE2(const int16_t* A, const int16_t* B, int len,
+ const uint16_t* best_y, uint16_t* out) {
+ int i;
+ const __m128i kCst8 = _mm_set1_epi16(8);
+ const __m128i max = _mm_set1_epi16(MAX_Y);
+ const __m128i zero = _mm_setzero_si128();
+ for (i = 0; i + 8 <= len; i += 8) {
+ const __m128i a0 = _mm_loadu_si128((const __m128i*)(A + i + 0));
+ const __m128i a1 = _mm_loadu_si128((const __m128i*)(A + i + 1));
+ const __m128i b0 = _mm_loadu_si128((const __m128i*)(B + i + 0));
+ const __m128i b1 = _mm_loadu_si128((const __m128i*)(B + i + 1));
+ const __m128i a0b1 = _mm_add_epi16(a0, b1);
+ const __m128i a1b0 = _mm_add_epi16(a1, b0);
+ const __m128i a0a1b0b1 = _mm_add_epi16(a0b1, a1b0); // A0+A1+B0+B1
+ const __m128i a0a1b0b1_8 = _mm_add_epi16(a0a1b0b1, kCst8);
+ const __m128i a0b1_2 = _mm_add_epi16(a0b1, a0b1); // 2*(A0+B1)
+ const __m128i a1b0_2 = _mm_add_epi16(a1b0, a1b0); // 2*(A1+B0)
+ const __m128i c0 = _mm_srai_epi16(_mm_add_epi16(a0b1_2, a0a1b0b1_8), 3);
+ const __m128i c1 = _mm_srai_epi16(_mm_add_epi16(a1b0_2, a0a1b0b1_8), 3);
+ const __m128i d0 = _mm_add_epi16(c1, a0);
+ const __m128i d1 = _mm_add_epi16(c0, a1);
+ const __m128i e0 = _mm_srai_epi16(d0, 1);
+ const __m128i e1 = _mm_srai_epi16(d1, 1);
+ const __m128i f0 = _mm_unpacklo_epi16(e0, e1);
+ const __m128i f1 = _mm_unpackhi_epi16(e0, e1);
+ const __m128i g0 = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 0));
+ const __m128i g1 = _mm_loadu_si128((const __m128i*)(best_y + 2 * i + 8));
+ const __m128i h0 = _mm_add_epi16(g0, f0);
+ const __m128i h1 = _mm_add_epi16(g1, f1);
+ const __m128i i0 = _mm_max_epi16(_mm_min_epi16(h0, max), zero);
+ const __m128i i1 = _mm_max_epi16(_mm_min_epi16(h1, max), zero);
+ _mm_storeu_si128((__m128i*)(out + 2 * i + 0), i0);
+ _mm_storeu_si128((__m128i*)(out + 2 * i + 8), i1);
+ }
+ for (; i < len; ++i) {
+ // (9 * A0 + 3 * A1 + 3 * B0 + B1 + 8) >> 4
+ // = (8 * A0 + 2 * (A1 + B0) + (A0 + A1 + B0 + B1 + 8)) >> 4
+ // We reuse the common sub-expressions.
+ const int a0b1 = A[i + 0] + B[i + 1];
+ const int a1b0 = A[i + 1] + B[i + 0];
+ const int a0a1b0b1 = a0b1 + a1b0 + 8;
+ const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4;
+ const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4;
+ out[2 * i + 0] = clip_y(best_y[2 * i + 0] + v0);
+ out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1);
+ }
+}
+
+#undef MAX_Y
+
+//------------------------------------------------------------------------------
+
+extern void WebPInitSharpYUVSSE2(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitSharpYUVSSE2(void) {
+ WebPSharpYUVUpdateY = SharpYUVUpdateY_SSE2;
+ WebPSharpYUVUpdateRGB = SharpYUVUpdateRGB_SSE2;
+ WebPSharpYUVFilterRow = SharpYUVFilterRow_SSE2;
+}
+
#else // !WEBP_USE_SSE2
WEBP_DSP_INIT_STUB(WebPInitSamplersSSE2)
WEBP_DSP_INIT_STUB(WebPInitConvertARGBToYUVSSE2)
+WEBP_DSP_INIT_STUB(WebPInitSharpYUVSSE2)
#endif // WEBP_USE_SSE2
diff --git a/src/3rdparty/libwebp/src/enc/alpha.c b/src/3rdparty/libwebp/src/enc/alpha_enc.c
index 03e3ad0..5a2c931 100644
--- a/src/3rdparty/libwebp/src/enc/alpha.c
+++ b/src/3rdparty/libwebp/src/enc/alpha_enc.c
@@ -14,10 +14,10 @@
#include <assert.h>
#include <stdlib.h>
-#include "./vp8enci.h"
+#include "./vp8i_enc.h"
#include "../dsp/dsp.h"
-#include "../utils/filters.h"
-#include "../utils/quant_levels.h"
+#include "../utils/filters_utils.h"
+#include "../utils/quant_levels_utils.h"
#include "../utils/utils.h"
#include "../webp/format_constants.h"
@@ -44,7 +44,7 @@
// invalid quality or method, or
// memory allocation for the compressed data fails.
-#include "../enc/vp8li.h"
+#include "../enc/vp8li_enc.h"
static int EncodeLossless(const uint8_t* const data, int width, int height,
int effort_level, // in [0..6] range
diff --git a/src/3rdparty/libwebp/src/enc/analysis.c b/src/3rdparty/libwebp/src/enc/analysis_enc.c
index b55128f..dce159b 100644
--- a/src/3rdparty/libwebp/src/enc/analysis.c
+++ b/src/3rdparty/libwebp/src/enc/analysis_enc.c
@@ -15,8 +15,8 @@
#include <string.h>
#include <assert.h>
-#include "./vp8enci.h"
-#include "./cost.h"
+#include "./vp8i_enc.h"
+#include "./cost_enc.h"
#include "../utils/utils.h"
#define MAX_ITERS_K_MEANS 6
@@ -262,6 +262,29 @@ static int MBAnalyzeBestIntra16Mode(VP8EncIterator* const it) {
return best_alpha;
}
+static int FastMBAnalyze(VP8EncIterator* const it) {
+ // Empirical cut-off value, should be around 16 (~=block size). We use the
+ // [8-17] range and favor intra4 at high quality, intra16 for low quality.
+ const int q = (int)it->enc_->config_->quality;
+ const uint32_t kThreshold = 8 + (17 - 8) * q / 100;
+ int k;
+ uint32_t dc[16], m, m2;
+ for (k = 0; k < 16; k += 4) {
+ VP8Mean16x4(it->yuv_in_ + Y_OFF_ENC + k * BPS, &dc[k]);
+ }
+ for (m = 0, m2 = 0, k = 0; k < 16; ++k) {
+ m += dc[k];
+ m2 += dc[k] * dc[k];
+ }
+ if (kThreshold * m2 < m * m) {
+ VP8SetIntra16Mode(it, 0); // DC16
+ } else {
+ const uint8_t modes[16] = { 0 }; // DC4
+ VP8SetIntra4Mode(it, modes);
+ }
+ return 0;
+}
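+// Rationale for the test above: with m = sum(dc) and m2 = sum(dc * dc) over
+// the 16 sub-blocks, Cauchy-Schwarz gives m * m <= 16 * m2, with equality
+// for a perfectly flat macroblock. So kThreshold * m2 < m * m picks DC16
+// only for nearly uniform blocks; at quality 100, kThreshold = 17 > 16 and
+// intra4 is always chosen.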
+
static int MBAnalyzeBestIntra4Mode(VP8EncIterator* const it,
int best_alpha) {
uint8_t modes[16];
@@ -307,6 +330,7 @@ static int MBAnalyzeBestIntra4Mode(VP8EncIterator* const it,
static int MBAnalyzeBestUVMode(VP8EncIterator* const it) {
int best_alpha = DEFAULT_ALPHA;
+ int smallest_alpha = 0;
int best_mode = 0;
const int max_mode = MAX_UV_MODE;
int mode;
@@ -322,6 +346,10 @@ static int MBAnalyzeBestUVMode(VP8EncIterator* const it) {
alpha = GetAlpha(&histo);
if (IS_BETTER_ALPHA(alpha, best_alpha)) {
best_alpha = alpha;
+ }
+ // The best prediction mode tends to be the one with the smallest alpha.
+ if (mode == 0 || alpha < smallest_alpha) {
+ smallest_alpha = alpha;
best_mode = mode;
}
}
@@ -339,13 +367,17 @@ static void MBAnalyze(VP8EncIterator* const it,
VP8SetSkip(it, 0); // not skipped
VP8SetSegment(it, 0); // default segment, spec-wise.
- best_alpha = MBAnalyzeBestIntra16Mode(it);
- if (enc->method_ >= 5) {
- // We go and make a fast decision for intra4/intra16.
- // It's usually not a good and definitive pick, but helps seeding the stats
- // about level bit-cost.
- // TODO(skal): improve criterion.
- best_alpha = MBAnalyzeBestIntra4Mode(it, best_alpha);
+ if (enc->method_ <= 1) {
+ best_alpha = FastMBAnalyze(it);
+ } else {
+ best_alpha = MBAnalyzeBestIntra16Mode(it);
+ if (enc->method_ >= 5) {
+ // We go and make a fast decision for intra4/intra16.
+ // It's usually not a good and definitive pick, but helps seeding the
+ // stats about level bit-cost.
+ // TODO(skal): improve criterion.
+ best_alpha = MBAnalyzeBestIntra4Mode(it, best_alpha);
+ }
}
best_uv_alpha = MBAnalyzeBestUVMode(it);
@@ -448,7 +480,7 @@ int VP8EncAnalyze(VP8Encoder* const enc) {
const int do_segments =
enc->config_->emulate_jpeg_size || // We need the complexity evaluation.
(enc->segment_hdr_.num_segments_ > 1) ||
- (enc->method_ == 0); // for method 0, we need preds_[] to be filled.
+ (enc->method_ <= 1); // for methods 0 and 1, we need preds_[] to be filled.
if (do_segments) {
const int last_row = enc->mb_h_;
// We give a little more than a half work to the main thread.
diff --git a/src/3rdparty/libwebp/src/enc/backward_references.c b/src/3rdparty/libwebp/src/enc/backward_references_enc.c
index 136a24a..7c0559f 100644
--- a/src/3rdparty/libwebp/src/enc/backward_references.c
+++ b/src/3rdparty/libwebp/src/enc/backward_references_enc.c
@@ -13,11 +13,12 @@
#include <assert.h>
#include <math.h>
-#include "./backward_references.h"
-#include "./histogram.h"
+#include "./backward_references_enc.h"
+#include "./histogram_enc.h"
#include "../dsp/lossless.h"
+#include "../dsp/lossless_common.h"
#include "../dsp/dsp.h"
-#include "../utils/color_cache.h"
+#include "../utils/color_cache_utils.h"
#include "../utils/utils.h"
#define VALUES_IN_BYTE 256
@@ -30,8 +31,9 @@
#define WINDOW_SIZE_BITS 20
#define WINDOW_SIZE ((1 << WINDOW_SIZE_BITS) - 120)
-// Bounds for the match length.
-#define MIN_LENGTH 2
+// Minimum number of pixels for which it is cheaper to encode a
+// distance + length instead of each pixel as a literal.
+#define MIN_LENGTH 4
// If you change this, you need MAX_LENGTH_BITS + WINDOW_SIZE_BITS <= 32 as it
// is used in VP8LHashChain.
#define MAX_LENGTH_BITS 12
@@ -211,13 +213,13 @@ void VP8LHashChainClear(VP8LHashChain* const p) {
// -----------------------------------------------------------------------------
-#define HASH_MULTIPLIER_HI (0xc6a4a793U)
-#define HASH_MULTIPLIER_LO (0x5bd1e996U)
+#define HASH_MULTIPLIER_HI (0xc6a4a793ULL)
+#define HASH_MULTIPLIER_LO (0x5bd1e996ULL)
static WEBP_INLINE uint32_t GetPixPairHash64(const uint32_t* const argb) {
uint32_t key;
- key = argb[1] * HASH_MULTIPLIER_HI;
- key += argb[0] * HASH_MULTIPLIER_LO;
+ key = (argb[1] * HASH_MULTIPLIER_HI) & 0xffffffffu;
+ key += (argb[0] * HASH_MULTIPLIER_LO) & 0xffffffffu;
key = key >> (32 - HASH_BITS);
return key;
}
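The move to ULL constants makes both multiplications 64-bit, and the explicit & 0xffffffffu masks restore the previous 32-bit wraparound before the products are mixed, so the computed keys are unchanged across platforms. A self-contained sketch of the same mixing, assuming HASH_BITS is 18 as elsewhere in this file:

    #include <stdint.h>
    #include <stdio.h>

    #define HASH_BITS 18  /* assumed to match backward_references_enc.c */

    static uint32_t PixPairHash(const uint32_t argb[2]) {
      uint32_t key;
      /* 64-bit products, masked back to 32 bits before mixing. */
      key = (uint32_t)((argb[1] * 0xc6a4a793ULL) & 0xffffffffu);
      key += (uint32_t)((argb[0] * 0x5bd1e996ULL) & 0xffffffffu);
      return key >> (32 - HASH_BITS);
    }

    int main(void) {
      const uint32_t pair[2] = { 0xff112233u, 0xff112234u };
      printf("hash = %u\n", PixPairHash(pair));
      return 0;
    }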
@@ -242,19 +244,26 @@ static WEBP_INLINE int MaxFindCopyLength(int len) {
}
int VP8LHashChainFill(VP8LHashChain* const p, int quality,
- const uint32_t* const argb, int xsize, int ysize) {
+ const uint32_t* const argb, int xsize, int ysize,
+ int low_effort) {
const int size = xsize * ysize;
const int iter_max = GetMaxItersForQuality(quality);
- const int iter_min = iter_max - quality / 10;
const uint32_t window_size = GetWindowSizeForHashChain(quality, xsize);
int pos;
+ int argb_comp;
uint32_t base_position;
int32_t* hash_to_first_index;
// Temporarily use the p->offset_length_ as a hash chain.
int32_t* chain = (int32_t*)p->offset_length_;
+ assert(size > 0);
assert(p->size_ != 0);
assert(p->offset_length_ != NULL);
+ if (size <= 2) {
+ p->offset_length_[0] = p->offset_length_[size - 1] = 0;
+ return 1;
+ }
+
hash_to_first_index =
(int32_t*)WebPSafeMalloc(HASH_SIZE, sizeof(*hash_to_first_index));
if (hash_to_first_index == NULL) return 0;
@@ -262,48 +271,111 @@ int VP8LHashChainFill(VP8LHashChain* const p, int quality,
// Set the int32_t array to -1.
memset(hash_to_first_index, 0xff, HASH_SIZE * sizeof(*hash_to_first_index));
// Fill the chain linking pixels with the same hash.
- for (pos = 0; pos < size - 1; ++pos) {
- const uint32_t hash_code = GetPixPairHash64(argb + pos);
- chain[pos] = hash_to_first_index[hash_code];
- hash_to_first_index[hash_code] = pos;
+ argb_comp = (argb[0] == argb[1]);
+ for (pos = 0; pos < size - 2;) {
+ uint32_t hash_code;
+ const int argb_comp_next = (argb[pos + 1] == argb[pos + 2]);
+ if (argb_comp && argb_comp_next) {
+ // Consecutive pixels with the same color will share the same hash.
+ // We therefore use a different hash: the color and its repetition
+ // length.
+ uint32_t tmp[2];
+ uint32_t len = 1;
+ tmp[0] = argb[pos];
+ // Figure out how far the pixels are the same.
+ // The last pixel has a different 64-bit hash, as its next pixel does
+ // not have the same color, so we just need to get to the last pixel equal
+ // to its follower.
+ while (pos + (int)len + 2 < size && argb[pos + len + 2] == argb[pos]) {
+ ++len;
+ }
+ if (len > MAX_LENGTH) {
+ // Skip the pixels that match for distance=1 and length>MAX_LENGTH
+ // because they are linked to their predecessor and we automatically
+ // check that in the main for loop below. Skipping means setting no
+ // predecessor in the chain, hence -1.
+ memset(chain + pos, 0xff, (len - MAX_LENGTH) * sizeof(*chain));
+ pos += len - MAX_LENGTH;
+ len = MAX_LENGTH;
+ }
+ // Process the rest of the hash chain.
+ while (len) {
+ tmp[1] = len--;
+ hash_code = GetPixPairHash64(tmp);
+ chain[pos] = hash_to_first_index[hash_code];
+ hash_to_first_index[hash_code] = pos++;
+ }
+ argb_comp = 0;
+ } else {
+ // Just move one pixel forward.
+ hash_code = GetPixPairHash64(argb + pos);
+ chain[pos] = hash_to_first_index[hash_code];
+ hash_to_first_index[hash_code] = pos++;
+ argb_comp = argb_comp_next;
+ }
}
+ // Process the penultimate pixel.
+ chain[pos] = hash_to_first_index[GetPixPairHash64(argb + pos)];
+
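A worked example of the skip logic: assuming MAX_LENGTH is (1 << MAX_LENGTH_BITS) - 1 = 4095, a run whose measured repetition length is 5000 leaves its first 905 positions without a predecessor (the memset of 0xff writes -1 into the int32_t chain entries), because distance-1 matches are checked directly in the main loop below. The remaining 4095 positions are then hashed with tmp = {color, remaining length}, the length counting down from 4095 to 1, so every position in the run lands in its own chain instead of all colliding on the (pixel, pixel) hash.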
WebPSafeFree(hash_to_first_index);
// Find the best match interval at each pixel, defined by an offset to the
// pixel and a length. The right-most pixel cannot match anything to the right
// (hence a best length of 0) and the left-most pixel nothing to the left
// (hence an offset of 0).
+ assert(size > 2);
p->offset_length_[0] = p->offset_length_[size - 1] = 0;
- for (base_position = size - 2 < 0 ? 0 : size - 2; base_position > 0;) {
+ for (base_position = size - 2; base_position > 0;) {
const int max_len = MaxFindCopyLength(size - 1 - base_position);
const uint32_t* const argb_start = argb + base_position;
int iter = iter_max;
int best_length = 0;
uint32_t best_distance = 0;
+ uint32_t best_argb;
const int min_pos =
(base_position > window_size) ? base_position - window_size : 0;
const int length_max = (max_len < 256) ? max_len : 256;
uint32_t max_base_position;
- for (pos = chain[base_position]; pos >= min_pos; pos = chain[pos]) {
+ pos = chain[base_position];
+ if (!low_effort) {
int curr_length;
- if (--iter < 0) {
- break;
+ // Heuristic: use the comparison with the above line as an initialization.
+ if (base_position >= (uint32_t)xsize) {
+ curr_length = FindMatchLength(argb_start - xsize, argb_start,
+ best_length, max_len);
+ if (curr_length > best_length) {
+ best_length = curr_length;
+ best_distance = xsize;
+ }
+ --iter;
+ }
+ // Heuristic: compare to the previous pixel.
+ curr_length =
+ FindMatchLength(argb_start - 1, argb_start, best_length, max_len);
+ if (curr_length > best_length) {
+ best_length = curr_length;
+ best_distance = 1;
}
+ --iter;
+ // Skip the for loop if we already have the maximum.
+ if (best_length == MAX_LENGTH) pos = min_pos - 1;
+ }
+ best_argb = argb_start[best_length];
+
+ for (; pos >= min_pos && --iter; pos = chain[pos]) {
+ int curr_length;
assert(base_position > (uint32_t)pos);
- curr_length =
- FindMatchLength(argb + pos, argb_start, best_length, max_len);
+ if (argb[pos + best_length] != best_argb) continue;
+
+ curr_length = VP8LVectorMismatch(argb + pos, argb_start, max_len);
if (best_length < curr_length) {
best_length = curr_length;
best_distance = base_position - pos;
- // Stop if we have reached the maximum length. Otherwise, make sure
- // we have executed a minimum number of iterations depending on the
- // quality.
- if ((best_length == MAX_LENGTH) ||
- (curr_length >= length_max && iter < iter_min)) {
- break;
- }
+ best_argb = argb_start[best_length];
+ // Stop if we have reached a good enough length.
+ if (best_length >= length_max) break;
}
}
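The rewritten loop leans on a one-pixel quick reject: a candidate at pos can only beat the current best if it agrees with argb_start at offset best_length, which is why best_argb caches argb_start[best_length] and a single comparison guards the full VP8LVectorMismatch scan. A minimal sketch of the idea, with hypothetical names:

    #include <assert.h>
    #include <stdint.h>

    /* A candidate can only extend the best match if it matches the
       current position at offset best_length; test that pixel first. */
    static int MayImprove(const uint32_t* cand, const uint32_t* cur,
                          int best_length) {
      return cand[best_length] == cur[best_length];
    }

    int main(void) {
      const uint32_t cur[]  = { 1, 2, 3, 4 };
      const uint32_t cand[] = { 1, 2, 9, 4 };
      assert(MayImprove(cand, cur, 3));   /* worth the full scan */
      assert(!MayImprove(cand, cur, 2));  /* cannot beat best_length 2 */
      return 0;
    }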
// We have the best match but in case the two intervals continue matching
@@ -392,17 +464,16 @@ static int BackwardReferencesRle(int xsize, int ysize,
i = 1;
while (i < pix_count) {
const int max_len = MaxFindCopyLength(pix_count - i);
- const int kMinLength = 4;
const int rle_len = FindMatchLength(argb + i, argb + i - 1, 0, max_len);
const int prev_row_len = (i < xsize) ? 0 :
FindMatchLength(argb + i, argb + i - xsize, 0, max_len);
- if (rle_len >= prev_row_len && rle_len >= kMinLength) {
+ if (rle_len >= prev_row_len && rle_len >= MIN_LENGTH) {
BackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(1, rle_len));
// We don't need to update the color cache here since it is always the
// same pixel being copied, and that does not change the color cache
// state.
i += rle_len;
- } else if (prev_row_len >= kMinLength) {
+ } else if (prev_row_len >= MIN_LENGTH) {
BackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(xsize, prev_row_len));
if (use_color_cache) {
for (k = 0; k < prev_row_len; ++k) {
@@ -442,7 +513,7 @@ static int BackwardReferencesLz77(int xsize, int ysize,
int len = 0;
int j;
HashChainFindCopy(hash_chain, i, &offset, &len);
- if (len > MIN_LENGTH + 1) {
+ if (len >= MIN_LENGTH) {
const int len_ini = len;
int max_reach = 0;
assert(i + len < pix_count);
@@ -457,7 +528,7 @@ static int BackwardReferencesLz77(int xsize, int ysize,
for (j = i_last_check + 1; j <= i + len_ini; ++j) {
const int len_j = HashChainFindLength(hash_chain, j);
const int reach =
- j + (len_j > MIN_LENGTH + 1 ? len_j : 1); // 1 for single literal.
+ j + (len_j >= MIN_LENGTH ? len_j : 1); // 1 for single literal.
if (reach > max_reach) {
len = j - i;
max_reach = reach;
@@ -581,9 +652,10 @@ static void AddSingleLiteralWithCostModel(const uint32_t* const argb,
uint16_t* const dist_array) {
double cost_val = prev_cost;
const uint32_t color = argb[0];
- if (use_color_cache && VP8LColorCacheContains(hashers, color)) {
+ const int ix = use_color_cache ? VP8LColorCacheContains(hashers, color) : -1;
+ if (ix >= 0) {
+ // use_color_cache is true and hashers contains color
const double mul0 = 0.68;
- const int ix = VP8LColorCacheGetIndex(hashers, color);
cost_val += GetCacheCost(cost_model, ix) * mul0;
} else {
const double mul1 = 0.82;
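This hunk (and the matching ones below) relies on a changed VP8LColorCacheContains contract: it now returns the cache index when the color is present and -1 otherwise, folding the old Contains/GetIndex pair into one lookup. A minimal model of such a cache, with the 0x1e35a7bd multiplier assumed from utils/color_cache_utils:

    #include <stdint.h>

    typedef struct {
      uint32_t colors_[1 << 6];  /* a 6-bit cache, for the sketch */
      int hash_shift_;           /* 32 - cache bits */
    } CacheSketch;

    /* Returns the slot index if 'argb' is cached, -1 otherwise. */
    static int CacheContains(const CacheSketch* const cc, uint32_t argb) {
      const int key = (int)((argb * 0x1e35a7bdU) >> cc->hash_shift_);
      return (cc->colors_[key] == argb) ? key : -1;
    }

    int main(void) {
      CacheSketch cc = { { 0 }, 32 - 6 };
      const uint32_t pix = 0xff336699u;
      const int miss = CacheContains(&cc, pix);                 /* -1 */
      cc.colors_[(pix * 0x1e35a7bdU) >> cc.hash_shift_] = pix;  /* insert */
      return (miss == -1 && CacheContains(&cc, pix) >= 0) ? 0 : 1;
    }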
@@ -1215,7 +1287,8 @@ static int BackwardReferencesHashChainDistanceOnly(
int offset = 0, len = 0;
double prev_cost = cost_manager->costs_[i - 1];
HashChainFindCopy(hash_chain, i, &offset, &len);
- if (len >= MIN_LENGTH) {
+ if (len >= 2) {
+ // If we are dealing with a non-literal.
const int code = DistanceToPlaneCode(xsize, offset);
const double offset_cost = GetDistanceCost(cost_model, code);
const int first_i = i;
@@ -1304,20 +1377,17 @@ static int BackwardReferencesHashChainDistanceOnly(
}
goto next_symbol;
}
- if (len > MIN_LENGTH) {
- int code_min_length;
- double cost_total;
- offset = HashChainFindOffset(hash_chain, i);
- code_min_length = DistanceToPlaneCode(xsize, offset);
- cost_total = prev_cost +
- GetDistanceCost(cost_model, code_min_length) +
- GetLengthCost(cost_model, 1);
+ if (len > 2) {
+ // Also try the smallest interval possible (size 2).
+ double cost_total =
+ prev_cost + offset_cost + GetLengthCost(cost_model, 1);
if (cost_manager->costs_[i + 1] > cost_total) {
cost_manager->costs_[i + 1] = (float)cost_total;
dist_array[i + 1] = 2;
}
}
- } else { // len < MIN_LENGTH
+ } else {
+ // The pixel is added as a single literal so just update the costs.
UpdateCostPerIndex(cost_manager, i + 1);
}
@@ -1393,9 +1463,11 @@ static int BackwardReferencesHashChainFollowChosenPath(
i += len;
} else {
PixOrCopy v;
- if (use_color_cache && VP8LColorCacheContains(&hashers, argb[i])) {
+ const int idx =
+ use_color_cache ? VP8LColorCacheContains(&hashers, argb[i]) : -1;
+ if (idx >= 0) {
+ // use_color_cache is true and hashers contains argb[i]
// push pixel as a color cache index
- const int idx = VP8LColorCacheGetIndex(&hashers, argb[i]);
v = PixOrCopyCreateCacheIdx(idx);
} else {
if (use_color_cache) VP8LColorCacheInsert(&hashers, argb[i]);
@@ -1454,63 +1526,89 @@ static void BackwardReferences2DLocality(int xsize,
}
}
-// Returns entropy for the given cache bits.
-static double ComputeCacheEntropy(const uint32_t* argb,
- const VP8LBackwardRefs* const refs,
- int cache_bits) {
- const int use_color_cache = (cache_bits > 0);
- int cc_init = 0;
- double entropy = MAX_ENTROPY;
- const double kSmallPenaltyForLargeCache = 4.0;
- VP8LColorCache hashers;
+// Computes the entropies for a color cache size (in bits) between 0 (unused)
+// and cache_bits_max (inclusive).
+// Returns 1 on success, 0 in case of allocation error.
+static int ComputeCacheEntropies(const uint32_t* argb,
+ const VP8LBackwardRefs* const refs,
+ int cache_bits_max, double entropies[]) {
+ int cc_init[MAX_COLOR_CACHE_BITS + 1] = { 0 };
+ VP8LColorCache hashers[MAX_COLOR_CACHE_BITS + 1];
VP8LRefsCursor c = VP8LRefsCursorInit(refs);
- VP8LHistogram* histo = VP8LAllocateHistogram(cache_bits);
- if (histo == NULL) goto Error;
+ VP8LHistogram* histos[MAX_COLOR_CACHE_BITS + 1] = { NULL };
+ int ok = 0;
+ int i;
- if (use_color_cache) {
- cc_init = VP8LColorCacheInit(&hashers, cache_bits);
- if (!cc_init) goto Error;
+ for (i = 0; i <= cache_bits_max; ++i) {
+ histos[i] = VP8LAllocateHistogram(i);
+ if (histos[i] == NULL) goto Error;
+ if (i == 0) continue;
+ cc_init[i] = VP8LColorCacheInit(&hashers[i], i);
+ if (!cc_init[i]) goto Error;
}
- if (!use_color_cache) {
- while (VP8LRefsCursorOk(&c)) {
- VP8LHistogramAddSinglePixOrCopy(histo, c.cur_pos);
- VP8LRefsCursorNext(&c);
- }
- } else {
+
+ assert(cache_bits_max >= 0);
+ // Do not use the color cache for cache_bits=0.
+ while (VP8LRefsCursorOk(&c)) {
+ VP8LHistogramAddSinglePixOrCopy(histos[0], c.cur_pos);
+ VP8LRefsCursorNext(&c);
+ }
+ if (cache_bits_max > 0) {
+ c = VP8LRefsCursorInit(refs);
while (VP8LRefsCursorOk(&c)) {
const PixOrCopy* const v = c.cur_pos;
if (PixOrCopyIsLiteral(v)) {
const uint32_t pix = *argb++;
- const uint32_t key = VP8LColorCacheGetIndex(&hashers, pix);
- if (VP8LColorCacheLookup(&hashers, key) == pix) {
- ++histo->literal_[NUM_LITERAL_CODES + NUM_LENGTH_CODES + key];
- } else {
- VP8LColorCacheSet(&hashers, key, pix);
- ++histo->blue_[pix & 0xff];
- ++histo->literal_[(pix >> 8) & 0xff];
- ++histo->red_[(pix >> 16) & 0xff];
- ++histo->alpha_[pix >> 24];
+ // The keys of the caches can be derived from the longest one.
+ int key = HashPix(pix, 32 - cache_bits_max);
+ for (i = cache_bits_max; i >= 1; --i, key >>= 1) {
+ if (VP8LColorCacheLookup(&hashers[i], key) == pix) {
+ ++histos[i]->literal_[NUM_LITERAL_CODES + NUM_LENGTH_CODES + key];
+ } else {
+ VP8LColorCacheSet(&hashers[i], key, pix);
+ ++histos[i]->blue_[pix & 0xff];
+ ++histos[i]->literal_[(pix >> 8) & 0xff];
+ ++histos[i]->red_[(pix >> 16) & 0xff];
+ ++histos[i]->alpha_[pix >> 24];
+ }
}
} else {
+ // Update the histograms for distance/length.
int len = PixOrCopyLength(v);
- int code, extra_bits;
- VP8LPrefixEncodeBits(len, &code, &extra_bits);
- ++histo->literal_[NUM_LITERAL_CODES + code];
- VP8LPrefixEncodeBits(PixOrCopyDistance(v), &code, &extra_bits);
- ++histo->distance_[code];
+ int code_dist, code_len, extra_bits;
+ uint32_t argb_prev = *argb ^ 0xffffffffu;
+ VP8LPrefixEncodeBits(len, &code_len, &extra_bits);
+ VP8LPrefixEncodeBits(PixOrCopyDistance(v), &code_dist, &extra_bits);
+ for (i = 1; i <= cache_bits_max; ++i) {
+ ++histos[i]->literal_[NUM_LITERAL_CODES + code_len];
+ ++histos[i]->distance_[code_dist];
+ }
+ // Update the color caches.
do {
- VP8LColorCacheInsert(&hashers, *argb++);
- } while(--len != 0);
+ if (*argb != argb_prev) {
+ // Efficiency: insert only if the color changes.
+ int key = HashPix(*argb, 32 - cache_bits_max);
+ for (i = cache_bits_max; i >= 1; --i, key >>= 1) {
+ hashers[i].colors_[key] = *argb;
+ }
+ argb_prev = *argb;
+ }
+ argb++;
+ } while (--len != 0);
}
VP8LRefsCursorNext(&c);
}
}
- entropy = VP8LHistogramEstimateBits(histo) +
- kSmallPenaltyForLargeCache * cache_bits;
- Error:
- if (cc_init) VP8LColorCacheClear(&hashers);
- VP8LFreeHistogram(histo);
- return entropy;
+ for (i = 0; i <= cache_bits_max; ++i) {
+ entropies[i] = VP8LHistogramEstimateBits(histos[i]);
+ }
+ ok = 1;
+Error:
+ for (i = 0; i <= cache_bits_max; ++i) {
+ if (cc_init[i]) VP8LColorCacheClear(&hashers[i]);
+ VP8LFreeHistogram(histos[i]);
+ }
+ return ok;
}
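The single-key trick above works because HashPix keeps the top bits of a multiplicative hash, so the i-bit key is just the (i+1)-bit key shifted right by one, and one hash of the longest size serves every smaller cache. A standalone check of that property, with HashPix's assumed shape:

    #include <assert.h>
    #include <stdint.h>

    /* Assumed shape of HashPix(): keep the top (32 - shift) bits. */
    static int HashPix(uint32_t argb, int shift) {
      return (int)((argb * 0x1e35a7bdU) >> shift);
    }

    int main(void) {
      const uint32_t pix = 0xff336699u;
      const int cache_bits_max = 10;
      int key = HashPix(pix, 32 - cache_bits_max);
      int i;
      for (i = cache_bits_max; i >= 1; --i, key >>= 1) {
        /* The shifted key equals a fresh i-bit hash of the same pixel. */
        assert(key == HashPix(pix, 32 - i));
      }
      return 0;
    }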
// Evaluate optimal cache bits for the local color cache.
@@ -1524,13 +1622,10 @@ static int CalculateBestCacheSize(const uint32_t* const argb,
VP8LBackwardRefs* const refs,
int* const lz77_computed,
int* const best_cache_bits) {
- int eval_low = 1;
- int eval_high = 1;
- double entropy_low = MAX_ENTROPY;
- double entropy_high = MAX_ENTROPY;
- const double cost_mul = 5e-4;
- int cache_bits_low = 0;
+ int i;
int cache_bits_high = (quality <= 25) ? 0 : *best_cache_bits;
+ double entropy_min = MAX_ENTROPY;
+ double entropies[MAX_COLOR_CACHE_BITS + 1];
assert(cache_bits_high <= MAX_COLOR_CACHE_BITS);
@@ -1540,34 +1635,23 @@ static int CalculateBestCacheSize(const uint32_t* const argb,
// Local color cache is disabled.
return 1;
}
- if (!BackwardReferencesLz77(xsize, ysize, argb, cache_bits_low, hash_chain,
- refs)) {
+ // Compute LZ77 with no cache (0 bits), as the ideal LZ77 with a color cache
+ // is not that different in practice.
+ if (!BackwardReferencesLz77(xsize, ysize, argb, 0, hash_chain, refs)) {
return 0;
}
- // Do a binary search to find the optimal entropy for cache_bits.
- while (eval_low || eval_high) {
- if (eval_low) {
- entropy_low = ComputeCacheEntropy(argb, refs, cache_bits_low);
- entropy_low += entropy_low * cache_bits_low * cost_mul;
- eval_low = 0;
- }
- if (eval_high) {
- entropy_high = ComputeCacheEntropy(argb, refs, cache_bits_high);
- entropy_high += entropy_high * cache_bits_high * cost_mul;
- eval_high = 0;
- }
- if (entropy_high < entropy_low) {
- const int prev_cache_bits_low = cache_bits_low;
- *best_cache_bits = cache_bits_high;
- cache_bits_low = (cache_bits_low + cache_bits_high) / 2;
- if (cache_bits_low != prev_cache_bits_low) eval_low = 1;
- } else {
- *best_cache_bits = cache_bits_low;
- cache_bits_high = (cache_bits_low + cache_bits_high) / 2;
- if (cache_bits_high != cache_bits_low) eval_high = 1;
+ // Find the cache_bits giving the lowest entropy. The search is done in a
+ // brute-force way as the function (entropy w.r.t cache_bits) is not
+ // monotonic in practice, so a binary search cannot be trusted.
+ if (!ComputeCacheEntropies(argb, refs, cache_bits_high, entropies)) {
+ return 0;
+ }
+ for (i = 0; i <= cache_bits_high; ++i) {
+ if (i == 0 || entropies[i] < entropy_min) {
+ entropy_min = entropies[i];
+ *best_cache_bits = i;
}
}
- *lz77_computed = 1;
return 1;
}
@@ -1584,8 +1668,9 @@ static int BackwardRefsWithLocalCache(const uint32_t* const argb,
PixOrCopy* const v = c.cur_pos;
if (PixOrCopyIsLiteral(v)) {
const uint32_t argb_literal = v->argb_or_distance;
- if (VP8LColorCacheContains(&hashers, argb_literal)) {
- const int ix = VP8LColorCacheGetIndex(&hashers, argb_literal);
+ const int ix = VP8LColorCacheContains(&hashers, argb_literal);
+ if (ix >= 0) {
+ // hashers contains argb_literal
*v = PixOrCopyCreateCacheIdx(ix);
} else {
VP8LColorCacheInsert(&hashers, argb_literal);
diff --git a/src/3rdparty/libwebp/src/enc/backward_references.h b/src/3rdparty/libwebp/src/enc/backward_references_enc.h
index 0cadb11..3a19aa7 100644
--- a/src/3rdparty/libwebp/src/enc/backward_references.h
+++ b/src/3rdparty/libwebp/src/enc/backward_references_enc.h
@@ -130,7 +130,8 @@ struct VP8LHashChain {
int VP8LHashChainInit(VP8LHashChain* const p, int size);
// Pre-compute the best matches for argb.
int VP8LHashChainFill(VP8LHashChain* const p, int quality,
- const uint32_t* const argb, int xsize, int ysize);
+ const uint32_t* const argb, int xsize, int ysize,
+ int low_effort);
void VP8LHashChainClear(VP8LHashChain* const p); // release memory
// -----------------------------------------------------------------------------
diff --git a/src/3rdparty/libwebp/src/enc/config.c b/src/3rdparty/libwebp/src/enc/config_enc.c
index f9f7961..4589dc0 100644
--- a/src/3rdparty/libwebp/src/enc/config.c
+++ b/src/3rdparty/libwebp/src/enc/config_enc.c
@@ -11,6 +11,10 @@
//
// Author: Skal (pascal.massimino@gmail.com)
+#ifdef HAVE_CONFIG_H
+#include "../webp/config.h"
+#endif
+
#include "../webp/encode.h"
//------------------------------------------------------------------------------
@@ -49,9 +53,8 @@ int WebPConfigInitInternal(WebPConfig* config,
config->thread_level = 0;
config->low_memory = 0;
config->near_lossless = 100;
-#ifdef WEBP_EXPERIMENTAL_FEATURES
- config->delta_palettization = 0;
-#endif // WEBP_EXPERIMENTAL_FEATURES
+ config->use_delta_palette = 0;
+ config->use_sharp_yuv = 0;
// TODO(skal): tune.
switch (preset) {
@@ -92,60 +95,36 @@ int WebPConfigInitInternal(WebPConfig* config,
int WebPValidateConfig(const WebPConfig* config) {
if (config == NULL) return 0;
- if (config->quality < 0 || config->quality > 100)
- return 0;
- if (config->target_size < 0)
- return 0;
- if (config->target_PSNR < 0)
- return 0;
- if (config->method < 0 || config->method > 6)
- return 0;
- if (config->segments < 1 || config->segments > 4)
- return 0;
- if (config->sns_strength < 0 || config->sns_strength > 100)
- return 0;
- if (config->filter_strength < 0 || config->filter_strength > 100)
- return 0;
- if (config->filter_sharpness < 0 || config->filter_sharpness > 7)
- return 0;
- if (config->filter_type < 0 || config->filter_type > 1)
- return 0;
- if (config->autofilter < 0 || config->autofilter > 1)
- return 0;
- if (config->pass < 1 || config->pass > 10)
- return 0;
- if (config->show_compressed < 0 || config->show_compressed > 1)
- return 0;
- if (config->preprocessing < 0 || config->preprocessing > 7)
- return 0;
- if (config->partitions < 0 || config->partitions > 3)
+ if (config->quality < 0 || config->quality > 100) return 0;
+ if (config->target_size < 0) return 0;
+ if (config->target_PSNR < 0) return 0;
+ if (config->method < 0 || config->method > 6) return 0;
+ if (config->segments < 1 || config->segments > 4) return 0;
+ if (config->sns_strength < 0 || config->sns_strength > 100) return 0;
+ if (config->filter_strength < 0 || config->filter_strength > 100) return 0;
+ if (config->filter_sharpness < 0 || config->filter_sharpness > 7) return 0;
+ if (config->filter_type < 0 || config->filter_type > 1) return 0;
+ if (config->autofilter < 0 || config->autofilter > 1) return 0;
+ if (config->pass < 1 || config->pass > 10) return 0;
+ if (config->show_compressed < 0 || config->show_compressed > 1) return 0;
+ if (config->preprocessing < 0 || config->preprocessing > 7) return 0;
+ if (config->partitions < 0 || config->partitions > 3) return 0;
+ if (config->partition_limit < 0 || config->partition_limit > 100) return 0;
+ if (config->alpha_compression < 0) return 0;
+ if (config->alpha_filtering < 0) return 0;
+ if (config->alpha_quality < 0 || config->alpha_quality > 100) return 0;
+ if (config->lossless < 0 || config->lossless > 1) return 0;
+ if (config->near_lossless < 0 || config->near_lossless > 100) return 0;
+ if (config->image_hint >= WEBP_HINT_LAST) return 0;
+ if (config->emulate_jpeg_size < 0 || config->emulate_jpeg_size > 1) return 0;
+ if (config->thread_level < 0 || config->thread_level > 1) return 0;
+ if (config->low_memory < 0 || config->low_memory > 1) return 0;
+ if (config->exact < 0 || config->exact > 1) return 0;
+ if (config->use_delta_palette < 0 || config->use_delta_palette > 1) {
return 0;
- if (config->partition_limit < 0 || config->partition_limit > 100)
- return 0;
- if (config->alpha_compression < 0)
- return 0;
- if (config->alpha_filtering < 0)
- return 0;
- if (config->alpha_quality < 0 || config->alpha_quality > 100)
- return 0;
- if (config->lossless < 0 || config->lossless > 1)
- return 0;
- if (config->near_lossless < 0 || config->near_lossless > 100)
- return 0;
- if (config->image_hint >= WEBP_HINT_LAST)
- return 0;
- if (config->emulate_jpeg_size < 0 || config->emulate_jpeg_size > 1)
- return 0;
- if (config->thread_level < 0 || config->thread_level > 1)
- return 0;
- if (config->low_memory < 0 || config->low_memory > 1)
- return 0;
- if (config->exact < 0 || config->exact > 1)
- return 0;
-#ifdef WEBP_EXPERIMENTAL_FEATURES
- if (config->delta_palettization < 0 || config->delta_palettization > 1)
- return 0;
-#endif // WEBP_EXPERIMENTAL_FEATURES
+ }
+ if (config->use_sharp_yuv < 0 || config->use_sharp_yuv > 1) return 0;
+
return 1;
}
diff --git a/src/3rdparty/libwebp/src/enc/cost.c b/src/3rdparty/libwebp/src/enc/cost_enc.c
index ae7fe01..c823f5a 100644
--- a/src/3rdparty/libwebp/src/enc/cost.c
+++ b/src/3rdparty/libwebp/src/enc/cost_enc.c
@@ -11,7 +11,7 @@
//
// Author: Skal (pascal.massimino@gmail.com)
-#include "./cost.h"
+#include "./cost_enc.h"
//------------------------------------------------------------------------------
// Level cost tables
@@ -281,18 +281,6 @@ int VP8GetCostUV(VP8EncIterator* const it, const VP8ModeScore* const rd) {
//------------------------------------------------------------------------------
// Recording of token probabilities.
-// Record proba context used
-static int Record(int bit, proba_t* const stats) {
- proba_t p = *stats;
- if (p >= 0xffff0000u) { // an overflow is inbound.
- p = ((p + 1u) >> 1) & 0x7fff7fffu; // -> divide the stats by 2.
- }
- // record bit count (lower 16 bits) and increment total count (upper 16 bits).
- p += 0x00010000u + bit;
- *stats = p;
- return bit;
-}
-
// We keep the table-free variant around for reference, in case.
#define USE_LEVEL_CODE_TABLE
@@ -303,31 +291,31 @@ int VP8RecordCoeffs(int ctx, const VP8Residual* const res) {
// should be stats[VP8EncBands[n]], but it's equivalent for n=0 or 1
proba_t* s = res->stats[n][ctx];
if (res->last < 0) {
- Record(0, s + 0);
+ VP8RecordStats(0, s + 0);
return 0;
}
while (n <= res->last) {
int v;
- Record(1, s + 0); // order of record doesn't matter
+ VP8RecordStats(1, s + 0); // order of record doesn't matter
while ((v = res->coeffs[n++]) == 0) {
- Record(0, s + 1);
+ VP8RecordStats(0, s + 1);
s = res->stats[VP8EncBands[n]][0];
}
- Record(1, s + 1);
- if (!Record(2u < (unsigned int)(v + 1), s + 2)) { // v = -1 or 1
+ VP8RecordStats(1, s + 1);
+ if (!VP8RecordStats(2u < (unsigned int)(v + 1), s + 2)) { // v = -1 or 1
s = res->stats[VP8EncBands[n]][1];
} else {
v = abs(v);
#if !defined(USE_LEVEL_CODE_TABLE)
- if (!Record(v > 4, s + 3)) {
- if (Record(v != 2, s + 4))
- Record(v == 4, s + 5);
- } else if (!Record(v > 10, s + 6)) {
- Record(v > 6, s + 7);
- } else if (!Record((v >= 3 + (8 << 2)), s + 8)) {
- Record((v >= 3 + (8 << 1)), s + 9);
+ if (!VP8RecordStats(v > 4, s + 3)) {
+ if (VP8RecordStats(v != 2, s + 4))
+ VP8RecordStats(v == 4, s + 5);
+ } else if (!VP8RecordStats(v > 10, s + 6)) {
+ VP8RecordStats(v > 6, s + 7);
+ } else if (!VP8RecordStats((v >= 3 + (8 << 2)), s + 8)) {
+ VP8RecordStats((v >= 3 + (8 << 1)), s + 9);
} else {
- Record((v >= 3 + (8 << 3)), s + 10);
+ VP8RecordStats((v >= 3 + (8 << 3)), s + 10);
}
#else
if (v > MAX_VARIABLE_LEVEL) {
@@ -340,14 +328,14 @@ int VP8RecordCoeffs(int ctx, const VP8Residual* const res) {
int i;
for (i = 0; (pattern >>= 1) != 0; ++i) {
const int mask = 2 << i;
- if (pattern & 1) Record(!!(bits & mask), s + 3 + i);
+ if (pattern & 1) VP8RecordStats(!!(bits & mask), s + 3 + i);
}
}
#endif
s = res->stats[VP8EncBands[n]][2];
}
}
- if (n < 16) Record(0, s + 0);
+ if (n < 16) VP8RecordStats(0, s + 0);
return 1;
}
diff --git a/src/3rdparty/libwebp/src/enc/cost.h b/src/3rdparty/libwebp/src/enc/cost_enc.h
index 20960d6..99e4b37 100644
--- a/src/3rdparty/libwebp/src/enc/cost.h
+++ b/src/3rdparty/libwebp/src/enc/cost_enc.h
@@ -16,7 +16,7 @@
#include <assert.h>
#include <stdlib.h>
-#include "./vp8enci.h"
+#include "./vp8i_enc.h"
#ifdef __cplusplus
extern "C" {
@@ -41,6 +41,20 @@ void VP8InitResidual(int first, int coeff_type,
int VP8RecordCoeffs(int ctx, const VP8Residual* const res);
+// Record proba context used.
+static WEBP_INLINE int VP8RecordStats(int bit, proba_t* const stats) {
+ proba_t p = *stats;
+ // An overflow is imminent. Note we handle this at 0xfffe0000u instead of
+ // 0xffff0000u to make sure p + 1u does not overflow.
+ if (p >= 0xfffe0000u) {
+ p = ((p + 1u) >> 1) & 0x7fff7fffu; // -> divide the stats by 2.
+ }
+ // record bit count (lower 16 bits) and increment total count (upper 16 bits).
+ p += 0x00010000u + bit;
+ *stats = p;
+ return bit;
+}
+
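The proba_t word packs two counters: the low 16 bits accumulate the recorded 1-bits, the high 16 bits count all events, and the saturation guard halves both at once so the observed probability is preserved. A small self-contained illustration:

    #include <stdint.h>
    #include <stdio.h>

    typedef uint32_t proba_t;  /* bit count (low 16) | total count (high 16) */

    int main(void) {
      proba_t p = (1000u << 16) | 250u;  /* 1000 events, 250 of them ones */
      printf("p(1) = %u/%u\n", p & 0xffff, p >> 16);           /* 250/1000 */
      /* The guard fires at 0xfffe0000u rather than 0xffff0000u so that
         p + 1u below cannot wrap around; halving keeps the ratio. */
      p = ((p + 1u) >> 1) & 0x7fff7fffu;
      printf("halved: %u/%u\n", p & 0xffff, p >> 16);          /* 125/500 */
      return 0;
    }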
// Cost of coding one event with probability 'proba'.
static WEBP_INLINE int VP8BitCost(int bit, uint8_t proba) {
return !bit ? VP8EntropyCost[proba] : VP8EntropyCost[255 - proba];
diff --git a/src/3rdparty/libwebp/src/enc/delta_palettization.c b/src/3rdparty/libwebp/src/enc/delta_palettization_enc.c
index 062e588..eaf0f05 100644
--- a/src/3rdparty/libwebp/src/enc/delta_palettization.c
+++ b/src/3rdparty/libwebp/src/enc/delta_palettization_enc.c
@@ -10,7 +10,7 @@
// Author: Mislav Bradac (mislavm@google.com)
//
-#include "./delta_palettization.h"
+#include "./delta_palettization_enc.h"
#ifdef WEBP_EXPERIMENTAL_FEATURES
#include "../webp/types.h"
diff --git a/src/3rdparty/libwebp/src/enc/delta_palettization.h b/src/3rdparty/libwebp/src/enc/delta_palettization_enc.h
index e41c0c5..63048ec 100644
--- a/src/3rdparty/libwebp/src/enc/delta_palettization.h
+++ b/src/3rdparty/libwebp/src/enc/delta_palettization_enc.h
@@ -14,7 +14,7 @@
#define WEBP_ENC_DELTA_PALETTIZATION_H_
#include "../webp/encode.h"
-#include "../enc/vp8li.h"
+#include "../enc/vp8li_enc.h"
// Replaces enc->argb_[] input by a palettizable approximation of it,
// and generates optimal enc->palette_[].
diff --git a/src/3rdparty/libwebp/src/enc/filter.c b/src/3rdparty/libwebp/src/enc/filter_enc.c
index e8ea8b4..4bc3672 100644
--- a/src/3rdparty/libwebp/src/enc/filter.c
+++ b/src/3rdparty/libwebp/src/enc/filter_enc.c
@@ -12,7 +12,7 @@
// Author: somnath@google.com (Somnath Banerjee)
#include <assert.h>
-#include "./vp8enci.h"
+#include "./vp8i_enc.h"
#include "../dsp/dsp.h"
// This table gives, for a given sharpness, the filtering strength to be
@@ -105,115 +105,28 @@ static void DoFilter(const VP8EncIterator* const it, int level) {
}
//------------------------------------------------------------------------------
-// SSIM metric
-
-static const double kMinValue = 1.e-10; // minimal threshold
-
-void VP8SSIMAddStats(const VP8DistoStats* const src, VP8DistoStats* const dst) {
- dst->w += src->w;
- dst->xm += src->xm;
- dst->ym += src->ym;
- dst->xxm += src->xxm;
- dst->xym += src->xym;
- dst->yym += src->yym;
-}
-
-double VP8SSIMGet(const VP8DistoStats* const stats) {
- const double xmxm = stats->xm * stats->xm;
- const double ymym = stats->ym * stats->ym;
- const double xmym = stats->xm * stats->ym;
- const double w2 = stats->w * stats->w;
- double sxx = stats->xxm * stats->w - xmxm;
- double syy = stats->yym * stats->w - ymym;
- double sxy = stats->xym * stats->w - xmym;
- double C1, C2;
- double fnum;
- double fden;
- // small errors are possible, due to rounding. Clamp to zero.
- if (sxx < 0.) sxx = 0.;
- if (syy < 0.) syy = 0.;
- C1 = 6.5025 * w2;
- C2 = 58.5225 * w2;
- fnum = (2 * xmym + C1) * (2 * sxy + C2);
- fden = (xmxm + ymym + C1) * (sxx + syy + C2);
- return (fden != 0.) ? fnum / fden : kMinValue;
-}
-
-double VP8SSIMGetSquaredError(const VP8DistoStats* const s) {
- if (s->w > 0.) {
- const double iw2 = 1. / (s->w * s->w);
- const double sxx = s->xxm * s->w - s->xm * s->xm;
- const double syy = s->yym * s->w - s->ym * s->ym;
- const double sxy = s->xym * s->w - s->xm * s->ym;
- const double SSE = iw2 * (sxx + syy - 2. * sxy);
- if (SSE > kMinValue) return SSE;
- }
- return kMinValue;
-}
-
-#define LIMIT(A, M) ((A) > (M) ? (M) : (A))
-static void VP8SSIMAccumulateRow(const uint8_t* src1, int stride1,
- const uint8_t* src2, int stride2,
- int y, int W, int H,
- VP8DistoStats* const stats) {
- int x = 0;
- const int w0 = LIMIT(VP8_SSIM_KERNEL, W);
- for (x = 0; x < w0; ++x) {
- VP8SSIMAccumulateClipped(src1, stride1, src2, stride2, x, y, W, H, stats);
- }
- for (; x <= W - 8 + VP8_SSIM_KERNEL; ++x) {
- VP8SSIMAccumulate(
- src1 + (y - VP8_SSIM_KERNEL) * stride1 + (x - VP8_SSIM_KERNEL), stride1,
- src2 + (y - VP8_SSIM_KERNEL) * stride2 + (x - VP8_SSIM_KERNEL), stride2,
- stats);
- }
- for (; x < W; ++x) {
- VP8SSIMAccumulateClipped(src1, stride1, src2, stride2, x, y, W, H, stats);
- }
-}
-
-void VP8SSIMAccumulatePlane(const uint8_t* src1, int stride1,
- const uint8_t* src2, int stride2,
- int W, int H, VP8DistoStats* const stats) {
- int x, y;
- const int h0 = LIMIT(VP8_SSIM_KERNEL, H);
- const int h1 = LIMIT(VP8_SSIM_KERNEL, H - VP8_SSIM_KERNEL);
- for (y = 0; y < h0; ++y) {
- for (x = 0; x < W; ++x) {
- VP8SSIMAccumulateClipped(src1, stride1, src2, stride2, x, y, W, H, stats);
- }
- }
- for (; y < h1; ++y) {
- VP8SSIMAccumulateRow(src1, stride1, src2, stride2, y, W, H, stats);
- }
- for (; y < H; ++y) {
- for (x = 0; x < W; ++x) {
- VP8SSIMAccumulateClipped(src1, stride1, src2, stride2, x, y, W, H, stats);
- }
- }
-}
-#undef LIMIT
+// SSIM metric for one macroblock
static double GetMBSSIM(const uint8_t* yuv1, const uint8_t* yuv2) {
int x, y;
- VP8DistoStats s = { .0, .0, .0, .0, .0, .0 };
+ double sum = 0.;
// compute SSIM in a 10 x 10 window
for (y = VP8_SSIM_KERNEL; y < 16 - VP8_SSIM_KERNEL; y++) {
for (x = VP8_SSIM_KERNEL; x < 16 - VP8_SSIM_KERNEL; x++) {
- VP8SSIMAccumulateClipped(yuv1 + Y_OFF_ENC, BPS, yuv2 + Y_OFF_ENC, BPS,
- x, y, 16, 16, &s);
+ sum += VP8SSIMGetClipped(yuv1 + Y_OFF_ENC, BPS, yuv2 + Y_OFF_ENC, BPS,
+ x, y, 16, 16);
}
}
for (x = 1; x < 7; x++) {
for (y = 1; y < 7; y++) {
- VP8SSIMAccumulateClipped(yuv1 + U_OFF_ENC, BPS, yuv2 + U_OFF_ENC, BPS,
- x, y, 8, 8, &s);
- VP8SSIMAccumulateClipped(yuv1 + V_OFF_ENC, BPS, yuv2 + V_OFF_ENC, BPS,
- x, y, 8, 8, &s);
+ sum += VP8SSIMGetClipped(yuv1 + U_OFF_ENC, BPS, yuv2 + U_OFF_ENC, BPS,
+ x, y, 8, 8);
+ sum += VP8SSIMGetClipped(yuv1 + V_OFF_ENC, BPS, yuv2 + V_OFF_ENC, BPS,
+ x, y, 8, 8);
}
}
- return VP8SSIMGet(&s);
+ return sum;
}
//------------------------------------------------------------------------------
diff --git a/src/3rdparty/libwebp/src/enc/frame.c b/src/3rdparty/libwebp/src/enc/frame_enc.c
index 5b7a40b..abef523 100644
--- a/src/3rdparty/libwebp/src/enc/frame.c
+++ b/src/3rdparty/libwebp/src/enc/frame_enc.c
@@ -14,8 +14,8 @@
#include <string.h>
#include <math.h>
-#include "./cost.h"
-#include "./vp8enci.h"
+#include "./cost_enc.h"
+#include "./vp8i_enc.h"
#include "../dsp/dsp.h"
#include "../webp/format_constants.h" // RIFF constants
@@ -185,6 +185,13 @@ static int GetProba(int a, int b) {
: (255 * a + total / 2) / total; // rounded proba
}
+static void ResetSegments(VP8Encoder* const enc) {
+ int n;
+ for (n = 0; n < enc->mb_w_ * enc->mb_h_; ++n) {
+ enc->mb_info_[n].segment_ = 0;
+ }
+}
+
static void SetSegmentProbas(VP8Encoder* const enc) {
int p[NUM_MB_SEGMENTS] = { 0 };
int n;
@@ -206,6 +213,7 @@ static void SetSegmentProbas(VP8Encoder* const enc) {
enc->segment_hdr_.update_map_ =
(probas[0] != 255) || (probas[1] != 255) || (probas[2] != 255);
+ if (!enc->segment_hdr_.update_map_) ResetSegments(enc);
enc->segment_hdr_.size_ =
p[0] * (VP8BitCost(0, probas[0]) + VP8BitCost(0, probas[1])) +
p[1] * (VP8BitCost(0, probas[0]) + VP8BitCost(1, probas[1])) +
@@ -240,8 +248,9 @@ static int PutCoeffs(VP8BitWriter* const bw, int ctx, const VP8Residual* res) {
p = res->prob[VP8EncBands[n]][1];
} else {
if (!VP8PutBit(bw, v > 4, p[3])) {
- if (VP8PutBit(bw, v != 2, p[4]))
+ if (VP8PutBit(bw, v != 2, p[4])) {
VP8PutBit(bw, v == 4, p[5]);
+ }
} else if (!VP8PutBit(bw, v > 10, p[6])) {
if (!VP8PutBit(bw, v > 6, p[7])) {
VP8PutBit(bw, v == 6, 159);
@@ -406,9 +415,7 @@ static int RecordTokens(VP8EncIterator* const it, const VP8ModeScore* const rd,
VP8InitResidual(0, 1, enc, &res);
VP8SetResidualCoeffs(rd->y_dc_levels, &res);
it->top_nz_[8] = it->left_nz_[8] =
- VP8RecordCoeffTokens(ctx, 1,
- res.first, res.last, res.coeffs, tokens);
- VP8RecordCoeffs(ctx, &res);
+ VP8RecordCoeffTokens(ctx, &res, tokens);
VP8InitResidual(1, 0, enc, &res);
} else {
VP8InitResidual(0, 3, enc, &res);
@@ -420,9 +427,7 @@ static int RecordTokens(VP8EncIterator* const it, const VP8ModeScore* const rd,
const int ctx = it->top_nz_[x] + it->left_nz_[y];
VP8SetResidualCoeffs(rd->y_ac_levels[x + y * 4], &res);
it->top_nz_[x] = it->left_nz_[y] =
- VP8RecordCoeffTokens(ctx, res.coeff_type,
- res.first, res.last, res.coeffs, tokens);
- VP8RecordCoeffs(ctx, &res);
+ VP8RecordCoeffTokens(ctx, &res, tokens);
}
}
@@ -434,9 +439,7 @@ static int RecordTokens(VP8EncIterator* const it, const VP8ModeScore* const rd,
const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
VP8SetResidualCoeffs(rd->uv_levels[ch * 2 + x + y * 2], &res);
it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] =
- VP8RecordCoeffTokens(ctx, 2,
- res.first, res.last, res.coeffs, tokens);
- VP8RecordCoeffs(ctx, &res);
+ VP8RecordCoeffTokens(ctx, &res, tokens);
}
}
}
@@ -555,8 +558,9 @@ static uint64_t OneStatPass(VP8Encoder* const enc, VP8RDLevel rd_opt,
size += info.R + info.H;
size_p0 += info.H;
distortion += info.D;
- if (percent_delta && !VP8IteratorProgress(&it, percent_delta))
+ if (percent_delta && !VP8IteratorProgress(&it, percent_delta)) {
return 0;
+ }
VP8IteratorSaveBoundary(&it);
} while (VP8IteratorNext(&it) && --nb_mbs > 0);
@@ -814,7 +818,7 @@ int VP8EncTokenLoop(VP8Encoder* const enc) {
num_pass_left, stats.last_value, stats.value,
stats.last_q, stats.q, stats.dq);
#endif
- if (size_p0 > PARTITION0_SIZE_LIMIT) {
+ if (enc->max_i4_header_bits_ > 0 && size_p0 > PARTITION0_SIZE_LIMIT) {
++num_pass_left;
enc->max_i4_header_bits_ >>= 1; // strengthen header bit limitation...
continue; // ...and start over
diff --git a/src/3rdparty/libwebp/src/enc/histogram.c b/src/3rdparty/libwebp/src/enc/histogram_enc.c
index 395372b..808b6f7 100644
--- a/src/3rdparty/libwebp/src/enc/histogram.c
+++ b/src/3rdparty/libwebp/src/enc/histogram_enc.c
@@ -15,9 +15,10 @@
#include <math.h>
-#include "./backward_references.h"
-#include "./histogram.h"
+#include "./backward_references_enc.h"
+#include "./histogram_enc.h"
#include "../dsp/lossless.h"
+#include "../dsp/lossless_common.h"
#include "../utils/utils.h"
#define MAX_COST 1.e38
@@ -213,10 +214,19 @@ static double InitialHuffmanCost(void) {
// Finalize the Huffman cost based on streak numbers and length type (<3 or >=3)
static double FinalHuffmanCost(const VP8LStreaks* const stats) {
+ // The constants in this function are experimental; they were originally
+ // expressed in units of 1/8 and were re-tuned when the code switched to
+ // units of 1/1024.
double retval = InitialHuffmanCost();
+ // Second coefficient: Many zeros in the histogram are covered efficiently
+ // by a run-length encode. Originally 2/8.
retval += stats->counts[0] * 1.5625 + 0.234375 * stats->streaks[0][1];
+ // Second coefficient: Constant values are encoded less efficiently, but still
+ // RLE'ed. Originally 6/8.
retval += stats->counts[1] * 2.578125 + 0.703125 * stats->streaks[1][1];
+ // 0s are usually encoded more efficiently than non-0s.
+ // Originally 15/8.
retval += 1.796875 * stats->streaks[0][0];
+ // Originally 26/8.
retval += 3.28125 * stats->streaks[1][0];
return retval;
}
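In units of 1/1024 the four tuned constants read: 0.234375 = 240/1024 against the stated 2/8 = 256/1024; 0.703125 = 720/1024 against 6/8 = 768/1024; 1.796875 = 1840/1024 against 15/8 = 1920/1024; and 3.28125 = 3360/1024 against 26/8 = 3328/1024. "Rounded" therefore means re-tuned to nearby multiples of 1/1024, not exact conversions of the old eighths.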
@@ -236,14 +246,30 @@ static double PopulationCost(const uint32_t* const population, int length,
return BitsEntropyRefine(&bit_entropy) + FinalHuffmanCost(&stats);
}
+// trivial_at_end is 1 if the two histograms have a single non-zero element,
+// and it is the same one in both: either the first or the last entry.
static WEBP_INLINE double GetCombinedEntropy(const uint32_t* const X,
const uint32_t* const Y,
- int length) {
- VP8LBitEntropy bit_entropy;
+ int length, int trivial_at_end) {
VP8LStreaks stats;
- VP8LGetCombinedEntropyUnrefined(X, Y, length, &bit_entropy, &stats);
+ if (trivial_at_end) {
+ // This configuration is due to palettization that transforms an indexed
+ // pixel into 0xff000000 | (pixel << 8) in VP8LBundleColorMap.
+ // BitsEntropyRefine is 0 for histograms with only one non-zero value.
+ // Only FinalHuffmanCost needs to be evaluated.
+ memset(&stats, 0, sizeof(stats));
+ // Deal with the non-zero value at index 0 or length-1.
+ stats.streaks[1][0] += 1;
+ // Deal with the following/previous zero streak.
+ stats.counts[0] += 1;
+ stats.streaks[0][1] += length - 1;
+ return FinalHuffmanCost(&stats);
+ } else {
+ VP8LBitEntropy bit_entropy;
+ VP8LGetCombinedEntropyUnrefined(X, Y, length, &bit_entropy, &stats);
- return BitsEntropyRefine(&bit_entropy) + FinalHuffmanCost(&stats);
+ return BitsEntropyRefine(&bit_entropy) + FinalHuffmanCost(&stats);
+ }
}
// Estimates the Entropy + Huffman + other block overhead size cost.
@@ -267,24 +293,42 @@ static int GetCombinedHistogramEntropy(const VP8LHistogram* const a,
double cost_threshold,
double* cost) {
const int palette_code_bits = a->palette_code_bits_;
+ int trivial_at_end = 0;
assert(a->palette_code_bits_ == b->palette_code_bits_);
*cost += GetCombinedEntropy(a->literal_, b->literal_,
- VP8LHistogramNumCodes(palette_code_bits));
+ VP8LHistogramNumCodes(palette_code_bits), 0);
*cost += VP8LExtraCostCombined(a->literal_ + NUM_LITERAL_CODES,
b->literal_ + NUM_LITERAL_CODES,
NUM_LENGTH_CODES);
if (*cost > cost_threshold) return 0;
- *cost += GetCombinedEntropy(a->red_, b->red_, NUM_LITERAL_CODES);
+ if (a->trivial_symbol_ != VP8L_NON_TRIVIAL_SYM &&
+ a->trivial_symbol_ == b->trivial_symbol_) {
+ // A, R and B are all 0 or 0xff.
+ const uint32_t color_a = (a->trivial_symbol_ >> 24) & 0xff;
+ const uint32_t color_r = (a->trivial_symbol_ >> 16) & 0xff;
+ const uint32_t color_b = (a->trivial_symbol_ >> 0) & 0xff;
+ if ((color_a == 0 || color_a == 0xff) &&
+ (color_r == 0 || color_r == 0xff) &&
+ (color_b == 0 || color_b == 0xff)) {
+ trivial_at_end = 1;
+ }
+ }
+
+ *cost +=
+ GetCombinedEntropy(a->red_, b->red_, NUM_LITERAL_CODES, trivial_at_end);
if (*cost > cost_threshold) return 0;
- *cost += GetCombinedEntropy(a->blue_, b->blue_, NUM_LITERAL_CODES);
+ *cost +=
+ GetCombinedEntropy(a->blue_, b->blue_, NUM_LITERAL_CODES, trivial_at_end);
if (*cost > cost_threshold) return 0;
- *cost += GetCombinedEntropy(a->alpha_, b->alpha_, NUM_LITERAL_CODES);
+ *cost += GetCombinedEntropy(a->alpha_, b->alpha_, NUM_LITERAL_CODES,
+ trivial_at_end);
if (*cost > cost_threshold) return 0;
- *cost += GetCombinedEntropy(a->distance_, b->distance_, NUM_DISTANCE_CODES);
+ *cost +=
+ GetCombinedEntropy(a->distance_, b->distance_, NUM_DISTANCE_CODES, 0);
*cost +=
VP8LExtraCostCombined(a->distance_, b->distance_, NUM_DISTANCE_CODES);
if (*cost > cost_threshold) return 0;
@@ -292,6 +336,15 @@ static int GetCombinedHistogramEntropy(const VP8LHistogram* const a,
return 1;
}
+static WEBP_INLINE void HistogramAdd(const VP8LHistogram* const a,
+ const VP8LHistogram* const b,
+ VP8LHistogram* const out) {
+ VP8LHistogramAdd(a, b, out);
+ out->trivial_symbol_ = (a->trivial_symbol_ == b->trivial_symbol_)
+ ? a->trivial_symbol_
+ : VP8L_NON_TRIVIAL_SYM;
+}
+
// Performs out = a + b, computing the cost C(a+b) - C(a) - C(b) while comparing
// to the threshold value 'cost_threshold'. The score returned is
// Score = C(a+b) - C(a) - C(b), where C(a) + C(b) is known and fixed.
@@ -307,11 +360,9 @@ static double HistogramAddEval(const VP8LHistogram* const a,
cost_threshold += sum_cost;
if (GetCombinedHistogramEntropy(a, b, cost_threshold, &cost)) {
- VP8LHistogramAdd(a, b, out);
+ HistogramAdd(a, b, out);
out->bit_cost_ = cost;
out->palette_code_bits_ = a->palette_code_bits_;
- out->trivial_symbol_ = (a->trivial_symbol_ == b->trivial_symbol_) ?
- a->trivial_symbol_ : VP8L_NON_TRIVIAL_SYM;
}
return cost - sum_cost;
@@ -450,113 +501,103 @@ static void HistogramCopyAndAnalyze(
// Partition histograms to different entropy bins for three dominant (literal,
// red and blue) symbol costs and compute the histogram aggregate bit_cost.
static void HistogramAnalyzeEntropyBin(VP8LHistogramSet* const image_histo,
- int16_t* const bin_map, int low_effort) {
+ uint16_t* const bin_map,
+ int low_effort) {
int i;
VP8LHistogram** const histograms = image_histo->histograms;
const int histo_size = image_histo->size;
- const int bin_depth = histo_size + 1;
DominantCostRange cost_range;
DominantCostRangeInit(&cost_range);
// Analyze the dominant (literal, red and blue) entropy costs.
for (i = 0; i < histo_size; ++i) {
- VP8LHistogram* const histo = histograms[i];
- UpdateDominantCostRange(histo, &cost_range);
+ UpdateDominantCostRange(histograms[i], &cost_range);
}
// bin-hash histograms on three of the dominant (literal, red and blue)
- // symbol costs.
+ // symbol costs and store the resulting bin_id for each histogram.
for (i = 0; i < histo_size; ++i) {
- const VP8LHistogram* const histo = histograms[i];
- const int bin_id = GetHistoBinIndex(histo, &cost_range, low_effort);
- const int bin_offset = bin_id * bin_depth;
- // bin_map[n][0] for every bin 'n' maintains the counter for the number of
- // histograms in that bin.
- // Get and increment the num_histos in that bin.
- const int num_histos = ++bin_map[bin_offset];
- assert(bin_offset + num_histos < bin_depth * BIN_SIZE);
- // Add histogram i'th index at num_histos (last) position in the bin_map.
- bin_map[bin_offset + num_histos] = i;
- }
-}
-
-// Compact the histogram set by removing unused entries.
-static void HistogramCompactBins(VP8LHistogramSet* const image_histo) {
- VP8LHistogram** const histograms = image_histo->histograms;
- int i, j;
-
- for (i = 0, j = 0; i < image_histo->size; ++i) {
- if (histograms[i] != NULL && histograms[i]->bit_cost_ != 0.) {
- if (j < i) {
- histograms[j] = histograms[i];
- histograms[i] = NULL;
- }
- ++j;
- }
+ bin_map[i] = GetHistoBinIndex(histograms[i], &cost_range, low_effort);
}
- image_histo->size = j;
}
+// Compact image_histo[] by merging some histograms with the same bin_id
+// together if it's advantageous.
static VP8LHistogram* HistogramCombineEntropyBin(
VP8LHistogramSet* const image_histo,
VP8LHistogram* cur_combo,
- int16_t* const bin_map, int bin_depth, int num_bins,
+ const uint16_t* const bin_map, int bin_map_size, int num_bins,
double combine_cost_factor, int low_effort) {
- int bin_id;
VP8LHistogram** const histograms = image_histo->histograms;
-
- for (bin_id = 0; bin_id < num_bins; ++bin_id) {
- const int bin_offset = bin_id * bin_depth;
- const int num_histos = bin_map[bin_offset];
- const int idx1 = bin_map[bin_offset + 1];
- int num_combine_failures = 0;
- int n;
- for (n = 2; n <= num_histos; ++n) {
- const int idx2 = bin_map[bin_offset + n];
- if (low_effort) {
- // Merge all histograms with the same bin index, irrespective of cost of
- // the merged histograms.
- VP8LHistogramAdd(histograms[idx1], histograms[idx2], histograms[idx1]);
- histograms[idx2]->bit_cost_ = 0.;
- } else {
- const double bit_cost_idx2 = histograms[idx2]->bit_cost_;
- if (bit_cost_idx2 > 0.) {
- const double bit_cost_thresh = -bit_cost_idx2 * combine_cost_factor;
- const double curr_cost_diff =
- HistogramAddEval(histograms[idx1], histograms[idx2],
- cur_combo, bit_cost_thresh);
- if (curr_cost_diff < bit_cost_thresh) {
- // Try to merge two histograms only if the combo is a trivial one or
- // the two candidate histograms are already non-trivial.
- // For some images, 'try_combine' turns out to be false for a lot of
- // histogram pairs. In that case, we fallback to combining
- // histograms as usual to avoid increasing the header size.
- const int try_combine =
- (cur_combo->trivial_symbol_ != VP8L_NON_TRIVIAL_SYM) ||
- ((histograms[idx1]->trivial_symbol_ == VP8L_NON_TRIVIAL_SYM) &&
- (histograms[idx2]->trivial_symbol_ == VP8L_NON_TRIVIAL_SYM));
- const int max_combine_failures = 32;
- if (try_combine || (num_combine_failures >= max_combine_failures)) {
- HistogramSwap(&cur_combo, &histograms[idx1]);
- histograms[idx2]->bit_cost_ = 0.;
- } else {
- ++num_combine_failures;
- }
- }
+ int idx;
+ // Work in-place: processed histograms are put at the beginning of
+ // image_histo[]. At the end, we just have to truncate the array.
+ int size = 0;
+ struct {
+ int16_t first; // position of the histogram that accumulates all
+ // histograms with the same bin_id
+ uint16_t num_combine_failures; // number of combine failures per bin_id
+ } bin_info[BIN_SIZE];
+
+ assert(num_bins <= BIN_SIZE);
+ for (idx = 0; idx < num_bins; ++idx) {
+ bin_info[idx].first = -1;
+ bin_info[idx].num_combine_failures = 0;
+ }
+
+ for (idx = 0; idx < bin_map_size; ++idx) {
+ const int bin_id = bin_map[idx];
+ const int first = bin_info[bin_id].first;
+ assert(size <= idx);
+ if (first == -1) {
+ // just move histogram #idx to its final position
+ histograms[size] = histograms[idx];
+ bin_info[bin_id].first = size++;
+ } else if (low_effort) {
+ HistogramAdd(histograms[idx], histograms[first], histograms[first]);
+ } else {
+ // try to merge #idx into #first (both share the same bin_id)
+ const double bit_cost = histograms[idx]->bit_cost_;
+ const double bit_cost_thresh = -bit_cost * combine_cost_factor;
+ const double curr_cost_diff =
+ HistogramAddEval(histograms[first], histograms[idx],
+ cur_combo, bit_cost_thresh);
+ if (curr_cost_diff < bit_cost_thresh) {
+ // Try to merge two histograms only if the combo is a trivial one or
+ // the two candidate histograms are already non-trivial.
+ // For some images, 'try_combine' turns out to be false for a lot of
+ // histogram pairs. In that case, we fall back to combining
+ // histograms as usual to avoid increasing the header size.
+ const int try_combine =
+ (cur_combo->trivial_symbol_ != VP8L_NON_TRIVIAL_SYM) ||
+ ((histograms[idx]->trivial_symbol_ == VP8L_NON_TRIVIAL_SYM) &&
+ (histograms[first]->trivial_symbol_ == VP8L_NON_TRIVIAL_SYM));
+ const int max_combine_failures = 32;
+ if (try_combine ||
+ bin_info[bin_id].num_combine_failures >= max_combine_failures) {
+ // move the (better) merged histogram to its final slot
+ HistogramSwap(&cur_combo, &histograms[first]);
+ } else {
+ histograms[size++] = histograms[idx];
+ ++bin_info[bin_id].num_combine_failures;
}
+ } else {
+ histograms[size++] = histograms[idx];
}
}
- if (low_effort) {
- // Update the bit_cost for the merged histograms (per bin index).
- UpdateHistogramCost(histograms[idx1]);
+ }
+ image_histo->size = size;
+ if (low_effort) {
+ // for low_effort case, update the final cost when everything is merged
+ for (idx = 0; idx < size; ++idx) {
+ UpdateHistogramCost(histograms[idx]);
}
}
- HistogramCompactBins(image_histo);
return cur_combo;
}
-static uint32_t MyRand(uint32_t *seed) {
- *seed *= 16807U;
+static uint32_t MyRand(uint32_t* const seed) {
+ *seed = (*seed * 16807ull) & 0xffffffffu;
if (*seed == 0) {
*seed = 1;
}
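Same pattern as the hash multipliers earlier in this commit: the multiplication by 16807 (the classic minimal-standard LCG multiplier) now happens in 64-bit arithmetic and is masked back to 32 bits, making the wraparound explicit instead of relying on implicit truncation to the uint32_t target. A sketch of the equivalent pure function:

    #include <stdint.h>

    /* One generator step: 64-bit product masked to 32 bits, zero remapped
       to one so the multiplicative sequence never gets stuck. */
    static uint32_t NextSeed(uint32_t seed) {
      seed = (uint32_t)((seed * 16807ull) & 0xffffffffu);
      return (seed == 0) ? 1u : seed;
    }

    int main(void) {
      uint32_t s = 1u;
      s = NextSeed(s);  /* 16807 */
      s = NextSeed(s);  /* 282475249 */
      return (int)(s != 282475249u);
    }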
@@ -592,8 +633,8 @@ static int HistoQueueInit(HistoQueue* const histo_queue, const int max_index) {
histo_queue->max_size = max_index * max_index;
// We allocate max_size + 1 because the last element at index "size" is
// used as temporary data (and it could be up to max_size).
- histo_queue->queue = WebPSafeMalloc(histo_queue->max_size + 1,
- sizeof(*histo_queue->queue));
+ histo_queue->queue = (HistogramPair*)WebPSafeMalloc(
+ histo_queue->max_size + 1, sizeof(*histo_queue->queue));
return histo_queue->queue != NULL;
}
@@ -659,7 +700,8 @@ static int HistogramCombineGreedy(VP8LHistogramSet* const image_histo) {
int i, j;
VP8LHistogram** const histograms = image_histo->histograms;
// Indexes of remaining histograms.
- int* const clusters = WebPSafeMalloc(image_histo_size, sizeof(*clusters));
+ int* const clusters =
+ (int*)WebPSafeMalloc(image_histo_size, sizeof(*clusters));
// Priority queue of histogram pairs.
HistoQueue histo_queue;
@@ -681,7 +723,7 @@ static int HistogramCombineGreedy(VP8LHistogramSet* const image_histo) {
HistogramPair* copy_to;
const int idx1 = histo_queue.queue[0].idx1;
const int idx2 = histo_queue.queue[0].idx2;
- VP8LHistogramAdd(histograms[idx2], histograms[idx1], histograms[idx1]);
+ HistogramAdd(histograms[idx2], histograms[idx1], histograms[idx1]);
histograms[idx1]->bit_cost_ = histo_queue.queue[0].cost_combo;
// Remove merged histogram.
for (i = 0; i + 1 < image_histo_size; ++i) {
@@ -747,6 +789,8 @@ static void HistogramCombineStochastic(VP8LHistogramSet* const image_histo,
const int outer_iters = image_histo_size * iter_mult;
const int num_pairs = image_histo_size / 2;
const int num_tries_no_success = outer_iters / 2;
+ int idx2_max = image_histo_size - 1;
+ int do_brute_force = 0;
VP8LHistogram** const histograms = image_histo->histograms;
// Collapse similar histograms in 'image_histo'.
@@ -757,43 +801,62 @@ static void HistogramCombineStochastic(VP8LHistogramSet* const image_histo,
double best_cost_diff = 0.;
int best_idx1 = -1, best_idx2 = 1;
int j;
- const int num_tries =
+ int num_tries =
(num_pairs < image_histo_size) ? num_pairs : image_histo_size;
+ // Use a brute force approach if:
+ // - stochastic search has not found a match for a while, and
+ // - the number of pairs left to test by brute force is smaller than the
+ // worst-case number of stochastic iterations that remain (hence
+ // num_tries times the number of remaining outer iterations).
+ do_brute_force =
+ (tries_with_no_success > 10) &&
+ (idx2_max * (idx2_max + 1) < 2 * num_tries * (outer_iters - iter));
+ if (do_brute_force) num_tries = idx2_max;
+
seed += iter;
for (j = 0; j < num_tries; ++j) {
double curr_cost_diff;
// Choose two histograms at random and try to combine them.
- const uint32_t idx1 = MyRand(&seed) % image_histo_size;
- const uint32_t tmp = (j & 7) + 1;
- const uint32_t diff =
- (tmp < 3) ? tmp : MyRand(&seed) % (image_histo_size - 1);
- const uint32_t idx2 = (idx1 + diff + 1) % image_histo_size;
- if (idx1 == idx2) {
- continue;
+ uint32_t idx1, idx2;
+ if (do_brute_force) {
+ // Use a brute force approach.
+ idx1 = (uint32_t)j;
+ idx2 = (uint32_t)idx2_max;
+ } else {
+ const uint32_t tmp = (j & 7) + 1;
+ const uint32_t diff =
+ (tmp < 3) ? tmp : MyRand(&seed) % (image_histo_size - 1);
+ idx1 = MyRand(&seed) % image_histo_size;
+ idx2 = (idx1 + diff + 1) % image_histo_size;
+ if (idx1 == idx2) {
+ continue;
+ }
}
// Calculate cost reduction on combining.
curr_cost_diff = HistogramAddEval(histograms[idx1], histograms[idx2],
tmp_histo, best_cost_diff);
- if (curr_cost_diff < best_cost_diff) { // found a better pair?
+ if (curr_cost_diff < best_cost_diff) { // found a better pair?
HistogramSwap(&best_combo, &tmp_histo);
best_cost_diff = curr_cost_diff;
best_idx1 = idx1;
best_idx2 = idx2;
}
}
+ if (do_brute_force) --idx2_max;
if (best_idx1 >= 0) {
HistogramSwap(&best_combo, &histograms[best_idx1]);
// swap best_idx2 slot with last one (which is now unused)
--image_histo_size;
+ if (idx2_max >= image_histo_size) idx2_max = image_histo_size - 1;
if (best_idx2 != image_histo_size) {
HistogramSwap(&histograms[image_histo_size], &histograms[best_idx2]);
histograms[image_histo_size] = NULL;
}
tries_with_no_success = 0;
}
- if (++tries_with_no_success >= num_tries_no_success) {
+ if (++tries_with_no_success >= num_tries_no_success || idx2_max == 0) {
break;
}
}
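To make the switch condition concrete: a full brute-force sweep costs about idx2_max * (idx2_max + 1) / 2 pair evaluations, while staying stochastic can cost up to num_tries evaluations per remaining outer iteration. With idx2_max = 39, num_tries = 20 and 30 outer iterations left, 39 * 40 = 1560 is not below 2 * 20 * 30 = 1200, so the search stays stochastic; once few enough histograms remain for the product to drop under that bound, and matches have failed for more than 10 rounds, the remaining pairs are swept exhaustively.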
@@ -842,7 +905,7 @@ static void HistogramRemap(const VP8LHistogramSet* const in,
for (i = 0; i < in_size; ++i) {
const int idx = symbols[i];
- VP8LHistogramAdd(in_histo[i], out_histo[idx], out_histo[idx]);
+ HistogramAdd(in_histo[i], out_histo[idx], out_histo[idx]);
}
}
@@ -868,32 +931,18 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
const int histo_xsize = histo_bits ? VP8LSubSampleSize(xsize, histo_bits) : 1;
const int histo_ysize = histo_bits ? VP8LSubSampleSize(ysize, histo_bits) : 1;
const int image_histo_raw_size = histo_xsize * histo_ysize;
- const int entropy_combine_num_bins = low_effort ? NUM_PARTITIONS : BIN_SIZE;
-
- // The bin_map for every bin follows following semantics:
- // bin_map[n][0] = num_histo; // The number of histograms in that bin.
- // bin_map[n][1] = index of first histogram in that bin;
- // bin_map[n][num_histo] = index of last histogram in that bin;
- // bin_map[n][num_histo + 1] ... bin_map[n][bin_depth - 1] = unused indices.
- const int bin_depth = image_histo_raw_size + 1;
- int16_t* bin_map = NULL;
VP8LHistogramSet* const orig_histo =
VP8LAllocateHistogramSet(image_histo_raw_size, cache_bits);
VP8LHistogram* cur_combo;
+ // Don't attempt linear bin-partition heuristic for
+ // histograms of small sizes (as bin_map will be very sparse) and
+ // maximum quality q==100 (to preserve the compression gains at that level).
+ const int entropy_combine_num_bins = low_effort ? NUM_PARTITIONS : BIN_SIZE;
const int entropy_combine =
(orig_histo->size > entropy_combine_num_bins * 2) && (quality < 100);
if (orig_histo == NULL) goto Error;
- // Don't attempt linear bin-partition heuristic for:
- // histograms of small sizes, as bin_map will be very sparse and;
- // Maximum quality (q==100), to preserve the compression gains at that level.
- if (entropy_combine) {
- const int bin_map_size = bin_depth * entropy_combine_num_bins;
- bin_map = (int16_t*)WebPSafeCalloc(bin_map_size, sizeof(*bin_map));
- if (bin_map == NULL) goto Error;
- }
-
// Construct the histograms from backward references.
HistogramBuild(xsize, histo_bits, refs, orig_histo);
  // Copies the histograms and computes their bit_cost.
@@ -901,12 +950,17 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
cur_combo = tmp_histos->histograms[1]; // pick up working slot
if (entropy_combine) {
+ const int bin_map_size = orig_histo->size;
+  // Reuse histogram_symbols storage: with one entry per input histogram,
+  // it is guaranteed to be large enough for the bin map.
+ uint16_t* const bin_map = histogram_symbols;
const double combine_cost_factor =
GetCombineCostFactor(image_histo_raw_size, quality);
+
HistogramAnalyzeEntropyBin(orig_histo, bin_map, low_effort);
// Collapse histograms with similar entropy.
- cur_combo = HistogramCombineEntropyBin(image_histo, cur_combo, bin_map,
- bin_depth, entropy_combine_num_bins,
+ cur_combo = HistogramCombineEntropyBin(image_histo, cur_combo,
+ bin_map, bin_map_size,
+ entropy_combine_num_bins,
combine_cost_factor, low_effort);
}
@@ -931,7 +985,6 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
ok = 1;
Error:
- WebPSafeFree(bin_map);
VP8LFreeHistogramSet(orig_histo);
return ok;
}
diff --git a/src/3rdparty/libwebp/src/enc/histogram.h b/src/3rdparty/libwebp/src/enc/histogram_enc.h
index d303d1d..a9d258a 100644
--- a/src/3rdparty/libwebp/src/enc/histogram.h
+++ b/src/3rdparty/libwebp/src/enc/histogram_enc.h
@@ -16,7 +16,7 @@
#include <string.h>
-#include "./backward_references.h"
+#include "./backward_references_enc.h"
#include "../webp/format_constants.h"
#include "../webp/types.h"
diff --git a/src/3rdparty/libwebp/src/enc/iterator.c b/src/3rdparty/libwebp/src/enc/iterator_enc.c
index 99d960a..e48d30b 100644
--- a/src/3rdparty/libwebp/src/enc/iterator.c
+++ b/src/3rdparty/libwebp/src/enc/iterator_enc.c
@@ -13,7 +13,7 @@
#include <string.h>
-#include "./vp8enci.h"
+#include "./vp8i_enc.h"
//------------------------------------------------------------------------------
// VP8Iterator
@@ -53,7 +53,6 @@ void VP8IteratorReset(VP8EncIterator* const it) {
VP8IteratorSetRow(it, 0);
VP8IteratorSetCountDown(it, enc->mb_w_ * enc->mb_h_); // default
InitTop(it);
- InitLeft(it);
memset(it->bit_count_, 0, sizeof(it->bit_count_));
it->do_trellis_ = 0;
}
@@ -68,8 +67,6 @@ int VP8IteratorIsDone(const VP8EncIterator* const it) {
void VP8IteratorInit(VP8Encoder* const enc, VP8EncIterator* const it) {
it->enc_ = enc;
- it->y_stride_ = enc->pic_->y_stride;
- it->uv_stride_ = enc->pic_->uv_stride;
it->yuv_in_ = (uint8_t*)WEBP_ALIGN(it->yuv_mem_);
it->yuv_out_ = it->yuv_in_ + YUV_SIZE_ENC;
it->yuv_out2_ = it->yuv_out_ + YUV_SIZE_ENC;
@@ -309,14 +306,14 @@ void VP8IteratorSaveBoundary(VP8EncIterator* const it) {
}
int VP8IteratorNext(VP8EncIterator* const it) {
- it->preds_ += 4;
- it->mb_ += 1;
- it->nz_ += 1;
- it->y_top_ += 16;
- it->uv_top_ += 16;
- it->x_ += 1;
- if (it->x_ == it->enc_->mb_w_) {
+ if (++it->x_ == it->enc_->mb_w_) {
VP8IteratorSetRow(it, ++it->y_);
+ } else {
+ it->preds_ += 4;
+ it->mb_ += 1;
+ it->nz_ += 1;
+ it->y_top_ += 16;
+ it->uv_top_ += 16;
}
return (0 < --it->count_down_);
}
diff --git a/src/3rdparty/libwebp/src/enc/near_lossless.c b/src/3rdparty/libwebp/src/enc/near_lossless_enc.c
index f4ab91f..2bd03ab 100644
--- a/src/3rdparty/libwebp/src/enc/near_lossless.c
+++ b/src/3rdparty/libwebp/src/enc/near_lossless_enc.c
@@ -17,9 +17,9 @@
#include <assert.h>
#include <stdlib.h>
-#include "../dsp/lossless.h"
+#include "../dsp/lossless_common.h"
#include "../utils/utils.h"
-#include "./vp8enci.h"
+#include "./vp8i_enc.h"
#define MIN_DIM_FOR_NEAR_LOSSLESS 64
#define MAX_LIMIT_BITS 5
diff --git a/src/3rdparty/libwebp/src/enc/picture_csp.c b/src/3rdparty/libwebp/src/enc/picture_csp_enc.c
index 607a624..e5d1c75 100644
--- a/src/3rdparty/libwebp/src/enc/picture_csp.c
+++ b/src/3rdparty/libwebp/src/enc/picture_csp_enc.c
@@ -15,8 +15,8 @@
#include <stdlib.h>
#include <math.h>
-#include "./vp8enci.h"
-#include "../utils/random.h"
+#include "./vp8i_enc.h"
+#include "../utils/random_utils.h"
#include "../utils/utils.h"
#include "../dsp/yuv.h"
@@ -153,9 +153,9 @@ static int RGBToV(int r, int g, int b, VP8Random* const rg) {
}
//------------------------------------------------------------------------------
-// Smart RGB->YUV conversion
+// Sharp RGB->YUV conversion
-static const int kNumIterations = 6;
+static const int kNumIterations = 4;
static const int kMinDimensionIterativeConversion = 4;
// We could use SFIX=0 and only uint8_t for fixed_y_t, but it produces some
@@ -171,9 +171,9 @@ typedef uint16_t fixed_y_t; // unsigned type with extra SFIX precision for W
#if defined(USE_GAMMA_COMPRESSION)
// float variant of gamma-correction
-// We use tables of different size and precision, along with a 'real-world'
-// Gamma value close to ~2.
-#define kGammaF 2.2
+// We use tables of different size and precision for the Rec709
+// transfer function.
+#define kGammaF (1./0.45)
static float kGammaToLinearTabF[MAX_Y_T + 1]; // size scales with Y_FIX
static float kLinearToGammaTabF[kGammaTabSize + 2];
static volatile int kGammaTablesFOk = 0;
@@ -183,11 +183,26 @@ static WEBP_TSAN_IGNORE_FUNCTION void InitGammaTablesF(void) {
int v;
const double norm = 1. / MAX_Y_T;
const double scale = 1. / kGammaTabSize;
+ const double a = 0.099;
+ const double thresh = 0.018;
for (v = 0; v <= MAX_Y_T; ++v) {
- kGammaToLinearTabF[v] = (float)pow(norm * v, kGammaF);
+ const double g = norm * v;
+ if (g <= thresh * 4.5) {
+ kGammaToLinearTabF[v] = (float)(g / 4.5);
+ } else {
+ const double a_rec = 1. / (1. + a);
+ kGammaToLinearTabF[v] = (float)pow(a_rec * (g + a), kGammaF);
+ }
}
for (v = 0; v <= kGammaTabSize; ++v) {
- kLinearToGammaTabF[v] = (float)(MAX_Y_T * pow(scale * v, 1. / kGammaF));
+ const double g = scale * v;
+ double value;
+ if (g <= thresh) {
+ value = 4.5 * g;
+ } else {
+ value = (1. + a) * pow(g, 1. / kGammaF) - a;
+ }
+ kLinearToGammaTabF[v] = (float)(MAX_Y_T * value);
}
  // to prevent small rounding errors from causing a read overflow:
kLinearToGammaTabF[kGammaTabSize + 1] = kLinearToGammaTabF[kGammaTabSize];
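
The rewritten tables sample the Rec. 709 transfer function: a linear segment with slope 4.5 below 0.018, and a power segment with exponent 0.45 and offset a = 0.099 above it. A self-contained sketch of the pair being tabulated, with the same constants (inputs normalized to [0, 1]):

    #include <math.h>

    static double Rec709ToLinear(double g) {   /* gamma-encoded -> linear */
      const double a = 0.099, thresh = 0.018;
      return (g <= thresh * 4.5) ? g / 4.5
                                 : pow((g + a) / (1. + a), 1. / 0.45);
    }

    static double LinearToRec709(double v) {   /* linear -> gamma-encoded */
      const double a = 0.099, thresh = 0.018;
      return (v <= thresh) ? 4.5 * v : (1. + a) * pow(v, 0.45) - a;
    }
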
@@ -235,12 +250,12 @@ static fixed_y_t clip_y(int y) {
//------------------------------------------------------------------------------
static int RGBToGray(int r, int g, int b) {
- const int luma = 19595 * r + 38470 * g + 7471 * b + YUV_HALF;
+ const int luma = 13933 * r + 46871 * g + 4732 * b + YUV_HALF;
return (luma >> YUV_FIX);
}
static float RGBToGrayF(float r, float g, float b) {
- return 0.299f * r + 0.587f * g + 0.114f * b;
+ return (float)(0.2126 * r + 0.7152 * g + 0.0722 * b);
}
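
The luma weights move from the BT.601 coefficients (0.299, 0.587, 0.114) to the BT.709 ones (0.2126, 0.7152, 0.0722). The integer version in RGBToGray is the same set in 16-bit fixed point; a quick standalone check (this assumes YUV_FIX == 16, as used elsewhere in libwebp):

    #include <assert.h>
    #include <math.h>

    int main(void) {
      /* round(c * 2^16) of the BT.709 coefficients used in RGBToGrayF */
      assert((int)floor(0.2126 * 65536 + 0.5) == 13933);
      assert((int)floor(0.7152 * 65536 + 0.5) == 46871);
      assert((int)floor(0.0722 * 65536 + 0.5) == 4732);
      return 0;
    }
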
static int ScaleDown(int a, int b, int c, int d) {
@@ -251,58 +266,50 @@ static int ScaleDown(int a, int b, int c, int d) {
return LinearToGammaF(0.25f * (A + B + C + D));
}
-static WEBP_INLINE void UpdateW(const fixed_y_t* src, fixed_y_t* dst, int len) {
- while (len-- > 0) {
- const float R = GammaToLinearF(src[0]);
- const float G = GammaToLinearF(src[1]);
- const float B = GammaToLinearF(src[2]);
+static WEBP_INLINE void UpdateW(const fixed_y_t* src, fixed_y_t* dst, int w) {
+ int i;
+ for (i = 0; i < w; ++i) {
+ const float R = GammaToLinearF(src[0 * w + i]);
+ const float G = GammaToLinearF(src[1 * w + i]);
+ const float B = GammaToLinearF(src[2 * w + i]);
const float Y = RGBToGrayF(R, G, B);
- *dst++ = (fixed_y_t)LinearToGammaF(Y);
- src += 3;
+ dst[i] = (fixed_y_t)LinearToGammaF(Y);
}
}
-static int UpdateChroma(const fixed_y_t* src1,
- const fixed_y_t* src2,
- fixed_t* dst, fixed_y_t* tmp, int len) {
- int diff = 0;
- while (len--> 0) {
- const int r = ScaleDown(src1[0], src1[3], src2[0], src2[3]);
- const int g = ScaleDown(src1[1], src1[4], src2[1], src2[4]);
- const int b = ScaleDown(src1[2], src1[5], src2[2], src2[5]);
+static void UpdateChroma(const fixed_y_t* src1, const fixed_y_t* src2,
+ fixed_t* dst, int uv_w) {
+ int i;
+ for (i = 0; i < uv_w; ++i) {
+ const int r = ScaleDown(src1[0 * uv_w + 0], src1[0 * uv_w + 1],
+ src2[0 * uv_w + 0], src2[0 * uv_w + 1]);
+ const int g = ScaleDown(src1[2 * uv_w + 0], src1[2 * uv_w + 1],
+ src2[2 * uv_w + 0], src2[2 * uv_w + 1]);
+ const int b = ScaleDown(src1[4 * uv_w + 0], src1[4 * uv_w + 1],
+ src2[4 * uv_w + 0], src2[4 * uv_w + 1]);
const int W = RGBToGray(r, g, b);
- const int r_avg = (src1[0] + src1[3] + src2[0] + src2[3] + 2) >> 2;
- const int g_avg = (src1[1] + src1[4] + src2[1] + src2[4] + 2) >> 2;
- const int b_avg = (src1[2] + src1[5] + src2[2] + src2[5] + 2) >> 2;
- dst[0] = (fixed_t)(r - W);
- dst[1] = (fixed_t)(g - W);
- dst[2] = (fixed_t)(b - W);
- dst += 3;
- src1 += 6;
- src2 += 6;
- if (tmp != NULL) {
- tmp[0] = tmp[1] = clip_y(W);
- tmp += 2;
- }
- diff += abs(RGBToGray(r_avg, g_avg, b_avg) - W);
+ dst[0 * uv_w] = (fixed_t)(r - W);
+ dst[1 * uv_w] = (fixed_t)(g - W);
+ dst[2 * uv_w] = (fixed_t)(b - W);
+ dst += 1;
+ src1 += 2;
+ src2 += 2;
}
- return diff;
}
-//------------------------------------------------------------------------------
-
-static WEBP_INLINE int Filter(const fixed_t* const A, const fixed_t* const B,
- int rightwise) {
- int v;
- if (!rightwise) {
- v = (A[0] * 9 + A[-3] * 3 + B[0] * 3 + B[-3]);
- } else {
- v = (A[0] * 9 + A[+3] * 3 + B[0] * 3 + B[+3]);
+static void StoreGray(const fixed_y_t* rgb, fixed_y_t* y, int w) {
+ int i;
+ for (i = 0; i < w; ++i) {
+ y[i] = RGBToGray(rgb[0 * w + i], rgb[1 * w + i], rgb[2 * w + i]);
}
- return (v + 8) >> 4;
}
-static WEBP_INLINE int Filter2(int A, int B) { return (A * 3 + B + 2) >> 2; }
+//------------------------------------------------------------------------------
+
+static WEBP_INLINE fixed_y_t Filter2(int A, int B, int W0) {
+ const int v0 = (A * 3 + B + 2) >> 2;
+ return clip_y(v0 + W0);
+}
//------------------------------------------------------------------------------
@@ -317,52 +324,50 @@ static void ImportOneRow(const uint8_t* const r_ptr,
int pic_width,
fixed_y_t* const dst) {
int i;
+ const int w = (pic_width + 1) & ~1;
for (i = 0; i < pic_width; ++i) {
const int off = i * step;
- dst[3 * i + 0] = UpLift(r_ptr[off]);
- dst[3 * i + 1] = UpLift(g_ptr[off]);
- dst[3 * i + 2] = UpLift(b_ptr[off]);
+ dst[i + 0 * w] = UpLift(r_ptr[off]);
+ dst[i + 1 * w] = UpLift(g_ptr[off]);
+ dst[i + 2 * w] = UpLift(b_ptr[off]);
}
if (pic_width & 1) { // replicate rightmost pixel
- memcpy(dst + 3 * pic_width, dst + 3 * (pic_width - 1), 3 * sizeof(*dst));
+ dst[pic_width + 0 * w] = dst[pic_width + 0 * w - 1];
+ dst[pic_width + 1 * w] = dst[pic_width + 1 * w - 1];
+ dst[pic_width + 2 * w] = dst[pic_width + 2 * w - 1];
}
}
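
ImportOneRow now writes a planar row (all R samples, then all G, then all B, each segment w entries wide, with w the width rounded up to even) instead of interleaved triplets. For pic_width == 3 (so w == 4) the same row goes from

    R0 G0 B0 R1 G1 B1 R2 G2 B2                 /* old, interleaved: dst[3*i + k] */

to

    R0 R1 R2 R2 | G0 G1 G2 G2 | B0 B1 B2 B2    /* new, planar: dst[k*w + i] */

with the rightmost pixel replicated into the padding slot. This layout is what lets the later loops process one channel segment at a time.
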
static void InterpolateTwoRows(const fixed_y_t* const best_y,
- const fixed_t* const prev_uv,
- const fixed_t* const cur_uv,
- const fixed_t* const next_uv,
+ const fixed_t* prev_uv,
+ const fixed_t* cur_uv,
+ const fixed_t* next_uv,
int w,
- fixed_y_t* const out1,
- fixed_y_t* const out2) {
- int i, k;
- { // special boundary case for i==0
- const int W0 = best_y[0];
- const int W1 = best_y[w];
- for (k = 0; k <= 2; ++k) {
- out1[k] = clip_y(Filter2(cur_uv[k], prev_uv[k]) + W0);
- out2[k] = clip_y(Filter2(cur_uv[k], next_uv[k]) + W1);
- }
- }
- for (i = 1; i < w - 1; ++i) {
- const int W0 = best_y[i + 0];
- const int W1 = best_y[i + w];
- const int off = 3 * (i >> 1);
- for (k = 0; k <= 2; ++k) {
- const int tmp0 = Filter(cur_uv + off + k, prev_uv + off + k, i & 1);
- const int tmp1 = Filter(cur_uv + off + k, next_uv + off + k, i & 1);
- out1[3 * i + k] = clip_y(tmp0 + W0);
- out2[3 * i + k] = clip_y(tmp1 + W1);
- }
- }
- { // special boundary case for i == w - 1
- const int W0 = best_y[i + 0];
- const int W1 = best_y[i + w];
- const int off = 3 * (i >> 1);
- for (k = 0; k <= 2; ++k) {
- out1[3 * i + k] = clip_y(Filter2(cur_uv[off + k], prev_uv[off + k]) + W0);
- out2[3 * i + k] = clip_y(Filter2(cur_uv[off + k], next_uv[off + k]) + W1);
+ fixed_y_t* out1,
+ fixed_y_t* out2) {
+ const int uv_w = w >> 1;
+ const int len = (w - 1) >> 1; // length to filter
+ int k = 3;
+  while (k-- > 0) {   // process each R/G/B segment in turn
+ // special boundary case for i==0
+ out1[0] = Filter2(cur_uv[0], prev_uv[0], best_y[0]);
+ out2[0] = Filter2(cur_uv[0], next_uv[0], best_y[w]);
+
+ WebPSharpYUVFilterRow(cur_uv, prev_uv, len, best_y + 0 + 1, out1 + 1);
+ WebPSharpYUVFilterRow(cur_uv, next_uv, len, best_y + w + 1, out2 + 1);
+
+ // special boundary case for i == w - 1 when w is even
+ if (!(w & 1)) {
+ out1[w - 1] = Filter2(cur_uv[uv_w - 1], prev_uv[uv_w - 1],
+ best_y[w - 1 + 0]);
+ out2[w - 1] = Filter2(cur_uv[uv_w - 1], next_uv[uv_w - 1],
+ best_y[w - 1 + w]);
}
+ out1 += w;
+ out2 += w;
+ prev_uv += uv_w;
+ cur_uv += uv_w;
+ next_uv += uv_w;
}
}
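
The interior of each row is now handled by WebPSharpYUVFilterRow, a new dsp entry point with SSE2/NEON variants elsewhere in this import. A self-contained scalar sketch of the 9-3-3-1 filter it computes, mirroring the C fallback; the exact signature and the clipping bound here are assumptions for illustration:

    #include <stdint.h>

    /* Stand-ins for the file-local fixed_t/fixed_y_t/clip_y helpers. */
    typedef int16_t fixed_t;
    typedef uint16_t fixed_y_t;
    #define MAX_Y 1023   /* placeholder; the real bound is MAX_Y_T */
    static fixed_y_t ClipY(int v) {
      return (v < 0) ? 0 : (v > MAX_Y) ? MAX_Y : (fixed_y_t)v;
    }

    static void SharpYUVFilterRowSketch(const fixed_t* A, const fixed_t* B,
                                        int len, const fixed_y_t* best_y,
                                        fixed_y_t* out) {
      int i;
      for (i = 0; i < len; ++i, ++A, ++B) {
        /* two outputs per input pair, weighted 9:3:3:1 towards the nearer
         * chroma sample; add back the stored luma and clip */
        const int v0 = (A[0] * 9 + A[1] * 3 + B[0] * 3 + B[1] + 8) >> 4;
        const int v1 = (A[1] * 9 + A[0] * 3 + B[1] * 3 + B[0] + 8) >> 4;
        out[2 * i + 0] = ClipY(best_y[2 * i + 0] + v0);
        out[2 * i + 1] = ClipY(best_y[2 * i + 1] + v1);
      }
    }
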
@@ -381,36 +386,42 @@ static WEBP_INLINE uint8_t ConvertRGBToV(int r, int g, int b) {
return clip_8b(128 + (v >> (YUV_FIX + SFIX)));
}
-static int ConvertWRGBToYUV(const fixed_y_t* const best_y,
- const fixed_t* const best_uv,
+static int ConvertWRGBToYUV(const fixed_y_t* best_y, const fixed_t* best_uv,
WebPPicture* const picture) {
int i, j;
+ uint8_t* dst_y = picture->y;
+ uint8_t* dst_u = picture->u;
+ uint8_t* dst_v = picture->v;
+ const fixed_t* const best_uv_base = best_uv;
const int w = (picture->width + 1) & ~1;
const int h = (picture->height + 1) & ~1;
const int uv_w = w >> 1;
const int uv_h = h >> 1;
- for (j = 0; j < picture->height; ++j) {
+ for (best_uv = best_uv_base, j = 0; j < picture->height; ++j) {
for (i = 0; i < picture->width; ++i) {
- const int off = 3 * ((i >> 1) + (j >> 1) * uv_w);
- const int off2 = i + j * picture->y_stride;
- const int W = best_y[i + j * w];
- const int r = best_uv[off + 0] + W;
- const int g = best_uv[off + 1] + W;
- const int b = best_uv[off + 2] + W;
- picture->y[off2] = ConvertRGBToY(r, g, b);
+ const int off = (i >> 1);
+ const int W = best_y[i];
+ const int r = best_uv[off + 0 * uv_w] + W;
+ const int g = best_uv[off + 1 * uv_w] + W;
+ const int b = best_uv[off + 2 * uv_w] + W;
+ dst_y[i] = ConvertRGBToY(r, g, b);
}
+ best_y += w;
+ best_uv += (j & 1) * 3 * uv_w;
+ dst_y += picture->y_stride;
}
- for (j = 0; j < uv_h; ++j) {
- uint8_t* const dst_u = picture->u + j * picture->uv_stride;
- uint8_t* const dst_v = picture->v + j * picture->uv_stride;
+ for (best_uv = best_uv_base, j = 0; j < uv_h; ++j) {
for (i = 0; i < uv_w; ++i) {
- const int off = 3 * (i + j * uv_w);
- const int r = best_uv[off + 0];
- const int g = best_uv[off + 1];
- const int b = best_uv[off + 2];
+ const int off = i;
+ const int r = best_uv[off + 0 * uv_w];
+ const int g = best_uv[off + 1 * uv_w];
+ const int b = best_uv[off + 2 * uv_w];
dst_u[i] = ConvertRGBToU(r, g, b);
dst_v[i] = ConvertRGBToV(r, g, b);
}
+ best_uv += 3 * uv_w;
+ dst_u += picture->uv_stride;
+ dst_v += picture->uv_stride;
}
return 1;
}
@@ -420,9 +431,9 @@ static int ConvertWRGBToYUV(const fixed_y_t* const best_y,
#define SAFE_ALLOC(W, H, T) ((T*)WebPSafeMalloc((W) * (H), sizeof(T)))
-static int PreprocessARGB(const uint8_t* const r_ptr,
- const uint8_t* const g_ptr,
- const uint8_t* const b_ptr,
+static int PreprocessARGB(const uint8_t* r_ptr,
+ const uint8_t* g_ptr,
+ const uint8_t* b_ptr,
int step, int rgb_stride,
WebPPicture* const picture) {
// we expand the right/bottom border if needed
@@ -430,25 +441,27 @@ static int PreprocessARGB(const uint8_t* const r_ptr,
const int h = (picture->height + 1) & ~1;
const int uv_w = w >> 1;
const int uv_h = h >> 1;
- int i, j, iter;
+ uint64_t prev_diff_y_sum = ~0;
+ int j, iter;
// TODO(skal): allocate one big memory chunk. But for now, it's easier
// for valgrind debugging to have several chunks.
fixed_y_t* const tmp_buffer = SAFE_ALLOC(w * 3, 2, fixed_y_t); // scratch
- fixed_y_t* const best_y = SAFE_ALLOC(w, h, fixed_y_t);
- fixed_y_t* const target_y = SAFE_ALLOC(w, h, fixed_y_t);
+ fixed_y_t* const best_y_base = SAFE_ALLOC(w, h, fixed_y_t);
+ fixed_y_t* const target_y_base = SAFE_ALLOC(w, h, fixed_y_t);
fixed_y_t* const best_rgb_y = SAFE_ALLOC(w, 2, fixed_y_t);
- fixed_t* const best_uv = SAFE_ALLOC(uv_w * 3, uv_h, fixed_t);
- fixed_t* const target_uv = SAFE_ALLOC(uv_w * 3, uv_h, fixed_t);
+ fixed_t* const best_uv_base = SAFE_ALLOC(uv_w * 3, uv_h, fixed_t);
+ fixed_t* const target_uv_base = SAFE_ALLOC(uv_w * 3, uv_h, fixed_t);
fixed_t* const best_rgb_uv = SAFE_ALLOC(uv_w * 3, 1, fixed_t);
+ fixed_y_t* best_y = best_y_base;
+ fixed_y_t* target_y = target_y_base;
+ fixed_t* best_uv = best_uv_base;
+ fixed_t* target_uv = target_uv_base;
+ const uint64_t diff_y_threshold = (uint64_t)(3.0 * w * h);
int ok;
- int diff_sum = 0;
- const int first_diff_threshold = (int)(2.5 * w * h);
- const int min_improvement = 5; // stop if improvement is below this %
- const int min_first_improvement = 80;
- if (best_y == NULL || best_uv == NULL ||
- target_y == NULL || target_uv == NULL ||
+ if (best_y_base == NULL || best_uv_base == NULL ||
+ target_y_base == NULL || target_uv_base == NULL ||
best_rgb_y == NULL || best_rgb_uv == NULL ||
tmp_buffer == NULL) {
ok = WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
@@ -457,101 +470,86 @@ static int PreprocessARGB(const uint8_t* const r_ptr,
assert(picture->width >= kMinDimensionIterativeConversion);
assert(picture->height >= kMinDimensionIterativeConversion);
+ WebPInitConvertARGBToYUV();
+
// Import RGB samples to W/RGB representation.
for (j = 0; j < picture->height; j += 2) {
const int is_last_row = (j == picture->height - 1);
- fixed_y_t* const src1 = tmp_buffer;
+ fixed_y_t* const src1 = tmp_buffer + 0 * w;
fixed_y_t* const src2 = tmp_buffer + 3 * w;
- const int off1 = j * rgb_stride;
- const int off2 = off1 + rgb_stride;
- const int uv_off = (j >> 1) * 3 * uv_w;
- fixed_y_t* const dst_y = best_y + j * w;
// prepare two rows of input
- ImportOneRow(r_ptr + off1, g_ptr + off1, b_ptr + off1,
- step, picture->width, src1);
+ ImportOneRow(r_ptr, g_ptr, b_ptr, step, picture->width, src1);
if (!is_last_row) {
- ImportOneRow(r_ptr + off2, g_ptr + off2, b_ptr + off2,
+ ImportOneRow(r_ptr + rgb_stride, g_ptr + rgb_stride, b_ptr + rgb_stride,
step, picture->width, src2);
} else {
memcpy(src2, src1, 3 * w * sizeof(*src2));
}
- UpdateW(src1, target_y + (j + 0) * w, w);
- UpdateW(src2, target_y + (j + 1) * w, w);
- diff_sum += UpdateChroma(src1, src2, target_uv + uv_off, dst_y, uv_w);
- memcpy(best_uv + uv_off, target_uv + uv_off, 3 * uv_w * sizeof(*best_uv));
- memcpy(dst_y + w, dst_y, w * sizeof(*dst_y));
+ StoreGray(src1, best_y + 0, w);
+ StoreGray(src2, best_y + w, w);
+
+ UpdateW(src1, target_y, w);
+ UpdateW(src2, target_y + w, w);
+ UpdateChroma(src1, src2, target_uv, uv_w);
+ memcpy(best_uv, target_uv, 3 * uv_w * sizeof(*best_uv));
+ best_y += 2 * w;
+ best_uv += 3 * uv_w;
+ target_y += 2 * w;
+ target_uv += 3 * uv_w;
+ r_ptr += 2 * rgb_stride;
+ g_ptr += 2 * rgb_stride;
+ b_ptr += 2 * rgb_stride;
}
// Iterate and resolve clipping conflicts.
for (iter = 0; iter < kNumIterations; ++iter) {
- int k;
- const fixed_t* cur_uv = best_uv;
- const fixed_t* prev_uv = best_uv;
- const int old_diff_sum = diff_sum;
- diff_sum = 0;
+ const fixed_t* cur_uv = best_uv_base;
+ const fixed_t* prev_uv = best_uv_base;
+ uint64_t diff_y_sum = 0;
+
+ best_y = best_y_base;
+ best_uv = best_uv_base;
+ target_y = target_y_base;
+ target_uv = target_uv_base;
for (j = 0; j < h; j += 2) {
- fixed_y_t* const src1 = tmp_buffer;
+ fixed_y_t* const src1 = tmp_buffer + 0 * w;
fixed_y_t* const src2 = tmp_buffer + 3 * w;
{
const fixed_t* const next_uv = cur_uv + ((j < h - 2) ? 3 * uv_w : 0);
- InterpolateTwoRows(best_y + j * w, prev_uv, cur_uv, next_uv,
- w, src1, src2);
+ InterpolateTwoRows(best_y, prev_uv, cur_uv, next_uv, w, src1, src2);
prev_uv = cur_uv;
cur_uv = next_uv;
}
UpdateW(src1, best_rgb_y + 0 * w, w);
UpdateW(src2, best_rgb_y + 1 * w, w);
- diff_sum += UpdateChroma(src1, src2, best_rgb_uv, NULL, uv_w);
+ UpdateChroma(src1, src2, best_rgb_uv, uv_w);
// update two rows of Y and one row of RGB
- for (i = 0; i < 2 * w; ++i) {
- const int off = i + j * w;
- const int diff_y = target_y[off] - best_rgb_y[i];
- const int new_y = (int)best_y[off] + diff_y;
- best_y[off] = clip_y(new_y);
- }
- for (i = 0; i < uv_w; ++i) {
- const int off = 3 * (i + (j >> 1) * uv_w);
- int W;
- for (k = 0; k <= 2; ++k) {
- const int diff_uv = (int)target_uv[off + k] - best_rgb_uv[3 * i + k];
- best_uv[off + k] += diff_uv;
- }
- W = RGBToGray(best_uv[off + 0], best_uv[off + 1], best_uv[off + 2]);
- for (k = 0; k <= 2; ++k) {
- best_uv[off + k] -= W;
- }
- }
+ diff_y_sum += WebPSharpYUVUpdateY(target_y, best_rgb_y, best_y, 2 * w);
+ WebPSharpYUVUpdateRGB(target_uv, best_rgb_uv, best_uv, 3 * uv_w);
+
+ best_y += 2 * w;
+ best_uv += 3 * uv_w;
+ target_y += 2 * w;
+ target_uv += 3 * uv_w;
}
// test exit condition
- if (diff_sum > 0) {
- const int improvement = 100 * abs(diff_sum - old_diff_sum) / diff_sum;
- // Check if first iteration gave good result already, without a large
- // jump of improvement (otherwise it means we need to try few extra
- // iterations, just to be sure).
- if (iter == 0 && diff_sum < first_diff_threshold &&
- improvement < min_first_improvement) {
- break;
- }
- // then, check if improvement is stalling.
- if (improvement < min_improvement) {
- break;
- }
- } else {
- break;
+ if (iter > 0) {
+ if (diff_y_sum < diff_y_threshold) break;
+ if (diff_y_sum > prev_diff_y_sum) break;
}
+ prev_diff_y_sum = diff_y_sum;
}
-
// final reconstruction
- ok = ConvertWRGBToYUV(best_y, best_uv, picture);
+ ok = ConvertWRGBToYUV(best_y_base, best_uv_base, picture);
End:
- WebPSafeFree(best_y);
- WebPSafeFree(best_uv);
- WebPSafeFree(target_y);
- WebPSafeFree(target_uv);
+ WebPSafeFree(best_y_base);
+ WebPSafeFree(best_uv_base);
+ WebPSafeFree(target_y_base);
+ WebPSafeFree(target_uv_base);
WebPSafeFree(best_rgb_y);
WebPSafeFree(best_rgb_uv);
WebPSafeFree(tmp_buffer);
@@ -830,10 +828,10 @@ static WEBP_INLINE void ConvertRowsToUV(const uint16_t* rgb,
}
}
-static int ImportYUVAFromRGBA(const uint8_t* const r_ptr,
- const uint8_t* const g_ptr,
- const uint8_t* const b_ptr,
- const uint8_t* const a_ptr,
+static int ImportYUVAFromRGBA(const uint8_t* r_ptr,
+ const uint8_t* g_ptr,
+ const uint8_t* b_ptr,
+ const uint8_t* a_ptr,
int step, // bytes per pixel
int rgb_stride, // bytes per scanline
float dithering,
@@ -900,36 +898,34 @@ static int ImportYUVAFromRGBA(const uint8_t* const r_ptr,
// Downsample Y/U/V planes, two rows at a time
for (y = 0; y < (height >> 1); ++y) {
int rows_have_alpha = has_alpha;
- const int off1 = (2 * y + 0) * rgb_stride;
- const int off2 = (2 * y + 1) * rgb_stride;
if (use_dsp) {
if (is_rgb) {
- WebPConvertRGB24ToY(r_ptr + off1, dst_y, width);
- WebPConvertRGB24ToY(r_ptr + off2, dst_y + picture->y_stride, width);
+ WebPConvertRGB24ToY(r_ptr, dst_y, width);
+ WebPConvertRGB24ToY(r_ptr + rgb_stride,
+ dst_y + picture->y_stride, width);
} else {
- WebPConvertBGR24ToY(b_ptr + off1, dst_y, width);
- WebPConvertBGR24ToY(b_ptr + off2, dst_y + picture->y_stride, width);
+ WebPConvertBGR24ToY(b_ptr, dst_y, width);
+ WebPConvertBGR24ToY(b_ptr + rgb_stride,
+ dst_y + picture->y_stride, width);
}
} else {
- ConvertRowToY(r_ptr + off1, g_ptr + off1, b_ptr + off1, step,
- dst_y, width, rg);
- ConvertRowToY(r_ptr + off2, g_ptr + off2, b_ptr + off2, step,
+ ConvertRowToY(r_ptr, g_ptr, b_ptr, step, dst_y, width, rg);
+ ConvertRowToY(r_ptr + rgb_stride,
+ g_ptr + rgb_stride,
+ b_ptr + rgb_stride, step,
dst_y + picture->y_stride, width, rg);
}
dst_y += 2 * picture->y_stride;
if (has_alpha) {
- rows_have_alpha &= !WebPExtractAlpha(a_ptr + off1, rgb_stride,
- width, 2,
+ rows_have_alpha &= !WebPExtractAlpha(a_ptr, rgb_stride, width, 2,
dst_a, picture->a_stride);
dst_a += 2 * picture->a_stride;
}
// Collect averaged R/G/B(/A)
if (!rows_have_alpha) {
- AccumulateRGB(r_ptr + off1, g_ptr + off1, b_ptr + off1,
- step, rgb_stride, tmp_rgb, width);
+ AccumulateRGB(r_ptr, g_ptr, b_ptr, step, rgb_stride, tmp_rgb, width);
} else {
- AccumulateRGBA(r_ptr + off1, g_ptr + off1, b_ptr + off1, a_ptr + off1,
- rgb_stride, tmp_rgb, width);
+ AccumulateRGBA(r_ptr, g_ptr, b_ptr, a_ptr, rgb_stride, tmp_rgb, width);
}
// Convert to U/V
if (rg == NULL) {
@@ -939,31 +935,33 @@ static int ImportYUVAFromRGBA(const uint8_t* const r_ptr,
}
dst_u += picture->uv_stride;
dst_v += picture->uv_stride;
+ r_ptr += 2 * rgb_stride;
+ b_ptr += 2 * rgb_stride;
+ g_ptr += 2 * rgb_stride;
+ if (has_alpha) a_ptr += 2 * rgb_stride;
}
if (height & 1) { // extra last row
- const int off = 2 * y * rgb_stride;
int row_has_alpha = has_alpha;
if (use_dsp) {
if (r_ptr < b_ptr) {
- WebPConvertRGB24ToY(r_ptr + off, dst_y, width);
+ WebPConvertRGB24ToY(r_ptr, dst_y, width);
} else {
- WebPConvertBGR24ToY(b_ptr + off, dst_y, width);
+ WebPConvertBGR24ToY(b_ptr, dst_y, width);
}
} else {
- ConvertRowToY(r_ptr + off, g_ptr + off, b_ptr + off, step,
- dst_y, width, rg);
+ ConvertRowToY(r_ptr, g_ptr, b_ptr, step, dst_y, width, rg);
}
if (row_has_alpha) {
- row_has_alpha &= !WebPExtractAlpha(a_ptr + off, 0, width, 1, dst_a, 0);
+ row_has_alpha &= !WebPExtractAlpha(a_ptr, 0, width, 1, dst_a, 0);
}
// Collect averaged R/G/B(/A)
if (!row_has_alpha) {
// Collect averaged R/G/B
- AccumulateRGB(r_ptr + off, g_ptr + off, b_ptr + off,
- step, /* rgb_stride = */ 0, tmp_rgb, width);
+ AccumulateRGB(r_ptr, g_ptr, b_ptr, step, /* rgb_stride = */ 0,
+ tmp_rgb, width);
} else {
- AccumulateRGBA(r_ptr + off, g_ptr + off, b_ptr + off, a_ptr + off,
- /* rgb_stride = */ 0, tmp_rgb, width);
+ AccumulateRGBA(r_ptr, g_ptr, b_ptr, a_ptr, /* rgb_stride = */ 0,
+ tmp_rgb, width);
}
if (rg == NULL) {
WebPConvertRGBA32ToUV(tmp_rgb, dst_u, dst_v, uv_width);
@@ -1013,9 +1011,13 @@ int WebPPictureARGBToYUVA(WebPPicture* picture, WebPEncCSP colorspace) {
return PictureARGBToYUVA(picture, colorspace, 0.f, 0);
}
-int WebPPictureSmartARGBToYUVA(WebPPicture* picture) {
+int WebPPictureSharpARGBToYUVA(WebPPicture* picture) {
return PictureARGBToYUVA(picture, WEBP_YUV420, 0.f, 1);
}
+// for backward compatibility
+int WebPPictureSmartARGBToYUVA(WebPPicture* picture) {
+ return WebPPictureSharpARGBToYUVA(picture);
+}
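
The shim keeps existing callers of the "smart" name building, but new code should use the "sharp" entry point directly. A minimal usage sketch (picture setup elided; the error handling shown is illustrative):

    WebPPicture pic;
    if (WebPPictureInit(&pic)) {
      /* ... fill pic with ARGB samples, pic.use_argb == 1 ... */
      if (!WebPPictureSharpARGBToYUVA(&pic)) {
        /* conversion failed; see pic.error_code */
      }
      WebPPictureFree(&pic);
    }
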
//------------------------------------------------------------------------------
// call for YUVA -> ARGB conversion
@@ -1086,10 +1088,10 @@ static int Import(WebPPicture* const picture,
const uint8_t* const rgb, int rgb_stride,
int step, int swap_rb, int import_alpha) {
int y;
- const uint8_t* const r_ptr = rgb + (swap_rb ? 2 : 0);
- const uint8_t* const g_ptr = rgb + 1;
- const uint8_t* const b_ptr = rgb + (swap_rb ? 0 : 2);
- const uint8_t* const a_ptr = import_alpha ? rgb + 3 : NULL;
+ const uint8_t* r_ptr = rgb + (swap_rb ? 2 : 0);
+ const uint8_t* g_ptr = rgb + 1;
+ const uint8_t* b_ptr = rgb + (swap_rb ? 0 : 2);
+ const uint8_t* a_ptr = import_alpha ? rgb + 3 : NULL;
const int width = picture->width;
const int height = picture->height;
@@ -1102,20 +1104,25 @@ static int Import(WebPPicture* const picture,
VP8EncDspARGBInit();
if (import_alpha) {
+ uint32_t* dst = picture->argb;
assert(step == 4);
for (y = 0; y < height; ++y) {
- uint32_t* const dst = &picture->argb[y * picture->argb_stride];
- const int offset = y * rgb_stride;
- VP8PackARGB(a_ptr + offset, r_ptr + offset, g_ptr + offset,
- b_ptr + offset, width, dst);
+ VP8PackARGB(a_ptr, r_ptr, g_ptr, b_ptr, width, dst);
+ a_ptr += rgb_stride;
+ r_ptr += rgb_stride;
+ g_ptr += rgb_stride;
+ b_ptr += rgb_stride;
+ dst += picture->argb_stride;
}
} else {
+ uint32_t* dst = picture->argb;
assert(step >= 3);
for (y = 0; y < height; ++y) {
- uint32_t* const dst = &picture->argb[y * picture->argb_stride];
- const int offset = y * rgb_stride;
- VP8PackRGB(r_ptr + offset, g_ptr + offset, b_ptr + offset,
- width, step, dst);
+ VP8PackRGB(r_ptr, g_ptr, b_ptr, width, step, dst);
+ r_ptr += rgb_stride;
+ g_ptr += rgb_stride;
+ b_ptr += rgb_stride;
+ dst += picture->argb_stride;
}
}
return 1;
diff --git a/src/3rdparty/libwebp/src/enc/picture.c b/src/3rdparty/libwebp/src/enc/picture_enc.c
index d9befbc..dfa6651 100644
--- a/src/3rdparty/libwebp/src/enc/picture.c
+++ b/src/3rdparty/libwebp/src/enc/picture_enc.c
@@ -14,7 +14,7 @@
#include <assert.h>
#include <stdlib.h>
-#include "./vp8enci.h"
+#include "./vp8i_enc.h"
#include "../dsp/dsp.h"
#include "../utils/utils.h"
@@ -88,8 +88,9 @@ int WebPPictureAllocARGB(WebPPicture* const picture, int width, int height) {
}
int WebPPictureAllocYUVA(WebPPicture* const picture, int width, int height) {
- const WebPEncCSP uv_csp = picture->colorspace & WEBP_CSP_UV_MASK;
- const int has_alpha = picture->colorspace & WEBP_CSP_ALPHA_BIT;
+ const WebPEncCSP uv_csp =
+ (WebPEncCSP)((int)picture->colorspace & WEBP_CSP_UV_MASK);
+ const int has_alpha = (int)picture->colorspace & WEBP_CSP_ALPHA_BIT;
const int y_stride = width;
const int uv_width = (width + 1) >> 1;
const int uv_height = (height + 1) >> 1;
diff --git a/src/3rdparty/libwebp/src/enc/picture_psnr.c b/src/3rdparty/libwebp/src/enc/picture_psnr.c
deleted file mode 100644
index 81ab1b5..0000000
--- a/src/3rdparty/libwebp/src/enc/picture_psnr.c
+++ /dev/null
@@ -1,177 +0,0 @@
-// Copyright 2014 Google Inc. All Rights Reserved.
-//
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
-// -----------------------------------------------------------------------------
-//
-// WebPPicture tools for measuring distortion
-//
-// Author: Skal (pascal.massimino@gmail.com)
-
-#include <math.h>
-#include <stdlib.h>
-
-#include "./vp8enci.h"
-#include "../utils/utils.h"
-
-//------------------------------------------------------------------------------
-// local-min distortion
-//
-// For every pixel in the *reference* picture, we search for the local best
-// match in the compressed image. This is not a symmetrical measure.
-
-#define RADIUS 2 // search radius. Shouldn't be too large.
-
-static void AccumulateLSIM(const uint8_t* src, int src_stride,
- const uint8_t* ref, int ref_stride,
- int w, int h, VP8DistoStats* stats) {
- int x, y;
- double total_sse = 0.;
- for (y = 0; y < h; ++y) {
- const int y_0 = (y - RADIUS < 0) ? 0 : y - RADIUS;
- const int y_1 = (y + RADIUS + 1 >= h) ? h : y + RADIUS + 1;
- for (x = 0; x < w; ++x) {
- const int x_0 = (x - RADIUS < 0) ? 0 : x - RADIUS;
- const int x_1 = (x + RADIUS + 1 >= w) ? w : x + RADIUS + 1;
- double best_sse = 255. * 255.;
- const double value = (double)ref[y * ref_stride + x];
- int i, j;
- for (j = y_0; j < y_1; ++j) {
- const uint8_t* const s = src + j * src_stride;
- for (i = x_0; i < x_1; ++i) {
- const double diff = s[i] - value;
- const double sse = diff * diff;
- if (sse < best_sse) best_sse = sse;
- }
- }
- total_sse += best_sse;
- }
- }
- stats->w = w * h;
- stats->xm = 0;
- stats->ym = 0;
- stats->xxm = total_sse;
- stats->yym = 0;
- stats->xxm = 0;
-}
-#undef RADIUS
-
-//------------------------------------------------------------------------------
-// Distortion
-
-// Max value returned in case of exact similarity.
-static const double kMinDistortion_dB = 99.;
-static float GetPSNR(const double v) {
- return (float)((v > 0.) ? -4.3429448 * log(v / (255 * 255.))
- : kMinDistortion_dB);
-}
-
-int WebPPictureDistortion(const WebPPicture* src, const WebPPicture* ref,
- int type, float result[5]) {
- VP8DistoStats stats[5];
- int w, h;
-
- memset(stats, 0, sizeof(stats));
-
- VP8SSIMDspInit();
-
- if (src == NULL || ref == NULL ||
- src->width != ref->width || src->height != ref->height ||
- src->use_argb != ref->use_argb || result == NULL) {
- return 0;
- }
- w = src->width;
- h = src->height;
-
- if (src->use_argb == 1) {
- if (src->argb == NULL || ref->argb == NULL) {
- return 0;
- } else {
- int i, j, c;
- uint8_t* tmp1, *tmp2;
- uint8_t* const tmp_plane =
- (uint8_t*)WebPSafeMalloc(2ULL * w * h, sizeof(*tmp_plane));
- if (tmp_plane == NULL) return 0;
- tmp1 = tmp_plane;
- tmp2 = tmp_plane + w * h;
- for (c = 0; c < 4; ++c) {
- for (j = 0; j < h; ++j) {
- for (i = 0; i < w; ++i) {
- tmp1[j * w + i] = src->argb[i + j * src->argb_stride] >> (c * 8);
- tmp2[j * w + i] = ref->argb[i + j * ref->argb_stride] >> (c * 8);
- }
- }
- if (type >= 2) {
- AccumulateLSIM(tmp1, w, tmp2, w, w, h, &stats[c]);
- } else {
- VP8SSIMAccumulatePlane(tmp1, w, tmp2, w, w, h, &stats[c]);
- }
- }
- free(tmp_plane);
- }
- } else {
- int has_alpha, uv_w, uv_h;
- if (src->y == NULL || ref->y == NULL ||
- src->u == NULL || ref->u == NULL ||
- src->v == NULL || ref->v == NULL) {
- return 0;
- }
- has_alpha = !!(src->colorspace & WEBP_CSP_ALPHA_BIT);
- if (has_alpha != !!(ref->colorspace & WEBP_CSP_ALPHA_BIT) ||
- (has_alpha && (src->a == NULL || ref->a == NULL))) {
- return 0;
- }
-
- uv_w = (src->width + 1) >> 1;
- uv_h = (src->height + 1) >> 1;
- if (type >= 2) {
- AccumulateLSIM(src->y, src->y_stride, ref->y, ref->y_stride,
- w, h, &stats[0]);
- AccumulateLSIM(src->u, src->uv_stride, ref->u, ref->uv_stride,
- uv_w, uv_h, &stats[1]);
- AccumulateLSIM(src->v, src->uv_stride, ref->v, ref->uv_stride,
- uv_w, uv_h, &stats[2]);
- if (has_alpha) {
- AccumulateLSIM(src->a, src->a_stride, ref->a, ref->a_stride,
- w, h, &stats[3]);
- }
- } else {
- VP8SSIMAccumulatePlane(src->y, src->y_stride,
- ref->y, ref->y_stride,
- w, h, &stats[0]);
- VP8SSIMAccumulatePlane(src->u, src->uv_stride,
- ref->u, ref->uv_stride,
- uv_w, uv_h, &stats[1]);
- VP8SSIMAccumulatePlane(src->v, src->uv_stride,
- ref->v, ref->uv_stride,
- uv_w, uv_h, &stats[2]);
- if (has_alpha) {
- VP8SSIMAccumulatePlane(src->a, src->a_stride,
- ref->a, ref->a_stride,
- w, h, &stats[3]);
- }
- }
- }
- // Final stat calculations.
- {
- int c;
- for (c = 0; c <= 4; ++c) {
- if (type == 1) {
- const double v = VP8SSIMGet(&stats[c]);
- result[c] = (float)((v < 1.) ? -10.0 * log10(1. - v)
- : kMinDistortion_dB);
- } else {
- const double v = VP8SSIMGetSquaredError(&stats[c]);
- result[c] = GetPSNR(v);
- }
- // Accumulate forward
- if (c < 4) VP8SSIMAddStats(&stats[c], &stats[4]);
- }
- }
- return 1;
-}
-
-//------------------------------------------------------------------------------
diff --git a/src/3rdparty/libwebp/src/enc/picture_psnr_enc.c b/src/3rdparty/libwebp/src/enc/picture_psnr_enc.c
new file mode 100644
index 0000000..9c0b229
--- /dev/null
+++ b/src/3rdparty/libwebp/src/enc/picture_psnr_enc.c
@@ -0,0 +1,213 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// WebPPicture tools for measuring distortion
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <math.h>
+#include <stdlib.h>
+
+#include "./vp8i_enc.h"
+#include "../utils/utils.h"
+
+typedef double (*AccumulateFunc)(const uint8_t* src, int src_stride,
+ const uint8_t* ref, int ref_stride,
+ int w, int h);
+
+//------------------------------------------------------------------------------
+// local-min distortion
+//
+// For every pixel in the *reference* picture, we search for the local best
+// match in the compressed image. This is not a symmetrical measure.
+
+#define RADIUS 2 // search radius. Shouldn't be too large.
+
+static double AccumulateLSIM(const uint8_t* src, int src_stride,
+ const uint8_t* ref, int ref_stride,
+ int w, int h) {
+ int x, y;
+ double total_sse = 0.;
+ for (y = 0; y < h; ++y) {
+ const int y_0 = (y - RADIUS < 0) ? 0 : y - RADIUS;
+ const int y_1 = (y + RADIUS + 1 >= h) ? h : y + RADIUS + 1;
+ for (x = 0; x < w; ++x) {
+ const int x_0 = (x - RADIUS < 0) ? 0 : x - RADIUS;
+ const int x_1 = (x + RADIUS + 1 >= w) ? w : x + RADIUS + 1;
+ double best_sse = 255. * 255.;
+ const double value = (double)ref[y * ref_stride + x];
+ int i, j;
+ for (j = y_0; j < y_1; ++j) {
+ const uint8_t* const s = src + j * src_stride;
+ for (i = x_0; i < x_1; ++i) {
+ const double diff = s[i] - value;
+ const double sse = diff * diff;
+ if (sse < best_sse) best_sse = sse;
+ }
+ }
+ total_sse += best_sse;
+ }
+ }
+ return total_sse;
+}
+#undef RADIUS
+
+static double AccumulateSSE(const uint8_t* src, int src_stride,
+ const uint8_t* ref, int ref_stride,
+ int w, int h) {
+ int y;
+ double total_sse = 0.;
+ for (y = 0; y < h; ++y) {
+ total_sse += VP8AccumulateSSE(src, ref, w);
+ src += src_stride;
+ ref += ref_stride;
+ }
+ return total_sse;
+}
+
+//------------------------------------------------------------------------------
+
+static double AccumulateSSIM(const uint8_t* src, int src_stride,
+ const uint8_t* ref, int ref_stride,
+ int w, int h) {
+ const int w0 = (w < VP8_SSIM_KERNEL) ? w : VP8_SSIM_KERNEL;
+ const int w1 = w - VP8_SSIM_KERNEL - 1;
+ const int h0 = (h < VP8_SSIM_KERNEL) ? h : VP8_SSIM_KERNEL;
+ const int h1 = h - VP8_SSIM_KERNEL - 1;
+ int x, y;
+ double sum = 0.;
+ for (y = 0; y < h0; ++y) {
+ for (x = 0; x < w; ++x) {
+ sum += VP8SSIMGetClipped(src, src_stride, ref, ref_stride, x, y, w, h);
+ }
+ }
+ for (; y < h1; ++y) {
+ for (x = 0; x < w0; ++x) {
+ sum += VP8SSIMGetClipped(src, src_stride, ref, ref_stride, x, y, w, h);
+ }
+ for (; x < w1; ++x) {
+ const int off1 = x - VP8_SSIM_KERNEL + (y - VP8_SSIM_KERNEL) * src_stride;
+ const int off2 = x - VP8_SSIM_KERNEL + (y - VP8_SSIM_KERNEL) * ref_stride;
+ sum += VP8SSIMGet(src + off1, src_stride, ref + off2, ref_stride);
+ }
+ for (; x < w; ++x) {
+ sum += VP8SSIMGetClipped(src, src_stride, ref, ref_stride, x, y, w, h);
+ }
+ }
+ for (; y < h; ++y) {
+ for (x = 0; x < w; ++x) {
+ sum += VP8SSIMGetClipped(src, src_stride, ref, ref_stride, x, y, w, h);
+ }
+ }
+ return sum;
+}
+
+//------------------------------------------------------------------------------
+// Distortion
+
+// Max value returned in case of exact similarity.
+static const double kMinDistortion_dB = 99.;
+
+static double GetPSNR(double v, double size) {
+ return (v > 0. && size > 0.) ? -4.3429448 * log(v / (size * 255 * 255.))
+ : kMinDistortion_dB;
+}
+
+static double GetLogSSIM(double v, double size) {
+ v = (size > 0.) ? v / size : 1.;
+ return (v < 1.) ? -10.0 * log10(1. - v) : kMinDistortion_dB;
+}
+
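
The constant -4.3429448 is -10/ln(10), so GetPSNR(v, size) is the standard PSNR in dB for a total squared error v accumulated over size samples. A standalone check:

    #include <math.h>
    #include <stdio.h>

    int main(void) {
      const double v = 1234.5, size = 640. * 480.;
      const double a = -4.3429448 * log(v / (size * 255 * 255.));
      const double b = 10. * log10(255. * 255. * size / v);  /* textbook PSNR */
      printf("%.6f %.6f\n", a, b);  /* the two values agree */
      return 0;
    }
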
+int WebPPlaneDistortion(const uint8_t* src, size_t src_stride,
+ const uint8_t* ref, size_t ref_stride,
+ int width, int height, size_t x_step,
+ int type, float* distortion, float* result) {
+ uint8_t* allocated = NULL;
+ const AccumulateFunc metric = (type == 0) ? AccumulateSSE :
+ (type == 1) ? AccumulateSSIM :
+ AccumulateLSIM;
+ if (src == NULL || ref == NULL ||
+ src_stride < x_step * width || ref_stride < x_step * width ||
+ result == NULL || distortion == NULL) {
+ return 0;
+ }
+
+ VP8SSIMDspInit();
+ if (x_step != 1) { // extract a packed plane if needed
+ int x, y;
+ uint8_t* tmp1;
+ uint8_t* tmp2;
+ allocated =
+ (uint8_t*)WebPSafeMalloc(2ULL * width * height, sizeof(*allocated));
+ if (allocated == NULL) return 0;
+ tmp1 = allocated;
+ tmp2 = tmp1 + (size_t)width * height;
+ for (y = 0; y < height; ++y) {
+ for (x = 0; x < width; ++x) {
+ tmp1[x + y * width] = src[x * x_step + y * src_stride];
+ tmp2[x + y * width] = ref[x * x_step + y * ref_stride];
+ }
+ }
+ src = tmp1;
+ ref = tmp2;
+ }
+ *distortion = (float)metric(src, width, ref, width, width, height);
+ WebPSafeFree(allocated);
+
+ *result = (type == 1) ? (float)GetLogSSIM(*distortion, (double)width * height)
+ : (float)GetPSNR(*distortion, (double)width * height);
+ return 1;
+}
+
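
WebPPlaneDistortion is new public API in this import: it scores a single packed plane, returning both the raw accumulated distortion and the dB/SSIM figure. A usage sketch for PSNR on an 8-bit gray plane ('width', 'height' and the two buffers are placeholders):

    float sse, score;
    if (WebPPlaneDistortion(src_plane, /*src_stride=*/width,
                            ref_plane, /*ref_stride=*/width,
                            width, height, /*x_step=*/1,
                            /*type=*/0 /* 0: PSNR, 1: SSIM, 2: LSIM */,
                            &sse, &score)) {
      /* 'sse' holds the raw accumulated error, 'score' the PSNR in dB */
    }
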
+int WebPPictureDistortion(const WebPPicture* src, const WebPPicture* ref,
+ int type, float results[5]) {
+ int w, h, c;
+ int ok = 0;
+ WebPPicture p0, p1;
+ double total_size = 0., total_distortion = 0.;
+ if (src == NULL || ref == NULL ||
+ src->width != ref->width || src->height != ref->height ||
+ results == NULL) {
+ return 0;
+ }
+
+ VP8SSIMDspInit();
+ if (!WebPPictureInit(&p0) || !WebPPictureInit(&p1)) return 0;
+ w = src->width;
+ h = src->height;
+ if (!WebPPictureView(src, 0, 0, w, h, &p0)) goto Error;
+ if (!WebPPictureView(ref, 0, 0, w, h, &p1)) goto Error;
+
+ // We always measure distortion in ARGB space.
+ if (p0.use_argb == 0 && !WebPPictureYUVAToARGB(&p0)) goto Error;
+ if (p1.use_argb == 0 && !WebPPictureYUVAToARGB(&p1)) goto Error;
+ for (c = 0; c < 4; ++c) {
+ float distortion;
+ const size_t stride0 = 4 * (size_t)p0.argb_stride;
+ const size_t stride1 = 4 * (size_t)p1.argb_stride;
+ if (!WebPPlaneDistortion((const uint8_t*)p0.argb + c, stride0,
+ (const uint8_t*)p1.argb + c, stride1,
+ w, h, 4, type, &distortion, results + c)) {
+ goto Error;
+ }
+ total_distortion += distortion;
+ total_size += w * h;
+ }
+
+ results[4] = (type == 1) ? (float)GetLogSSIM(total_distortion, total_size)
+ : (float)GetPSNR(total_distortion, total_size);
+ ok = 1;
+
+ Error:
+ WebPPictureFree(&p0);
+ WebPPictureFree(&p1);
+ return ok;
+}
+
+//------------------------------------------------------------------------------
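
WebPPictureDistortion itself now measures in ARGB space, one byte plane at a time, via WebPPlaneDistortion. Calling it is unchanged:

    float results[5];
    /* src and ref are WebPPicture* with identical dimensions */
    if (WebPPictureDistortion(src, ref, /*type=*/1 /* SSIM */, results)) {
      /* results[0..3]: per-channel scores; results[4]: combined figure */
    }
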
diff --git a/src/3rdparty/libwebp/src/enc/picture_rescale.c b/src/3rdparty/libwebp/src/enc/picture_rescale_enc.c
index 9f19e8e..0b7181c 100644
--- a/src/3rdparty/libwebp/src/enc/picture_rescale.c
+++ b/src/3rdparty/libwebp/src/enc/picture_rescale_enc.c
@@ -14,8 +14,8 @@
#include <assert.h>
#include <stdlib.h>
-#include "./vp8enci.h"
-#include "../utils/rescaler.h"
+#include "./vp8i_enc.h"
+#include "../utils/rescaler_utils.h"
#include "../utils/utils.h"
#define HALVE(x) (((x) + 1) >> 1)
diff --git a/src/3rdparty/libwebp/src/enc/picture_tools.c b/src/3rdparty/libwebp/src/enc/picture_tools_enc.c
index bf97af8..895df51 100644
--- a/src/3rdparty/libwebp/src/enc/picture_tools.c
+++ b/src/3rdparty/libwebp/src/enc/picture_tools_enc.c
@@ -13,7 +13,7 @@
#include <assert.h>
-#include "./vp8enci.h"
+#include "./vp8i_enc.h"
#include "../dsp/yuv.h"
static WEBP_INLINE uint32_t MakeARGB32(int r, int g, int b) {
diff --git a/src/3rdparty/libwebp/src/enc/predictor_enc.c b/src/3rdparty/libwebp/src/enc/predictor_enc.c
new file mode 100644
index 0000000..0639b74
--- /dev/null
+++ b/src/3rdparty/libwebp/src/enc/predictor_enc.c
@@ -0,0 +1,750 @@
+// Copyright 2016 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Image transform methods for lossless encoder.
+//
+// Authors: Vikas Arora (vikaas.arora@gmail.com)
+// Jyrki Alakuijala (jyrki@google.com)
+// Urvang Joshi (urvang@google.com)
+// Vincent Rabaud (vrabaud@google.com)
+
+#include "../dsp/lossless.h"
+#include "../dsp/lossless_common.h"
+#include "./vp8li_enc.h"
+
+#define MAX_DIFF_COST (1e30f)
+
+static const float kSpatialPredictorBias = 15.f;
+static const int kPredLowEffort = 11;
+static const uint32_t kMaskAlpha = 0xff000000;
+
+// Small helpers, mostly to reduce code size and improve readability.
+static WEBP_INLINE int GetMin(int a, int b) { return (a > b) ? b : a; }
+static WEBP_INLINE int GetMax(int a, int b) { return (a < b) ? b : a; }
+
+//------------------------------------------------------------------------------
+// Methods to calculate Entropy (Shannon).
+
+static float PredictionCostSpatial(const int counts[256], int weight_0,
+ double exp_val) {
+ const int significant_symbols = 256 >> 4;
+ const double exp_decay_factor = 0.6;
+ double bits = weight_0 * counts[0];
+ int i;
+ for (i = 1; i < significant_symbols; ++i) {
+ bits += exp_val * (counts[i] + counts[256 - i]);
+ exp_val *= exp_decay_factor;
+ }
+ return (float)(-0.1 * bits);
+}
+
+static float PredictionCostSpatialHistogram(const int accumulated[4][256],
+ const int tile[4][256]) {
+ int i;
+ double retval = 0;
+ for (i = 0; i < 4; ++i) {
+ const double kExpValue = 0.94;
+ retval += PredictionCostSpatial(tile[i], 1, kExpValue);
+ retval += VP8LCombinedShannonEntropy(tile[i], accumulated[i]);
+ }
+ return (float)retval;
+}
+
+static WEBP_INLINE void UpdateHisto(int histo_argb[4][256], uint32_t argb) {
+ ++histo_argb[0][argb >> 24];
+ ++histo_argb[1][(argb >> 16) & 0xff];
+ ++histo_argb[2][(argb >> 8) & 0xff];
+ ++histo_argb[3][argb & 0xff];
+}
+
+//------------------------------------------------------------------------------
+// Spatial transform functions.
+
+static WEBP_INLINE void PredictBatch(int mode, int x_start, int y,
+ int num_pixels, const uint32_t* current,
+ const uint32_t* upper, uint32_t* out) {
+ if (x_start == 0) {
+ if (y == 0) {
+ // ARGB_BLACK.
+ VP8LPredictorsSub[0](current, NULL, 1, out);
+ } else {
+ // Top one.
+ VP8LPredictorsSub[2](current, upper, 1, out);
+ }
+ ++x_start;
+ ++out;
+ --num_pixels;
+ }
+ if (y == 0) {
+ // Left one.
+ VP8LPredictorsSub[1](current + x_start, NULL, num_pixels, out);
+ } else {
+ VP8LPredictorsSub[mode](current + x_start, upper + x_start, num_pixels,
+ out);
+ }
+}
+
+static int MaxDiffBetweenPixels(uint32_t p1, uint32_t p2) {
+ const int diff_a = abs((int)(p1 >> 24) - (int)(p2 >> 24));
+ const int diff_r = abs((int)((p1 >> 16) & 0xff) - (int)((p2 >> 16) & 0xff));
+ const int diff_g = abs((int)((p1 >> 8) & 0xff) - (int)((p2 >> 8) & 0xff));
+ const int diff_b = abs((int)(p1 & 0xff) - (int)(p2 & 0xff));
+ return GetMax(GetMax(diff_a, diff_r), GetMax(diff_g, diff_b));
+}
+
+static int MaxDiffAroundPixel(uint32_t current, uint32_t up, uint32_t down,
+ uint32_t left, uint32_t right) {
+ const int diff_up = MaxDiffBetweenPixels(current, up);
+ const int diff_down = MaxDiffBetweenPixels(current, down);
+ const int diff_left = MaxDiffBetweenPixels(current, left);
+ const int diff_right = MaxDiffBetweenPixels(current, right);
+ return GetMax(GetMax(diff_up, diff_down), GetMax(diff_left, diff_right));
+}
+
+static uint32_t AddGreenToBlueAndRed(uint32_t argb) {
+ const uint32_t green = (argb >> 8) & 0xff;
+ uint32_t red_blue = argb & 0x00ff00ffu;
+ red_blue += (green << 16) | green;
+ red_blue &= 0x00ff00ffu;
+ return (argb & 0xff00ff00u) | red_blue;
+}
+
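
AddGreenToBlueAndRed undoes the subtract-green transform on red and blue in one packed operation; the final mask drops inter-lane carries, so each channel is adjusted mod 256. A standalone check (the function body is copied from the hunk above):

    #include <assert.h>
    #include <stdint.h>

    static uint32_t AddGreenToBlueAndRed(uint32_t argb) {
      const uint32_t green = (argb >> 8) & 0xff;
      uint32_t red_blue = argb & 0x00ff00ffu;
      red_blue += (green << 16) | green;
      red_blue &= 0x00ff00ffu;
      return (argb & 0xff00ff00u) | red_blue;
    }

    int main(void) {
      assert(AddGreenToBlueAndRed(0xff204080u) == 0xff6040c0u); /* R += G, B += G */
      assert(AddGreenToBlueAndRed(0xff10f020u) == 0xff00f010u); /* wraps mod 256 */
      return 0;
    }
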
+static void MaxDiffsForRow(int width, int stride, const uint32_t* const argb,
+ uint8_t* const max_diffs, int used_subtract_green) {
+ uint32_t current, up, down, left, right;
+ int x;
+ if (width <= 2) return;
+ current = argb[0];
+ right = argb[1];
+ if (used_subtract_green) {
+ current = AddGreenToBlueAndRed(current);
+ right = AddGreenToBlueAndRed(right);
+ }
+ // max_diffs[0] and max_diffs[width - 1] are never used.
+ for (x = 1; x < width - 1; ++x) {
+ up = argb[-stride + x];
+ down = argb[stride + x];
+ left = current;
+ current = right;
+ right = argb[x + 1];
+ if (used_subtract_green) {
+ up = AddGreenToBlueAndRed(up);
+ down = AddGreenToBlueAndRed(down);
+ right = AddGreenToBlueAndRed(right);
+ }
+ max_diffs[x] = MaxDiffAroundPixel(current, up, down, left, right);
+ }
+}
+
+// Quantize the difference between the actual component value and its prediction
+// to a multiple of quantization, working modulo 256, taking care not to cross
+// a boundary (inclusive upper limit).
+static uint8_t NearLosslessComponent(uint8_t value, uint8_t predict,
+ uint8_t boundary, int quantization) {
+ const int residual = (value - predict) & 0xff;
+ const int boundary_residual = (boundary - predict) & 0xff;
+ const int lower = residual & ~(quantization - 1);
+ const int upper = lower + quantization;
+ // Resolve ties towards a value closer to the prediction (i.e. towards lower
+ // if value comes after prediction and towards upper otherwise).
+ const int bias = ((boundary - value) & 0xff) < boundary_residual;
+ if (residual - lower < upper - residual + bias) {
+ // lower is closer to residual than upper.
+ if (residual > boundary_residual && lower <= boundary_residual) {
+ // Halve quantization step to avoid crossing boundary. This midpoint is
+ // on the same side of boundary as residual because midpoint >= residual
+ // (since lower is closer than upper) and residual is above the boundary.
+ return lower + (quantization >> 1);
+ }
+ return lower;
+ } else {
+ // upper is closer to residual than lower.
+ if (residual <= boundary_residual && upper > boundary_residual) {
+ // Halve quantization step to avoid crossing boundary. This midpoint is
+ // on the same side of boundary as residual because midpoint <= residual
+ // (since upper is closer than lower) and residual is below the boundary.
+ return lower + (quantization >> 1);
+ }
+ return upper & 0xff;
+ }
+}
+
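
NearLosslessComponent snaps each modulo-256 residual to the nearest multiple of the quantization step, with tie-breaking and boundary guards as documented inside the function above. Stripped of the boundary handling, the core rounding rule reduces to this sketch (illustrative, not the full function):

    /* Snap (value - predict) mod 256 to the nearest multiple of
     * 'quantization' (a power of 2). E.g. value=23, predict=20,
     * quantization=8: residual 3 snaps down to 0. */
    static int QuantizeResidualSketch(int value, int predict, int quantization) {
      const int residual = (value - predict) & 0xff;
      const int lower = residual & ~(quantization - 1);
      const int upper = lower + quantization;
      return (residual - lower < upper - residual) ? lower : (upper & 0xff);
    }
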
+// Quantize every component of the difference between the actual pixel value
+// and its prediction to a multiple of a quantization step (a power of 2 no
+// larger than max_quantization, and smaller than max_diff). Take care if
+// value and predict have undergone subtract-green, in which case red and
+// blue are represented as offsets from green.
+static uint32_t NearLossless(uint32_t value, uint32_t predict,
+ int max_quantization, int max_diff,
+ int used_subtract_green) {
+ int quantization;
+ uint8_t new_green = 0;
+ uint8_t green_diff = 0;
+ uint8_t a, r, g, b;
+ if (max_diff <= 2) {
+ return VP8LSubPixels(value, predict);
+ }
+ quantization = max_quantization;
+ while (quantization >= max_diff) {
+ quantization >>= 1;
+ }
+ if ((value >> 24) == 0 || (value >> 24) == 0xff) {
+ // Preserve transparency of fully transparent or fully opaque pixels.
+ a = ((value >> 24) - (predict >> 24)) & 0xff;
+ } else {
+ a = NearLosslessComponent(value >> 24, predict >> 24, 0xff, quantization);
+ }
+ g = NearLosslessComponent((value >> 8) & 0xff, (predict >> 8) & 0xff, 0xff,
+ quantization);
+ if (used_subtract_green) {
+ // The green offset will be added to red and blue components during decoding
+ // to obtain the actual red and blue values.
+ new_green = ((predict >> 8) + g) & 0xff;
+ // The amount by which green has been adjusted during quantization. It is
+ // subtracted from red and blue for compensation, to avoid accumulating two
+ // quantization errors in them.
+ green_diff = (new_green - (value >> 8)) & 0xff;
+ }
+ r = NearLosslessComponent(((value >> 16) - green_diff) & 0xff,
+ (predict >> 16) & 0xff, 0xff - new_green,
+ quantization);
+ b = NearLosslessComponent((value - green_diff) & 0xff, predict & 0xff,
+ 0xff - new_green, quantization);
+ return ((uint32_t)a << 24) | ((uint32_t)r << 16) | ((uint32_t)g << 8) | b;
+}
+
+// Stores the difference between the pixel and its prediction in "out".
+// In case of a lossy encoding, updates the source image to avoid propagating
+// the deviation further to pixels which depend on the current pixel for their
+// predictions.
+static WEBP_INLINE void GetResidual(
+ int width, int height, uint32_t* const upper_row,
+ uint32_t* const current_row, const uint8_t* const max_diffs, int mode,
+ int x_start, int x_end, int y, int max_quantization, int exact,
+ int used_subtract_green, uint32_t* const out) {
+ if (exact) {
+ PredictBatch(mode, x_start, y, x_end - x_start, current_row, upper_row,
+ out);
+ } else {
+ const VP8LPredictorFunc pred_func = VP8LPredictors[mode];
+ int x;
+ for (x = x_start; x < x_end; ++x) {
+ uint32_t predict;
+ uint32_t residual;
+ if (y == 0) {
+ predict = (x == 0) ? ARGB_BLACK : current_row[x - 1]; // Left.
+ } else if (x == 0) {
+ predict = upper_row[x]; // Top.
+ } else {
+ predict = pred_func(current_row[x - 1], upper_row + x);
+ }
+ if (max_quantization == 1 || mode == 0 || y == 0 || y == height - 1 ||
+ x == 0 || x == width - 1) {
+ residual = VP8LSubPixels(current_row[x], predict);
+ } else {
+ residual = NearLossless(current_row[x], predict, max_quantization,
+ max_diffs[x], used_subtract_green);
+ // Update the source image.
+ current_row[x] = VP8LAddPixels(predict, residual);
+ // x is never 0 here so we do not need to update upper_row like below.
+ }
+ if ((current_row[x] & kMaskAlpha) == 0) {
+        // If alpha is 0, clean up RGB: the RGB values of the residual can be
+        // chosen freely for best compression, so we set them to 0. The
+        // prediction of alpha itself can be non-zero, though, and must be
+        // kept.
+ residual &= kMaskAlpha;
+ // Update the source image.
+ current_row[x] = predict & ~kMaskAlpha;
+        // The prediction for the rightmost pixel in a row uses the leftmost
+        // pixel in that row as its top-right context pixel. Hence if we
+        // change the leftmost pixel of current_row, the corresponding change
+        // must be applied to upper_row as well, where the top-right context
+        // is read from.
+ if (x == 0 && y != 0) upper_row[width] = current_row[0];
+ }
+ out[x - x_start] = residual;
+ }
+ }
+}
+
+// Returns best predictor and updates the accumulated histogram.
+// If max_quantization > 1, assumes that near lossless processing will be
+// applied, quantizing residuals to multiples of quantization levels up to
+// max_quantization (the actual quantization level depends on smoothness near
+// the given pixel).
+static int GetBestPredictorForTile(int width, int height,
+ int tile_x, int tile_y, int bits,
+ int accumulated[4][256],
+ uint32_t* const argb_scratch,
+ const uint32_t* const argb,
+ int max_quantization,
+ int exact, int used_subtract_green,
+ const uint32_t* const modes) {
+ const int kNumPredModes = 14;
+ const int start_x = tile_x << bits;
+ const int start_y = tile_y << bits;
+ const int tile_size = 1 << bits;
+ const int max_y = GetMin(tile_size, height - start_y);
+ const int max_x = GetMin(tile_size, width - start_x);
+ // Whether there exist columns just outside the tile.
+ const int have_left = (start_x > 0);
+ const int have_right = (max_x < width - start_x);
+ // Position and size of the strip covering the tile and adjacent columns if
+ // they exist.
+ const int context_start_x = start_x - have_left;
+ const int context_width = max_x + have_left + have_right;
+ const int tiles_per_row = VP8LSubSampleSize(width, bits);
+ // Prediction modes of the left and above neighbor tiles.
+ const int left_mode = (tile_x > 0) ?
+ (modes[tile_y * tiles_per_row + tile_x - 1] >> 8) & 0xff : 0xff;
+ const int above_mode = (tile_y > 0) ?
+ (modes[(tile_y - 1) * tiles_per_row + tile_x] >> 8) & 0xff : 0xff;
+ // The width of upper_row and current_row is one pixel larger than image width
+ // to allow the top right pixel to point to the leftmost pixel of the next row
+ // when at the right edge.
+ uint32_t* upper_row = argb_scratch;
+ uint32_t* current_row = upper_row + width + 1;
+ uint8_t* const max_diffs = (uint8_t*)(current_row + width + 1);
+ float best_diff = MAX_DIFF_COST;
+ int best_mode = 0;
+ int mode;
+ int histo_stack_1[4][256];
+ int histo_stack_2[4][256];
+ // Need pointers to be able to swap arrays.
+ int (*histo_argb)[256] = histo_stack_1;
+ int (*best_histo)[256] = histo_stack_2;
+ int i, j;
+ uint32_t residuals[1 << MAX_TRANSFORM_BITS];
+ assert(bits <= MAX_TRANSFORM_BITS);
+ assert(max_x <= (1 << MAX_TRANSFORM_BITS));
+
+ for (mode = 0; mode < kNumPredModes; ++mode) {
+ float cur_diff;
+ int relative_y;
+ memset(histo_argb, 0, sizeof(histo_stack_1));
+ if (start_y > 0) {
+ // Read the row above the tile which will become the first upper_row.
+ // Include a pixel to the left if it exists; include a pixel to the right
+ // in all cases (wrapping to the leftmost pixel of the next row if it does
+ // not exist).
+ memcpy(current_row + context_start_x,
+ argb + (start_y - 1) * width + context_start_x,
+ sizeof(*argb) * (max_x + have_left + 1));
+ }
+ for (relative_y = 0; relative_y < max_y; ++relative_y) {
+ const int y = start_y + relative_y;
+ int relative_x;
+ uint32_t* tmp = upper_row;
+ upper_row = current_row;
+ current_row = tmp;
+ // Read current_row. Include a pixel to the left if it exists; include a
+ // pixel to the right in all cases except at the bottom right corner of
+ // the image (wrapping to the leftmost pixel of the next row if it does
+ // not exist in the current row).
+ memcpy(current_row + context_start_x,
+ argb + y * width + context_start_x,
+ sizeof(*argb) * (max_x + have_left + (y + 1 < height)));
+ if (max_quantization > 1 && y >= 1 && y + 1 < height) {
+ MaxDiffsForRow(context_width, width, argb + y * width + context_start_x,
+ max_diffs + context_start_x, used_subtract_green);
+ }
+
+ GetResidual(width, height, upper_row, current_row, max_diffs, mode,
+ start_x, start_x + max_x, y, max_quantization, exact,
+ used_subtract_green, residuals);
+ for (relative_x = 0; relative_x < max_x; ++relative_x) {
+ UpdateHisto(histo_argb, residuals[relative_x]);
+ }
+ }
+ cur_diff = PredictionCostSpatialHistogram(
+ (const int (*)[256])accumulated, (const int (*)[256])histo_argb);
+ // Favor keeping the areas locally similar.
+ if (mode == left_mode) cur_diff -= kSpatialPredictorBias;
+ if (mode == above_mode) cur_diff -= kSpatialPredictorBias;
+
+ if (cur_diff < best_diff) {
+ int (*tmp)[256] = histo_argb;
+ histo_argb = best_histo;
+ best_histo = tmp;
+ best_diff = cur_diff;
+ best_mode = mode;
+ }
+ }
+
+ for (i = 0; i < 4; i++) {
+ for (j = 0; j < 256; j++) {
+ accumulated[i][j] += best_histo[i][j];
+ }
+ }
+
+ return best_mode;
+}
+
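
A note on the tile arithmetic above: tiles are addressed by index, and the shifts and clamps turn an index into a clipped pixel range. A minimal standalone sketch (hypothetical 100x80 image, bits = 4, i.e. 16x16 tiles; GetMin is assumed to be a plain minimum):

    /* Illustrative only. */
    static int GetMinSketch(int a, int b) { return (a < b) ? a : b; }

    /* For tile (6, 2) in a 100x80 image with bits == 4:
         start_x = 6 << 4 = 96,  max_x = GetMinSketch(16, 100 - 96) = 4
         start_y = 2 << 4 = 32,  max_y = GetMinSketch(16,  80 - 32) = 16
       i.e. the rightmost tile column is only 4 pixels wide. */
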
+// Converts pixels of the image to residuals with respect to predictions.
+// If max_quantization > 1, applies near lossless processing, quantizing
+// residuals to multiples of quantization levels up to max_quantization
+// (the actual quantization level depends on smoothness near the given pixel).
+static void CopyImageWithPrediction(int width, int height,
+ int bits, uint32_t* const modes,
+ uint32_t* const argb_scratch,
+ uint32_t* const argb,
+ int low_effort, int max_quantization,
+ int exact, int used_subtract_green) {
+ const int tiles_per_row = VP8LSubSampleSize(width, bits);
+ // The width of upper_row and current_row is one pixel larger than image width
+ // to allow the top right pixel to point to the leftmost pixel of the next row
+ // when at the right edge.
+ uint32_t* upper_row = argb_scratch;
+ uint32_t* current_row = upper_row + width + 1;
+ uint8_t* current_max_diffs = (uint8_t*)(current_row + width + 1);
+ uint8_t* lower_max_diffs = current_max_diffs + width;
+ int y;
+
+ for (y = 0; y < height; ++y) {
+ int x;
+ uint32_t* const tmp32 = upper_row;
+ upper_row = current_row;
+ current_row = tmp32;
+ memcpy(current_row, argb + y * width,
+ sizeof(*argb) * (width + (y + 1 < height)));
+
+ if (low_effort) {
+ PredictBatch(kPredLowEffort, 0, y, width, current_row, upper_row,
+ argb + y * width);
+ } else {
+ if (max_quantization > 1) {
+ // Compute max_diffs for the lower row now, because that needs the
+ // contents of argb for the current row, which we will overwrite with
+ // residuals before proceeding with the next row.
+ uint8_t* const tmp8 = current_max_diffs;
+ current_max_diffs = lower_max_diffs;
+ lower_max_diffs = tmp8;
+ if (y + 2 < height) {
+ MaxDiffsForRow(width, width, argb + (y + 1) * width, lower_max_diffs,
+ used_subtract_green);
+ }
+ }
+ for (x = 0; x < width;) {
+ const int mode =
+ (modes[(y >> bits) * tiles_per_row + (x >> bits)] >> 8) & 0xff;
+ int x_end = x + (1 << bits);
+ if (x_end > width) x_end = width;
+ GetResidual(width, height, upper_row, current_row, current_max_diffs,
+ mode, x, x_end, y, max_quantization, exact,
+ used_subtract_green, argb + y * width + x);
+ x = x_end;
+ }
+ }
+ }
+}
+
+// Finds the best predictor for each tile, and converts the image to residuals
+// with respect to predictions. If near_lossless_quality < 100, applies
+// near lossless processing, shaving off more bits of residuals for lower
+// qualities.
+void VP8LResidualImage(int width, int height, int bits, int low_effort,
+ uint32_t* const argb, uint32_t* const argb_scratch,
+ uint32_t* const image, int near_lossless_quality,
+ int exact, int used_subtract_green) {
+ const int tiles_per_row = VP8LSubSampleSize(width, bits);
+ const int tiles_per_col = VP8LSubSampleSize(height, bits);
+ int tile_y;
+ int histo[4][256];
+ const int max_quantization = 1 << VP8LNearLosslessBits(near_lossless_quality);
+ if (low_effort) {
+ int i;
+ for (i = 0; i < tiles_per_row * tiles_per_col; ++i) {
+ image[i] = ARGB_BLACK | (kPredLowEffort << 8);
+ }
+ } else {
+ memset(histo, 0, sizeof(histo));
+ for (tile_y = 0; tile_y < tiles_per_col; ++tile_y) {
+ int tile_x;
+ for (tile_x = 0; tile_x < tiles_per_row; ++tile_x) {
+ const int pred = GetBestPredictorForTile(width, height, tile_x, tile_y,
+ bits, histo, argb_scratch, argb, max_quantization, exact,
+ used_subtract_green, image);
+ image[tile_y * tiles_per_row + tile_x] = ARGB_BLACK | (pred << 8);
+ }
+ }
+ }
+
+ CopyImageWithPrediction(width, height, bits, image, argb_scratch, argb,
+ low_effort, max_quantization, exact,
+ used_subtract_green);
+}
+
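
VP8LSubSampleSize, used above to size the tile grid, is (in upstream libwebp) a ceiling division by the tile size; a sketch of the equivalent computation:

    /* Equivalent of VP8LSubSampleSize: ceiling division by (1 << bits). */
    static unsigned SubSampleSizeSketch(unsigned size, unsigned bits) {
      return (size + (1u << bits) - 1) >> bits;
    }
    /* e.g. width = 100, bits = 4  ->  (100 + 15) >> 4 == 7 tiles per row. */
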
+//------------------------------------------------------------------------------
+// Color transform functions.
+
+static WEBP_INLINE void MultipliersClear(VP8LMultipliers* const m) {
+ m->green_to_red_ = 0;
+ m->green_to_blue_ = 0;
+ m->red_to_blue_ = 0;
+}
+
+static WEBP_INLINE void ColorCodeToMultipliers(uint32_t color_code,
+ VP8LMultipliers* const m) {
+ m->green_to_red_ = (color_code >> 0) & 0xff;
+ m->green_to_blue_ = (color_code >> 8) & 0xff;
+ m->red_to_blue_ = (color_code >> 16) & 0xff;
+}
+
+static WEBP_INLINE uint32_t MultipliersToColorCode(
+ const VP8LMultipliers* const m) {
+ return 0xff000000u |
+ ((uint32_t)(m->red_to_blue_) << 16) |
+ ((uint32_t)(m->green_to_blue_) << 8) |
+ m->green_to_red_;
+}
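
The color code is simply the three multipliers packed into an opaque ARGB pixel, and the two helpers above are exact inverses of each other for opaque color codes. A quick round-trip check (using the functions from this file):

    VP8LMultipliers m;
    ColorCodeToMultipliers(0xff302010u, &m);
    /* m.green_to_red_ == 0x10, m.green_to_blue_ == 0x20,
       m.red_to_blue_ == 0x30 */
    assert(MultipliersToColorCode(&m) == 0xff302010u);  /* needs <assert.h> */
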
+
+static float PredictionCostCrossColor(const int accumulated[256],
+ const int counts[256]) {
+ // Favor low entropy, locally and globally.
+  // Favor small absolute values for PredictionCostSpatial.
+ static const double kExpValue = 2.4;
+ return VP8LCombinedShannonEntropy(counts, accumulated) +
+ PredictionCostSpatial(counts, 3, kExpValue);
+}
+
+static float GetPredictionCostCrossColorRed(
+ const uint32_t* argb, int stride, int tile_width, int tile_height,
+ VP8LMultipliers prev_x, VP8LMultipliers prev_y, int green_to_red,
+ const int accumulated_red_histo[256]) {
+ int histo[256] = { 0 };
+ float cur_diff;
+
+ VP8LCollectColorRedTransforms(argb, stride, tile_width, tile_height,
+ green_to_red, histo);
+
+ cur_diff = PredictionCostCrossColor(accumulated_red_histo, histo);
+ if ((uint8_t)green_to_red == prev_x.green_to_red_) {
+ cur_diff -= 3; // favor keeping the areas locally similar
+ }
+ if ((uint8_t)green_to_red == prev_y.green_to_red_) {
+ cur_diff -= 3; // favor keeping the areas locally similar
+ }
+ if (green_to_red == 0) {
+ cur_diff -= 3;
+ }
+ return cur_diff;
+}
+
+static void GetBestGreenToRed(
+ const uint32_t* argb, int stride, int tile_width, int tile_height,
+ VP8LMultipliers prev_x, VP8LMultipliers prev_y, int quality,
+ const int accumulated_red_histo[256], VP8LMultipliers* const best_tx) {
+ const int kMaxIters = 4 + ((7 * quality) >> 8); // in range [4..6]
+ int green_to_red_best = 0;
+ int iter, offset;
+ float best_diff = GetPredictionCostCrossColorRed(
+ argb, stride, tile_width, tile_height, prev_x, prev_y,
+ green_to_red_best, accumulated_red_histo);
+ for (iter = 0; iter < kMaxIters; ++iter) {
+    // ColorTransformDelta is a 3.5-bit fixed-point value, so 32 is equal to
+    // 1.0 in the color computation. Starting the delta at 32 (i.e. 1.0) is
+    // sufficient to explore the multiplier range of (-2, 2).
+ const int delta = 32 >> iter;
+ // Try a negative and a positive delta from the best known value.
+ for (offset = -delta; offset <= delta; offset += 2 * delta) {
+ const int green_to_red_cur = offset + green_to_red_best;
+ const float cur_diff = GetPredictionCostCrossColorRed(
+ argb, stride, tile_width, tile_height, prev_x, prev_y,
+ green_to_red_cur, accumulated_red_histo);
+ if (cur_diff < best_diff) {
+ best_diff = cur_diff;
+ green_to_red_best = green_to_red_cur;
+ }
+ }
+ }
+ best_tx->green_to_red_ = green_to_red_best;
+}
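
The "3.5 bit fixed point" mentioned above refers to the per-channel multiplier: +/-32 represents +/-1.0. In libwebp's lossless DSP code the per-pixel delta is computed along these lines (a sketch, not the verbatim upstream source):

    /* 'color_pred' is a signed 3.5 fixed-point multiplier: 32 == 1.0. */
    static int ColorTransformDeltaSketch(signed char color_pred,
                                         signed char color) {
      return ((int)color_pred * color) >> 5;
    }
    /* e.g. color_pred = 32 (1.0), color = 10  ->  delta = 10. */
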
+
+static float GetPredictionCostCrossColorBlue(
+ const uint32_t* argb, int stride, int tile_width, int tile_height,
+ VP8LMultipliers prev_x, VP8LMultipliers prev_y,
+ int green_to_blue, int red_to_blue, const int accumulated_blue_histo[256]) {
+ int histo[256] = { 0 };
+ float cur_diff;
+
+ VP8LCollectColorBlueTransforms(argb, stride, tile_width, tile_height,
+ green_to_blue, red_to_blue, histo);
+
+ cur_diff = PredictionCostCrossColor(accumulated_blue_histo, histo);
+ if ((uint8_t)green_to_blue == prev_x.green_to_blue_) {
+ cur_diff -= 3; // favor keeping the areas locally similar
+ }
+ if ((uint8_t)green_to_blue == prev_y.green_to_blue_) {
+ cur_diff -= 3; // favor keeping the areas locally similar
+ }
+ if ((uint8_t)red_to_blue == prev_x.red_to_blue_) {
+ cur_diff -= 3; // favor keeping the areas locally similar
+ }
+ if ((uint8_t)red_to_blue == prev_y.red_to_blue_) {
+ cur_diff -= 3; // favor keeping the areas locally similar
+ }
+ if (green_to_blue == 0) {
+ cur_diff -= 3;
+ }
+ if (red_to_blue == 0) {
+ cur_diff -= 3;
+ }
+ return cur_diff;
+}
+
+#define kGreenRedToBlueNumAxis 8
+#define kGreenRedToBlueMaxIters 7
+static void GetBestGreenRedToBlue(
+ const uint32_t* argb, int stride, int tile_width, int tile_height,
+ VP8LMultipliers prev_x, VP8LMultipliers prev_y, int quality,
+ const int accumulated_blue_histo[256],
+ VP8LMultipliers* const best_tx) {
+ const int8_t offset[kGreenRedToBlueNumAxis][2] =
+ {{0, -1}, {0, 1}, {-1, 0}, {1, 0}, {-1, -1}, {-1, 1}, {1, -1}, {1, 1}};
+ const int8_t delta_lut[kGreenRedToBlueMaxIters] = { 16, 16, 8, 4, 2, 2, 2 };
+ const int iters =
+ (quality < 25) ? 1 : (quality > 50) ? kGreenRedToBlueMaxIters : 4;
+ int green_to_blue_best = 0;
+ int red_to_blue_best = 0;
+ int iter;
+ // Initial value at origin:
+ float best_diff = GetPredictionCostCrossColorBlue(
+ argb, stride, tile_width, tile_height, prev_x, prev_y,
+ green_to_blue_best, red_to_blue_best, accumulated_blue_histo);
+ for (iter = 0; iter < iters; ++iter) {
+ const int delta = delta_lut[iter];
+ int axis;
+ for (axis = 0; axis < kGreenRedToBlueNumAxis; ++axis) {
+ const int green_to_blue_cur =
+ offset[axis][0] * delta + green_to_blue_best;
+ const int red_to_blue_cur = offset[axis][1] * delta + red_to_blue_best;
+ const float cur_diff = GetPredictionCostCrossColorBlue(
+ argb, stride, tile_width, tile_height, prev_x, prev_y,
+ green_to_blue_cur, red_to_blue_cur, accumulated_blue_histo);
+ if (cur_diff < best_diff) {
+ best_diff = cur_diff;
+ green_to_blue_best = green_to_blue_cur;
+ red_to_blue_best = red_to_blue_cur;
+ }
+ if (quality < 25 && iter == 4) {
+ // Only axis aligned diffs for lower quality.
+ break; // next iter.
+ }
+ }
+ if (delta == 2 && green_to_blue_best == 0 && red_to_blue_best == 0) {
+ // Further iterations would not help.
+ break; // out of iter-loop.
+ }
+ }
+ best_tx->green_to_blue_ = green_to_blue_best;
+ best_tx->red_to_blue_ = red_to_blue_best;
+}
+#undef kGreenRedToBlueMaxIters
+#undef kGreenRedToBlueNumAxis
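
The loop above is a shrinking pattern search over the (green_to_blue, red_to_blue) plane. Expanding offset[] and delta_lut[] for the first iteration (delta == 16) gives these eight probes around the current best (0, 0):

    /* Candidate (green_to_blue, red_to_blue) pairs tried at iteration 0: */
    static const int kIter0Probes[8][2] = {
      {   0, -16 }, {   0,  16 }, { -16,   0 }, {  16,   0 },
      { -16, -16 }, { -16,  16 }, {  16, -16 }, {  16,  16 }
    };
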
+
+static VP8LMultipliers GetBestColorTransformForTile(
+ int tile_x, int tile_y, int bits,
+ VP8LMultipliers prev_x,
+ VP8LMultipliers prev_y,
+ int quality, int xsize, int ysize,
+ const int accumulated_red_histo[256],
+ const int accumulated_blue_histo[256],
+ const uint32_t* const argb) {
+ const int max_tile_size = 1 << bits;
+ const int tile_y_offset = tile_y * max_tile_size;
+ const int tile_x_offset = tile_x * max_tile_size;
+ const int all_x_max = GetMin(tile_x_offset + max_tile_size, xsize);
+ const int all_y_max = GetMin(tile_y_offset + max_tile_size, ysize);
+ const int tile_width = all_x_max - tile_x_offset;
+ const int tile_height = all_y_max - tile_y_offset;
+ const uint32_t* const tile_argb = argb + tile_y_offset * xsize
+ + tile_x_offset;
+ VP8LMultipliers best_tx;
+ MultipliersClear(&best_tx);
+
+ GetBestGreenToRed(tile_argb, xsize, tile_width, tile_height,
+ prev_x, prev_y, quality, accumulated_red_histo, &best_tx);
+ GetBestGreenRedToBlue(tile_argb, xsize, tile_width, tile_height,
+ prev_x, prev_y, quality, accumulated_blue_histo,
+ &best_tx);
+ return best_tx;
+}
+
+// Note: 'tile_x' and 'tile_y' are the pixel coordinates of the tile's
+// top-left corner, not tile indices.
+static void CopyTileWithColorTransform(int xsize, int ysize,
+                                       int tile_x, int tile_y,
+                                       int max_tile_size,
+                                       VP8LMultipliers color_transform,
+                                       uint32_t* argb) {
+ const int xscan = GetMin(max_tile_size, xsize - tile_x);
+ int yscan = GetMin(max_tile_size, ysize - tile_y);
+ argb += tile_y * xsize + tile_x;
+ while (yscan-- > 0) {
+ VP8LTransformColor(&color_transform, argb, xscan);
+ argb += xsize;
+ }
+}
+
+void VP8LColorSpaceTransform(int width, int height, int bits, int quality,
+ uint32_t* const argb, uint32_t* image) {
+ const int max_tile_size = 1 << bits;
+ const int tile_xsize = VP8LSubSampleSize(width, bits);
+ const int tile_ysize = VP8LSubSampleSize(height, bits);
+ int accumulated_red_histo[256] = { 0 };
+ int accumulated_blue_histo[256] = { 0 };
+ int tile_x, tile_y;
+ VP8LMultipliers prev_x, prev_y;
+ MultipliersClear(&prev_y);
+ MultipliersClear(&prev_x);
+ for (tile_y = 0; tile_y < tile_ysize; ++tile_y) {
+ for (tile_x = 0; tile_x < tile_xsize; ++tile_x) {
+ int y;
+ const int tile_x_offset = tile_x * max_tile_size;
+ const int tile_y_offset = tile_y * max_tile_size;
+ const int all_x_max = GetMin(tile_x_offset + max_tile_size, width);
+ const int all_y_max = GetMin(tile_y_offset + max_tile_size, height);
+ const int offset = tile_y * tile_xsize + tile_x;
+ if (tile_y != 0) {
+ ColorCodeToMultipliers(image[offset - tile_xsize], &prev_y);
+ }
+ prev_x = GetBestColorTransformForTile(tile_x, tile_y, bits,
+ prev_x, prev_y,
+ quality, width, height,
+ accumulated_red_histo,
+ accumulated_blue_histo,
+ argb);
+ image[offset] = MultipliersToColorCode(&prev_x);
+ CopyTileWithColorTransform(width, height, tile_x_offset, tile_y_offset,
+ max_tile_size, prev_x, argb);
+
+ // Gather accumulated histogram data.
+ for (y = tile_y_offset; y < all_y_max; ++y) {
+ int ix = y * width + tile_x_offset;
+ const int ix_end = ix + all_x_max - tile_x_offset;
+ for (; ix < ix_end; ++ix) {
+ const uint32_t pix = argb[ix];
+ if (ix >= 2 &&
+ pix == argb[ix - 2] &&
+ pix == argb[ix - 1]) {
+ continue; // repeated pixels are handled by backward references
+ }
+ if (ix >= width + 2 &&
+ argb[ix - 2] == argb[ix - width - 2] &&
+ argb[ix - 1] == argb[ix - width - 1] &&
+ pix == argb[ix - width]) {
+ continue; // repeated pixels are handled by backward references
+ }
+ ++accumulated_red_histo[(pix >> 16) & 0xff];
+ ++accumulated_blue_histo[(pix >> 0) & 0xff];
+ }
+ }
+ }
+ }
+}
diff --git a/src/3rdparty/libwebp/src/enc/quant.c b/src/3rdparty/libwebp/src/enc/quant_enc.c
index 549ad26..b118fb2 100644
--- a/src/3rdparty/libwebp/src/enc/quant.c
+++ b/src/3rdparty/libwebp/src/enc/quant_enc.c
@@ -15,8 +15,8 @@
#include <math.h>
#include <stdlib.h> // for abs()
-#include "./vp8enci.h"
-#include "./cost.h"
+#include "./vp8i_enc.h"
+#include "./cost_enc.h"
#define DO_TRELLIS_I4 1
#define DO_TRELLIS_I16 1 // not a huge gain, but ok at low bitrate.
@@ -278,7 +278,7 @@ static void SetupMatrices(VP8Encoder* enc) {
CheckLambdaValue(&m->lambda_trellis_uv_);
CheckLambdaValue(&m->tlambda_);
- m->min_disto_ = 10 * m->y1_.q_[0]; // quantization-aware min disto
+ m->min_disto_ = 20 * m->y1_.q_[0]; // quantization-aware min disto
m->max_edge_ = 0;
m->i4_penalty_ = 1000 * q_i4 * q_i4;
@@ -643,6 +643,8 @@ static int TrellisQuantizeBlock(const VP8Encoder* const enc,
const int sign = (in[j] < 0);
const uint32_t coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
int level0 = QUANTDIV(coeff0, iQ, B);
+ int thresh_level = QUANTDIV(coeff0, iQ, BIAS(0x80));
+ if (thresh_level > MAX_LEVEL) thresh_level = MAX_LEVEL;
if (level0 > MAX_LEVEL) level0 = MAX_LEVEL;
{ // Swap current and previous score states
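
The new thresh_level reuses the QUANTDIV/BIAS machinery. In upstream quant_enc.c these amount to a fixed-point division with a rounding bias, roughly (a sketch under that assumption):

    #define QFIX_SKETCH 17
    #define BIAS_SKETCH(b) ((b) << (QFIX_SKETCH - 8))
    /* Quantized level for coefficient n: (n * iQ + B) >> QFIX,
       where iQ approximates (1 << QFIX) / Q. */
    static int QuantDivSketch(unsigned n, unsigned iQ, unsigned B) {
      return (int)((n * iQ + B) >> QFIX_SKETCH);
    }
    /* BIAS(0x80) is a 0.5 rounding bias, so thresh_level is the
       round-to-nearest level: an upper bound on how far the trellis
       search may raise level0. */
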
@@ -657,23 +659,17 @@ static int TrellisQuantizeBlock(const VP8Encoder* const enc,
int level = level0 + m;
const int ctx = (level > 2) ? 2 : level;
const int band = VP8EncBands[n + 1];
- score_t base_score, last_pos_score;
+ score_t base_score;
score_t best_cur_score = MAX_COST;
int best_prev = 0; // default, in case
ss_cur[m].score = MAX_COST;
ss_cur[m].costs = costs[n + 1][ctx];
- if (level > MAX_LEVEL || level < 0) { // node is dead?
+ if (level < 0 || level > thresh_level) {
+ // Node is dead.
continue;
}
- // Compute extra rate cost if last coeff's position is < 15
- {
- const score_t last_pos_cost =
- (n < 15) ? VP8BitCost(0, probas[band][ctx][0]) : 0;
- last_pos_score = RDScoreTrellis(lambda, last_pos_cost, 0);
- }
-
{
// Compute delta_error = how much coding this level will
// subtract to max_error as distortion.
@@ -705,6 +701,9 @@ static int TrellisQuantizeBlock(const VP8Encoder* const enc,
// Now, record best terminal node (and thus best entry in the graph).
if (level != 0) {
+ const score_t last_pos_cost =
+ (n < 15) ? VP8BitCost(0, probas[band][ctx][0]) : 0;
+ const score_t last_pos_score = RDScoreTrellis(lambda, last_pos_cost, 0);
const score_t score = best_cur_score + last_pos_score;
if (score < best_score) {
best_score = score;
@@ -874,9 +873,9 @@ static void StoreMaxDelta(VP8SegmentInfo* const dqm, const int16_t DCs[16]) {
// We look at the first three AC coefficients to determine the average
// delta between each sub-4x4 block.
const int v0 = abs(DCs[1]);
- const int v1 = abs(DCs[4]);
- const int v2 = abs(DCs[5]);
- int max_v = (v0 > v1) ? v1 : v0;
+ const int v1 = abs(DCs[2]);
+ const int v2 = abs(DCs[4]);
+ int max_v = (v1 > v0) ? v1 : v0;
max_v = (v2 > max_v) ? v2 : max_v;
if (max_v > dqm->max_edge_) dqm->max_edge_ = max_v;
}
@@ -957,7 +956,7 @@ static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* rd) {
// we have a blocky macroblock (only DCs are non-zero) with fairly high
// distortion, record max delta so we can later adjust the minimal filtering
// strength needed to smooth these blocks out.
- if ((rd->nz & 0xffff) == 0 && rd->D > dqm->min_disto_) {
+ if ((rd->nz & 0x100ffff) == 0x1000000 && rd->D > dqm->min_disto_) {
StoreMaxDelta(dqm, rd->y_dc_levels);
}
}
@@ -1155,7 +1154,8 @@ static void RefineUsingDistortion(VP8EncIterator* const it,
const int lambda_d_uv = 120;
score_t score_i4 = dqm->i4_penalty_;
score_t i4_bit_sum = 0;
- const score_t bit_limit = it->enc_->mb_header_limit_;
+ const score_t bit_limit = try_both_modes ? it->enc_->mb_header_limit_
+ : MAX_COST; // no early-out allowed
if (is_i16) { // First, evaluate Intra16 distortion
int best_mode = -1;
diff --git a/src/3rdparty/libwebp/src/enc/syntax.c b/src/3rdparty/libwebp/src/enc/syntax_enc.c
index a0e79ef..90665bd 100644
--- a/src/3rdparty/libwebp/src/enc/syntax.c
+++ b/src/3rdparty/libwebp/src/enc/syntax_enc.c
@@ -16,7 +16,7 @@
#include "../utils/utils.h"
#include "../webp/format_constants.h" // RIFF constants
#include "../webp/mux_types.h" // ALPHA_FLAG
-#include "./vp8enci.h"
+#include "./vp8i_enc.h"
//------------------------------------------------------------------------------
// Helper functions
@@ -362,8 +362,7 @@ int VP8EncWrite(VP8Encoder* const enc) {
for (p = 0; p < enc->num_parts_; ++p) {
const uint8_t* const buf = VP8BitWriterBuf(enc->parts_ + p);
const size_t size = VP8BitWriterSize(enc->parts_ + p);
- if (size)
- ok = ok && pic->writer(buf, size, pic);
+ if (size) ok = ok && pic->writer(buf, size, pic);
VP8BitWriterWipeOut(enc->parts_ + p); // will free the internal buffer.
ok = ok && WebPReportProgress(pic, enc->percent_ + percent_per_part,
&enc->percent_);
diff --git a/src/3rdparty/libwebp/src/enc/token.c b/src/3rdparty/libwebp/src/enc/token_enc.c
index e73256b..02a0d72 100644
--- a/src/3rdparty/libwebp/src/enc/token.c
+++ b/src/3rdparty/libwebp/src/enc/token_enc.c
@@ -20,8 +20,8 @@
#include <stdlib.h>
#include <string.h>
-#include "./cost.h"
-#include "./vp8enci.h"
+#include "./cost_enc.h"
+#include "./vp8i_enc.h"
#include "../utils/utils.h"
#if !defined(DISABLE_TOKEN_BUFFER)
@@ -87,14 +87,16 @@ static int TBufferNewPage(VP8TBuffer* const b) {
#define TOKEN_ID(t, b, ctx) \
(NUM_PROBAS * ((ctx) + NUM_CTX * ((b) + NUM_BANDS * (t))))
-static WEBP_INLINE uint32_t AddToken(VP8TBuffer* const b,
- uint32_t bit, uint32_t proba_idx) {
+static WEBP_INLINE uint32_t AddToken(VP8TBuffer* const b, uint32_t bit,
+ uint32_t proba_idx,
+ proba_t* const stats) {
assert(proba_idx < FIXED_PROBA_BIT);
assert(bit <= 1);
if (b->left_ > 0 || TBufferNewPage(b)) {
const int slot = --b->left_;
b->tokens_[slot] = (bit << 15) | proba_idx;
}
+ VP8RecordStats(bit, stats);
return bit;
}
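
AddToken now also feeds every emitted bit into VP8RecordStats. A proba_t packs two 16-bit counters (total bits recorded in the upper half, number of '1' bits in the lower half); a simplified sketch of the recording step, leaving out the halving the real function performs near counter overflow:

    typedef unsigned int proba_t;  /* assumed layout: events << 16 | ones */
    static void RecordStatsSketch(int bit, proba_t* const stats) {
      *stats += 0x00010000u + (unsigned)bit;
    }
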
@@ -108,13 +110,16 @@ static WEBP_INLINE void AddConstantToken(VP8TBuffer* const b,
}
}
-int VP8RecordCoeffTokens(const int ctx, const int coeff_type,
- int first, int last,
- const int16_t* const coeffs,
+int VP8RecordCoeffTokens(int ctx, const struct VP8Residual* const res,
VP8TBuffer* const tokens) {
- int n = first;
+ const int16_t* const coeffs = res->coeffs;
+ const int coeff_type = res->coeff_type;
+ const int last = res->last;
+ int n = res->first;
uint32_t base_id = TOKEN_ID(coeff_type, n, ctx);
- if (!AddToken(tokens, last >= 0, base_id + 0)) {
+ // should be stats[VP8EncBands[n]], but it's equivalent for n=0 or 1
+ proba_t* s = res->stats[n][ctx];
+ if (!AddToken(tokens, last >= 0, base_id + 0, s + 0)) {
return 0;
}
@@ -122,18 +127,21 @@ int VP8RecordCoeffTokens(const int ctx, const int coeff_type,
const int c = coeffs[n++];
const int sign = c < 0;
const uint32_t v = sign ? -c : c;
- if (!AddToken(tokens, v != 0, base_id + 1)) {
+ if (!AddToken(tokens, v != 0, base_id + 1, s + 1)) {
base_id = TOKEN_ID(coeff_type, VP8EncBands[n], 0); // ctx=0
+ s = res->stats[VP8EncBands[n]][0];
continue;
}
- if (!AddToken(tokens, v > 1, base_id + 2)) {
+ if (!AddToken(tokens, v > 1, base_id + 2, s + 2)) {
base_id = TOKEN_ID(coeff_type, VP8EncBands[n], 1); // ctx=1
+ s = res->stats[VP8EncBands[n]][1];
} else {
- if (!AddToken(tokens, v > 4, base_id + 3)) {
- if (AddToken(tokens, v != 2, base_id + 4))
- AddToken(tokens, v == 4, base_id + 5);
- } else if (!AddToken(tokens, v > 10, base_id + 6)) {
- if (!AddToken(tokens, v > 6, base_id + 7)) {
+ if (!AddToken(tokens, v > 4, base_id + 3, s + 3)) {
+ if (AddToken(tokens, v != 2, base_id + 4, s + 4)) {
+ AddToken(tokens, v == 4, base_id + 5, s + 5);
+ }
+ } else if (!AddToken(tokens, v > 10, base_id + 6, s + 6)) {
+ if (!AddToken(tokens, v > 6, base_id + 7, s + 7)) {
AddConstantToken(tokens, v == 6, 159);
} else {
AddConstantToken(tokens, v >= 9, 165);
@@ -144,26 +152,26 @@ int VP8RecordCoeffTokens(const int ctx, const int coeff_type,
const uint8_t* tab;
uint32_t residue = v - 3;
if (residue < (8 << 1)) { // VP8Cat3 (3b)
- AddToken(tokens, 0, base_id + 8);
- AddToken(tokens, 0, base_id + 9);
+ AddToken(tokens, 0, base_id + 8, s + 8);
+ AddToken(tokens, 0, base_id + 9, s + 9);
residue -= (8 << 0);
mask = 1 << 2;
tab = VP8Cat3;
} else if (residue < (8 << 2)) { // VP8Cat4 (4b)
- AddToken(tokens, 0, base_id + 8);
- AddToken(tokens, 1, base_id + 9);
+ AddToken(tokens, 0, base_id + 8, s + 8);
+ AddToken(tokens, 1, base_id + 9, s + 9);
residue -= (8 << 1);
mask = 1 << 3;
tab = VP8Cat4;
} else if (residue < (8 << 3)) { // VP8Cat5 (5b)
- AddToken(tokens, 1, base_id + 8);
- AddToken(tokens, 0, base_id + 10);
+ AddToken(tokens, 1, base_id + 8, s + 8);
+ AddToken(tokens, 0, base_id + 10, s + 9);
residue -= (8 << 2);
mask = 1 << 4;
tab = VP8Cat5;
} else { // VP8Cat6 (11b)
- AddToken(tokens, 1, base_id + 8);
- AddToken(tokens, 1, base_id + 10);
+ AddToken(tokens, 1, base_id + 8, s + 8);
+ AddToken(tokens, 1, base_id + 10, s + 9);
residue -= (8 << 3);
mask = 1 << 10;
tab = VP8Cat6;
@@ -174,9 +182,10 @@ int VP8RecordCoeffTokens(const int ctx, const int coeff_type,
}
}
base_id = TOKEN_ID(coeff_type, VP8EncBands[n], 2); // ctx=2
+ s = res->stats[VP8EncBands[n]][2];
}
AddConstantToken(tokens, sign, 128);
- if (n == 16 || !AddToken(tokens, n <= last, base_id + 0)) {
+ if (n == 16 || !AddToken(tokens, n <= last, base_id + 0, s + 0)) {
return 1; // EOB
}
}
diff --git a/src/3rdparty/libwebp/src/enc/tree.c b/src/3rdparty/libwebp/src/enc/tree_enc.c
index f141006..2c40fe7 100644
--- a/src/3rdparty/libwebp/src/enc/tree.c
+++ b/src/3rdparty/libwebp/src/enc/tree_enc.c
@@ -11,7 +11,7 @@
//
// Author: Skal (pascal.massimino@gmail.com)
-#include "./vp8enci.h"
+#include "./vp8i_enc.h"
//------------------------------------------------------------------------------
// Default probabilities
diff --git a/src/3rdparty/libwebp/src/enc/vp8enci.h b/src/3rdparty/libwebp/src/enc/vp8i_enc.h
index c1fbd76..93c95ec 100644
--- a/src/3rdparty/libwebp/src/enc/vp8enci.h
+++ b/src/3rdparty/libwebp/src/enc/vp8i_enc.h
@@ -15,10 +15,10 @@
#define WEBP_ENC_VP8ENCI_H_
#include <string.h> // for memcpy()
-#include "../dec/common.h"
+#include "../dec/common_dec.h"
#include "../dsp/dsp.h"
-#include "../utils/bit_writer.h"
-#include "../utils/thread.h"
+#include "../utils/bit_writer_utils.h"
+#include "../utils/thread_utils.h"
#include "../utils/utils.h"
#include "../webp/encode.h"
@@ -31,8 +31,8 @@ extern "C" {
// version numbers
#define ENC_MAJ_VERSION 0
-#define ENC_MIN_VERSION 5
-#define ENC_REV_VERSION 1
+#define ENC_MIN_VERSION 6
+#define ENC_REV_VERSION 0
enum { MAX_LF_LEVELS = 64, // Maximum loop filter level
MAX_VARIABLE_LEVEL = 67, // last (inclusive) level with variable cost
@@ -219,7 +219,6 @@ typedef struct {
// right neighbouring data (samples, predictions, contexts, ...)
typedef struct {
int x_, y_; // current macroblock
- int y_stride_, uv_stride_; // respective strides
uint8_t* yuv_in_; // input samples
uint8_t* yuv_out_; // output samples
uint8_t* yuv_out2_; // secondary buffer swapped with yuv_out_.
@@ -325,9 +324,7 @@ int VP8EmitTokens(VP8TBuffer* const b, VP8BitWriter* const bw,
const uint8_t* const probas, int final_pass);
// record the coding of coefficients without knowing the probabilities yet
-int VP8RecordCoeffTokens(const int ctx, const int coeff_type,
- int first, int last,
- const int16_t* const coeffs,
+int VP8RecordCoeffTokens(int ctx, const struct VP8Residual* const res,
VP8TBuffer* const tokens);
// Estimate the final coded size given a set of 'probas'.
@@ -476,14 +473,6 @@ int VP8EncStartAlpha(VP8Encoder* const enc); // start alpha coding process
int VP8EncFinishAlpha(VP8Encoder* const enc); // finalize compressed data
int VP8EncDeleteAlpha(VP8Encoder* const enc); // delete compressed data
- // in filter.c
-void VP8SSIMAddStats(const VP8DistoStats* const src, VP8DistoStats* const dst);
-void VP8SSIMAccumulatePlane(const uint8_t* src1, int stride1,
- const uint8_t* src2, int stride2,
- int W, int H, VP8DistoStats* const stats);
-double VP8SSIMGet(const VP8DistoStats* const stats);
-double VP8SSIMGetSquaredError(const VP8DistoStats* const stats);
-
// autofilter
void VP8InitFilter(VP8EncIterator* const it);
void VP8StoreFilterStats(VP8EncIterator* const it);
diff --git a/src/3rdparty/libwebp/src/enc/vp8l.c b/src/3rdparty/libwebp/src/enc/vp8l_enc.c
index c16e256..b1a793d 100644
--- a/src/3rdparty/libwebp/src/enc/vp8l.c
+++ b/src/3rdparty/libwebp/src/enc/vp8l_enc.c
@@ -15,17 +15,18 @@
#include <assert.h>
#include <stdlib.h>
-#include "./backward_references.h"
-#include "./histogram.h"
-#include "./vp8enci.h"
-#include "./vp8li.h"
+#include "./backward_references_enc.h"
+#include "./histogram_enc.h"
+#include "./vp8i_enc.h"
+#include "./vp8li_enc.h"
#include "../dsp/lossless.h"
-#include "../utils/bit_writer.h"
-#include "../utils/huffman_encode.h"
+#include "../dsp/lossless_common.h"
+#include "../utils/bit_writer_utils.h"
+#include "../utils/huffman_encode_utils.h"
#include "../utils/utils.h"
#include "../webp/format_constants.h"
-#include "./delta_palettization.h"
+#include "./delta_palettization_enc.h"
#define PALETTE_KEY_RIGHT_SHIFT 22 // Key for 1K buffer.
// Maximum number of histogram images (sub-blocks).
@@ -34,8 +35,8 @@
// Palette reordering for smaller sum of deltas (and for smaller storage).
static int PaletteCompareColorsForQsort(const void* p1, const void* p2) {
- const uint32_t a = WebPMemToUint32(p1);
- const uint32_t b = WebPMemToUint32(p2);
+ const uint32_t a = WebPMemToUint32((uint8_t*)p1);
+ const uint32_t b = WebPMemToUint32((uint8_t*)p2);
assert(a != b);
return (a < b) ? -1 : 1;
}
@@ -163,18 +164,25 @@ typedef enum {
kHistoTotal // Must be last.
} HistoIx;
-static void AddSingleSubGreen(uint32_t p, uint32_t* r, uint32_t* b) {
- const uint32_t green = p >> 8; // The upper bits are masked away later.
+static void AddSingleSubGreen(int p, uint32_t* const r, uint32_t* const b) {
+ const int green = p >> 8; // The upper bits are masked away later.
++r[((p >> 16) - green) & 0xff];
- ++b[(p - green) & 0xff];
+ ++b[((p >> 0) - green) & 0xff];
}
static void AddSingle(uint32_t p,
- uint32_t* a, uint32_t* r, uint32_t* g, uint32_t* b) {
- ++a[p >> 24];
+ uint32_t* const a, uint32_t* const r,
+ uint32_t* const g, uint32_t* const b) {
+ ++a[(p >> 24) & 0xff];
++r[(p >> 16) & 0xff];
- ++g[(p >> 8) & 0xff];
- ++b[(p & 0xff)];
+ ++g[(p >> 8) & 0xff];
+ ++b[(p >> 0) & 0xff];
+}
+
+static WEBP_INLINE uint32_t HashPix(uint32_t pix) {
+  // Note that masking with 0xffffffffu is only there to prevent an
+  // 'unsigned int overflow' warning; it doesn't change the compiled code.
+ return ((((uint64_t)pix + (pix >> 19)) * 0x39c5fba7ull) & 0xffffffffu) >> 24;
}
static int AnalyzeEntropy(const uint32_t* argb,
@@ -214,8 +222,8 @@ static int AnalyzeEntropy(const uint32_t* argb,
&histo[kHistoBluePredSubGreen * 256]);
{
// Approximate the palette by the entropy of the multiplicative hash.
- const int hash = ((pix + (pix >> 19)) * 0x39c5fba7) >> 24;
- ++histo[kHistoPalette * 256 + (hash & 0xff)];
+ const uint32_t hash = HashPix(pix);
+ ++histo[kHistoPalette * 256 + hash];
}
}
prev_row = curr_row;
@@ -224,9 +232,8 @@ static int AnalyzeEntropy(const uint32_t* argb,
{
double entropy_comp[kHistoTotal];
double entropy[kNumEntropyIx];
- EntropyIx k;
- EntropyIx last_mode_to_analyze =
- use_palette ? kPalette : kSpatialSubGreen;
+ int k;
+ int last_mode_to_analyze = use_palette ? kPalette : kSpatialSubGreen;
int j;
// Let's add one zero to the predicted histograms. The zeros are removed
// too efficiently by the pix_diff == 0 comparison, at least one of the
@@ -263,7 +270,7 @@ static int AnalyzeEntropy(const uint32_t* argb,
*min_entropy_ix = kDirect;
for (k = kDirect + 1; k <= last_mode_to_analyze; ++k) {
if (entropy[*min_entropy_ix] > entropy[k]) {
- *min_entropy_ix = k;
+ *min_entropy_ix = (EntropyIx)k;
}
}
*red_and_blue_always_zero = 1;
@@ -312,7 +319,10 @@ static int GetHistoBits(int method, int use_palette, int width, int height) {
static int GetTransformBits(int method, int histo_bits) {
const int max_transform_bits = (method < 4) ? 6 : (method > 4) ? 4 : 5;
- return (histo_bits > max_transform_bits) ? max_transform_bits : histo_bits;
+ const int res =
+ (histo_bits > max_transform_bits) ? max_transform_bits : histo_bits;
+ assert(res <= MAX_TRANSFORM_BITS);
+ return res;
}
static int AnalyzeAndInit(VP8LEncoder* const enc) {
@@ -697,7 +707,7 @@ static WebPEncodingError EncodeImageNoHuffman(VP8LBitWriter* const bw,
VP8LHashChain* const hash_chain,
VP8LBackwardRefs refs_array[2],
int width, int height,
- int quality) {
+ int quality, int low_effort) {
int i;
int max_tokens = 0;
WebPEncodingError err = VP8_ENC_OK;
@@ -715,7 +725,8 @@ static WebPEncodingError EncodeImageNoHuffman(VP8LBitWriter* const bw,
}
// Calculate backward references from ARGB image.
- if (VP8LHashChainFill(hash_chain, quality, argb, width, height) == 0) {
+ if (!VP8LHashChainFill(hash_chain, quality, argb, width, height,
+ low_effort)) {
err = VP8_ENC_ERROR_OUT_OF_MEMORY;
goto Error;
}
@@ -815,11 +826,18 @@ static WebPEncodingError EncodeImageInternal(VP8LBitWriter* const bw,
goto Error;
}
- *cache_bits = use_cache ? MAX_COLOR_CACHE_BITS : 0;
+ if (use_cache) {
+ // If the value is different from zero, it has been set during the
+ // palette analysis.
+ if (*cache_bits == 0) *cache_bits = MAX_COLOR_CACHE_BITS;
+ } else {
+ *cache_bits = 0;
+ }
// 'best_refs' is the reference to the best backward refs and points to one
// of refs_array[0] or refs_array[1].
// Calculate backward references from ARGB image.
- if (VP8LHashChainFill(hash_chain, quality, argb, width, height) == 0) {
+ if (!VP8LHashChainFill(hash_chain, quality, argb, width, height,
+ low_effort)) {
err = VP8_ENC_ERROR_OUT_OF_MEMORY;
goto Error;
}
@@ -900,7 +918,7 @@ static WebPEncodingError EncodeImageInternal(VP8LBitWriter* const bw,
err = EncodeImageNoHuffman(bw, histogram_argb, hash_chain, refs_array,
VP8LSubSampleSize(width, histogram_bits),
VP8LSubSampleSize(height, histogram_bits),
- quality);
+ quality, low_effort);
WebPSafeFree(histogram_argb);
if (err != VP8_ENC_OK) goto Error;
}
@@ -991,12 +1009,12 @@ static WebPEncodingError ApplyPredictFilter(const VP8LEncoder* const enc,
(VP8LHashChain*)&enc->hash_chain_,
(VP8LBackwardRefs*)enc->refs_, // cast const away
transform_width, transform_height,
- quality);
+ quality, low_effort);
}
static WebPEncodingError ApplyCrossColorFilter(const VP8LEncoder* const enc,
int width, int height,
- int quality,
+ int quality, int low_effort,
VP8LBitWriter* const bw) {
const int ccolor_transform_bits = enc->transform_bits_;
const int transform_width = VP8LSubSampleSize(width, ccolor_transform_bits);
@@ -1012,7 +1030,7 @@ static WebPEncodingError ApplyCrossColorFilter(const VP8LEncoder* const enc,
(VP8LHashChain*)&enc->hash_chain_,
(VP8LBackwardRefs*)enc->refs_, // cast const away
transform_width, transform_height,
- quality);
+ quality, low_effort);
}
// -----------------------------------------------------------------------------
@@ -1157,7 +1175,8 @@ static WebPEncodingError MakeInputImageCopy(VP8LEncoder* const enc) {
// -----------------------------------------------------------------------------
-static int SearchColor(const uint32_t sorted[], uint32_t color, int hi) {
+static WEBP_INLINE int SearchColorNoIdx(const uint32_t sorted[], uint32_t color,
+ int hi) {
int low = 0;
if (sorted[low] == color) return low; // loop invariant: sorted[low] != color
while (1) {
@@ -1172,35 +1191,68 @@ static int SearchColor(const uint32_t sorted[], uint32_t color, int hi) {
}
}
+#define APPLY_PALETTE_GREEDY_MAX 4
+
+static WEBP_INLINE uint32_t SearchColorGreedy(const uint32_t palette[],
+ int palette_size,
+ uint32_t color) {
+ (void)palette_size;
+ assert(palette_size < APPLY_PALETTE_GREEDY_MAX);
+ assert(3 == APPLY_PALETTE_GREEDY_MAX - 1);
+ if (color == palette[0]) return 0;
+ if (color == palette[1]) return 1;
+ if (color == palette[2]) return 2;
+ return 3;
+}
+
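
For palettes of fewer than APPLY_PALETTE_GREEDY_MAX colors, the linear probe above is already optimal: every pixel being remapped is known to be in the palette, so at most three comparisons resolve the index and the trailing 'return 3' is never taken for valid input. For example:

    /* palette_size == 3, pixel equals the third entry: */
    const uint32_t palette3[3] = { 0xff000000u, 0xffff0000u, 0xff00ff00u };
    SearchColorGreedy(palette3, 3, 0xff00ff00u);   /* -> 2 */
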
+static WEBP_INLINE uint32_t ApplyPaletteHash0(uint32_t color) {
+ // Focus on the green color.
+ return (color >> 8) & 0xff;
+}
+
+#define PALETTE_INV_SIZE_BITS 11
+#define PALETTE_INV_SIZE (1 << PALETTE_INV_SIZE_BITS)
+
+static WEBP_INLINE uint32_t ApplyPaletteHash1(uint32_t color) {
+ // Forget about alpha.
+ return ((color & 0x00ffffffu) * 4222244071u) >> (32 - PALETTE_INV_SIZE_BITS);
+}
+
+static WEBP_INLINE uint32_t ApplyPaletteHash2(uint32_t color) {
+ // Forget about alpha.
+ return (color & 0x00ffffffu) * ((1u << 31) - 1) >>
+ (32 - PALETTE_INV_SIZE_BITS);
+}
+
// Sort palette in increasing order and prepare an inverse mapping array.
static void PrepareMapToPalette(const uint32_t palette[], int num_colors,
- uint32_t sorted[], int idx_map[]) {
+ uint32_t sorted[], uint32_t idx_map[]) {
int i;
memcpy(sorted, palette, num_colors * sizeof(*sorted));
qsort(sorted, num_colors, sizeof(*sorted), PaletteCompareColorsForQsort);
for (i = 0; i < num_colors; ++i) {
- idx_map[SearchColor(sorted, palette[i], num_colors)] = i;
+ idx_map[SearchColorNoIdx(sorted, palette[i], num_colors)] = i;
}
}
-static void MapToPalette(const uint32_t sorted_palette[], int num_colors,
- uint32_t* const last_pix, int* const last_idx,
- const int idx_map[],
- const uint32_t* src, uint8_t* dst, int width) {
- int x;
- int prev_idx = *last_idx;
- uint32_t prev_pix = *last_pix;
- for (x = 0; x < width; ++x) {
- const uint32_t pix = src[x];
- if (pix != prev_pix) {
- prev_idx = idx_map[SearchColor(sorted_palette, pix, num_colors)];
- prev_pix = pix;
- }
- dst[x] = prev_idx;
- }
- *last_idx = prev_idx;
- *last_pix = prev_pix;
-}
+// Use 1 pixel cache for ARGB pixels.
+#define APPLY_PALETTE_FOR(COLOR_INDEX) do { \
+ uint32_t prev_pix = palette[0]; \
+ uint32_t prev_idx = 0; \
+ for (y = 0; y < height; ++y) { \
+ for (x = 0; x < width; ++x) { \
+ const uint32_t pix = src[x]; \
+ if (pix != prev_pix) { \
+ prev_idx = COLOR_INDEX; \
+ prev_pix = pix; \
+ } \
+ tmp_row[x] = prev_idx; \
+ } \
+ VP8LBundleColorMap(tmp_row, width, xbits, dst); \
+ src += src_stride; \
+ dst += dst_stride; \
+ } \
+} while (0)
// Remap argb values in src[] to packed palettes entries in dst[]
// using 'row' as a temporary buffer of size 'width'.
@@ -1213,52 +1265,59 @@ static WebPEncodingError ApplyPalette(const uint32_t* src, uint32_t src_stride,
// TODO(skal): this tmp buffer is not needed if VP8LBundleColorMap() can be
// made to work in-place.
uint8_t* const tmp_row = (uint8_t*)WebPSafeMalloc(width, sizeof(*tmp_row));
- int i, x, y;
- int use_LUT = 1;
+ int x, y;
if (tmp_row == NULL) return VP8_ENC_ERROR_OUT_OF_MEMORY;
- for (i = 0; i < palette_size; ++i) {
- if ((palette[i] & 0xffff00ffu) != 0) {
- use_LUT = 0;
- break;
- }
- }
- if (use_LUT) {
- uint8_t inv_palette[MAX_PALETTE_SIZE] = { 0 };
- for (i = 0; i < palette_size; ++i) {
- const int color = (palette[i] >> 8) & 0xff;
- inv_palette[color] = i;
- }
- for (y = 0; y < height; ++y) {
- for (x = 0; x < width; ++x) {
- const int color = (src[x] >> 8) & 0xff;
- tmp_row[x] = inv_palette[color];
+ if (palette_size < APPLY_PALETTE_GREEDY_MAX) {
+ APPLY_PALETTE_FOR(SearchColorGreedy(palette, palette_size, pix));
+ } else {
+ int i, j;
+ uint16_t buffer[PALETTE_INV_SIZE];
+ uint32_t (*const hash_functions[])(uint32_t) = {
+ ApplyPaletteHash0, ApplyPaletteHash1, ApplyPaletteHash2
+ };
+
+ // Try to find a perfect hash function able to go from a color to an index
+ // within 1 << PALETTE_INV_SIZE_BITS in order to build a hash map to go
+ // from color to index in palette.
+ for (i = 0; i < 3; ++i) {
+ int use_LUT = 1;
+ // Set each element in buffer to max uint16_t.
+ memset(buffer, 0xff, sizeof(buffer));
+ for (j = 0; j < palette_size; ++j) {
+ const uint32_t ind = hash_functions[i](palette[j]);
+ if (buffer[ind] != 0xffffu) {
+ use_LUT = 0;
+ break;
+ } else {
+ buffer[ind] = j;
+ }
}
- VP8LBundleColorMap(tmp_row, width, xbits, dst);
- src += src_stride;
- dst += dst_stride;
+ if (use_LUT) break;
}
- } else {
- // Use 1 pixel cache for ARGB pixels.
- uint32_t last_pix;
- int last_idx;
- uint32_t sorted[MAX_PALETTE_SIZE];
- int idx_map[MAX_PALETTE_SIZE];
- PrepareMapToPalette(palette, palette_size, sorted, idx_map);
- last_pix = palette[0];
- last_idx = 0;
- for (y = 0; y < height; ++y) {
- MapToPalette(sorted, palette_size, &last_pix, &last_idx,
- idx_map, src, tmp_row, width);
- VP8LBundleColorMap(tmp_row, width, xbits, dst);
- src += src_stride;
- dst += dst_stride;
+
+ if (i == 0) {
+ APPLY_PALETTE_FOR(buffer[ApplyPaletteHash0(pix)]);
+ } else if (i == 1) {
+ APPLY_PALETTE_FOR(buffer[ApplyPaletteHash1(pix)]);
+ } else if (i == 2) {
+ APPLY_PALETTE_FOR(buffer[ApplyPaletteHash2(pix)]);
+ } else {
+ uint32_t idx_map[MAX_PALETTE_SIZE];
+ uint32_t palette_sorted[MAX_PALETTE_SIZE];
+ PrepareMapToPalette(palette, palette_size, palette_sorted, idx_map);
+ APPLY_PALETTE_FOR(
+ idx_map[SearchColorNoIdx(palette_sorted, pix, palette_size)]);
}
}
WebPSafeFree(tmp_row);
return VP8_ENC_OK;
}
+#undef APPLY_PALETTE_FOR
+#undef PALETTE_INV_SIZE_BITS
+#undef PALETTE_INV_SIZE
+#undef APPLY_PALETTE_GREEDY_MAX
// Note: Expects "enc->palette_" to be set properly.
static WebPEncodingError MapImageFromPalette(VP8LEncoder* const enc,
@@ -1291,7 +1350,7 @@ static WebPEncodingError MapImageFromPalette(VP8LEncoder* const enc,
}
// Save palette_[] to bitstream.
-static WebPEncodingError EncodePalette(VP8LBitWriter* const bw,
+static WebPEncodingError EncodePalette(VP8LBitWriter* const bw, int low_effort,
VP8LEncoder* const enc) {
int i;
uint32_t tmp_palette[MAX_PALETTE_SIZE];
@@ -1306,13 +1365,14 @@ static WebPEncodingError EncodePalette(VP8LBitWriter* const bw,
}
tmp_palette[0] = palette[0];
return EncodeImageNoHuffman(bw, tmp_palette, &enc->hash_chain_, enc->refs_,
- palette_size, 1, 20 /* quality */);
+ palette_size, 1, 20 /* quality */, low_effort);
}
#ifdef WEBP_EXPERIMENTAL_FEATURES
static WebPEncodingError EncodeDeltaPalettePredictorImage(
- VP8LBitWriter* const bw, VP8LEncoder* const enc, int quality) {
+ VP8LBitWriter* const bw, VP8LEncoder* const enc, int quality,
+ int low_effort) {
const WebPPicture* const pic = enc->pic_;
const int width = pic->width;
const int height = pic->height;
@@ -1343,7 +1403,7 @@ static WebPEncodingError EncodeDeltaPalettePredictorImage(
err = EncodeImageNoHuffman(bw, predictors, &enc->hash_chain_,
(VP8LBackwardRefs*)enc->refs_, // cast const away
transform_width, transform_height,
- quality);
+ quality, low_effort);
WebPSafeFree(predictors);
return err;
}
@@ -1394,7 +1454,7 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
int use_near_lossless = 0;
int hdr_size = 0;
int data_size = 0;
- int use_delta_palettization = 0;
+ int use_delta_palette = 0;
if (enc == NULL) {
err = VP8_ENC_ERROR_OUT_OF_MEMORY;
@@ -1421,7 +1481,7 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
}
#ifdef WEBP_EXPERIMENTAL_FEATURES
- if (config->delta_palettization) {
+ if (config->use_delta_palette) {
enc->use_predict_ = 1;
enc->use_cross_color_ = 0;
enc->use_subtract_green_ = 0;
@@ -1433,21 +1493,25 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
if (enc->use_palette_) {
err = AllocateTransformBuffer(enc, width, height);
if (err != VP8_ENC_OK) goto Error;
- err = EncodeDeltaPalettePredictorImage(bw, enc, quality);
+ err = EncodeDeltaPalettePredictorImage(bw, enc, quality, low_effort);
if (err != VP8_ENC_OK) goto Error;
- use_delta_palettization = 1;
+ use_delta_palette = 1;
}
}
#endif // WEBP_EXPERIMENTAL_FEATURES
// Encode palette
if (enc->use_palette_) {
- err = EncodePalette(bw, enc);
+ err = EncodePalette(bw, low_effort, enc);
if (err != VP8_ENC_OK) goto Error;
- err = MapImageFromPalette(enc, use_delta_palettization);
+ err = MapImageFromPalette(enc, use_delta_palette);
if (err != VP8_ENC_OK) goto Error;
+    // If using a color cache, do not let it be bigger than the number of
+    // colors.
+ if (use_cache && enc->palette_size_ < (1 << MAX_COLOR_CACHE_BITS)) {
+ enc->cache_bits_ = BitsLog2Floor(enc->palette_size_) + 1;
+ }
}
- if (!use_delta_palettization) {
+ if (!use_delta_palette) {
// In case image is not packed.
if (enc->argb_ == NULL) {
err = MakeInputImageCopy(enc);
@@ -1469,7 +1533,7 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
if (enc->use_cross_color_) {
err = ApplyCrossColorFilter(enc, enc->current_width_,
- height, quality, bw);
+ height, quality, low_effort, bw);
if (err != VP8_ENC_OK) goto Error;
}
}
diff --git a/src/3rdparty/libwebp/src/enc/vp8li.h b/src/3rdparty/libwebp/src/enc/vp8li_enc.h
index 371e276..8c5fbcb 100644
--- a/src/3rdparty/libwebp/src/enc/vp8li.h
+++ b/src/3rdparty/libwebp/src/enc/vp8li_enc.h
@@ -14,9 +14,9 @@
#ifndef WEBP_ENC_VP8LI_H_
#define WEBP_ENC_VP8LI_H_
-#include "./backward_references.h"
-#include "./histogram.h"
-#include "../utils/bit_writer.h"
+#include "./backward_references_enc.h"
+#include "./histogram_enc.h"
+#include "../utils/bit_writer_utils.h"
#include "../webp/encode.h"
#include "../webp/format_constants.h"
@@ -24,6 +24,9 @@
extern "C" {
#endif
+// maximum value of transform_bits_ in VP8LEncoder.
+#define MAX_TRANSFORM_BITS 6
+
typedef struct {
const WebPConfig* config_; // user configuration and parameters
const WebPPicture* pic_; // input picture.
@@ -39,7 +42,7 @@ typedef struct {
// Encoding parameters derived from quality parameter.
int histo_bits_;
- int transform_bits_;
+ int transform_bits_; // <= MAX_TRANSFORM_BITS.
int cache_bits_; // If equal to 0, don't use color cache.
// Encoding parameters derived from image characteristics.
@@ -73,6 +76,17 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
VP8LBitWriter* const bw, int use_cache);
//------------------------------------------------------------------------------
+// Image transforms in predictor.c.
+
+void VP8LResidualImage(int width, int height, int bits, int low_effort,
+ uint32_t* const argb, uint32_t* const argb_scratch,
+ uint32_t* const image, int near_lossless, int exact,
+ int used_subtract_green);
+
+void VP8LColorSpaceTransform(int width, int height, int bits, int quality,
+ uint32_t* const argb, uint32_t* image);
+
+//------------------------------------------------------------------------------
#ifdef __cplusplus
} // extern "C"
diff --git a/src/3rdparty/libwebp/src/enc/webpenc.c b/src/3rdparty/libwebp/src/enc/webp_enc.c
index a7d04ea..f18461e 100644
--- a/src/3rdparty/libwebp/src/enc/webpenc.c
+++ b/src/3rdparty/libwebp/src/enc/webp_enc.c
@@ -16,9 +16,9 @@
#include <string.h>
#include <math.h>
-#include "./cost.h"
-#include "./vp8enci.h"
-#include "./vp8li.h"
+#include "./cost_enc.h"
+#include "./vp8i_enc.h"
+#include "./vp8li_enc.h"
#include "../utils/utils.h"
// #define PRINT_MEMORY_INFO
@@ -75,7 +75,7 @@ static void ResetBoundaryPredictions(VP8Encoder* const enc) {
//-------------------+---+---+---+---+---+---+---+
// dynamic proba | ~ | x | x | x | x | x | x |
//-------------------+---+---+---+---+---+---+---+
-// fast mode analysis| | | | | x | x | x |
+// fast mode analysis|[x]|[x]| | | x | x | x |
//-------------------+---+---+---+---+---+---+---+
// basic rd-opt | | | | x | x | x | x |
//-------------------+---+---+---+---+---+---+---+
@@ -315,18 +315,21 @@ int WebPReportProgress(const WebPPicture* const pic,
int WebPEncode(const WebPConfig* config, WebPPicture* pic) {
int ok = 0;
+ if (pic == NULL) return 0;
- if (pic == NULL)
- return 0;
WebPEncodingSetError(pic, VP8_ENC_OK); // all ok so far
- if (config == NULL) // bad params
+ if (config == NULL) { // bad params
return WebPEncodingSetError(pic, VP8_ENC_ERROR_NULL_PARAMETER);
- if (!WebPValidateConfig(config))
+ }
+ if (!WebPValidateConfig(config)) {
return WebPEncodingSetError(pic, VP8_ENC_ERROR_INVALID_CONFIGURATION);
- if (pic->width <= 0 || pic->height <= 0)
+ }
+ if (pic->width <= 0 || pic->height <= 0) {
return WebPEncodingSetError(pic, VP8_ENC_ERROR_BAD_DIMENSION);
- if (pic->width > WEBP_MAX_DIMENSION || pic->height > WEBP_MAX_DIMENSION)
+ }
+ if (pic->width > WEBP_MAX_DIMENSION || pic->height > WEBP_MAX_DIMENSION) {
return WebPEncodingSetError(pic, VP8_ENC_ERROR_BAD_DIMENSION);
+ }
if (pic->stats != NULL) memset(pic->stats, 0, sizeof(*pic->stats));
@@ -339,8 +342,8 @@ int WebPEncode(const WebPConfig* config, WebPPicture* pic) {
if (pic->use_argb || pic->y == NULL || pic->u == NULL || pic->v == NULL) {
// Make sure we have YUVA samples.
- if (config->preprocessing & 4) {
- if (!WebPPictureSmartARGBToYUVA(pic)) {
+ if (config->use_sharp_yuv || (config->preprocessing & 4)) {
+ if (!WebPPictureSharpARGBToYUVA(pic)) {
return 0;
}
} else {
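
For context, the guards rewritten above form the parameter-validation chain at the public entry point. A minimal, purely illustrative call that exercises it (the helper name and settings are hypothetical; the WebP calls are the public API from webp/encode.h):

    #include <stddef.h>
    #include "webp/encode.h"

    static int EncodeRGBA(const uint8_t* rgba, int w, int h, int stride,
                          uint8_t** out, size_t* out_size) {
      WebPConfig config;
      WebPPicture pic;
      WebPMemoryWriter wrt;
      int ok;
      if (!WebPConfigInit(&config) || !WebPPictureInit(&pic)) return 0;
      config.quality = 75;
      pic.width = w;
      pic.height = h;
      pic.use_argb = 1;
      if (!WebPPictureImportRGBA(&pic, rgba, stride)) return 0;
      WebPMemoryWriterInit(&wrt);
      pic.writer = WebPMemoryWrite;
      pic.custom_ptr = &wrt;
      ok = WebPEncode(&config, &pic);   /* runs the checks patched above */
      WebPPictureFree(&pic);
      if (!ok) {
        WebPMemoryWriterClear(&wrt);
        return 0;
      }
      *out = wrt.mem;
      *out_size = wrt.size;
      return 1;
    }
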
diff --git a/src/3rdparty/libwebp/src/extras/extras.c b/src/3rdparty/libwebp/src/extras/extras.c
deleted file mode 100644
index ca32fbc..0000000
--- a/src/3rdparty/libwebp/src/extras/extras.c
+++ /dev/null
@@ -1,111 +0,0 @@
-// Copyright 2015 Google Inc. All Rights Reserved.
-//
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
-// -----------------------------------------------------------------------------
-//
-// Additional WebP utilities.
-//
-
-#include "../webp/extras.h"
-
-#include <string.h>
-
-#define XTRA_MAJ_VERSION 0
-#define XTRA_MIN_VERSION 0
-#define XTRA_REV_VERSION 0
-
-//------------------------------------------------------------------------------
-
-int WebPGetExtrasVersion(void) {
- return (XTRA_MAJ_VERSION << 16) | (XTRA_MIN_VERSION << 8) | XTRA_REV_VERSION;
-}
-
-//------------------------------------------------------------------------------
-
-int WebPImportGray(const uint8_t* gray_data, WebPPicture* pic) {
- int y, width, uv_width;
- if (pic == NULL || gray_data == NULL) return 0;
- pic->colorspace = WEBP_YUV420;
- if (!WebPPictureAlloc(pic)) return 0;
- width = pic->width;
- uv_width = (width + 1) >> 1;
- for (y = 0; y < pic->height; ++y) {
- memcpy(pic->y + y * pic->y_stride, gray_data, width);
- gray_data += width; // <- we could use some 'data_stride' here if needed
- if ((y & 1) == 0) {
- memset(pic->u + (y >> 1) * pic->uv_stride, 128, uv_width);
- memset(pic->v + (y >> 1) * pic->uv_stride, 128, uv_width);
- }
- }
- return 1;
-}
-
-int WebPImportRGB565(const uint8_t* rgb565, WebPPicture* pic) {
- int x, y;
- if (pic == NULL || rgb565 == NULL) return 0;
- pic->colorspace = WEBP_YUV420;
- pic->use_argb = 1;
- if (!WebPPictureAlloc(pic)) return 0;
- for (y = 0; y < pic->height; ++y) {
- const int width = pic->width;
- uint32_t* dst = pic->argb + y * pic->argb_stride;
- for (x = 0; x < width; ++x) {
-#ifdef WEBP_SWAP_16BIT_CSP
- const uint32_t rg = rgb565[2 * x + 1];
- const uint32_t gb = rgb565[2 * x + 0];
-#else
- const uint32_t rg = rgb565[2 * x + 0];
- const uint32_t gb = rgb565[2 * x + 1];
-#endif
- uint32_t r = rg & 0xf8;
- uint32_t g = ((rg << 5) | (gb >> 3)) & 0xfc;
- uint32_t b = (gb << 5);
- // dithering
- r = r | (r >> 5);
- g = g | (g >> 6);
- b = b | (b >> 5);
- dst[x] = (r << 16) | (g << 8) | b;
- }
- rgb565 += 2 * width;
- }
- return 1;
-}
-
-int WebPImportRGB4444(const uint8_t* rgb4444, WebPPicture* pic) {
- int x, y;
- if (pic == NULL || rgb4444 == NULL) return 0;
- pic->colorspace = WEBP_YUV420;
- pic->use_argb = 1;
- if (!WebPPictureAlloc(pic)) return 0;
- for (y = 0; y < pic->height; ++y) {
- const int width = pic->width;
- uint32_t* dst = pic->argb + y * pic->argb_stride;
- for (x = 0; x < width; ++x) {
-#ifdef WEBP_SWAP_16BIT_CSP
- const uint32_t rg = rgb4444[2 * x + 1];
- const uint32_t ba = rgb4444[2 * x + 0];
-#else
- const uint32_t rg = rgb4444[2 * x + 0];
- const uint32_t ba = rgb4444[2 * x + 1];
-#endif
- uint32_t r = rg & 0xf0;
- uint32_t g = (rg << 4);
- uint32_t b = (ba & 0xf0);
- uint32_t a = (ba << 4);
- // dithering
- r = r | (r >> 4);
- g = g | (g >> 4);
- b = b | (b >> 4);
- a = a | (a >> 4);
- dst[x] = (a << 24) | (r << 16) | (g << 8) | b;
- }
- rgb4444 += 2 * width;
- }
- return 1;
-}
-
-//------------------------------------------------------------------------------
diff --git a/src/3rdparty/libwebp/src/mux/anim_encode.c b/src/3rdparty/libwebp/src/mux/anim_encode.c
index 53e2906..6066388 100644
--- a/src/3rdparty/libwebp/src/mux/anim_encode.c
+++ b/src/3rdparty/libwebp/src/mux/anim_encode.c
@@ -16,6 +16,7 @@
#include <stdio.h>
#include <stdlib.h> // for abs()
+#include "../mux/animi.h"
#include "../utils/utils.h"
#include "../webp/decode.h"
#include "../webp/encode.h"
@@ -128,14 +129,13 @@ static void SanitizeEncoderOptions(WebPAnimEncoderOptions* const enc_options) {
DisableKeyframes(enc_options);
}
- if (enc_options->kmin <= 0) {
- DisableKeyframes(enc_options);
- print_warning = 0;
- }
- if (enc_options->kmax <= 0) { // All frames will be key-frames.
+ if (enc_options->kmax == 1) { // All frames will be key-frames.
enc_options->kmin = 0;
enc_options->kmax = 0;
return;
+ } else if (enc_options->kmax <= 0) {
+ DisableKeyframes(enc_options);
+ print_warning = 0;
}
if (enc_options->kmin >= enc_options->kmax) {
@@ -378,10 +378,10 @@ static WEBP_INLINE int PixelsAreSimilar(uint32_t src, uint32_t dst,
const int dst_g = (dst >> 8) & 0xff;
const int dst_b = (dst >> 0) & 0xff;
- return (abs(src_r * src_a - dst_r * dst_a) <= (max_allowed_diff * 255)) &&
- (abs(src_g * src_a - dst_g * dst_a) <= (max_allowed_diff * 255)) &&
- (abs(src_b * src_a - dst_b * dst_a) <= (max_allowed_diff * 255)) &&
- (abs(src_a - dst_a) <= max_allowed_diff);
+ return (src_a == dst_a) &&
+ (abs(src_r - dst_r) * dst_a <= (max_allowed_diff * 255)) &&
+ (abs(src_g - dst_g) * dst_a <= (max_allowed_diff * 255)) &&
+ (abs(src_b - dst_b) * dst_a <= (max_allowed_diff * 255));
}
// Returns true if 'length' number of pixels in 'src' and 'dst' are within an
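
The rewritten PixelsAreSimilar now requires exactly equal alpha values and scales each channel difference by that shared alpha, instead of comparing premultiplied channels across two different alphas. A worked check with max_allowed_diff == 1:

    /* src = ARGB(255, 100, 0, 0), dst = ARGB(255, 101, 0, 0):
       alphas match and |100 - 101| * 255 == 255 <= 1 * 255. */
    assert(PixelsAreSimilar(0xff640000u, 0xff650000u, 1) == 1);
    /* Lowering dst's alpha to 254 now fails the src_a == dst_a test: */
    assert(PixelsAreSimilar(0xff640000u, 0xfe650000u, 1) == 0);
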
@@ -586,6 +586,39 @@ static int GetSubRects(const WebPPicture* const prev_canvas,
&params->rect_lossy_, &params->sub_frame_lossy_);
}
+static WEBP_INLINE int clip(int v, int min_v, int max_v) {
+ return (v < min_v) ? min_v : (v > max_v) ? max_v : v;
+}
+
+int WebPAnimEncoderRefineRect(
+ const WebPPicture* const prev_canvas, const WebPPicture* const curr_canvas,
+ int is_lossless, float quality, int* const x_offset, int* const y_offset,
+ int* const width, int* const height) {
+ FrameRect rect;
+ const int right = clip(*x_offset + *width, 0, curr_canvas->width);
+ const int left = clip(*x_offset, 0, curr_canvas->width - 1);
+ const int bottom = clip(*y_offset + *height, 0, curr_canvas->height);
+ const int top = clip(*y_offset, 0, curr_canvas->height - 1);
+ if (prev_canvas == NULL || curr_canvas == NULL ||
+ prev_canvas->width != curr_canvas->width ||
+ prev_canvas->height != curr_canvas->height ||
+ !prev_canvas->use_argb || !curr_canvas->use_argb) {
+ return 0;
+ }
+ rect.x_offset_ = left;
+ rect.y_offset_ = top;
+ rect.width_ = clip(right - left, 0, curr_canvas->width - rect.x_offset_);
+ rect.height_ = clip(bottom - top, 0, curr_canvas->height - rect.y_offset_);
+ MinimizeChangeRectangle(prev_canvas, curr_canvas, &rect, is_lossless,
+ quality);
+ SnapToEvenOffsets(&rect);
+ *x_offset = rect.x_offset_;
+ *y_offset = rect.y_offset_;
+ *width = rect.width_;
+ *height = rect.height_;
+ return 1;
+}
+
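
SnapToEvenOffsets (defined earlier in this file) is needed because the WebP container stores ANMF frame offsets divided by two, so they must be even. Under that assumption it presumably absorbs an odd offset into the rectangle's size rather than shifting content, along these lines:

    /* Sketch: grow the rectangle by one pixel where the offset is odd,
       then round the offset down to even. */
    rect->width_  += (rect->x_offset_ & 1);
    rect->height_ += (rect->y_offset_ & 1);
    rect->x_offset_ &= ~1;
    rect->y_offset_ &= ~1;
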
static void DisposeFrameRectangle(int dispose_method,
const FrameRect* const rect,
WebPPicture* const curr_canvas) {
@@ -829,8 +862,8 @@ static WebPEncodingError GenerateCandidates(
WebPPicture* const curr_canvas = &enc->curr_canvas_copy_;
const WebPPicture* const prev_canvas =
is_dispose_none ? &enc->prev_canvas_ : &enc->prev_canvas_disposed_;
- int use_blending_ll;
- int use_blending_lossy;
+ int use_blending_ll, use_blending_lossy;
+ int evaluate_ll, evaluate_lossy;
CopyCurrentCanvas(enc);
use_blending_ll =
@@ -843,16 +876,19 @@ static WebPEncodingError GenerateCandidates(
// Pick candidates to be tried.
if (!enc->options_.allow_mixed) {
- candidate_ll->evaluate_ = is_lossless;
- candidate_lossy->evaluate_ = !is_lossless;
+ evaluate_ll = is_lossless;
+ evaluate_lossy = !is_lossless;
+ } else if (enc->options_.minimize_size) {
+ evaluate_ll = 1;
+ evaluate_lossy = 1;
} else { // Use a heuristic for trying lossless and/or lossy compression.
const int num_colors = WebPGetColorPalette(&params->sub_frame_ll_, NULL);
- candidate_ll->evaluate_ = (num_colors < MAX_COLORS_LOSSLESS);
- candidate_lossy->evaluate_ = (num_colors >= MIN_COLORS_LOSSY);
+ evaluate_ll = (num_colors < MAX_COLORS_LOSSLESS);
+ evaluate_lossy = (num_colors >= MIN_COLORS_LOSSY);
}
// Generate candidates.
- if (candidate_ll->evaluate_) {
+ if (evaluate_ll) {
CopyCurrentCanvas(enc);
if (use_blending_ll) {
enc->curr_canvas_copy_modified_ =
@@ -862,7 +898,7 @@ static WebPEncodingError GenerateCandidates(
config_ll, use_blending_ll, candidate_ll);
if (error_code != VP8_ENC_OK) return error_code;
}
- if (candidate_lossy->evaluate_) {
+ if (evaluate_lossy) {
CopyCurrentCanvas(enc);
if (use_blending_lossy) {
enc->curr_canvas_copy_modified_ =
@@ -1029,6 +1065,8 @@ static WebPEncodingError SetFrame(WebPAnimEncoder* const enc,
const WebPPicture* const prev_canvas = &enc->prev_canvas_;
Candidate candidates[CANDIDATE_COUNT];
const int is_lossless = config->lossless;
+ const int consider_lossless = is_lossless || enc->options_.allow_mixed;
+ const int consider_lossy = !is_lossless || enc->options_.allow_mixed;
const int is_first_frame = enc->is_first_frame_;
// First frame cannot be skipped as there is no 'previous frame' to merge it
@@ -1066,9 +1104,7 @@ static WebPEncodingError SetFrame(WebPAnimEncoder* const enc,
return VP8_ENC_ERROR_INVALID_CONFIGURATION;
}
- for (i = 0; i < CANDIDATE_COUNT; ++i) {
- candidates[i].evaluate_ = 0;
- }
+ memset(candidates, 0, sizeof(candidates));
// Change-rectangle assuming previous frame was DISPOSE_NONE.
if (!GetSubRects(prev_canvas, curr_canvas, is_key_frame, is_first_frame,
@@ -1077,8 +1113,8 @@ static WebPEncodingError SetFrame(WebPAnimEncoder* const enc,
goto Err;
}
- if ((is_lossless && IsEmptyRect(&dispose_none_params.rect_ll_)) ||
- (!is_lossless && IsEmptyRect(&dispose_none_params.rect_lossy_))) {
+ if ((consider_lossless && IsEmptyRect(&dispose_none_params.rect_ll_)) ||
+ (consider_lossy && IsEmptyRect(&dispose_none_params.rect_lossy_))) {
// Don't encode the frame at all. Instead, the duration of the previous
// frame will be increased later.
assert(empty_rect_allowed_none);
@@ -1187,16 +1223,20 @@ static int CacheFrame(WebPAnimEncoder* const enc,
enc->prev_candidate_undecided_ = 0;
} else {
int64_t curr_delta;
+ FrameRect prev_rect_key, prev_rect_sub;
// Add this as a frame rectangle to enc.
error_code = SetFrame(enc, config, 0, encoded_frame, &frame_skipped);
if (error_code != VP8_ENC_OK) goto End;
if (frame_skipped) goto Skip;
+ prev_rect_sub = enc->prev_rect_;
+
// Add this as a key-frame to enc, too.
error_code = SetFrame(enc, config, 1, encoded_frame, &frame_skipped);
if (error_code != VP8_ENC_OK) goto End;
assert(frame_skipped == 0); // Key-frame cannot be an empty rectangle.
+ prev_rect_key = enc->prev_rect_;
// Analyze size difference of the two variants.
curr_delta = KeyFramePenalty(encoded_frame);
@@ -1207,11 +1247,13 @@ static int CacheFrame(WebPAnimEncoder* const enc,
old_keyframe->is_key_frame_ = 0;
}
encoded_frame->is_key_frame_ = 1;
+ enc->prev_candidate_undecided_ = 1;
enc->keyframe_ = (int)position;
enc->best_delta_ = curr_delta;
enc->flush_count_ = enc->count_ - 1; // We can flush previous frames.
} else {
encoded_frame->is_key_frame_ = 0;
+ enc->prev_candidate_undecided_ = 0;
}
// Note: We need '>=' below because when kmin and kmax are both zero,
// count_since_key_frame will always be > kmax.
@@ -1221,7 +1263,10 @@ static int CacheFrame(WebPAnimEncoder* const enc,
enc->keyframe_ = KEYFRAME_NONE;
enc->best_delta_ = DELTA_INFINITY;
}
- enc->prev_candidate_undecided_ = 1;
+ if (!enc->prev_candidate_undecided_) {
+ enc->prev_rect_ =
+ encoded_frame->is_key_frame_ ? prev_rect_key : prev_rect_sub;
+ }
}
}
diff --git a/src/3rdparty/libwebp/src/mux/animi.h b/src/3rdparty/libwebp/src/mux/animi.h
new file mode 100644
index 0000000..cecaf1f
--- /dev/null
+++ b/src/3rdparty/libwebp/src/mux/animi.h
@@ -0,0 +1,43 @@
+// Copyright 2016 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Internal header for animation related functions.
+//
+// Author: Hui Su (huisu@google.com)
+
+#ifndef WEBP_MUX_ANIMI_H_
+#define WEBP_MUX_ANIMI_H_
+
+#include "../webp/mux.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Picks the optimal rectangle between two pictures, starting with initial
+// values of offsets and dimensions that are passed in. The initial
+// values will be clipped, if necessary, to make sure the rectangle is
+// within the canvas. "use_argb" must be true for both pictures.
+// Parameters:
+// prev_canvas, curr_canvas - (in) two input pictures to compare.
+// is_lossless, quality - (in) encoding settings.
+// x_offset, y_offset, width, height - (in/out) rectangle between the two
+// input pictures.
+// Returns true on success.
+int WebPAnimEncoderRefineRect(
+ const struct WebPPicture* const prev_canvas,
+ const struct WebPPicture* const curr_canvas,
+ int is_lossless, float quality, int* const x_offset, int* const y_offset,
+ int* const width, int* const height);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif /* WEBP_MUX_ANIMI_H_ */
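For orientation, here is a minimal sketch of how this new internal helper could be driven, inferred purely from the declaration above; the full-canvas starting rectangle and the quality value are illustrative assumptions, not taken from the patch:

  // Hypothetical caller (not part of this patch): refine a change rectangle,
  // starting from the whole canvas. Both pictures must have use_argb == 1.
  static void RefineWholeCanvas(const struct WebPPicture* const prev_canvas,
                                const struct WebPPicture* const curr_canvas,
                                int canvas_width, int canvas_height) {
    int x = 0, y = 0, w = canvas_width, h = canvas_height;  // initial guess
    if (WebPAnimEncoderRefineRect(prev_canvas, curr_canvas,
                                  1 /*is_lossless*/, 75.0f /*quality*/,
                                  &x, &y, &w, &h)) {
      // (x, y, w, h) now holds the refined rectangle, clipped to the canvas.
    }
  }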
diff --git a/src/3rdparty/libwebp/src/mux/muxedit.c b/src/3rdparty/libwebp/src/mux/muxedit.c
index 9bbed42..d2c5305 100644
--- a/src/3rdparty/libwebp/src/mux/muxedit.c
+++ b/src/3rdparty/libwebp/src/mux/muxedit.c
@@ -93,34 +93,32 @@ static WebPMuxError MuxSet(WebPMux* const mux, uint32_t tag, uint32_t nth,
}
#undef SWITCH_ID_LIST
-// Create data for frame/fragment given image data, offsets and duration.
-static WebPMuxError CreateFrameFragmentData(
- int width, int height, const WebPMuxFrameInfo* const info, int is_frame,
- WebPData* const frame_frgm) {
- uint8_t* frame_frgm_bytes;
- const size_t frame_frgm_size = kChunks[is_frame ? IDX_ANMF : IDX_FRGM].size;
+// Create data for frame given image data, offsets and duration.
+static WebPMuxError CreateFrameData(
+ int width, int height, const WebPMuxFrameInfo* const info,
+ WebPData* const frame) {
+ uint8_t* frame_bytes;
+ const size_t frame_size = kChunks[IDX_ANMF].size;
assert(width > 0 && height > 0 && info->duration >= 0);
assert(info->dispose_method == (info->dispose_method & 1));
// Note: assertion on upper bounds is done in PutLE24().
- frame_frgm_bytes = (uint8_t*)WebPSafeMalloc(1ULL, frame_frgm_size);
- if (frame_frgm_bytes == NULL) return WEBP_MUX_MEMORY_ERROR;
+ frame_bytes = (uint8_t*)WebPSafeMalloc(1ULL, frame_size);
+ if (frame_bytes == NULL) return WEBP_MUX_MEMORY_ERROR;
- PutLE24(frame_frgm_bytes + 0, info->x_offset / 2);
- PutLE24(frame_frgm_bytes + 3, info->y_offset / 2);
+ PutLE24(frame_bytes + 0, info->x_offset / 2);
+ PutLE24(frame_bytes + 3, info->y_offset / 2);
- if (is_frame) {
- PutLE24(frame_frgm_bytes + 6, width - 1);
- PutLE24(frame_frgm_bytes + 9, height - 1);
- PutLE24(frame_frgm_bytes + 12, info->duration);
- frame_frgm_bytes[15] =
- (info->blend_method == WEBP_MUX_NO_BLEND ? 2 : 0) |
- (info->dispose_method == WEBP_MUX_DISPOSE_BACKGROUND ? 1 : 0);
- }
+ PutLE24(frame_bytes + 6, width - 1);
+ PutLE24(frame_bytes + 9, height - 1);
+ PutLE24(frame_bytes + 12, info->duration);
+ frame_bytes[15] =
+ (info->blend_method == WEBP_MUX_NO_BLEND ? 2 : 0) |
+ (info->dispose_method == WEBP_MUX_DISPOSE_BACKGROUND ? 1 : 0);
- frame_frgm->bytes = frame_frgm_bytes;
- frame_frgm->size = frame_frgm_size;
+ frame->bytes = frame_bytes;
+ frame->size = frame_size;
return WEBP_MUX_OK;
}
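Read together, the PutLE24() calls above pin down the 16-byte ANMF payload: bytes 0-2 hold x_offset/2, bytes 3-5 y_offset/2, bytes 6-8 width-1, bytes 9-11 height-1, bytes 12-14 the duration, and byte 15 the blend/dispose flags. A self-contained restatement of the same serialization (the little-endian helper is spelled out locally so the sketch stands alone; it is not the library's PutLE24()):

  #include <stdint.h>

  static void PutLE24Local(uint8_t* const p, uint32_t v) {
    p[0] = (uint8_t)(v >> 0);
    p[1] = (uint8_t)(v >> 8);
    p[2] = (uint8_t)(v >> 16);
  }

  // Fills a 16-byte ANMF payload exactly as CreateFrameData() above does.
  static void FillAnmfPayload(uint8_t dst[16], int x, int y, int w, int h,
                              int duration, int no_blend, int dispose_bg) {
    PutLE24Local(dst + 0, (uint32_t)(x / 2));    // offsets stored halved
    PutLE24Local(dst + 3, (uint32_t)(y / 2));
    PutLE24Local(dst + 6, (uint32_t)(w - 1));    // dimensions stored minus one
    PutLE24Local(dst + 9, (uint32_t)(h - 1));
    PutLE24Local(dst + 12, (uint32_t)duration);
    dst[15] = (uint8_t)((no_blend ? 2 : 0) | (dispose_bg ? 1 : 0));
  }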
@@ -264,23 +262,16 @@ WebPMuxError WebPMuxSetImage(WebPMux* mux, const WebPData* bitstream,
return err;
}
-WebPMuxError WebPMuxPushFrame(WebPMux* mux, const WebPMuxFrameInfo* frame,
+WebPMuxError WebPMuxPushFrame(WebPMux* mux, const WebPMuxFrameInfo* info,
int copy_data) {
WebPMuxImage wpi;
WebPMuxError err;
- int is_frame;
- const WebPData* const bitstream = &frame->bitstream;
+ const WebPData* const bitstream = &info->bitstream;
// Sanity checks.
- if (mux == NULL || frame == NULL) return WEBP_MUX_INVALID_ARGUMENT;
+ if (mux == NULL || info == NULL) return WEBP_MUX_INVALID_ARGUMENT;
- is_frame = (frame->id == WEBP_CHUNK_ANMF);
- if (!(is_frame || (frame->id == WEBP_CHUNK_FRGM))) {
- return WEBP_MUX_INVALID_ARGUMENT;
- }
- if (frame->id == WEBP_CHUNK_FRGM) { // Dead experiment.
- return WEBP_MUX_INVALID_ARGUMENT;
- }
+ if (info->id != WEBP_CHUNK_ANMF) return WEBP_MUX_INVALID_ARGUMENT;
if (bitstream->bytes == NULL || bitstream->size > MAX_CHUNK_PAYLOAD) {
return WEBP_MUX_INVALID_ARGUMENT;
@@ -290,7 +281,7 @@ WebPMuxError WebPMuxPushFrame(WebPMux* mux, const WebPMuxFrameInfo* frame,
const WebPMuxImage* const image = mux->images_;
const uint32_t image_id = (image->header_ != NULL) ?
ChunkGetIdFromTag(image->header_->tag_) : WEBP_CHUNK_IMAGE;
- if (image_id != frame->id) {
+ if (image_id != info->id) {
return WEBP_MUX_INVALID_ARGUMENT; // Conflicting frame types.
}
}
@@ -301,16 +292,11 @@ WebPMuxError WebPMuxPushFrame(WebPMux* mux, const WebPMuxFrameInfo* frame,
assert(wpi.img_ != NULL); // As SetAlphaAndImageChunks() was successful.
{
- WebPData frame_frgm;
- const uint32_t tag = kChunks[is_frame ? IDX_ANMF : IDX_FRGM].tag;
- WebPMuxFrameInfo tmp = *frame;
+ WebPData frame;
+ const uint32_t tag = kChunks[IDX_ANMF].tag;
+ WebPMuxFrameInfo tmp = *info;
tmp.x_offset &= ~1; // Snap offsets to even.
tmp.y_offset &= ~1;
- if (!is_frame) { // Reset unused values.
- tmp.duration = 1;
- tmp.dispose_method = WEBP_MUX_DISPOSE_NONE;
- tmp.blend_method = WEBP_MUX_BLEND;
- }
if (tmp.x_offset < 0 || tmp.x_offset >= MAX_POSITION_OFFSET ||
tmp.y_offset < 0 || tmp.y_offset >= MAX_POSITION_OFFSET ||
(tmp.duration < 0 || tmp.duration >= MAX_DURATION) ||
@@ -318,12 +304,11 @@ WebPMuxError WebPMuxPushFrame(WebPMux* mux, const WebPMuxFrameInfo* frame,
err = WEBP_MUX_INVALID_ARGUMENT;
goto Err;
}
- err = CreateFrameFragmentData(wpi.width_, wpi.height_, &tmp, is_frame,
- &frame_frgm);
+ err = CreateFrameData(wpi.width_, wpi.height_, &tmp, &frame);
if (err != WEBP_MUX_OK) goto Err;
- // Add frame/fragment chunk (with copy_data = 1).
- err = AddDataToChunkList(&frame_frgm, 1, tag, &wpi.header_);
- WebPDataClear(&frame_frgm); // frame_frgm owned by wpi.header_ now.
+ // Add frame chunk (with copy_data = 1).
+ err = AddDataToChunkList(&frame, 1, tag, &wpi.header_);
+ WebPDataClear(&frame); // frame owned by wpi.header_ now.
if (err != WEBP_MUX_OK) goto Err;
}
@@ -402,21 +387,18 @@ WebPMuxError WebPMuxDeleteFrame(WebPMux* mux, uint32_t nth) {
//------------------------------------------------------------------------------
// Assembly of the WebP RIFF file.
-static WebPMuxError GetFrameFragmentInfo(
- const WebPChunk* const frame_frgm_chunk,
+static WebPMuxError GetFrameInfo(
+ const WebPChunk* const frame_chunk,
int* const x_offset, int* const y_offset, int* const duration) {
- const uint32_t tag = frame_frgm_chunk->tag_;
- const int is_frame = (tag == kChunks[IDX_ANMF].tag);
- const WebPData* const data = &frame_frgm_chunk->data_;
- const size_t expected_data_size =
- is_frame ? ANMF_CHUNK_SIZE : FRGM_CHUNK_SIZE;
- assert(frame_frgm_chunk != NULL);
- assert(tag == kChunks[IDX_ANMF].tag || tag == kChunks[IDX_FRGM].tag);
+ const WebPData* const data = &frame_chunk->data_;
+ const size_t expected_data_size = ANMF_CHUNK_SIZE;
+ assert(frame_chunk != NULL);
+ assert(frame_chunk->tag_ == kChunks[IDX_ANMF].tag);
if (data->size != expected_data_size) return WEBP_MUX_INVALID_ARGUMENT;
*x_offset = 2 * GetLE24(data->bytes + 0);
*y_offset = 2 * GetLE24(data->bytes + 3);
- if (is_frame) *duration = GetLE24(data->bytes + 12);
+ *duration = GetLE24(data->bytes + 12);
return WEBP_MUX_OK;
}
@@ -424,13 +406,13 @@ static WebPMuxError GetImageInfo(const WebPMuxImage* const wpi,
int* const x_offset, int* const y_offset,
int* const duration,
int* const width, int* const height) {
- const WebPChunk* const frame_frgm_chunk = wpi->header_;
+ const WebPChunk* const frame_chunk = wpi->header_;
WebPMuxError err;
assert(wpi != NULL);
- assert(frame_frgm_chunk != NULL);
+ assert(frame_chunk != NULL);
- // Get offsets and duration from ANMF/FRGM chunk.
- err = GetFrameFragmentInfo(frame_frgm_chunk, x_offset, y_offset, duration);
+ // Get offsets and duration from ANMF chunk.
+ err = GetFrameInfo(frame_chunk, x_offset, y_offset, duration);
if (err != WEBP_MUX_OK) return err;
// Get width and height from VP8/VP8L chunk.
@@ -441,7 +423,6 @@ static WebPMuxError GetImageInfo(const WebPMuxImage* const wpi,
// Returns the tightest dimension for the canvas considering the image list.
static WebPMuxError GetAdjustedCanvasSize(const WebPMux* const mux,
- uint32_t flags,
int* const width, int* const height) {
WebPMuxImage* wpi = NULL;
assert(mux != NULL);
@@ -452,12 +433,10 @@ static WebPMuxError GetAdjustedCanvasSize(const WebPMux* const mux,
assert(wpi->img_ != NULL);
if (wpi->next_ != NULL) {
- int max_x = 0;
- int max_y = 0;
- int64_t image_area = 0;
+ int max_x = 0, max_y = 0;
// if we have a chain of wpi's, header_ is necessarily set
assert(wpi->header_ != NULL);
- // Aggregate the bounding box for animation frames & fragmented images.
+ // Aggregate the bounding box for animation frames.
for (; wpi != NULL; wpi = wpi->next_) {
int x_offset = 0, y_offset = 0, duration = 0, w = 0, h = 0;
const WebPMuxError err = GetImageInfo(wpi, &x_offset, &y_offset,
@@ -470,19 +449,9 @@ static WebPMuxError GetAdjustedCanvasSize(const WebPMux* const mux,
if (max_x_pos > max_x) max_x = max_x_pos;
if (max_y_pos > max_y) max_y = max_y_pos;
- image_area += w * h;
}
*width = max_x;
*height = max_y;
- // Crude check to validate that there are no image overlaps/holes for
- // fragmented images. Check that the aggregated image area for individual
- // fragments exactly matches the image area of the constructed canvas.
- // However, the area-match is necessary but not sufficient condition.
- if ((flags & FRAGMENTS_FLAG) && (image_area != (max_x * max_y))) {
- *width = 0;
- *height = 0;
- return WEBP_MUX_INVALID_ARGUMENT;
- }
} else {
// For a single image, canvas dimensions are same as image dimensions.
*width = wpi->width_;
@@ -528,10 +497,7 @@ static WebPMuxError CreateVP8XChunk(WebPMux* const mux) {
flags |= XMP_FLAG;
}
if (images->header_ != NULL) {
- if (images->header_->tag_ == kChunks[IDX_FRGM].tag) {
- // This is a fragmented image.
- flags |= FRAGMENTS_FLAG;
- } else if (images->header_->tag_ == kChunks[IDX_ANMF].tag) {
+ if (images->header_->tag_ == kChunks[IDX_ANMF].tag) {
// This is an image with animation.
flags |= ANIMATION_FLAG;
}
@@ -540,7 +506,7 @@ static WebPMuxError CreateVP8XChunk(WebPMux* const mux) {
flags |= ALPHA_FLAG; // Some images have an alpha channel.
}
- err = GetAdjustedCanvasSize(mux, flags, &width, &height);
+ err = GetAdjustedCanvasSize(mux, &width, &height);
if (err != WEBP_MUX_OK) return err;
if (width <= 0 || height <= 0) {
@@ -580,31 +546,26 @@ static WebPMuxError CreateVP8XChunk(WebPMux* const mux) {
// Cleans up 'mux' by removing any unnecessary chunks.
static WebPMuxError MuxCleanup(WebPMux* const mux) {
int num_frames;
- int num_fragments;
int num_anim_chunks;
- // If we have an image with a single fragment or frame, and its rectangle
- // covers the whole canvas, convert it to a non-animated non-fragmented image
- // (to avoid writing FRGM/ANMF chunk unnecessarily).
+ // If we have an image with a single frame, and its rectangle
+ // covers the whole canvas, convert it to a non-animated image
+ // (to avoid writing ANMF chunk unnecessarily).
WebPMuxError err = WebPMuxNumChunks(mux, kChunks[IDX_ANMF].id, &num_frames);
if (err != WEBP_MUX_OK) return err;
- err = WebPMuxNumChunks(mux, kChunks[IDX_FRGM].id, &num_fragments);
- if (err != WEBP_MUX_OK) return err;
- if (num_frames == 1 || num_fragments == 1) {
- WebPMuxImage* frame_frag;
- err = MuxImageGetNth((const WebPMuxImage**)&mux->images_, 1, &frame_frag);
- assert(err == WEBP_MUX_OK); // We know that one frame/fragment does exist.
- assert(frame_frag != NULL);
- if (frame_frag->header_ != NULL &&
+ if (num_frames == 1) {
+ WebPMuxImage* frame = NULL;
+ err = MuxImageGetNth((const WebPMuxImage**)&mux->images_, 1, &frame);
+ assert(err == WEBP_MUX_OK); // We know that one frame does exist.
+ assert(frame != NULL);
+ if (frame->header_ != NULL &&
((mux->canvas_width_ == 0 && mux->canvas_height_ == 0) ||
- (frame_frag->width_ == mux->canvas_width_ &&
- frame_frag->height_ == mux->canvas_height_))) {
- assert(frame_frag->header_->tag_ == kChunks[IDX_ANMF].tag ||
- frame_frag->header_->tag_ == kChunks[IDX_FRGM].tag);
- ChunkDelete(frame_frag->header_); // Removes ANMF/FRGM chunk.
- frame_frag->header_ = NULL;
+ (frame->width_ == mux->canvas_width_ &&
+ frame->height_ == mux->canvas_height_))) {
+ assert(frame->header_->tag_ == kChunks[IDX_ANMF].tag);
+ ChunkDelete(frame->header_); // Removes ANMF chunk.
+ frame->header_ = NULL;
num_frames = 0;
- num_fragments = 0;
}
}
// Remove ANIM chunk if this is a non-animated image.
diff --git a/src/3rdparty/libwebp/src/mux/muxi.h b/src/3rdparty/libwebp/src/mux/muxi.h
index d4d5cba..e6606aa 100644
--- a/src/3rdparty/libwebp/src/mux/muxi.h
+++ b/src/3rdparty/libwebp/src/mux/muxi.h
@@ -15,8 +15,8 @@
#define WEBP_MUX_MUXI_H_
#include <stdlib.h>
-#include "../dec/vp8i.h"
-#include "../dec/vp8li.h"
+#include "../dec/vp8i_dec.h"
+#include "../dec/vp8li_dec.h"
#include "../webp/mux.h"
#ifdef __cplusplus
@@ -27,8 +27,8 @@ extern "C" {
// Defines and constants.
#define MUX_MAJ_VERSION 0
-#define MUX_MIN_VERSION 3
-#define MUX_REV_VERSION 1
+#define MUX_MIN_VERSION 4
+#define MUX_REV_VERSION 0
// Chunk object.
typedef struct WebPChunk WebPChunk;
@@ -36,16 +36,16 @@ struct WebPChunk {
uint32_t tag_;
int owner_; // True if *data_ memory is owned internally.
// VP8X, ANIM, and other internally created chunks
- // like ANMF/FRGM are always owned.
+ // like ANMF are always owned.
WebPData data_;
WebPChunk* next_;
};
-// MuxImage object. Store a full WebP image (including ANMF/FRGM chunk, ALPH
+// MuxImage object. Store a full WebP image (including ANMF chunk, ALPH
// chunk and VP8/VP8L chunk),
typedef struct WebPMuxImage WebPMuxImage;
struct WebPMuxImage {
- WebPChunk* header_; // Corresponds to WEBP_CHUNK_ANMF/WEBP_CHUNK_FRGM.
+ WebPChunk* header_; // Corresponds to WEBP_CHUNK_ANMF.
WebPChunk* alpha_; // Corresponds to WEBP_CHUNK_ALPHA.
WebPChunk* img_; // Corresponds to WEBP_CHUNK_IMAGE.
WebPChunk* unknown_; // Corresponds to WEBP_CHUNK_UNKNOWN.
@@ -79,7 +79,6 @@ typedef enum {
IDX_ICCP,
IDX_ANIM,
IDX_ANMF,
- IDX_FRGM,
IDX_ALPHA,
IDX_VP8,
IDX_VP8L,
@@ -185,7 +184,6 @@ int MuxImageFinalize(WebPMuxImage* const wpi);
static WEBP_INLINE int IsWPI(WebPChunkId id) {
switch (id) {
case WEBP_CHUNK_ANMF:
- case WEBP_CHUNK_FRGM:
case WEBP_CHUNK_ALPHA:
case WEBP_CHUNK_IMAGE: return 1;
default: return 0;
diff --git a/src/3rdparty/libwebp/src/mux/muxinternal.c b/src/3rdparty/libwebp/src/mux/muxinternal.c
index 4babbe8..387b57e 100644
--- a/src/3rdparty/libwebp/src/mux/muxinternal.c
+++ b/src/3rdparty/libwebp/src/mux/muxinternal.c
@@ -16,14 +16,13 @@
#include "./muxi.h"
#include "../utils/utils.h"
-#define UNDEFINED_CHUNK_SIZE (-1)
+#define UNDEFINED_CHUNK_SIZE ((uint32_t)(-1))
const ChunkInfo kChunks[] = {
{ MKFOURCC('V', 'P', '8', 'X'), WEBP_CHUNK_VP8X, VP8X_CHUNK_SIZE },
{ MKFOURCC('I', 'C', 'C', 'P'), WEBP_CHUNK_ICCP, UNDEFINED_CHUNK_SIZE },
{ MKFOURCC('A', 'N', 'I', 'M'), WEBP_CHUNK_ANIM, ANIM_CHUNK_SIZE },
{ MKFOURCC('A', 'N', 'M', 'F'), WEBP_CHUNK_ANMF, ANMF_CHUNK_SIZE },
- { MKFOURCC('F', 'R', 'G', 'M'), WEBP_CHUNK_FRGM, FRGM_CHUNK_SIZE },
{ MKFOURCC('A', 'L', 'P', 'H'), WEBP_CHUNK_ALPHA, UNDEFINED_CHUNK_SIZE },
{ MKFOURCC('V', 'P', '8', ' '), WEBP_CHUNK_IMAGE, UNDEFINED_CHUNK_SIZE },
{ MKFOURCC('V', 'P', '8', 'L'), WEBP_CHUNK_IMAGE, UNDEFINED_CHUNK_SIZE },
@@ -251,8 +250,7 @@ static WebPChunk** GetChunkListFromId(const WebPMuxImage* const wpi,
WebPChunkId id) {
assert(wpi != NULL);
switch (id) {
- case WEBP_CHUNK_ANMF:
- case WEBP_CHUNK_FRGM: return (WebPChunk**)&wpi->header_;
+ case WEBP_CHUNK_ANMF: return (WebPChunk**)&wpi->header_;
case WEBP_CHUNK_ALPHA: return (WebPChunk**)&wpi->alpha_;
case WEBP_CHUNK_IMAGE: return (WebPChunk**)&wpi->img_;
default: return NULL;
@@ -372,13 +370,12 @@ size_t MuxImageDiskSize(const WebPMuxImage* const wpi) {
return size;
}
-// Special case as ANMF/FRGM chunk encapsulates other image chunks.
+// Special case as ANMF chunk encapsulates other image chunks.
static uint8_t* ChunkEmitSpecial(const WebPChunk* const header,
size_t total_size, uint8_t* dst) {
const size_t header_size = header->data_.size;
const size_t offset_to_next = total_size - CHUNK_HEADER_SIZE;
- assert(header->tag_ == kChunks[IDX_ANMF].tag ||
- header->tag_ == kChunks[IDX_FRGM].tag);
+ assert(header->tag_ == kChunks[IDX_ANMF].tag);
PutLE32(dst + 0, header->tag_);
PutLE32(dst + TAG_SIZE, (uint32_t)offset_to_next);
assert(header_size == (uint32_t)header_size);
@@ -391,7 +388,7 @@ static uint8_t* ChunkEmitSpecial(const WebPChunk* const header,
uint8_t* MuxImageEmit(const WebPMuxImage* const wpi, uint8_t* dst) {
// Ordering of chunks to be emitted is strictly as follows:
- // 1. ANMF/FRGM chunk (if present).
+ // 1. ANMF chunk (if present).
// 2. ALPH chunk (if present).
// 3. VP8/VP8L chunk.
assert(wpi);
@@ -439,7 +436,7 @@ static int IsNotCompatible(int feature, int num_items) {
return (feature != 0) != (num_items > 0);
}
-#define NO_FLAG 0
+#define NO_FLAG ((WebPFeatureFlags)0)
// Test basic constraints:
// retrieval, maximum number of chunks by index (use -1 to skip)
@@ -465,7 +462,6 @@ WebPMuxError MuxValidate(const WebPMux* const mux) {
int num_xmp;
int num_anim;
int num_frames;
- int num_fragments;
int num_vp8x;
int num_images;
int num_alpha;
@@ -510,10 +506,6 @@ WebPMuxError MuxValidate(const WebPMux* const mux) {
}
}
- // Fragmentation: FRAGMENTS_FLAG and FRGM chunk(s) are consistent.
- err = ValidateChunk(mux, IDX_FRGM, FRAGMENTS_FLAG, flags, -1, &num_fragments);
- if (err != WEBP_MUX_OK) return err;
-
// Verify either VP8X chunk is present OR there is only one elem in
// mux->images_.
err = ValidateChunk(mux, IDX_VP8X, NO_FLAG, flags, 1, &num_vp8x);
@@ -537,11 +529,6 @@ WebPMuxError MuxValidate(const WebPMux* const mux) {
if (flags & ALPHA_FLAG) return WEBP_MUX_INVALID_ARGUMENT;
}
- // num_fragments & num_images are consistent.
- if (num_fragments > 0 && num_images != num_fragments) {
- return WEBP_MUX_INVALID_ARGUMENT;
- }
-
return WEBP_MUX_OK;
}
diff --git a/src/3rdparty/libwebp/src/mux/muxread.c b/src/3rdparty/libwebp/src/mux/muxread.c
index 8957a1e..410acd9 100644
--- a/src/3rdparty/libwebp/src/mux/muxread.c
+++ b/src/3rdparty/libwebp/src/mux/muxread.c
@@ -104,17 +104,15 @@ static int MuxImageParse(const WebPChunk* const chunk, int copy_data,
size_t subchunk_size;
ChunkInit(&subchunk);
- assert(chunk->tag_ == kChunks[IDX_ANMF].tag ||
- chunk->tag_ == kChunks[IDX_FRGM].tag);
+ assert(chunk->tag_ == kChunks[IDX_ANMF].tag);
assert(!wpi->is_partial_);
- // ANMF/FRGM.
+ // ANMF.
{
- const size_t hdr_size = (chunk->tag_ == kChunks[IDX_ANMF].tag) ?
- ANMF_CHUNK_SIZE : FRGM_CHUNK_SIZE;
+ const size_t hdr_size = ANMF_CHUNK_SIZE;
const WebPData temp = { bytes, hdr_size };
- // Each of ANMF and FRGM chunk contain a header at the beginning. So, its
- // size should at least be 'hdr_size'.
+ // Each ANMF chunk contains a header at the beginning, so its size should
+ // be at least 'hdr_size'.
if (size < hdr_size) goto Fail;
ChunkAssignData(&subchunk, &temp, copy_data, chunk->tag_);
}
@@ -292,16 +290,15 @@ WebPMux* WebPMuxCreateInternal(const WebPData* bitstream, int copy_data,
static WebPMuxError ValidateForSingleImage(const WebPMux* const mux) {
const int num_images = MuxImageCount(mux->images_, WEBP_CHUNK_IMAGE);
const int num_frames = MuxImageCount(mux->images_, WEBP_CHUNK_ANMF);
- const int num_fragments = MuxImageCount(mux->images_, WEBP_CHUNK_FRGM);
if (num_images == 0) {
// No images in mux.
return WEBP_MUX_NOT_FOUND;
- } else if (num_images == 1 && num_frames == 0 && num_fragments == 0) {
+ } else if (num_images == 1 && num_frames == 0) {
// Valid case (single image).
return WEBP_MUX_OK;
} else {
- // Frame/Fragment case OR an invalid mux.
+ // Frame case OR an invalid mux.
return WEBP_MUX_INVALID_ARGUMENT;
}
}
@@ -379,7 +376,7 @@ static WebPMuxError SynthesizeBitstream(const WebPMuxImage* const wpi,
const int need_vp8x = (wpi->alpha_ != NULL);
const size_t vp8x_size = need_vp8x ? CHUNK_HEADER_SIZE + VP8X_CHUNK_SIZE : 0;
const size_t alpha_size = need_vp8x ? ChunkDiskSize(wpi->alpha_) : 0;
- // Note: No need to output ANMF/FRGM chunk for a single image.
+ // Note: No need to output ANMF chunk for a single image.
const size_t size = RIFF_HEADER_SIZE + vp8x_size + alpha_size +
ChunkDiskSize(wpi->img_);
uint8_t* const data = (uint8_t*)WebPSafeMalloc(1ULL, size);
@@ -436,29 +433,24 @@ static WebPMuxError MuxGetImageInternal(const WebPMuxImage* const wpi,
return SynthesizeBitstream(wpi, &info->bitstream);
}
-static WebPMuxError MuxGetFrameFragmentInternal(const WebPMuxImage* const wpi,
- WebPMuxFrameInfo* const frame) {
+static WebPMuxError MuxGetFrameInternal(const WebPMuxImage* const wpi,
+ WebPMuxFrameInfo* const frame) {
const int is_frame = (wpi->header_->tag_ == kChunks[IDX_ANMF].tag);
- const CHUNK_INDEX idx = is_frame ? IDX_ANMF : IDX_FRGM;
- const WebPData* frame_frgm_data;
+ const WebPData* frame_data;
if (!is_frame) return WEBP_MUX_INVALID_ARGUMENT;
assert(wpi->header_ != NULL); // Already checked by WebPMuxGetFrame().
- // Get frame/fragment chunk.
- frame_frgm_data = &wpi->header_->data_;
- if (frame_frgm_data->size < kChunks[idx].size) return WEBP_MUX_BAD_DATA;
+ // Get frame chunk.
+ frame_data = &wpi->header_->data_;
+ if (frame_data->size < kChunks[IDX_ANMF].size) return WEBP_MUX_BAD_DATA;
// Extract info.
- frame->x_offset = 2 * GetLE24(frame_frgm_data->bytes + 0);
- frame->y_offset = 2 * GetLE24(frame_frgm_data->bytes + 3);
- if (is_frame) {
- const uint8_t bits = frame_frgm_data->bytes[15];
- frame->duration = GetLE24(frame_frgm_data->bytes + 12);
+ frame->x_offset = 2 * GetLE24(frame_data->bytes + 0);
+ frame->y_offset = 2 * GetLE24(frame_data->bytes + 3);
+ {
+ const uint8_t bits = frame_data->bytes[15];
+ frame->duration = GetLE24(frame_data->bytes + 12);
frame->dispose_method =
(bits & 1) ? WEBP_MUX_DISPOSE_BACKGROUND : WEBP_MUX_DISPOSE_NONE;
frame->blend_method = (bits & 2) ? WEBP_MUX_NO_BLEND : WEBP_MUX_BLEND;
- } else { // Defaults for unused values.
- frame->duration = 1;
- frame->dispose_method = WEBP_MUX_DISPOSE_NONE;
- frame->blend_method = WEBP_MUX_BLEND;
}
frame->id = ChunkGetIdFromTag(wpi->header_->tag_);
return SynthesizeBitstream(wpi, &frame->bitstream);
@@ -482,7 +474,7 @@ WebPMuxError WebPMuxGetFrame(
if (wpi->header_ == NULL) {
return MuxGetImageInternal(wpi, frame);
} else {
- return MuxGetFrameFragmentInternal(wpi, frame);
+ return MuxGetFrameInternal(wpi, frame);
}
}
diff --git a/src/3rdparty/libwebp/src/utils/bit_reader_inl.h b/src/3rdparty/libwebp/src/utils/bit_reader_inl_utils.h
index 99ed313..fd7fb04 100644
--- a/src/3rdparty/libwebp/src/utils/bit_reader_inl.h
+++ b/src/3rdparty/libwebp/src/utils/bit_reader_inl_utils.h
@@ -20,13 +20,12 @@
#include "../webp/config.h"
#endif
-#ifdef WEBP_FORCE_ALIGNED
-#include <string.h> // memcpy
-#endif
+#include <string.h> // for memcpy
#include "../dsp/dsp.h"
-#include "./bit_reader.h"
-#include "./endian_inl.h"
+#include "./bit_reader_utils.h"
+#include "./endian_inl_utils.h"
+#include "./utils.h"
#ifdef __cplusplus
extern "C" {
@@ -62,10 +61,7 @@ void VP8LoadNewBytes(VP8BitReader* const br) {
if (br->buf_ < br->buf_max_) {
// convert memory type to register type (with some zero'ing!)
bit_t bits;
-#if defined(WEBP_FORCE_ALIGNED)
- lbit_t in_bits;
- memcpy(&in_bits, br->buf_, sizeof(in_bits));
-#elif defined(WEBP_USE_MIPS32)
+#if defined(WEBP_USE_MIPS32)
// This is needed because of un-aligned read.
lbit_t in_bits;
lbit_t* p_buf_ = (lbit_t*)br->buf_;
@@ -80,7 +76,8 @@ void VP8LoadNewBytes(VP8BitReader* const br) {
: "memory", "at"
);
#else
- const lbit_t in_bits = *(const lbit_t*)br->buf_;
+ lbit_t in_bits;
+ memcpy(&in_bits, br->buf_, sizeof(in_bits));
#endif
br->buf_ += BITS >> 3;
#if !defined(WORDS_BIGENDIAN)
@@ -119,37 +116,26 @@ static WEBP_INLINE int VP8GetBit(VP8BitReader* const br, int prob) {
const int pos = br->bits_;
const range_t split = (range * prob) >> 8;
const range_t value = (range_t)(br->value_ >> pos);
-#if defined(__arm__) || defined(_M_ARM) // ARM-specific
- const int bit = ((int)(split - value) >> 31) & 1;
- if (value > split) {
- range -= split + 1;
- br->value_ -= (bit_t)(split + 1) << pos;
- } else {
- range = split;
- }
-#else // faster version on x86
- int bit; // Don't use 'const int bit = (value > split);", it's slower.
- if (value > split) {
- range -= split + 1;
+ const int bit = (value > split);
+ if (bit) {
+ range -= split;
br->value_ -= (bit_t)(split + 1) << pos;
- bit = 1;
} else {
- range = split;
- bit = 0;
+ range = split + 1;
}
-#endif
- if (range <= (range_t)0x7e) {
- const int shift = kVP8Log2Range[range];
- range = kVP8NewRange[range];
+ {
+ const int shift = 7 ^ BitsLog2Floor(range);
+ range <<= shift;
br->bits_ -= shift;
}
- br->range_ = range;
+ br->range_ = range - 1;
return bit;
}
}
// simplified version of VP8GetBit() for prob=0x80 (note shift is always 1 here)
-static WEBP_INLINE int VP8GetSigned(VP8BitReader* const br, int v) {
+static WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW WEBP_INLINE
+int VP8GetSigned(VP8BitReader* const br, int v) {
if (br->bits_ < 0) {
VP8LoadNewBytes(br);
}
@@ -166,6 +152,37 @@ static WEBP_INLINE int VP8GetSigned(VP8BitReader* const br, int v) {
}
}
+static WEBP_INLINE int VP8GetBitAlt(VP8BitReader* const br, int prob) {
+ // Don't move this declaration! It makes a big speed difference to store
+ // 'range' *before* calling VP8LoadNewBytes(), even if this function doesn't
+ // alter br->range_ value.
+ range_t range = br->range_;
+ if (br->bits_ < 0) {
+ VP8LoadNewBytes(br);
+ }
+ {
+ const int pos = br->bits_;
+ const range_t split = (range * prob) >> 8;
+ const range_t value = (range_t)(br->value_ >> pos);
+ int bit; // Don't use 'const int bit = (value > split);', it's slower.
+ if (value > split) {
+ range -= split + 1;
+ br->value_ -= (bit_t)(split + 1) << pos;
+ bit = 1;
+ } else {
+ range = split;
+ bit = 0;
+ }
+ if (range <= (range_t)0x7e) {
+ const int shift = kVP8Log2Range[range];
+ range = kVP8NewRange[range];
+ br->bits_ -= shift;
+ }
+ br->range_ = range;
+ return bit;
+ }
+}
+
#ifdef __cplusplus
} // extern "C"
#endif
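The rewritten VP8GetBit() drops the kVP8Log2Range[]/kVP8NewRange[] lookups (the table-based variant survives as VP8GetBitAlt() for comparison) and renormalizes with a single xor-derived shift, storing 'range - 1' in br->range_. It rests on the identity that, for any range in [1, 255], 7 ^ BitsLog2Floor(range) is exactly the left shift that brings it back into [128, 255]: e.g. range = 40 gives BitsLog2Floor(40) = 5, shift = 7 ^ 5 = 2, and 40 << 2 = 160. A standalone exhaustive check (Log2Floor() is a local stand-in for BitsLog2Floor(), so the snippet compiles on its own):

  #include <assert.h>
  #include <stdint.h>

  static int Log2Floor(uint32_t n) {   // local stand-in, n must be > 0
    int log = 0;
    while (n > 1) { n >>= 1; ++log; }
    return log;
  }

  static void CheckRenormShift(void) {
    uint32_t range;
    for (range = 1; range <= 255; ++range) {
      const int shift = 7 ^ Log2Floor(range);
      const uint32_t renormalized = range << shift;
      assert(renormalized >= 128 && renormalized <= 255);
    }
  }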
diff --git a/src/3rdparty/libwebp/src/utils/bit_reader.c b/src/3rdparty/libwebp/src/utils/bit_reader_utils.c
index 50ffb74..c3157e8 100644
--- a/src/3rdparty/libwebp/src/utils/bit_reader.c
+++ b/src/3rdparty/libwebp/src/utils/bit_reader_utils.c
@@ -15,7 +15,7 @@
#include "../webp/config.h"
#endif
-#include "./bit_reader_inl.h"
+#include "./bit_reader_inl_utils.h"
#include "../utils/utils.h"
//------------------------------------------------------------------------------
diff --git a/src/3rdparty/libwebp/src/utils/bit_reader.h b/src/3rdparty/libwebp/src/utils/bit_reader_utils.h
index ec3426c..ec3426c 100644
--- a/src/3rdparty/libwebp/src/utils/bit_reader.h
+++ b/src/3rdparty/libwebp/src/utils/bit_reader_utils.h
diff --git a/src/3rdparty/libwebp/src/utils/bit_writer.c b/src/3rdparty/libwebp/src/utils/bit_writer_utils.c
index 0644286..ab0c49d 100644
--- a/src/3rdparty/libwebp/src/utils/bit_writer.c
+++ b/src/3rdparty/libwebp/src/utils/bit_writer_utils.c
@@ -16,8 +16,8 @@
#include <string.h> // for memcpy()
#include <stdlib.h>
-#include "./bit_writer.h"
-#include "./endian_inl.h"
+#include "./bit_writer_utils.h"
+#include "./endian_inl_utils.h"
#include "./utils.h"
//------------------------------------------------------------------------------
@@ -143,13 +143,13 @@ int VP8PutBitUniform(VP8BitWriter* const bw, int bit) {
void VP8PutBits(VP8BitWriter* const bw, uint32_t value, int nb_bits) {
uint32_t mask;
assert(nb_bits > 0 && nb_bits < 32);
- for (mask = 1u << (nb_bits - 1); mask; mask >>= 1)
+ for (mask = 1u << (nb_bits - 1); mask; mask >>= 1) {
VP8PutBitUniform(bw, value & mask);
+ }
}
void VP8PutSignedBits(VP8BitWriter* const bw, int value, int nb_bits) {
- if (!VP8PutBitUniform(bw, value != 0))
- return;
+ if (!VP8PutBitUniform(bw, value != 0)) return;
if (value < 0) {
VP8PutBits(bw, ((-value) << 1) | 1, nb_bits + 1);
} else {
diff --git a/src/3rdparty/libwebp/src/utils/bit_writer.h b/src/3rdparty/libwebp/src/utils/bit_writer_utils.h
index ef360d1..9c02bbc 100644
--- a/src/3rdparty/libwebp/src/utils/bit_writer.h
+++ b/src/3rdparty/libwebp/src/utils/bit_writer_utils.h
@@ -54,7 +54,8 @@ int VP8BitWriterAppend(VP8BitWriter* const bw,
// return approximate write position (in bits)
static WEBP_INLINE uint64_t VP8BitWriterPos(const VP8BitWriter* const bw) {
- return (uint64_t)(bw->pos_ + bw->run_) * 8 + 8 + bw->nb_bits_;
+ const uint64_t nb_bits = 8 + bw->nb_bits_; // note: bw->nb_bits_ is <= 0
+ return (bw->pos_ + bw->run_) * 8 + nb_bits;
}
// Returns a pointer to the internal buffer.
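A quick sanity check of the arithmetic above: bw->nb_bits_ stays <= 0 between calls, so the '8 + nb_bits_' term contributes at most 8 pending bits. With pos_ = 10, run_ = 2 and nb_bits_ = -3, for instance, the position is (10 + 2) * 8 + 8 - 3 = 101 bits. Folding '8 + nb_bits_' into a separate uint64_t first keeps the intermediate non-negative, rather than adding a negative int to an unsigned product and relying on modular wrap-around.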
diff --git a/src/3rdparty/libwebp/src/utils/color_cache.c b/src/3rdparty/libwebp/src/utils/color_cache_utils.c
index c34b2e7..0172590 100644
--- a/src/3rdparty/libwebp/src/utils/color_cache.c
+++ b/src/3rdparty/libwebp/src/utils/color_cache_utils.c
@@ -14,7 +14,7 @@
#include <assert.h>
#include <stdlib.h>
#include <string.h>
-#include "./color_cache.h"
+#include "./color_cache_utils.h"
#include "./utils.h"
//------------------------------------------------------------------------------
diff --git a/src/3rdparty/libwebp/src/utils/color_cache.h b/src/3rdparty/libwebp/src/utils/color_cache_utils.h
index a9a9f64..c373e6b 100644
--- a/src/3rdparty/libwebp/src/utils/color_cache.h
+++ b/src/3rdparty/libwebp/src/utils/color_cache_utils.h
@@ -28,7 +28,11 @@ typedef struct {
int hash_bits_;
} VP8LColorCache;
-static const uint32_t kHashMul = 0x1e35a7bd;
+static const uint64_t kHashMul = 0x1e35a7bdull;
+
+static WEBP_INLINE int HashPix(uint32_t argb, int shift) {
+ return (int)(((argb * kHashMul) & 0xffffffffu) >> shift);
+}
static WEBP_INLINE uint32_t VP8LColorCacheLookup(
const VP8LColorCache* const cc, uint32_t key) {
@@ -44,19 +48,20 @@ static WEBP_INLINE void VP8LColorCacheSet(const VP8LColorCache* const cc,
static WEBP_INLINE void VP8LColorCacheInsert(const VP8LColorCache* const cc,
uint32_t argb) {
- const uint32_t key = (kHashMul * argb) >> cc->hash_shift_;
+ const int key = HashPix(argb, cc->hash_shift_);
cc->colors_[key] = argb;
}
static WEBP_INLINE int VP8LColorCacheGetIndex(const VP8LColorCache* const cc,
uint32_t argb) {
- return (kHashMul * argb) >> cc->hash_shift_;
+ return HashPix(argb, cc->hash_shift_);
}
+// Return the key if cc contains argb, and -1 otherwise.
static WEBP_INLINE int VP8LColorCacheContains(const VP8LColorCache* const cc,
uint32_t argb) {
- const uint32_t key = (kHashMul * argb) >> cc->hash_shift_;
- return (cc->colors_[key] == argb);
+ const int key = HashPix(argb, cc->hash_shift_);
+ return (cc->colors_[key] == argb) ? key : -1;
}
//------------------------------------------------------------------------------
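The hashing change is twofold: the multiply now happens in 64-bit and is masked back to 32 bits before the shift, so the result no longer depends on integer-promotion details, and VP8LColorCacheContains() returns the key itself (or -1) so callers can reuse it without rehashing. A standalone restatement of the hash, assuming hash_shift_ is 32 - hash_bits_ as the struct fields suggest:

  #include <stdint.h>

  // Local restatement of HashPix() above: 64-bit multiply, keep the low
  // 32 bits, then keep the top 'hash_bits' of those via the shift.
  static int HashPixLocal(uint32_t argb, int shift) {
    const uint64_t kMul = 0x1e35a7bdull;
    return (int)(((argb * kMul) & 0xffffffffu) >> shift);
  }

  // With hash_bits = 10 the shift is 22 and keys fall in [0, 1024):
  //   const int key = HashPixLocal(0xff00ff00u, 22);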
diff --git a/src/3rdparty/libwebp/src/utils/endian_inl.h b/src/3rdparty/libwebp/src/utils/endian_inl_utils.h
index e11260f..e11260f 100644
--- a/src/3rdparty/libwebp/src/utils/endian_inl.h
+++ b/src/3rdparty/libwebp/src/utils/endian_inl_utils.h
diff --git a/src/3rdparty/libwebp/src/utils/filters.c b/src/3rdparty/libwebp/src/utils/filters_utils.c
index 15543b1..49c1d18 100644
--- a/src/3rdparty/libwebp/src/utils/filters.c
+++ b/src/3rdparty/libwebp/src/utils/filters_utils.c
@@ -11,7 +11,7 @@
//
// Author: Urvang (urvang@google.com)
-#include "./filters.h"
+#include "./filters_utils.h"
#include <stdlib.h>
#include <string.h>
diff --git a/src/3rdparty/libwebp/src/utils/filters.h b/src/3rdparty/libwebp/src/utils/filters_utils.h
index 088b132..088b132 100644
--- a/src/3rdparty/libwebp/src/utils/filters.h
+++ b/src/3rdparty/libwebp/src/utils/filters_utils.h
diff --git a/src/3rdparty/libwebp/src/utils/huffman_encode.c b/src/3rdparty/libwebp/src/utils/huffman_encode_utils.c
index 4e5ef6b..f950465 100644
--- a/src/3rdparty/libwebp/src/utils/huffman_encode.c
+++ b/src/3rdparty/libwebp/src/utils/huffman_encode_utils.c
@@ -14,7 +14,7 @@
#include <assert.h>
#include <stdlib.h>
#include <string.h>
-#include "./huffman_encode.h"
+#include "./huffman_encode_utils.h"
#include "./utils.h"
#include "../webp/format_constants.h"
diff --git a/src/3rdparty/libwebp/src/utils/huffman_encode.h b/src/3rdparty/libwebp/src/utils/huffman_encode_utils.h
index a157165..a157165 100644
--- a/src/3rdparty/libwebp/src/utils/huffman_encode.h
+++ b/src/3rdparty/libwebp/src/utils/huffman_encode_utils.h
diff --git a/src/3rdparty/libwebp/src/utils/huffman.c b/src/3rdparty/libwebp/src/utils/huffman_utils.c
index 36e5502..008b5d7 100644
--- a/src/3rdparty/libwebp/src/utils/huffman.c
+++ b/src/3rdparty/libwebp/src/utils/huffman_utils.c
@@ -14,7 +14,7 @@
#include <assert.h>
#include <stdlib.h>
#include <string.h>
-#include "./huffman.h"
+#include "./huffman_utils.h"
#include "./utils.h"
#include "../webp/format_constants.h"
@@ -45,7 +45,7 @@ static WEBP_INLINE uint32_t GetNextKey(uint32_t key, int len) {
while (key & step) {
step >>= 1;
}
- return (key & (step - 1)) + step;
+ return step ? (key & (step - 1)) + step : key;
}
// Stores code in table[0], table[step], table[2*step], ..., table[end].
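GetNextKey() enumerates canonical-code keys in bit-reversed order, and the patched return makes the all-ones terminal case (step == 0) explicit instead of relying on the unsigned wrap-around of 'step - 1', which happened to yield the same value. A standalone trace, assuming 'step' starts at 1 << (len - 1) as the elided context suggests:

  #include <stdint.h>

  static uint32_t NextKey(uint32_t key, int len) {  // mirror of GetNextKey()
    uint32_t step = 1u << (len - 1);
    while (key & step) step >>= 1;
    return (step != 0) ? (key & (step - 1)) + step : key;
  }

  // For len = 2 the keys visit 0 -> 2 -> 1 -> 3 (binary 00, 10, 01, 11):
  // counting upward with the bit order reversed, i.e. canonical Huffman
  // key order. NextKey(3, 2) returns 3, the terminal key.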
@@ -75,11 +75,13 @@ static WEBP_INLINE int NextTableBitSize(const int* const count,
return len - root_bits;
}
-int VP8LBuildHuffmanTable(HuffmanCode* const root_table, int root_bits,
- const int code_lengths[], int code_lengths_size) {
+// sorted[code_lengths_size] is a pre-allocated array for sorting symbols
+// by code length.
+static int BuildHuffmanTable(HuffmanCode* const root_table, int root_bits,
+ const int code_lengths[], int code_lengths_size,
+ uint16_t sorted[]) {
HuffmanCode* table = root_table; // next available space in table
int total_size = 1 << root_bits; // total size root table + 2nd level table
- int* sorted = NULL; // symbols sorted by code length
int len; // current code length
int symbol; // symbol index in original or sorted table
// number of codes of each length:
@@ -114,11 +116,6 @@ int VP8LBuildHuffmanTable(HuffmanCode* const root_table, int root_bits,
offset[len + 1] = offset[len] + count[len];
}
- sorted = (int*)WebPSafeMalloc(code_lengths_size, sizeof(*sorted));
- if (sorted == NULL) {
- return 0;
- }
-
// Sort symbols by length, by symbol order within each length.
for (symbol = 0; symbol < code_lengths_size; ++symbol) {
const int symbol_code_length = code_lengths[symbol];
@@ -133,7 +130,6 @@ int VP8LBuildHuffmanTable(HuffmanCode* const root_table, int root_bits,
code.bits = 0;
code.value = (uint16_t)sorted[0];
ReplicateValue(table, 1, total_size, code);
- WebPSafeFree(sorted);
return total_size;
}
@@ -153,7 +149,6 @@ int VP8LBuildHuffmanTable(HuffmanCode* const root_table, int root_bits,
num_nodes += num_open;
num_open -= count[len];
if (num_open < 0) {
- WebPSafeFree(sorted);
return 0;
}
for (; count[len] > 0; --count[len]) {
@@ -172,7 +167,6 @@ int VP8LBuildHuffmanTable(HuffmanCode* const root_table, int root_bits,
num_nodes += num_open;
num_open -= count[len];
if (num_open < 0) {
- WebPSafeFree(sorted);
return 0;
}
for (; count[len] > 0; --count[len]) {
@@ -195,11 +189,35 @@ int VP8LBuildHuffmanTable(HuffmanCode* const root_table, int root_bits,
// Check if tree is full.
if (num_nodes != 2 * offset[MAX_ALLOWED_CODE_LENGTH] - 1) {
- WebPSafeFree(sorted);
return 0;
}
}
- WebPSafeFree(sorted);
+ return total_size;
+}
+
+// Maximum code_lengths_size is 2328 (reached for 11-bit color_cache_bits).
+// More commonly, the value is around ~280.
+#define MAX_CODE_LENGTHS_SIZE \
+ ((1 << MAX_CACHE_BITS) + NUM_LITERAL_CODES + NUM_LENGTH_CODES)
+// Cut-off value for switching between heap and stack allocation.
+#define SORTED_SIZE_CUTOFF 512
+int VP8LBuildHuffmanTable(HuffmanCode* const root_table, int root_bits,
+ const int code_lengths[], int code_lengths_size) {
+ int total_size;
+ assert(code_lengths_size <= MAX_CODE_LENGTHS_SIZE);
+ if (code_lengths_size <= SORTED_SIZE_CUTOFF) {
+ // use local stack-allocated array.
+ uint16_t sorted[SORTED_SIZE_CUTOFF];
+ total_size = BuildHuffmanTable(root_table, root_bits,
+ code_lengths, code_lengths_size, sorted);
+ } else { // rare case. Use heap allocation.
+ uint16_t* const sorted =
+ (uint16_t*)WebPSafeMalloc(code_lengths_size, sizeof(*sorted));
+ if (sorted == NULL) return 0;
+ total_size = BuildHuffmanTable(root_table, root_bits,
+ code_lengths, code_lengths_size, sorted);
+ WebPSafeFree(sorted);
+ }
return total_size;
}
diff --git a/src/3rdparty/libwebp/src/utils/huffman.h b/src/3rdparty/libwebp/src/utils/huffman_utils.h
index c6dd6aa..c6dd6aa 100644
--- a/src/3rdparty/libwebp/src/utils/huffman.h
+++ b/src/3rdparty/libwebp/src/utils/huffman_utils.h
diff --git a/src/3rdparty/libwebp/src/utils/quant_levels_dec.c b/src/3rdparty/libwebp/src/utils/quant_levels_dec_utils.c
index ee0a3fe..d4d23d3 100644
--- a/src/3rdparty/libwebp/src/utils/quant_levels_dec.c
+++ b/src/3rdparty/libwebp/src/utils/quant_levels_dec_utils.c
@@ -14,7 +14,7 @@
//
// Author: Skal (pascal.massimino@gmail.com)
-#include "./quant_levels_dec.h"
+#include "./quant_levels_dec_utils.h"
#include <string.h> // for memset
diff --git a/src/3rdparty/libwebp/src/utils/quant_levels_dec.h b/src/3rdparty/libwebp/src/utils/quant_levels_dec_utils.h
index 59a1349..59a1349 100644
--- a/src/3rdparty/libwebp/src/utils/quant_levels_dec.h
+++ b/src/3rdparty/libwebp/src/utils/quant_levels_dec_utils.h
diff --git a/src/3rdparty/libwebp/src/utils/quant_levels.c b/src/3rdparty/libwebp/src/utils/quant_levels_utils.c
index d7c8aab..73174e8 100644
--- a/src/3rdparty/libwebp/src/utils/quant_levels.c
+++ b/src/3rdparty/libwebp/src/utils/quant_levels_utils.c
@@ -14,7 +14,7 @@
#include <assert.h>
-#include "./quant_levels.h"
+#include "./quant_levels_utils.h"
#define NUM_SYMBOLS 256
diff --git a/src/3rdparty/libwebp/src/utils/quant_levels.h b/src/3rdparty/libwebp/src/utils/quant_levels_utils.h
index 1cb5a32..1cb5a32 100644
--- a/src/3rdparty/libwebp/src/utils/quant_levels.h
+++ b/src/3rdparty/libwebp/src/utils/quant_levels_utils.h
diff --git a/src/3rdparty/libwebp/src/utils/random.c b/src/3rdparty/libwebp/src/utils/random_utils.c
index 24e96ad..9f1e415 100644
--- a/src/3rdparty/libwebp/src/utils/random.c
+++ b/src/3rdparty/libwebp/src/utils/random_utils.c
@@ -12,7 +12,7 @@
// Author: Skal (pascal.massimino@gmail.com)
#include <string.h>
-#include "./random.h"
+#include "./random_utils.h"
//------------------------------------------------------------------------------
diff --git a/src/3rdparty/libwebp/src/utils/random.h b/src/3rdparty/libwebp/src/utils/random_utils.h
index c392a61..c392a61 100644
--- a/src/3rdparty/libwebp/src/utils/random.h
+++ b/src/3rdparty/libwebp/src/utils/random_utils.h
diff --git a/src/3rdparty/libwebp/src/utils/rescaler.c b/src/3rdparty/libwebp/src/utils/rescaler_utils.c
index 00c9300..0d1f80d 100644
--- a/src/3rdparty/libwebp/src/utils/rescaler.c
+++ b/src/3rdparty/libwebp/src/utils/rescaler_utils.c
@@ -15,7 +15,7 @@
#include <stdlib.h>
#include <string.h>
#include "../dsp/dsp.h"
-#include "./rescaler.h"
+#include "./rescaler_utils.h"
//------------------------------------------------------------------------------
@@ -48,11 +48,15 @@ void WebPRescalerInit(WebPRescaler* const wrk, int src_width, int src_height,
wrk->y_sub = wrk->y_expand ? y_sub - 1 : y_sub;
wrk->y_accum = wrk->y_expand ? wrk->y_sub : wrk->y_add;
if (!wrk->y_expand) {
- // this is WEBP_RESCALER_FRAC(dst_height, x_add * y_add) without the cast.
+ // This is WEBP_RESCALER_FRAC(dst_height, x_add * y_add) without the cast.
+ // Its value is <= WEBP_RESCALER_ONE, because dst_height <= wrk->y_add, and
+ // wrk->x_add >= 1.
const uint64_t ratio =
(uint64_t)dst_height * WEBP_RESCALER_ONE / (wrk->x_add * wrk->y_add);
if (ratio != (uint32_t)ratio) {
- // We can't represent the ratio with the current fixed-point precision.
+ // When ratio == WEBP_RESCALER_ONE, we can't represent the ratio with the
+ // current fixed-point precision. This happens when dst_height ==
+ // wrk->y_add (which == src_height), and wrk->x_add == 1.
// => We special-case fxy_scale = 0, in WebPRescalerExportRow().
wrk->fxy_scale = 0;
} else {
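Putting numbers on the comment above (taking WEBP_RESCALER_ONE to be the rescaler's usual 1 << 32 fixed-point unit, an assumption here): halving the height of a source with x_add == 1 gives ratio = (y_add / 2) * 2^32 / y_add = 2^31, which fits a uint32_t; the degenerate case dst_height == y_add with x_add == 1 gives ratio = 2^32 exactly, which does not, hence the fxy_scale = 0 special case that WebPRescalerExportRow() then honors.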
diff --git a/src/3rdparty/libwebp/src/utils/rescaler.h b/src/3rdparty/libwebp/src/utils/rescaler_utils.h
index 98b01a7..98b01a7 100644
--- a/src/3rdparty/libwebp/src/utils/rescaler.h
+++ b/src/3rdparty/libwebp/src/utils/rescaler_utils.h
diff --git a/src/3rdparty/libwebp/src/utils/thread.c b/src/3rdparty/libwebp/src/utils/thread_utils.c
index 93f7622..1729060 100644
--- a/src/3rdparty/libwebp/src/utils/thread.c
+++ b/src/3rdparty/libwebp/src/utils/thread_utils.c
@@ -13,7 +13,7 @@
#include <assert.h>
#include <string.h> // for memset()
-#include "./thread.h"
+#include "./thread_utils.h"
#include "./utils.h"
#ifdef WEBP_USE_THREAD
@@ -183,8 +183,7 @@ static int pthread_cond_wait(pthread_cond_t* const condition,
#else
// note that there is a consumer available so the signal isn't dropped in
// pthread_cond_signal
- if (!ReleaseSemaphore(condition->waiting_sem_, 1, NULL))
- return 1;
+ if (!ReleaseSemaphore(condition->waiting_sem_, 1, NULL)) return 1;
// now unlock the mutex so pthread_cond_signal may be issued
pthread_mutex_unlock(mutex);
ok = (WaitForSingleObject(condition->signal_event_, INFINITE) ==
@@ -226,8 +225,7 @@ static THREADFN ThreadLoop(void* ptr) {
}
// main thread state control
-static void ChangeState(WebPWorker* const worker,
- WebPWorkerStatus new_status) {
+static void ChangeState(WebPWorker* const worker, WebPWorkerStatus new_status) {
// No-op when attempting to change state on a thread that didn't come up.
// Checking status_ without acquiring the lock first would result in a data
// race.
diff --git a/src/3rdparty/libwebp/src/utils/thread.h b/src/3rdparty/libwebp/src/utils/thread_utils.h
index 8408311..8408311 100644
--- a/src/3rdparty/libwebp/src/utils/thread.h
+++ b/src/3rdparty/libwebp/src/utils/thread_utils.h
diff --git a/src/3rdparty/libwebp/src/utils/utils.c b/src/3rdparty/libwebp/src/utils/utils.c
index 2602ca3..504d924 100644
--- a/src/3rdparty/libwebp/src/utils/utils.c
+++ b/src/3rdparty/libwebp/src/utils/utils.c
@@ -25,7 +25,7 @@
// http://valgrind.org/docs/manual/ms-manual.html
// Here is an example command line:
/* valgrind --tool=massif --massif-out-file=massif.out \
- --stacks=yes --alloc-fn=WebPSafeAlloc --alloc-fn=WebPSafeCalloc
+ --stacks=yes --alloc-fn=WebPSafeMalloc --alloc-fn=WebPSafeCalloc
ms_print massif.out
*/
// In addition:
@@ -175,8 +175,12 @@ static int CheckSizeArgumentsOverflow(uint64_t nmemb, size_t size) {
}
#endif
#if defined(MALLOC_LIMIT)
- if (mem_limit > 0 && total_mem + total_size >= mem_limit) {
- return 0; // fake fail!
+ if (mem_limit > 0) {
+ const uint64_t new_total_mem = (uint64_t)total_mem + total_size;
+ if (new_total_mem != (size_t)new_total_mem ||
+ new_total_mem > mem_limit) {
+ return 0; // fake fail!
+ }
}
#endif
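The reworked MALLOC_LIMIT test hook now guards two things at once: the 64-bit running total and its truncation back to size_t on 32-bit targets, using the 'x != (size_t)x' round-trip idiom. A standalone sketch of the same check (names are illustrative, not the file's):

  #include <stddef.h>
  #include <stdint.h>

  // Returns 1 if 'current + request' is representable as a size_t and stays
  // within 'limit' (0 meaning unlimited), mirroring the check added above.
  static int FitsMemLimit(uint64_t current, uint64_t request, uint64_t limit) {
    const uint64_t total = current + request;
    if (total != (size_t)total) return 0;  // would truncate on this target
    return (limit == 0) || (total <= limit);
  }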
@@ -239,8 +243,7 @@ void WebPCopyPixels(const WebPPicture* const src, WebPPicture* const dst) {
//------------------------------------------------------------------------------
-#define MAX_COLOR_COUNT MAX_PALETTE_SIZE
-#define COLOR_HASH_SIZE (MAX_COLOR_COUNT * 4)
+#define COLOR_HASH_SIZE (MAX_PALETTE_SIZE * 4)
#define COLOR_HASH_RIGHT_SHIFT 22 // 32 - log2(COLOR_HASH_SIZE).
int WebPGetColorPalette(const WebPPicture* const pic, uint32_t* const palette) {
@@ -249,7 +252,7 @@ int WebPGetColorPalette(const WebPPicture* const pic, uint32_t* const palette) {
int num_colors = 0;
uint8_t in_use[COLOR_HASH_SIZE] = { 0 };
uint32_t colors[COLOR_HASH_SIZE];
- static const uint32_t kHashMul = 0x1e35a7bdU;
+ static const uint64_t kHashMul = 0x1e35a7bdull;
const uint32_t* argb = pic->argb;
const int width = pic->width;
const int height = pic->height;
@@ -264,14 +267,14 @@ int WebPGetColorPalette(const WebPPicture* const pic, uint32_t* const palette) {
continue;
}
last_pix = argb[x];
- key = (kHashMul * last_pix) >> COLOR_HASH_RIGHT_SHIFT;
+ key = ((last_pix * kHashMul) & 0xffffffffu) >> COLOR_HASH_RIGHT_SHIFT;
while (1) {
if (!in_use[key]) {
colors[key] = last_pix;
in_use[key] = 1;
++num_colors;
- if (num_colors > MAX_COLOR_COUNT) {
- return MAX_COLOR_COUNT + 1; // Exact count not needed.
+ if (num_colors > MAX_PALETTE_SIZE) {
+ return MAX_PALETTE_SIZE + 1; // Exact count not needed.
}
break;
} else if (colors[key] == last_pix) {
@@ -298,8 +301,30 @@ int WebPGetColorPalette(const WebPPicture* const pic, uint32_t* const palette) {
return num_colors;
}
-#undef MAX_COLOR_COUNT
#undef COLOR_HASH_SIZE
#undef COLOR_HASH_RIGHT_SHIFT
//------------------------------------------------------------------------------
+
+#if defined(WEBP_NEED_LOG_TABLE_8BIT)
+const uint8_t WebPLogTable8bit[256] = { // 31 ^ clz(i)
+ 0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+ 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+ 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
+};
+#endif
+
+//------------------------------------------------------------------------------
diff --git a/src/3rdparty/libwebp/src/utils/utils.h b/src/3rdparty/libwebp/src/utils/utils.h
index e0a8112..3ab4590 100644
--- a/src/3rdparty/libwebp/src/utils/utils.h
+++ b/src/3rdparty/libwebp/src/utils/utils.h
@@ -20,6 +20,7 @@
#endif
#include <assert.h>
+#include <limits.h>
#include "../dsp/dsp.h"
#include "../webp/types.h"
@@ -32,7 +33,14 @@ extern "C" {
// Memory allocation
// This is the maximum memory amount that libwebp will ever try to allocate.
-#define WEBP_MAX_ALLOCABLE_MEMORY (1ULL << 40)
+#ifndef WEBP_MAX_ALLOCABLE_MEMORY
+#if SIZE_MAX > (1ULL << 34)
+#define WEBP_MAX_ALLOCABLE_MEMORY (1ULL << 34)
+#else
+// For 32-bit targets keep this below INT_MAX to avoid valgrind warnings.
+#define WEBP_MAX_ALLOCABLE_MEMORY ((1ULL << 31) - (1 << 16))
+#endif
+#endif // WEBP_MAX_ALLOCABLE_MEMORY
// size-checking safe malloc/calloc: verify that the requested size is not too
// large, or return NULL. You don't need to call these for constructs like
@@ -54,7 +62,6 @@ WEBP_EXTERN(void) WebPSafeFree(void* const ptr);
#define WEBP_ALIGN_CST 31
#define WEBP_ALIGN(PTR) (((uintptr_t)(PTR) + WEBP_ALIGN_CST) & ~WEBP_ALIGN_CST)
-#if defined(WEBP_FORCE_ALIGNED)
#include <string.h>
// memcpy() is the safe way of moving potentially unaligned 32b memory.
static WEBP_INLINE uint32_t WebPMemToUint32(const uint8_t* const ptr) {
@@ -65,16 +72,6 @@ static WEBP_INLINE uint32_t WebPMemToUint32(const uint8_t* const ptr) {
static WEBP_INLINE void WebPUint32ToMem(uint8_t* const ptr, uint32_t val) {
memcpy(ptr, &val, sizeof(val));
}
-#else
-static WEBP_UBSAN_IGNORE_UNDEF WEBP_INLINE
-uint32_t WebPMemToUint32(const uint8_t* const ptr) {
- return *(const uint32_t*)ptr;
-}
-static WEBP_UBSAN_IGNORE_UNDEF WEBP_INLINE
-void WebPUint32ToMem(uint8_t* const ptr, uint32_t val) {
- *(uint32_t*)ptr = val;
-}
-#endif
//------------------------------------------------------------------------------
// Reading/writing data.
@@ -110,6 +107,19 @@ static WEBP_INLINE void PutLE32(uint8_t* const data, uint32_t val) {
PutLE16(data + 2, (int)(val >> 16));
}
+// Returns 31 ^ clz(n) = floor(log2(n)). This is the default C implementation,
+// table-based or not. It can be used as a fallback if clz() is not available.
+#define WEBP_NEED_LOG_TABLE_8BIT
+extern const uint8_t WebPLogTable8bit[256];
+static WEBP_INLINE int WebPLog2FloorC(uint32_t n) {
+ int log = 0;
+ while (n >= 256) {
+ log += 8;
+ n >>= 8;
+ }
+ return log + WebPLogTable8bit[n];
+}
+
// Returns (int)floor(log2(n)). n must be > 0.
// use GNU builtins where available.
#if defined(__GNUC__) && \
@@ -127,22 +137,8 @@ static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
_BitScanReverse(&first_set_bit, n);
return first_set_bit;
}
-#else
-static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
- int log = 0;
- uint32_t value = n;
- int i;
-
- for (i = 4; i >= 0; --i) {
- const int shift = (1 << i);
- const uint32_t x = value >> shift;
- if (x != 0) {
- value = x;
- log += shift;
- }
- }
- return log;
-}
+#else // default: use the C-version.
+static WEBP_INLINE int BitsLog2Floor(uint32_t n) { return WebPLog2FloorC(n); }
#endif
//------------------------------------------------------------------------------
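A worked example of the byte-wise walk in WebPLog2FloorC() above: for n = 1000 the loop runs once (1000 >= 256), leaving log = 8 and n = 1000 >> 8 = 3; WebPLogTable8bit[3] is 1, so the result is 9, and indeed 2^9 = 512 <= 1000 < 1024 = 2^10. A naive reference to check against (n must be > 0):

  #include <stdint.h>

  static int NaiveLog2Floor(uint32_t n) {  // reference implementation
    int log = 0;
    while (n >>= 1) ++log;
    return log;
  }
  // NaiveLog2Floor(1000) == 9, matching WebPLog2FloorC(1000).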
@@ -164,12 +160,12 @@ WEBP_EXTERN(void) WebPCopyPixels(const struct WebPPicture* const src,
// Unique colors.
// Returns count of unique colors in 'pic', assuming pic->use_argb is true.
-// If the unique color count is more than MAX_COLOR_COUNT, returns
-// MAX_COLOR_COUNT+1.
+// If the unique color count is more than MAX_PALETTE_SIZE, returns
+// MAX_PALETTE_SIZE+1.
// If 'palette' is not NULL and number of unique colors is less than or equal to
-// MAX_COLOR_COUNT, also outputs the actual unique colors into 'palette'.
+// MAX_PALETTE_SIZE, also outputs the actual unique colors into 'palette'.
// Note: 'palette' is assumed to be an array already allocated with at least
-// MAX_COLOR_COUNT elements.
+// MAX_PALETTE_SIZE elements.
WEBP_EXTERN(int) WebPGetColorPalette(const struct WebPPicture* const pic,
uint32_t* const palette);
diff --git a/src/3rdparty/libwebp/src/webp/config.h b/src/3rdparty/libwebp/src/webp/config.h
index 118ac38..731115b 100644
--- a/src/3rdparty/libwebp/src/webp/config.h
+++ b/src/3rdparty/libwebp/src/webp/config.h
@@ -79,7 +79,7 @@
#define PACKAGE_NAME "libwebp"
/* Define to the full name and version of this package. */
-#define PACKAGE_STRING "libwebp 0.5.1"
+#define PACKAGE_STRING "libwebp 0.6.0"
/* Define to the one symbol short name of this package. */
#define PACKAGE_TARNAME "libwebp"
@@ -88,7 +88,7 @@
#define PACKAGE_URL "http://developers.google.com/speed/webp"
/* Define to the version of this package. */
-#define PACKAGE_VERSION "0.5.1"
+#define PACKAGE_VERSION "0.6.0"
/* Define to necessary symbol if this constant uses a non-standard name on
your system. */
@@ -98,7 +98,7 @@
/* #undef STDC_HEADERS */
/* Version number of package */
-#define VERSION "0.5.1"
+#define VERSION "0.6.0"
/* Enable experimental code */
/* #undef WEBP_EXPERIMENTAL_FEATURES */
diff --git a/src/3rdparty/libwebp/src/webp/decode.h b/src/3rdparty/libwebp/src/webp/decode.h
index 7a3bed9..4c5e74a 100644
--- a/src/3rdparty/libwebp/src/webp/decode.h
+++ b/src/3rdparty/libwebp/src/webp/decode.h
@@ -248,19 +248,19 @@ typedef enum VP8StatusCode {
// picture is only partially decoded, pending additional input.
// Code example:
//
-// WebPInitDecBuffer(&buffer);
-// buffer.colorspace = mode;
+// WebPInitDecBuffer(&output_buffer);
+// output_buffer.colorspace = mode;
// ...
-// WebPIDecoder* idec = WebPINewDecoder(&buffer);
-// while (has_more_data) {
-// // ... (get additional data)
+// WebPIDecoder* idec = WebPINewDecoder(&output_buffer);
+// while (additional_data_is_available) {
+// // ... (get additional data in some new_data[] buffer)
// status = WebPIAppend(idec, new_data, new_data_size);
-// if (status != VP8_STATUS_SUSPENDED ||
-// break;
+// if (status != VP8_STATUS_OK && status != VP8_STATUS_SUSPENDED) {
+// break; // an error occurred.
// }
//
// // The above call decodes the current available buffer.
-// // Part of the image can now be refreshed by calling to
+// // Part of the image can now be refreshed by calling
// // WebPIDecGetRGB()/WebPIDecGetYUVA() etc.
// }
// WebPIDelete(idec);
diff --git a/src/3rdparty/libwebp/src/webp/encode.h b/src/3rdparty/libwebp/src/webp/encode.h
index 9291b71..35fde1d 100644
--- a/src/3rdparty/libwebp/src/webp/encode.h
+++ b/src/3rdparty/libwebp/src/webp/encode.h
@@ -20,7 +20,7 @@
extern "C" {
#endif
-#define WEBP_ENCODER_ABI_VERSION 0x0209 // MAJOR(8b) + MINOR(8b)
+#define WEBP_ENCODER_ABI_VERSION 0x020e // MAJOR(8b) + MINOR(8b)
// Note: forward declaring enumerations is not allowed in (strict) C and C++,
// the types are left here for reference.
@@ -141,12 +141,10 @@ struct WebPConfig {
// RGB information for better compression. The default
// value is 0.
-#ifdef WEBP_EXPERIMENTAL_FEATURES
- int delta_palettization;
+ int use_delta_palette; // reserved for future lossless feature
+ int use_sharp_yuv; // if needed, use sharp (and slow) RGB->YUV conversion
+
uint32_t pad[2]; // padding for later use
-#else
- uint32_t pad[3]; // padding for later use
-#endif // WEBP_EXPERIMENTAL_FEATURES
};
// Enumerate some predefined settings for WebPConfig, depending on the type
@@ -388,9 +386,24 @@ WEBP_EXTERN(void) WebPPictureFree(WebPPicture* picture);
// Returns false in case of memory allocation error.
WEBP_EXTERN(int) WebPPictureCopy(const WebPPicture* src, WebPPicture* dst);
+// Compute the single distortion for packed planes of samples.
+// 'src' will be compared to 'ref', and the raw distortion stored into
+// '*distortion'. The refined metric (log(MSE), log(1 - ssim), ...) will be
+// stored in '*result'.
+// 'x_step' is the horizontal stride (in bytes) between samples.
+// 'src/ref_stride' is the byte distance between rows.
+// Returns false in case of error (bad parameter, memory allocation error, ...).
+WEBP_EXTERN(int) WebPPlaneDistortion(const uint8_t* src, size_t src_stride,
+ const uint8_t* ref, size_t ref_stride,
+ int width, int height,
+ size_t x_step,
+ int type, // 0 = PSNR, 1 = SSIM, 2 = LSIM
+ float* distortion, float* result);
+
// Compute PSNR, SSIM or LSIM distortion metric between two pictures. Results
-// are in dB, stored in result[] in the Y/U/V/Alpha/All or B/G/R/A/All order.
-// Returns false in case of error (src and ref don't have same dimension, ...)
+// are in dB, stored in result[] in the B/G/R/A/All order. The distortion is
+// always performed using ARGB samples. Hence if the input is YUV(A), the
+// picture will be internally converted to ARGB (just for the measurement).
// Warning: this function is rather CPU-intensive.
WEBP_EXTERN(int) WebPPictureDistortion(
const WebPPicture* src, const WebPPicture* ref,
@@ -473,18 +486,20 @@ WEBP_EXTERN(int) WebPPictureARGBToYUVA(WebPPicture* picture,
WEBP_EXTERN(int) WebPPictureARGBToYUVADithered(
WebPPicture* picture, WebPEncCSP colorspace, float dithering);
-// Performs 'smart' RGBA->YUVA420 downsampling and colorspace conversion.
+// Performs 'sharp' RGBA->YUVA420 downsampling and colorspace conversion.
// Downsampling is handled with extra care in case of color clipping. This
// method is roughly 2x slower than WebPPictureARGBToYUVA() but produces better
-// YUV representation.
+// and sharper YUV representation.
// Returns false in case of error.
+WEBP_EXTERN(int) WebPPictureSharpARGBToYUVA(WebPPicture* picture);
+// kept for backward compatibility:
WEBP_EXTERN(int) WebPPictureSmartARGBToYUVA(WebPPicture* picture);
// Converts picture->yuv to picture->argb and sets picture->use_argb to true.
-// The input format must be YUV_420 or YUV_420A.
-// Note that the use of this method is discouraged if one has access to the
-// raw ARGB samples, since using YUV420 is comparatively lossy. Also, the
-// conversion from YUV420 to ARGB incurs a small loss too.
+// The input format must be YUV_420 or YUV_420A. The conversion from YUV420 to
+// ARGB incurs a small loss too.
+// Note that the use of this colorspace is discouraged if one has access to the
+// raw ARGB samples, since using YUV420 is comparatively lossy.
// Returns false in case of error.
WEBP_EXTERN(int) WebPPictureYUVAToARGB(WebPPicture* picture);
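For completeness, the reverse direction; a minimal sketch assuming 'pic' already holds YUV_420(A) planes:

#include "webp/encode.h"

// After a successful call, pic->argb is valid and pic->use_argb is true.
// Per the note above, the YUV420 -> ARGB round trip is slightly lossy.
int ToARGBForMeasurement(WebPPicture* const pic) {
  return WebPPictureYUVAToARGB(pic);
}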
diff --git a/src/3rdparty/libwebp/src/webp/extras.h b/src/3rdparty/libwebp/src/webp/extras.h
deleted file mode 100644
index 1c24be2..0000000
--- a/src/3rdparty/libwebp/src/webp/extras.h
+++ /dev/null
@@ -1,51 +0,0 @@
-// Copyright 2015 Google Inc. All Rights Reserved.
-//
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
-// -----------------------------------------------------------------------------
-//
-
-#ifndef WEBP_WEBP_EXTRAS_H_
-#define WEBP_WEBP_EXTRAS_H_
-
-#include "./types.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#include "./encode.h"
-
-#define WEBP_EXTRAS_ABI_VERSION 0x0000 // MAJOR(8b) + MINOR(8b)
-
-//------------------------------------------------------------------------------
-
-// Returns the version number of the extras library, packed in hexadecimal using
-// 8bits for each of major/minor/revision. E.g: v2.5.7 is 0x020507.
-WEBP_EXTERN(int) WebPGetExtrasVersion(void);
-
-//------------------------------------------------------------------------------
-// Ad-hoc colorspace importers.
-
-// Import luma sample (gray scale image) into 'picture'. The 'picture'
-// width and height must be set prior to calling this function.
-WEBP_EXTERN(int) WebPImportGray(const uint8_t* gray, WebPPicture* picture);
-
-// Import rgb sample in RGB565 packed format into 'picture'. The 'picture'
-// width and height must be set prior to calling this function.
-WEBP_EXTERN(int) WebPImportRGB565(const uint8_t* rgb565, WebPPicture* pic);
-
-// Import rgb sample in RGB4444 packed format into 'picture'. The 'picture'
-// width and height must be set prior to calling this function.
-WEBP_EXTERN(int) WebPImportRGB4444(const uint8_t* rgb4444, WebPPicture* pic);
-
-//------------------------------------------------------------------------------
-
-#ifdef __cplusplus
-} // extern "C"
-#endif
-
-#endif /* WEBP_WEBP_EXTRAS_H_ */
diff --git a/src/3rdparty/libwebp/src/webp/format_constants.h b/src/3rdparty/libwebp/src/webp/format_constants.h
index b6e78a6..329fc8a 100644
--- a/src/3rdparty/libwebp/src/webp/format_constants.h
+++ b/src/3rdparty/libwebp/src/webp/format_constants.h
@@ -72,14 +72,13 @@ typedef enum {
#define RIFF_HEADER_SIZE 12 // Size of the RIFF header ("RIFFnnnnWEBP").
#define ANMF_CHUNK_SIZE 16 // Size of an ANMF chunk.
#define ANIM_CHUNK_SIZE 6 // Size of an ANIM chunk.
-#define FRGM_CHUNK_SIZE 6 // Size of a FRGM chunk.
#define VP8X_CHUNK_SIZE 10 // Size of a VP8X chunk.
#define MAX_CANVAS_SIZE (1 << 24) // 24-bit max for VP8X width/height.
#define MAX_IMAGE_AREA (1ULL << 32) // 32-bit max for width x height.
#define MAX_LOOP_COUNT (1 << 16) // maximum value for loop-count
#define MAX_DURATION (1 << 24) // maximum duration
-#define MAX_POSITION_OFFSET (1 << 24) // maximum frame/fragment x/y offset
+#define MAX_POSITION_OFFSET (1 << 24) // maximum frame x/y offset
// Maximum chunk payload is such that adding the header and padding won't
// overflow a uint32_t.
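A hypothetical validation helper mirroring these limits (the function name and structure are illustrative; upstream performs equivalent checks during parsing):

#include <stdint.h>
#include "webp/format_constants.h"

// Reject canvas dimensions that exceed the VP8X 24-bit width/height limit
// or whose area overflows the 32-bit MAX_IMAGE_AREA bound.
static int CanvasDimensionsAreValid(uint32_t width, uint32_t height) {
  if (width == 0 || height == 0) return 0;
  if (width > MAX_CANVAS_SIZE || height > MAX_CANVAS_SIZE) return 0;
  return (uint64_t)width * height <= MAX_IMAGE_AREA;
}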
diff --git a/src/3rdparty/libwebp/src/webp/mux.h b/src/3rdparty/libwebp/src/webp/mux.h
index b72658c..daccc65 100644
--- a/src/3rdparty/libwebp/src/webp/mux.h
+++ b/src/3rdparty/libwebp/src/webp/mux.h
@@ -21,13 +21,13 @@
extern "C" {
#endif
-#define WEBP_MUX_ABI_VERSION 0x0106 // MAJOR(8b) + MINOR(8b)
+#define WEBP_MUX_ABI_VERSION 0x0108 // MAJOR(8b) + MINOR(8b)
//------------------------------------------------------------------------------
// Mux API
//
// This API allows manipulation of WebP container images containing features
-// like color profile, metadata, animation and fragmented images.
+// like color profile, metadata and animation.
//
// Code Example#1: Create a WebPMux object with image data, color profile and
// XMP metadata.
@@ -81,16 +81,16 @@ typedef enum WebPMuxError {
// IDs for different types of chunks.
typedef enum WebPChunkId {
- WEBP_CHUNK_VP8X, // VP8X
- WEBP_CHUNK_ICCP, // ICCP
- WEBP_CHUNK_ANIM, // ANIM
- WEBP_CHUNK_ANMF, // ANMF
- WEBP_CHUNK_FRGM, // FRGM
- WEBP_CHUNK_ALPHA, // ALPH
- WEBP_CHUNK_IMAGE, // VP8/VP8L
- WEBP_CHUNK_EXIF, // EXIF
- WEBP_CHUNK_XMP, // XMP
- WEBP_CHUNK_UNKNOWN, // Other chunks.
+ WEBP_CHUNK_VP8X, // VP8X
+ WEBP_CHUNK_ICCP, // ICCP
+ WEBP_CHUNK_ANIM, // ANIM
+ WEBP_CHUNK_ANMF, // ANMF
+ WEBP_CHUNK_DEPRECATED, // (deprecated from FRGM)
+ WEBP_CHUNK_ALPHA, // ALPH
+ WEBP_CHUNK_IMAGE, // VP8/VP8L
+ WEBP_CHUNK_EXIF, // EXIF
+ WEBP_CHUNK_XMP, // XMP
+ WEBP_CHUNK_UNKNOWN, // Other chunks.
WEBP_CHUNK_NIL
} WebPChunkId;
@@ -142,7 +142,7 @@ static WEBP_INLINE WebPMux* WebPMuxCreate(const WebPData* bitstream,
// Non-image chunks.
// Note: Only non-image related chunks should be managed through chunk APIs.
-// (Image related chunks are: "ANMF", "FRGM", "VP8 ", "VP8L" and "ALPH").
+// (Image related chunks are: "ANMF", "VP8 ", "VP8L" and "ALPH").
// To add, get and delete images, use WebPMuxSetImage(), WebPMuxPushFrame(),
// WebPMuxGetFrame() and WebPMuxDeleteFrame().
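A sketch of the non-image chunk path described above, using the existing WebPMuxSetChunk() API; the helper itself is hypothetical:

#include "webp/mux.h"

// Attach serialized XMP metadata through the generic chunk API; image data
// goes through WebPMuxSetImage()/WebPMuxPushFrame() instead.
WebPMuxError SetXmpChunk(WebPMux* mux, const uint8_t* xmp, size_t size) {
  WebPData chunk;
  chunk.bytes = xmp;
  chunk.size = size;
  return WebPMuxSetChunk(mux, "XMP ", &chunk, 1 /* copy_data */);
}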
@@ -195,7 +195,7 @@ WEBP_EXTERN(WebPMuxError) WebPMuxDeleteChunk(
//------------------------------------------------------------------------------
// Images.
-// Encapsulates data about a single frame/fragment.
+// Encapsulates data about a single frame.
struct WebPMuxFrameInfo {
WebPData bitstream; // image data: can be a raw VP8/VP8L bitstream
// or a single-image WebP file.
@@ -203,19 +203,19 @@ struct WebPMuxFrameInfo {
int y_offset; // y-offset of the frame.
int duration; // duration of the frame (in milliseconds).
- WebPChunkId id; // frame type: should be one of WEBP_CHUNK_ANMF,
- // WEBP_CHUNK_FRGM or WEBP_CHUNK_IMAGE
+ WebPChunkId id; // frame type: should be one of WEBP_CHUNK_ANMF
+ // or WEBP_CHUNK_IMAGE
WebPMuxAnimDispose dispose_method; // Disposal method for the frame.
WebPMuxAnimBlend blend_method; // Blend operation for the frame.
uint32_t pad[1]; // padding for later use
};
-// Sets the (non-animated and non-fragmented) image in the mux object.
-// Note: Any existing images (including frames/fragments) will be removed.
+// Sets the (non-animated) image in the mux object.
+// Note: Any existing images (including frames) will be removed.
// Parameters:
// mux - (in/out) object in which the image is to be set
// bitstream - (in) can be a raw VP8/VP8L bitstream or a single-image
-// WebP file (non-animated and non-fragmented)
+// WebP file (non-animated)
// copy_data - (in) value 1 indicates given data WILL be copied to the mux
// object and value 0 indicates data will NOT be copied.
// Returns:
@@ -226,9 +226,8 @@ WEBP_EXTERN(WebPMuxError) WebPMuxSetImage(
WebPMux* mux, const WebPData* bitstream, int copy_data);
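A minimal sketch of the single-image path, assuming the standard WebPMuxNew()/WebPMuxDelete() lifecycle; the wrapper function is illustrative:

#include "webp/mux.h"

// Wrap a single (non-animated) bitstream in a fresh mux object.
// 'data'/'size' hold a raw VP8/VP8L bitstream or a single-image WebP file.
WebPMux* MuxFromSingleImage(const uint8_t* data, size_t size) {
  WebPMux* const mux = WebPMuxNew();
  if (mux == NULL) return NULL;
  {
    WebPData image;
    image.bytes = data;
    image.size = size;
    if (WebPMuxSetImage(mux, &image, 1 /* copy_data */) != WEBP_MUX_OK) {
      WebPMuxDelete(mux);
      return NULL;
    }
  }
  return mux;
}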
// Adds a frame at the end of the mux object.
-// Notes: (1) frame.id should be one of WEBP_CHUNK_ANMF or WEBP_CHUNK_FRGM
-// (2) For setting a non-animated non-fragmented image, use
-// WebPMuxSetImage() instead.
+// Notes: (1) frame.id should be WEBP_CHUNK_ANMF
+// (2) For setting a non-animated image, use WebPMuxSetImage() instead.
// (3) The type of the frame being pushed must match that of the frames in mux.
// (4) As WebP only supports even offsets, any odd offset will be snapped
// to an even location using: offset &= ~1
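Pulling notes (1) through (4) together, a hypothetical helper that appends one animation frame:

#include <string.h>
#include "webp/mux.h"

// Append one ANMF frame; the library snaps odd offsets to even ones.
WebPMuxError PushAnimFrame(WebPMux* mux, const WebPData* bitstream,
                           int x, int y, int duration_ms) {
  WebPMuxFrameInfo frame;
  memset(&frame, 0, sizeof(frame));        // clears pad[] as well
  frame.bitstream = *bitstream;
  frame.x_offset = x;                      // snapped via offset &= ~1
  frame.y_offset = y;
  frame.duration = duration_ms;
  frame.id = WEBP_CHUNK_ANMF;              // the only valid frame id now
  frame.dispose_method = WEBP_MUX_DISPOSE_NONE;
  frame.blend_method = WEBP_MUX_BLEND;
  return WebPMuxPushFrame(mux, &frame, 1 /* copy_data */);
}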
@@ -431,9 +430,10 @@ struct WebPAnimEncoderOptions {
// frames in the output. The library may insert some key
// frames as needed to satisfy this criterion.
// Note that these conditions should hold: kmax > kmin
- // and kmin >= kmax / 2 + 1. Also, if kmin == 0, then
- // key-frame insertion is disabled; and if kmax == 0,
- // then all frames will be key-frames.
+ // and kmin >= kmax / 2 + 1. Also, if kmax <= 0, then
+ // key-frame insertion is disabled; and if kmax == 1,
+ // then all frames will be key-frames (kmin value does
+ // not matter for these special cases).
int allow_mixed; // If true, use mixed compression mode; may choose
// either lossy or lossless for each frame.
int verbose; // If true, print info and warning messages to stderr.
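A short sketch of the revised kmin/kmax semantics, assuming the WebPAnimEncoderOptionsInit() initializer declared further down in this header:

#include "webp/mux.h"

// Example key-frame spacing satisfying kmax > kmin and kmin >= kmax/2 + 1.
int ConfigureKeyFrames(WebPAnimEncoderOptions* const opts) {
  if (!WebPAnimEncoderOptionsInit(opts)) return 0;  // ABI version check
  opts->kmin = 9;
  opts->kmax = 17;
  // opts->kmax = 0 would disable key-frame insertion entirely;
  // opts->kmax = 1 would make every frame a key-frame.
  return 1;
}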
diff --git a/src/3rdparty/libwebp/src/webp/mux_types.h b/src/3rdparty/libwebp/src/webp/mux_types.h
index c94043a..b37e2c6 100644
--- a/src/3rdparty/libwebp/src/webp/mux_types.h
+++ b/src/3rdparty/libwebp/src/webp/mux_types.h
@@ -31,12 +31,13 @@ typedef struct WebPData WebPData;
// VP8X Feature Flags.
typedef enum WebPFeatureFlags {
- FRAGMENTS_FLAG = 0x00000001,
ANIMATION_FLAG = 0x00000002,
XMP_FLAG = 0x00000004,
EXIF_FLAG = 0x00000008,
ALPHA_FLAG = 0x00000010,
- ICCP_FLAG = 0x00000020
+ ICCP_FLAG = 0x00000020,
+
+ ALL_VALID_FLAGS = 0x0000003e
} WebPFeatureFlags;
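A sketch showing how the new ALL_VALID_FLAGS mask can be combined with the existing WebPMuxGetFeatures() call to screen VP8X flags; the helper name is illustrative:

#include <stdint.h>
#include "webp/mux.h"

// Returns 1 iff only known feature bits are set; reports the alpha bit.
int HasOnlyKnownFeatures(const WebPMux* mux, int* has_alpha) {
  uint32_t flags = 0;
  if (WebPMuxGetFeatures(mux, &flags) != WEBP_MUX_OK) return 0;
  if ((flags & ~(uint32_t)ALL_VALID_FLAGS) != 0) return 0;  // unknown bit
  *has_alpha = (flags & ALPHA_FLAG) != 0;
  return 1;
}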
// Dispose method (animation only). Indicates how the area used by the current