Update bundled libwebp to version 1.0.0

This commit imports libwebp 1.0.0, including the AUTHORS, COPYING, ChangeLog, NEWS, PATENTS and README files and the src directory. In src, only header and source files are included. Upstream changes since 0.6.1 have been merged in. The version in qt_attribution.json has also been updated. [ChangeLog][Third-Party Code] Update bundled libwebp to version 1.0.0. Change-Id: Ia30ccc90286d5dd3e48e091f101f1cae84785150 Reviewed-by: Kai Koehne <kai.koehne@qt.io> Reviewed-by: Eirik Aavitsland <eirik.aavitsland@qt.io>
author: Liang Qi <liang.qi@qt.io> 2018-05-14 13:07:15 +0200
committer: Liang Qi <liang.qi@qt.io> 2018-05-22 08:03:36 +0000
commit: 62082a63e112e9991b33c2045896ced78ffcb62e (patch)
tree: 04a66f057499c90be0a8abfe8b0375886c6f25df
parent: 92398950d9cfe5a88cb685ec166eb413aa8613ec (diff)
56 files changed, 1693 insertions, 858 deletions
diff --git a/src/3rdparty/libwebp.pri b/src/3rdparty/libwebp.pri
index eba9212..d1f7fdf 100644
--- a/src/3rdparty/libwebp.pri
+++ b/src/3rdparty/libwebp.pri
@@ -67,18 +67,19 @@ SOURCES += \
     $$PWD/libwebp/src/dsp/upsampling_mips_dsp_r2.c \
     $$PWD/libwebp/src/dsp/upsampling_msa.c \
     $$PWD/libwebp/src/dsp/upsampling_sse2.c \
+    $$PWD/libwebp/src/dsp/upsampling_sse41.c \
     $$PWD/libwebp/src/dsp/yuv.c \
     $$PWD/libwebp/src/dsp/yuv_mips_dsp_r2.c \
     $$PWD/libwebp/src/dsp/lossless_sse2.c \
     $$PWD/libwebp/src/dsp/yuv_mips32.c \
     $$PWD/libwebp/src/dsp/yuv_sse2.c \
+    $$PWD/libwebp/src/dsp/yuv_sse41.c \
     $$PWD/libwebp/src/enc/alpha_enc.c \
     $$PWD/libwebp/src/enc/analysis_enc.c \
     $$PWD/libwebp/src/enc/backward_references_cost_enc.c \
     $$PWD/libwebp/src/enc/backward_references_enc.c \
     $$PWD/libwebp/src/enc/config_enc.c \
     $$PWD/libwebp/src/enc/cost_enc.c \
-    $$PWD/libwebp/src/enc/delta_palettization_enc.c \
     $$PWD/libwebp/src/enc/filter_enc.c \
     $$PWD/libwebp/src/enc/frame_enc.c \
     $$PWD/libwebp/src/enc/histogram_enc.c \
diff --git a/src/3rdparty/libwebp/AUTHORS b/src/3rdparty/libwebp/AUTHORS
index b6e9cfb..83c7b9c 100644
--- a/src/3rdparty/libwebp/AUTHORS
+++ b/src/3rdparty/libwebp/AUTHORS
@@ -35,4 +35,5 @@ Contributors:
 - Urvang Joshi (urvang at google dot com)
 - Vikas Arora (vikasa at google dot com)
 - Vincent Rabaud (vrabaud at google dot com)
+- Vlad Tsyrklevich (vtsyrklevich at chromium dot org)
 - Yang Zhang (yang dot zhang at arm dot com)
diff --git a/src/3rdparty/libwebp/ChangeLog b/src/3rdparty/libwebp/ChangeLog
index b17feb2..9fd9acf 100644
--- a/src/3rdparty/libwebp/ChangeLog
+++ b/src/3rdparty/libwebp/ChangeLog
@@ -1,9 +1,92 @@
+8d510751 webp-container-spec: correct frame duration=0 note
+e6b2164e vwebp: Copy Chrome's behavior w/frame duration == 0
+d20b7707 update ChangeLog (tag: v1.0.0-rc3)
+0d5fad46 add WEBP_DSP_INIT / WEBP_DSP_INIT_FUNC
+c1cb86af fix 16b overflow in SSE2
+e577feb7 makefile.unix: add DEBUG flag for compiling w/ debug-symbol
+99be34b3 cwebp,get_disto: fix bpp output
+f5565ca8 cmake: Make sure we use near-lossless by default.
+d898dc14 fix bug in WebPImport565: alpha value was not set
+882784b0 update ChangeLog (tag: v1.0.0-rc2)
+2f930e08 Revert "Use proper targets for CMake."
+8165e8fb Use proper targets for CMake.
+3f157dd5 Remove some very hard TODOs.
+cd758a17 {de,}mux/Makefile.am: add missing headers
+b892b8ba makefile.unix,dist: use ascii for text output
+64a57d05 add -version option to anim_dump,anim_diff and img2webp
+fc1b8e3a webp_js: fix webp_js demo html
+15aa48d9 update ChangeLog (tag: v1.0.0-rc1)
+e607dabc update AUTHORS
+38410c08 [CFI] Remove function pointer casts
+c57b2736 bump version to 1.0.0
+cba28853 update NEWS
+c909d531 Merge "remove some deprecation warning on MacOSX"
+217443c7 remove some deprecation warning on MacOSX
+b672bdfa configure: quiet glut deprecation warnings on OS X
+daa9fcaf configure: use sdl-config if available
+dd174cae Merge "imagedec: support metadata reading for WebP image decoding"
+641cedcc imagedec: support metadata reading for WebP image decoding
+065b2ce1 anim_diff: add a couple missing newlines in Help()
+c4cc1147 Merge "gif2webp: force low duration frames to 100ms"
+09333097 gif2webp: force low duration frames to 100ms
+e03f0ec3 sharp_yuv: use 14b fixed-point precision for gamma
+b2db361c image_enc,WebPWritePNG: move locals after setjmp
+74e82ec6 Merge "WebPPictureDistortion: fix big-endian results order"
+645d04ca Merge "cwebp,get_disto: report bpp"
+120f58c3 Merge "lossless*sse2: improve non-const 16-bit vector creation"
+a7fe9412 WebPPictureDistortion: fix big-endian results order
+e26fe066 cwebp,get_disto: report bpp
+9df64e28 Merge changes Id5b4a1a4,Ia20ce844
+8043504f lossless*sse2: improve non-const 16-bit vector creation
+1e3dfc48 Import: extract condition from loop
+3b07d327 Import,RGBA: fix for BigEndian import
+551948e4 Remove unused argument in VP8LBitsEntropy.
+3005237a ReadWebP: fix for big-endian
+499c395a Merge "anim_diff: expose the -max_diff option"
+f69dcd69 Merge "remove WEBP_EXPERIMENTAL_FEATURES"
+07d884d5 anim_diff: expose the -max_diff option
+f4dd9256 remove WEBP_EXPERIMENTAL_FEATURES
+94a8377b extract the command-line parsing helpers to example_util
+fc09e6e2 PNM decoder: prevent unsupported depth=2 PAM case.
+6de58603 MIPS64: Fix defined-but-not-used errors with WEBP_REDUCE_CSP
+cbde5728 gif2webp: add support for reading from stdin
+cf1c5054 Add an SSE4 version of some lossless color transforms.
+45a8b5eb Fix lint error with man page.
+cff38e8f Merge "PNG decoder: handle gAMA chunk"
+59cb1a48 Merge "enable dc error-diffusion always"
+78318b30 PNG decoder: handle gAMA chunk
+664c21dd Merge "remove some TODOs"
+815652de enable dc error-diffusion always
+aec45cec remove some TODOs
+5715dfce fix block-count[] increment in case of large image
+c2d04f3e enable DC error-diffusion always for multi-pass
+96bf07c5 use DC error diffusion for U/V at low-quality
+1c59020b fix missing sse41 targets in makefile.unix
+7a8e814b cosmetics: s/color_space/colorspace/
+05f6fe24 upsampling: rm asserts w/REDUCE_CSP+OMIT_C_CODE
+b4cf5597 Merge "Upsampling SSE2/SSE4 speedup."
+ccbeb32c Makefile.vc: add missing sse41 files
+55403a9a Upsampling SSE2/SSE4 speedup.
+807b53c4 Implement the upsampling/yuv functions in SSE41
+84101a81 Fix wasm WebP compilation
+8bebd2a3 fix warning on MSVC
+a7f93fe3 webpmux: allow reading argument from a file
+b69f18a7 gif2webp.1: fix -loop_compatibility layout
+72d530c0 Merge "fix lossless decoding w/WEBP_REDUCE_SIZE"
+296c7dc4 fix lossless decoding w/WEBP_REDUCE_SIZE
+0d5d029c Merge "ImgIoUtilReadFile: fix file leak upon error"
+ae568ce7 ImgIoUtilReadFile: fix file leak upon error
+796b5a8a Merge tag 'v0.6.1'
+6b7a95fd update ChangeLog (tag: v0.6.1)
 f66955de WEBP_REDUCE_CSP: restrict colorspace support
+1af0df76 Merge "WEBP_REDUCE_CSP: restrict colorspace support"
+6de20df0 WEBP_REDUCE_CSP: restrict colorspace support
 a289d8e7 update ChangeLog (tag: v0.6.1-rc2)
 c10a493c vwebp: disable double buffering on windows & mac
 0d4466c2 webp_to_sdl.c: fix file mode
 1b27bf8b WEBP_REDUCE_SIZE: disable all rescaler code
 126be109 webpinfo: add -version option
+0df22b9e WEBP_REDUCE_SIZE: disable all rescaler code
 9add62b5 bump version to 0.6.1
 d3e26144 update NEWS
 2edda639 README: add webpinfo section
diff --git a/src/3rdparty/libwebp/NEWS b/src/3rdparty/libwebp/NEWS
index 85d273e..480cb7d 100644
--- a/src/3rdparty/libwebp/NEWS
+++ b/src/3rdparty/libwebp/NEWS
@@ -1,3 +1,13 @@
+- 4/2/2018: version 1.0.0
+  This is a binary compatible release.
+  * lossy encoder improvements to avoid chroma shifts in various circumstances
+    (issues #308, #340)
+  * big-endian fixes for decode, RGBA import and WebPPictureDistortion
+  Tool updates:
+    gif2webp, anim_diff - default duration behavior (<= 10ms) changed to match
+                          web browsers, transcoding tools (issue #379)
+    img2webp, webpmux - allow options to be passed in via a file (issue #355)
+
 - 11/24/2017: version 0.6.1
   This is a binary compatible release.
   * lossless performance and compression improvements + a new 'cruncher' mode
diff --git a/src/3rdparty/libwebp/README b/src/3rdparty/libwebp/README
index e9817bf..a76b378 100644
--- a/src/3rdparty/libwebp/README
+++ b/src/3rdparty/libwebp/README
@@ -4,7 +4,7 @@
           \__\__/\____/\_____/__/ ____  ___
                 / _/ /    \    \ /  _ \/ _/
                /  \_/   / /   \ \   __/  \__
-               \____/____/\_____/_____/____/v0.6.1
+               \____/____/\_____/_____/____/v1.0.0
 
 Description:
 ============
@@ -458,6 +458,7 @@ File-level options (only used at the start of compression):
  -mixed ............... use mixed lossy/lossless automatic mode
  -v ................... verbose mode
  -h ................... this help
+ -version ............. print version number and exit
 
 Per-frame options (only used for subsequent images input):
  -d <int> ............. frame duration in ms (default: 100)
@@ -524,6 +525,11 @@ Options:
   -min_psnr <float> ... minimum per-frame PSNR
   -raw_comparison ..... if this flag is not used, RGB is
                         premultiplied before comparison
+  -max_diff <int> ..... maximum allowed difference per channel
+                        between corresponding pixels in subsequent
+                        frames
+  -h .................. this help
+  -version ............ print version number and exit
 
 Building:
 ---------
diff --git a/src/3rdparty/libwebp/qt_attribution.json b/src/3rdparty/libwebp/qt_attribution.json
index 70165be..ca2c361 100644
--- a/src/3rdparty/libwebp/qt_attribution.json
+++ b/src/3rdparty/libwebp/qt_attribution.json
@@ -6,7 +6,7 @@
 
     "Description": "WebP is a new image format that provides lossless and lossy compression for images on the web.",
     "Homepage": "https://developers.google.com/speed/webp/",
-    "Version": "0.6.1",
+    "Version": "1.0.0",
     "License": "BSD 3-clause \"New\" or \"Revised\" License",
     "LicenseId": "BSD-3-Clause",
     "LicenseFile": "COPYING",
diff --git a/src/3rdparty/libwebp/src/dec/frame_dec.c b/src/3rdparty/libwebp/src/dec/frame_dec.c
index 517d0f5..a9d5430 100644
--- a/src/3rdparty/libwebp/src/dec/frame_dec.c
+++ b/src/3rdparty/libwebp/src/dec/frame_dec.c
@@ -400,7 +400,9 @@ static void DitherRow(VP8Decoder* const dec) {
 #define MACROBLOCK_VPOS(mb_y)  ((mb_y) * 16)    // vertical position of a MB
 
 // Finalize and transmit a complete row. Return false in case of user-abort.
-static int FinishRow(VP8Decoder* const dec, VP8Io* const io) {
+static int FinishRow(void* arg1, void* arg2) {
+  VP8Decoder* const dec = (VP8Decoder*)arg1;
+  VP8Io* const io = (VP8Io*)arg2;
   int ok = 1;
   const VP8ThreadContext* const ctx = &dec->thread_ctx_;
   const int cache_id = ctx->id_;
@@ -448,10 +450,9 @@ static int FinishRow(VP8Decoder* const dec, VP8Io* const io) {
     if (y_end > io->crop_bottom) {
       y_end = io->crop_bottom;    // make sure we don't overflow on last row.
     }
+    // If dec->alpha_data_ is not NULL, we have some alpha plane present.
     io->a = NULL;
     if (dec->alpha_data_ != NULL && y_start < y_end) {
-      // TODO(skal): testing presence of alpha with dec->alpha_data_ is not a
-      // good idea.
       io->a = VP8DecompressAlphaRows(dec, io, y_start, y_end - y_start);
       if (io->a == NULL) {
         return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
@@ -558,7 +559,6 @@ VP8StatusCode VP8EnterCritical(VP8Decoder* const dec, VP8Io* const io) {
   if (io->bypass_filtering) {
     dec->filter_type_ = 0;
   }
-  // TODO(skal): filter type / strength / sharpness forcing
 
   // Define the area where we can skip in-loop filtering, in case of cropping.
   //
@@ -569,8 +569,6 @@ VP8StatusCode VP8EnterCritical(VP8Decoder* const dec, VP8Io* const io) {
   // Means: there's a dependency chain that goes all the way up to the
   // top-left corner of the picture (MB #0). We must filter all the previous
   // macroblocks.
-  // TODO(skal): add an 'approximate_decoding' option, that won't produce
-  // a 1:1 bit-exactness for complex filtering?
   {
     const int extra_pixels = kFilterExtraRows[dec->filter_type_];
     if (dec->filter_type_ == 2) {
@@ -651,7 +649,7 @@ static int InitThreadContext(VP8Decoder* const dec) {
     }
     worker->data1 = dec;
     worker->data2 = (void*)&dec->thread_ctx_.io_;
-    worker->hook = (WebPWorkerHook)FinishRow;
+    worker->hook = FinishRow;
     dec->num_caches_ =
       (dec->filter_type_ > 0) ? MT_CACHE_LINES : MT_CACHE_LINES - 1;
   } else {
diff --git a/src/3rdparty/libwebp/src/dec/vp8_dec.c b/src/3rdparty/libwebp/src/dec/vp8_dec.c
index 6212efd..c904b52 100644
--- a/src/3rdparty/libwebp/src/dec/vp8_dec.c
+++ b/src/3rdparty/libwebp/src/dec/vp8_dec.c
@@ -491,7 +491,7 @@ static int GetCoeffsAlt(VP8BitReader* const br,
   return 16;
 }
 
-WEBP_TSAN_IGNORE_FUNCTION static void InitGetCoeffs(void) {
+static WEBP_TSAN_IGNORE_FUNCTION void InitGetCoeffs(void) {
   if (GetCoeffs == NULL) {
     if (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kSlowSSSE3)) {
       GetCoeffs = GetCoeffsAlt;
diff --git a/src/3rdparty/libwebp/src/dec/vp8i_dec.h b/src/3rdparty/libwebp/src/dec/vp8i_dec.h
index 28244d9..c929933 100644
--- a/src/3rdparty/libwebp/src/dec/vp8i_dec.h
+++ b/src/3rdparty/libwebp/src/dec/vp8i_dec.h
@@ -30,9 +30,9 @@ extern "C" {
 // Various defines and enums
 
 // version numbers
-#define DEC_MAJ_VERSION 0
-#define DEC_MIN_VERSION 6
-#define DEC_REV_VERSION 1
+#define DEC_MAJ_VERSION 1
+#define DEC_MIN_VERSION 0
+#define DEC_REV_VERSION 0
 
 // YUV-cache parameters. Cache is 32-bytes wide (= one cacheline).
 // Constraints are: We need to store one 16x16 block of luma samples (y),
diff --git a/src/3rdparty/libwebp/src/dec/vp8l_dec.c b/src/3rdparty/libwebp/src/dec/vp8l_dec.c
index 42ea3b5..0570f53 100644
--- a/src/3rdparty/libwebp/src/dec/vp8l_dec.c
+++ b/src/3rdparty/libwebp/src/dec/vp8l_dec.c
@@ -1643,17 +1643,17 @@ int VP8LDecodeImage(VP8LDecoder* const dec) {
 
 #if !defined(WEBP_REDUCE_SIZE)
     if (io->use_scaling && !AllocateAndInitRescaler(dec, io)) goto Err;
-
-    if (io->use_scaling || WebPIsPremultipliedMode(dec->output_->colorspace)) {
-      // need the alpha-multiply functions for premultiplied output or rescaling
-      WebPInitAlphaProcessing();
-    }
 #else
     if (io->use_scaling) {
       dec->status_ = VP8_STATUS_INVALID_PARAM;
       goto Err;
     }
 #endif
+    if (io->use_scaling || WebPIsPremultipliedMode(dec->output_->colorspace)) {
+      // need the alpha-multiply functions for premultiplied output or rescaling
+      WebPInitAlphaProcessing();
+    }
+
     if (!WebPIsRGBMode(dec->output_->colorspace)) {
       WebPInitConvertARGBToYUV();
       if (dec->output_->u.YUVA.a != NULL) WebPInitAlphaProcessing();
diff --git a/src/3rdparty/libwebp/src/demux/demux.c b/src/3rdparty/libwebp/src/demux/demux.c
index 79c24a5..684215e 100644
--- a/src/3rdparty/libwebp/src/demux/demux.c
+++ b/src/3rdparty/libwebp/src/demux/demux.c
@@ -23,9 +23,9 @@
 #include "src/webp/demux.h"
 #include "src/webp/format_constants.h"
 
-#define DMUX_MAJ_VERSION 0
-#define DMUX_MIN_VERSION 3
-#define DMUX_REV_VERSION 3
+#define DMUX_MAJ_VERSION 1
+#define DMUX_MIN_VERSION 0
+#define DMUX_REV_VERSION 0
 
 typedef struct {
   size_t start_;        // start location of the data
diff --git a/src/3rdparty/libwebp/src/dsp/alpha_processing.c b/src/3rdparty/libwebp/src/dsp/alpha_processing.c
index 590e3bc..819d139 100644
--- a/src/3rdparty/libwebp/src/dsp/alpha_processing.c
+++ b/src/3rdparty/libwebp/src/dsp/alpha_processing.c
@@ -366,6 +366,16 @@ static WEBP_INLINE uint32_t MakeARGB32(int a, int r, int g, int b) {
   return (((uint32_t)a << 24) | (r << 16) | (g << 8) | b);
 }
 
+#ifdef WORDS_BIGENDIAN
+static void PackARGB_C(const uint8_t* a, const uint8_t* r, const uint8_t* g,
+                       const uint8_t* b, int len, uint32_t* out) {
+  int i;
+  for (i = 0; i < len; ++i) {
+    out[i] = MakeARGB32(a[4 * i], r[4 * i], g[4 * i], b[4 * i]);
+  }
+}
+#endif
+
 static void PackRGB_C(const uint8_t* r, const uint8_t* g, const uint8_t* b,
                       int len, int step, uint32_t* out) {
   int i, offset = 0;
@@ -381,6 +391,10 @@ int (*WebPDispatchAlpha)(const uint8_t*, int, int, int, uint8_t*, int);
 void (*WebPDispatchAlphaToGreen)(const uint8_t*, int, int, int, uint32_t*, int);
 int (*WebPExtractAlpha)(const uint8_t*, int, int, int, uint8_t*, int);
 void (*WebPExtractGreen)(const uint32_t* argb, uint8_t* alpha, int size);
+#ifdef WORDS_BIGENDIAN
+void (*WebPPackARGB)(const uint8_t* a, const uint8_t* r, const uint8_t* g,
+                     const uint8_t* b, int, uint32_t*);
+#endif
 void (*WebPPackRGB)(const uint8_t* r, const uint8_t* g, const uint8_t* b,
                     int len, int step, uint32_t* out);
 
@@ -395,16 +409,14 @@ extern void WebPInitAlphaProcessingSSE2(void);
 extern void WebPInitAlphaProcessingSSE41(void);
 extern void WebPInitAlphaProcessingNEON(void);
 
-static volatile VP8CPUInfo alpha_processing_last_cpuinfo_used =
-    (VP8CPUInfo)&alpha_processing_last_cpuinfo_used;
-
-WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessing(void) {
-  if (alpha_processing_last_cpuinfo_used == VP8GetCPUInfo) return;
-
+WEBP_DSP_INIT_FUNC(WebPInitAlphaProcessing) {
   WebPMultARGBRow = WebPMultARGBRow_C;
   WebPMultRow = WebPMultRow_C;
   WebPApplyAlphaMultiply4444 = ApplyAlphaMultiply_16b_C;
 
+#ifdef WORDS_BIGENDIAN
+  WebPPackARGB = PackARGB_C;
+#endif
   WebPPackRGB = PackRGB_C;
 #if !WEBP_NEON_OMIT_C_CODE
   WebPApplyAlphaMultiply = ApplyAlphaMultiply_C;
@@ -451,9 +463,10 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessing(void) {
   assert(WebPDispatchAlphaToGreen != NULL);
   assert(WebPExtractAlpha != NULL);
   assert(WebPExtractGreen != NULL);
+#ifdef WORDS_BIGENDIAN
+  assert(WebPPackARGB != NULL);
+#endif
   assert(WebPPackRGB != NULL);
   assert(WebPHasAlpha8b != NULL);
   assert(WebPHasAlpha32b != NULL);
-
-  alpha_processing_last_cpuinfo_used = VP8GetCPUInfo;
 }
diff --git a/src/3rdparty/libwebp/src/dsp/alpha_processing_mips_dsp_r2.c b/src/3rdparty/libwebp/src/dsp/alpha_processing_mips_dsp_r2.c
index e0dc91b..0090e87 100644
--- a/src/3rdparty/libwebp/src/dsp/alpha_processing_mips_dsp_r2.c
+++ b/src/3rdparty/libwebp/src/dsp/alpha_processing_mips_dsp_r2.c
@@ -125,6 +125,49 @@ static void MultARGBRow_MIPSdspR2(uint32_t* const ptr, int width,
   }
 }
 
+#ifdef WORDS_BIGENDIAN
+static void PackARGB_MIPSdspR2(const uint8_t* a, const uint8_t* r,
+                               const uint8_t* g, const uint8_t* b, int len,
+                               uint32_t* out) {
+  int temp0, temp1, temp2, temp3, offset;
+  const int rest = len & 1;
+  const uint32_t* const loop_end = out + len - rest;
+  const int step = 4;
+  __asm__ volatile (
+    "xor          %[offset],   %[offset], %[offset]    \n\t"
+    "beq          %[loop_end], %[out],    0f           \n\t"
+  "2:                                                  \n\t"
+    "lbux         %[temp0],    %[offset](%[a])         \n\t"
+    "lbux         %[temp1],    %[offset](%[r])         \n\t"
+    "lbux         %[temp2],    %[offset](%[g])         \n\t"
+    "lbux         %[temp3],    %[offset](%[b])         \n\t"
+    "ins          %[temp1],    %[temp0],  16,     16   \n\t"
+    "ins          %[temp3],    %[temp2],  16,     16   \n\t"
+    "addiu        %[out],      %[out],    4            \n\t"
+    "precr.qb.ph  %[temp0],    %[temp1],  %[temp3]     \n\t"
+    "sw           %[temp0],    -4(%[out])              \n\t"
+    "addu         %[offset],   %[offset], %[step]      \n\t"
+    "bne          %[loop_end], %[out],    2b           \n\t"
+  "0:                                                  \n\t"
+    "beq          %[rest],     $zero,     1f           \n\t"
+    "lbux         %[temp0],    %[offset](%[a])         \n\t"
+    "lbux         %[temp1],    %[offset](%[r])         \n\t"
+    "lbux         %[temp2],    %[offset](%[g])         \n\t"
+    "lbux         %[temp3],    %[offset](%[b])         \n\t"
+    "ins          %[temp1],    %[temp0],  16,     16   \n\t"
+    "ins          %[temp3],    %[temp2],  16,     16   \n\t"
+    "precr.qb.ph  %[temp0],    %[temp1],  %[temp3]     \n\t"
+    "sw           %[temp0],    0(%[out])               \n\t"
+  "1:                                                  \n\t"
+    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),
+      [temp3]"=&r"(temp3), [offset]"=&r"(offset), [out]"+&r"(out)
+    : [a]"r"(a), [r]"r"(r), [g]"r"(g), [b]"r"(b), [step]"r"(step),
+      [loop_end]"r"(loop_end), [rest]"r"(rest)
+    : "memory"
+  );
+}
+#endif  // WORDS_BIGENDIAN
+
 static void PackRGB_MIPSdspR2(const uint8_t* r, const uint8_t* g,
                               const uint8_t* b, int len, int step,
                               uint32_t* out) {
@@ -172,6 +215,9 @@ extern void WebPInitAlphaProcessingMIPSdspR2(void);
 WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingMIPSdspR2(void) {
   WebPDispatchAlpha = DispatchAlpha_MIPSdspR2;
   WebPMultARGBRow = MultARGBRow_MIPSdspR2;
+#ifdef WORDS_BIGENDIAN
+  WebPPackARGB = PackARGB_MIPSdspR2;
+#endif
   WebPPackRGB = PackRGB_MIPSdspR2;
 }
 
diff --git a/src/3rdparty/libwebp/src/dsp/common_sse2.h b/src/3rdparty/libwebp/src/dsp/common_sse2.h
index 995d7cf..e9f1ebf 100644
--- a/src/3rdparty/libwebp/src/dsp/common_sse2.h
+++ b/src/3rdparty/libwebp/src/dsp/common_sse2.h
@@ -128,9 +128,9 @@ static WEBP_INLINE void VP8Transpose_2_4x4_16b(
 // Pack the planar buffers
 // rrrr... rrrr... gggg... gggg... bbbb... bbbb....
 // triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ...
-static WEBP_INLINE void VP8PlanarTo24b(__m128i* const in0, __m128i* const in1,
-                                       __m128i* const in2, __m128i* const in3,
-                                       __m128i* const in4, __m128i* const in5) {
+static WEBP_INLINE void VP8PlanarTo24b_SSE2(
+    __m128i* const in0, __m128i* const in1, __m128i* const in2,
+    __m128i* const in3, __m128i* const in4, __m128i* const in5) {
   // The input is 6 registers of sixteen 8b but for the sake of explanation,
   // let's take 6 registers of four 8b values.
   // To pack, we will keep taking one every two 8b integer and move it
@@ -159,10 +159,10 @@ static WEBP_INLINE void VP8PlanarTo24b(__m128i* const in0, __m128i* const in1,
 
 // Convert four packed four-channel buffers like argbargbargbargb... into the
 // split channels aaaaa ... rrrr ... gggg .... bbbbb ......
-static WEBP_INLINE void VP8L32bToPlanar(__m128i* const in0,
-                                        __m128i* const in1,
-                                        __m128i* const in2,
-                                        __m128i* const in3) {
+static WEBP_INLINE void VP8L32bToPlanar_SSE2(__m128i* const in0,
+                                             __m128i* const in1,
+                                             __m128i* const in2,
+                                             __m128i* const in3) {
   // Column-wise transpose.
   const __m128i A0 = _mm_unpacklo_epi8(*in0, *in1);
   const __m128i A1 = _mm_unpackhi_epi8(*in0, *in1);
diff --git a/src/3rdparty/libwebp/src/dsp/common_sse41.h b/src/3rdparty/libwebp/src/dsp/common_sse41.h
new file mode 100644
index 0000000..2f173c0
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/common_sse41.h
@@ -0,0 +1,132 @@
+// Copyright 2016 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// SSE4 code common to several files.
+//
+// Author: Vincent Rabaud (vrabaud@google.com)
+
+#ifndef WEBP_DSP_COMMON_SSE41_H_
+#define WEBP_DSP_COMMON_SSE41_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(WEBP_USE_SSE41)
+#include <smmintrin.h>
+
+//------------------------------------------------------------------------------
+// Channel mixing.
+// Shuffles the input buffer as A0 0 0 A1 0 0 A2 ...
+#define WEBP_SSE41_SHUFF(OUT, IN0, IN1)    \
+  OUT##0 = _mm_shuffle_epi8(*IN0, shuff0); \
+  OUT##1 = _mm_shuffle_epi8(*IN0, shuff1); \
+  OUT##2 = _mm_shuffle_epi8(*IN0, shuff2); \
+  OUT##3 = _mm_shuffle_epi8(*IN1, shuff0); \
+  OUT##4 = _mm_shuffle_epi8(*IN1, shuff1); \
+  OUT##5 = _mm_shuffle_epi8(*IN1, shuff2);
+
+// Pack the planar buffers
+// rrrr... rrrr... gggg... gggg... bbbb... bbbb....
+// triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ...
+static WEBP_INLINE void VP8PlanarTo24b_SSE41(
+    __m128i* const in0, __m128i* const in1, __m128i* const in2,
+    __m128i* const in3, __m128i* const in4, __m128i* const in5) {
+  __m128i R0, R1, R2, R3, R4, R5;
+  __m128i G0, G1, G2, G3, G4, G5;
+  __m128i B0, B1, B2, B3, B4, B5;
+
+  // Process R.
+  {
+    const __m128i shuff0 = _mm_set_epi8(
+        5, -1, -1, 4, -1, -1, 3, -1, -1, 2, -1, -1, 1, -1, -1, 0);
+    const __m128i shuff1 = _mm_set_epi8(
+        -1, 10, -1, -1, 9, -1, -1, 8, -1, -1, 7, -1, -1, 6, -1, -1);
+    const __m128i shuff2 = _mm_set_epi8(
+     -1, -1, 15, -1, -1, 14, -1, -1, 13, -1, -1, 12, -1, -1, 11, -1);
+    WEBP_SSE41_SHUFF(R, in0, in1)
+  }
+
+  // Process G.
+  {
+    // Same as before, just shifted to the left by one and including the right
+    // padding.
+    const __m128i shuff0 = _mm_set_epi8(
+        -1, -1, 4, -1, -1, 3, -1, -1, 2, -1, -1, 1, -1, -1, 0, -1);
+    const __m128i shuff1 = _mm_set_epi8(
+        10, -1, -1, 9, -1, -1, 8, -1, -1, 7, -1, -1, 6, -1, -1, 5);
+    const __m128i shuff2 = _mm_set_epi8(
+     -1, 15, -1, -1, 14, -1, -1, 13, -1, -1, 12, -1, -1, 11, -1, -1);
+    WEBP_SSE41_SHUFF(G, in2, in3)
+  }
+
+  // Process B.
+  {
+    const __m128i shuff0 = _mm_set_epi8(
+        -1, 4, -1, -1, 3, -1, -1, 2, -1, -1, 1, -1, -1, 0, -1, -1);
+    const __m128i shuff1 = _mm_set_epi8(
+        -1, -1, 9, -1, -1, 8, -1, -1, 7, -1, -1, 6, -1, -1, 5, -1);
+    const __m128i shuff2 = _mm_set_epi8(
+      15, -1, -1, 14, -1, -1, 13, -1, -1, 12, -1, -1, 11, -1, -1, 10);
+    WEBP_SSE41_SHUFF(B, in4, in5)
+  }
+
+  // OR the different channels.
+  {
+    const __m128i RG0 = _mm_or_si128(R0, G0);
+    const __m128i RG1 = _mm_or_si128(R1, G1);
+    const __m128i RG2 = _mm_or_si128(R2, G2);
+    const __m128i RG3 = _mm_or_si128(R3, G3);
+    const __m128i RG4 = _mm_or_si128(R4, G4);
+    const __m128i RG5 = _mm_or_si128(R5, G5);
+    *in0 = _mm_or_si128(RG0, B0);
+    *in1 = _mm_or_si128(RG1, B1);
+    *in2 = _mm_or_si128(RG2, B2);
+    *in3 = _mm_or_si128(RG3, B3);
+    *in4 = _mm_or_si128(RG4, B4);
+    *in5 = _mm_or_si128(RG5, B5);
+  }
+}
+
+#undef WEBP_SSE41_SHUFF
+
+// Convert four packed four-channel buffers like argbargbargbargb... into the
+// split channels aaaaa ... rrrr ... gggg .... bbbbb ......
+static WEBP_INLINE void VP8L32bToPlanar_SSE41(__m128i* const in0,
+                                              __m128i* const in1,
+                                              __m128i* const in2,
+                                              __m128i* const in3) {
+  // aaaarrrrggggbbbb
+  const __m128i shuff0 =
+      _mm_set_epi8(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
+  const __m128i A0 = _mm_shuffle_epi8(*in0, shuff0);
+  const __m128i A1 = _mm_shuffle_epi8(*in1, shuff0);
+  const __m128i A2 = _mm_shuffle_epi8(*in2, shuff0);
+  const __m128i A3 = _mm_shuffle_epi8(*in3, shuff0);
+  // A0A1R0R1
+  // G0G1B0B1
+  // A2A3R2R3
+  // G2G3B2B3
+  const __m128i B0 = _mm_unpacklo_epi32(A0, A1);
+  const __m128i B1 = _mm_unpackhi_epi32(A0, A1);
+  const __m128i B2 = _mm_unpacklo_epi32(A2, A3);
+  const __m128i B3 = _mm_unpackhi_epi32(A2, A3);
+  *in3 = _mm_unpacklo_epi64(B0, B2);
+  *in2 = _mm_unpackhi_epi64(B0, B2);
+  *in1 = _mm_unpacklo_epi64(B1, B3);
+  *in0 = _mm_unpackhi_epi64(B1, B3);
+}
+
+#endif  // WEBP_USE_SSE41
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // WEBP_DSP_COMMON_SSE41_H_
diff --git a/src/3rdparty/libwebp/src/dsp/cost.c b/src/3rdparty/libwebp/src/dsp/cost.c
index a732389..634ccc2 100644
--- a/src/3rdparty/libwebp/src/dsp/cost.c
+++ b/src/3rdparty/libwebp/src/dsp/cost.c
@@ -378,12 +378,7 @@ extern void VP8EncDspCostInitMIPS32(void);
 extern void VP8EncDspCostInitMIPSdspR2(void);
 extern void VP8EncDspCostInitSSE2(void);
 
-static volatile VP8CPUInfo cost_last_cpuinfo_used =
-    (VP8CPUInfo)&cost_last_cpuinfo_used;
-
-WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInit(void) {
-  if (cost_last_cpuinfo_used == VP8GetCPUInfo) return;
-
+WEBP_DSP_INIT_FUNC(VP8EncDspCostInit) {
   VP8GetResidualCost = GetResidualCost_C;
   VP8SetResidualCoeffs = SetResidualCoeffs_C;
 
@@ -405,8 +400,6 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspCostInit(void) {
     }
 #endif
   }
-
-  cost_last_cpuinfo_used = VP8GetCPUInfo;
 }
 
 //------------------------------------------------------------------------------
diff --git a/src/3rdparty/libwebp/src/dsp/cpu.c b/src/3rdparty/libwebp/src/dsp/cpu.c
index 6f42cbb..8b40fee 100644
--- a/src/3rdparty/libwebp/src/dsp/cpu.c
+++ b/src/3rdparty/libwebp/src/dsp/cpu.c
@@ -18,7 +18,7 @@
 #include <string.h>
 #endif
 
-#if defined(WEBP_ANDROID_NEON) && !defined(Q_OS_ANDROID_EMBEDDED)
+#if defined(WEBP_ANDROID_NEON)
 #include <cpu-features.h>
 #endif
 
@@ -168,7 +168,7 @@ static int x86CPUInfo(CPUFeature feature) {
   return 0;
 }
 VP8CPUInfo VP8GetCPUInfo = x86CPUInfo;
-#elif defined(WEBP_ANDROID_NEON) && !defined(Q_OS_ANDROID_EMBEDDED) // NB: needs to be before generic NEON test.
+#elif defined(WEBP_ANDROID_NEON)  // NB: needs to be before generic NEON test.
 static int AndroidCPUInfo(CPUFeature feature) {
   const AndroidCpuFamily cpu_family = android_getCpuFamily();
   const uint64_t cpu_features = android_getCpuFeatures();
diff --git a/src/3rdparty/libwebp/src/dsp/dec.c b/src/3rdparty/libwebp/src/dsp/dec.c
index 7e82407..1119842 100644
--- a/src/3rdparty/libwebp/src/dsp/dec.c
+++ b/src/3rdparty/libwebp/src/dsp/dec.c
@@ -741,12 +741,7 @@ extern void VP8DspInitMIPS32(void);
 extern void VP8DspInitMIPSdspR2(void);
 extern void VP8DspInitMSA(void);
 
-static volatile VP8CPUInfo dec_last_cpuinfo_used =
-    (VP8CPUInfo)&dec_last_cpuinfo_used;
-
-WEBP_TSAN_IGNORE_FUNCTION void VP8DspInit(void) {
-  if (dec_last_cpuinfo_used == VP8GetCPUInfo) return;
-
+WEBP_DSP_INIT_FUNC(VP8DspInit) {
   VP8InitClipTables();
 
 #if !WEBP_NEON_OMIT_C_CODE
@@ -889,6 +884,4 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8DspInit(void) {
   assert(VP8PredChroma8[5] != NULL);
   assert(VP8PredChroma8[6] != NULL);
   assert(VP8DitherCombine8x8 != NULL);
-
-  dec_last_cpuinfo_used = VP8GetCPUInfo;
 }
diff --git a/src/3rdparty/libwebp/src/dsp/dsp.h b/src/3rdparty/libwebp/src/dsp/dsp.h
index 5b703a1..4ab77a5 100644
--- a/src/3rdparty/libwebp/src/dsp/dsp.h
+++ b/src/3rdparty/libwebp/src/dsp/dsp.h
@@ -60,8 +60,7 @@ extern "C" {
 #endif
 
 #if defined(_MSC_VER) && _MSC_VER >= 1500 && \
-    (defined(_M_X64) || defined(_M_IX86)) && \
-    !defined(__clang__)
+    (defined(_M_X64) || defined(_M_IX86))
 #define WEBP_MSC_SSE41  // Visual C++ SSE4.1 targets
 #endif
 
@@ -142,6 +141,42 @@ extern "C" {
 #endif
 #endif
 
+#if defined(WEBP_USE_THREAD) && !defined(_WIN32)
+#include <pthread.h>  // NOLINT
+// pthread build: the CPU-info check and func() run under a static mutex.
+#define WEBP_DSP_INIT(func) do {                                    \
+  static volatile VP8CPUInfo func ## _last_cpuinfo_used =           \
+      (VP8CPUInfo)&func ## _last_cpuinfo_used;                      \
+  static pthread_mutex_t func ## _lock = PTHREAD_MUTEX_INITIALIZER; \
+  if (pthread_mutex_lock(&func ## _lock)) break;  /* lock failed */ \
+  if (func ## _last_cpuinfo_used != VP8GetCPUInfo) func();          \
+  func ## _last_cpuinfo_used = VP8GetCPUInfo;                       \
+  (void)pthread_mutex_unlock(&func ## _lock);                       \
+} while (0)
+#else  // !(defined(WEBP_USE_THREAD) && !defined(_WIN32))
+#define WEBP_DSP_INIT(func) do {                                    \
+  static volatile VP8CPUInfo func ## _last_cpuinfo_used =           \
+      (VP8CPUInfo)&func ## _last_cpuinfo_used;                      \
+  if (func ## _last_cpuinfo_used == VP8GetCPUInfo) break;           \
+  func();                                                           \
+  func ## _last_cpuinfo_used = VP8GetCPUInfo;                       \
+} while (0)
+#endif  // defined(WEBP_USE_THREAD) && !defined(_WIN32)
+
+// Defines an Init function and its '_body' helper. Init runs the helper via
+// WEBP_DSP_INIT, which skips the call while VP8GetCPUInfo is unchanged.
+/* Usage:
+   WEBP_DSP_INIT_FUNC(InitFunc) {
+     ...function body
+   }
+*/
+#define WEBP_DSP_INIT_FUNC(name)                             \
+  static WEBP_TSAN_IGNORE_FUNCTION void name ## _body(void); \
+  WEBP_TSAN_IGNORE_FUNCTION void name(void) {                \
+    WEBP_DSP_INIT(name ## _body);                            \
+  }                                                          \
+  static WEBP_TSAN_IGNORE_FUNCTION void name ## _body(void)
+
 #define WEBP_UBSAN_IGNORE_UNDEF
 #define WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW
 #if defined(__clang__) && defined(__has_attribute)
@@ -167,6 +202,13 @@ extern "C" {
 #define WEBP_SWAP_16BIT_CSP 0
 #endif
 
+// Define WORDS_BIGENDIAN when missing (e.g.: mips-gcc lacks __BIG_ENDIAN__)
+#if !defined(WORDS_BIGENDIAN) && \
+    (defined(__BIG_ENDIAN__) || defined(_M_PPC) || \
+     (defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)))
+#define WORDS_BIGENDIAN
+#endif
+
 typedef enum {
   kSSE2,
   kSSE3,
@@ -190,7 +232,7 @@ WEBP_EXTERN VP8CPUInfo VP8GetCPUInfo;
 // avoiding a compiler warning.
 #define WEBP_DSP_INIT_STUB(func) \
   extern void func(void); \
-  WEBP_TSAN_IGNORE_FUNCTION void func(void) {}
+  void func(void) {}
 
 //------------------------------------------------------------------------------
 // Encoding
@@ -579,6 +621,13 @@ void WebPMultRow_C(uint8_t* const ptr, const uint8_t* const alpha,
                    int width, int inverse);
 void WebPMultARGBRow_C(uint32_t* const ptr, int width, int inverse);
 
+#ifdef WORDS_BIGENDIAN
+// ARGB packing function: a/r/g/b input is rgba or bgra order.
+extern void (*WebPPackARGB)(const uint8_t* a, const uint8_t* r,
+                            const uint8_t* g, const uint8_t* b, int len,
+                            uint32_t* out);
+#endif
+
 // RGB packing function. 'step' can be 3 or 4. r/g/b input is rgb or bgr order.
 extern void (*WebPPackRGB)(const uint8_t* r, const uint8_t* g, const uint8_t* b,
                            int len, int step, uint32_t* out);
diff --git a/src/3rdparty/libwebp/src/dsp/enc.c b/src/3rdparty/libwebp/src/dsp/enc.c
index 1c807f1..fa23b40 100644
--- a/src/3rdparty/libwebp/src/dsp/enc.c
+++ b/src/3rdparty/libwebp/src/dsp/enc.c
@@ -740,12 +740,7 @@ extern void VP8EncDspInitMIPS32(void);
 extern void VP8EncDspInitMIPSdspR2(void);
 extern void VP8EncDspInitMSA(void);
 
-static volatile VP8CPUInfo enc_last_cpuinfo_used =
-    (VP8CPUInfo)&enc_last_cpuinfo_used;
-
-WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInit(void) {
-  if (enc_last_cpuinfo_used == VP8GetCPUInfo) return;
-
+WEBP_DSP_INIT_FUNC(VP8EncDspInit) {
   VP8DspInit();  // common inverse transforms
   InitTables();
 
@@ -838,6 +833,4 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInit(void) {
   assert(VP8EncQuantizeBlockWHT != NULL);
   assert(VP8Copy4x4 != NULL);
   assert(VP8Copy16x8 != NULL);
-
-  enc_last_cpuinfo_used = VP8GetCPUInfo;
 }
diff --git a/src/3rdparty/libwebp/src/dsp/filters.c b/src/3rdparty/libwebp/src/dsp/filters.c
index ca5f877..069a22e 100644
--- a/src/3rdparty/libwebp/src/dsp/filters.c
+++ b/src/3rdparty/libwebp/src/dsp/filters.c
@@ -238,12 +238,7 @@ extern void VP8FiltersInitMSA(void);
 extern void VP8FiltersInitNEON(void);
 extern void VP8FiltersInitSSE2(void);
 
-static volatile VP8CPUInfo filters_last_cpuinfo_used =
-    (VP8CPUInfo)&filters_last_cpuinfo_used;
-
-WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInit(void) {
-  if (filters_last_cpuinfo_used == VP8GetCPUInfo) return;
-
+WEBP_DSP_INIT_FUNC(VP8FiltersInit) {
   WebPUnfilters[WEBP_FILTER_NONE] = NULL;
 #if !WEBP_NEON_OMIT_C_CODE
   WebPUnfilters[WEBP_FILTER_HORIZONTAL] = HorizontalUnfilter_C;
@@ -289,6 +284,4 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8FiltersInit(void) {
   assert(WebPFilters[WEBP_FILTER_HORIZONTAL] != NULL);
   assert(WebPFilters[WEBP_FILTER_VERTICAL] != NULL);
   assert(WebPFilters[WEBP_FILTER_GRADIENT] != NULL);
-
-  filters_last_cpuinfo_used = VP8GetCPUInfo;
 }
diff --git a/src/3rdparty/libwebp/src/dsp/lossless.c b/src/3rdparty/libwebp/src/dsp/lossless.c
index 83f553d..f9b3c18 100644
--- a/src/3rdparty/libwebp/src/dsp/lossless.c
+++ b/src/3rdparty/libwebp/src/dsp/lossless.c
@@ -577,9 +577,6 @@ extern void VP8LDspInitNEON(void);
 extern void VP8LDspInitMIPSdspR2(void);
 extern void VP8LDspInitMSA(void);
 
-static volatile VP8CPUInfo lossless_last_cpuinfo_used =
-    (VP8CPUInfo)&lossless_last_cpuinfo_used;
-
 #define COPY_PREDICTOR_ARRAY(IN, OUT) do {                \
   (OUT)[0] = IN##0_C;                                     \
   (OUT)[1] = IN##1_C;                                     \
@@ -599,9 +596,7 @@ static volatile VP8CPUInfo lossless_last_cpuinfo_used =
   (OUT)[15] = IN##0_C;                                    \
 } while (0);
 
-WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInit(void) {
-  if (lossless_last_cpuinfo_used == VP8GetCPUInfo) return;
-
+WEBP_DSP_INIT_FUNC(VP8LDspInit) {
   COPY_PREDICTOR_ARRAY(Predictor, VP8LPredictors)
   COPY_PREDICTOR_ARRAY(Predictor, VP8LPredictors_C)
   COPY_PREDICTOR_ARRAY(PredictorAdd, VP8LPredictorsAdd)
@@ -658,8 +653,6 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInit(void) {
   assert(VP8LConvertBGRAToRGB565 != NULL);
   assert(VP8LMapColor32b != NULL);
   assert(VP8LMapColor8b != NULL);
-
-  lossless_last_cpuinfo_used = VP8GetCPUInfo;
 }
 #undef COPY_PREDICTOR_ARRAY
 
diff --git a/src/3rdparty/libwebp/src/dsp/lossless.h b/src/3rdparty/libwebp/src/dsp/lossless.h
index a99dbda..b2bbdfc 100644
--- a/src/3rdparty/libwebp/src/dsp/lossless.h
+++ b/src/3rdparty/libwebp/src/dsp/lossless.h
@@ -25,10 +25,6 @@
 extern "C" {
 #endif
 
-#ifdef WEBP_EXPERIMENTAL_FEATURES
-#include "src/enc/delta_palettization_enc.h"
-#endif  // WEBP_EXPERIMENTAL_FEATURES
-
 //------------------------------------------------------------------------------
 // Decoding
 
diff --git a/src/3rdparty/libwebp/src/dsp/lossless_enc.c b/src/3rdparty/libwebp/src/dsp/lossless_enc.c
index 92ca3c0..d608326 100644
--- a/src/3rdparty/libwebp/src/dsp/lossless_enc.c
+++ b/src/3rdparty/libwebp/src/dsp/lossless_enc.c
@@ -863,12 +863,7 @@ extern void VP8LEncDspInitMIPS32(void);
 extern void VP8LEncDspInitMIPSdspR2(void);
 extern void VP8LEncDspInitMSA(void);
 
-static volatile VP8CPUInfo lossless_enc_last_cpuinfo_used =
-    (VP8CPUInfo)&lossless_enc_last_cpuinfo_used;
-
-WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInit(void) {
-  if (lossless_enc_last_cpuinfo_used == VP8GetCPUInfo) return;
-
+WEBP_DSP_INIT_FUNC(VP8LEncDspInit) {
   VP8LDspInit();
 
 #if !WEBP_NEON_OMIT_C_CODE
@@ -1011,8 +1006,6 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInit(void) {
   assert(VP8LPredictorsSub_C[13] != NULL);
   assert(VP8LPredictorsSub_C[14] != NULL);
   assert(VP8LPredictorsSub_C[15] != NULL);
-
-  lossless_enc_last_cpuinfo_used = VP8GetCPUInfo;
 }
 
 //------------------------------------------------------------------------------
diff --git a/src/3rdparty/libwebp/src/dsp/lossless_enc_sse2.c b/src/3rdparty/libwebp/src/dsp/lossless_enc_sse2.c
index 1eaf35c..f84a990 100644
--- a/src/3rdparty/libwebp/src/dsp/lossless_enc_sse2.c
+++ b/src/3rdparty/libwebp/src/dsp/lossless_enc_sse2.c
@@ -46,16 +46,14 @@ static void SubtractGreenFromBlueAndRed_SSE2(uint32_t* argb_data,
 //------------------------------------------------------------------------------
 // Color Transform
 
+#define MK_CST_16(HI, LO) /* 16b pair (HI, LO) in each 32b lane */ \
+  _mm_set1_epi32((int)(((uint32_t)(HI) << 16) | ((LO) & 0xffff)))
+
 static void TransformColor_SSE2(const VP8LMultipliers* const m,
                                 uint32_t* argb_data, int num_pixels) {
-  const __m128i mults_rb = _mm_set_epi16(
-      CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_),
-      CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_),
-      CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_),
-      CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_));
-  const __m128i mults_b2 = _mm_set_epi16(
-      CST_5b(m->red_to_blue_), 0, CST_5b(m->red_to_blue_), 0,
-      CST_5b(m->red_to_blue_), 0, CST_5b(m->red_to_blue_), 0);
+  const __m128i mults_rb = MK_CST_16(CST_5b(m->green_to_red_),
+                                     CST_5b(m->green_to_blue_));
+  const __m128i mults_b2 = MK_CST_16(CST_5b(m->red_to_blue_), 0);
   const __m128i mask_ag = _mm_set1_epi32(0xff00ff00);  // alpha-green masks
   const __m128i mask_rb = _mm_set1_epi32(0x00ff00ff);  // red-blue masks
   int i;
@@ -85,12 +83,8 @@ static void CollectColorBlueTransforms_SSE2(const uint32_t* argb, int stride,
                                             int tile_width, int tile_height,
                                             int green_to_blue, int red_to_blue,
                                             int histo[]) {
-  const __m128i mults_r = _mm_set_epi16(
-      CST_5b(red_to_blue), 0, CST_5b(red_to_blue), 0,
-      CST_5b(red_to_blue), 0, CST_5b(red_to_blue), 0);
-  const __m128i mults_g = _mm_set_epi16(
-      0, CST_5b(green_to_blue), 0, CST_5b(green_to_blue),
-      0, CST_5b(green_to_blue), 0, CST_5b(green_to_blue));
+  const __m128i mults_r = MK_CST_16(CST_5b(red_to_blue), 0);
+  const __m128i mults_g = MK_CST_16(0, CST_5b(green_to_blue));
   const __m128i mask_g = _mm_set1_epi32(0x00ff00);  // green mask
   const __m128i mask_b = _mm_set1_epi32(0x0000ff);  // blue mask
   int y;
@@ -135,9 +129,7 @@ static void CollectColorBlueTransforms_SSE2(const uint32_t* argb, int stride,
 static void CollectColorRedTransforms_SSE2(const uint32_t* argb, int stride,
                                            int tile_width, int tile_height,
                                            int green_to_red, int histo[]) {
-  const __m128i mults_g = _mm_set_epi16(
-      0, CST_5b(green_to_red), 0, CST_5b(green_to_red),
-      0, CST_5b(green_to_red), 0, CST_5b(green_to_red));
+  const __m128i mults_g = MK_CST_16(0, CST_5b(green_to_red));
   const __m128i mask_g = _mm_set1_epi32(0x00ff00);  // green mask
   const __m128i mask = _mm_set1_epi32(0xff);
 
@@ -174,6 +166,7 @@ static void CollectColorRedTransforms_SSE2(const uint32_t* argb, int stride,
   }
 }
 #undef SPAN
+#undef MK_CST_16
 
 //------------------------------------------------------------------------------
 
diff --git a/src/3rdparty/libwebp/src/dsp/lossless_enc_sse41.c b/src/3rdparty/libwebp/src/dsp/lossless_enc_sse41.c
index 3526a34..2e12a71 100644
--- a/src/3rdparty/libwebp/src/dsp/lossless_enc_sse41.c
+++ b/src/3rdparty/libwebp/src/dsp/lossless_enc_sse41.c
@@ -18,6 +18,9 @@
 #include <smmintrin.h>
 #include "src/dsp/lossless.h"
 
+// For sign-extended multiplying constants, pre-shifted by 5:
+#define CST_5b(X)  (((int16_t)((uint16_t)(X) << 8)) >> 5)
+
 //------------------------------------------------------------------------------
 // Subtract-Green Transform
 
@@ -39,12 +42,103 @@ static void SubtractGreenFromBlueAndRed_SSE41(uint32_t* argb_data,
 }
 
 //------------------------------------------------------------------------------
+// Color Transform
+// Blue: ++histo[(blue - scaled green - scaled red) & 0xff] for each pixel.
+#define SPAN 8  // pixels handled per SIMD iteration
+static void CollectColorBlueTransforms_SSE41(const uint32_t* argb, int stride,
+                                             int tile_width, int tile_height,
+                                             int green_to_blue, int red_to_blue,
+                                             int histo[]) {
+  const __m128i mults_r = _mm_set1_epi16(CST_5b(red_to_blue));
+  const __m128i mults_g = _mm_set1_epi16(CST_5b(green_to_blue));
+  const __m128i mask_g = _mm_set1_epi16((short)0xff00);   // green mask
+  const __m128i mask_gb = _mm_set1_epi32(0xffff);  // green/blue mask
+  const __m128i mask_b = _mm_set1_epi16(0x00ff);   // blue mask
+  const __m128i shuffler_lo = _mm_setr_epi8(-1, 2, -1, 6, -1, 10, -1, 14, -1,
+                                            -1, -1, -1, -1, -1, -1, -1);
+  const __m128i shuffler_hi = _mm_setr_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1,
+                                            2, -1, 6, -1, 10, -1, 14);
+  int y;
+  for (y = 0; y < tile_height; ++y) {
+    const uint32_t* const src = argb + y * stride;
+    int i, x;
+    for (x = 0; x + SPAN <= tile_width; x += SPAN) {  // whole SPAN groups
+      uint16_t values[SPAN];
+      const __m128i in0 = _mm_loadu_si128((__m128i*)&src[x + 0]);
+      const __m128i in1 = _mm_loadu_si128((__m128i*)&src[x + SPAN / 2]);
+      const __m128i r0 = _mm_shuffle_epi8(in0, shuffler_lo);  // r: lanes 0-3
+      const __m128i r1 = _mm_shuffle_epi8(in1, shuffler_hi);  // r: lanes 4-7
+      const __m128i r = _mm_or_si128(r0, r1);         // r 0
+      const __m128i gb0 = _mm_and_si128(in0, mask_gb);
+      const __m128i gb1 = _mm_and_si128(in1, mask_gb);
+      const __m128i gb = _mm_packus_epi32(gb0, gb1);  // g b
+      const __m128i g = _mm_and_si128(gb, mask_g);    // g 0
+      const __m128i A = _mm_mulhi_epi16(r, mults_r);  // x dbr
+      const __m128i B = _mm_mulhi_epi16(g, mults_g);  // x dbg
+      const __m128i C = _mm_sub_epi8(gb, B);          // x b'
+      const __m128i D = _mm_sub_epi8(C, A);           // x b''
+      const __m128i E = _mm_and_si128(D, mask_b);     // 0 b''
+      _mm_storeu_si128((__m128i*)values, E);
+      for (i = 0; i < SPAN; ++i) ++histo[values[i]];
+    }
+  }
+  {  // tail columns (tile_width % SPAN) go through the C implementation
+    const int left_over = tile_width & (SPAN - 1);
+    if (left_over > 0) {
+      VP8LCollectColorBlueTransforms_C(argb + tile_width - left_over, stride,
+                                       left_over, tile_height,
+                                       green_to_blue, red_to_blue, histo);
+    }
+  }
+}
+// For each tile pixel: ++histo[(red - green scaled by green_to_red) & 0xff].
+static void CollectColorRedTransforms_SSE41(const uint32_t* argb, int stride,
+                                            int tile_width, int tile_height,
+                                            int green_to_red, int histo[]) {
+  const __m128i mults_g = _mm_set1_epi16(CST_5b(green_to_red));
+  const __m128i mask_g = _mm_set1_epi32(0x00ff00);  // green mask
+  const __m128i mask = _mm_set1_epi16(0xff);
+
+  int y;
+  for (y = 0; y < tile_height; ++y) {
+    const uint32_t* const src = argb + y * stride;
+    int i, x;
+    for (x = 0; x + SPAN <= tile_width; x += SPAN) {  // whole SPAN groups
+      uint16_t values[SPAN];
+      const __m128i in0 = _mm_loadu_si128((__m128i*)&src[x + 0]);
+      const __m128i in1 = _mm_loadu_si128((__m128i*)&src[x + SPAN / 2]);
+      const __m128i g0 = _mm_and_si128(in0, mask_g);  // 0 0  | g 0
+      const __m128i g1 = _mm_and_si128(in1, mask_g);
+      const __m128i g = _mm_packus_epi32(g0, g1);     // g 0
+      const __m128i A0 = _mm_srli_epi32(in0, 16);     // 0 0  | x r
+      const __m128i A1 = _mm_srli_epi32(in1, 16);
+      const __m128i A = _mm_packus_epi32(A0, A1);     // x r
+      const __m128i B = _mm_mulhi_epi16(g, mults_g);  // x dr
+      const __m128i C = _mm_sub_epi8(A, B);           // x r'
+      const __m128i D = _mm_and_si128(C, mask);       // 0 r'
+      _mm_storeu_si128((__m128i*)values, D);
+      for (i = 0; i < SPAN; ++i) ++histo[values[i]];
+    }
+  }
+  {  // tail columns (tile_width % SPAN) go through the C implementation
+    const int left_over = tile_width & (SPAN - 1);
+    if (left_over > 0) {
+      VP8LCollectColorRedTransforms_C(argb + tile_width - left_over, stride,
+                                      left_over, tile_height, green_to_red,
+                                      histo);
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
 // Entry point
 
 extern void VP8LEncDspInitSSE41(void);
 
 WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitSSE41(void) {
   VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed_SSE41;
+  VP8LCollectColorBlueTransforms = CollectColorBlueTransforms_SSE41;
+  VP8LCollectColorRedTransforms = CollectColorRedTransforms_SSE41;
 }
 
 #else  // !WEBP_USE_SSE41
diff --git a/src/3rdparty/libwebp/src/dsp/lossless_sse2.c b/src/3rdparty/libwebp/src/dsp/lossless_sse2.c
index 653b466..17d7576 100644
--- a/src/3rdparty/libwebp/src/dsp/lossless_sse2.c
+++ b/src/3rdparty/libwebp/src/dsp/lossless_sse2.c
@@ -453,14 +453,11 @@ static void TransformColorInverse_SSE2(const VP8LMultipliers* const m,
                                        int num_pixels, uint32_t* dst) {
 // sign-extended multiplying constants, pre-shifted by 5.
 #define CST(X)  (((int16_t)(m->X << 8)) >> 5)   // sign-extend
-  const __m128i mults_rb = _mm_set_epi16(
-      CST(green_to_red_), CST(green_to_blue_),
-      CST(green_to_red_), CST(green_to_blue_),
-      CST(green_to_red_), CST(green_to_blue_),
-      CST(green_to_red_), CST(green_to_blue_));
-  const __m128i mults_b2 = _mm_set_epi16(
-      CST(red_to_blue_), 0, CST(red_to_blue_), 0,
-      CST(red_to_blue_), 0, CST(red_to_blue_), 0);
+#define MK_CST_16(HI, LO) \
+  _mm_set1_epi32((int)(((uint32_t)(HI) << 16) | ((LO) & 0xffff)))
+  const __m128i mults_rb = MK_CST_16(CST(green_to_red_), CST(green_to_blue_));
+  const __m128i mults_b2 = MK_CST_16(CST(red_to_blue_), 0);
+#undef MK_CST_16
 #undef CST
   const __m128i mask_ag = _mm_set1_epi32(0xff00ff00);  // alpha-green masks
   int i;
@@ -503,11 +500,11 @@ static void ConvertBGRAToRGB_SSE2(const uint32_t* src, int num_pixels,
     __m128i in5 = _mm_loadu_si128(in + 5);
     __m128i in6 = _mm_loadu_si128(in + 6);
     __m128i in7 = _mm_loadu_si128(in + 7);
-    VP8L32bToPlanar(&in0, &in1, &in2, &in3);
-    VP8L32bToPlanar(&in4, &in5, &in6, &in7);
+    VP8L32bToPlanar_SSE2(&in0, &in1, &in2, &in3);
+    VP8L32bToPlanar_SSE2(&in4, &in5, &in6, &in7);
     // At this points, in1/in5 contains red only, in2/in6 green only ...
     // Pack the colors in 24b RGB.
-    VP8PlanarTo24b(&in1, &in5, &in2, &in6, &in3, &in7);
+    VP8PlanarTo24b_SSE2(&in1, &in5, &in2, &in6, &in3, &in7);
     _mm_storeu_si128(out + 0, in1);
     _mm_storeu_si128(out + 1, in5);
     _mm_storeu_si128(out + 2, in2);
diff --git a/src/3rdparty/libwebp/src/dsp/rescaler.c b/src/3rdparty/libwebp/src/dsp/rescaler.c
index 4b6b783..f307d35 100644
--- a/src/3rdparty/libwebp/src/dsp/rescaler.c
+++ b/src/3rdparty/libwebp/src/dsp/rescaler.c
@@ -204,11 +204,7 @@ extern void WebPRescalerDspInitMIPSdspR2(void);
 extern void WebPRescalerDspInitMSA(void);
 extern void WebPRescalerDspInitNEON(void);
 
-static volatile VP8CPUInfo rescaler_last_cpuinfo_used =
-    (VP8CPUInfo)&rescaler_last_cpuinfo_used;
-
-WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInit(void) {
-  if (rescaler_last_cpuinfo_used == VP8GetCPUInfo) return;
+WEBP_DSP_INIT_FUNC(WebPRescalerDspInit) {
 #if !defined(WEBP_REDUCE_SIZE)
 #if !WEBP_NEON_OMIT_C_CODE
   WebPRescalerExportRowExpand = WebPRescalerExportRowExpand_C;
@@ -253,5 +249,4 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPRescalerDspInit(void) {
   assert(WebPRescalerImportRowExpand != NULL);
   assert(WebPRescalerImportRowShrink != NULL);
 #endif   // WEBP_REDUCE_SIZE
-  rescaler_last_cpuinfo_used = VP8GetCPUInfo;
 }
diff --git a/src/3rdparty/libwebp/src/dsp/rescaler_sse2.c b/src/3rdparty/libwebp/src/dsp/rescaler_sse2.c
index f93b204..64c50de 100644
--- a/src/3rdparty/libwebp/src/dsp/rescaler_sse2.c
+++ b/src/3rdparty/libwebp/src/dsp/rescaler_sse2.c
@@ -36,7 +36,7 @@ static void LoadTwoPixels_SSE2(const uint8_t* const src, __m128i* out) {
 }
 
 // input: 8 bytes ABCDEFGH -> output: A0B0C0D0E0F0G0H0
-static void LoadHeightPixels_SSE2(const uint8_t* const src, __m128i* out) {
+static void LoadEightPixels_SSE2(const uint8_t* const src, __m128i* out) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i A = _mm_loadl_epi64((const __m128i*)(src));  // ABCDEFGH
   *out = _mm_unpacklo_epi8(A, zero);
@@ -50,13 +50,15 @@ static void RescalerImportRowExpand_SSE2(WebPRescaler* const wrk,
   int accum = x_add;
   __m128i cur_pixels;
 
+  // SSE2 implementation only works with 16b signed arithmetic at max.
+  if (wrk->src_width < 8 || accum >= (1 << 15)) {
+    WebPRescalerImportRowExpand_C(wrk, src);
+    return;
+  }
+
   assert(!WebPRescalerInputDone(wrk));
   assert(wrk->x_expand);
   if (wrk->num_channels == 4) {
-    if (wrk->src_width < 2) {
-      WebPRescalerImportRowExpand_C(wrk, src);
-      return;
-    }
     LoadTwoPixels_SSE2(src, &cur_pixels);
     src += 4;
     while (1) {
@@ -75,11 +77,7 @@ static void RescalerImportRowExpand_SSE2(WebPRescaler* const wrk,
   } else {
     int left;
     const uint8_t* const src_limit = src + wrk->src_width - 8;
-    if (wrk->src_width < 8) {
-      WebPRescalerImportRowExpand_C(wrk, src);
-      return;
-    }
-    LoadHeightPixels_SSE2(src, &cur_pixels);
+    LoadEightPixels_SSE2(src, &cur_pixels);
     src += 7;
     left = 7;
     while (1) {
@@ -94,7 +92,7 @@ static void RescalerImportRowExpand_SSE2(WebPRescaler* const wrk,
         if (--left) {
           cur_pixels = _mm_srli_si128(cur_pixels, 2);
         } else if (src <= src_limit) {
-          LoadHeightPixels_SSE2(src, &cur_pixels);
+          LoadEightPixels_SSE2(src, &cur_pixels);
           src += 7;
           left = 7;
         } else {   // tail
diff --git a/src/3rdparty/libwebp/src/dsp/ssim.c b/src/3rdparty/libwebp/src/dsp/ssim.c
index dc1b518..989ce82 100644
--- a/src/3rdparty/libwebp/src/dsp/ssim.c
+++ b/src/3rdparty/libwebp/src/dsp/ssim.c
@@ -139,12 +139,7 @@ VP8AccumulateSSEFunc VP8AccumulateSSE;
 
 extern void VP8SSIMDspInitSSE2(void);
 
-static volatile VP8CPUInfo ssim_last_cpuinfo_used =
-    (VP8CPUInfo)&ssim_last_cpuinfo_used;
-
-WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInit(void) {
-  if (ssim_last_cpuinfo_used == VP8GetCPUInfo) return;
-
+WEBP_DSP_INIT_FUNC(VP8SSIMDspInit) {
 #if !defined(WEBP_REDUCE_SIZE)
   VP8SSIMGetClipped = SSIMGetClipped_C;
   VP8SSIMGet = SSIMGet_C;
@@ -161,6 +156,4 @@ WEBP_TSAN_IGNORE_FUNCTION void VP8SSIMDspInit(void) {
     }
 #endif
   }
-
-  ssim_last_cpuinfo_used = VP8GetCPUInfo;
 }
diff --git a/src/3rdparty/libwebp/src/dsp/upsampling.c b/src/3rdparty/libwebp/src/dsp/upsampling.c
index e72626a..9b60da5 100644
--- a/src/3rdparty/libwebp/src/dsp/upsampling.c
+++ b/src/3rdparty/libwebp/src/dsp/upsampling.c
@@ -217,13 +217,9 @@ WebPYUV444Converter WebPYUV444Converters[MODE_LAST];
 
 extern void WebPInitYUV444ConvertersMIPSdspR2(void);
 extern void WebPInitYUV444ConvertersSSE2(void);
+extern void WebPInitYUV444ConvertersSSE41(void);
 
-static volatile VP8CPUInfo upsampling_last_cpuinfo_used1 =
-    (VP8CPUInfo)&upsampling_last_cpuinfo_used1;
-
-WEBP_TSAN_IGNORE_FUNCTION void WebPInitYUV444Converters(void) {
-  if (upsampling_last_cpuinfo_used1 == VP8GetCPUInfo) return;
-
+WEBP_DSP_INIT_FUNC(WebPInitYUV444Converters) {
   WebPYUV444Converters[MODE_RGBA]      = WebPYuv444ToRgba_C;
   WebPYUV444Converters[MODE_BGRA]      = WebPYuv444ToBgra_C;
   WebPYUV444Converters[MODE_RGB]       = WebPYuv444ToRgb_C;
@@ -242,29 +238,29 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitYUV444Converters(void) {
       WebPInitYUV444ConvertersSSE2();
     }
 #endif
+#if defined(WEBP_USE_SSE41)
+    if (VP8GetCPUInfo(kSSE4_1)) {
+      WebPInitYUV444ConvertersSSE41();
+    }
+#endif
 #if defined(WEBP_USE_MIPS_DSP_R2)
     if (VP8GetCPUInfo(kMIPSdspR2)) {
       WebPInitYUV444ConvertersMIPSdspR2();
     }
 #endif
   }
-  upsampling_last_cpuinfo_used1 = VP8GetCPUInfo;
 }
 
 //------------------------------------------------------------------------------
 // Main calls
 
 extern void WebPInitUpsamplersSSE2(void);
+extern void WebPInitUpsamplersSSE41(void);
 extern void WebPInitUpsamplersNEON(void);
 extern void WebPInitUpsamplersMIPSdspR2(void);
 extern void WebPInitUpsamplersMSA(void);
 
-static volatile VP8CPUInfo upsampling_last_cpuinfo_used2 =
-    (VP8CPUInfo)&upsampling_last_cpuinfo_used2;
-
-WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplers(void) {
-  if (upsampling_last_cpuinfo_used2 == VP8GetCPUInfo) return;
-
+WEBP_DSP_INIT_FUNC(WebPInitUpsamplers) {
 #ifdef FANCY_UPSAMPLING
 #if !WEBP_NEON_OMIT_C_CODE
   WebPUpsamplers[MODE_RGBA]      = UpsampleRgbaLinePair_C;
@@ -287,6 +283,11 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplers(void) {
       WebPInitUpsamplersSSE2();
     }
 #endif
+#if defined(WEBP_USE_SSE41)
+    if (VP8GetCPUInfo(kSSE4_1)) {
+      WebPInitUpsamplersSSE41();
+    }
+#endif
 #if defined(WEBP_USE_MIPS_DSP_R2)
     if (VP8GetCPUInfo(kMIPSdspR2)) {
       WebPInitUpsamplersMIPSdspR2();
@@ -310,6 +311,7 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplers(void) {
   assert(WebPUpsamplers[MODE_BGRA] != NULL);
   assert(WebPUpsamplers[MODE_rgbA] != NULL);
   assert(WebPUpsamplers[MODE_bgrA] != NULL);
+#if !defined(WEBP_REDUCE_CSP) || !WEBP_NEON_OMIT_C_CODE
   assert(WebPUpsamplers[MODE_RGB] != NULL);
   assert(WebPUpsamplers[MODE_BGR] != NULL);
   assert(WebPUpsamplers[MODE_ARGB] != NULL);
@@ -317,9 +319,9 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplers(void) {
   assert(WebPUpsamplers[MODE_RGB_565] != NULL);
   assert(WebPUpsamplers[MODE_Argb] != NULL);
   assert(WebPUpsamplers[MODE_rgbA_4444] != NULL);
+#endif
 
 #endif  // FANCY_UPSAMPLING
-  upsampling_last_cpuinfo_used2 = VP8GetCPUInfo;
 }
 
 //------------------------------------------------------------------------------
diff --git a/src/3rdparty/libwebp/src/dsp/upsampling_msa.c b/src/3rdparty/libwebp/src/dsp/upsampling_msa.c
index 535ffb7..99eea70 100644
--- a/src/3rdparty/libwebp/src/dsp/upsampling_msa.c
+++ b/src/3rdparty/libwebp/src/dsp/upsampling_msa.c
@@ -264,6 +264,7 @@ static void YuvToBgr(int y, int u, int v, uint8_t* const bgr) {
   bgr[2] = Clip8(r1 >> 6);
 }
 
+#if !defined(WEBP_REDUCE_CSP)
 static void YuvToRgb565(int y, int u, int v, uint8_t* const rgb) {
   const int y1 = MultHi(y, 19077);
   const int r1 = y1 + MultHi(v, 26149) - 14234;
@@ -306,6 +307,7 @@ static void YuvToArgb(uint8_t y, uint8_t u, uint8_t v, uint8_t* const argb) {
   argb[0] = 0xff;
   YuvToRgb(y, u, v, argb + 1);
 }
+#endif  // WEBP_REDUCE_CSP
 
 static void YuvToBgra(uint8_t y, uint8_t u, uint8_t v, uint8_t* const bgra) {
   YuvToBgr(y, u, v, bgra);
@@ -317,6 +319,7 @@ static void YuvToRgba(uint8_t y, uint8_t u, uint8_t v, uint8_t* const rgba) {
   rgba[3] = 0xff;
 }
 
+#if !defined(WEBP_REDUCE_CSP)
 static void YuvToRgbLine(const uint8_t* y, const uint8_t* u,
                          const uint8_t* v, uint8_t* dst, int length) {
   v16u8 R, G, B;
@@ -370,6 +373,7 @@ static void YuvToBgrLine(const uint8_t* y, const uint8_t* u,
     memcpy(dst, temp, length * 3 * sizeof(*dst));
   }
 }
+#endif  // WEBP_REDUCE_CSP
 
 static void YuvToRgbaLine(const uint8_t* y, const uint8_t* u,
                           const uint8_t* v, uint8_t* dst, int length) {
@@ -427,6 +431,7 @@ static void YuvToBgraLine(const uint8_t* y, const uint8_t* u,
   }
 }
 
+#if !defined(WEBP_REDUCE_CSP)
 static void YuvToArgbLine(const uint8_t* y, const uint8_t* u,
                           const uint8_t* v, uint8_t* dst, int length) {
   v16u8 R, G, B;
@@ -526,6 +531,7 @@ static void YuvToRgb565Line(const uint8_t* y, const uint8_t* u,
     memcpy(dst, temp, length * 2 * sizeof(*dst));
   }
 }
+#endif  // WEBP_REDUCE_CSP
 
 #define UPSAMPLE_32PIXELS(a, b, c, d) do {    \
   v16u8 s = __msa_aver_u_b(a, d);             \
diff --git a/src/3rdparty/libwebp/src/dsp/upsampling_sse2.c b/src/3rdparty/libwebp/src/dsp/upsampling_sse2.c
index fd5d303..340f1e2 100644
--- a/src/3rdparty/libwebp/src/dsp/upsampling_sse2.c
+++ b/src/3rdparty/libwebp/src/dsp/upsampling_sse2.c
@@ -104,21 +104,6 @@ static void Upsample32Pixels_SSE2(const uint8_t r1[], const uint8_t r2[],
   Upsample32Pixels_SSE2(r1, r2, out);                                          \
 }
 
-#define CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y,                              \
-                    top_dst, bottom_dst, cur_x, num_pixels) {                  \
-  int n;                                                                       \
-  for (n = 0; n < (num_pixels); ++n) {                                         \
-    FUNC((top_y)[(cur_x) + n], r_u[n], r_v[n],                                 \
-         (top_dst) + ((cur_x) + n) * (XSTEP));                                 \
-  }                                                                            \
-  if ((bottom_y) != NULL) {                                                    \
-    for (n = 0; n < (num_pixels); ++n) {                                       \
-      FUNC((bottom_y)[(cur_x) + n], r_u[64 + n], r_v[64 + n],                  \
-           (bottom_dst) + ((cur_x) + n) * (XSTEP));                            \
-    }                                                                          \
-  }                                                                            \
-}
-
 #define CONVERT2RGB_32(FUNC, XSTEP, top_y, bottom_y,                           \
                        top_dst, bottom_dst, cur_x) do {                        \
   FUNC##32_SSE2((top_y) + (cur_x), r_u, r_v, (top_dst) + (cur_x) * (XSTEP));   \
@@ -135,7 +120,7 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
                       uint8_t* top_dst, uint8_t* bottom_dst, int len) {        \
   int uv_pos, pos;                                                             \
   /* 16byte-aligned array to cache reconstructed u and v */                    \
-  uint8_t uv_buf[4 * 32 + 15];                                                 \
+  uint8_t uv_buf[14 * 32 + 15] = { 0 };                                        \
   uint8_t* const r_u = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15);             \
   uint8_t* const r_v = r_u + 32;                                               \
                                                                                \
@@ -160,11 +145,22 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
   }                                                                            \
   if (len > 1) {                                                               \
     const int left_over = ((len + 1) >> 1) - (pos >> 1);                       \
+    uint8_t* const tmp_top_dst = r_u + 4 * 32;                                 \
+    uint8_t* const tmp_bottom_dst = tmp_top_dst + 4 * 32;                      \
+    uint8_t* const tmp_top = tmp_bottom_dst + 4 * 32;                          \
+    uint8_t* const tmp_bottom = (bottom_y == NULL) ? NULL : tmp_top + 32;      \
     assert(left_over > 0);                                                     \
     UPSAMPLE_LAST_BLOCK(top_u + uv_pos, cur_u + uv_pos, left_over, r_u);       \
     UPSAMPLE_LAST_BLOCK(top_v + uv_pos, cur_v + uv_pos, left_over, r_v);       \
-    CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, top_dst, bottom_dst,             \
-                pos, len - pos);                                               \
+    memcpy(tmp_top, top_y + pos, len - pos);                                   \
+    if (bottom_y != NULL) memcpy(tmp_bottom, bottom_y + pos, len - pos);       \
+    CONVERT2RGB_32(FUNC, XSTEP, tmp_top, tmp_bottom, tmp_top_dst,              \
+         tmp_bottom_dst, 0);                                                   \
+    memcpy(top_dst + pos * (XSTEP), tmp_top_dst, (len - pos) * (XSTEP));       \
+    if (bottom_y != NULL) {                                                    \
+      memcpy(bottom_dst + pos * (XSTEP), tmp_bottom_dst,                       \
+             (len - pos) * (XSTEP));                                           \
+    }                                                                          \
   }                                                                            \
 }
 
diff --git a/src/3rdparty/libwebp/src/dsp/upsampling_sse41.c b/src/3rdparty/libwebp/src/dsp/upsampling_sse41.c
new file mode 100644
index 0000000..648d456
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/upsampling_sse41.c
@@ -0,0 +1,239 @@
+// Copyright 2011 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// SSE41 version of YUV to RGB upsampling functions.
+//
+// Author: somnath@google.com (Somnath Banerjee)
+
+#include "src/dsp/dsp.h"
+
+#if defined(WEBP_USE_SSE41)
+
+#include <assert.h>
+#include <smmintrin.h>
+#include <string.h>
+#include "src/dsp/yuv.h"
+
+#ifdef FANCY_UPSAMPLING
+
+#if !defined(WEBP_REDUCE_CSP)
+
+// We compute (9*a + 3*b + 3*c + d + 8) / 16 as follows
+// u = (9*a + 3*b + 3*c + d + 8) / 16
+//   = (a + (a + 3*b + 3*c + d) / 8 + 1) / 2
+//   = (a + m + 1) / 2
+// where m = (a + 3*b + 3*c + d) / 8
+//         = ((a + b + c + d) / 2 + b + c) / 4
+//
+// Let's say  k = (a + b + c + d) / 4.
+// We can compute k as
+// k = (s + t + 1) / 2 - ((a^d) | (b^c) | (s^t)) & 1
+// where s = (a + d + 1) / 2 and t = (b + c + 1) / 2
+//
+// Then m can be written as
+// m = (k + t + 1) / 2 - (((b^c) & (s^t)) | (k^t)) & 1
+
+// out = (k + in + 1) / 2 - ((ij & (s^t)) | (k^in)) & 1; reads k, st, one
+#define GET_M(ij, in, out) do {                                                \
+  const __m128i tmp0 = _mm_avg_epu8(k, (in));     /* (k + in + 1) / 2 */       \
+  const __m128i tmp1 = _mm_and_si128((ij), st);   /* (ij) & (s^t) */           \
+  const __m128i tmp2 = _mm_xor_si128(k, (in));    /* (k^in) */                 \
+  const __m128i tmp3 = _mm_or_si128(tmp1, tmp2);  /* ((ij) & (s^t)) | (k^in) */\
+  const __m128i tmp4 = _mm_and_si128(tmp3, one);  /* & 1 -> lsb_correction */  \
+  (out) = _mm_sub_epi8(tmp0, tmp4);    /* (k + in + 1) / 2 - lsb_correction */ \
+} while (0)
+
+// Interleave two filtered rows bytewise; store 32 bytes at 16B-aligned 'out'.
+#define PACK_AND_STORE(a, b, da, db, out) do {                                 \
+  const __m128i t_a = _mm_avg_epu8(a, da);  /* (9a + 3b + 3c +  d + 8) / 16 */ \
+  const __m128i t_b = _mm_avg_epu8(b, db);  /* (3a + 9b +  c + 3d + 8) / 16 */ \
+  const __m128i t_1 = _mm_unpacklo_epi8(t_a, t_b);  /* low 8 byte pairs */     \
+  const __m128i t_2 = _mm_unpackhi_epi8(t_a, t_b);  /* high 8 byte pairs */    \
+  _mm_store_si128(((__m128i*)(out)) + 0, t_1);                                 \
+  _mm_store_si128(((__m128i*)(out)) + 1, t_2);                                 \
+} while (0)
+
+// Loads 17 pixels from each of r1/r2; writes 32 to out and 32 to out + 64.
+#define UPSAMPLE_32PIXELS(r1, r2, out) {                                       \
+  const __m128i one = _mm_set1_epi8(1);                                        \
+  const __m128i a = _mm_loadu_si128((const __m128i*)&(r1)[0]);                 \
+  const __m128i b = _mm_loadu_si128((const __m128i*)&(r1)[1]);                 \
+  const __m128i c = _mm_loadu_si128((const __m128i*)&(r2)[0]);                 \
+  const __m128i d = _mm_loadu_si128((const __m128i*)&(r2)[1]);                 \
+                                                                               \
+  const __m128i s = _mm_avg_epu8(a, d);        /* s = (a + d + 1) / 2 */       \
+  const __m128i t = _mm_avg_epu8(b, c);        /* t = (b + c + 1) / 2 */       \
+  const __m128i st = _mm_xor_si128(s, t);      /* st = s^t */                  \
+                                                                               \
+  const __m128i ad = _mm_xor_si128(a, d);      /* ad = a^d */                  \
+  const __m128i bc = _mm_xor_si128(b, c);      /* bc = b^c */                  \
+                                                                               \
+  const __m128i t1 = _mm_or_si128(ad, bc);     /* (a^d) | (b^c) */             \
+  const __m128i t2 = _mm_or_si128(t1, st);     /* (a^d) | (b^c) | (s^t) */     \
+  const __m128i t3 = _mm_and_si128(t2, one);   /* (a^d) | (b^c) | (s^t) & 1 */ \
+  const __m128i t4 = _mm_avg_epu8(s, t);       /* (s + t + 1) / 2 */           \
+  const __m128i k = _mm_sub_epi8(t4, t3);      /* k = (a + b + c + d) / 4 */   \
+  __m128i diag1, diag2;                                                        \
+                                                                               \
+  GET_M(bc, t, diag1);                  /* diag1 = (a + 3b + 3c + d) / 8 */    \
+  GET_M(ad, s, diag2);                  /* diag2 = (3a + b + c + 3d) / 8 */    \
+                                                                               \
+  /* pack the alternate pixels */                                              \
+  PACK_AND_STORE(a, b, diag1, diag2, (out) +      0);  /* store top */         \
+  PACK_AND_STORE(c, d, diag2, diag1, (out) + 2 * 32);  /* store bottom */      \
+}
+
+// Function form of UPSAMPLE_32PIXELS: smaller code for non-critical paths.
+static void Upsample32Pixels_SSE41(const uint8_t r1[], const uint8_t r2[],
+                                  uint8_t* const out) {
+  UPSAMPLE_32PIXELS(r1, r2, out);
+}
+
+#define UPSAMPLE_LAST_BLOCK(tb, bb, num_pixels, out) {                         \
+  uint8_t r1[17], r2[17];  /* requires 1 <= num_pixels <= 17 */                \
+  memcpy(r1, (tb), (num_pixels));                                              \
+  memcpy(r2, (bb), (num_pixels));                                              \
+  /* pad up to 17 entries by replicating the last byte */                      \
+  memset(r1 + (num_pixels), r1[(num_pixels) - 1], 17 - (num_pixels));          \
+  memset(r2 + (num_pixels), r2[(num_pixels) - 1], 17 - (num_pixels));          \
+  /* using the shared function instead of the macro saves ~3k code size */     \
+  Upsample32Pixels_SSE41(r1, r2, out);                                         \
+}
+
+#define CONVERT2RGB_32(FUNC, XSTEP, top_y, bottom_y,                           \
+                       top_dst, bottom_dst, cur_x) do {                        \
+  FUNC##32_SSE41((top_y) + (cur_x), r_u, r_v, (top_dst) + (cur_x) * (XSTEP));  \
+  if ((bottom_y) != NULL) {  /* bottom row is optional */                      \
+    FUNC##32_SSE41((bottom_y) + (cur_x), r_u + 64, r_v + 64,                   \
+                  (bottom_dst) + (cur_x) * (XSTEP));                           \
+  }  /* r_u and r_v come from the expanding scope */                           \
+} while (0)
+
+#define SSE4_UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP)                             \
+static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
+                      const uint8_t* top_u, const uint8_t* top_v,              \
+                      const uint8_t* cur_u, const uint8_t* cur_v,              \
+                      uint8_t* top_dst, uint8_t* bottom_dst, int len) {        \
+  int uv_pos, pos;                                                             \
+  /* 16B-aligned scratch: upsampled u/v + tail copies of y and rgb rows */     \
+  uint8_t uv_buf[14 * 32 + 15] = { 0 };                                        \
+  uint8_t* const r_u = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15);             \
+  uint8_t* const r_v = r_u + 32;                                               \
+                                                                               \
+  assert(top_y != NULL);                                                       \
+  {   /* Treat the first pixel in regular way */                               \
+    const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1;                       \
+    const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1;                       \
+    const int u0_t = (top_u[0] + u_diag) >> 1;                                 \
+    const int v0_t = (top_v[0] + v_diag) >> 1;                                 \
+    FUNC(top_y[0], u0_t, v0_t, top_dst);                                       \
+    if (bottom_y != NULL) {                                                    \
+      const int u0_b = (cur_u[0] + u_diag) >> 1;                               \
+      const int v0_b = (cur_v[0] + v_diag) >> 1;                               \
+      FUNC(bottom_y[0], u0_b, v0_b, bottom_dst);                               \
+    }                                                                          \
+  }                                                                            \
+  /* For UPSAMPLE_32PIXELS, 17 u/v values must be read-able for each block */  \
+  for (pos = 1, uv_pos = 0; pos + 32 + 1 <= len; pos += 32, uv_pos += 16) {    \
+    UPSAMPLE_32PIXELS(top_u + uv_pos, cur_u + uv_pos, r_u);                    \
+    UPSAMPLE_32PIXELS(top_v + uv_pos, cur_v + uv_pos, r_v);                    \
+    CONVERT2RGB_32(FUNC, XSTEP, top_y, bottom_y, top_dst, bottom_dst, pos);    \
+  }                                                                            \
+  if (len > 1) {  /* tail: the last 1..32 pixels */                            \
+    const int left_over = ((len + 1) >> 1) - (pos >> 1);                       \
+    uint8_t* const tmp_top_dst = r_u + 4 * 32;                                 \
+    uint8_t* const tmp_bottom_dst = tmp_top_dst + 4 * 32;                      \
+    uint8_t* const tmp_top = tmp_bottom_dst + 4 * 32;                          \
+    uint8_t* const tmp_bottom = (bottom_y == NULL) ? NULL : tmp_top + 32;      \
+    assert(left_over > 0);                                                     \
+    UPSAMPLE_LAST_BLOCK(top_u + uv_pos, cur_u + uv_pos, left_over, r_u);       \
+    UPSAMPLE_LAST_BLOCK(top_v + uv_pos, cur_v + uv_pos, left_over, r_v);       \
+    memcpy(tmp_top, top_y + pos, len - pos);                                   \
+    if (bottom_y != NULL) memcpy(tmp_bottom, bottom_y + pos, len - pos);       \
+    CONVERT2RGB_32(FUNC, XSTEP, tmp_top, tmp_bottom, tmp_top_dst,              \
+         tmp_bottom_dst, 0);                                                   \
+    memcpy(top_dst + pos * (XSTEP), tmp_top_dst, (len - pos) * (XSTEP));       \
+    if (bottom_y != NULL) {                                                    \
+      memcpy(bottom_dst + pos * (XSTEP), tmp_bottom_dst,                       \
+             (len - pos) * (XSTEP));                                           \
+    }                                                                          \
+  }                                                                            \
+}
+
+// SSE4 variants of the fancy upsampler (24b RGB and BGR output, XSTEP = 3).
+SSE4_UPSAMPLE_FUNC(UpsampleRgbLinePair_SSE41,  VP8YuvToRgb,  3)
+SSE4_UPSAMPLE_FUNC(UpsampleBgrLinePair_SSE41,  VP8YuvToBgr,  3)
+
+#undef GET_M
+#undef PACK_AND_STORE
+#undef UPSAMPLE_32PIXELS
+#undef UPSAMPLE_LAST_BLOCK
+#undef CONVERT2RGB  // NOTE(review): no CONVERT2RGB is #defined in this file
+#undef CONVERT2RGB_32
+#undef SSE4_UPSAMPLE_FUNC
+
+#endif   // WEBP_REDUCE_CSP
+
+//------------------------------------------------------------------------------
+// Entry point: installs the RGB/BGR upsamplers (nothing if WEBP_REDUCE_CSP).
+
+extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
+
+extern void WebPInitUpsamplersSSE41(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitUpsamplersSSE41(void) {
+#if !defined(WEBP_REDUCE_CSP)
+  WebPUpsamplers[MODE_RGB]  = UpsampleRgbLinePair_SSE41;
+  WebPUpsamplers[MODE_BGR]  = UpsampleBgrLinePair_SSE41;
+#endif   // WEBP_REDUCE_CSP
+}
+
+#endif  // FANCY_UPSAMPLING
+
+//------------------------------------------------------------------------------
+
+extern WebPYUV444Converter WebPYUV444Converters[/* MODE_LAST */];
+extern void WebPInitYUV444ConvertersSSE41(void);
+
+#define YUV444_FUNC(FUNC_NAME, CALL, CALL_C, XSTEP)                            \
+extern void CALL_C(const uint8_t* y, const uint8_t* u, const uint8_t* v,       \
+                   uint8_t* dst, int len);                                     \
+static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v,    \
+                      uint8_t* dst, int len) {                                 \
+  int i;                                                                       \
+  const int max_len = len & ~31;  /* len rounded down to a multiple of 32 */   \
+  for (i = 0; i < max_len; i += 32) {                                          \
+    CALL(y + i, u + i, v + i, dst + i * (XSTEP));                              \
+  }                                                                            \
+  if (i < len) {  /* C-fallback for the last len % 32 pixels */                \
+    CALL_C(y + i, u + i, v + i, dst + i * (XSTEP), len - i);                   \
+  }                                                                            \
+}
+
+#if !defined(WEBP_REDUCE_CSP)  // 24b RGB/BGR converters, 32 pixels per SIMD call
+YUV444_FUNC(Yuv444ToRgb_SSE41, VP8YuvToRgb32_SSE41, WebPYuv444ToRgb_C, 3);
+YUV444_FUNC(Yuv444ToBgr_SSE41, VP8YuvToBgr32_SSE41, WebPYuv444ToBgr_C, 3);
+#endif  // WEBP_REDUCE_CSP
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitYUV444ConvertersSSE41(void) {
+#if !defined(WEBP_REDUCE_CSP)
+  WebPYUV444Converters[MODE_RGB]       = Yuv444ToRgb_SSE41;
+  WebPYUV444Converters[MODE_BGR]       = Yuv444ToBgr_SSE41;
+#endif   // WEBP_REDUCE_CSP (leaves the table untouched when defined)
+}
+
+#else
+
+WEBP_DSP_INIT_STUB(WebPInitYUV444ConvertersSSE41)
+
+#endif  // WEBP_USE_SSE41
+
+#if !(defined(FANCY_UPSAMPLING) && defined(WEBP_USE_SSE41))
+WEBP_DSP_INIT_STUB(WebPInitUpsamplersSSE41)
+#endif
diff --git a/src/3rdparty/libwebp/src/dsp/yuv.c b/src/3rdparty/libwebp/src/dsp/yuv.c
index bddf81f..14e67fc 100644
--- a/src/3rdparty/libwebp/src/dsp/yuv.c
+++ b/src/3rdparty/libwebp/src/dsp/yuv.c
@@ -71,15 +71,11 @@ void WebPSamplerProcessPlane(const uint8_t* y, int y_stride,
 WebPSamplerRowFunc WebPSamplers[MODE_LAST];
 
 extern void WebPInitSamplersSSE2(void);
+extern void WebPInitSamplersSSE41(void);
 extern void WebPInitSamplersMIPS32(void);
 extern void WebPInitSamplersMIPSdspR2(void);
 
-static volatile VP8CPUInfo yuv_last_cpuinfo_used =
-    (VP8CPUInfo)&yuv_last_cpuinfo_used;
-
-WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplers(void) {
-  if (yuv_last_cpuinfo_used == VP8GetCPUInfo) return;
-
+WEBP_DSP_INIT_FUNC(WebPInitSamplers) {
   WebPSamplers[MODE_RGB]       = YuvToRgbRow;
   WebPSamplers[MODE_RGBA]      = YuvToRgbaRow;
   WebPSamplers[MODE_BGR]       = YuvToBgrRow;
@@ -99,6 +95,11 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplers(void) {
       WebPInitSamplersSSE2();
     }
 #endif  // WEBP_USE_SSE2
+#if defined(WEBP_USE_SSE41)
+    if (VP8GetCPUInfo(kSSE4_1)) {
+      WebPInitSamplersSSE41();
+    }
+#endif  // WEBP_USE_SSE41
 #if defined(WEBP_USE_MIPS32)
     if (VP8GetCPUInfo(kMIPS32)) {
       WebPInitSamplersMIPS32();
@@ -110,7 +111,6 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplers(void) {
     }
 #endif  // WEBP_USE_MIPS_DSP_R2
   }
-  yuv_last_cpuinfo_used = VP8GetCPUInfo;
 }
 
 //-----------------------------------------------------------------------------
@@ -254,17 +254,13 @@ void (*WebPSharpYUVUpdateRGB)(const int16_t* ref, const int16_t* src,
 void (*WebPSharpYUVFilterRow)(const int16_t* A, const int16_t* B, int len,
                               const uint16_t* best_y, uint16_t* out);
 
-static volatile VP8CPUInfo rgba_to_yuv_last_cpuinfo_used =
-    (VP8CPUInfo)&rgba_to_yuv_last_cpuinfo_used;
-
 extern void WebPInitConvertARGBToYUVSSE2(void);
+extern void WebPInitConvertARGBToYUVSSE41(void);
 extern void WebPInitConvertARGBToYUVNEON(void);
 extern void WebPInitSharpYUVSSE2(void);
 extern void WebPInitSharpYUVNEON(void);
 
-WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUV(void) {
-  if (rgba_to_yuv_last_cpuinfo_used == VP8GetCPUInfo) return;
-
+WEBP_DSP_INIT_FUNC(WebPInitConvertARGBToYUV) {
   WebPConvertARGBToY = ConvertARGBToY_C;
   WebPConvertARGBToUV = WebPConvertARGBToUV_C;
 
@@ -286,6 +282,11 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUV(void) {
       WebPInitSharpYUVSSE2();
     }
 #endif  // WEBP_USE_SSE2
+#if defined(WEBP_USE_SSE41)
+    if (VP8GetCPUInfo(kSSE4_1)) {
+      WebPInitConvertARGBToYUVSSE41();
+    }
+#endif  // WEBP_USE_SSE41
   }
 
 #if defined(WEBP_USE_NEON)
@@ -304,6 +305,4 @@ WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUV(void) {
   assert(WebPSharpYUVUpdateY != NULL);
   assert(WebPSharpYUVUpdateRGB != NULL);
   assert(WebPSharpYUVFilterRow != NULL);
-
-  rgba_to_yuv_last_cpuinfo_used = VP8GetCPUInfo;
 }
diff --git a/src/3rdparty/libwebp/src/dsp/yuv.h b/src/3rdparty/libwebp/src/dsp/yuv.h
index c8a5583..eb78727 100644
--- a/src/3rdparty/libwebp/src/dsp/yuv.h
+++ b/src/3rdparty/libwebp/src/dsp/yuv.h
@@ -166,6 +166,19 @@ void VP8YuvToRgb56532_SSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v,
 
 #endif    // WEBP_USE_SSE2
 
+//-----------------------------------------------------------------------------
+// SSE41 extra functions (mostly for upsampling_sse41.c)
+
+#if defined(WEBP_USE_SSE41)
+
+// Process 32 pixels and store the result (24b per pixel, 96 bytes) in *dst.
+void VP8YuvToRgb32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                         uint8_t* dst);
+void VP8YuvToBgr32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                         uint8_t* dst);
+
+#endif    // WEBP_USE_SSE41
+
 //------------------------------------------------------------------------------
 // RGB -> YUV conversion
 
diff --git a/src/3rdparty/libwebp/src/dsp/yuv_sse2.c b/src/3rdparty/libwebp/src/dsp/yuv_sse2.c
index 6810bf8..baa48d5 100644
--- a/src/3rdparty/libwebp/src/dsp/yuv_sse2.c
+++ b/src/3rdparty/libwebp/src/dsp/yuv_sse2.c
@@ -180,7 +180,7 @@ static WEBP_INLINE void PlanarTo24b_SSE2(__m128i* const in0, __m128i* const in1,
   // Repeat the same permutations twice more:
   //   r0r4g0g4 | b0b4r1r5 | g1g5b1b5 | r2r6g2g6 | b2b6r3r7 | g3g7b3b7
   //   r0g0b0r1 | g1b1r2g2 | b2r3g3b3 | r4g4b4r5 | g5b5r6g6 | b6r7g7b7
-  VP8PlanarTo24b(in0, in1, in2, in3, in4, in5);
+  VP8PlanarTo24b_SSE2(in0, in1, in2, in3, in4, in5);
 
   _mm_storeu_si128((__m128i*)(rgb +  0), *in0);
   _mm_storeu_si128((__m128i*)(rgb + 16), *in1);
@@ -492,7 +492,7 @@ static WEBP_INLINE void RGB32PackedToPlanar_SSE2(const uint32_t* const argb,
   __m128i a1 = LOAD_16(argb + 4);
   __m128i a2 = LOAD_16(argb + 8);
   __m128i a3 = LOAD_16(argb + 12);
-  VP8L32bToPlanar(&a0, &a1, &a2, &a3);
+  VP8L32bToPlanar_SSE2(&a0, &a1, &a2, &a3);
   rgb[0] = _mm_unpacklo_epi8(a1, zero);
   rgb[1] = _mm_unpackhi_epi8(a1, zero);
   rgb[2] = _mm_unpacklo_epi8(a2, zero);
diff --git a/src/3rdparty/libwebp/src/dsp/yuv_sse41.c b/src/3rdparty/libwebp/src/dsp/yuv_sse41.c
new file mode 100644
index 0000000..579d1f7
--- /dev/null
+++ b/src/3rdparty/libwebp/src/dsp/yuv_sse41.c
@@ -0,0 +1,613 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// YUV->RGB conversion functions
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "src/dsp/yuv.h"
+
+#if defined(WEBP_USE_SSE41)
+
+#include "src/dsp/common_sse41.h"
+#include <stdlib.h>
+#include <smmintrin.h>
+
+//-----------------------------------------------------------------------------
+// Convert spans of 32 pixels to various RGB formats for the fancy upsampler.
+
+// These constants are the 14b fixed-point version of ITU-R BT.601 constants.
+// R = (19077 * y             + 26149 * v - 14234) >> 6
+// G = (19077 * y -  6419 * u - 13320 * v +  8708) >> 6
+// B = (19077 * y + 33050 * u             - 17685) >> 6
+static void ConvertYUV444ToRGB_SSE41(const __m128i* const Y0,
+                                     const __m128i* const U0,
+                                     const __m128i* const V0,
+                                     __m128i* const R,
+                                     __m128i* const G,
+                                     __m128i* const B) {
+  const __m128i k19077 = _mm_set1_epi16(19077);
+  const __m128i k26149 = _mm_set1_epi16(26149);
+  const __m128i k14234 = _mm_set1_epi16(14234);
+  // 33050 doesn't fit in a signed short: only use this with unsigned arithmetic
+  const __m128i k33050 = _mm_set1_epi16((short)33050);
+  const __m128i k17685 = _mm_set1_epi16(17685);
+  const __m128i k6419  = _mm_set1_epi16(6419);
+  const __m128i k13320 = _mm_set1_epi16(13320);
+  const __m128i k8708  = _mm_set1_epi16(8708);
+
+  const __m128i Y1 = _mm_mulhi_epu16(*Y0, k19077);  // callers pass samples << 8
+
+  const __m128i R0 = _mm_mulhi_epu16(*V0, k26149);
+  const __m128i R1 = _mm_sub_epi16(Y1, k14234);
+  const __m128i R2 = _mm_add_epi16(R1, R0);
+
+  const __m128i G0 = _mm_mulhi_epu16(*U0, k6419);
+  const __m128i G1 = _mm_mulhi_epu16(*V0, k13320);
+  const __m128i G2 = _mm_add_epi16(Y1, k8708);
+  const __m128i G3 = _mm_add_epi16(G0, G1);
+  const __m128i G4 = _mm_sub_epi16(G2, G3);
+
+  // be careful with the saturated *unsigned* arithmetic here!
+  const __m128i B0 = _mm_mulhi_epu16(*U0, k33050);
+  const __m128i B1 = _mm_adds_epu16(B0, Y1);
+  const __m128i B2 = _mm_subs_epu16(B1, k17685);
+
+  // arithmetic shift for R2/G4; logical shift for B2, which can exceed 32767
+  *R = _mm_srai_epi16(R2, 6);   // range: [-14234, 30815]
+  *G = _mm_srai_epi16(G4, 6);   // range: [-10953, 27710]
+  *B = _mm_srli_epi16(B2, 6);   // range: [0, 34238]
+}
+
+// Load 8 bytes into the *upper* half of eight 16b words ("<< 8", basically).
+static WEBP_INLINE __m128i Load_HI_16_SSE41(const uint8_t* src) {
+  const __m128i zero = _mm_setzero_si128();
+  return _mm_unpacklo_epi8(zero, _mm_loadl_epi64((const __m128i*)src));
+}
+
+// Load 4 U/V samples into the upper byte of 16b words, each replicated twice.
+static WEBP_INLINE __m128i Load_UV_HI_8_SSE41(const uint8_t* src) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i tmp0 = _mm_cvtsi32_si128(*(const uint32_t*)src);  // NOTE(review): type-punned, possibly unaligned read
+  const __m128i tmp1 = _mm_unpacklo_epi8(zero, tmp0);
+  return _mm_unpacklo_epi16(tmp1, tmp1);   // replicate samples: u0 u0 u1 u1 ...
+}
+
+// Convert 8 samples of YUV444 to R/G/B (one 16b value per sample and channel)
+static void YUV444ToRGB_SSE41(const uint8_t* const y,
+                              const uint8_t* const u,
+                              const uint8_t* const v,
+                              __m128i* const R, __m128i* const G,
+                              __m128i* const B) {
+  const __m128i Y0 = Load_HI_16_SSE41(y), U0 = Load_HI_16_SSE41(u),
+                V0 = Load_HI_16_SSE41(v);
+  ConvertYUV444ToRGB_SSE41(&Y0, &U0, &V0, R, G, B);
+}
+
+// Convert 8 luma samples of YUV420 (sharing 4 u/v samples) to R/G/B
+static void YUV420ToRGB_SSE41(const uint8_t* const y,
+                              const uint8_t* const u,
+                              const uint8_t* const v,
+                              __m128i* const R, __m128i* const G,
+                              __m128i* const B) {
+  const __m128i Y0 = Load_HI_16_SSE41(y), U0 = Load_UV_HI_8_SSE41(u),
+                V0 = Load_UV_HI_8_SSE41(v);
+  ConvertYUV444ToRGB_SSE41(&Y0, &U0, &V0, R, G, B);
+}
+
+// Pack the planar buffers
+// rrrr... rrrr... gggg... gggg... bbbb... bbbb....
+// triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ...
+static WEBP_INLINE void PlanarTo24b_SSE41(
+    __m128i* const in0, __m128i* const in1, __m128i* const in2,
+    __m128i* const in3, __m128i* const in4, __m128i* const in5,
+    uint8_t* const rgb) {
+  // The input is 6 registers of sixteen 8b but for the sake of explanation,
+  // let's take 6 registers of four 8b values.
+  // To pack, we will keep taking one every two 8b integer and move it
+  // around as follows:
+  // Input:
+  //   r0r1r2r3 | r4r5r6r7 | g0g1g2g3 | g4g5g6g7 | b0b1b2b3 | b4b5b6b7
+  // Split the 6 registers in two sets of 3 registers: the first set as the even
+  // 8b bytes, the second the odd ones:
+  //   r0r2r4r6 | g0g2g4g6 | b0b2b4b6 | r1r3r5r7 | g1g3g5g7 | b1b3b5b7
+  // Repeat the same permutations twice more:
+  //   r0r4g0g4 | b0b4r1r5 | g1g5b1b5 | r2r6g2g6 | b2b6r3r7 | g3g7b3b7
+  //   r0g0b0r1 | g1b1r2g2 | b2r3g3b3 | r4g4b4r5 | g5b5r6g6 | b6r7g7b7
+  VP8PlanarTo24b_SSE41(in0, in1, in2, in3, in4, in5);  // in0..in5 now hold the packed result
+
+  _mm_storeu_si128((__m128i*)(rgb +  0), *in0);  // 6 x 16 = 96 bytes written
+  _mm_storeu_si128((__m128i*)(rgb + 16), *in1);
+  _mm_storeu_si128((__m128i*)(rgb + 32), *in2);
+  _mm_storeu_si128((__m128i*)(rgb + 48), *in3);
+  _mm_storeu_si128((__m128i*)(rgb + 64), *in4);
+  _mm_storeu_si128((__m128i*)(rgb + 80), *in5);
+}
+
+void VP8YuvToRgb32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                         uint8_t* dst) {
+  __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
+  __m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;
+
+  YUV444ToRGB_SSE41(y + 0, u + 0, v + 0, &R0, &G0, &B0);  // pixels 0..7
+  YUV444ToRGB_SSE41(y + 8, u + 8, v + 8, &R1, &G1, &B1);  // pixels 8..15
+  YUV444ToRGB_SSE41(y + 16, u + 16, v + 16, &R2, &G2, &B2);  // pixels 16..23
+  YUV444ToRGB_SSE41(y + 24, u + 24, v + 24, &R3, &G3, &B3);  // pixels 24..31
+
+  // Saturate to unsigned 8b and keep planar: RRRRGGGGBBBB.
+  rgb0 = _mm_packus_epi16(R0, R1);
+  rgb1 = _mm_packus_epi16(R2, R3);
+  rgb2 = _mm_packus_epi16(G0, G1);
+  rgb3 = _mm_packus_epi16(G2, G3);
+  rgb4 = _mm_packus_epi16(B0, B1);
+  rgb5 = _mm_packus_epi16(B2, B3);
+
+  // Pack as RGBRGBRGBRGB (96 bytes stored at dst).
+  PlanarTo24b_SSE41(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
+}
+
+void VP8YuvToBgr32_SSE41(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                         uint8_t* dst) {
+  __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
+  __m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;
+
+  YUV444ToRGB_SSE41(y +  0, u +  0, v +  0, &R0, &G0, &B0);  // pixels 0..7
+  YUV444ToRGB_SSE41(y +  8, u +  8, v +  8, &R1, &G1, &B1);  // pixels 8..15
+  YUV444ToRGB_SSE41(y + 16, u + 16, v + 16, &R2, &G2, &B2);  // pixels 16..23
+  YUV444ToRGB_SSE41(y + 24, u + 24, v + 24, &R3, &G3, &B3);  // pixels 24..31
+
+  // Saturate to unsigned 8b and keep planar: BBBBGGGGRRRR.
+  bgr0 = _mm_packus_epi16(B0, B1);
+  bgr1 = _mm_packus_epi16(B2, B3);
+  bgr2 = _mm_packus_epi16(G0, G1);
+  bgr3 = _mm_packus_epi16(G2, G3);
+  bgr4 = _mm_packus_epi16(R0, R1);
+  bgr5= _mm_packus_epi16(R2, R3);
+
+  // Pack as BGRBGRBGRBGR (96 bytes stored at dst).
+  PlanarTo24b_SSE41(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst);
+}
+
+//-----------------------------------------------------------------------------
+// Arbitrary-length row conversion functions
+
+static void YuvToRgbRow_SSE41(const uint8_t* y,
+                              const uint8_t* u, const uint8_t* v,
+                              uint8_t* dst, int len) {
+  int n;  // index of the current pixel
+  for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {  // 32 pixels per pass
+    __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
+    __m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;
+
+    YUV420ToRGB_SSE41(y +  0, u +  0, v +  0, &R0, &G0, &B0);
+    YUV420ToRGB_SSE41(y +  8, u +  4, v +  4, &R1, &G1, &B1);
+    YUV420ToRGB_SSE41(y + 16, u +  8, v +  8, &R2, &G2, &B2);
+    YUV420ToRGB_SSE41(y + 24, u + 12, v + 12, &R3, &G3, &B3);
+
+    // Saturate to unsigned 8b and keep planar: RRRRGGGGBBBB.
+    rgb0 = _mm_packus_epi16(R0, R1);
+    rgb1 = _mm_packus_epi16(R2, R3);
+    rgb2 = _mm_packus_epi16(G0, G1);
+    rgb3 = _mm_packus_epi16(G2, G3);
+    rgb4 = _mm_packus_epi16(B0, B1);
+    rgb5 = _mm_packus_epi16(B2, B3);
+
+    // Pack as RGBRGBRGBRGB.
+    PlanarTo24b_SSE41(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
+
+    y += 32;
+    u += 16;
+    v += 16;
+  }
+  for (; n < len; ++n) {   // Finish off the remaining pixels one at a time
+    VP8YuvToRgb(y[0], u[0], v[0], dst);
+    dst += 3;
+    y += 1;
+    u += (n & 1);  // u/v advance only after an odd pixel (shared by two pixels)
+    v += (n & 1);
+  }
+}
+
+static void YuvToBgrRow_SSE41(const uint8_t* y,
+                              const uint8_t* u, const uint8_t* v,
+                              uint8_t* dst, int len) {
+  int n;  // index of the current pixel
+  for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {  // 32 pixels per pass
+    __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
+    __m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;
+
+    YUV420ToRGB_SSE41(y +  0, u +  0, v +  0, &R0, &G0, &B0);
+    YUV420ToRGB_SSE41(y +  8, u +  4, v +  4, &R1, &G1, &B1);
+    YUV420ToRGB_SSE41(y + 16, u +  8, v +  8, &R2, &G2, &B2);
+    YUV420ToRGB_SSE41(y + 24, u + 12, v + 12, &R3, &G3, &B3);
+
+    // Saturate to unsigned 8b and keep planar: BBBBGGGGRRRR.
+    bgr0 = _mm_packus_epi16(B0, B1);
+    bgr1 = _mm_packus_epi16(B2, B3);
+    bgr2 = _mm_packus_epi16(G0, G1);
+    bgr3 = _mm_packus_epi16(G2, G3);
+    bgr4 = _mm_packus_epi16(R0, R1);
+    bgr5 = _mm_packus_epi16(R2, R3);
+
+    // Pack as BGRBGRBGRBGR.
+    PlanarTo24b_SSE41(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst);
+
+    y += 32;
+    u += 16;
+    v += 16;
+  }
+  for (; n < len; ++n) {   // Finish off the remaining pixels one at a time
+    VP8YuvToBgr(y[0], u[0], v[0], dst);
+    dst += 3;
+    y += 1;
+    u += (n & 1);  // u/v advance only after an odd pixel (shared by two pixels)
+    v += (n & 1);
+  }
+}
+
+//------------------------------------------------------------------------------
+// Entry point: installs the RGB/BGR row samplers into WebPSamplers[].
+
+extern void WebPInitSamplersSSE41(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersSSE41(void) {
+  WebPSamplers[MODE_RGB]  = YuvToRgbRow_SSE41;
+  WebPSamplers[MODE_BGR]  = YuvToBgrRow_SSE41;
+}
+
+//------------------------------------------------------------------------------
+// RGB24/32 -> YUV converters
+
+// Load eight 16b-words from *src.
+#define LOAD_16(src) _mm_loadu_si128((const __m128i*)(src))
+// Store eight 16b-words into *dst
+#define STORE_16(V, dst) _mm_storeu_si128((__m128i*)(dst), (V))
+
+#define WEBP_SSE41_SHUFF(OUT)  do {                  \
+  const __m128i tmp0 = _mm_shuffle_epi8(A0, shuff0); \
+  const __m128i tmp1 = _mm_shuffle_epi8(A1, shuff1); \
+  const __m128i tmp2 = _mm_shuffle_epi8(A2, shuff2); \
+  const __m128i tmp3 = _mm_shuffle_epi8(A3, shuff0); \
+  const __m128i tmp4 = _mm_shuffle_epi8(A4, shuff1); \
+  const __m128i tmp5 = _mm_shuffle_epi8(A5, shuff2); \
+                                                     \
+  /* OR the pieces: 2 x 16 bytes, one channel */     \
+  const __m128i tmp6 = _mm_or_si128(tmp0, tmp1);     \
+  const __m128i tmp7 = _mm_or_si128(tmp3, tmp4);     \
+  out[OUT + 0] = _mm_or_si128(tmp6, tmp2);           \
+  out[OUT + 1] = _mm_or_si128(tmp7, tmp5);           \
+} while (0);  // NOTE(review): the ';' is part of the macro; call sites omit it
+
+// Unpack the 8b input rgbrgbrgbrgb ... as contiguous registers:
+// rrrr... rrrr... gggg... gggg... bbbb... bbbb....
+// Similar to PlanarTo24bHelper(), but in reverse order.
+static WEBP_INLINE void RGB24PackedToPlanar_SSE41(
+    const uint8_t* const rgb, __m128i* const out /*out[6]*/) {
+  const __m128i A0 = _mm_loadu_si128((const __m128i*)(rgb +  0));  // 96 bytes read
+  const __m128i A1 = _mm_loadu_si128((const __m128i*)(rgb + 16));
+  const __m128i A2 = _mm_loadu_si128((const __m128i*)(rgb + 32));
+  const __m128i A3 = _mm_loadu_si128((const __m128i*)(rgb + 48));
+  const __m128i A4 = _mm_loadu_si128((const __m128i*)(rgb + 64));
+  const __m128i A5 = _mm_loadu_si128((const __m128i*)(rgb + 80));
+
+  // Compute RR: out[0], out[1] (a -1 shuffle index yields a zero byte).
+  {
+    const __m128i shuff0 = _mm_set_epi8(
+        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 15, 12, 9, 6, 3, 0);
+    const __m128i shuff1 = _mm_set_epi8(
+        -1, -1, -1, -1, -1, 14, 11, 8, 5, 2, -1, -1, -1, -1, -1, -1);
+    const __m128i shuff2 = _mm_set_epi8(
+        13, 10, 7, 4, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
+    WEBP_SSE41_SHUFF(0)
+  }
+  // Compute GG: out[2], out[3].
+  {
+    const __m128i shuff0 = _mm_set_epi8(
+        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 13, 10, 7, 4, 1);
+    const __m128i shuff1 = _mm_set_epi8(
+        -1, -1, -1, -1, -1, 15, 12, 9, 6, 3, 0, -1, -1, -1, -1, -1);
+    const __m128i shuff2 = _mm_set_epi8(
+        14, 11, 8, 5, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
+    WEBP_SSE41_SHUFF(2)
+  }
+  // Compute BB: out[4], out[5].
+  {
+    const __m128i shuff0 = _mm_set_epi8(
+        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 14, 11, 8, 5, 2);
+    const __m128i shuff1 = _mm_set_epi8(
+        -1, -1, -1, -1, -1, -1, 13, 10, 7, 4, 1, -1, -1, -1, -1, -1);
+    const __m128i shuff2 = _mm_set_epi8(
+        15, 12, 9, 6, 3, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
+    WEBP_SSE41_SHUFF(4)
+  }
+}
+
+#undef WEBP_SSE41_SHUFF
+
+// Convert 16 packed ARGB to r[], g[], b[] (three pairs of 8 x 16b registers)
+static WEBP_INLINE void RGB32PackedToPlanar_SSE41(
+    const uint32_t* const argb, __m128i* const rgb /*out[6]*/) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i a0 = LOAD_16(argb + 0);
+  __m128i a1 = LOAD_16(argb + 4);
+  __m128i a2 = LOAD_16(argb + 8);
+  __m128i a3 = LOAD_16(argb + 12);
+  VP8L32bToPlanar_SSE41(&a0, &a1, &a2, &a3);  // a0 is not used afterwards
+  rgb[0] = _mm_unpacklo_epi8(a1, zero);  // zero-extend each byte to 16b
+  rgb[1] = _mm_unpackhi_epi8(a1, zero);
+  rgb[2] = _mm_unpacklo_epi8(a2, zero);
+  rgb[3] = _mm_unpackhi_epi8(a2, zero);
+  rgb[4] = _mm_unpacklo_epi8(a3, zero);
+  rgb[5] = _mm_unpackhi_epi8(a3, zero);
+}
+
+// This macro computes (RG * MULT_RG + GB * MULT_GB + ROUNDER) >> DESCALE_FIX
+// It's a macro and not a function because we need to use immediate values with
+// srai_epi32 (the shift count DESCALE_FIX). The result is packed to signed 16b.
+#define TRANSFORM(RG_LO, RG_HI, GB_LO, GB_HI, MULT_RG, MULT_GB, \
+                  ROUNDER, DESCALE_FIX, OUT) do {               \
+  const __m128i V0_lo = _mm_madd_epi16(RG_LO, MULT_RG);         \
+  const __m128i V0_hi = _mm_madd_epi16(RG_HI, MULT_RG);         \
+  const __m128i V1_lo = _mm_madd_epi16(GB_LO, MULT_GB);         \
+  const __m128i V1_hi = _mm_madd_epi16(GB_HI, MULT_GB);         \
+  const __m128i V2_lo = _mm_add_epi32(V0_lo, V1_lo);            \
+  const __m128i V2_hi = _mm_add_epi32(V0_hi, V1_hi);            \
+  const __m128i V3_lo = _mm_add_epi32(V2_lo, ROUNDER);          \
+  const __m128i V3_hi = _mm_add_epi32(V2_hi, ROUNDER);          \
+  const __m128i V5_lo = _mm_srai_epi32(V3_lo, DESCALE_FIX);     \
+  const __m128i V5_hi = _mm_srai_epi32(V3_hi, DESCALE_FIX);     \
+  (OUT) = _mm_packs_epi32(V5_lo, V5_hi);                        \
+} while (0)
+
+#define MK_CST_16(A, B) _mm_set_epi16((B), (A), (B), (A), (B), (A), (B), (A))
+static WEBP_INLINE void ConvertRGBToY_SSE41(const __m128i* const R,
+                                            const __m128i* const G,
+                                            const __m128i* const B,
+                                            __m128i* const Y) {  // 8 x 16b r/g/b in, 8 x 16b luma out
+  const __m128i kRG_y = MK_CST_16(16839, 33059 - 16384);  // R weight, part of G weight
+  const __m128i kGB_y = MK_CST_16(16384, 6420);  // rest of G weight (33059 exceeds int16), B weight
+  const __m128i kHALF_Y = _mm_set1_epi32((16 << YUV_FIX) + YUV_HALF);  // offset 16 scaled by YUV_FIX, plus YUV_HALF
+
+  const __m128i RG_lo = _mm_unpacklo_epi16(*R, *G);  // interleave as r,g pairs for madd
+  const __m128i RG_hi = _mm_unpackhi_epi16(*R, *G);
+  const __m128i GB_lo = _mm_unpacklo_epi16(*G, *B);  // interleave as g,b pairs for madd
+  const __m128i GB_hi = _mm_unpackhi_epi16(*G, *B);
+  TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_y, kGB_y, kHALF_Y, YUV_FIX, *Y);  // (16839*R + 33059*G + 6420*B + kHALF_Y) >> YUV_FIX
+}
+
+static WEBP_INLINE void ConvertRGBToUV_SSE41(const __m128i* const R,
+                                             const __m128i* const G,
+                                             const __m128i* const B,
+                                             __m128i* const U,
+                                             __m128i* const V) {  // 8 x 16b r/g/b in, 8 x 16b u and v out
+  const __m128i kRG_u = MK_CST_16(-9719, -19081);  // U = -9719*R - 19081*G + 28800*B
+  const __m128i kGB_u = MK_CST_16(0, 28800);
+  const __m128i kRG_v = MK_CST_16(28800, 0);  // V = 28800*R - 24116*G - 4684*B
+  const __m128i kGB_v = MK_CST_16(-24116, -4684);
+  const __m128i kHALF_UV = _mm_set1_epi32(((128 << YUV_FIX) + YUV_HALF) << 2);  // << 2 matches the extra 2 bits of descale below
+
+  const __m128i RG_lo = _mm_unpacklo_epi16(*R, *G);
+  const __m128i RG_hi = _mm_unpackhi_epi16(*R, *G);
+  const __m128i GB_lo = _mm_unpacklo_epi16(*G, *B);
+  const __m128i GB_hi = _mm_unpackhi_epi16(*G, *B);
+  TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_u, kGB_u,
+            kHALF_UV, YUV_FIX + 2, *U);  // + 2: inputs presumably carry a x4 scale (e.g. HorizontalAddPack output)
+  TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_v, kGB_v,
+            kHALF_UV, YUV_FIX + 2, *V);
+}
+
+#undef MK_CST_16
+#undef TRANSFORM
+
+static void ConvertRGB24ToY_SSE41(const uint8_t* rgb, uint8_t* y, int width) {
+  const int max_width = width & ~31;  // SIMD part: largest multiple of 32 pixels
+  int i;
+  for (i = 0; i < max_width; rgb += 3 * 16 * 2) {  // 96 bytes per pass; i advances in the inner loop
+    __m128i rgb_plane[6];
+    int j;
+
+    RGB24PackedToPlanar_SSE41(rgb, rgb_plane);
+
+    for (j = 0; j < 2; ++j, i += 16) {  // two groups of 16 pixels
+      const __m128i zero = _mm_setzero_si128();
+      __m128i r, g, b, Y0, Y1;
+
+      // Low 8 pixels of the group: widen to 16b and convert to 16-bit Y.
+      r = _mm_unpacklo_epi8(rgb_plane[0 + j], zero);
+      g = _mm_unpacklo_epi8(rgb_plane[2 + j], zero);
+      b = _mm_unpacklo_epi8(rgb_plane[4 + j], zero);
+      ConvertRGBToY_SSE41(&r, &g, &b, &Y0);
+
+      // High 8 pixels of the group: widen to 16b and convert to 16-bit Y.
+      r = _mm_unpackhi_epi8(rgb_plane[0 + j], zero);
+      g = _mm_unpackhi_epi8(rgb_plane[2 + j], zero);
+      b = _mm_unpackhi_epi8(rgb_plane[4 + j], zero);
+      ConvertRGBToY_SSE41(&r, &g, &b, &Y1);
+
+      // Cast to 8-bit (unsigned saturation) and store.
+      STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
+    }
+  }
+  for (; i < width; ++i, rgb += 3) {   // left-over, one pixel at a time
+    y[i] = VP8RGBToY(rgb[0], rgb[1], rgb[2], YUV_HALF);
+  }
+}
+
+static void ConvertBGR24ToY_SSE41(const uint8_t* bgr, uint8_t* y, int width) {
+  const int max_width = width & ~31;  // SIMD part: largest multiple of 32 pixels
+  int i;
+  for (i = 0; i < max_width; bgr += 3 * 16 * 2) {  // 96 bytes per pass; i advances in the inner loop
+    __m128i bgr_plane[6];
+    int j;
+
+    RGB24PackedToPlanar_SSE41(bgr, bgr_plane);  // planes come out in b, g, r order here
+
+    for (j = 0; j < 2; ++j, i += 16) {  // two groups of 16 pixels
+      const __m128i zero = _mm_setzero_si128();
+      __m128i r, g, b, Y0, Y1;
+
+      // Low 8 pixels of the group: widen to 16b and convert to 16-bit Y.
+      b = _mm_unpacklo_epi8(bgr_plane[0 + j], zero);
+      g = _mm_unpacklo_epi8(bgr_plane[2 + j], zero);
+      r = _mm_unpacklo_epi8(bgr_plane[4 + j], zero);
+      ConvertRGBToY_SSE41(&r, &g, &b, &Y0);
+
+      // High 8 pixels of the group: widen to 16b and convert to 16-bit Y.
+      b = _mm_unpackhi_epi8(bgr_plane[0 + j], zero);
+      g = _mm_unpackhi_epi8(bgr_plane[2 + j], zero);
+      r = _mm_unpackhi_epi8(bgr_plane[4 + j], zero);
+      ConvertRGBToY_SSE41(&r, &g, &b, &Y1);
+
+      // Cast to 8-bit (unsigned saturation) and store.
+      STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
+    }
+  }
+  for (; i < width; ++i, bgr += 3) {  // left-over, one pixel at a time
+    y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF);
+  }
+}
+
+static void ConvertARGBToY_SSE41(const uint32_t* argb, uint8_t* y, int width) {
+  const int max_width = width & ~15;  // SIMD part: largest multiple of 16 pixels
+  int i;
+  for (i = 0; i < max_width; i += 16) {
+    __m128i Y0, Y1, rgb[6];
+    RGB32PackedToPlanar_SSE41(&argb[i], rgb);  // 16 pixels -> r, g, b as 16b, two registers each
+    ConvertRGBToY_SSE41(&rgb[0], &rgb[2], &rgb[4], &Y0);  // low halves of r, g, b
+    ConvertRGBToY_SSE41(&rgb[1], &rgb[3], &rgb[5], &Y1);  // high halves of r, g, b
+    STORE_16(_mm_packus_epi16(Y0, Y1), y + i);  // narrow to 8b with unsigned saturation
+  }
+  for (; i < width; ++i) {   // left-over, one pixel at a time
+    const uint32_t p = argb[i];
+    y[i] = VP8RGBToY((p >> 16) & 0xff, (p >> 8) & 0xff, (p >>  0) & 0xff,
+                     YUV_HALF);
+  }
+}
+
+// Horizontal add (doubled) of two 16b values, result is 16b.
+// in: A | B | C | D | ... -> out: 2*(A+B) | 2*(C+D) | ...
+static void HorizontalAddPack_SSE41(const __m128i* const A,
+                                    const __m128i* const B,
+                                    __m128i* const out) {  // out may alias A, as the callers in this file do
+  const __m128i k2 = _mm_set1_epi16(2);
+  const __m128i C = _mm_madd_epi16(*A, k2);  // 4 x 32b: 2*a[2k] + 2*a[2k+1]
+  const __m128i D = _mm_madd_epi16(*B, k2);
+  *out = _mm_packs_epi32(C, D);  // sums from *A in the low half, from *B in the high half
+}
+
+static void ConvertARGBToUV_SSE41(const uint32_t* argb,
+                                  uint8_t* u, uint8_t* v,
+                                  int src_width, int do_store) {
+  const int max_width = src_width & ~31;  // SIMD part: largest multiple of 32 pixels
+  int i;
+  for (i = 0; i < max_width; i += 32, u += 16, v += 16) {  // 32 source pixels -> 16 u/v samples
+    __m128i rgb[6], U0, V0, U1, V1;
+    RGB32PackedToPlanar_SSE41(&argb[i], rgb);  // first 16 pixels
+    HorizontalAddPack_SSE41(&rgb[0], &rgb[1], &rgb[0]);  // doubled sums of adjacent r values
+    HorizontalAddPack_SSE41(&rgb[2], &rgb[3], &rgb[2]);  // same for g
+    HorizontalAddPack_SSE41(&rgb[4], &rgb[5], &rgb[4]);  // same for b
+    ConvertRGBToUV_SSE41(&rgb[0], &rgb[2], &rgb[4], &U0, &V0);
+
+    RGB32PackedToPlanar_SSE41(&argb[i + 16], rgb);  // next 16 pixels
+    HorizontalAddPack_SSE41(&rgb[0], &rgb[1], &rgb[0]);
+    HorizontalAddPack_SSE41(&rgb[2], &rgb[3], &rgb[2]);
+    HorizontalAddPack_SSE41(&rgb[4], &rgb[5], &rgb[4]);
+    ConvertRGBToUV_SSE41(&rgb[0], &rgb[2], &rgb[4], &U1, &V1);
+
+    U0 = _mm_packus_epi16(U0, U1);  // narrow to 8b with unsigned saturation
+    V0 = _mm_packus_epi16(V0, V1);
+    if (!do_store) {  // average the new samples with the values already in u[] / v[]
+      const __m128i prev_u = LOAD_16(u);
+      const __m128i prev_v = LOAD_16(v);
+      U0 = _mm_avg_epu8(U0, prev_u);
+      V0 = _mm_avg_epu8(V0, prev_v);
+    }
+    STORE_16(U0, u);
+    STORE_16(V0, v);
+  }
+  if (i < src_width) {  // left-over, handled by the plain-C version
+    WebPConvertARGBToUV_C(argb + i, u, v, src_width - i, do_store);
+  }
+}
+
+// Convert 8 packed RGBx pixels (16b per channel) to r[], g[], b[]
+static WEBP_INLINE void RGBA32PackedToPlanar_16b_SSE41(
+    const uint16_t* const rgbx,
+    __m128i* const r, __m128i* const g, __m128i* const b) {
+  const __m128i in0 = LOAD_16(rgbx +  0);  // r0 | g0 | b0 |x| r1 | g1 | b1 |x
+  const __m128i in1 = LOAD_16(rgbx +  8);  // r2 | g2 | b2 |x| r3 | g3 | b3 |x
+  const __m128i in2 = LOAD_16(rgbx + 16);  // r4 | ...
+  const __m128i in3 = LOAD_16(rgbx + 24);  // r6 | ...
+  // 16-bit lanes after shuffle: shuff0 -> r r g g b b 0 0, shuff1 -> r r g g 0 0 b b.
+  const __m128i shuff0 =
+      _mm_set_epi8(-1, -1, -1, -1, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0);
+  const __m128i shuff1 =
+      _mm_set_epi8(13, 12, 5, 4, -1, -1, -1, -1, 11, 10, 3, 2, 9, 8, 1, 0);
+  const __m128i A0 = _mm_shuffle_epi8(in0, shuff0);
+  const __m128i A1 = _mm_shuffle_epi8(in1, shuff1);
+  const __m128i A2 = _mm_shuffle_epi8(in2, shuff0);
+  const __m128i A3 = _mm_shuffle_epi8(in3, shuff1);
+  // R0R1G0G1
+  // B0B1****
+  // R2R3G2G3
+  // B2B3****
+  // (OR is used to free port 5 for the unpack)
+  const __m128i B0 = _mm_unpacklo_epi32(A0, A1);
+  const __m128i B1 = _mm_or_si128(A0, A1);  // only the upper 64 bits (the blue lanes) are used below
+  const __m128i B2 = _mm_unpacklo_epi32(A2, A3);
+  const __m128i B3 = _mm_or_si128(A2, A3);  // only the upper 64 bits (the blue lanes) are used below
+  // Gather the channels: 8 values of 16b each.
+  *r = _mm_unpacklo_epi64(B0, B2);
+  *g = _mm_unpackhi_epi64(B0, B2);
+  *b = _mm_unpackhi_epi64(B1, B3);
+}
+
+static void ConvertRGBA32ToUV_SSE41(const uint16_t* rgb,
+                                    uint8_t* u, uint8_t* v, int width) {
+  const int max_width = width & ~15;  // SIMD part: largest multiple of 16 pixels
+  const uint16_t* const last_rgb = rgb + 4 * max_width;  // 4 x 16b values per pixel
+  while (rgb < last_rgb) {
+    __m128i r, g, b, U0, V0, U1, V1;
+    RGBA32PackedToPlanar_16b_SSE41(rgb +  0, &r, &g, &b);  // pixels 0..7
+    ConvertRGBToUV_SSE41(&r, &g, &b, &U0, &V0);
+    RGBA32PackedToPlanar_16b_SSE41(rgb + 32, &r, &g, &b);  // pixels 8..15
+    ConvertRGBToUV_SSE41(&r, &g, &b, &U1, &V1);
+    STORE_16(_mm_packus_epi16(U0, U1), u);  // narrow to 8b with unsigned saturation
+    STORE_16(_mm_packus_epi16(V0, V1), v);
+    u += 16;
+    v += 16;
+    rgb += 2 * 32;  // 16 pixels consumed
+  }
+  if (max_width < width) {  // left-over, handled by the plain-C version
+    WebPConvertRGBA32ToUV_C(rgb, u, v, width - max_width);
+  }
+}
+
+//------------------------------------------------------------------------------
+
+extern void WebPInitConvertARGBToYUVSSE41(void);
+
+WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVSSE41(void) {  // points the converter hooks at the SSE4.1 versions above
+  WebPConvertARGBToY = ConvertARGBToY_SSE41;
+  WebPConvertARGBToUV = ConvertARGBToUV_SSE41;
+
+  WebPConvertRGB24ToY = ConvertRGB24ToY_SSE41;
+  WebPConvertBGR24ToY = ConvertBGR24ToY_SSE41;
+
+  WebPConvertRGBA32ToUV = ConvertRGBA32ToUV_SSE41;
+}
+
+//------------------------------------------------------------------------------
+
+#else  // !WEBP_USE_SSE41
+
+WEBP_DSP_INIT_STUB(WebPInitSamplersSSE41)
+WEBP_DSP_INIT_STUB(WebPInitConvertARGBToYUVSSE41)
+
+#endif  // WEBP_USE_SSE41
diff --git a/src/3rdparty/libwebp/src/enc/alpha_enc.c b/src/3rdparty/libwebp/src/enc/alpha_enc.c
index 7e8d87f..dce9ca9 100644
--- a/src/3rdparty/libwebp/src/enc/alpha_enc.c
+++ b/src/3rdparty/libwebp/src/enc/alpha_enc.c
@@ -361,7 +361,8 @@ static int EncodeAlpha(VP8Encoder* const enc,
 //------------------------------------------------------------------------------
 // Main calls
 
-static int CompressAlphaJob(VP8Encoder* const enc, void* dummy) {
+static int CompressAlphaJob(void* arg1, void* dummy) {
+  VP8Encoder* const enc = (VP8Encoder*)arg1;
   const WebPConfig* config = enc->config_;
   uint8_t* alpha_data = NULL;
   size_t alpha_size = 0;
@@ -394,7 +395,7 @@ void VP8EncInitAlpha(VP8Encoder* const enc) {
     WebPGetWorkerInterface()->Init(worker);
     worker->data1 = enc;
     worker->data2 = NULL;
-    worker->hook = (WebPWorkerHook)CompressAlphaJob;
+    worker->hook = CompressAlphaJob;
   }
 }
 
diff --git a/src/3rdparty/libwebp/src/enc/analysis_enc.c b/src/3rdparty/libwebp/src/enc/analysis_enc.c
index 08f471f..a47ff7d 100644
--- a/src/3rdparty/libwebp/src/enc/analysis_enc.c
+++ b/src/3rdparty/libwebp/src/enc/analysis_enc.c
@@ -434,7 +434,9 @@ typedef struct {
 } SegmentJob;
 
 // main work call
-static int DoSegmentsJob(SegmentJob* const job, VP8EncIterator* const it) {
+static int DoSegmentsJob(void* arg1, void* arg2) {
+  SegmentJob* const job = (SegmentJob*)arg1;
+  VP8EncIterator* const it = (VP8EncIterator*)arg2;
   int ok = 1;
   if (!VP8IteratorIsDone(it)) {
     uint8_t tmp[32 + WEBP_ALIGN_CST];
@@ -462,7 +464,7 @@ static void InitSegmentJob(VP8Encoder* const enc, SegmentJob* const job,
   WebPGetWorkerInterface()->Init(&job->worker);
   job->worker.data1 = job;
   job->worker.data2 = &job->it;
-  job->worker.hook = (WebPWorkerHook)DoSegmentsJob;
+  job->worker.hook = DoSegmentsJob;
   VP8IteratorInit(enc, &job->it);
   VP8IteratorSetRow(&job->it, start_row);
   VP8IteratorSetCountDown(&job->it, (end_row - start_row) * enc->mb_w_);
diff --git a/src/3rdparty/libwebp/src/enc/delta_palettization_enc.c b/src/3rdparty/libwebp/src/enc/delta_palettization_enc.c
deleted file mode 100644
index a61c8e6..0000000
--- a/src/3rdparty/libwebp/src/enc/delta_palettization_enc.c
+++ /dev/null
@@ -1,455 +0,0 @@
-// Copyright 2015 Google Inc. All Rights Reserved.
-//
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
-// -----------------------------------------------------------------------------
-//
-// Author: Mislav Bradac (mislavm@google.com)
-//
-
-#include "src/enc/delta_palettization_enc.h"
-
-#ifdef WEBP_EXPERIMENTAL_FEATURES
-#include "src/webp/types.h"
-#include "src/dsp/lossless.h"
-
-#define MK_COL(r, g, b) (((r) << 16) + ((g) << 8) + (b))
-
-// Format allows palette up to 256 entries, but more palette entries produce
-// bigger entropy. In the future it will probably be useful to add more entries
-// that are far from the origin of the palette or choose remaining entries
-// dynamically.
-#define DELTA_PALETTE_SIZE 226
-
-// Palette used for delta_palettization. Entries are roughly sorted by distance
-// of their signed equivalents from the origin.
-static const uint32_t kDeltaPalette[DELTA_PALETTE_SIZE] = {
-  MK_COL(0u, 0u, 0u),
-  MK_COL(255u, 255u, 255u),
-  MK_COL(1u, 1u, 1u),
-  MK_COL(254u, 254u, 254u),
-  MK_COL(2u, 2u, 2u),
-  MK_COL(4u, 4u, 4u),
-  MK_COL(252u, 252u, 252u),
-  MK_COL(250u, 0u, 0u),
-  MK_COL(0u, 250u, 0u),
-  MK_COL(0u, 0u, 250u),
-  MK_COL(6u, 0u, 0u),
-  MK_COL(0u, 6u, 0u),
-  MK_COL(0u, 0u, 6u),
-  MK_COL(0u, 0u, 248u),
-  MK_COL(0u, 0u, 8u),
-  MK_COL(0u, 248u, 0u),
-  MK_COL(0u, 248u, 248u),
-  MK_COL(0u, 248u, 8u),
-  MK_COL(0u, 8u, 0u),
-  MK_COL(0u, 8u, 248u),
-  MK_COL(0u, 8u, 8u),
-  MK_COL(8u, 8u, 8u),
-  MK_COL(248u, 0u, 0u),
-  MK_COL(248u, 0u, 248u),
-  MK_COL(248u, 0u, 8u),
-  MK_COL(248u, 248u, 0u),
-  MK_COL(248u, 8u, 0u),
-  MK_COL(8u, 0u, 0u),
-  MK_COL(8u, 0u, 248u),
-  MK_COL(8u, 0u, 8u),
-  MK_COL(8u, 248u, 0u),
-  MK_COL(8u, 8u, 0u),
-  MK_COL(23u, 23u, 23u),
-  MK_COL(13u, 13u, 13u),
-  MK_COL(232u, 232u, 232u),
-  MK_COL(244u, 244u, 244u),
-  MK_COL(245u, 245u, 250u),
-  MK_COL(50u, 50u, 50u),
-  MK_COL(204u, 204u, 204u),
-  MK_COL(236u, 236u, 236u),
-  MK_COL(16u, 16u, 16u),
-  MK_COL(240u, 16u, 16u),
-  MK_COL(16u, 240u, 16u),
-  MK_COL(240u, 240u, 16u),
-  MK_COL(16u, 16u, 240u),
-  MK_COL(240u, 16u, 240u),
-  MK_COL(16u, 240u, 240u),
-  MK_COL(240u, 240u, 240u),
-  MK_COL(0u, 0u, 232u),
-  MK_COL(0u, 232u, 0u),
-  MK_COL(232u, 0u, 0u),
-  MK_COL(0u, 0u, 24u),
-  MK_COL(0u, 24u, 0u),
-  MK_COL(24u, 0u, 0u),
-  MK_COL(32u, 32u, 32u),
-  MK_COL(224u, 32u, 32u),
-  MK_COL(32u, 224u, 32u),
-  MK_COL(224u, 224u, 32u),
-  MK_COL(32u, 32u, 224u),
-  MK_COL(224u, 32u, 224u),
-  MK_COL(32u, 224u, 224u),
-  MK_COL(224u, 224u, 224u),
-  MK_COL(0u, 0u, 176u),
-  MK_COL(0u, 0u, 80u),
-  MK_COL(0u, 176u, 0u),
-  MK_COL(0u, 176u, 176u),
-  MK_COL(0u, 176u, 80u),
-  MK_COL(0u, 80u, 0u),
-  MK_COL(0u, 80u, 176u),
-  MK_COL(0u, 80u, 80u),
-  MK_COL(176u, 0u, 0u),
-  MK_COL(176u, 0u, 176u),
-  MK_COL(176u, 0u, 80u),
-  MK_COL(176u, 176u, 0u),
-  MK_COL(176u, 80u, 0u),
-  MK_COL(80u, 0u, 0u),
-  MK_COL(80u, 0u, 176u),
-  MK_COL(80u, 0u, 80u),
-  MK_COL(80u, 176u, 0u),
-  MK_COL(80u, 80u, 0u),
-  MK_COL(0u, 0u, 152u),
-  MK_COL(0u, 0u, 104u),
-  MK_COL(0u, 152u, 0u),
-  MK_COL(0u, 152u, 152u),
-  MK_COL(0u, 152u, 104u),
-  MK_COL(0u, 104u, 0u),
-  MK_COL(0u, 104u, 152u),
-  MK_COL(0u, 104u, 104u),
-  MK_COL(152u, 0u, 0u),
-  MK_COL(152u, 0u, 152u),
-  MK_COL(152u, 0u, 104u),
-  MK_COL(152u, 152u, 0u),
-  MK_COL(152u, 104u, 0u),
-  MK_COL(104u, 0u, 0u),
-  MK_COL(104u, 0u, 152u),
-  MK_COL(104u, 0u, 104u),
-  MK_COL(104u, 152u, 0u),
-  MK_COL(104u, 104u, 0u),
-  MK_COL(216u, 216u, 216u),
-  MK_COL(216u, 216u, 40u),
-  MK_COL(216u, 216u, 176u),
-  MK_COL(216u, 216u, 80u),
-  MK_COL(216u, 40u, 216u),
-  MK_COL(216u, 40u, 40u),
-  MK_COL(216u, 40u, 176u),
-  MK_COL(216u, 40u, 80u),
-  MK_COL(216u, 176u, 216u),
-  MK_COL(216u, 176u, 40u),
-  MK_COL(216u, 176u, 176u),
-  MK_COL(216u, 176u, 80u),
-  MK_COL(216u, 80u, 216u),
-  MK_COL(216u, 80u, 40u),
-  MK_COL(216u, 80u, 176u),
-  MK_COL(216u, 80u, 80u),
-  MK_COL(40u, 216u, 216u),
-  MK_COL(40u, 216u, 40u),
-  MK_COL(40u, 216u, 176u),
-  MK_COL(40u, 216u, 80u),
-  MK_COL(40u, 40u, 216u),
-  MK_COL(40u, 40u, 40u),
-  MK_COL(40u, 40u, 176u),
-  MK_COL(40u, 40u, 80u),
-  MK_COL(40u, 176u, 216u),
-  MK_COL(40u, 176u, 40u),
-  MK_COL(40u, 176u, 176u),
-  MK_COL(40u, 176u, 80u),
-  MK_COL(40u, 80u, 216u),
-  MK_COL(40u, 80u, 40u),
-  MK_COL(40u, 80u, 176u),
-  MK_COL(40u, 80u, 80u),
-  MK_COL(80u, 216u, 216u),
-  MK_COL(80u, 216u, 40u),
-  MK_COL(80u, 216u, 176u),
-  MK_COL(80u, 216u, 80u),
-  MK_COL(80u, 40u, 216u),
-  MK_COL(80u, 40u, 40u),
-  MK_COL(80u, 40u, 176u),
-  MK_COL(80u, 40u, 80u),
-  MK_COL(80u, 176u, 216u),
-  MK_COL(80u, 176u, 40u),
-  MK_COL(80u, 176u, 176u),
-  MK_COL(80u, 176u, 80u),
-  MK_COL(80u, 80u, 216u),
-  MK_COL(80u, 80u, 40u),
-  MK_COL(80u, 80u, 176u),
-  MK_COL(80u, 80u, 80u),
-  MK_COL(0u, 0u, 192u),
-  MK_COL(0u, 0u, 64u),
-  MK_COL(0u, 0u, 128u),
-  MK_COL(0u, 192u, 0u),
-  MK_COL(0u, 192u, 192u),
-  MK_COL(0u, 192u, 64u),
-  MK_COL(0u, 192u, 128u),
-  MK_COL(0u, 64u, 0u),
-  MK_COL(0u, 64u, 192u),
-  MK_COL(0u, 64u, 64u),
-  MK_COL(0u, 64u, 128u),
-  MK_COL(0u, 128u, 0u),
-  MK_COL(0u, 128u, 192u),
-  MK_COL(0u, 128u, 64u),
-  MK_COL(0u, 128u, 128u),
-  MK_COL(176u, 216u, 216u),
-  MK_COL(176u, 216u, 40u),
-  MK_COL(176u, 216u, 176u),
-  MK_COL(176u, 216u, 80u),
-  MK_COL(176u, 40u, 216u),
-  MK_COL(176u, 40u, 40u),
-  MK_COL(176u, 40u, 176u),
-  MK_COL(176u, 40u, 80u),
-  MK_COL(176u, 176u, 216u),
-  MK_COL(176u, 176u, 40u),
-  MK_COL(176u, 176u, 176u),
-  MK_COL(176u, 176u, 80u),
-  MK_COL(176u, 80u, 216u),
-  MK_COL(176u, 80u, 40u),
-  MK_COL(176u, 80u, 176u),
-  MK_COL(176u, 80u, 80u),
-  MK_COL(192u, 0u, 0u),
-  MK_COL(192u, 0u, 192u),
-  MK_COL(192u, 0u, 64u),
-  MK_COL(192u, 0u, 128u),
-  MK_COL(192u, 192u, 0u),
-  MK_COL(192u, 192u, 192u),
-  MK_COL(192u, 192u, 64u),
-  MK_COL(192u, 192u, 128u),
-  MK_COL(192u, 64u, 0u),
-  MK_COL(192u, 64u, 192u),
-  MK_COL(192u, 64u, 64u),
-  MK_COL(192u, 64u, 128u),
-  MK_COL(192u, 128u, 0u),
-  MK_COL(192u, 128u, 192u),
-  MK_COL(192u, 128u, 64u),
-  MK_COL(192u, 128u, 128u),
-  MK_COL(64u, 0u, 0u),
-  MK_COL(64u, 0u, 192u),
-  MK_COL(64u, 0u, 64u),
-  MK_COL(64u, 0u, 128u),
-  MK_COL(64u, 192u, 0u),
-  MK_COL(64u, 192u, 192u),
-  MK_COL(64u, 192u, 64u),
-  MK_COL(64u, 192u, 128u),
-  MK_COL(64u, 64u, 0u),
-  MK_COL(64u, 64u, 192u),
-  MK_COL(64u, 64u, 64u),
-  MK_COL(64u, 64u, 128u),
-  MK_COL(64u, 128u, 0u),
-  MK_COL(64u, 128u, 192u),
-  MK_COL(64u, 128u, 64u),
-  MK_COL(64u, 128u, 128u),
-  MK_COL(128u, 0u, 0u),
-  MK_COL(128u, 0u, 192u),
-  MK_COL(128u, 0u, 64u),
-  MK_COL(128u, 0u, 128u),
-  MK_COL(128u, 192u, 0u),
-  MK_COL(128u, 192u, 192u),
-  MK_COL(128u, 192u, 64u),
-  MK_COL(128u, 192u, 128u),
-  MK_COL(128u, 64u, 0u),
-  MK_COL(128u, 64u, 192u),
-  MK_COL(128u, 64u, 64u),
-  MK_COL(128u, 64u, 128u),
-  MK_COL(128u, 128u, 0u),
-  MK_COL(128u, 128u, 192u),
-  MK_COL(128u, 128u, 64u),
-  MK_COL(128u, 128u, 128u),
-};
-
-#undef MK_COL
-
-//------------------------------------------------------------------------------
-// TODO(skal): move the functions to dsp/lossless.c when the correct
-// granularity is found. For now, we'll just copy-paste some useful bits
-// here instead.
-
-// In-place sum of each component with mod 256.
-static WEBP_INLINE void AddPixelsEq(uint32_t* a, uint32_t b) {
-  const uint32_t alpha_and_green = (*a & 0xff00ff00u) + (b & 0xff00ff00u);
-  const uint32_t red_and_blue = (*a & 0x00ff00ffu) + (b & 0x00ff00ffu);
-  *a = (alpha_and_green & 0xff00ff00u) | (red_and_blue & 0x00ff00ffu);
-}
-
-static WEBP_INLINE uint32_t Clip255(uint32_t a) {
-  if (a < 256) {
-    return a;
-  }
-  // return 0, when a is a negative integer.
-  // return 255, when a is positive.
-  return ~a >> 24;
-}
-
-// Delta palettization functions.
-static WEBP_INLINE int Square(int x) {
-  return x * x;
-}
-
-static WEBP_INLINE uint32_t Intensity(uint32_t a) {
-  return
-      30 * ((a >> 16) & 0xff) +
-      59 * ((a >>  8) & 0xff) +
-      11 * ((a >>  0) & 0xff);
-}
-
-static uint32_t CalcDist(uint32_t predicted_value, uint32_t actual_value,
-                         uint32_t palette_entry) {
-  int i;
-  uint32_t distance = 0;
-  AddPixelsEq(&predicted_value, palette_entry);
-  for (i = 0; i < 32; i += 8) {
-    const int32_t av = (actual_value >> i) & 0xff;
-    const int32_t pv = (predicted_value >> i) & 0xff;
-    distance += Square(pv - av);
-  }
-  // We sum square of intensity difference with factor 10, but because Intensity
-  // returns 100 times real intensity we need to multiply differences of colors
-  // by 1000.
-  distance *= 1000u;
-  distance += Square(Intensity(predicted_value)
-                     - Intensity(actual_value));
-  return distance;
-}
-
-static uint32_t Predict(int x, int y, uint32_t* image) {
-  const uint32_t t = (y == 0) ? ARGB_BLACK : image[x];
-  const uint32_t l = (x == 0) ? ARGB_BLACK : image[x - 1];
-  const uint32_t p =
-      (((((t >> 24) & 0xff) + ((l >> 24) & 0xff)) / 2) << 24) +
-      (((((t >> 16) & 0xff) + ((l >> 16) & 0xff)) / 2) << 16) +
-      (((((t >>  8) & 0xff) + ((l >>  8) & 0xff)) / 2) <<  8) +
-      (((((t >>  0) & 0xff) + ((l >>  0) & 0xff)) / 2) <<  0);
-  if (x == 0 && y == 0) return ARGB_BLACK;
-  if (x == 0) return t;
-  if (y == 0) return l;
-  return p;
-}
-
-static WEBP_INLINE int AddSubtractComponentFullWithCoefficient(
-    int a, int b, int c) {
-  return Clip255(a + ((b - c) >> 2));
-}
-
-static WEBP_INLINE uint32_t ClampedAddSubtractFullWithCoefficient(
-    uint32_t c0, uint32_t c1, uint32_t c2) {
-  const int a = AddSubtractComponentFullWithCoefficient(
-      c0 >> 24, c1 >> 24, c2 >> 24);
-  const int r = AddSubtractComponentFullWithCoefficient((c0 >> 16) & 0xff,
-                                                       (c1 >> 16) & 0xff,
-                                                       (c2 >> 16) & 0xff);
-  const int g = AddSubtractComponentFullWithCoefficient((c0 >> 8) & 0xff,
-                                                       (c1 >> 8) & 0xff,
-                                                       (c2 >> 8) & 0xff);
-  const int b = AddSubtractComponentFullWithCoefficient(
-      c0 & 0xff, c1 & 0xff, c2 & 0xff);
-  return ((uint32_t)a << 24) | (r << 16) | (g << 8) | b;
-}
-
-//------------------------------------------------------------------------------
-
-// Find palette entry with minimum error from difference of actual pixel value
-// and predicted pixel value. Propagate error of pixel to its top and left pixel
-// in src array. Write predicted_value + palette_entry to new_image. Return
-// index of best palette entry.
-static int FindBestPaletteEntry(uint32_t src, uint32_t predicted_value,
-                                const uint32_t palette[], int palette_size) {
-  int i;
-  int idx = 0;
-  uint32_t best_distance = CalcDist(predicted_value, src, palette[0]);
-  for (i = 1; i < palette_size; ++i) {
-    const uint32_t distance = CalcDist(predicted_value, src, palette[i]);
-    if (distance < best_distance) {
-      best_distance = distance;
-      idx = i;
-    }
-  }
-  return idx;
-}
-
-static void ApplyBestPaletteEntry(int x, int y,
-                                  uint32_t new_value, uint32_t palette_value,
-                                  uint32_t* src, int src_stride,
-                                  uint32_t* new_image) {
-  AddPixelsEq(&new_value, palette_value);
-  if (x > 0) {
-    src[x - 1] = ClampedAddSubtractFullWithCoefficient(src[x - 1],
-                                                       new_value, src[x]);
-  }
-  if (y > 0) {
-    src[x - src_stride] =
-        ClampedAddSubtractFullWithCoefficient(src[x - src_stride],
-                                              new_value, src[x]);
-  }
-  new_image[x] = new_value;
-}
-
-//------------------------------------------------------------------------------
-// Main entry point
-
-static WebPEncodingError ApplyDeltaPalette(uint32_t* src, uint32_t* dst,
-                                           uint32_t src_stride,
-                                           uint32_t dst_stride,
-                                           const uint32_t* palette,
-                                           int palette_size,
-                                           int width, int height,
-                                           int num_passes) {
-  int x, y;
-  WebPEncodingError err = VP8_ENC_OK;
-  uint32_t* new_image = (uint32_t*)WebPSafeMalloc(width, sizeof(*new_image));
-  uint8_t* const tmp_row = (uint8_t*)WebPSafeMalloc(width, sizeof(*tmp_row));
-  if (new_image == NULL || tmp_row == NULL) {
-    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
-    goto Error;
-  }
-
-  while (num_passes--) {
-    uint32_t* cur_src = src;
-    uint32_t* cur_dst = dst;
-    for (y = 0; y < height; ++y) {
-      for (x = 0; x < width; ++x) {
-        const uint32_t predicted_value = Predict(x, y, new_image);
-        tmp_row[x] = FindBestPaletteEntry(cur_src[x], predicted_value,
-                                          palette, palette_size);
-        ApplyBestPaletteEntry(x, y, predicted_value, palette[tmp_row[x]],
-                              cur_src, src_stride, new_image);
-      }
-      for (x = 0; x < width; ++x) {
-        cur_dst[x] = palette[tmp_row[x]];
-      }
-      cur_src += src_stride;
-      cur_dst += dst_stride;
-    }
-  }
- Error:
-  WebPSafeFree(new_image);
-  WebPSafeFree(tmp_row);
-  return err;
-}
-
-// replaces enc->argb_ by a palettizable approximation of it,
-// and generates optimal enc->palette_[]
-WebPEncodingError WebPSearchOptimalDeltaPalette(VP8LEncoder* const enc) {
-  const WebPPicture* const pic = enc->pic_;
-  uint32_t* src = pic->argb;
-  uint32_t* dst = enc->argb_;
-  const int width = pic->width;
-  const int height = pic->height;
-
-  WebPEncodingError err = VP8_ENC_OK;
-  memcpy(enc->palette_, kDeltaPalette, sizeof(kDeltaPalette));
-  enc->palette_[DELTA_PALETTE_SIZE - 1] = src[0] - 0xff000000u;
-  enc->palette_size_ = DELTA_PALETTE_SIZE;
-  err = ApplyDeltaPalette(src, dst, pic->argb_stride, enc->current_width_,
-                          enc->palette_, enc->palette_size_,
-                          width, height, 2);
-  if (err != VP8_ENC_OK) goto Error;
-
- Error:
-  return err;
-}
-
-#else  // !WEBP_EXPERIMENTAL_FEATURES
-
-WebPEncodingError WebPSearchOptimalDeltaPalette(VP8LEncoder* const enc) {
-  (void)enc;
-  return VP8_ENC_ERROR_INVALID_CONFIGURATION;
-}
-
-#endif  // WEBP_EXPERIMENTAL_FEATURES
diff --git a/src/3rdparty/libwebp/src/enc/delta_palettization_enc.h b/src/3rdparty/libwebp/src/enc/delta_palettization_enc.h
deleted file mode 100644
index b15e2cd..0000000
--- a/src/3rdparty/libwebp/src/enc/delta_palettization_enc.h
+++ /dev/null
@@ -1,25 +0,0 @@
-// Copyright 2015 Google Inc. All Rights Reserved.
-//
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
-// -----------------------------------------------------------------------------
-//
-// Author: Mislav Bradac (mislavm@google.com)
-//
-
-#ifndef WEBP_ENC_DELTA_PALETTIZATION_ENC_H_
-#define WEBP_ENC_DELTA_PALETTIZATION_ENC_H_
-
-#include "src/webp/encode.h"
-#include "src/enc/vp8li_enc.h"
-
-// Replaces enc->argb_[] input by a palettizable approximation of it,
-// and generates optimal enc->palette_[].
-// This function can revert enc->use_palette_ / enc->use_predict_ flag
-// if delta-palettization is not producing expected saving.
-WebPEncodingError WebPSearchOptimalDeltaPalette(VP8LEncoder* const enc);
-
-#endif  // WEBP_ENC_DELTA_PALETTIZATION_ENC_H_
diff --git a/src/3rdparty/libwebp/src/enc/frame_enc.c b/src/3rdparty/libwebp/src/enc/frame_enc.c
index 2b0dc66..1aec376 100644
--- a/src/3rdparty/libwebp/src/enc/frame_enc.c
+++ b/src/3rdparty/libwebp/src/enc/frame_enc.c
@@ -198,7 +198,7 @@ static void SetSegmentProbas(VP8Encoder* const enc) {
 
   for (n = 0; n < enc->mb_w_ * enc->mb_h_; ++n) {
     const VP8MBInfo* const mb = &enc->mb_info_[n];
-    p[mb->segment_]++;
+    ++p[mb->segment_];
   }
 #if !defined(WEBP_DISABLE_STATS)
   if (enc->pic_->stats != NULL) {
@@ -520,6 +520,14 @@ static void StoreSideInfo(const VP8EncIterator* const it) {
 #endif
 }
 
+static void ResetSideInfo(const VP8EncIterator* const it) {  // clears stats gathered by a pass that is about to be redone
+  VP8Encoder* const enc = it->enc_;
+  WebPPicture* const pic = enc->pic_;
+  if (pic->stats != NULL) {  // block counts are only cleared when the caller asked for stats
+    memset(enc->block_count_, 0, sizeof(enc->block_count_));
+  }
+  ResetSSE(enc);
+}
 #else  // defined(WEBP_DISABLE_STATS)
 static void ResetSSE(VP8Encoder* const enc) {
   (void)enc;
@@ -528,10 +536,16 @@ static void StoreSideInfo(const VP8EncIterator* const it) {
   VP8Encoder* const enc = it->enc_;
   WebPPicture* const pic = enc->pic_;
   if (pic->extra_info != NULL) {
-    memset(pic->extra_info, 0,
-           enc->mb_w_ * enc->mb_h_ * sizeof(*pic->extra_info));
+    if (it->x_ == 0 && it->y_ == 0) {   // only do it once, at start
+      memset(pic->extra_info, 0,
+             enc->mb_w_ * enc->mb_h_ * sizeof(*pic->extra_info));
+    }
   }
 }
+
+static void ResetSideInfo(const VP8EncIterator* const it) {  // no-op variant for WEBP_DISABLE_STATS builds
+  (void)it;  // parameter intentionally unused
+}
 #endif  // !defined(WEBP_DISABLE_STATS)
 
 static double GetPSNR(uint64_t mse, uint64_t size) {
@@ -570,7 +584,7 @@ static uint64_t OneStatPass(VP8Encoder* const enc, VP8RDLevel rd_opt,
     VP8IteratorImport(&it, NULL);
     if (VP8Decimate(&it, &info, rd_opt)) {
       // Just record the number of skips and act like skip_proba is not used.
-      enc->proba_.nb_skip_++;
+      ++enc->proba_.nb_skip_;
     }
     RecordResiduals(&it, &info);
     size += info.R + info.H;
@@ -841,6 +855,9 @@ int VP8EncTokenLoop(VP8Encoder* const enc) {
     if (enc->max_i4_header_bits_ > 0 && size_p0 > PARTITION0_SIZE_LIMIT) {
       ++num_pass_left;
       enc->max_i4_header_bits_ >>= 1;  // strengthen header bit limitation...
+      if (is_last_pass) {
+        ResetSideInfo(&it);
+      }
       continue;                        // ...and start over
     }
     if (is_last_pass) {
@@ -871,4 +888,3 @@ int VP8EncTokenLoop(VP8Encoder* const enc) {
 #endif    // DISABLE_TOKEN_BUFFER
 
 //------------------------------------------------------------------------------
-
diff --git a/src/3rdparty/libwebp/src/enc/histogram_enc.c b/src/3rdparty/libwebp/src/enc/histogram_enc.c
index 056a972..9fdbc62 100644
--- a/src/3rdparty/libwebp/src/enc/histogram_enc.c
+++ b/src/3rdparty/libwebp/src/enc/histogram_enc.c
@@ -200,14 +200,9 @@ static WEBP_INLINE double BitsEntropyRefine(const VP8LBitEntropy* entropy) {
   }
 }
 
-double VP8LBitsEntropy(const uint32_t* const array, int n,
-                       uint32_t* const trivial_symbol) {
+double VP8LBitsEntropy(const uint32_t* const array, int n) {
   VP8LBitEntropy entropy;
   VP8LBitsEntropyUnrefined(array, n, &entropy);
-  if (trivial_symbol != NULL) {
-    *trivial_symbol =
-        (entropy.nonzeros == 1) ? entropy.nonzero_code : VP8L_NON_TRIVIAL_SYM;
-  }
 
   return BitsEntropyRefine(&entropy);
 }
@@ -605,7 +600,7 @@ static void HistogramCombineEntropyBin(VP8LHistogramSet* const image_histo,
 }
 
 // Implement a Lehmer random number generator with a multiplicative constant of
-// 48271 and a modulo constant of 2^31 − 1.
+// 48271 and a modulo constant of 2^31 - 1.
 static uint32_t MyRand(uint32_t* const seed) {
   *seed = (uint32_t)(((uint64_t)(*seed) * 48271u) % 2147483647u);
   assert(*seed > 0);
@@ -1031,7 +1026,7 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
     }
   }
 
-  // TODO(vikasa): Optimize HistogramRemap for low-effort compression mode also.
+  // TODO(vrabaud): Optimize HistogramRemap for low-effort compression mode.
   // Find the optimal map from original histograms to the final ones.
   HistogramRemap(orig_histo, image_histo, histogram_symbols);
 
diff --git a/src/3rdparty/libwebp/src/enc/histogram_enc.h b/src/3rdparty/libwebp/src/enc/histogram_enc.h
index 15b1fbd..e8c4c83 100644
--- a/src/3rdparty/libwebp/src/enc/histogram_enc.h
+++ b/src/3rdparty/libwebp/src/enc/histogram_enc.h
@@ -109,10 +109,7 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
                              uint16_t* const histogram_symbols);
 
 // Returns the entropy for the symbols in the input array.
-// Also sets trivial_symbol to the code value, if the array has only one code
-// value. Otherwise, set it to VP8L_NON_TRIVIAL_SYM.
-double VP8LBitsEntropy(const uint32_t* const array, int n,
-                       uint32_t* const trivial_symbol);
+double VP8LBitsEntropy(const uint32_t* const array, int n);
 
 // Estimate how many bits the combined entropy of literals and distance
 // approximately maps to.
diff --git a/src/3rdparty/libwebp/src/enc/iterator_enc.c b/src/3rdparty/libwebp/src/enc/iterator_enc.c
index cfacfd2..7c47d51 100644
--- a/src/3rdparty/libwebp/src/enc/iterator_enc.c
+++ b/src/3rdparty/libwebp/src/enc/iterator_enc.c
@@ -26,6 +26,9 @@ static void InitLeft(VP8EncIterator* const it) {
   memset(it->u_left_, 129, 8);
   memset(it->v_left_, 129, 8);
   it->left_nz_[8] = 0;
+  if (it->top_derr_ != NULL) {
+    memset(&it->left_derr_, 0, sizeof(it->left_derr_));
+  }
 }
 
 static void InitTop(VP8EncIterator* const it) {
@@ -33,6 +36,9 @@ static void InitTop(VP8EncIterator* const it) {
   const size_t top_size = enc->mb_w_ * 16;
   memset(enc->y_top_, 127, 2 * top_size);
   memset(enc->nz_, 0, enc->mb_w_ * sizeof(*enc->nz_));
+  if (enc->top_derr_ != NULL) {
+    memset(enc->top_derr_, 0, enc->mb_w_ * sizeof(*enc->top_derr_));
+  }
 }
 
 void VP8IteratorSetRow(VP8EncIterator* const it, int y) {
@@ -76,6 +82,7 @@ void VP8IteratorInit(VP8Encoder* const enc, VP8EncIterator* const it) {
   it->y_left_ = (uint8_t*)WEBP_ALIGN(it->yuv_left_mem_ + 1);
   it->u_left_ = it->y_left_ + 16 + 16;
   it->v_left_ = it->u_left_ + 16;
+  it->top_derr_ = enc->top_derr_;
   VP8IteratorReset(it);
 }
 
@@ -450,4 +457,3 @@ int VP8IteratorRotateI4(VP8EncIterator* const it,
 }
 
 //------------------------------------------------------------------------------
-
diff --git a/src/3rdparty/libwebp/src/enc/near_lossless_enc.c b/src/3rdparty/libwebp/src/enc/near_lossless_enc.c
index cadd14c..5517a7e 100644
--- a/src/3rdparty/libwebp/src/enc/near_lossless_enc.c
+++ b/src/3rdparty/libwebp/src/enc/near_lossless_enc.c
@@ -146,6 +146,6 @@ int VP8ApplyNearLossless(const WebPPicture* const picture, int quality,
 
 // Define a stub to suppress compiler warnings.
 extern void VP8LNearLosslessStub(void);
-WEBP_TSAN_IGNORE_FUNCTION void VP8LNearLosslessStub(void) {}
+void VP8LNearLosslessStub(void) {}
 
 #endif  // (WEBP_NEAR_LOSSLESS == 1)
diff --git a/src/3rdparty/libwebp/src/enc/picture_csp_enc.c b/src/3rdparty/libwebp/src/enc/picture_csp_enc.c
index d531dd0..02d9df7 100644
--- a/src/3rdparty/libwebp/src/enc/picture_csp_enc.c
+++ b/src/3rdparty/libwebp/src/enc/picture_csp_enc.c
@@ -28,11 +28,11 @@
 // If defined, use table to compute x / alpha.
 #define USE_INVERSE_ALPHA_TABLE
 
-static const union {
-  uint32_t argb;
-  uint8_t  bytes[4];
-} test_endian = { 0xff000000u };
-#define ALPHA_IS_LAST (test_endian.bytes[3] == 0xff)
+#ifdef WORDS_BIGENDIAN
+#define ALPHA_OFFSET 0   // uint32_t 0xff000000 is 0xff,00,00,00 in memory
+#else
+#define ALPHA_OFFSET 3   // uint32_t 0xff000000 is 0x00,00,00,ff in memory
+#endif
 
 //------------------------------------------------------------------------------
 // Detection of non-trivial transparency
@@ -61,7 +61,7 @@ int WebPPictureHasTransparency(const WebPPicture* picture) {
     return CheckNonOpaque(picture->a, picture->width, picture->height,
                           1, picture->a_stride);
   } else {
-    const int alpha_offset = ALPHA_IS_LAST ? 3 : 0;
+    const int alpha_offset = ALPHA_OFFSET;
     return CheckNonOpaque((const uint8_t*)picture->argb + alpha_offset,
                           picture->width, picture->height,
                           4, picture->argb_stride * sizeof(*picture->argb));
@@ -126,7 +126,7 @@ static WEBP_INLINE int LinearToGamma(uint32_t base_value, int shift) {
 
 #else
 
-static WEBP_TSAN_IGNORE_FUNCTION void InitGammaTables(void) {}
+static void InitGammaTables(void) {}
 static WEBP_INLINE uint32_t GammaToLinear(uint8_t v) { return v; }
 static WEBP_INLINE int LinearToGamma(uint32_t base_value, int shift) {
   return (int)(base_value << shift);
@@ -170,29 +170,33 @@ typedef uint16_t fixed_y_t;   // unsigned type with extra SFIX precision for W
 
 #if defined(USE_GAMMA_COMPRESSION)
 
-// float variant of gamma-correction
 // We use tables of different size and precision for the Rec709 / BT2020
 // transfer function.
 #define kGammaF (1./0.45)
-static float kGammaToLinearTabF[MAX_Y_T + 1];   // size scales with Y_FIX
-static float kLinearToGammaTabF[kGammaTabSize + 2];
-static volatile int kGammaTablesFOk = 0;
-
-static WEBP_TSAN_IGNORE_FUNCTION void InitGammaTablesF(void) {
-  if (!kGammaTablesFOk) {
+static uint32_t kLinearToGammaTabS[kGammaTabSize + 2];
+#define GAMMA_TO_LINEAR_BITS 14
+static uint32_t kGammaToLinearTabS[MAX_Y_T + 1];   // size scales with Y_FIX
+static volatile int kGammaTablesSOk = 0;
+
+static WEBP_TSAN_IGNORE_FUNCTION void InitGammaTablesS(void) {
+  assert(2 * GAMMA_TO_LINEAR_BITS < 32);  // we use uint32_t intermediate values
+  if (!kGammaTablesSOk) {
     int v;
     const double norm = 1. / MAX_Y_T;
     const double scale = 1. / kGammaTabSize;
     const double a = 0.09929682680944;
     const double thresh = 0.018053968510807;
+    const double final_scale = 1 << GAMMA_TO_LINEAR_BITS;
     for (v = 0; v <= MAX_Y_T; ++v) {
       const double g = norm * v;
+      double value;
       if (g <= thresh * 4.5) {
-        kGammaToLinearTabF[v] = (float)(g / 4.5);
+        value = g / 4.5;
       } else {
         const double a_rec = 1. / (1. + a);
-        kGammaToLinearTabF[v] = (float)pow(a_rec * (g + a), kGammaF);
+        value = pow(a_rec * (g + a), kGammaF);
       }
+      kGammaToLinearTabS[v] = (uint32_t)(value * final_scale + .5);
     }
     for (v = 0; v <= kGammaTabSize; ++v) {
       const double g = scale * v;
@@ -202,37 +206,44 @@ static WEBP_TSAN_IGNORE_FUNCTION void InitGammaTablesF(void) {
       } else {
         value = (1. + a) * pow(g, 1. / kGammaF) - a;
       }
-      kLinearToGammaTabF[v] = (float)(MAX_Y_T * value);
+      // we already incorporate the 1/2 rounding constant here
+      kLinearToGammaTabS[v] =
+          (uint32_t)(MAX_Y_T * value) + (1 << GAMMA_TO_LINEAR_BITS >> 1);
     }
     // to prevent small rounding errors to cause read-overflow:
-    kLinearToGammaTabF[kGammaTabSize + 1] = kLinearToGammaTabF[kGammaTabSize];
-    kGammaTablesFOk = 1;
+    kLinearToGammaTabS[kGammaTabSize + 1] = kLinearToGammaTabS[kGammaTabSize];
+    kGammaTablesSOk = 1;
   }
 }
 
-static WEBP_INLINE float GammaToLinearF(int v) {
-  return kGammaToLinearTabF[v];
+// Returns the table entry for 'v', in GAMMA_TO_LINEAR_BITS fixed-point precision.
+static WEBP_INLINE uint32_t GammaToLinearS(int v) {
+  return kGammaToLinearTabS[v];   // table has MAX_Y_T + 1 entries, no range check
 }
 
-static WEBP_INLINE int LinearToGammaF(float value) {
-  const float v = value * kGammaTabSize;
-  const int tab_pos = (int)v;
-  const float x = v - (float)tab_pos;      // fractional part
-  const float v0 = kLinearToGammaTabF[tab_pos + 0];
-  const float v1 = kLinearToGammaTabF[tab_pos + 1];
-  const float y = v1 * x + v0 * (1.f - x);  // interpolate
-  return (int)(y + .5);
+static WEBP_INLINE uint32_t LinearToGammaS(uint32_t value) {   // interpolated lookup
+  // 'value' is in GAMMA_TO_LINEAR_BITS fractional precision
+  const uint32_t v = value * kGammaTabSize;
+  const uint32_t tab_pos = v >> GAMMA_TO_LINEAR_BITS;   // integer table index
+  // fractional part, in GAMMA_TO_LINEAR_BITS fixed-point precision
+  const uint32_t x = v - (tab_pos << GAMMA_TO_LINEAR_BITS);  // fractional part
+  // v0 / v1: adjacent table entries. NOTE(review): init stores MAX_Y_T * value + rounding, confirm scale
+  const uint32_t v0 = kLinearToGammaTabS[tab_pos + 0];
+  const uint32_t v1 = kLinearToGammaTabS[tab_pos + 1];
+  // Final interpolation. Note that rounding is already included.
+  const uint32_t v2 = (v1 - v0) * x;    // note: v1 >= v0.
+  const uint32_t result = v0 + (v2 >> GAMMA_TO_LINEAR_BITS);
+  return result;
 }
 
 #else
 
-static WEBP_TSAN_IGNORE_FUNCTION void InitGammaTablesF(void) {}
-static WEBP_INLINE float GammaToLinearF(int v) {
-  const float norm = 1.f / MAX_Y_T;
-  return norm * v;
+static void InitGammaTablesS(void) {}   // nothing to initialize in this variant
+static WEBP_INLINE uint32_t GammaToLinearS(int v) {   // plain rescale, no table
+  return (v << GAMMA_TO_LINEAR_BITS) / MAX_Y_T;   // NOTE(review): GAMMA_TO_LINEAR_BITS is only seen #defined in the USE_GAMMA_COMPRESSION branch -- confirm this builds
 }
-static WEBP_INLINE int LinearToGammaF(float value) {
-  return (int)(MAX_Y_T * value + .5);
+static WEBP_INLINE uint32_t LinearToGammaS(uint32_t value) {   // plain rescale, no table
+  return (MAX_Y_T * value) >> GAMMA_TO_LINEAR_BITS;   // drops the fixed-point fractional bits
 }
 
 #endif    // USE_GAMMA_COMPRESSION
@@ -254,26 +265,22 @@ static int RGBToGray(int r, int g, int b) {
   return (luma >> YUV_FIX);
 }
 
-static float RGBToGrayF(float r, float g, float b) {
-  return (float)(0.2126 * r + 0.7152 * g + 0.0722 * b);
-}
-
-static int ScaleDown(int a, int b, int c, int d) {
-  const float A = GammaToLinearF(a);
-  const float B = GammaToLinearF(b);
-  const float C = GammaToLinearF(c);
-  const float D = GammaToLinearF(d);
-  return LinearToGammaF(0.25f * (A + B + C + D));
+static uint32_t ScaleDown(int a, int b, int c, int d) {   // mean of 4 samples
+  const uint32_t A = GammaToLinearS(a);
+  const uint32_t B = GammaToLinearS(b);
+  const uint32_t C = GammaToLinearS(c);
+  const uint32_t D = GammaToLinearS(d);
+  return LinearToGammaS((A + B + C + D + 2) >> 2);   // rounded average, converted back
 }
 
 static WEBP_INLINE void UpdateW(const fixed_y_t* src, fixed_y_t* dst, int w) {
   int i;
   for (i = 0; i < w; ++i) {
-    const float R = GammaToLinearF(src[0 * w + i]);
-    const float G = GammaToLinearF(src[1 * w + i]);
-    const float B = GammaToLinearF(src[2 * w + i]);
-    const float Y = RGBToGrayF(R, G, B);
-    dst[i] = (fixed_y_t)LinearToGammaF(Y);
+    const uint32_t R = GammaToLinearS(src[0 * w + i]);
+    const uint32_t G = GammaToLinearS(src[1 * w + i]);
+    const uint32_t B = GammaToLinearS(src[2 * w + i]);
+    const uint32_t Y = RGBToGray(R, G, B);
+    dst[i] = (fixed_y_t)LinearToGammaS(Y);
   }
 }
 
@@ -863,7 +870,7 @@ static int ImportYUVAFromRGBA(const uint8_t* r_ptr,
   }
 
   if (use_iterative_conversion) {
-    InitGammaTablesF();
+    InitGammaTablesS();
     if (!PreprocessARGB(r_ptr, g_ptr, b_ptr, step, rgb_stride, picture)) {
       return 0;
     }
@@ -990,10 +997,10 @@ static int PictureARGBToYUVA(WebPPicture* picture, WebPEncCSP colorspace,
     return WebPEncodingSetError(picture, VP8_ENC_ERROR_INVALID_CONFIGURATION);
   } else {
     const uint8_t* const argb = (const uint8_t*)picture->argb;
-    const uint8_t* const r = ALPHA_IS_LAST ? argb + 2 : argb + 1;
-    const uint8_t* const g = ALPHA_IS_LAST ? argb + 1 : argb + 2;
-    const uint8_t* const b = ALPHA_IS_LAST ? argb + 0 : argb + 3;
-    const uint8_t* const a = ALPHA_IS_LAST ? argb + 3 : argb + 0;
+    const uint8_t* const a = argb + (0 ^ ALPHA_OFFSET);
+    const uint8_t* const r = argb + (1 ^ ALPHA_OFFSET);
+    const uint8_t* const g = argb + (2 ^ ALPHA_OFFSET);
+    const uint8_t* const b = argb + (3 ^ ALPHA_OFFSET);
 
     picture->colorspace = WEBP_YUV420;
     return ImportYUVAFromRGBA(r, g, b, a, 4, 4 * picture->argb_stride,
@@ -1044,7 +1051,8 @@ int WebPPictureYUVAToARGB(WebPPicture* picture) {
     const int argb_stride = 4 * picture->argb_stride;
     uint8_t* dst = (uint8_t*)picture->argb;
     const uint8_t *cur_u = picture->u, *cur_v = picture->v, *cur_y = picture->y;
-    WebPUpsampleLinePairFunc upsample = WebPGetLinePairConverter(ALPHA_IS_LAST);
+    WebPUpsampleLinePairFunc upsample =
+        WebPGetLinePairConverter(ALPHA_OFFSET > 0);
 
     // First row, with replicated top samples.
     upsample(cur_y, NULL, cur_u, cur_v, cur_u, cur_v, dst, NULL, width);
@@ -1087,6 +1095,7 @@ static int Import(WebPPicture* const picture,
                   const uint8_t* rgb, int rgb_stride,
                   int step, int swap_rb, int import_alpha) {
   int y;
+  // swap_rb -> b,g,r,a , !swap_rb -> r,g,b,a
   const uint8_t* r_ptr = rgb + (swap_rb ? 2 : 0);
   const uint8_t* g_ptr = rgb + 1;
   const uint8_t* b_ptr = rgb + (swap_rb ? 0 : 2);
@@ -1104,19 +1113,32 @@ static int Import(WebPPicture* const picture,
   WebPInitAlphaProcessing();
 
   if (import_alpha) {
+    // dst[] byte order is {a,r,g,b} for big-endian, {b,g,r,a} for little endian
     uint32_t* dst = picture->argb;
-    const int do_copy =
-        (!swap_rb && !ALPHA_IS_LAST) || (swap_rb && ALPHA_IS_LAST);
+    const int do_copy = (ALPHA_OFFSET == 3) && swap_rb;
     assert(step == 4);
-    for (y = 0; y < height; ++y) {
-      if (do_copy) {
+    if (do_copy) {
+      for (y = 0; y < height; ++y) {
         memcpy(dst, rgb, width * 4);
-      } else {
+        rgb += rgb_stride;
+        dst += picture->argb_stride;
+      }
+    } else {
+      for (y = 0; y < height; ++y) {
+#ifdef WORDS_BIGENDIAN
+        // BGRA or RGBA input order.
+        const uint8_t* a_ptr = rgb + 3;
+        WebPPackARGB(a_ptr, r_ptr, g_ptr, b_ptr, width, dst);
+        r_ptr += rgb_stride;
+        g_ptr += rgb_stride;
+        b_ptr += rgb_stride;
+#else
         // RGBA input order. Need to swap R and B.
         VP8LConvertBGRAToRGBA((const uint32_t*)rgb, width, (uint8_t*)dst);
+#endif
+        rgb += rgb_stride;
+        dst += picture->argb_stride;
       }
-      rgb += rgb_stride;
-      dst += picture->argb_stride;
     }
   } else {
     uint32_t* dst = picture->argb;
diff --git a/src/3rdparty/libwebp/src/enc/picture_psnr_enc.c b/src/3rdparty/libwebp/src/enc/picture_psnr_enc.c
index 362a7c7..1a2f0be 100644
--- a/src/3rdparty/libwebp/src/enc/picture_psnr_enc.c
+++ b/src/3rdparty/libwebp/src/enc/picture_psnr_enc.c
@@ -18,6 +18,7 @@
 #include <math.h>
 #include <stdlib.h>
 
+#include "src/dsp/dsp.h"
 #include "src/enc/vp8i_enc.h"
 #include "src/utils/utils.h"
 
@@ -169,6 +170,12 @@ int WebPPlaneDistortion(const uint8_t* src, size_t src_stride,
   return 1;
 }
 
+#ifdef WORDS_BIGENDIAN
+#define BLUE_OFFSET 3   // uint32_t 0x000000ff is 0x00,00,00,ff in memory
+#else
+#define BLUE_OFFSET 0   // uint32_t 0x000000ff is 0xff,00,00,00 in memory
+#endif
+
 int WebPPictureDistortion(const WebPPicture* src, const WebPPicture* ref,
                           int type, float results[5]) {
   int w, h, c;
@@ -195,8 +202,10 @@ int WebPPictureDistortion(const WebPPicture* src, const WebPPicture* ref,
     float distortion;
     const size_t stride0 = 4 * (size_t)p0.argb_stride;
     const size_t stride1 = 4 * (size_t)p1.argb_stride;
-    if (!WebPPlaneDistortion((const uint8_t*)p0.argb + c, stride0,
-                             (const uint8_t*)p1.argb + c, stride1,
+    // results are reported as BGRA
+    const int offset = c ^ BLUE_OFFSET;
+    if (!WebPPlaneDistortion((const uint8_t*)p0.argb + offset, stride0,
+                             (const uint8_t*)p1.argb + offset, stride1,
                              w, h, 4, type, &distortion, results + c)) {
       goto Error;
     }
@@ -214,6 +223,8 @@ int WebPPictureDistortion(const WebPPicture* src, const WebPPicture* ref,
   return ok;
 }
 
+#undef BLUE_OFFSET
+
 #else  // defined(WEBP_DISABLE_STATS)
 int WebPPlaneDistortion(const uint8_t* src, size_t src_stride,
                         const uint8_t* ref, size_t ref_stride,
diff --git a/src/3rdparty/libwebp/src/enc/quant_enc.c b/src/3rdparty/libwebp/src/enc/quant_enc.c
index 3b1a312..35bfaf2 100644
--- a/src/3rdparty/libwebp/src/enc/quant_enc.c
+++ b/src/3rdparty/libwebp/src/enc/quant_enc.c
@@ -826,6 +826,85 @@ static int ReconstructIntra4(VP8EncIterator* const it,
   return nz;
 }
 
+//------------------------------------------------------------------------------
+// DC-error diffusion
+
+// Diffusion weights. We under-correct a bit (15/16th of the error is actually
+// diffused) to avoid 'rainbow' chessboard pattern of blocks at q~=0.
+#define C1 7    // fraction of error sent to the 4x4 block below
+#define C2 8    // fraction of error sent to the 4x4 block on the right
+#define DSHIFT 4
+#define DSCALE 1   // storage descaling, needed to make the error fit int8_t
+
+// Quantizes the single coefficient '*v' in place, using entry #0 of 'mtx', and
+// returns the signed quantization error right-shifted by DSCALE (not DSHIFT).
+static int QuantizeSingle(int16_t* const v, const VP8Matrix* const mtx) {
+  int V = *v;
+  const int sign = (V < 0);
+  if (sign) V = -V;   // work on the magnitude, sign is re-applied below
+  if (V > (int)mtx->zthresh_[0]) {
+    const int qV = QUANTDIV(V, mtx->iq_[0], mtx->bias_[0]) * mtx->q_[0];
+    const int err = (V - qV);
+    *v = sign ? -qV : qV;
+    return (sign ? -err : err) >> DSCALE;
+  }
+  *v = 0;   // at or below zthresh_[0]: output is zero, the whole value is the error
+  return (sign ? -V : V) >> DSCALE;
+}
+
+static void CorrectDCValues(const VP8EncIterator* const it,
+                            const VP8Matrix* const mtx,
+                            int16_t tmp[][16], VP8ModeScore* const rd) {
+  //         | top[0] | top[1]
+  // --------+--------+---------
+  // left[0] | tmp[0]   tmp[1]  <->   err0 err1
+  // left[1] | tmp[2]   tmp[3]        err2 err3
+  // Coefficient #0 of each block gets a weighted sum of the errors above and
+  // to its left added, then is quantized. Final errors {err1,err2,err3} are
+  // kept in rd->derr[] and later stored as top[]/left[] for the next block.
+  int ch;
+  for (ch = 0; ch <= 1; ++ch) {   // two channels, four tmp[] blocks each
+    const int8_t* const top = it->top_derr_[it->x_][ch];
+    const int8_t* const left = it->left_derr_[ch];
+    int16_t (* const c)[16] = &tmp[ch * 4];
+    int err0, err1, err2, err3;
+    c[0][0] += (C1 * top[0] + C2 * left[0]) >> (DSHIFT - DSCALE);
+    err0 = QuantizeSingle(&c[0][0], mtx);
+    c[1][0] += (C1 * top[1] + C2 * err0) >> (DSHIFT - DSCALE);
+    err1 = QuantizeSingle(&c[1][0], mtx);
+    c[2][0] += (C1 * err0 + C2 * left[1]) >> (DSHIFT - DSCALE);
+    err2 = QuantizeSingle(&c[2][0], mtx);
+    c[3][0] += (C1 * err1 + C2 * err2) >> (DSHIFT - DSCALE);
+    err3 = QuantizeSingle(&c[3][0], mtx);
+    // error 'err' is bounded by mtx->q_[0] which is 132 at max. Hence
+    // err >> DSCALE will fit in an int8_t type if DSCALE>=1.
+    assert(abs(err1) <= 127 && abs(err2) <= 127 && abs(err3) <= 127);
+    rd->derr[ch][0] = (int8_t)err1;
+    rd->derr[ch][1] = (int8_t)err2;
+    rd->derr[ch][2] = (int8_t)err3;
+  }
+}
+
+static void StoreDiffusionErrors(VP8EncIterator* const it,
+                                 const VP8ModeScore* const rd) {
+  int ch;   // copies rd->derr[] into the iterator's left/top error buffers
+  for (ch = 0; ch <= 1; ++ch) {
+    int8_t* const top = it->top_derr_[it->x_][ch];
+    int8_t* const left = it->left_derr_[ch];
+    left[0] = rd->derr[ch][0];            // store err1
+    left[1] = 3 * rd->derr[ch][2] >> 2;   //     ... 3/4th of err3
+    top[0]  = rd->derr[ch][1];            //     ... err2
+    top[1]  = rd->derr[ch][2] - left[1];  //     ... remainder of err3 (about 1/4th)
+  }
+}
+
+#undef C1
+#undef C2
+#undef DSHIFT
+#undef DSCALE
+
+//------------------------------------------------------------------------------
+
 static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
                          uint8_t* const yuv_out, int mode) {
   const VP8Encoder* const enc = it->enc_;
@@ -839,6 +918,8 @@ static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
   for (n = 0; n < 8; n += 2) {
     VP8FTransform2(src + VP8ScanUV[n], ref + VP8ScanUV[n], tmp[n]);
   }
+  if (it->top_derr_ != NULL) CorrectDCValues(it, &dqm->uv_, tmp, rd);
+
   if (DO_TRELLIS_UV && it->do_trellis_) {
     int ch, x, y;
     for (ch = 0, n = 0; ch <= 2; ch += 2) {
@@ -1101,6 +1182,9 @@ static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
       CopyScore(&rd_best, &rd_uv);
       rd->mode_uv = mode;
       memcpy(rd->uv_levels, rd_uv.uv_levels, sizeof(rd->uv_levels));
+      if (it->top_derr_ != NULL) {
+        memcpy(rd->derr, rd_uv.derr, sizeof(rd_uv.derr));
+      }
       SwapPtr(&dst, &tmp_dst);
     }
   }
@@ -1109,6 +1193,9 @@ static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
   if (dst != dst0) {   // copy 16x8 block if needed
     VP8Copy16x8(dst, dst0);
   }
+  if (it->top_derr_ != NULL) {  // store diffusion errors for next block
+    StoreDiffusionErrors(it, rd);
+  }
 }
 
 //------------------------------------------------------------------------------
diff --git a/src/3rdparty/libwebp/src/enc/vp8i_enc.h b/src/3rdparty/libwebp/src/enc/vp8i_enc.h
index 3463491..624e8f8 100644
--- a/src/3rdparty/libwebp/src/enc/vp8i_enc.h
+++ b/src/3rdparty/libwebp/src/enc/vp8i_enc.h
@@ -30,9 +30,9 @@ extern "C" {
 // Various defines and enums
 
 // version numbers
-#define ENC_MAJ_VERSION 0
-#define ENC_MIN_VERSION 6
-#define ENC_REV_VERSION 1
+#define ENC_MAJ_VERSION 1
+#define ENC_MIN_VERSION 0
+#define ENC_REV_VERSION 0
 
 enum { MAX_LF_LEVELS = 64,       // Maximum loop filter level
        MAX_VARIABLE_LEVEL = 67,  // last (inclusive) level with variable cost
@@ -120,6 +120,9 @@ static WEBP_INLINE int QUANTDIV(uint32_t n, uint32_t iQ, uint32_t B) {
 // Uncomment the following to remove token-buffer code:
 // #define DISABLE_TOKEN_BUFFER
 
+// quality below which error-diffusion is enabled
+#define ERROR_DIFFUSION_QUALITY 98
+
 //------------------------------------------------------------------------------
 // Headers
 
@@ -201,6 +204,8 @@ typedef struct {
   score_t i4_penalty_;   // penalty for using Intra4
 } VP8SegmentInfo;
 
+typedef int8_t DError[2 /* u/v */][2 /* top or left */];
+
 // Handy transient struct to accumulate score and info during RD-optimization
 // and mode evaluation.
 typedef struct {
@@ -213,6 +218,7 @@ typedef struct {
   uint8_t modes_i4[16];       // mode numbers for intra4 predictions
   int mode_uv;                // mode number of chroma prediction
   uint32_t nz;                // non-zero blocks
+  int8_t derr[2][3];          // DC diffusion errors for U/V for blocks #1/2/3
 } VP8ModeScore;
 
 // Iterator structure to iterate through macroblocks, pointing to the
@@ -242,6 +248,9 @@ typedef struct {
   int           count_down0_;      // starting counter value (for progress)
   int           percent0_;         // saved initial progress percent
 
+  DError        left_derr_;        // left error diffusion (u/v)
+  DError       *top_derr_;         // top diffusion error - NULL if disabled
+
   uint8_t* y_left_;    // left luma samples (addressable from index -1 to 15).
   uint8_t* u_left_;    // left u samples (addressable from index -1 to 7)
   uint8_t* v_left_;    // left v samples (addressable from index -1 to 7)
@@ -401,6 +410,7 @@ struct VP8Encoder {
   uint8_t*   uv_top_;    // top u/v samples.
                          // U and V are packed into 16 bytes (8 U + 8 V)
   LFStats*   lf_stats_;  // autofilter stats (if NULL, autofilter is off)
+  DError*    top_derr_;  // diffusion error (NULL if disabled)
 };
 
 //------------------------------------------------------------------------------
diff --git a/src/3rdparty/libwebp/src/enc/vp8l_enc.c b/src/3rdparty/libwebp/src/enc/vp8l_enc.c
index 312e521..a89184e 100644
--- a/src/3rdparty/libwebp/src/enc/vp8l_enc.c
+++ b/src/3rdparty/libwebp/src/enc/vp8l_enc.c
@@ -26,8 +26,6 @@
 #include "src/utils/utils.h"
 #include "src/webp/format_constants.h"
 
-#include "src/enc/delta_palettization_enc.h"
-
 // Maximum number of histogram images (sub-blocks).
 #define MAX_HUFF_IMAGE_SIZE       2600
 
@@ -259,7 +257,7 @@ static int AnalyzeEntropy(const uint32_t* argb,
       ++histo[kHistoAlphaPred * 256];
 
       for (j = 0; j < kHistoTotal; ++j) {
-        entropy_comp[j] = VP8LBitsEntropy(&histo[j * 256], 256, NULL);
+        entropy_comp[j] = VP8LBitsEntropy(&histo[j * 256], 256);
       }
       entropy[kDirect] = entropy_comp[kHistoAlpha] +
           entropy_comp[kHistoRed] +
@@ -384,8 +382,7 @@ static int EncoderAnalyze(VP8LEncoder* const enc,
       AnalyzeAndCreatePalette(pic, low_effort,
                               enc->palette_, &enc->palette_size_);
 
-  // TODO(jyrki): replace the decision to be based on an actual estimate
-  // of entropy, or even spatial variance of entropy.
+  // Empirical bit sizes.
   enc->histo_bits_ = GetHistoBits(method, use_palette,
                                   pic->width, pic->height);
   enc->transform_bits_ = GetTransformBits(method, enc->histo_bits_);
@@ -756,7 +753,6 @@ static WebPEncodingError StoreImageToBitMask(
       // Don't write the distance with the extra bits code since
       // the distance can be up to 18 bits of extra bits, and the prefix
       // 15 bits, totaling to 33, and our PutBits only supports up to 32 bits.
-      // TODO(jyrki): optimize this further.
       VP8LPrefixEncode(distance, &code, &n_bits, &bits);
       WriteHuffmanCode(bw, codes + 4, code);
       VP8LPutBits(bw, bits, n_bits);
@@ -1464,49 +1460,6 @@ static WebPEncodingError EncodePalette(VP8LBitWriter* const bw, int low_effort,
                               20 /* quality */, low_effort);
 }
 
-#ifdef WEBP_EXPERIMENTAL_FEATURES
-
-static WebPEncodingError EncodeDeltaPalettePredictorImage(
-    VP8LBitWriter* const bw, VP8LEncoder* const enc, int quality,
-    int low_effort) {
-  const WebPPicture* const pic = enc->pic_;
-  const int width = pic->width;
-  const int height = pic->height;
-
-  const int pred_bits = 5;
-  const int transform_width = VP8LSubSampleSize(width, pred_bits);
-  const int transform_height = VP8LSubSampleSize(height, pred_bits);
-  const int pred = 7;   // default is Predictor7 (Top/Left Average)
-  const int tiles_per_row = VP8LSubSampleSize(width, pred_bits);
-  const int tiles_per_col = VP8LSubSampleSize(height, pred_bits);
-  uint32_t* predictors;
-  int tile_x, tile_y;
-  WebPEncodingError err = VP8_ENC_OK;
-
-  predictors = (uint32_t*)WebPSafeMalloc(tiles_per_col * tiles_per_row,
-                                         sizeof(*predictors));
-  if (predictors == NULL) return VP8_ENC_ERROR_OUT_OF_MEMORY;
-
-  for (tile_y = 0; tile_y < tiles_per_col; ++tile_y) {
-    for (tile_x = 0; tile_x < tiles_per_row; ++tile_x) {
-      predictors[tile_y * tiles_per_row + tile_x] = 0xff000000u | (pred << 8);
-    }
-  }
-
-  VP8LPutBits(bw, TRANSFORM_PRESENT, 1);
-  VP8LPutBits(bw, PREDICTOR_TRANSFORM, 2);
-  VP8LPutBits(bw, pred_bits - 2, 3);
-  err = EncodeImageNoHuffman(
-      bw, predictors, &enc->hash_chain_,
-      (VP8LBackwardRefs*)&enc->refs_[0],  // cast const away
-      (VP8LBackwardRefs*)&enc->refs_[1],
-      transform_width, transform_height, quality, low_effort);
-  WebPSafeFree(predictors);
-  return err;
-}
-
-#endif // WEBP_EXPERIMENTAL_FEATURES
-
 // -----------------------------------------------------------------------------
 // VP8LEncoder
 
@@ -1568,7 +1521,7 @@ static int EncodeStreamHook(void* input, void* data2) {
   WebPEncodingError err = VP8_ENC_OK;
   const int quality = (int)config->quality;
   const int low_effort = (config->method == 0);
-#if (WEBP_NEAR_LOSSLESS == 1) || defined(WEBP_EXPERIMENTAL_FEATURES)
+#if (WEBP_NEAR_LOSSLESS == 1)
   const int width = picture->width;
 #endif
   const int height = picture->height;
@@ -1627,29 +1580,6 @@ static int EncodeStreamHook(void* input, void* data2) {
     enc->argb_content_ = kEncoderNone;
 #endif
 
-#ifdef WEBP_EXPERIMENTAL_FEATURES
-    if (config->use_delta_palette) {
-      enc->use_predict_ = 1;
-      enc->use_cross_color_ = 0;
-      enc->use_subtract_green_ = 0;
-      enc->use_palette_ = 1;
-      if (enc->argb_content_ != kEncoderNearLossless &&
-          enc->argb_content_ != kEncoderPalette) {
-        err = MakeInputImageCopy(enc);
-        if (err != VP8_ENC_OK) goto Error;
-      }
-      err = WebPSearchOptimalDeltaPalette(enc);
-      if (err != VP8_ENC_OK) goto Error;
-      if (enc->use_palette_) {
-        err = AllocateTransformBuffer(enc, width, height);
-        if (err != VP8_ENC_OK) goto Error;
-        err = EncodeDeltaPalettePredictorImage(bw, enc, quality, low_effort);
-        if (err != VP8_ENC_OK) goto Error;
-        use_delta_palette = 1;
-      }
-    }
-#endif  // WEBP_EXPERIMENTAL_FEATURES
-
     // Encode palette
     if (enc->use_palette_) {
       err = EncodePalette(bw, low_effort, enc);
@@ -1822,7 +1752,7 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
       worker_interface->Init(worker);
       worker->data1 = param;
       worker->data2 = NULL;
-      worker->hook = (WebPWorkerHook)EncodeStreamHook;
+      worker->hook = EncodeStreamHook;
     }
   }
 
@@ -1944,7 +1874,6 @@ int VP8LEncodeImage(const WebPConfig* const config,
   err = VP8LEncodeStream(config, picture, &bw, 1 /*use_cache*/);
   if (err != VP8_ENC_OK) goto Error;
 
-  // TODO(skal): have a fine-grained progress report in VP8LEncodeStream().
   if (!WebPReportProgress(picture, 90, &percent)) goto UserAbort;
 
   // Finish the RIFF chunk.
diff --git a/src/3rdparty/libwebp/src/enc/webp_enc.c b/src/3rdparty/libwebp/src/enc/webp_enc.c
index 283cda8..9f4b10c 100644
--- a/src/3rdparty/libwebp/src/enc/webp_enc.c
+++ b/src/3rdparty/libwebp/src/enc/webp_enc.c
@@ -159,12 +159,16 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
       + WEBP_ALIGN_CST;                      // align all
   const size_t lf_stats_size =
       config->autofilter ? sizeof(*enc->lf_stats_) + WEBP_ALIGN_CST : 0;
+  const size_t top_derr_size =
+      (config->quality <= ERROR_DIFFUSION_QUALITY || config->pass > 1) ?
+          mb_w * sizeof(*enc->top_derr_) : 0;
   uint8_t* mem;
   const uint64_t size = (uint64_t)sizeof(*enc)   // main struct
                       + WEBP_ALIGN_CST           // cache alignment
                       + info_size                // modes info
                       + preds_size               // prediction modes
                       + samples_size             // top/left samples
+                      + top_derr_size            // top diffusion error
                       + nz_size                  // coeff context bits
                       + lf_stats_size;           // autofilter stats
 
@@ -175,11 +179,12 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
          "                info: %ld\n"
          "               preds: %ld\n"
          "         top samples: %ld\n"
+         "       top diffusion: %ld\n"
          "            non-zero: %ld\n"
          "            lf-stats: %ld\n"
          "               total: %ld\n",
          sizeof(*enc) + WEBP_ALIGN_CST, info_size,
-         preds_size, samples_size, nz_size, lf_stats_size, size);
+         preds_size, samples_size, top_derr_size, nz_size, lf_stats_size, size);
   printf("Transient object sizes:\n"
          "      VP8EncIterator: %ld\n"
          "        VP8ModeScore: %ld\n"
@@ -219,6 +224,8 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
   enc->y_top_ = mem;
   enc->uv_top_ = enc->y_top_ + top_stride;
   mem += 2 * top_stride;
+  enc->top_derr_ = top_derr_size ? (DError*)mem : NULL;
+  mem += top_derr_size;
   assert(mem <= (uint8_t*)enc + size);
 
   enc->config_ = config;
diff --git a/src/3rdparty/libwebp/src/mux/muxi.h b/src/3rdparty/libwebp/src/mux/muxi.h
index b73e3fb..6b57eea 100644
--- a/src/3rdparty/libwebp/src/mux/muxi.h
+++ b/src/3rdparty/libwebp/src/mux/muxi.h
@@ -26,9 +26,9 @@ extern "C" {
 //------------------------------------------------------------------------------
 // Defines and constants.
 
-#define MUX_MAJ_VERSION 0
-#define MUX_MIN_VERSION 4
-#define MUX_REV_VERSION 1
+#define MUX_MAJ_VERSION 1
+#define MUX_MIN_VERSION 0
+#define MUX_REV_VERSION 0
 
 // Chunk object.
 typedef struct WebPChunk WebPChunk;
diff --git a/src/3rdparty/libwebp/src/utils/endian_inl_utils.h b/src/3rdparty/libwebp/src/utils/endian_inl_utils.h
index 4b2f91d..3630a29 100644
--- a/src/3rdparty/libwebp/src/utils/endian_inl_utils.h
+++ b/src/3rdparty/libwebp/src/utils/endian_inl_utils.h
@@ -19,13 +19,6 @@
 #include "src/dsp/dsp.h"
 #include "src/webp/types.h"
 
-// some endian fix (e.g.: mips-gcc doesn't define __BIG_ENDIAN__)
-#if !defined(WORDS_BIGENDIAN) && \
-    (defined(__BIG_ENDIAN__) || defined(_M_PPC) || \
-     (defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)))
-#define WORDS_BIGENDIAN
-#endif
-
 #if defined(WORDS_BIGENDIAN)
 #define HToLE32 BSwap32
 #define HToLE16 BSwap16
diff --git a/src/3rdparty/libwebp/src/webp/config.h b/src/3rdparty/libwebp/src/webp/config.h
index 5ca2e3d..ae16e79 100644
--- a/src/3rdparty/libwebp/src/webp/config.h
+++ b/src/3rdparty/libwebp/src/webp/config.h
@@ -14,6 +14,9 @@
 /* Set to 1 if __builtin_bswap64 is available */
 /* #undef HAVE_BUILTIN_BSWAP64 */
 
+/* Define to 1 if you have the <cpu-features.h> header file. */
+/* #undef HAVE_CPU_FEATURES_H */
+
 /* Define to 1 if you have the <dlfcn.h> header file. */
 /* #undef HAVE_DLFCN_H */
 
@@ -65,8 +68,7 @@
 /* Define to 1 if you have the <windows.h> header file. */
 /* #undef HAVE_WINDOWS_H */
 
-/* Define to the sub-directory in which libtool stores uninstalled libraries.
-   */
+/* Define to the sub-directory where libtool stores uninstalled libraries. */
 /* #undef LT_OBJDIR ".libs/" */
 
 /* Name of package */
@@ -79,7 +81,7 @@
 #define PACKAGE_NAME "libwebp"
 
 /* Define to the full name and version of this package. */
-#define PACKAGE_STRING "libwebp 0.6.1"
+#define PACKAGE_STRING "libwebp 1.0.0"
 
 /* Define to the one symbol short name of this package. */
 #define PACKAGE_TARNAME "libwebp"
@@ -88,7 +90,7 @@
 #define PACKAGE_URL "http://developers.google.com/speed/webp"
 
 /* Define to the version of this package. */
-#define PACKAGE_VERSION "0.6.1"
+#define PACKAGE_VERSION "1.0.0"
 
 /* Define to necessary symbol if this constant uses a non-standard name on
    your system. */
@@ -98,7 +100,7 @@
 /* #undef STDC_HEADERS */
 
 /* Version number of package */
-#define VERSION "0.6.1"
+#define VERSION "1.0.0"
 
 /* Enable experimental code */
 /* #undef WEBP_EXPERIMENTAL_FEATURES */
@@ -127,6 +129,9 @@
 /* Set to 1 if PNG library is installed */
 /* #undef WEBP_HAVE_PNG */
 
+/* Set to 1 if SDL library is installed */
+/* #undef WEBP_HAVE_SDL */
+
 /* Set to 1 if SSE2 is supported */
 /* #undef WEBP_HAVE_SSE2 */
 
@@ -136,6 +141,9 @@
 /* Set to 1 if TIFF library is installed */
 /* #undef WEBP_HAVE_TIFF */
 
+/* Enable near lossless encoding */
+/* #undef WEBP_NEAR_LOSSLESS */
+
 /* Undefine this to disable thread support. */
 #define WEBP_USE_THREAD 1
author	Liang Qi <liang.qi@qt.io>	2018-05-14 13:07:15 +0200
committer	Liang Qi <liang.qi@qt.io>	2018-05-22 08:03:36 +0000
commit	62082a63e112e9991b33c2045896ced78ffcb62e (patch)
tree	04a66f057499c90be0a8abfe8b0375886c6f25df
parent	92398950d9cfe5a88cb685ec166eb413aa8613ec (diff)