30 files changed, 1999 insertions, 1061 deletions
diff --git a/src/3rdparty/libwebp/src/enc/alpha.c b/src/3rdparty/libwebp/src/enc/alpha_enc.c
index 03e3ad0..5a2c931 100644
--- a/src/3rdparty/libwebp/src/enc/alpha.c
+++ b/src/3rdparty/libwebp/src/enc/alpha_enc.c
@@ -14,10 +14,10 @@
 #include <assert.h>
 #include <stdlib.h>
 
-#include "./vp8enci.h"
+#include "./vp8i_enc.h"
 #include "../dsp/dsp.h"
-#include "../utils/filters.h"
-#include "../utils/quant_levels.h"
+#include "../utils/filters_utils.h"
+#include "../utils/quant_levels_utils.h"
 #include "../utils/utils.h"
 #include "../webp/format_constants.h"
 
@@ -44,7 +44,7 @@
 //           invalid quality or method, or
 //           memory allocation for the compressed data fails.
 
-#include "../enc/vp8li.h"
+#include "../enc/vp8li_enc.h"
 
 static int EncodeLossless(const uint8_t* const data, int width, int height,
                           int effort_level,  // in [0..6] range
diff --git a/src/3rdparty/libwebp/src/enc/analysis.c b/src/3rdparty/libwebp/src/enc/analysis_enc.c
index b55128f..dce159b 100644
--- a/src/3rdparty/libwebp/src/enc/analysis.c
+++ b/src/3rdparty/libwebp/src/enc/analysis_enc.c
@@ -15,8 +15,8 @@
 #include <string.h>
 #include <assert.h>
 
-#include "./vp8enci.h"
-#include "./cost.h"
+#include "./vp8i_enc.h"
+#include "./cost_enc.h"
 #include "../utils/utils.h"
 
 #define MAX_ITERS_K_MEANS  6
@@ -262,6 +262,29 @@ static int MBAnalyzeBestIntra16Mode(VP8EncIterator* const it) {
   return best_alpha;
 }
 
+static int FastMBAnalyze(VP8EncIterator* const it) {
+  // Empirical cut-off value, should be around 16 (~=block size). We use the
+  // [8-17] range and favor intra4 at high quality, intra16 for low quality.
+  const int q = (int)it->enc_->config_->quality;
+  const uint32_t kThreshold = 8 + (17 - 8) * q / 100;
+  int k;
+  uint32_t dc[16], m, m2;
+  for (k = 0; k < 16; k += 4) {
+    VP8Mean16x4(it->yuv_in_ + Y_OFF_ENC + k * BPS, &dc[k]);
+  }
+  for (m = 0, m2 = 0, k = 0; k < 16; ++k) {
+    m += dc[k];
+    m2 += dc[k] * dc[k];
+  }
+  if (kThreshold * m2 < m * m) {
+    VP8SetIntra16Mode(it, 0);   // DC16
+  } else {
+    const uint8_t modes[16] = { 0 };  // DC4
+    VP8SetIntra4Mode(it, modes);
+  }
+  return 0;
+}
+
 static int MBAnalyzeBestIntra4Mode(VP8EncIterator* const it,
                                    int best_alpha) {
   uint8_t modes[16];
@@ -307,6 +330,7 @@ static int MBAnalyzeBestIntra4Mode(VP8EncIterator* const it,
 
 static int MBAnalyzeBestUVMode(VP8EncIterator* const it) {
   int best_alpha = DEFAULT_ALPHA;
+  int smallest_alpha = 0;
   int best_mode = 0;
   const int max_mode = MAX_UV_MODE;
   int mode;
@@ -322,6 +346,10 @@ static int MBAnalyzeBestUVMode(VP8EncIterator* const it) {
     alpha = GetAlpha(&histo);
     if (IS_BETTER_ALPHA(alpha, best_alpha)) {
       best_alpha = alpha;
+    }
+    // The best prediction mode tends to be the one with the smallest alpha.
+    if (mode == 0 || alpha < smallest_alpha) {
+      smallest_alpha = alpha;
       best_mode = mode;
     }
   }
@@ -339,13 +367,17 @@ static void MBAnalyze(VP8EncIterator* const it,
   VP8SetSkip(it, 0);         // not skipped
   VP8SetSegment(it, 0);      // default segment, spec-wise.
 
-  best_alpha = MBAnalyzeBestIntra16Mode(it);
-  if (enc->method_ >= 5) {
-    // We go and make a fast decision for intra4/intra16.
-    // It's usually not a good and definitive pick, but helps seeding the stats
-    // about level bit-cost.
-    // TODO(skal): improve criterion.
-    best_alpha = MBAnalyzeBestIntra4Mode(it, best_alpha);
+  if (enc->method_ <= 1) {
+    best_alpha = FastMBAnalyze(it);
+  } else {
+    best_alpha = MBAnalyzeBestIntra16Mode(it);
+    if (enc->method_ >= 5) {
+      // We go and make a fast decision for intra4/intra16.
+      // It's usually not a good and definitive pick, but helps seeding the
+      // stats about level bit-cost.
+      // TODO(skal): improve criterion.
+      best_alpha = MBAnalyzeBestIntra4Mode(it, best_alpha);
+    }
   }
   best_uv_alpha = MBAnalyzeBestUVMode(it);
 
@@ -448,7 +480,7 @@ int VP8EncAnalyze(VP8Encoder* const enc) {
   const int do_segments =
       enc->config_->emulate_jpeg_size ||   // We need the complexity evaluation.
       (enc->segment_hdr_.num_segments_ > 1) ||
-      (enc->method_ == 0);  // for method 0, we need preds_[] to be filled.
+      (enc->method_ <= 1);  // for method 0 - 1, we need preds_[] to be filled.
   if (do_segments) {
     const int last_row = enc->mb_h_;
     // We give a little more than a half work to the main thread.
diff --git a/src/3rdparty/libwebp/src/enc/backward_references.c b/src/3rdparty/libwebp/src/enc/backward_references_enc.c
index 136a24a..7c0559f 100644
--- a/src/3rdparty/libwebp/src/enc/backward_references.c
+++ b/src/3rdparty/libwebp/src/enc/backward_references_enc.c
@@ -13,11 +13,12 @@
 #include <assert.h>
 #include <math.h>
 
-#include "./backward_references.h"
-#include "./histogram.h"
+#include "./backward_references_enc.h"
+#include "./histogram_enc.h"
 #include "../dsp/lossless.h"
+#include "../dsp/lossless_common.h"
 #include "../dsp/dsp.h"
-#include "../utils/color_cache.h"
+#include "../utils/color_cache_utils.h"
 #include "../utils/utils.h"
 
 #define VALUES_IN_BYTE 256
@@ -30,8 +31,9 @@
 #define WINDOW_SIZE_BITS 20
 #define WINDOW_SIZE ((1 << WINDOW_SIZE_BITS) - 120)
 
-// Bounds for the match length.
-#define MIN_LENGTH 2
+// Minimum number of pixels for which it is cheaper to encode a
+// distance + length instead of each pixel as a literal.
+#define MIN_LENGTH 4
 // If you change this, you need MAX_LENGTH_BITS + WINDOW_SIZE_BITS <= 32 as it
 // is used in VP8LHashChain.
 #define MAX_LENGTH_BITS 12
@@ -211,13 +213,13 @@ void VP8LHashChainClear(VP8LHashChain* const p) {
 
 // -----------------------------------------------------------------------------
 
-#define HASH_MULTIPLIER_HI (0xc6a4a793U)
-#define HASH_MULTIPLIER_LO (0x5bd1e996U)
+#define HASH_MULTIPLIER_HI (0xc6a4a793ULL)
+#define HASH_MULTIPLIER_LO (0x5bd1e996ULL)
 
 static WEBP_INLINE uint32_t GetPixPairHash64(const uint32_t* const argb) {
   uint32_t key;
-  key  = argb[1] * HASH_MULTIPLIER_HI;
-  key += argb[0] * HASH_MULTIPLIER_LO;
+  key  = (argb[1] * HASH_MULTIPLIER_HI) & 0xffffffffu;
+  key += (argb[0] * HASH_MULTIPLIER_LO) & 0xffffffffu;
   key = key >> (32 - HASH_BITS);
   return key;
 }
@@ -242,19 +244,26 @@ static WEBP_INLINE int MaxFindCopyLength(int len) {
 }
 
 int VP8LHashChainFill(VP8LHashChain* const p, int quality,
-                      const uint32_t* const argb, int xsize, int ysize) {
+                      const uint32_t* const argb, int xsize, int ysize,
+                      int low_effort) {
   const int size = xsize * ysize;
   const int iter_max = GetMaxItersForQuality(quality);
-  const int iter_min = iter_max - quality / 10;
   const uint32_t window_size = GetWindowSizeForHashChain(quality, xsize);
   int pos;
+  int argb_comp;
   uint32_t base_position;
   int32_t* hash_to_first_index;
   // Temporarily use the p->offset_length_ as a hash chain.
   int32_t* chain = (int32_t*)p->offset_length_;
+  assert(size > 0);
   assert(p->size_ != 0);
   assert(p->offset_length_ != NULL);
 
+  if (size <= 2) {
+    p->offset_length_[0] = p->offset_length_[size - 1] = 0;
+    return 1;
+  }
+
   hash_to_first_index =
       (int32_t*)WebPSafeMalloc(HASH_SIZE, sizeof(*hash_to_first_index));
   if (hash_to_first_index == NULL) return 0;
@@ -262,48 +271,111 @@ int VP8LHashChainFill(VP8LHashChain* const p, int quality,
   // Set the int32_t array to -1.
   memset(hash_to_first_index, 0xff, HASH_SIZE * sizeof(*hash_to_first_index));
   // Fill the chain linking pixels with the same hash.
-  for (pos = 0; pos < size - 1; ++pos) {
-    const uint32_t hash_code = GetPixPairHash64(argb + pos);
-    chain[pos] = hash_to_first_index[hash_code];
-    hash_to_first_index[hash_code] = pos;
+  argb_comp = (argb[0] == argb[1]);
+  for (pos = 0; pos < size - 2;) {
+    uint32_t hash_code;
+    const int argb_comp_next = (argb[pos + 1] == argb[pos + 2]);
+    if (argb_comp && argb_comp_next) {
+      // Consecutive pixels with the same color will share the same hash.
+      // We therefore use a different hash: the color and its repetition
+      // length.
+      uint32_t tmp[2];
+      uint32_t len = 1;
+      tmp[0] = argb[pos];
+      // Figure out how far the pixels are the same.
+      // The last pixel has a different 64 bit hash, as its next pixel does
+      // not have the same color, so we just need to get to the last pixel equal
+      // to its follower.
+      while (pos + (int)len + 2 < size && argb[pos + len + 2] == argb[pos]) {
+        ++len;
+      }
+      if (len > MAX_LENGTH) {
+        // Skip the pixels that match for distance=1 and length>MAX_LENGTH
+        // because they are linked to their predecessor and we automatically
+        // check that in the main for loop below. Skipping means setting no
+        // predecessor in the chain, hence -1.
+        memset(chain + pos, 0xff, (len - MAX_LENGTH) * sizeof(*chain));
+        pos += len - MAX_LENGTH;
+        len = MAX_LENGTH;
+      }
+      // Process the rest of the hash chain.
+      while (len) {
+        tmp[1] = len--;
+        hash_code = GetPixPairHash64(tmp);
+        chain[pos] = hash_to_first_index[hash_code];
+        hash_to_first_index[hash_code] = pos++;
+      }
+      argb_comp = 0;
+    } else {
+      // Just move one pixel forward.
+      hash_code = GetPixPairHash64(argb + pos);
+      chain[pos] = hash_to_first_index[hash_code];
+      hash_to_first_index[hash_code] = pos++;
+      argb_comp = argb_comp_next;
+    }
   }
+  // Process the penultimate pixel.
+  chain[pos] = hash_to_first_index[GetPixPairHash64(argb + pos)];
+
   WebPSafeFree(hash_to_first_index);
 
   // Find the best match interval at each pixel, defined by an offset to the
   // pixel and a length. The right-most pixel cannot match anything to the right
   // (hence a best length of 0) and the left-most pixel nothing to the left
   // (hence an offset of 0).
+  assert(size > 2);
   p->offset_length_[0] = p->offset_length_[size - 1] = 0;
-  for (base_position = size - 2 < 0 ? 0 : size - 2; base_position > 0;) {
+  for (base_position = size - 2; base_position > 0;) {
     const int max_len = MaxFindCopyLength(size - 1 - base_position);
     const uint32_t* const argb_start = argb + base_position;
     int iter = iter_max;
     int best_length = 0;
     uint32_t best_distance = 0;
+    uint32_t best_argb;
     const int min_pos =
         (base_position > window_size) ? base_position - window_size : 0;
     const int length_max = (max_len < 256) ? max_len : 256;
     uint32_t max_base_position;
 
-    for (pos = chain[base_position]; pos >= min_pos; pos = chain[pos]) {
+    pos = chain[base_position];
+    if (!low_effort) {
       int curr_length;
-      if (--iter < 0) {
-        break;
+      // Heuristic: use the comparison with the above line as an initialization.
+      if (base_position >= (uint32_t)xsize) {
+        curr_length = FindMatchLength(argb_start - xsize, argb_start,
+                                      best_length, max_len);
+        if (curr_length > best_length) {
+          best_length = curr_length;
+          best_distance = xsize;
+        }
+        --iter;
+      }
+      // Heuristic: compare to the previous pixel.
+      curr_length =
+          FindMatchLength(argb_start - 1, argb_start, best_length, max_len);
+      if (curr_length > best_length) {
+        best_length = curr_length;
+        best_distance = 1;
       }
+      --iter;
+      // Skip the for loop if we already have the maximum.
+      if (best_length == MAX_LENGTH) pos = min_pos - 1;
+    }
+    best_argb = argb_start[best_length];
+
+    for (; pos >= min_pos && --iter; pos = chain[pos]) {
+      int curr_length;
       assert(base_position > (uint32_t)pos);
 
-      curr_length =
-          FindMatchLength(argb + pos, argb_start, best_length, max_len);
+      if (argb[pos + best_length] != best_argb) continue;
+
+      curr_length = VP8LVectorMismatch(argb + pos, argb_start, max_len);
       if (best_length < curr_length) {
         best_length = curr_length;
         best_distance = base_position - pos;
-        // Stop if we have reached the maximum length. Otherwise, make sure
-        // we have executed a minimum number of iterations depending on the
-        // quality.
-        if ((best_length == MAX_LENGTH) ||
-            (curr_length >= length_max && iter < iter_min)) {
-          break;
-        }
+        best_argb = argb_start[best_length];
+        // Stop if we have reached a good enough length.
+        if (best_length >= length_max) break;
       }
     }
     // We have the best match but in case the two intervals continue matching
@@ -392,17 +464,16 @@ static int BackwardReferencesRle(int xsize, int ysize,
   i = 1;
   while (i < pix_count) {
     const int max_len = MaxFindCopyLength(pix_count - i);
-    const int kMinLength = 4;
     const int rle_len = FindMatchLength(argb + i, argb + i - 1, 0, max_len);
     const int prev_row_len = (i < xsize) ? 0 :
         FindMatchLength(argb + i, argb + i - xsize, 0, max_len);
-    if (rle_len >= prev_row_len && rle_len >= kMinLength) {
+    if (rle_len >= prev_row_len && rle_len >= MIN_LENGTH) {
       BackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(1, rle_len));
       // We don't need to update the color cache here since it is always the
       // same pixel being copied, and that does not change the color cache
       // state.
       i += rle_len;
-    } else if (prev_row_len >= kMinLength) {
+    } else if (prev_row_len >= MIN_LENGTH) {
       BackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(xsize, prev_row_len));
       if (use_color_cache) {
         for (k = 0; k < prev_row_len; ++k) {
@@ -442,7 +513,7 @@ static int BackwardReferencesLz77(int xsize, int ysize,
     int len = 0;
     int j;
     HashChainFindCopy(hash_chain, i, &offset, &len);
-    if (len > MIN_LENGTH + 1) {
+    if (len >= MIN_LENGTH) {
       const int len_ini = len;
       int max_reach = 0;
       assert(i + len < pix_count);
@@ -457,7 +528,7 @@ static int BackwardReferencesLz77(int xsize, int ysize,
       for (j = i_last_check + 1; j <= i + len_ini; ++j) {
         const int len_j = HashChainFindLength(hash_chain, j);
         const int reach =
-            j + (len_j > MIN_LENGTH + 1 ? len_j : 1);  // 1 for single literal.
+            j + (len_j >= MIN_LENGTH ? len_j : 1);  // 1 for single literal.
         if (reach > max_reach) {
           len = j - i;
           max_reach = reach;
@@ -581,9 +652,10 @@ static void AddSingleLiteralWithCostModel(const uint32_t* const argb,
                                           uint16_t* const dist_array) {
   double cost_val = prev_cost;
   const uint32_t color = argb[0];
-  if (use_color_cache && VP8LColorCacheContains(hashers, color)) {
+  const int ix = use_color_cache ? VP8LColorCacheContains(hashers, color) : -1;
+  if (ix >= 0) {
+    // use_color_cache is true and hashers contains color
     const double mul0 = 0.68;
-    const int ix = VP8LColorCacheGetIndex(hashers, color);
     cost_val += GetCacheCost(cost_model, ix) * mul0;
   } else {
     const double mul1 = 0.82;
@@ -1215,7 +1287,8 @@ static int BackwardReferencesHashChainDistanceOnly(
     int offset = 0, len = 0;
     double prev_cost = cost_manager->costs_[i - 1];
     HashChainFindCopy(hash_chain, i, &offset, &len);
-    if (len >= MIN_LENGTH) {
+    if (len >= 2) {
+      // If we are dealing with a non-literal.
       const int code = DistanceToPlaneCode(xsize, offset);
       const double offset_cost = GetDistanceCost(cost_model, code);
       const int first_i = i;
@@ -1304,20 +1377,17 @@ static int BackwardReferencesHashChainDistanceOnly(
         }
         goto next_symbol;
       }
-      if (len > MIN_LENGTH) {
-        int code_min_length;
-        double cost_total;
-        offset = HashChainFindOffset(hash_chain, i);
-        code_min_length = DistanceToPlaneCode(xsize, offset);
-        cost_total = prev_cost +
-            GetDistanceCost(cost_model, code_min_length) +
-            GetLengthCost(cost_model, 1);
+      if (len > 2) {
+        // Also try the smallest interval possible (size 2).
+        double cost_total =
+            prev_cost + offset_cost + GetLengthCost(cost_model, 1);
         if (cost_manager->costs_[i + 1] > cost_total) {
           cost_manager->costs_[i + 1] = (float)cost_total;
           dist_array[i + 1] = 2;
         }
       }
-    } else {    // len < MIN_LENGTH
+    } else {
+      // The pixel is added as a single literal so just update the costs.
       UpdateCostPerIndex(cost_manager, i + 1);
     }
 
@@ -1393,9 +1463,11 @@ static int BackwardReferencesHashChainFollowChosenPath(
       i += len;
     } else {
       PixOrCopy v;
-      if (use_color_cache && VP8LColorCacheContains(&hashers, argb[i])) {
+      const int idx =
+          use_color_cache ? VP8LColorCacheContains(&hashers, argb[i]) : -1;
+      if (idx >= 0) {
+        // use_color_cache is true and hashers contains argb[i]
         // push pixel as a color cache index
-        const int idx = VP8LColorCacheGetIndex(&hashers, argb[i]);
         v = PixOrCopyCreateCacheIdx(idx);
       } else {
         if (use_color_cache) VP8LColorCacheInsert(&hashers, argb[i]);
@@ -1454,63 +1526,89 @@ static void BackwardReferences2DLocality(int xsize,
   }
 }
 
-// Returns entropy for the given cache bits.
-static double ComputeCacheEntropy(const uint32_t* argb,
-                                  const VP8LBackwardRefs* const refs,
-                                  int cache_bits) {
-  const int use_color_cache = (cache_bits > 0);
-  int cc_init = 0;
-  double entropy = MAX_ENTROPY;
-  const double kSmallPenaltyForLargeCache = 4.0;
-  VP8LColorCache hashers;
+// Computes the entropies for a color cache size (in bits) between 0 (unused)
+// and cache_bits_max (inclusive).
+// Returns 1 on success, 0 in case of allocation error.
+static int ComputeCacheEntropies(const uint32_t* argb,
+                                 const VP8LBackwardRefs* const refs,
+                                 int cache_bits_max, double entropies[]) {
+  int cc_init[MAX_COLOR_CACHE_BITS + 1] = { 0 };
+  VP8LColorCache hashers[MAX_COLOR_CACHE_BITS + 1];
   VP8LRefsCursor c = VP8LRefsCursorInit(refs);
-  VP8LHistogram* histo = VP8LAllocateHistogram(cache_bits);
-  if (histo == NULL) goto Error;
+  VP8LHistogram* histos[MAX_COLOR_CACHE_BITS + 1] = { NULL };
+  int ok = 0;
+  int i;
 
-  if (use_color_cache) {
-    cc_init = VP8LColorCacheInit(&hashers, cache_bits);
-    if (!cc_init) goto Error;
+  for (i = 0; i <= cache_bits_max; ++i) {
+    histos[i] = VP8LAllocateHistogram(i);
+    if (histos[i] == NULL) goto Error;
+    if (i == 0) continue;
+    cc_init[i] = VP8LColorCacheInit(&hashers[i], i);
+    if (!cc_init[i]) goto Error;
   }
-  if (!use_color_cache) {
-    while (VP8LRefsCursorOk(&c)) {
-      VP8LHistogramAddSinglePixOrCopy(histo, c.cur_pos);
-      VP8LRefsCursorNext(&c);
-    }
-  } else {
+
+  assert(cache_bits_max >= 0);
+  // Do not use the color cache for cache_bits=0.
+  while (VP8LRefsCursorOk(&c)) {
+    VP8LHistogramAddSinglePixOrCopy(histos[0], c.cur_pos);
+    VP8LRefsCursorNext(&c);
+  }
+  if (cache_bits_max > 0) {
+    c = VP8LRefsCursorInit(refs);
     while (VP8LRefsCursorOk(&c)) {
       const PixOrCopy* const v = c.cur_pos;
       if (PixOrCopyIsLiteral(v)) {
         const uint32_t pix = *argb++;
-        const uint32_t key = VP8LColorCacheGetIndex(&hashers, pix);
-        if (VP8LColorCacheLookup(&hashers, key) == pix) {
-          ++histo->literal_[NUM_LITERAL_CODES + NUM_LENGTH_CODES + key];
-        } else {
-          VP8LColorCacheSet(&hashers, key, pix);
-          ++histo->blue_[pix & 0xff];
-          ++histo->literal_[(pix >> 8) & 0xff];
-          ++histo->red_[(pix >> 16) & 0xff];
-          ++histo->alpha_[pix >> 24];
+        // The keys of the caches can be derived from the longest one.
+        int key = HashPix(pix, 32 - cache_bits_max);
+        for (i = cache_bits_max; i >= 1; --i, key >>= 1) {
+          if (VP8LColorCacheLookup(&hashers[i], key) == pix) {
+            ++histos[i]->literal_[NUM_LITERAL_CODES + NUM_LENGTH_CODES + key];
+          } else {
+            VP8LColorCacheSet(&hashers[i], key, pix);
+            ++histos[i]->blue_[pix & 0xff];
+            ++histos[i]->literal_[(pix >> 8) & 0xff];
+            ++histos[i]->red_[(pix >> 16) & 0xff];
+            ++histos[i]->alpha_[pix >> 24];
+          }
         }
       } else {
+        // Update the histograms for distance/length.
         int len = PixOrCopyLength(v);
-        int code, extra_bits;
-        VP8LPrefixEncodeBits(len, &code, &extra_bits);
-        ++histo->literal_[NUM_LITERAL_CODES + code];
-        VP8LPrefixEncodeBits(PixOrCopyDistance(v), &code, &extra_bits);
-        ++histo->distance_[code];
+        int code_dist, code_len, extra_bits;
+        uint32_t argb_prev = *argb ^ 0xffffffffu;
+        VP8LPrefixEncodeBits(len, &code_len, &extra_bits);
+        VP8LPrefixEncodeBits(PixOrCopyDistance(v), &code_dist, &extra_bits);
+        for (i = 1; i <= cache_bits_max; ++i) {
+          ++histos[i]->literal_[NUM_LITERAL_CODES + code_len];
+          ++histos[i]->distance_[code_dist];
+        }
+        // Update the colors caches.
         do {
-          VP8LColorCacheInsert(&hashers, *argb++);
-        } while(--len != 0);
+          if (*argb != argb_prev) {
+            // Efficiency: insert only if the color changes.
+            int key = HashPix(*argb, 32 - cache_bits_max);
+            for (i = cache_bits_max; i >= 1; --i, key >>= 1) {
+              hashers[i].colors_[key] = *argb;
+            }
+            argb_prev = *argb;
+          }
+          argb++;
+        } while (--len != 0);
       }
       VP8LRefsCursorNext(&c);
     }
   }
-  entropy = VP8LHistogramEstimateBits(histo) +
-      kSmallPenaltyForLargeCache * cache_bits;
- Error:
-  if (cc_init) VP8LColorCacheClear(&hashers);
-  VP8LFreeHistogram(histo);
-  return entropy;
+  for (i = 0; i <= cache_bits_max; ++i) {
+    entropies[i] = VP8LHistogramEstimateBits(histos[i]);
+  }
+  ok = 1;
+Error:
+  for (i = 0; i <= cache_bits_max; ++i) {
+    if (cc_init[i]) VP8LColorCacheClear(&hashers[i]);
+    VP8LFreeHistogram(histos[i]);
+  }
+  return ok;
 }
 
 // Evaluate optimal cache bits for the local color cache.
@@ -1524,13 +1622,10 @@ static int CalculateBestCacheSize(const uint32_t* const argb,
                                   VP8LBackwardRefs* const refs,
                                   int* const lz77_computed,
                                   int* const best_cache_bits) {
-  int eval_low = 1;
-  int eval_high = 1;
-  double entropy_low = MAX_ENTROPY;
-  double entropy_high = MAX_ENTROPY;
-  const double cost_mul = 5e-4;
-  int cache_bits_low = 0;
+  int i;
   int cache_bits_high = (quality <= 25) ? 0 : *best_cache_bits;
+  double entropy_min = MAX_ENTROPY;
+  double entropies[MAX_COLOR_CACHE_BITS + 1];
 
   assert(cache_bits_high <= MAX_COLOR_CACHE_BITS);
 
@@ -1540,34 +1635,23 @@ static int CalculateBestCacheSize(const uint32_t* const argb,
     // Local color cache is disabled.
     return 1;
   }
-  if (!BackwardReferencesLz77(xsize, ysize, argb, cache_bits_low, hash_chain,
-                              refs)) {
+  // Compute LZ77 with no cache (0 bits), as the ideal LZ77 with a color cache
+  // is not that different in practice.
+  if (!BackwardReferencesLz77(xsize, ysize, argb, 0, hash_chain, refs)) {
     return 0;
   }
-  // Do a binary search to find the optimal entropy for cache_bits.
-  while (eval_low || eval_high) {
-    if (eval_low) {
-      entropy_low = ComputeCacheEntropy(argb, refs, cache_bits_low);
-      entropy_low += entropy_low * cache_bits_low * cost_mul;
-      eval_low = 0;
-    }
-    if (eval_high) {
-      entropy_high = ComputeCacheEntropy(argb, refs, cache_bits_high);
-      entropy_high += entropy_high * cache_bits_high * cost_mul;
-      eval_high = 0;
-    }
-    if (entropy_high < entropy_low) {
-      const int prev_cache_bits_low = cache_bits_low;
-      *best_cache_bits = cache_bits_high;
-      cache_bits_low = (cache_bits_low + cache_bits_high) / 2;
-      if (cache_bits_low != prev_cache_bits_low) eval_low = 1;
-    } else {
-      *best_cache_bits = cache_bits_low;
-      cache_bits_high = (cache_bits_low + cache_bits_high) / 2;
-      if (cache_bits_high != cache_bits_low) eval_high = 1;
+  // Find the cache_bits giving the lowest entropy. The search is done in a
+  // brute-force way as the function (entropy w.r.t cache_bits) can be
+  // anything in practice.
+  if (!ComputeCacheEntropies(argb, refs, cache_bits_high, entropies)) {
+    return 0;
+  }
+  for (i = 0; i <= cache_bits_high; ++i) {
+    if (i == 0 || entropies[i] < entropy_min) {
+      entropy_min = entropies[i];
+      *best_cache_bits = i;
     }
   }
-  *lz77_computed = 1;
   return 1;
 }
 
@@ -1584,8 +1668,9 @@ static int BackwardRefsWithLocalCache(const uint32_t* const argb,
     PixOrCopy* const v = c.cur_pos;
     if (PixOrCopyIsLiteral(v)) {
       const uint32_t argb_literal = v->argb_or_distance;
-      if (VP8LColorCacheContains(&hashers, argb_literal)) {
-        const int ix = VP8LColorCacheGetIndex(&hashers, argb_literal);
+      const int ix = VP8LColorCacheContains(&hashers, argb_literal);
+      if (ix >= 0) {
+        // hashers contains argb_literal
         *v = PixOrCopyCreateCacheIdx(ix);
       } else {
         VP8LColorCacheInsert(&hashers, argb_literal);
diff --git a/src/3rdparty/libwebp/src/enc/backward_references.h b/src/3rdparty/libwebp/src/enc/backward_references_enc.h
index 0cadb11..3a19aa7 100644
--- a/src/3rdparty/libwebp/src/enc/backward_references.h
+++ b/src/3rdparty/libwebp/src/enc/backward_references_enc.h
@@ -130,7 +130,8 @@ struct VP8LHashChain {
 int VP8LHashChainInit(VP8LHashChain* const p, int size);
 // Pre-compute the best matches for argb.
 int VP8LHashChainFill(VP8LHashChain* const p, int quality,
-                      const uint32_t* const argb, int xsize, int ysize);
+                      const uint32_t* const argb, int xsize, int ysize,
+                      int low_effort);
 void VP8LHashChainClear(VP8LHashChain* const p);  // release memory
 
 // -----------------------------------------------------------------------------
diff --git a/src/3rdparty/libwebp/src/enc/config.c b/src/3rdparty/libwebp/src/enc/config_enc.c
index f9f7961..4589dc0 100644
--- a/src/3rdparty/libwebp/src/enc/config.c
+++ b/src/3rdparty/libwebp/src/enc/config_enc.c
@@ -11,6 +11,10 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
+#ifdef HAVE_CONFIG_H
+#include "../webp/config.h"
+#endif
+
 #include "../webp/encode.h"
 
 //------------------------------------------------------------------------------
@@ -49,9 +53,8 @@ int WebPConfigInitInternal(WebPConfig* config,
   config->thread_level = 0;
   config->low_memory = 0;
   config->near_lossless = 100;
-#ifdef WEBP_EXPERIMENTAL_FEATURES
-  config->delta_palettization = 0;
-#endif // WEBP_EXPERIMENTAL_FEATURES
+  config->use_delta_palette = 0;
+  config->use_sharp_yuv = 0;
 
   // TODO(skal): tune.
   switch (preset) {
@@ -92,60 +95,36 @@ int WebPConfigInitInternal(WebPConfig* config,
 
 int WebPValidateConfig(const WebPConfig* config) {
   if (config == NULL) return 0;
-  if (config->quality < 0 || config->quality > 100)
-    return 0;
-  if (config->target_size < 0)
-    return 0;
-  if (config->target_PSNR < 0)
-    return 0;
-  if (config->method < 0 || config->method > 6)
-    return 0;
-  if (config->segments < 1 || config->segments > 4)
-    return 0;
-  if (config->sns_strength < 0 || config->sns_strength > 100)
-    return 0;
-  if (config->filter_strength < 0 || config->filter_strength > 100)
-    return 0;
-  if (config->filter_sharpness < 0 || config->filter_sharpness > 7)
-    return 0;
-  if (config->filter_type < 0 || config->filter_type > 1)
-    return 0;
-  if (config->autofilter < 0 || config->autofilter > 1)
-    return 0;
-  if (config->pass < 1 || config->pass > 10)
-    return 0;
-  if (config->show_compressed < 0 || config->show_compressed > 1)
-    return 0;
-  if (config->preprocessing < 0 || config->preprocessing > 7)
-    return 0;
-  if (config->partitions < 0 || config->partitions > 3)
+  if (config->quality < 0 || config->quality > 100) return 0;
+  if (config->target_size < 0) return 0;
+  if (config->target_PSNR < 0) return 0;
+  if (config->method < 0 || config->method > 6) return 0;
+  if (config->segments < 1 || config->segments > 4) return 0;
+  if (config->sns_strength < 0 || config->sns_strength > 100) return 0;
+  if (config->filter_strength < 0 || config->filter_strength > 100) return 0;
+  if (config->filter_sharpness < 0 || config->filter_sharpness > 7) return 0;
+  if (config->filter_type < 0 || config->filter_type > 1) return 0;
+  if (config->autofilter < 0 || config->autofilter > 1) return 0;
+  if (config->pass < 1 || config->pass > 10) return 0;
+  if (config->show_compressed < 0 || config->show_compressed > 1) return 0;
+  if (config->preprocessing < 0 || config->preprocessing > 7) return 0;
+  if (config->partitions < 0 || config->partitions > 3) return 0;
+  if (config->partition_limit < 0 || config->partition_limit > 100) return 0;
+  if (config->alpha_compression < 0) return 0;
+  if (config->alpha_filtering < 0) return 0;
+  if (config->alpha_quality < 0 || config->alpha_quality > 100) return 0;
+  if (config->lossless < 0 || config->lossless > 1) return 0;
+  if (config->near_lossless < 0 || config->near_lossless > 100) return 0;
+  if (config->image_hint >= WEBP_HINT_LAST) return 0;
+  if (config->emulate_jpeg_size < 0 || config->emulate_jpeg_size > 1) return 0;
+  if (config->thread_level < 0 || config->thread_level > 1) return 0;
+  if (config->low_memory < 0 || config->low_memory > 1) return 0;
+  if (config->exact < 0 || config->exact > 1) return 0;
+  if (config->use_delta_palette < 0 || config->use_delta_palette > 1) {
     return 0;
-  if (config->partition_limit < 0 || config->partition_limit > 100)
-    return 0;
-  if (config->alpha_compression < 0)
-    return 0;
-  if (config->alpha_filtering < 0)
-    return 0;
-  if (config->alpha_quality < 0 || config->alpha_quality > 100)
-    return 0;
-  if (config->lossless < 0 || config->lossless > 1)
-    return 0;
-  if (config->near_lossless < 0 || config->near_lossless > 100)
-    return 0;
-  if (config->image_hint >= WEBP_HINT_LAST)
-    return 0;
-  if (config->emulate_jpeg_size < 0 || config->emulate_jpeg_size > 1)
-    return 0;
-  if (config->thread_level < 0 || config->thread_level > 1)
-    return 0;
-  if (config->low_memory < 0 || config->low_memory > 1)
-    return 0;
-  if (config->exact < 0 || config->exact > 1)
-    return 0;
-#ifdef WEBP_EXPERIMENTAL_FEATURES
-  if (config->delta_palettization < 0 || config->delta_palettization > 1)
-    return 0;
-#endif  // WEBP_EXPERIMENTAL_FEATURES
+  }
+  if (config->use_sharp_yuv < 0 || config->use_sharp_yuv > 1) return 0;
+
   return 1;
 }
 
diff --git a/src/3rdparty/libwebp/src/enc/cost.c b/src/3rdparty/libwebp/src/enc/cost_enc.c
index ae7fe01..c823f5a 100644
--- a/src/3rdparty/libwebp/src/enc/cost.c
+++ b/src/3rdparty/libwebp/src/enc/cost_enc.c
@@ -11,7 +11,7 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include "./cost.h"
+#include "./cost_enc.h"
 
 //------------------------------------------------------------------------------
 // Level cost tables
@@ -281,18 +281,6 @@ int VP8GetCostUV(VP8EncIterator* const it, const VP8ModeScore* const rd) {
 //------------------------------------------------------------------------------
 // Recording of token probabilities.
 
-// Record proba context used
-static int Record(int bit, proba_t* const stats) {
-  proba_t p = *stats;
-  if (p >= 0xffff0000u) {               // an overflow is inbound.
-    p = ((p + 1u) >> 1) & 0x7fff7fffu;  // -> divide the stats by 2.
-  }
-  // record bit count (lower 16 bits) and increment total count (upper 16 bits).
-  p += 0x00010000u + bit;
-  *stats = p;
-  return bit;
-}
-
 // We keep the table-free variant around for reference, in case.
 #define USE_LEVEL_CODE_TABLE
 
@@ -303,31 +291,31 @@ int VP8RecordCoeffs(int ctx, const VP8Residual* const res) {
   // should be stats[VP8EncBands[n]], but it's equivalent for n=0 or 1
   proba_t* s = res->stats[n][ctx];
   if (res->last  < 0) {
-    Record(0, s + 0);
+    VP8RecordStats(0, s + 0);
     return 0;
   }
   while (n <= res->last) {
     int v;
-    Record(1, s + 0);  // order of record doesn't matter
+    VP8RecordStats(1, s + 0);  // order of record doesn't matter
     while ((v = res->coeffs[n++]) == 0) {
-      Record(0, s + 1);
+      VP8RecordStats(0, s + 1);
       s = res->stats[VP8EncBands[n]][0];
     }
-    Record(1, s + 1);
-    if (!Record(2u < (unsigned int)(v + 1), s + 2)) {  // v = -1 or 1
+    VP8RecordStats(1, s + 1);
+    if (!VP8RecordStats(2u < (unsigned int)(v + 1), s + 2)) {  // v = -1 or 1
       s = res->stats[VP8EncBands[n]][1];
     } else {
       v = abs(v);
 #if !defined(USE_LEVEL_CODE_TABLE)
-      if (!Record(v > 4, s + 3)) {
-        if (Record(v != 2, s + 4))
-          Record(v == 4, s + 5);
-      } else if (!Record(v > 10, s + 6)) {
-        Record(v > 6, s + 7);
-      } else if (!Record((v >= 3 + (8 << 2)), s + 8)) {
-        Record((v >= 3 + (8 << 1)), s + 9);
+      if (!VP8RecordStats(v > 4, s + 3)) {
+        if (VP8RecordStats(v != 2, s + 4))
+          VP8RecordStats(v == 4, s + 5);
+      } else if (!VP8RecordStats(v > 10, s + 6)) {
+        VP8RecordStats(v > 6, s + 7);
+      } else if (!VP8RecordStats((v >= 3 + (8 << 2)), s + 8)) {
+        VP8RecordStats((v >= 3 + (8 << 1)), s + 9);
       } else {
-        Record((v >= 3 + (8 << 3)), s + 10);
+        VP8RecordStats((v >= 3 + (8 << 3)), s + 10);
       }
 #else
       if (v > MAX_VARIABLE_LEVEL) {
@@ -340,14 +328,14 @@ int VP8RecordCoeffs(int ctx, const VP8Residual* const res) {
         int i;
         for (i = 0; (pattern >>= 1) != 0; ++i) {
           const int mask = 2 << i;
-          if (pattern & 1) Record(!!(bits & mask), s + 3 + i);
+          if (pattern & 1) VP8RecordStats(!!(bits & mask), s + 3 + i);
         }
       }
 #endif
       s = res->stats[VP8EncBands[n]][2];
     }
   }
-  if (n < 16) Record(0, s + 0);
+  if (n < 16) VP8RecordStats(0, s + 0);
   return 1;
 }
 
diff --git a/src/3rdparty/libwebp/src/enc/cost.h b/src/3rdparty/libwebp/src/enc/cost_enc.h
index 20960d6..99e4b37 100644
--- a/src/3rdparty/libwebp/src/enc/cost.h
+++ b/src/3rdparty/libwebp/src/enc/cost_enc.h
@@ -16,7 +16,7 @@
 
 #include <assert.h>
 #include <stdlib.h>
-#include "./vp8enci.h"
+#include "./vp8i_enc.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -41,6 +41,20 @@ void VP8InitResidual(int first, int coeff_type,
 
 int VP8RecordCoeffs(int ctx, const VP8Residual* const res);
 
+// Record proba context used.
+static WEBP_INLINE int VP8RecordStats(int bit, proba_t* const stats) {
+  proba_t p = *stats;
+  // An overflow is inbound. Note we handle this at 0xfffe0000u instead of
+  // 0xffff0000u to make sure p + 1u does not overflow.
+  if (p >= 0xfffe0000u) {
+    p = ((p + 1u) >> 1) & 0x7fff7fffu;  // -> divide the stats by 2.
+  }
+  // record bit count (lower 16 bits) and increment total count (upper 16 bits).
+  p += 0x00010000u + bit;
+  *stats = p;
+  return bit;
+}
+
 // Cost of coding one event with probability 'proba'.
 static WEBP_INLINE int VP8BitCost(int bit, uint8_t proba) {
   return !bit ? VP8EntropyCost[proba] : VP8EntropyCost[255 - proba];
diff --git a/src/3rdparty/libwebp/src/enc/delta_palettization.c b/src/3rdparty/libwebp/src/enc/delta_palettization_enc.c
index 062e588..eaf0f05 100644
--- a/src/3rdparty/libwebp/src/enc/delta_palettization.c
+++ b/src/3rdparty/libwebp/src/enc/delta_palettization_enc.c
@@ -10,7 +10,7 @@
 // Author: Mislav Bradac (mislavm@google.com)
 //
 
-#include "./delta_palettization.h"
+#include "./delta_palettization_enc.h"
 
 #ifdef WEBP_EXPERIMENTAL_FEATURES
 #include "../webp/types.h"
diff --git a/src/3rdparty/libwebp/src/enc/delta_palettization.h b/src/3rdparty/libwebp/src/enc/delta_palettization_enc.h
index e41c0c5..63048ec 100644
--- a/src/3rdparty/libwebp/src/enc/delta_palettization.h
+++ b/src/3rdparty/libwebp/src/enc/delta_palettization_enc.h
@@ -14,7 +14,7 @@
 #define WEBP_ENC_DELTA_PALETTIZATION_H_
 
 #include "../webp/encode.h"
-#include "../enc/vp8li.h"
+#include "../enc/vp8li_enc.h"
 
 // Replaces enc->argb_[] input by a palettizable approximation of it,
 // and generates optimal enc->palette_[].
diff --git a/src/3rdparty/libwebp/src/enc/filter.c b/src/3rdparty/libwebp/src/enc/filter_enc.c
index e8ea8b4..4bc3672 100644
--- a/src/3rdparty/libwebp/src/enc/filter.c
+++ b/src/3rdparty/libwebp/src/enc/filter_enc.c
@@ -12,7 +12,7 @@
 // Author: somnath@google.com (Somnath Banerjee)
 
 #include <assert.h>
-#include "./vp8enci.h"
+#include "./vp8i_enc.h"
 #include "../dsp/dsp.h"
 
 // This table gives, for a given sharpness, the filtering strength to be
@@ -105,115 +105,28 @@ static void DoFilter(const VP8EncIterator* const it, int level) {
 }
 
 //------------------------------------------------------------------------------
-// SSIM metric
-
-static const double kMinValue = 1.e-10;  // minimal threshold
-
-void VP8SSIMAddStats(const VP8DistoStats* const src, VP8DistoStats* const dst) {
-  dst->w   += src->w;
-  dst->xm  += src->xm;
-  dst->ym  += src->ym;
-  dst->xxm += src->xxm;
-  dst->xym += src->xym;
-  dst->yym += src->yym;
-}
-
-double VP8SSIMGet(const VP8DistoStats* const stats) {
-  const double xmxm = stats->xm * stats->xm;
-  const double ymym = stats->ym * stats->ym;
-  const double xmym = stats->xm * stats->ym;
-  const double w2 = stats->w * stats->w;
-  double sxx = stats->xxm * stats->w - xmxm;
-  double syy = stats->yym * stats->w - ymym;
-  double sxy = stats->xym * stats->w - xmym;
-  double C1, C2;
-  double fnum;
-  double fden;
-  // small errors are possible, due to rounding. Clamp to zero.
-  if (sxx < 0.) sxx = 0.;
-  if (syy < 0.) syy = 0.;
-  C1 = 6.5025 * w2;
-  C2 = 58.5225 * w2;
-  fnum = (2 * xmym + C1) * (2 * sxy + C2);
-  fden = (xmxm + ymym + C1) * (sxx + syy + C2);
-  return (fden != 0.) ? fnum / fden : kMinValue;
-}
-
-double VP8SSIMGetSquaredError(const VP8DistoStats* const s) {
-  if (s->w > 0.) {
-    const double iw2 = 1. / (s->w * s->w);
-    const double sxx = s->xxm * s->w - s->xm * s->xm;
-    const double syy = s->yym * s->w - s->ym * s->ym;
-    const double sxy = s->xym * s->w - s->xm * s->ym;
-    const double SSE = iw2 * (sxx + syy - 2. * sxy);
-    if (SSE > kMinValue) return SSE;
-  }
-  return kMinValue;
-}
-
-#define LIMIT(A, M)  ((A) > (M) ? (M) : (A))
-static void VP8SSIMAccumulateRow(const uint8_t* src1, int stride1,
-                                 const uint8_t* src2, int stride2,
-                                 int y, int W, int H,
-                                 VP8DistoStats* const stats) {
-  int x = 0;
-  const int w0 = LIMIT(VP8_SSIM_KERNEL, W);
-  for (x = 0; x < w0; ++x) {
-    VP8SSIMAccumulateClipped(src1, stride1, src2, stride2, x, y, W, H, stats);
-  }
-  for (; x <= W - 8 + VP8_SSIM_KERNEL; ++x) {
-    VP8SSIMAccumulate(
-        src1 + (y - VP8_SSIM_KERNEL) * stride1 + (x - VP8_SSIM_KERNEL), stride1,
-        src2 + (y - VP8_SSIM_KERNEL) * stride2 + (x - VP8_SSIM_KERNEL), stride2,
-        stats);
-  }
-  for (; x < W; ++x) {
-    VP8SSIMAccumulateClipped(src1, stride1, src2, stride2, x, y, W, H, stats);
-  }
-}
-
-void VP8SSIMAccumulatePlane(const uint8_t* src1, int stride1,
-                            const uint8_t* src2, int stride2,
-                            int W, int H, VP8DistoStats* const stats) {
-  int x, y;
-  const int h0 = LIMIT(VP8_SSIM_KERNEL, H);
-  const int h1 = LIMIT(VP8_SSIM_KERNEL, H - VP8_SSIM_KERNEL);
-  for (y = 0; y < h0; ++y) {
-    for (x = 0; x < W; ++x) {
-      VP8SSIMAccumulateClipped(src1, stride1, src2, stride2, x, y, W, H, stats);
-    }
-  }
-  for (; y < h1; ++y) {
-    VP8SSIMAccumulateRow(src1, stride1, src2, stride2, y, W, H, stats);
-  }
-  for (; y < H; ++y) {
-    for (x = 0; x < W; ++x) {
-      VP8SSIMAccumulateClipped(src1, stride1, src2, stride2, x, y, W, H, stats);
-    }
-  }
-}
-#undef LIMIT
+// SSIM metric for one macroblock
 
 static double GetMBSSIM(const uint8_t* yuv1, const uint8_t* yuv2) {
   int x, y;
-  VP8DistoStats s = { .0, .0, .0, .0, .0, .0 };
+  double sum = 0.;
 
   // compute SSIM in a 10 x 10 window
   for (y = VP8_SSIM_KERNEL; y < 16 - VP8_SSIM_KERNEL; y++) {
     for (x = VP8_SSIM_KERNEL; x < 16 - VP8_SSIM_KERNEL; x++) {
-      VP8SSIMAccumulateClipped(yuv1 + Y_OFF_ENC, BPS, yuv2 + Y_OFF_ENC, BPS,
-                               x, y, 16, 16, &s);
+      sum += VP8SSIMGetClipped(yuv1 + Y_OFF_ENC, BPS, yuv2 + Y_OFF_ENC, BPS,
+                               x, y, 16, 16);
     }
   }
   for (x = 1; x < 7; x++) {
     for (y = 1; y < 7; y++) {
-      VP8SSIMAccumulateClipped(yuv1 + U_OFF_ENC, BPS, yuv2 + U_OFF_ENC, BPS,
-                               x, y, 8, 8, &s);
-      VP8SSIMAccumulateClipped(yuv1 + V_OFF_ENC, BPS, yuv2 + V_OFF_ENC, BPS,
-                               x, y, 8, 8, &s);
+      sum += VP8SSIMGetClipped(yuv1 + U_OFF_ENC, BPS, yuv2 + U_OFF_ENC, BPS,
+                               x, y, 8, 8);
+      sum += VP8SSIMGetClipped(yuv1 + V_OFF_ENC, BPS, yuv2 + V_OFF_ENC, BPS,
+                               x, y, 8, 8);
     }
   }
-  return VP8SSIMGet(&s);
+  return sum;
 }
 
 //------------------------------------------------------------------------------
diff --git a/src/3rdparty/libwebp/src/enc/frame.c b/src/3rdparty/libwebp/src/enc/frame_enc.c
index 5b7a40b..abef523 100644
--- a/src/3rdparty/libwebp/src/enc/frame.c
+++ b/src/3rdparty/libwebp/src/enc/frame_enc.c
@@ -14,8 +14,8 @@
 #include <string.h>
 #include <math.h>
 
-#include "./cost.h"
-#include "./vp8enci.h"
+#include "./cost_enc.h"
+#include "./vp8i_enc.h"
 #include "../dsp/dsp.h"
 #include "../webp/format_constants.h"  // RIFF constants
 
@@ -185,6 +185,13 @@ static int GetProba(int a, int b) {
                       : (255 * a + total / 2) / total;  // rounded proba
 }
 
+static void ResetSegments(VP8Encoder* const enc) {
+  int n;
+  for (n = 0; n < enc->mb_w_ * enc->mb_h_; ++n) {
+    enc->mb_info_[n].segment_ = 0;
+  }
+}
+
 static void SetSegmentProbas(VP8Encoder* const enc) {
   int p[NUM_MB_SEGMENTS] = { 0 };
   int n;
@@ -206,6 +213,7 @@ static void SetSegmentProbas(VP8Encoder* const enc) {
 
     enc->segment_hdr_.update_map_ =
         (probas[0] != 255) || (probas[1] != 255) || (probas[2] != 255);
+    if (!enc->segment_hdr_.update_map_) ResetSegments(enc);
     enc->segment_hdr_.size_ =
         p[0] * (VP8BitCost(0, probas[0]) + VP8BitCost(0, probas[1])) +
         p[1] * (VP8BitCost(0, probas[0]) + VP8BitCost(1, probas[1])) +
@@ -240,8 +248,9 @@ static int PutCoeffs(VP8BitWriter* const bw, int ctx, const VP8Residual* res) {
       p = res->prob[VP8EncBands[n]][1];
     } else {
       if (!VP8PutBit(bw, v > 4, p[3])) {
-        if (VP8PutBit(bw, v != 2, p[4]))
+        if (VP8PutBit(bw, v != 2, p[4])) {
           VP8PutBit(bw, v == 4, p[5]);
+        }
       } else if (!VP8PutBit(bw, v > 10, p[6])) {
         if (!VP8PutBit(bw, v > 6, p[7])) {
           VP8PutBit(bw, v == 6, 159);
@@ -406,9 +415,7 @@ static int RecordTokens(VP8EncIterator* const it, const VP8ModeScore* const rd,
     VP8InitResidual(0, 1, enc, &res);
     VP8SetResidualCoeffs(rd->y_dc_levels, &res);
     it->top_nz_[8] = it->left_nz_[8] =
-        VP8RecordCoeffTokens(ctx, 1,
-                             res.first, res.last, res.coeffs, tokens);
-    VP8RecordCoeffs(ctx, &res);
+        VP8RecordCoeffTokens(ctx, &res, tokens);
     VP8InitResidual(1, 0, enc, &res);
   } else {
     VP8InitResidual(0, 3, enc, &res);
@@ -420,9 +427,7 @@ static int RecordTokens(VP8EncIterator* const it, const VP8ModeScore* const rd,
       const int ctx = it->top_nz_[x] + it->left_nz_[y];
       VP8SetResidualCoeffs(rd->y_ac_levels[x + y * 4], &res);
       it->top_nz_[x] = it->left_nz_[y] =
-          VP8RecordCoeffTokens(ctx, res.coeff_type,
-                               res.first, res.last, res.coeffs, tokens);
-      VP8RecordCoeffs(ctx, &res);
+          VP8RecordCoeffTokens(ctx, &res, tokens);
     }
   }
 
@@ -434,9 +439,7 @@ static int RecordTokens(VP8EncIterator* const it, const VP8ModeScore* const rd,
         const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
         VP8SetResidualCoeffs(rd->uv_levels[ch * 2 + x + y * 2], &res);
         it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] =
-            VP8RecordCoeffTokens(ctx, 2,
-                                 res.first, res.last, res.coeffs, tokens);
-        VP8RecordCoeffs(ctx, &res);
+            VP8RecordCoeffTokens(ctx, &res, tokens);
       }
     }
   }
@@ -555,8 +558,9 @@ static uint64_t OneStatPass(VP8Encoder* const enc, VP8RDLevel rd_opt,
     size += info.R + info.H;
     size_p0 += info.H;
     distortion += info.D;
-    if (percent_delta && !VP8IteratorProgress(&it, percent_delta))
+    if (percent_delta && !VP8IteratorProgress(&it, percent_delta)) {
       return 0;
+    }
     VP8IteratorSaveBoundary(&it);
   } while (VP8IteratorNext(&it) && --nb_mbs > 0);
 
@@ -814,7 +818,7 @@ int VP8EncTokenLoop(VP8Encoder* const enc) {
            num_pass_left, stats.last_value, stats.value,
            stats.last_q, stats.q, stats.dq);
 #endif
-    if (size_p0 > PARTITION0_SIZE_LIMIT) {
+    if (enc->max_i4_header_bits_ > 0 && size_p0 > PARTITION0_SIZE_LIMIT) {
       ++num_pass_left;
       enc->max_i4_header_bits_ >>= 1;  // strengthen header bit limitation...
       continue;                        // ...and start over
diff --git a/src/3rdparty/libwebp/src/enc/histogram.c b/src/3rdparty/libwebp/src/enc/histogram_enc.c
index 395372b..808b6f7 100644
--- a/src/3rdparty/libwebp/src/enc/histogram.c
+++ b/src/3rdparty/libwebp/src/enc/histogram_enc.c
@@ -15,9 +15,10 @@
 
 #include <math.h>
 
-#include "./backward_references.h"
-#include "./histogram.h"
+#include "./backward_references_enc.h"
+#include "./histogram_enc.h"
 #include "../dsp/lossless.h"
+#include "../dsp/lossless_common.h"
 #include "../utils/utils.h"
 
 #define MAX_COST 1.e38
@@ -213,10 +214,19 @@ static double InitialHuffmanCost(void) {
 
 // Finalize the Huffman cost based on streak numbers and length type (<3 or >=3)
 static double FinalHuffmanCost(const VP8LStreaks* const stats) {
+  // The constants in this function are experimental and got rounded from
+  // their original values in 1/8 when switched to 1/1024.
   double retval = InitialHuffmanCost();
+  // Second coefficient: Many zeros in the histogram are covered efficiently
+  // by a run-length encode. Originally 2/8.
   retval += stats->counts[0] * 1.5625 + 0.234375 * stats->streaks[0][1];
+  // Second coefficient: Constant values are encoded less efficiently, but still
+  // RLE'ed. Originally 6/8.
   retval += stats->counts[1] * 2.578125 + 0.703125 * stats->streaks[1][1];
+  // 0s are usually encoded more efficiently than non-0s.
+  // Originally 15/8.
   retval += 1.796875 * stats->streaks[0][0];
+  // Originally 26/8.
   retval += 3.28125 * stats->streaks[1][0];
   return retval;
 }
@@ -236,14 +246,30 @@ static double PopulationCost(const uint32_t* const population, int length,
   return BitsEntropyRefine(&bit_entropy) + FinalHuffmanCost(&stats);
 }
 
+// trivial_at_end is 1 if the two histograms only have one element that is
+// non-zero: both the zero-th one, or both the last one.
 static WEBP_INLINE double GetCombinedEntropy(const uint32_t* const X,
                                              const uint32_t* const Y,
-                                             int length) {
-  VP8LBitEntropy bit_entropy;
+                                             int length, int trivial_at_end) {
   VP8LStreaks stats;
-  VP8LGetCombinedEntropyUnrefined(X, Y, length, &bit_entropy, &stats);
+  if (trivial_at_end) {
+    // This configuration is due to palettization that transforms an indexed
+    // pixel into 0xff000000 | (pixel << 8) in VP8LBundleColorMap.
+    // BitsEntropyRefine is 0 for histograms with only one non-zero value.
+    // Only FinalHuffmanCost needs to be evaluated.
+    memset(&stats, 0, sizeof(stats));
+    // Deal with the non-zero value at index 0 or length-1.
+    stats.streaks[1][0] += 1;
+    // Deal with the following/previous zero streak.
+    stats.counts[0] += 1;
+    stats.streaks[0][1] += length - 1;
+    return FinalHuffmanCost(&stats);
+  } else {
+    VP8LBitEntropy bit_entropy;
+    VP8LGetCombinedEntropyUnrefined(X, Y, length, &bit_entropy, &stats);
 
-  return BitsEntropyRefine(&bit_entropy) + FinalHuffmanCost(&stats);
+    return BitsEntropyRefine(&bit_entropy) + FinalHuffmanCost(&stats);
+  }
 }
 
 // Estimates the Entropy + Huffman + other block overhead size cost.
@@ -267,24 +293,42 @@ static int GetCombinedHistogramEntropy(const VP8LHistogram* const a,
                                        double cost_threshold,
                                        double* cost) {
   const int palette_code_bits = a->palette_code_bits_;
+  int trivial_at_end = 0;
   assert(a->palette_code_bits_ == b->palette_code_bits_);
   *cost += GetCombinedEntropy(a->literal_, b->literal_,
-                              VP8LHistogramNumCodes(palette_code_bits));
+                              VP8LHistogramNumCodes(palette_code_bits), 0);
   *cost += VP8LExtraCostCombined(a->literal_ + NUM_LITERAL_CODES,
                                  b->literal_ + NUM_LITERAL_CODES,
                                  NUM_LENGTH_CODES);
   if (*cost > cost_threshold) return 0;
 
-  *cost += GetCombinedEntropy(a->red_, b->red_, NUM_LITERAL_CODES);
+  if (a->trivial_symbol_ != VP8L_NON_TRIVIAL_SYM &&
+      a->trivial_symbol_ == b->trivial_symbol_) {
+    // A, R and B are all 0 or 0xff.
+    const uint32_t color_a = (a->trivial_symbol_ >> 24) & 0xff;
+    const uint32_t color_r = (a->trivial_symbol_ >> 16) & 0xff;
+    const uint32_t color_b = (a->trivial_symbol_ >> 0) & 0xff;
+    if ((color_a == 0 || color_a == 0xff) &&
+        (color_r == 0 || color_r == 0xff) &&
+        (color_b == 0 || color_b == 0xff)) {
+      trivial_at_end = 1;
+    }
+  }
+
+  *cost +=
+      GetCombinedEntropy(a->red_, b->red_, NUM_LITERAL_CODES, trivial_at_end);
   if (*cost > cost_threshold) return 0;
 
-  *cost += GetCombinedEntropy(a->blue_, b->blue_, NUM_LITERAL_CODES);
+  *cost +=
+      GetCombinedEntropy(a->blue_, b->blue_, NUM_LITERAL_CODES, trivial_at_end);
   if (*cost > cost_threshold) return 0;
 
-  *cost += GetCombinedEntropy(a->alpha_, b->alpha_, NUM_LITERAL_CODES);
+  *cost += GetCombinedEntropy(a->alpha_, b->alpha_, NUM_LITERAL_CODES,
+                              trivial_at_end);
   if (*cost > cost_threshold) return 0;
 
-  *cost += GetCombinedEntropy(a->distance_, b->distance_, NUM_DISTANCE_CODES);
+  *cost +=
+      GetCombinedEntropy(a->distance_, b->distance_, NUM_DISTANCE_CODES, 0);
   *cost +=
       VP8LExtraCostCombined(a->distance_, b->distance_, NUM_DISTANCE_CODES);
   if (*cost > cost_threshold) return 0;
@@ -292,6 +336,15 @@ static int GetCombinedHistogramEntropy(const VP8LHistogram* const a,
   return 1;
 }
 
+static WEBP_INLINE void HistogramAdd(const VP8LHistogram* const a,
+                                     const VP8LHistogram* const b,
+                                     VP8LHistogram* const out) {
+  VP8LHistogramAdd(a, b, out);
+  out->trivial_symbol_ = (a->trivial_symbol_ == b->trivial_symbol_)
+                       ? a->trivial_symbol_
+                       : VP8L_NON_TRIVIAL_SYM;
+}
+
 // Performs out = a + b, computing the cost C(a+b) - C(a) - C(b) while comparing
 // to the threshold value 'cost_threshold'. The score returned is
 //  Score = C(a+b) - C(a) - C(b), where C(a) + C(b) is known and fixed.
@@ -307,11 +360,9 @@ static double HistogramAddEval(const VP8LHistogram* const a,
   cost_threshold += sum_cost;
 
   if (GetCombinedHistogramEntropy(a, b, cost_threshold, &cost)) {
-    VP8LHistogramAdd(a, b, out);
+    HistogramAdd(a, b, out);
     out->bit_cost_ = cost;
     out->palette_code_bits_ = a->palette_code_bits_;
-    out->trivial_symbol_ = (a->trivial_symbol_ == b->trivial_symbol_) ?
-        a->trivial_symbol_ : VP8L_NON_TRIVIAL_SYM;
   }
 
   return cost - sum_cost;
@@ -450,113 +501,103 @@ static void HistogramCopyAndAnalyze(
 // Partition histograms to different entropy bins for three dominant (literal,
 // red and blue) symbol costs and compute the histogram aggregate bit_cost.
 static void HistogramAnalyzeEntropyBin(VP8LHistogramSet* const image_histo,
-                                       int16_t* const bin_map, int low_effort) {
+                                       uint16_t* const bin_map,
+                                       int low_effort) {
   int i;
   VP8LHistogram** const histograms = image_histo->histograms;
   const int histo_size = image_histo->size;
-  const int bin_depth = histo_size + 1;
   DominantCostRange cost_range;
   DominantCostRangeInit(&cost_range);
 
   // Analyze the dominant (literal, red and blue) entropy costs.
   for (i = 0; i < histo_size; ++i) {
-    VP8LHistogram* const histo = histograms[i];
-    UpdateDominantCostRange(histo, &cost_range);
+    UpdateDominantCostRange(histograms[i], &cost_range);
   }
 
   // bin-hash histograms on three of the dominant (literal, red and blue)
-  // symbol costs.
+  // symbol costs and store the resulting bin_id for each histogram.
   for (i = 0; i < histo_size; ++i) {
-    const VP8LHistogram* const histo = histograms[i];
-    const int bin_id = GetHistoBinIndex(histo, &cost_range, low_effort);
-    const int bin_offset = bin_id * bin_depth;
-    // bin_map[n][0] for every bin 'n' maintains the counter for the number of
-    // histograms in that bin.
-    // Get and increment the num_histos in that bin.
-    const int num_histos = ++bin_map[bin_offset];
-    assert(bin_offset + num_histos < bin_depth * BIN_SIZE);
-    // Add histogram i'th index at num_histos (last) position in the bin_map.
-    bin_map[bin_offset + num_histos] = i;
-  }
-}
-
-// Compact the histogram set by removing unused entries.
-static void HistogramCompactBins(VP8LHistogramSet* const image_histo) {
-  VP8LHistogram** const histograms = image_histo->histograms;
-  int i, j;
-
-  for (i = 0, j = 0; i < image_histo->size; ++i) {
-    if (histograms[i] != NULL && histograms[i]->bit_cost_ != 0.) {
-      if (j < i) {
-        histograms[j] = histograms[i];
-        histograms[i] = NULL;
-      }
-      ++j;
-    }
+    bin_map[i] = GetHistoBinIndex(histograms[i], &cost_range, low_effort);
   }
-  image_histo->size = j;
 }
 
+// Compact image_histo[] by merging some histograms with same bin_id together if
+// it's advantageous.
 static VP8LHistogram* HistogramCombineEntropyBin(
     VP8LHistogramSet* const image_histo,
     VP8LHistogram* cur_combo,
-    int16_t* const bin_map, int bin_depth, int num_bins,
+    const uint16_t* const bin_map, int bin_map_size, int num_bins,
     double combine_cost_factor, int low_effort) {
-  int bin_id;
   VP8LHistogram** const histograms = image_histo->histograms;
-
-  for (bin_id = 0; bin_id < num_bins; ++bin_id) {
-    const int bin_offset = bin_id * bin_depth;
-    const int num_histos = bin_map[bin_offset];
-    const int idx1 = bin_map[bin_offset + 1];
-    int num_combine_failures = 0;
-    int n;
-    for (n = 2; n <= num_histos; ++n) {
-      const int idx2 = bin_map[bin_offset + n];
-      if (low_effort) {
-        // Merge all histograms with the same bin index, irrespective of cost of
-        // the merged histograms.
-        VP8LHistogramAdd(histograms[idx1], histograms[idx2], histograms[idx1]);
-        histograms[idx2]->bit_cost_ = 0.;
-      } else {
-        const double bit_cost_idx2 = histograms[idx2]->bit_cost_;
-        if (bit_cost_idx2 > 0.) {
-          const double bit_cost_thresh = -bit_cost_idx2 * combine_cost_factor;
-          const double curr_cost_diff =
-              HistogramAddEval(histograms[idx1], histograms[idx2],
-                               cur_combo, bit_cost_thresh);
-          if (curr_cost_diff < bit_cost_thresh) {
-            // Try to merge two histograms only if the combo is a trivial one or
-            // the two candidate histograms are already non-trivial.
-            // For some images, 'try_combine' turns out to be false for a lot of
-            // histogram pairs. In that case, we fallback to combining
-            // histograms as usual to avoid increasing the header size.
-            const int try_combine =
-                (cur_combo->trivial_symbol_ != VP8L_NON_TRIVIAL_SYM) ||
-                ((histograms[idx1]->trivial_symbol_ == VP8L_NON_TRIVIAL_SYM) &&
-                 (histograms[idx2]->trivial_symbol_ == VP8L_NON_TRIVIAL_SYM));
-            const int max_combine_failures = 32;
-            if (try_combine || (num_combine_failures >= max_combine_failures)) {
-              HistogramSwap(&cur_combo, &histograms[idx1]);
-              histograms[idx2]->bit_cost_ = 0.;
-            } else {
-              ++num_combine_failures;
-            }
-          }
+  int idx;
+  // Work in-place: processed histograms are put at the beginning of
+  // image_histo[]. At the end, we just have to truncate the array.
+  int size = 0;
+  struct {
+    int16_t first;    // position of the histogram that accumulates all
+                      // histograms with the same bin_id
+    uint16_t num_combine_failures;   // number of combine failures per bin_id
+  } bin_info[BIN_SIZE];
+
+  assert(num_bins <= BIN_SIZE);
+  for (idx = 0; idx < num_bins; ++idx) {
+    bin_info[idx].first = -1;
+    bin_info[idx].num_combine_failures = 0;
+  }
+
+  for (idx = 0; idx < bin_map_size; ++idx) {
+    const int bin_id = bin_map[idx];
+    const int first = bin_info[bin_id].first;
+    assert(size <= idx);
+    if (first == -1) {
+      // just move histogram #idx to its final position
+      histograms[size] = histograms[idx];
+      bin_info[bin_id].first = size++;
+    } else if (low_effort) {
+      HistogramAdd(histograms[idx], histograms[first], histograms[first]);
+    } else {
+      // try to merge #idx into #first (both share the same bin_id)
+      const double bit_cost = histograms[idx]->bit_cost_;
+      const double bit_cost_thresh = -bit_cost * combine_cost_factor;
+      const double curr_cost_diff =
+          HistogramAddEval(histograms[first], histograms[idx],
+                           cur_combo, bit_cost_thresh);
+      if (curr_cost_diff < bit_cost_thresh) {
+        // Try to merge two histograms only if the combo is a trivial one or
+        // the two candidate histograms are already non-trivial.
+        // For some images, 'try_combine' turns out to be false for a lot of
+        // histogram pairs. In that case, we fallback to combining
+        // histograms as usual to avoid increasing the header size.
+        const int try_combine =
+            (cur_combo->trivial_symbol_ != VP8L_NON_TRIVIAL_SYM) ||
+            ((histograms[idx]->trivial_symbol_ == VP8L_NON_TRIVIAL_SYM) &&
+             (histograms[first]->trivial_symbol_ == VP8L_NON_TRIVIAL_SYM));
+        const int max_combine_failures = 32;
+        if (try_combine ||
+            bin_info[bin_id].num_combine_failures >= max_combine_failures) {
+          // move the (better) merged histogram to its final slot
+          HistogramSwap(&cur_combo, &histograms[first]);
+        } else {
+          histograms[size++] = histograms[idx];
+          ++bin_info[bin_id].num_combine_failures;
         }
+      } else {
+        histograms[size++] = histograms[idx];
       }
     }
-    if (low_effort) {
-      // Update the bit_cost for the merged histograms (per bin index).
-      UpdateHistogramCost(histograms[idx1]);
+  }
+  image_histo->size = size;
+  if (low_effort) {
+    // for low_effort case, update the final cost when everything is merged
+    for (idx = 0; idx < size; ++idx) {
+      UpdateHistogramCost(histograms[idx]);
     }
   }
-  HistogramCompactBins(image_histo);
   return cur_combo;
 }
 
-static uint32_t MyRand(uint32_t *seed) {
-  *seed *= 16807U;
+static uint32_t MyRand(uint32_t* const seed) {
+  *seed = (*seed * 16807ull) & 0xffffffffu;
   if (*seed == 0) {
     *seed = 1;
   }
@@ -592,8 +633,8 @@ static int HistoQueueInit(HistoQueue* const histo_queue, const int max_index) {
   histo_queue->max_size = max_index * max_index;
   // We allocate max_size + 1 because the last element at index "size" is
   // used as temporary data (and it could be up to max_size).
-  histo_queue->queue = WebPSafeMalloc(histo_queue->max_size + 1,
-                                      sizeof(*histo_queue->queue));
+  histo_queue->queue = (HistogramPair*)WebPSafeMalloc(
+      histo_queue->max_size + 1, sizeof(*histo_queue->queue));
   return histo_queue->queue != NULL;
 }
 
@@ -659,7 +700,8 @@ static int HistogramCombineGreedy(VP8LHistogramSet* const image_histo) {
   int i, j;
   VP8LHistogram** const histograms = image_histo->histograms;
   // Indexes of remaining histograms.
-  int* const clusters = WebPSafeMalloc(image_histo_size, sizeof(*clusters));
+  int* const clusters =
+      (int*)WebPSafeMalloc(image_histo_size, sizeof(*clusters));
   // Priority queue of histogram pairs.
   HistoQueue histo_queue;
 
@@ -681,7 +723,7 @@ static int HistogramCombineGreedy(VP8LHistogramSet* const image_histo) {
     HistogramPair* copy_to;
     const int idx1 = histo_queue.queue[0].idx1;
     const int idx2 = histo_queue.queue[0].idx2;
-    VP8LHistogramAdd(histograms[idx2], histograms[idx1], histograms[idx1]);
+    HistogramAdd(histograms[idx2], histograms[idx1], histograms[idx1]);
     histograms[idx1]->bit_cost_ = histo_queue.queue[0].cost_combo;
     // Remove merged histogram.
     for (i = 0; i + 1 < image_histo_size; ++i) {
@@ -747,6 +789,8 @@ static void HistogramCombineStochastic(VP8LHistogramSet* const image_histo,
   const int outer_iters = image_histo_size * iter_mult;
   const int num_pairs = image_histo_size / 2;
   const int num_tries_no_success = outer_iters / 2;
+  int idx2_max = image_histo_size - 1;
+  int do_brute_dorce = 0;
   VP8LHistogram** const histograms = image_histo->histograms;
 
   // Collapse similar histograms in 'image_histo'.
@@ -757,43 +801,62 @@ static void HistogramCombineStochastic(VP8LHistogramSet* const image_histo,
     double best_cost_diff = 0.;
     int best_idx1 = -1, best_idx2 = 1;
     int j;
-    const int num_tries =
+    int num_tries =
         (num_pairs < image_histo_size) ? num_pairs : image_histo_size;
+    // Use a brute force approach if:
+    // - stochastic has not worked for a while and
+    // - if the number of iterations for brute force is less than the number of
+    // iterations if we never find a match ever again stochastically (hence
+    // num_tries times the number of remaining outer iterations).
+    do_brute_dorce =
+        (tries_with_no_success > 10) &&
+        (idx2_max * (idx2_max + 1) < 2 * num_tries * (outer_iters - iter));
+    if (do_brute_dorce) num_tries = idx2_max;
+
     seed += iter;
     for (j = 0; j < num_tries; ++j) {
       double curr_cost_diff;
       // Choose two histograms at random and try to combine them.
-      const uint32_t idx1 = MyRand(&seed) % image_histo_size;
-      const uint32_t tmp = (j & 7) + 1;
-      const uint32_t diff =
-          (tmp < 3) ? tmp : MyRand(&seed) % (image_histo_size - 1);
-      const uint32_t idx2 = (idx1 + diff + 1) % image_histo_size;
-      if (idx1 == idx2) {
-        continue;
+      uint32_t idx1, idx2;
+      if (do_brute_dorce) {
+        // Use a brute force approach.
+        idx1 = (uint32_t)j;
+        idx2 = (uint32_t)idx2_max;
+      } else {
+        const uint32_t tmp = (j & 7) + 1;
+        const uint32_t diff =
+            (tmp < 3) ? tmp : MyRand(&seed) % (image_histo_size - 1);
+        idx1 = MyRand(&seed) % image_histo_size;
+        idx2 = (idx1 + diff + 1) % image_histo_size;
+        if (idx1 == idx2) {
+          continue;
+        }
       }
 
       // Calculate cost reduction on combining.
       curr_cost_diff = HistogramAddEval(histograms[idx1], histograms[idx2],
                                         tmp_histo, best_cost_diff);
-      if (curr_cost_diff < best_cost_diff) {    // found a better pair?
+      if (curr_cost_diff < best_cost_diff) {  // found a better pair?
         HistogramSwap(&best_combo, &tmp_histo);
         best_cost_diff = curr_cost_diff;
         best_idx1 = idx1;
         best_idx2 = idx2;
       }
     }
+    if (do_brute_dorce) --idx2_max;
 
     if (best_idx1 >= 0) {
       HistogramSwap(&best_combo, &histograms[best_idx1]);
       // swap best_idx2 slot with last one (which is now unused)
       --image_histo_size;
+      if (idx2_max >= image_histo_size) idx2_max = image_histo_size - 1;
       if (best_idx2 != image_histo_size) {
         HistogramSwap(&histograms[image_histo_size], &histograms[best_idx2]);
         histograms[image_histo_size] = NULL;
       }
       tries_with_no_success = 0;
     }
-    if (++tries_with_no_success >= num_tries_no_success) {
+    if (++tries_with_no_success >= num_tries_no_success || idx2_max == 0) {
       break;
     }
   }
@@ -842,7 +905,7 @@ static void HistogramRemap(const VP8LHistogramSet* const in,
 
   for (i = 0; i < in_size; ++i) {
     const int idx = symbols[i];
-    VP8LHistogramAdd(in_histo[i], out_histo[idx], out_histo[idx]);
+    HistogramAdd(in_histo[i], out_histo[idx], out_histo[idx]);
   }
 }
 
@@ -868,32 +931,18 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
   const int histo_xsize = histo_bits ? VP8LSubSampleSize(xsize, histo_bits) : 1;
   const int histo_ysize = histo_bits ? VP8LSubSampleSize(ysize, histo_bits) : 1;
   const int image_histo_raw_size = histo_xsize * histo_ysize;
-  const int entropy_combine_num_bins = low_effort ? NUM_PARTITIONS : BIN_SIZE;
-
-  // The bin_map for every bin follows following semantics:
-  // bin_map[n][0] = num_histo; // The number of histograms in that bin.
-  // bin_map[n][1] = index of first histogram in that bin;
-  // bin_map[n][num_histo] = index of last histogram in that bin;
-  // bin_map[n][num_histo + 1] ... bin_map[n][bin_depth - 1] = unused indices.
-  const int bin_depth = image_histo_raw_size + 1;
-  int16_t* bin_map = NULL;
   VP8LHistogramSet* const orig_histo =
       VP8LAllocateHistogramSet(image_histo_raw_size, cache_bits);
   VP8LHistogram* cur_combo;
+  // Don't attempt linear bin-partition heuristic for
+  // histograms of small sizes (as bin_map will be very sparse) and
+  // maximum quality q==100 (to preserve the compression gains at that level).
+  const int entropy_combine_num_bins = low_effort ? NUM_PARTITIONS : BIN_SIZE;
   const int entropy_combine =
       (orig_histo->size > entropy_combine_num_bins * 2) && (quality < 100);
 
   if (orig_histo == NULL) goto Error;
 
-  // Don't attempt linear bin-partition heuristic for:
-  // histograms of small sizes, as bin_map will be very sparse and;
-  // Maximum quality (q==100), to preserve the compression gains at that level.
-  if (entropy_combine) {
-    const int bin_map_size = bin_depth * entropy_combine_num_bins;
-    bin_map = (int16_t*)WebPSafeCalloc(bin_map_size, sizeof(*bin_map));
-    if (bin_map == NULL) goto Error;
-  }
-
   // Construct the histograms from backward references.
   HistogramBuild(xsize, histo_bits, refs, orig_histo);
   // Copies the histograms and computes its bit_cost.
@@ -901,12 +950,17 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
 
   cur_combo = tmp_histos->histograms[1];  // pick up working slot
   if (entropy_combine) {
+    const int bin_map_size = orig_histo->size;
+    // Reuse histogram_symbols storage. By definition, it's guaranteed to be ok.
+    uint16_t* const bin_map = histogram_symbols;
     const double combine_cost_factor =
         GetCombineCostFactor(image_histo_raw_size, quality);
+
     HistogramAnalyzeEntropyBin(orig_histo, bin_map, low_effort);
     // Collapse histograms with similar entropy.
-    cur_combo = HistogramCombineEntropyBin(image_histo, cur_combo, bin_map,
-                                           bin_depth, entropy_combine_num_bins,
+    cur_combo = HistogramCombineEntropyBin(image_histo, cur_combo,
+                                           bin_map, bin_map_size,
+                                           entropy_combine_num_bins,
                                            combine_cost_factor, low_effort);
   }
 
@@ -931,7 +985,6 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
   ok = 1;
 
  Error:
-  WebPSafeFree(bin_map);
   VP8LFreeHistogramSet(orig_histo);
   return ok;
 }
diff --git a/src/3rdparty/libwebp/src/enc/histogram.h b/src/3rdparty/libwebp/src/enc/histogram_enc.h
index d303d1d..a9d258a 100644
--- a/src/3rdparty/libwebp/src/enc/histogram.h
+++ b/src/3rdparty/libwebp/src/enc/histogram_enc.h
@@ -16,7 +16,7 @@
 
 #include <string.h>
 
-#include "./backward_references.h"
+#include "./backward_references_enc.h"
 #include "../webp/format_constants.h"
 #include "../webp/types.h"
 
diff --git a/src/3rdparty/libwebp/src/enc/iterator.c b/src/3rdparty/libwebp/src/enc/iterator_enc.c
index 99d960a..e48d30b 100644
--- a/src/3rdparty/libwebp/src/enc/iterator.c
+++ b/src/3rdparty/libwebp/src/enc/iterator_enc.c
@@ -13,7 +13,7 @@
 
 #include <string.h>
 
-#include "./vp8enci.h"
+#include "./vp8i_enc.h"
 
 //------------------------------------------------------------------------------
 // VP8Iterator
@@ -53,7 +53,6 @@ void VP8IteratorReset(VP8EncIterator* const it) {
   VP8IteratorSetRow(it, 0);
   VP8IteratorSetCountDown(it, enc->mb_w_ * enc->mb_h_);  // default
   InitTop(it);
-  InitLeft(it);
   memset(it->bit_count_, 0, sizeof(it->bit_count_));
   it->do_trellis_ = 0;
 }
@@ -68,8 +67,6 @@ int VP8IteratorIsDone(const VP8EncIterator* const it) {
 
 void VP8IteratorInit(VP8Encoder* const enc, VP8EncIterator* const it) {
   it->enc_ = enc;
-  it->y_stride_  = enc->pic_->y_stride;
-  it->uv_stride_ = enc->pic_->uv_stride;
   it->yuv_in_   = (uint8_t*)WEBP_ALIGN(it->yuv_mem_);
   it->yuv_out_  = it->yuv_in_ + YUV_SIZE_ENC;
   it->yuv_out2_ = it->yuv_out_ + YUV_SIZE_ENC;
@@ -309,14 +306,14 @@ void VP8IteratorSaveBoundary(VP8EncIterator* const it) {
 }
 
 int VP8IteratorNext(VP8EncIterator* const it) {
-  it->preds_ += 4;
-  it->mb_ += 1;
-  it->nz_ += 1;
-  it->y_top_ += 16;
-  it->uv_top_ += 16;
-  it->x_ += 1;
-  if (it->x_ == it->enc_->mb_w_) {
+  if (++it->x_ == it->enc_->mb_w_) {
     VP8IteratorSetRow(it, ++it->y_);
+  } else {
+    it->preds_ += 4;
+    it->mb_ += 1;
+    it->nz_ += 1;
+    it->y_top_ += 16;
+    it->uv_top_ += 16;
   }
   return (0 < --it->count_down_);
 }
diff --git a/src/3rdparty/libwebp/src/enc/near_lossless.c b/src/3rdparty/libwebp/src/enc/near_lossless_enc.c
index f4ab91f..2bd03ab 100644
--- a/src/3rdparty/libwebp/src/enc/near_lossless.c
+++ b/src/3rdparty/libwebp/src/enc/near_lossless_enc.c
@@ -17,9 +17,9 @@
 #include <assert.h>
 #include <stdlib.h>
 
-#include "../dsp/lossless.h"
+#include "../dsp/lossless_common.h"
 #include "../utils/utils.h"
-#include "./vp8enci.h"
+#include "./vp8i_enc.h"
 
 #define MIN_DIM_FOR_NEAR_LOSSLESS 64
 #define MAX_LIMIT_BITS             5
diff --git a/src/3rdparty/libwebp/src/enc/picture_csp.c b/src/3rdparty/libwebp/src/enc/picture_csp_enc.c
index 607a624..e5d1c75 100644
--- a/src/3rdparty/libwebp/src/enc/picture_csp.c
+++ b/src/3rdparty/libwebp/src/enc/picture_csp_enc.c
@@ -15,8 +15,8 @@
 #include <stdlib.h>
 #include <math.h>
 
-#include "./vp8enci.h"
-#include "../utils/random.h"
+#include "./vp8i_enc.h"
+#include "../utils/random_utils.h"
 #include "../utils/utils.h"
 #include "../dsp/yuv.h"
 
@@ -153,9 +153,9 @@ static int RGBToV(int r, int g, int b, VP8Random* const rg) {
 }
 
 //------------------------------------------------------------------------------
-// Smart RGB->YUV conversion
+// Sharp RGB->YUV conversion
 
-static const int kNumIterations = 6;
+static const int kNumIterations = 4;
 static const int kMinDimensionIterativeConversion = 4;
 
 // We could use SFIX=0 and only uint8_t for fixed_y_t, but it produces some
@@ -171,9 +171,9 @@ typedef uint16_t fixed_y_t;   // unsigned type with extra SFIX precision for W
 #if defined(USE_GAMMA_COMPRESSION)
 
 // float variant of gamma-correction
-// We use tables of different size and precision, along with a 'real-world'
-// Gamma value close to ~2.
-#define kGammaF 2.2
+// We use tables of different size and precision for the Rec709
+// transfer function.
+#define kGammaF (1./0.45)
 static float kGammaToLinearTabF[MAX_Y_T + 1];   // size scales with Y_FIX
 static float kLinearToGammaTabF[kGammaTabSize + 2];
 static volatile int kGammaTablesFOk = 0;
@@ -183,11 +183,26 @@ static WEBP_TSAN_IGNORE_FUNCTION void InitGammaTablesF(void) {
     int v;
     const double norm = 1. / MAX_Y_T;
     const double scale = 1. / kGammaTabSize;
+    const double a = 0.099;
+    const double thresh = 0.018;
     for (v = 0; v <= MAX_Y_T; ++v) {
-      kGammaToLinearTabF[v] = (float)pow(norm * v, kGammaF);
+      const double g = norm * v;
+      if (g <= thresh * 4.5) {
+        kGammaToLinearTabF[v] = (float)(g / 4.5);
+      } else {
+        const double a_rec = 1. / (1. + a);
+        kGammaToLinearTabF[v] = (float)pow(a_rec * (g + a), kGammaF);
+      }
     }
     for (v = 0; v <= kGammaTabSize; ++v) {
-      kLinearToGammaTabF[v] = (float)(MAX_Y_T * pow(scale * v, 1. / kGammaF));
+      const double g = scale * v;
+      double value;
+      if (g <= thresh) {
+        value = 4.5 * g;
+      } else {
+        value = (1. + a) * pow(g, 1. / kGammaF) - a;
+      }
+      kLinearToGammaTabF[v] = (float)(MAX_Y_T * value);
     }
     // to prevent small rounding errors to cause read-overflow:
     kLinearToGammaTabF[kGammaTabSize + 1] = kLinearToGammaTabF[kGammaTabSize];
@@ -235,12 +250,12 @@ static fixed_y_t clip_y(int y) {
 //------------------------------------------------------------------------------
 
 static int RGBToGray(int r, int g, int b) {
-  const int luma = 19595 * r + 38470 * g + 7471 * b + YUV_HALF;
+  const int luma = 13933 * r + 46871 * g + 4732 * b + YUV_HALF;
   return (luma >> YUV_FIX);
 }
 
 static float RGBToGrayF(float r, float g, float b) {
-  return 0.299f * r + 0.587f * g + 0.114f * b;
+  return (float)(0.2126 * r + 0.7152 * g + 0.0722 * b);
 }
 
 static int ScaleDown(int a, int b, int c, int d) {
@@ -251,58 +266,50 @@ static int ScaleDown(int a, int b, int c, int d) {
   return LinearToGammaF(0.25f * (A + B + C + D));
 }
 
-static WEBP_INLINE void UpdateW(const fixed_y_t* src, fixed_y_t* dst, int len) {
-  while (len-- > 0) {
-    const float R = GammaToLinearF(src[0]);
-    const float G = GammaToLinearF(src[1]);
-    const float B = GammaToLinearF(src[2]);
+static WEBP_INLINE void UpdateW(const fixed_y_t* src, fixed_y_t* dst, int w) {
+  int i;
+  for (i = 0; i < w; ++i) {
+    const float R = GammaToLinearF(src[0 * w + i]);
+    const float G = GammaToLinearF(src[1 * w + i]);
+    const float B = GammaToLinearF(src[2 * w + i]);
     const float Y = RGBToGrayF(R, G, B);
-    *dst++ = (fixed_y_t)LinearToGammaF(Y);
-    src += 3;
+    dst[i] = (fixed_y_t)LinearToGammaF(Y);
   }
 }
 
-static int UpdateChroma(const fixed_y_t* src1,
-                        const fixed_y_t* src2,
-                        fixed_t* dst, fixed_y_t* tmp, int len) {
-  int diff = 0;
-  while (len--> 0) {
-    const int r = ScaleDown(src1[0], src1[3], src2[0], src2[3]);
-    const int g = ScaleDown(src1[1], src1[4], src2[1], src2[4]);
-    const int b = ScaleDown(src1[2], src1[5], src2[2], src2[5]);
+static void UpdateChroma(const fixed_y_t* src1, const fixed_y_t* src2,
+                         fixed_t* dst, int uv_w) {
+  int i;
+  for (i = 0; i < uv_w; ++i) {
+    const int r = ScaleDown(src1[0 * uv_w + 0], src1[0 * uv_w + 1],
+                            src2[0 * uv_w + 0], src2[0 * uv_w + 1]);
+    const int g = ScaleDown(src1[2 * uv_w + 0], src1[2 * uv_w + 1],
+                            src2[2 * uv_w + 0], src2[2 * uv_w + 1]);
+    const int b = ScaleDown(src1[4 * uv_w + 0], src1[4 * uv_w + 1],
+                            src2[4 * uv_w + 0], src2[4 * uv_w + 1]);
     const int W = RGBToGray(r, g, b);
-    const int r_avg = (src1[0] + src1[3] + src2[0] + src2[3] + 2) >> 2;
-    const int g_avg = (src1[1] + src1[4] + src2[1] + src2[4] + 2) >> 2;
-    const int b_avg = (src1[2] + src1[5] + src2[2] + src2[5] + 2) >> 2;
-    dst[0] = (fixed_t)(r - W);
-    dst[1] = (fixed_t)(g - W);
-    dst[2] = (fixed_t)(b - W);
-    dst += 3;
-    src1 += 6;
-    src2 += 6;
-    if (tmp != NULL) {
-      tmp[0] = tmp[1] = clip_y(W);
-      tmp += 2;
-    }
-    diff += abs(RGBToGray(r_avg, g_avg, b_avg) - W);
+    dst[0 * uv_w] = (fixed_t)(r - W);
+    dst[1 * uv_w] = (fixed_t)(g - W);
+    dst[2 * uv_w] = (fixed_t)(b - W);
+    dst  += 1;
+    src1 += 2;
+    src2 += 2;
   }
-  return diff;
 }
 
-//------------------------------------------------------------------------------
-
-static WEBP_INLINE int Filter(const fixed_t* const A, const fixed_t* const B,
-                              int rightwise) {
-  int v;
-  if (!rightwise) {
-    v = (A[0] * 9 + A[-3] * 3 + B[0] * 3 + B[-3]);
-  } else {
-    v = (A[0] * 9 + A[+3] * 3 + B[0] * 3 + B[+3]);
+static void StoreGray(const fixed_y_t* rgb, fixed_y_t* y, int w) {
+  int i;
+  for (i = 0; i < w; ++i) {
+    y[i] = RGBToGray(rgb[0 * w + i], rgb[1 * w + i], rgb[2 * w + i]);
   }
-  return (v + 8) >> 4;
 }
 
-static WEBP_INLINE int Filter2(int A, int B) { return (A * 3 + B + 2) >> 2; }
+//------------------------------------------------------------------------------
+
+static WEBP_INLINE fixed_y_t Filter2(int A, int B, int W0) {
+  const int v0 = (A * 3 + B + 2) >> 2;
+  return clip_y(v0 + W0);
+}
 
 //------------------------------------------------------------------------------
 
@@ -317,52 +324,50 @@ static void ImportOneRow(const uint8_t* const r_ptr,
                          int pic_width,
                          fixed_y_t* const dst) {
   int i;
+  const int w = (pic_width + 1) & ~1;
   for (i = 0; i < pic_width; ++i) {
     const int off = i * step;
-    dst[3 * i + 0] = UpLift(r_ptr[off]);
-    dst[3 * i + 1] = UpLift(g_ptr[off]);
-    dst[3 * i + 2] = UpLift(b_ptr[off]);
+    dst[i + 0 * w] = UpLift(r_ptr[off]);
+    dst[i + 1 * w] = UpLift(g_ptr[off]);
+    dst[i + 2 * w] = UpLift(b_ptr[off]);
   }
   if (pic_width & 1) {  // replicate rightmost pixel
-    memcpy(dst + 3 * pic_width, dst + 3 * (pic_width - 1), 3 * sizeof(*dst));
+    dst[pic_width + 0 * w] = dst[pic_width + 0 * w - 1];
+    dst[pic_width + 1 * w] = dst[pic_width + 1 * w - 1];
+    dst[pic_width + 2 * w] = dst[pic_width + 2 * w - 1];
   }
 }
 
 static void InterpolateTwoRows(const fixed_y_t* const best_y,
-                               const fixed_t* const prev_uv,
-                               const fixed_t* const cur_uv,
-                               const fixed_t* const next_uv,
+                               const fixed_t* prev_uv,
+                               const fixed_t* cur_uv,
+                               const fixed_t* next_uv,
                                int w,
-                               fixed_y_t* const out1,
-                               fixed_y_t* const out2) {
-  int i, k;
-  {  // special boundary case for i==0
-    const int W0 = best_y[0];
-    const int W1 = best_y[w];
-    for (k = 0; k <= 2; ++k) {
-      out1[k] = clip_y(Filter2(cur_uv[k], prev_uv[k]) + W0);
-      out2[k] = clip_y(Filter2(cur_uv[k], next_uv[k]) + W1);
-    }
-  }
-  for (i = 1; i < w - 1; ++i) {
-    const int W0 = best_y[i + 0];
-    const int W1 = best_y[i + w];
-    const int off = 3 * (i >> 1);
-    for (k = 0; k <= 2; ++k) {
-      const int tmp0 = Filter(cur_uv + off + k, prev_uv + off + k, i & 1);
-      const int tmp1 = Filter(cur_uv + off + k, next_uv + off + k, i & 1);
-      out1[3 * i + k] = clip_y(tmp0 + W0);
-      out2[3 * i + k] = clip_y(tmp1 + W1);
-    }
-  }
-  {  // special boundary case for i == w - 1
-    const int W0 = best_y[i + 0];
-    const int W1 = best_y[i + w];
-    const int off = 3 * (i >> 1);
-    for (k = 0; k <= 2; ++k) {
-      out1[3 * i + k] = clip_y(Filter2(cur_uv[off + k], prev_uv[off + k]) + W0);
-      out2[3 * i + k] = clip_y(Filter2(cur_uv[off + k], next_uv[off + k]) + W1);
+                               fixed_y_t* out1,
+                               fixed_y_t* out2) {
+  const int uv_w = w >> 1;
+  const int len = (w - 1) >> 1;   // length to filter
+  int k = 3;
+  while (k-- > 0) {   // process each R/G/B segments in turn
+    // special boundary case for i==0
+    out1[0] = Filter2(cur_uv[0], prev_uv[0], best_y[0]);
+    out2[0] = Filter2(cur_uv[0], next_uv[0], best_y[w]);
+
+    WebPSharpYUVFilterRow(cur_uv, prev_uv, len, best_y + 0 + 1, out1 + 1);
+    WebPSharpYUVFilterRow(cur_uv, next_uv, len, best_y + w + 1, out2 + 1);
+
+    // special boundary case for i == w - 1 when w is even
+    if (!(w & 1)) {
+      out1[w - 1] = Filter2(cur_uv[uv_w - 1], prev_uv[uv_w - 1],
+                            best_y[w - 1 + 0]);
+      out2[w - 1] = Filter2(cur_uv[uv_w - 1], next_uv[uv_w - 1],
+                            best_y[w - 1 + w]);
     }
+    out1 += w;
+    out2 += w;
+    prev_uv += uv_w;
+    cur_uv  += uv_w;
+    next_uv += uv_w;
   }
 }
 
@@ -381,36 +386,42 @@ static WEBP_INLINE uint8_t ConvertRGBToV(int r, int g, int b) {
   return clip_8b(128 + (v >> (YUV_FIX + SFIX)));
 }
 
-static int ConvertWRGBToYUV(const fixed_y_t* const best_y,
-                            const fixed_t* const best_uv,
+static int ConvertWRGBToYUV(const fixed_y_t* best_y, const fixed_t* best_uv,
                             WebPPicture* const picture) {
   int i, j;
+  uint8_t* dst_y = picture->y;
+  uint8_t* dst_u = picture->u;
+  uint8_t* dst_v = picture->v;
+  const fixed_t* const best_uv_base = best_uv;
   const int w = (picture->width + 1) & ~1;
   const int h = (picture->height + 1) & ~1;
   const int uv_w = w >> 1;
   const int uv_h = h >> 1;
-  for (j = 0; j < picture->height; ++j) {
+  for (best_uv = best_uv_base, j = 0; j < picture->height; ++j) {
     for (i = 0; i < picture->width; ++i) {
-      const int off = 3 * ((i >> 1) + (j >> 1) * uv_w);
-      const int off2 = i + j * picture->y_stride;
-      const int W = best_y[i + j * w];
-      const int r = best_uv[off + 0] + W;
-      const int g = best_uv[off + 1] + W;
-      const int b = best_uv[off + 2] + W;
-      picture->y[off2] = ConvertRGBToY(r, g, b);
+      const int off = (i >> 1);
+      const int W = best_y[i];
+      const int r = best_uv[off + 0 * uv_w] + W;
+      const int g = best_uv[off + 1 * uv_w] + W;
+      const int b = best_uv[off + 2 * uv_w] + W;
+      dst_y[i] = ConvertRGBToY(r, g, b);
     }
+    best_y += w;
+    best_uv += (j & 1) * 3 * uv_w;
+    dst_y += picture->y_stride;
   }
-  for (j = 0; j < uv_h; ++j) {
-    uint8_t* const dst_u = picture->u + j * picture->uv_stride;
-    uint8_t* const dst_v = picture->v + j * picture->uv_stride;
+  for (best_uv = best_uv_base, j = 0; j < uv_h; ++j) {
     for (i = 0; i < uv_w; ++i) {
-      const int off = 3 * (i + j * uv_w);
-      const int r = best_uv[off + 0];
-      const int g = best_uv[off + 1];
-      const int b = best_uv[off + 2];
+      const int off = i;
+      const int r = best_uv[off + 0 * uv_w];
+      const int g = best_uv[off + 1 * uv_w];
+      const int b = best_uv[off + 2 * uv_w];
       dst_u[i] = ConvertRGBToU(r, g, b);
       dst_v[i] = ConvertRGBToV(r, g, b);
     }
+    best_uv += 3 * uv_w;
+    dst_u += picture->uv_stride;
+    dst_v += picture->uv_stride;
   }
   return 1;
 }
@@ -420,9 +431,9 @@ static int ConvertWRGBToYUV(const fixed_y_t* const best_y,
 
 #define SAFE_ALLOC(W, H, T) ((T*)WebPSafeMalloc((W) * (H), sizeof(T)))
 
-static int PreprocessARGB(const uint8_t* const r_ptr,
-                          const uint8_t* const g_ptr,
-                          const uint8_t* const b_ptr,
+static int PreprocessARGB(const uint8_t* r_ptr,
+                          const uint8_t* g_ptr,
+                          const uint8_t* b_ptr,
                           int step, int rgb_stride,
                           WebPPicture* const picture) {
   // we expand the right/bottom border if needed
@@ -430,25 +441,27 @@ static int PreprocessARGB(const uint8_t* const r_ptr,
   const int h = (picture->height + 1) & ~1;
   const int uv_w = w >> 1;
   const int uv_h = h >> 1;
-  int i, j, iter;
+  uint64_t prev_diff_y_sum = ~0;
+  int j, iter;
 
   // TODO(skal): allocate one big memory chunk. But for now, it's easier
   // for valgrind debugging to have several chunks.
   fixed_y_t* const tmp_buffer = SAFE_ALLOC(w * 3, 2, fixed_y_t);   // scratch
-  fixed_y_t* const best_y = SAFE_ALLOC(w, h, fixed_y_t);
-  fixed_y_t* const target_y = SAFE_ALLOC(w, h, fixed_y_t);
+  fixed_y_t* const best_y_base = SAFE_ALLOC(w, h, fixed_y_t);
+  fixed_y_t* const target_y_base = SAFE_ALLOC(w, h, fixed_y_t);
   fixed_y_t* const best_rgb_y = SAFE_ALLOC(w, 2, fixed_y_t);
-  fixed_t* const best_uv = SAFE_ALLOC(uv_w * 3, uv_h, fixed_t);
-  fixed_t* const target_uv = SAFE_ALLOC(uv_w * 3, uv_h, fixed_t);
+  fixed_t* const best_uv_base = SAFE_ALLOC(uv_w * 3, uv_h, fixed_t);
+  fixed_t* const target_uv_base = SAFE_ALLOC(uv_w * 3, uv_h, fixed_t);
   fixed_t* const best_rgb_uv = SAFE_ALLOC(uv_w * 3, 1, fixed_t);
+  fixed_y_t* best_y = best_y_base;
+  fixed_y_t* target_y = target_y_base;
+  fixed_t* best_uv = best_uv_base;
+  fixed_t* target_uv = target_uv_base;
+  const uint64_t diff_y_threshold = (uint64_t)(3.0 * w * h);
   int ok;
-  int diff_sum = 0;
-  const int first_diff_threshold = (int)(2.5 * w * h);
-  const int min_improvement = 5;   // stop if improvement is below this %
-  const int min_first_improvement = 80;
 
-  if (best_y == NULL || best_uv == NULL ||
-      target_y == NULL || target_uv == NULL ||
+  if (best_y_base == NULL || best_uv_base == NULL ||
+      target_y_base == NULL || target_uv_base == NULL ||
       best_rgb_y == NULL || best_rgb_uv == NULL ||
       tmp_buffer == NULL) {
     ok = WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
@@ -457,101 +470,86 @@ static int PreprocessARGB(const uint8_t* const r_ptr,
   assert(picture->width >= kMinDimensionIterativeConversion);
   assert(picture->height >= kMinDimensionIterativeConversion);
 
+  WebPInitConvertARGBToYUV();
+
   // Import RGB samples to W/RGB representation.
   for (j = 0; j < picture->height; j += 2) {
     const int is_last_row = (j == picture->height - 1);
-    fixed_y_t* const src1 = tmp_buffer;
+    fixed_y_t* const src1 = tmp_buffer + 0 * w;
     fixed_y_t* const src2 = tmp_buffer + 3 * w;
-    const int off1 = j * rgb_stride;
-    const int off2 = off1 + rgb_stride;
-    const int uv_off = (j >> 1) * 3 * uv_w;
-    fixed_y_t* const dst_y = best_y + j * w;
 
     // prepare two rows of input
-    ImportOneRow(r_ptr + off1, g_ptr + off1, b_ptr + off1,
-                 step, picture->width, src1);
+    ImportOneRow(r_ptr, g_ptr, b_ptr, step, picture->width, src1);
     if (!is_last_row) {
-      ImportOneRow(r_ptr + off2, g_ptr + off2, b_ptr + off2,
+      ImportOneRow(r_ptr + rgb_stride, g_ptr + rgb_stride, b_ptr + rgb_stride,
                    step, picture->width, src2);
     } else {
       memcpy(src2, src1, 3 * w * sizeof(*src2));
     }
-    UpdateW(src1, target_y + (j + 0) * w, w);
-    UpdateW(src2, target_y + (j + 1) * w, w);
-    diff_sum += UpdateChroma(src1, src2, target_uv + uv_off, dst_y, uv_w);
-    memcpy(best_uv + uv_off, target_uv + uv_off, 3 * uv_w * sizeof(*best_uv));
-    memcpy(dst_y + w, dst_y, w * sizeof(*dst_y));
+    StoreGray(src1, best_y + 0, w);
+    StoreGray(src2, best_y + w, w);
+
+    UpdateW(src1, target_y, w);
+    UpdateW(src2, target_y + w, w);
+    UpdateChroma(src1, src2, target_uv, uv_w);
+    memcpy(best_uv, target_uv, 3 * uv_w * sizeof(*best_uv));
+    best_y += 2 * w;
+    best_uv += 3 * uv_w;
+    target_y += 2 * w;
+    target_uv += 3 * uv_w;
+    r_ptr += 2 * rgb_stride;
+    g_ptr += 2 * rgb_stride;
+    b_ptr += 2 * rgb_stride;
   }
 
   // Iterate and resolve clipping conflicts.
   for (iter = 0; iter < kNumIterations; ++iter) {
-    int k;
-    const fixed_t* cur_uv = best_uv;
-    const fixed_t* prev_uv = best_uv;
-    const int old_diff_sum = diff_sum;
-    diff_sum = 0;
+    const fixed_t* cur_uv = best_uv_base;
+    const fixed_t* prev_uv = best_uv_base;
+    uint64_t diff_y_sum = 0;
+
+    best_y = best_y_base;
+    best_uv = best_uv_base;
+    target_y = target_y_base;
+    target_uv = target_uv_base;
     for (j = 0; j < h; j += 2) {
-      fixed_y_t* const src1 = tmp_buffer;
+      fixed_y_t* const src1 = tmp_buffer + 0 * w;
       fixed_y_t* const src2 = tmp_buffer + 3 * w;
       {
         const fixed_t* const next_uv = cur_uv + ((j < h - 2) ? 3 * uv_w : 0);
-        InterpolateTwoRows(best_y + j * w, prev_uv, cur_uv, next_uv,
-                           w, src1, src2);
+        InterpolateTwoRows(best_y, prev_uv, cur_uv, next_uv, w, src1, src2);
         prev_uv = cur_uv;
         cur_uv = next_uv;
       }
 
       UpdateW(src1, best_rgb_y + 0 * w, w);
       UpdateW(src2, best_rgb_y + 1 * w, w);
-      diff_sum += UpdateChroma(src1, src2, best_rgb_uv, NULL, uv_w);
+      UpdateChroma(src1, src2, best_rgb_uv, uv_w);
 
       // update two rows of Y and one row of RGB
-      for (i = 0; i < 2 * w; ++i) {
-        const int off = i + j * w;
-        const int diff_y = target_y[off] - best_rgb_y[i];
-        const int new_y = (int)best_y[off] + diff_y;
-        best_y[off] = clip_y(new_y);
-      }
-      for (i = 0; i < uv_w; ++i) {
-        const int off = 3 * (i + (j >> 1) * uv_w);
-        int W;
-        for (k = 0; k <= 2; ++k) {
-          const int diff_uv = (int)target_uv[off + k] - best_rgb_uv[3 * i + k];
-          best_uv[off + k] += diff_uv;
-        }
-        W = RGBToGray(best_uv[off + 0], best_uv[off + 1], best_uv[off + 2]);
-        for (k = 0; k <= 2; ++k) {
-          best_uv[off + k] -= W;
-        }
-      }
+      diff_y_sum += WebPSharpYUVUpdateY(target_y, best_rgb_y, best_y, 2 * w);
+      WebPSharpYUVUpdateRGB(target_uv, best_rgb_uv, best_uv, 3 * uv_w);
+
+      best_y += 2 * w;
+      best_uv += 3 * uv_w;
+      target_y += 2 * w;
+      target_uv += 3 * uv_w;
     }
     // test exit condition
-    if (diff_sum > 0) {
-      const int improvement = 100 * abs(diff_sum - old_diff_sum) / diff_sum;
-      // Check if first iteration gave good result already, without a large
-      // jump of improvement (otherwise it means we need to try few extra
-      // iterations, just to be sure).
-      if (iter == 0 && diff_sum < first_diff_threshold &&
-          improvement < min_first_improvement) {
-        break;
-      }
-      // then, check if improvement is stalling.
-      if (improvement < min_improvement) {
-        break;
-      }
-    } else {
-      break;
+    if (iter > 0) {
+      if (diff_y_sum < diff_y_threshold) break;
+      if (diff_y_sum > prev_diff_y_sum) break;
     }
+    prev_diff_y_sum = diff_y_sum;
   }
-
   // final reconstruction
-  ok = ConvertWRGBToYUV(best_y, best_uv, picture);
+  ok = ConvertWRGBToYUV(best_y_base, best_uv_base, picture);
 
  End:
-  WebPSafeFree(best_y);
-  WebPSafeFree(best_uv);
-  WebPSafeFree(target_y);
-  WebPSafeFree(target_uv);
+  WebPSafeFree(best_y_base);
+  WebPSafeFree(best_uv_base);
+  WebPSafeFree(target_y_base);
+  WebPSafeFree(target_uv_base);
   WebPSafeFree(best_rgb_y);
   WebPSafeFree(best_rgb_uv);
   WebPSafeFree(tmp_buffer);
@@ -830,10 +828,10 @@ static WEBP_INLINE void ConvertRowsToUV(const uint16_t* rgb,
   }
 }
 
-static int ImportYUVAFromRGBA(const uint8_t* const r_ptr,
-                              const uint8_t* const g_ptr,
-                              const uint8_t* const b_ptr,
-                              const uint8_t* const a_ptr,
+static int ImportYUVAFromRGBA(const uint8_t* r_ptr,
+                              const uint8_t* g_ptr,
+                              const uint8_t* b_ptr,
+                              const uint8_t* a_ptr,
                               int step,         // bytes per pixel
                               int rgb_stride,   // bytes per scanline
                               float dithering,
@@ -900,36 +898,34 @@ static int ImportYUVAFromRGBA(const uint8_t* const r_ptr,
     // Downsample Y/U/V planes, two rows at a time
     for (y = 0; y < (height >> 1); ++y) {
       int rows_have_alpha = has_alpha;
-      const int off1 = (2 * y + 0) * rgb_stride;
-      const int off2 = (2 * y + 1) * rgb_stride;
       if (use_dsp) {
         if (is_rgb) {
-          WebPConvertRGB24ToY(r_ptr + off1, dst_y, width);
-          WebPConvertRGB24ToY(r_ptr + off2, dst_y + picture->y_stride, width);
+          WebPConvertRGB24ToY(r_ptr, dst_y, width);
+          WebPConvertRGB24ToY(r_ptr + rgb_stride,
+                              dst_y + picture->y_stride, width);
         } else {
-          WebPConvertBGR24ToY(b_ptr + off1, dst_y, width);
-          WebPConvertBGR24ToY(b_ptr + off2, dst_y + picture->y_stride, width);
+          WebPConvertBGR24ToY(b_ptr, dst_y, width);
+          WebPConvertBGR24ToY(b_ptr + rgb_stride,
+                              dst_y + picture->y_stride, width);
         }
       } else {
-        ConvertRowToY(r_ptr + off1, g_ptr + off1, b_ptr + off1, step,
-                      dst_y, width, rg);
-        ConvertRowToY(r_ptr + off2, g_ptr + off2, b_ptr + off2, step,
+        ConvertRowToY(r_ptr, g_ptr, b_ptr, step, dst_y, width, rg);
+        ConvertRowToY(r_ptr + rgb_stride,
+                      g_ptr + rgb_stride,
+                      b_ptr + rgb_stride, step,
                       dst_y + picture->y_stride, width, rg);
       }
       dst_y += 2 * picture->y_stride;
       if (has_alpha) {
-        rows_have_alpha &= !WebPExtractAlpha(a_ptr + off1, rgb_stride,
-                                             width, 2,
+        rows_have_alpha &= !WebPExtractAlpha(a_ptr, rgb_stride, width, 2,
                                              dst_a, picture->a_stride);
         dst_a += 2 * picture->a_stride;
       }
       // Collect averaged R/G/B(/A)
       if (!rows_have_alpha) {
-        AccumulateRGB(r_ptr + off1, g_ptr + off1, b_ptr + off1,
-                      step, rgb_stride, tmp_rgb, width);
+        AccumulateRGB(r_ptr, g_ptr, b_ptr, step, rgb_stride, tmp_rgb, width);
       } else {
-        AccumulateRGBA(r_ptr + off1, g_ptr + off1, b_ptr + off1, a_ptr + off1,
-                       rgb_stride, tmp_rgb, width);
+        AccumulateRGBA(r_ptr, g_ptr, b_ptr, a_ptr, rgb_stride, tmp_rgb, width);
       }
       // Convert to U/V
       if (rg == NULL) {
@@ -939,31 +935,33 @@ static int ImportYUVAFromRGBA(const uint8_t* const r_ptr,
       }
       dst_u += picture->uv_stride;
       dst_v += picture->uv_stride;
+      r_ptr += 2 * rgb_stride;
+      b_ptr += 2 * rgb_stride;
+      g_ptr += 2 * rgb_stride;
+      if (has_alpha) a_ptr += 2 * rgb_stride;
     }
     if (height & 1) {    // extra last row
-      const int off = 2 * y * rgb_stride;
       int row_has_alpha = has_alpha;
       if (use_dsp) {
         if (r_ptr < b_ptr) {
-          WebPConvertRGB24ToY(r_ptr + off, dst_y, width);
+          WebPConvertRGB24ToY(r_ptr, dst_y, width);
         } else {
-          WebPConvertBGR24ToY(b_ptr + off, dst_y, width);
+          WebPConvertBGR24ToY(b_ptr, dst_y, width);
         }
       } else {
-        ConvertRowToY(r_ptr + off, g_ptr + off, b_ptr + off, step,
-                      dst_y, width, rg);
+        ConvertRowToY(r_ptr, g_ptr, b_ptr, step, dst_y, width, rg);
       }
       if (row_has_alpha) {
-        row_has_alpha &= !WebPExtractAlpha(a_ptr + off, 0, width, 1, dst_a, 0);
+        row_has_alpha &= !WebPExtractAlpha(a_ptr, 0, width, 1, dst_a, 0);
       }
       // Collect averaged R/G/B(/A)
       if (!row_has_alpha) {
         // Collect averaged R/G/B
-        AccumulateRGB(r_ptr + off, g_ptr + off, b_ptr + off,
-                      step, /* rgb_stride = */ 0, tmp_rgb, width);
+        AccumulateRGB(r_ptr, g_ptr, b_ptr, step, /* rgb_stride = */ 0,
+                      tmp_rgb, width);
       } else {
-        AccumulateRGBA(r_ptr + off, g_ptr + off, b_ptr + off, a_ptr + off,
-                       /* rgb_stride = */ 0, tmp_rgb, width);
+        AccumulateRGBA(r_ptr, g_ptr, b_ptr, a_ptr, /* rgb_stride = */ 0,
+                       tmp_rgb, width);
       }
       if (rg == NULL) {
         WebPConvertRGBA32ToUV(tmp_rgb, dst_u, dst_v, uv_width);
@@ -1013,9 +1011,13 @@ int WebPPictureARGBToYUVA(WebPPicture* picture, WebPEncCSP colorspace) {
   return PictureARGBToYUVA(picture, colorspace, 0.f, 0);
 }
 
-int WebPPictureSmartARGBToYUVA(WebPPicture* picture) {
+int WebPPictureSharpARGBToYUVA(WebPPicture* picture) {
   return PictureARGBToYUVA(picture, WEBP_YUV420, 0.f, 1);
 }
+// for backward compatibility
+int WebPPictureSmartARGBToYUVA(WebPPicture* picture) {
+  return WebPPictureSharpARGBToYUVA(picture);
+}
 
 //------------------------------------------------------------------------------
 // call for YUVA -> ARGB conversion
@@ -1086,10 +1088,10 @@ static int Import(WebPPicture* const picture,
                   const uint8_t* const rgb, int rgb_stride,
                   int step, int swap_rb, int import_alpha) {
   int y;
-  const uint8_t* const r_ptr = rgb + (swap_rb ? 2 : 0);
-  const uint8_t* const g_ptr = rgb + 1;
-  const uint8_t* const b_ptr = rgb + (swap_rb ? 0 : 2);
-  const uint8_t* const a_ptr = import_alpha ? rgb + 3 : NULL;
+  const uint8_t* r_ptr = rgb + (swap_rb ? 2 : 0);
+  const uint8_t* g_ptr = rgb + 1;
+  const uint8_t* b_ptr = rgb + (swap_rb ? 0 : 2);
+  const uint8_t* a_ptr = import_alpha ? rgb + 3 : NULL;
   const int width = picture->width;
   const int height = picture->height;
 
@@ -1102,20 +1104,25 @@ static int Import(WebPPicture* const picture,
   VP8EncDspARGBInit();
 
   if (import_alpha) {
+    uint32_t* dst = picture->argb;
     assert(step == 4);
     for (y = 0; y < height; ++y) {
-      uint32_t* const dst = &picture->argb[y * picture->argb_stride];
-      const int offset = y * rgb_stride;
-      VP8PackARGB(a_ptr + offset, r_ptr + offset, g_ptr + offset,
-                  b_ptr + offset, width, dst);
+      VP8PackARGB(a_ptr, r_ptr, g_ptr, b_ptr, width, dst);
+      a_ptr += rgb_stride;
+      r_ptr += rgb_stride;
+      g_ptr += rgb_stride;
+      b_ptr += rgb_stride;
+      dst += picture->argb_stride;
     }
   } else {
+    uint32_t* dst = picture->argb;
     assert(step >= 3);
     for (y = 0; y < height; ++y) {
-      uint32_t* const dst = &picture->argb[y * picture->argb_stride];
-      const int offset = y * rgb_stride;
-      VP8PackRGB(r_ptr + offset, g_ptr + offset, b_ptr + offset,
-                 width, step, dst);
+      VP8PackRGB(r_ptr, g_ptr, b_ptr, width, step, dst);
+      r_ptr += rgb_stride;
+      g_ptr += rgb_stride;
+      b_ptr += rgb_stride;
+      dst += picture->argb_stride;
     }
   }
   return 1;
diff --git a/src/3rdparty/libwebp/src/enc/picture.c b/src/3rdparty/libwebp/src/enc/picture_enc.c
index d9befbc..dfa6651 100644
--- a/src/3rdparty/libwebp/src/enc/picture.c
+++ b/src/3rdparty/libwebp/src/enc/picture_enc.c
@@ -14,7 +14,7 @@
 #include <assert.h>
 #include <stdlib.h>
 
-#include "./vp8enci.h"
+#include "./vp8i_enc.h"
 #include "../dsp/dsp.h"
 #include "../utils/utils.h"
 
@@ -88,8 +88,9 @@ int WebPPictureAllocARGB(WebPPicture* const picture, int width, int height) {
 }
 
 int WebPPictureAllocYUVA(WebPPicture* const picture, int width, int height) {
-  const WebPEncCSP uv_csp = picture->colorspace & WEBP_CSP_UV_MASK;
-  const int has_alpha = picture->colorspace & WEBP_CSP_ALPHA_BIT;
+  const WebPEncCSP uv_csp =
+      (WebPEncCSP)((int)picture->colorspace & WEBP_CSP_UV_MASK);
+  const int has_alpha = (int)picture->colorspace & WEBP_CSP_ALPHA_BIT;
   const int y_stride = width;
   const int uv_width = (width + 1) >> 1;
   const int uv_height = (height + 1) >> 1;
diff --git a/src/3rdparty/libwebp/src/enc/picture_psnr.c b/src/3rdparty/libwebp/src/enc/picture_psnr.c
deleted file mode 100644
index 81ab1b5..0000000
--- a/src/3rdparty/libwebp/src/enc/picture_psnr.c
+++ /dev/null
@@ -1,177 +0,0 @@
-// Copyright 2014 Google Inc. All Rights Reserved.
-//
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
-// -----------------------------------------------------------------------------
-//
-// WebPPicture tools for measuring distortion
-//
-// Author: Skal (pascal.massimino@gmail.com)
-
-#include <math.h>
-#include <stdlib.h>
-
-#include "./vp8enci.h"
-#include "../utils/utils.h"
-
-//------------------------------------------------------------------------------
-// local-min distortion
-//
-// For every pixel in the *reference* picture, we search for the local best
-// match in the compressed image. This is not a symmetrical measure.
-
-#define RADIUS 2  // search radius. Shouldn't be too large.
-
-static void AccumulateLSIM(const uint8_t* src, int src_stride,
-                           const uint8_t* ref, int ref_stride,
-                           int w, int h, VP8DistoStats* stats) {
-  int x, y;
-  double total_sse = 0.;
-  for (y = 0; y < h; ++y) {
-    const int y_0 = (y - RADIUS < 0) ? 0 : y - RADIUS;
-    const int y_1 = (y + RADIUS + 1 >= h) ? h : y + RADIUS + 1;
-    for (x = 0; x < w; ++x) {
-      const int x_0 = (x - RADIUS < 0) ? 0 : x - RADIUS;
-      const int x_1 = (x + RADIUS + 1 >= w) ? w : x + RADIUS + 1;
-      double best_sse = 255. * 255.;
-      const double value = (double)ref[y * ref_stride + x];
-      int i, j;
-      for (j = y_0; j < y_1; ++j) {
-        const uint8_t* const s = src + j * src_stride;
-        for (i = x_0; i < x_1; ++i) {
-          const double diff = s[i] - value;
-          const double sse = diff * diff;
-          if (sse < best_sse) best_sse = sse;
-        }
-      }
-      total_sse += best_sse;
-    }
-  }
-  stats->w = w * h;
-  stats->xm = 0;
-  stats->ym = 0;
-  stats->xxm = total_sse;
-  stats->yym = 0;
-  stats->xxm = 0;
-}
-#undef RADIUS
-
-//------------------------------------------------------------------------------
-// Distortion
-
-// Max value returned in case of exact similarity.
-static const double kMinDistortion_dB = 99.;
-static float GetPSNR(const double v) {
-  return (float)((v > 0.) ? -4.3429448 * log(v / (255 * 255.))
-                          : kMinDistortion_dB);
-}
-
-int WebPPictureDistortion(const WebPPicture* src, const WebPPicture* ref,
-                          int type, float result[5]) {
-  VP8DistoStats stats[5];
-  int w, h;
-
-  memset(stats, 0, sizeof(stats));
-
-  VP8SSIMDspInit();
-
-  if (src == NULL || ref == NULL ||
-      src->width != ref->width || src->height != ref->height ||
-      src->use_argb != ref->use_argb || result == NULL) {
-    return 0;
-  }
-  w = src->width;
-  h = src->height;
-
-  if (src->use_argb == 1) {
-    if (src->argb == NULL || ref->argb == NULL) {
-      return 0;
-    } else {
-      int i, j, c;
-      uint8_t* tmp1, *tmp2;
-      uint8_t* const tmp_plane =
-          (uint8_t*)WebPSafeMalloc(2ULL * w * h, sizeof(*tmp_plane));
-      if (tmp_plane == NULL) return 0;
-      tmp1 = tmp_plane;
-      tmp2 = tmp_plane + w * h;
-      for (c = 0; c < 4; ++c) {
-        for (j = 0; j < h; ++j) {
-          for (i = 0; i < w; ++i) {
-            tmp1[j * w + i] = src->argb[i + j * src->argb_stride] >> (c * 8);
-            tmp2[j * w + i] = ref->argb[i + j * ref->argb_stride] >> (c * 8);
-          }
-        }
-        if (type >= 2) {
-          AccumulateLSIM(tmp1, w, tmp2, w, w, h, &stats[c]);
-        } else {
-          VP8SSIMAccumulatePlane(tmp1, w, tmp2, w, w, h, &stats[c]);
-        }
-      }
-      free(tmp_plane);
-    }
-  } else {
-    int has_alpha, uv_w, uv_h;
-    if (src->y == NULL || ref->y == NULL ||
-        src->u == NULL || ref->u == NULL ||
-        src->v == NULL || ref->v == NULL) {
-      return 0;
-    }
-    has_alpha = !!(src->colorspace & WEBP_CSP_ALPHA_BIT);
-    if (has_alpha != !!(ref->colorspace & WEBP_CSP_ALPHA_BIT) ||
-        (has_alpha && (src->a == NULL || ref->a == NULL))) {
-      return 0;
-    }
-
-    uv_w = (src->width + 1) >> 1;
-    uv_h = (src->height + 1) >> 1;
-    if (type >= 2) {
-      AccumulateLSIM(src->y, src->y_stride, ref->y, ref->y_stride,
-                     w, h, &stats[0]);
-      AccumulateLSIM(src->u, src->uv_stride, ref->u, ref->uv_stride,
-                     uv_w, uv_h, &stats[1]);
-      AccumulateLSIM(src->v, src->uv_stride, ref->v, ref->uv_stride,
-                     uv_w, uv_h, &stats[2]);
-      if (has_alpha) {
-        AccumulateLSIM(src->a, src->a_stride, ref->a, ref->a_stride,
-                       w, h, &stats[3]);
-      }
-    } else {
-      VP8SSIMAccumulatePlane(src->y, src->y_stride,
-                             ref->y, ref->y_stride,
-                             w, h, &stats[0]);
-      VP8SSIMAccumulatePlane(src->u, src->uv_stride,
-                             ref->u, ref->uv_stride,
-                             uv_w, uv_h, &stats[1]);
-      VP8SSIMAccumulatePlane(src->v, src->uv_stride,
-                             ref->v, ref->uv_stride,
-                             uv_w, uv_h, &stats[2]);
-      if (has_alpha) {
-        VP8SSIMAccumulatePlane(src->a, src->a_stride,
-                               ref->a, ref->a_stride,
-                               w, h, &stats[3]);
-      }
-    }
-  }
-  // Final stat calculations.
-  {
-    int c;
-    for (c = 0; c <= 4; ++c) {
-      if (type == 1) {
-        const double v = VP8SSIMGet(&stats[c]);
-        result[c] = (float)((v < 1.) ? -10.0 * log10(1. - v)
-                                     : kMinDistortion_dB);
-      } else {
-        const double v = VP8SSIMGetSquaredError(&stats[c]);
-        result[c] = GetPSNR(v);
-      }
-      // Accumulate forward
-      if (c < 4) VP8SSIMAddStats(&stats[c], &stats[4]);
-    }
-  }
-  return 1;
-}
-
-//------------------------------------------------------------------------------
diff --git a/src/3rdparty/libwebp/src/enc/picture_psnr_enc.c b/src/3rdparty/libwebp/src/enc/picture_psnr_enc.c
new file mode 100644
index 0000000..9c0b229
--- /dev/null
+++ b/src/3rdparty/libwebp/src/enc/picture_psnr_enc.c
@@ -0,0 +1,213 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// WebPPicture tools for measuring distortion
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <math.h>
+#include <stdlib.h>
+
+#include "./vp8i_enc.h"
+#include "../utils/utils.h"
+
+typedef double (*AccumulateFunc)(const uint8_t* src, int src_stride,
+                                 const uint8_t* ref, int ref_stride,
+                                 int w, int h);
+
+//------------------------------------------------------------------------------
+// local-min distortion
+//
+// For every pixel in the *reference* picture, we search for the local best
+// match in the compressed image. This is not a symmetrical measure.
+
+#define RADIUS 2  // search radius. Shouldn't be too large.
+
+static double AccumulateLSIM(const uint8_t* src, int src_stride,
+                             const uint8_t* ref, int ref_stride,
+                             int w, int h) {
+  int x, y;
+  double total_sse = 0.;
+  for (y = 0; y < h; ++y) {
+    const int y_0 = (y - RADIUS < 0) ? 0 : y - RADIUS;
+    const int y_1 = (y + RADIUS + 1 >= h) ? h : y + RADIUS + 1;
+    for (x = 0; x < w; ++x) {
+      const int x_0 = (x - RADIUS < 0) ? 0 : x - RADIUS;
+      const int x_1 = (x + RADIUS + 1 >= w) ? w : x + RADIUS + 1;
+      double best_sse = 255. * 255.;
+      const double value = (double)ref[y * ref_stride + x];
+      int i, j;
+      for (j = y_0; j < y_1; ++j) {
+        const uint8_t* const s = src + j * src_stride;
+        for (i = x_0; i < x_1; ++i) {
+          const double diff = s[i] - value;
+          const double sse = diff * diff;
+          if (sse < best_sse) best_sse = sse;
+        }
+      }
+      total_sse += best_sse;
+    }
+  }
+  return total_sse;
+}
+#undef RADIUS
+
+static double AccumulateSSE(const uint8_t* src, int src_stride,
+                            const uint8_t* ref, int ref_stride,
+                            int w, int h) {
+  int y;
+  double total_sse = 0.;
+  for (y = 0; y < h; ++y) {
+    total_sse += VP8AccumulateSSE(src, ref, w);
+    src += src_stride;
+    ref += ref_stride;
+  }
+  return total_sse;
+}
+
+//------------------------------------------------------------------------------
+
+static double AccumulateSSIM(const uint8_t* src, int src_stride,
+                             const uint8_t* ref, int ref_stride,
+                             int w, int h) {
+  const int w0 = (w < VP8_SSIM_KERNEL) ? w : VP8_SSIM_KERNEL;
+  const int w1 = w - VP8_SSIM_KERNEL - 1;
+  const int h0 = (h < VP8_SSIM_KERNEL) ? h : VP8_SSIM_KERNEL;
+  const int h1 = h - VP8_SSIM_KERNEL - 1;
+  int x, y;
+  double sum = 0.;
+  for (y = 0; y < h0; ++y) {
+    for (x = 0; x < w; ++x) {
+      sum += VP8SSIMGetClipped(src, src_stride, ref, ref_stride, x, y, w, h);
+    }
+  }
+  for (; y < h1; ++y) {
+    for (x = 0; x < w0; ++x) {
+      sum += VP8SSIMGetClipped(src, src_stride, ref, ref_stride, x, y, w, h);
+    }
+    for (; x < w1; ++x) {
+      const int off1 = x - VP8_SSIM_KERNEL + (y - VP8_SSIM_KERNEL) * src_stride;
+      const int off2 = x - VP8_SSIM_KERNEL + (y - VP8_SSIM_KERNEL) * ref_stride;
+      sum += VP8SSIMGet(src + off1, src_stride, ref + off2, ref_stride);
+    }
+    for (; x < w; ++x) {
+      sum += VP8SSIMGetClipped(src, src_stride, ref, ref_stride, x, y, w, h);
+    }
+  }
+  for (; y < h; ++y) {
+    for (x = 0; x < w; ++x) {
+      sum += VP8SSIMGetClipped(src, src_stride, ref, ref_stride, x, y, w, h);
+    }
+  }
+  return sum;
+}
+
+//------------------------------------------------------------------------------
+// Distortion
+
+// Max value returned in case of exact similarity.
+static const double kMinDistortion_dB = 99.;
+
+static double GetPSNR(double v, double size) {
+  return (v > 0. && size > 0.) ? -4.3429448 * log(v / (size * 255 * 255.))
+                               : kMinDistortion_dB;
+}
+
+static double GetLogSSIM(double v, double size) {
+  v = (size > 0.) ? v / size : 1.;
+  return (v < 1.) ? -10.0 * log10(1. - v) : kMinDistortion_dB;
+}
+
+int WebPPlaneDistortion(const uint8_t* src, size_t src_stride,
+                        const uint8_t* ref, size_t ref_stride,
+                        int width, int height, size_t x_step,
+                        int type, float* distortion, float* result) {
+  uint8_t* allocated = NULL;
+  const AccumulateFunc metric = (type == 0) ? AccumulateSSE :
+                                (type == 1) ? AccumulateSSIM :
+                                              AccumulateLSIM;
+  if (src == NULL || ref == NULL ||
+      src_stride < x_step * width || ref_stride < x_step * width ||
+      result == NULL || distortion == NULL) {
+    return 0;
+  }
+
+  VP8SSIMDspInit();
+  if (x_step != 1) {   // extract a packed plane if needed
+    int x, y;
+    uint8_t* tmp1;
+    uint8_t* tmp2;
+    allocated =
+        (uint8_t*)WebPSafeMalloc(2ULL * width * height, sizeof(*allocated));
+    if (allocated == NULL) return 0;
+    tmp1 = allocated;
+    tmp2 = tmp1 + (size_t)width * height;
+    for (y = 0; y < height; ++y) {
+      for (x = 0; x < width; ++x) {
+        tmp1[x + y * width] = src[x * x_step + y * src_stride];
+        tmp2[x + y * width] = ref[x * x_step + y * ref_stride];
+      }
+    }
+    src = tmp1;
+    ref = tmp2;
+  }
+  *distortion = (float)metric(src, width, ref, width, width, height);
+  WebPSafeFree(allocated);
+
+  *result = (type == 1) ? (float)GetLogSSIM(*distortion, (double)width * height)
+                        : (float)GetPSNR(*distortion, (double)width * height);
+  return 1;
+}
+
+int WebPPictureDistortion(const WebPPicture* src, const WebPPicture* ref,
+                          int type, float results[5]) {
+  int w, h, c;
+  int ok = 0;
+  WebPPicture p0, p1;
+  double total_size = 0., total_distortion = 0.;
+  if (src == NULL || ref == NULL ||
+      src->width != ref->width || src->height != ref->height ||
+      results == NULL) {
+    return 0;
+  }
+
+  VP8SSIMDspInit();
+  if (!WebPPictureInit(&p0) || !WebPPictureInit(&p1)) return 0;
+  w = src->width;
+  h = src->height;
+  if (!WebPPictureView(src, 0, 0, w, h, &p0)) goto Error;
+  if (!WebPPictureView(ref, 0, 0, w, h, &p1)) goto Error;
+
+  // We always measure distortion in ARGB space.
+  if (p0.use_argb == 0 && !WebPPictureYUVAToARGB(&p0)) goto Error;
+  if (p1.use_argb == 0 && !WebPPictureYUVAToARGB(&p1)) goto Error;
+  for (c = 0; c < 4; ++c) {
+    float distortion;
+    const size_t stride0 = 4 * (size_t)p0.argb_stride;
+    const size_t stride1 = 4 * (size_t)p1.argb_stride;
+    if (!WebPPlaneDistortion((const uint8_t*)p0.argb + c, stride0,
+                             (const uint8_t*)p1.argb + c, stride1,
+                             w, h, 4, type, &distortion, results + c)) {
+      goto Error;
+    }
+    total_distortion += distortion;
+    total_size += w * h;
+  }
+
+  results[4] = (type == 1) ? (float)GetLogSSIM(total_distortion, total_size)
+                           : (float)GetPSNR(total_distortion, total_size);
+  ok = 1;
+
+ Error:
+  WebPPictureFree(&p0);
+  WebPPictureFree(&p1);
+  return ok;
+}
+
+//------------------------------------------------------------------------------
diff --git a/src/3rdparty/libwebp/src/enc/picture_rescale.c b/src/3rdparty/libwebp/src/enc/picture_rescale_enc.c
index 9f19e8e..0b7181c 100644
--- a/src/3rdparty/libwebp/src/enc/picture_rescale.c
+++ b/src/3rdparty/libwebp/src/enc/picture_rescale_enc.c
@@ -14,8 +14,8 @@
 #include <assert.h>
 #include <stdlib.h>
 
-#include "./vp8enci.h"
-#include "../utils/rescaler.h"
+#include "./vp8i_enc.h"
+#include "../utils/rescaler_utils.h"
 #include "../utils/utils.h"
 
 #define HALVE(x) (((x) + 1) >> 1)
diff --git a/src/3rdparty/libwebp/src/enc/picture_tools.c b/src/3rdparty/libwebp/src/enc/picture_tools_enc.c
index bf97af8..895df51 100644
--- a/src/3rdparty/libwebp/src/enc/picture_tools.c
+++ b/src/3rdparty/libwebp/src/enc/picture_tools_enc.c
@@ -13,7 +13,7 @@
 
 #include <assert.h>
 
-#include "./vp8enci.h"
+#include "./vp8i_enc.h"
 #include "../dsp/yuv.h"
 
 static WEBP_INLINE uint32_t MakeARGB32(int r, int g, int b) {
diff --git a/src/3rdparty/libwebp/src/enc/predictor_enc.c b/src/3rdparty/libwebp/src/enc/predictor_enc.c
new file mode 100644
index 0000000..0639b74
--- /dev/null
+++ b/src/3rdparty/libwebp/src/enc/predictor_enc.c
@@ -0,0 +1,750 @@
+// Copyright 2016 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Image transform methods for lossless encoder.
+//
+// Authors: Vikas Arora (vikaas.arora@gmail.com)
+//          Jyrki Alakuijala (jyrki@google.com)
+//          Urvang Joshi (urvang@google.com)
+//          Vincent Rabaud (vrabaud@google.com)
+
+#include "../dsp/lossless.h"
+#include "../dsp/lossless_common.h"
+#include "./vp8li_enc.h"
+
+#define MAX_DIFF_COST (1e30f)
+
+static const float kSpatialPredictorBias = 15.f;
+static const int kPredLowEffort = 11;
+static const uint32_t kMaskAlpha = 0xff000000;
+
+// Mostly used to reduce code size + readability
+static WEBP_INLINE int GetMin(int a, int b) { return (a > b) ? b : a; }
+static WEBP_INLINE int GetMax(int a, int b) { return (a < b) ? b : a; }
+
+//------------------------------------------------------------------------------
+// Methods to calculate Entropy (Shannon).
+
+static float PredictionCostSpatial(const int counts[256], int weight_0,
+                                   double exp_val) {
+  const int significant_symbols = 256 >> 4;
+  const double exp_decay_factor = 0.6;
+  double bits = weight_0 * counts[0];
+  int i;
+  for (i = 1; i < significant_symbols; ++i) {
+    bits += exp_val * (counts[i] + counts[256 - i]);
+    exp_val *= exp_decay_factor;
+  }
+  return (float)(-0.1 * bits);
+}
+
+static float PredictionCostSpatialHistogram(const int accumulated[4][256],
+                                            const int tile[4][256]) {
+  int i;
+  double retval = 0;
+  for (i = 0; i < 4; ++i) {
+    const double kExpValue = 0.94;
+    retval += PredictionCostSpatial(tile[i], 1, kExpValue);
+    retval += VP8LCombinedShannonEntropy(tile[i], accumulated[i]);
+  }
+  return (float)retval;
+}
+
+static WEBP_INLINE void UpdateHisto(int histo_argb[4][256], uint32_t argb) {
+  ++histo_argb[0][argb >> 24];
+  ++histo_argb[1][(argb >> 16) & 0xff];
+  ++histo_argb[2][(argb >> 8) & 0xff];
+  ++histo_argb[3][argb & 0xff];
+}
+
+//------------------------------------------------------------------------------
+// Spatial transform functions.
+
+static WEBP_INLINE void PredictBatch(int mode, int x_start, int y,
+                                     int num_pixels, const uint32_t* current,
+                                     const uint32_t* upper, uint32_t* out) {
+  if (x_start == 0) {
+    if (y == 0) {
+      // ARGB_BLACK.
+      VP8LPredictorsSub[0](current, NULL, 1, out);
+    } else {
+      // Top one.
+      VP8LPredictorsSub[2](current, upper, 1, out);
+    }
+    ++x_start;
+    ++out;
+    --num_pixels;
+  }
+  if (y == 0) {
+    // Left one.
+    VP8LPredictorsSub[1](current + x_start, NULL, num_pixels, out);
+  } else {
+    VP8LPredictorsSub[mode](current + x_start, upper + x_start, num_pixels,
+                            out);
+  }
+}
+
+static int MaxDiffBetweenPixels(uint32_t p1, uint32_t p2) {
+  const int diff_a = abs((int)(p1 >> 24) - (int)(p2 >> 24));
+  const int diff_r = abs((int)((p1 >> 16) & 0xff) - (int)((p2 >> 16) & 0xff));
+  const int diff_g = abs((int)((p1 >> 8) & 0xff) - (int)((p2 >> 8) & 0xff));
+  const int diff_b = abs((int)(p1 & 0xff) - (int)(p2 & 0xff));
+  return GetMax(GetMax(diff_a, diff_r), GetMax(diff_g, diff_b));
+}
+
+static int MaxDiffAroundPixel(uint32_t current, uint32_t up, uint32_t down,
+                              uint32_t left, uint32_t right) {
+  const int diff_up = MaxDiffBetweenPixels(current, up);
+  const int diff_down = MaxDiffBetweenPixels(current, down);
+  const int diff_left = MaxDiffBetweenPixels(current, left);
+  const int diff_right = MaxDiffBetweenPixels(current, right);
+  return GetMax(GetMax(diff_up, diff_down), GetMax(diff_left, diff_right));
+}
+
+static uint32_t AddGreenToBlueAndRed(uint32_t argb) {
+  const uint32_t green = (argb >> 8) & 0xff;
+  uint32_t red_blue = argb & 0x00ff00ffu;
+  red_blue += (green << 16) | green;
+  red_blue &= 0x00ff00ffu;
+  return (argb & 0xff00ff00u) | red_blue;
+}
+
+static void MaxDiffsForRow(int width, int stride, const uint32_t* const argb,
+                           uint8_t* const max_diffs, int used_subtract_green) {
+  uint32_t current, up, down, left, right;
+  int x;
+  if (width <= 2) return;
+  current = argb[0];
+  right = argb[1];
+  if (used_subtract_green) {
+    current = AddGreenToBlueAndRed(current);
+    right = AddGreenToBlueAndRed(right);
+  }
+  // max_diffs[0] and max_diffs[width - 1] are never used.
+  for (x = 1; x < width - 1; ++x) {
+    up = argb[-stride + x];
+    down = argb[stride + x];
+    left = current;
+    current = right;
+    right = argb[x + 1];
+    if (used_subtract_green) {
+      up = AddGreenToBlueAndRed(up);
+      down = AddGreenToBlueAndRed(down);
+      right = AddGreenToBlueAndRed(right);
+    }
+    max_diffs[x] = MaxDiffAroundPixel(current, up, down, left, right);
+  }
+}
+
+// Quantize the difference between the actual component value and its prediction
+// to a multiple of quantization, working modulo 256, taking care not to cross
+// a boundary (inclusive upper limit).
+static uint8_t NearLosslessComponent(uint8_t value, uint8_t predict,
+                                     uint8_t boundary, int quantization) {
+  const int residual = (value - predict) & 0xff;
+  const int boundary_residual = (boundary - predict) & 0xff;
+  const int lower = residual & ~(quantization - 1);
+  const int upper = lower + quantization;
+  // Resolve ties towards a value closer to the prediction (i.e. towards lower
+  // if value comes after prediction and towards upper otherwise).
+  const int bias = ((boundary - value) & 0xff) < boundary_residual;
+  if (residual - lower < upper - residual + bias) {
+    // lower is closer to residual than upper.
+    if (residual > boundary_residual && lower <= boundary_residual) {
+      // Halve quantization step to avoid crossing boundary. This midpoint is
+      // on the same side of boundary as residual because midpoint >= residual
+      // (since lower is closer than upper) and residual is above the boundary.
+      return lower + (quantization >> 1);
+    }
+    return lower;
+  } else {
+    // upper is closer to residual than lower.
+    if (residual <= boundary_residual && upper > boundary_residual) {
+      // Halve quantization step to avoid crossing boundary. This midpoint is
+      // on the same side of boundary as residual because midpoint <= residual
+      // (since upper is closer than lower) and residual is below the boundary.
+      return lower + (quantization >> 1);
+    }
+    return upper & 0xff;
+  }
+}
+
+// Quantize every component of the difference between the actual pixel value and
+// its prediction to a multiple of a quantization (a power of 2, not larger than
+// max_quantization which is a power of 2, smaller than max_diff). Take care if
+// value and predict have undergone subtract green, which means that red and
+// blue are represented as offsets from green.
+static uint32_t NearLossless(uint32_t value, uint32_t predict,
+                             int max_quantization, int max_diff,
+                             int used_subtract_green) {
+  int quantization;
+  uint8_t new_green = 0;
+  uint8_t green_diff = 0;
+  uint8_t a, r, g, b;
+  if (max_diff <= 2) {
+    return VP8LSubPixels(value, predict);
+  }
+  quantization = max_quantization;
+  while (quantization >= max_diff) {
+    quantization >>= 1;
+  }
+  if ((value >> 24) == 0 || (value >> 24) == 0xff) {
+    // Preserve transparency of fully transparent or fully opaque pixels.
+    a = ((value >> 24) - (predict >> 24)) & 0xff;
+  } else {
+    a = NearLosslessComponent(value >> 24, predict >> 24, 0xff, quantization);
+  }
+  g = NearLosslessComponent((value >> 8) & 0xff, (predict >> 8) & 0xff, 0xff,
+                            quantization);
+  if (used_subtract_green) {
+    // The green offset will be added to red and blue components during decoding
+    // to obtain the actual red and blue values.
+    new_green = ((predict >> 8) + g) & 0xff;
+    // The amount by which green has been adjusted during quantization. It is
+    // subtracted from red and blue for compensation, to avoid accumulating two
+    // quantization errors in them.
+    green_diff = (new_green - (value >> 8)) & 0xff;
+  }
+  r = NearLosslessComponent(((value >> 16) - green_diff) & 0xff,
+                            (predict >> 16) & 0xff, 0xff - new_green,
+                            quantization);
+  b = NearLosslessComponent((value - green_diff) & 0xff, predict & 0xff,
+                            0xff - new_green, quantization);
+  return ((uint32_t)a << 24) | ((uint32_t)r << 16) | ((uint32_t)g << 8) | b;
+}
+
+// Stores the difference between the pixel and its prediction in "out".
+// In case of a lossy encoding, updates the source image to avoid propagating
+// the deviation further to pixels which depend on the current pixel for their
+// predictions.
+static WEBP_INLINE void GetResidual(
+    int width, int height, uint32_t* const upper_row,
+    uint32_t* const current_row, const uint8_t* const max_diffs, int mode,
+    int x_start, int x_end, int y, int max_quantization, int exact,
+    int used_subtract_green, uint32_t* const out) {
+  if (exact) {
+    PredictBatch(mode, x_start, y, x_end - x_start, current_row, upper_row,
+                 out);
+  } else {
+    const VP8LPredictorFunc pred_func = VP8LPredictors[mode];
+    int x;
+    for (x = x_start; x < x_end; ++x) {
+      uint32_t predict;
+      uint32_t residual;
+      if (y == 0) {
+        predict = (x == 0) ? ARGB_BLACK : current_row[x - 1];  // Left.
+      } else if (x == 0) {
+        predict = upper_row[x];  // Top.
+      } else {
+        predict = pred_func(current_row[x - 1], upper_row + x);
+      }
+      if (max_quantization == 1 || mode == 0 || y == 0 || y == height - 1 ||
+          x == 0 || x == width - 1) {
+        residual = VP8LSubPixels(current_row[x], predict);
+      } else {
+        residual = NearLossless(current_row[x], predict, max_quantization,
+                                max_diffs[x], used_subtract_green);
+        // Update the source image.
+        current_row[x] = VP8LAddPixels(predict, residual);
+        // x is never 0 here so we do not need to update upper_row like below.
+      }
+      if ((current_row[x] & kMaskAlpha) == 0) {
+        // If alpha is 0, cleanup RGB. We can choose the RGB values of the
+        // residual for best compression. The prediction of alpha itself can be
+        // non-zero and must be kept though. We choose RGB of the residual to be
+        // 0.
+        residual &= kMaskAlpha;
+        // Update the source image.
+        current_row[x] = predict & ~kMaskAlpha;
+        // The prediction for the rightmost pixel in a row uses the leftmost
+        // pixel
+        // in that row as its top-right context pixel. Hence if we change the
+        // leftmost pixel of current_row, the corresponding change must be
+        // applied
+        // to upper_row as well where top-right context is being read from.
+        if (x == 0 && y != 0) upper_row[width] = current_row[0];
+      }
+      out[x - x_start] = residual;
+    }
+  }
+}
+
+// Returns best predictor and updates the accumulated histogram.
+// If max_quantization > 1, assumes that near lossless processing will be
+// applied, quantizing residuals to multiples of quantization levels up to
+// max_quantization (the actual quantization level depends on smoothness near
+// the given pixel).
+static int GetBestPredictorForTile(int width, int height,
+                                   int tile_x, int tile_y, int bits,
+                                   int accumulated[4][256],
+                                   uint32_t* const argb_scratch,
+                                   const uint32_t* const argb,
+                                   int max_quantization,
+                                   int exact, int used_subtract_green,
+                                   const uint32_t* const modes) {
+  const int kNumPredModes = 14;
+  const int start_x = tile_x << bits;
+  const int start_y = tile_y << bits;
+  const int tile_size = 1 << bits;
+  const int max_y = GetMin(tile_size, height - start_y);
+  const int max_x = GetMin(tile_size, width - start_x);
+  // Whether there exist columns just outside the tile.
+  const int have_left = (start_x > 0);
+  const int have_right = (max_x < width - start_x);
+  // Position and size of the strip covering the tile and adjacent columns if
+  // they exist.
+  const int context_start_x = start_x - have_left;
+  const int context_width = max_x + have_left + have_right;
+  const int tiles_per_row = VP8LSubSampleSize(width, bits);
+  // Prediction modes of the left and above neighbor tiles.
+  const int left_mode = (tile_x > 0) ?
+      (modes[tile_y * tiles_per_row + tile_x - 1] >> 8) & 0xff : 0xff;
+  const int above_mode = (tile_y > 0) ?
+      (modes[(tile_y - 1) * tiles_per_row + tile_x] >> 8) & 0xff : 0xff;
+  // The width of upper_row and current_row is one pixel larger than image width
+  // to allow the top right pixel to point to the leftmost pixel of the next row
+  // when at the right edge.
+  uint32_t* upper_row = argb_scratch;
+  uint32_t* current_row = upper_row + width + 1;
+  uint8_t* const max_diffs = (uint8_t*)(current_row + width + 1);
+  float best_diff = MAX_DIFF_COST;
+  int best_mode = 0;
+  int mode;
+  int histo_stack_1[4][256];
+  int histo_stack_2[4][256];
+  // Need pointers to be able to swap arrays.
+  int (*histo_argb)[256] = histo_stack_1;
+  int (*best_histo)[256] = histo_stack_2;
+  int i, j;
+  uint32_t residuals[1 << MAX_TRANSFORM_BITS];
+  assert(bits <= MAX_TRANSFORM_BITS);
+  assert(max_x <= (1 << MAX_TRANSFORM_BITS));
+
+  for (mode = 0; mode < kNumPredModes; ++mode) {
+    float cur_diff;
+    int relative_y;
+    memset(histo_argb, 0, sizeof(histo_stack_1));
+    if (start_y > 0) {
+      // Read the row above the tile which will become the first upper_row.
+      // Include a pixel to the left if it exists; include a pixel to the right
+      // in all cases (wrapping to the leftmost pixel of the next row if it does
+      // not exist).
+      memcpy(current_row + context_start_x,
+             argb + (start_y - 1) * width + context_start_x,
+             sizeof(*argb) * (max_x + have_left + 1));
+    }
+    for (relative_y = 0; relative_y < max_y; ++relative_y) {
+      const int y = start_y + relative_y;
+      int relative_x;
+      uint32_t* tmp = upper_row;
+      upper_row = current_row;
+      current_row = tmp;
+      // Read current_row. Include a pixel to the left if it exists; include a
+      // pixel to the right in all cases except at the bottom right corner of
+      // the image (wrapping to the leftmost pixel of the next row if it does
+      // not exist in the current row).
+      memcpy(current_row + context_start_x,
+             argb + y * width + context_start_x,
+             sizeof(*argb) * (max_x + have_left + (y + 1 < height)));
+      if (max_quantization > 1 && y >= 1 && y + 1 < height) {
+        MaxDiffsForRow(context_width, width, argb + y * width + context_start_x,
+                       max_diffs + context_start_x, used_subtract_green);
+      }
+
+      GetResidual(width, height, upper_row, current_row, max_diffs, mode,
+                  start_x, start_x + max_x, y, max_quantization, exact,
+                  used_subtract_green, residuals);
+      for (relative_x = 0; relative_x < max_x; ++relative_x) {
+        UpdateHisto(histo_argb, residuals[relative_x]);
+      }
+    }
+    cur_diff = PredictionCostSpatialHistogram(
+        (const int (*)[256])accumulated, (const int (*)[256])histo_argb);
+    // Favor keeping the areas locally similar.
+    if (mode == left_mode) cur_diff -= kSpatialPredictorBias;
+    if (mode == above_mode) cur_diff -= kSpatialPredictorBias;
+
+    if (cur_diff < best_diff) {
+      int (*tmp)[256] = histo_argb;
+      histo_argb = best_histo;
+      best_histo = tmp;
+      best_diff = cur_diff;
+      best_mode = mode;
+    }
+  }
+
+  for (i = 0; i < 4; i++) {
+    for (j = 0; j < 256; j++) {
+      accumulated[i][j] += best_histo[i][j];
+    }
+  }
+
+  return best_mode;
+}
+
+// Converts pixels of the image to residuals with respect to predictions.
+// If max_quantization > 1, applies near lossless processing, quantizing
+// residuals to multiples of quantization levels up to max_quantization
+// (the actual quantization level depends on smoothness near the given pixel).
+static void CopyImageWithPrediction(int width, int height,
+                                    int bits, uint32_t* const modes,
+                                    uint32_t* const argb_scratch,
+                                    uint32_t* const argb,
+                                    int low_effort, int max_quantization,
+                                    int exact, int used_subtract_green) {
+  const int tiles_per_row = VP8LSubSampleSize(width, bits);
+  // The width of upper_row and current_row is one pixel larger than image width
+  // to allow the top right pixel to point to the leftmost pixel of the next row
+  // when at the right edge.
+  uint32_t* upper_row = argb_scratch;
+  uint32_t* current_row = upper_row + width + 1;
+  uint8_t* current_max_diffs = (uint8_t*)(current_row + width + 1);
+  uint8_t* lower_max_diffs = current_max_diffs + width;
+  int y;
+
+  for (y = 0; y < height; ++y) {
+    int x;
+    uint32_t* const tmp32 = upper_row;
+    upper_row = current_row;
+    current_row = tmp32;
+    memcpy(current_row, argb + y * width,
+           sizeof(*argb) * (width + (y + 1 < height)));
+
+    if (low_effort) {
+      PredictBatch(kPredLowEffort, 0, y, width, current_row, upper_row,
+                   argb + y * width);
+    } else {
+      if (max_quantization > 1) {
+        // Compute max_diffs for the lower row now, because that needs the
+        // contents of argb for the current row, which we will overwrite with
+        // residuals before proceeding with the next row.
+        uint8_t* const tmp8 = current_max_diffs;
+        current_max_diffs = lower_max_diffs;
+        lower_max_diffs = tmp8;
+        if (y + 2 < height) {
+          MaxDiffsForRow(width, width, argb + (y + 1) * width, lower_max_diffs,
+                         used_subtract_green);
+        }
+      }
+      for (x = 0; x < width;) {
+        const int mode =
+            (modes[(y >> bits) * tiles_per_row + (x >> bits)] >> 8) & 0xff;
+        int x_end = x + (1 << bits);
+        if (x_end > width) x_end = width;
+        GetResidual(width, height, upper_row, current_row, current_max_diffs,
+                    mode, x, x_end, y, max_quantization, exact,
+                    used_subtract_green, argb + y * width + x);
+        x = x_end;
+      }
+    }
+  }
+}
+
+// Finds the best predictor for each tile, and converts the image to residuals
+// with respect to predictions. If near_lossless_quality < 100, applies
+// near lossless processing, shaving off more bits of residuals for lower
+// qualities.
+void VP8LResidualImage(int width, int height, int bits, int low_effort,
+                       uint32_t* const argb, uint32_t* const argb_scratch,
+                       uint32_t* const image, int near_lossless_quality,
+                       int exact, int used_subtract_green) {
+  const int tiles_per_row = VP8LSubSampleSize(width, bits);
+  const int tiles_per_col = VP8LSubSampleSize(height, bits);
+  int tile_y;
+  int histo[4][256];
+  const int max_quantization = 1 << VP8LNearLosslessBits(near_lossless_quality);
+  if (low_effort) {
+    int i;
+    for (i = 0; i < tiles_per_row * tiles_per_col; ++i) {
+      image[i] = ARGB_BLACK | (kPredLowEffort << 8);
+    }
+  } else {
+    memset(histo, 0, sizeof(histo));
+    for (tile_y = 0; tile_y < tiles_per_col; ++tile_y) {
+      int tile_x;
+      for (tile_x = 0; tile_x < tiles_per_row; ++tile_x) {
+        const int pred = GetBestPredictorForTile(width, height, tile_x, tile_y,
+            bits, histo, argb_scratch, argb, max_quantization, exact,
+            used_subtract_green, image);
+        image[tile_y * tiles_per_row + tile_x] = ARGB_BLACK | (pred << 8);
+      }
+    }
+  }
+
+  CopyImageWithPrediction(width, height, bits, image, argb_scratch, argb,
+                          low_effort, max_quantization, exact,
+                          used_subtract_green);
+}
+
+//------------------------------------------------------------------------------
+// Color transform functions.
+
+static WEBP_INLINE void MultipliersClear(VP8LMultipliers* const m) {
+  m->green_to_red_ = 0;
+  m->green_to_blue_ = 0;
+  m->red_to_blue_ = 0;
+}
+
+static WEBP_INLINE void ColorCodeToMultipliers(uint32_t color_code,
+                                               VP8LMultipliers* const m) {
+  m->green_to_red_  = (color_code >>  0) & 0xff;
+  m->green_to_blue_ = (color_code >>  8) & 0xff;
+  m->red_to_blue_   = (color_code >> 16) & 0xff;
+}
+
+static WEBP_INLINE uint32_t MultipliersToColorCode(
+    const VP8LMultipliers* const m) {
+  return 0xff000000u |
+         ((uint32_t)(m->red_to_blue_) << 16) |
+         ((uint32_t)(m->green_to_blue_) << 8) |
+         m->green_to_red_;
+}
+
+static float PredictionCostCrossColor(const int accumulated[256],
+                                      const int counts[256]) {
+  // Favor low entropy, locally and globally.
+  // Favor small absolute values for PredictionCostSpatial
+  static const double kExpValue = 2.4;
+  return VP8LCombinedShannonEntropy(counts, accumulated) +
+         PredictionCostSpatial(counts, 3, kExpValue);
+}
+
+static float GetPredictionCostCrossColorRed(
+    const uint32_t* argb, int stride, int tile_width, int tile_height,
+    VP8LMultipliers prev_x, VP8LMultipliers prev_y, int green_to_red,
+    const int accumulated_red_histo[256]) {
+  int histo[256] = { 0 };
+  float cur_diff;
+
+  VP8LCollectColorRedTransforms(argb, stride, tile_width, tile_height,
+                                green_to_red, histo);
+
+  cur_diff = PredictionCostCrossColor(accumulated_red_histo, histo);
+  if ((uint8_t)green_to_red == prev_x.green_to_red_) {
+    cur_diff -= 3;  // favor keeping the areas locally similar
+  }
+  if ((uint8_t)green_to_red == prev_y.green_to_red_) {
+    cur_diff -= 3;  // favor keeping the areas locally similar
+  }
+  if (green_to_red == 0) {
+    cur_diff -= 3;
+  }
+  return cur_diff;
+}
+
+static void GetBestGreenToRed(
+    const uint32_t* argb, int stride, int tile_width, int tile_height,
+    VP8LMultipliers prev_x, VP8LMultipliers prev_y, int quality,
+    const int accumulated_red_histo[256], VP8LMultipliers* const best_tx) {
+  const int kMaxIters = 4 + ((7 * quality) >> 8);  // in range [4..6]
+  int green_to_red_best = 0;
+  int iter, offset;
+  float best_diff = GetPredictionCostCrossColorRed(
+      argb, stride, tile_width, tile_height, prev_x, prev_y,
+      green_to_red_best, accumulated_red_histo);
+  for (iter = 0; iter < kMaxIters; ++iter) {
+    // ColorTransformDelta is a 3.5 bit fixed point, so 32 is equal to
+    // one in color computation. Having initial delta here as 1 is sufficient
+    // to explore the range of (-2, 2).
+    const int delta = 32 >> iter;
+    // Try a negative and a positive delta from the best known value.
+    for (offset = -delta; offset <= delta; offset += 2 * delta) {
+      const int green_to_red_cur = offset + green_to_red_best;
+      const float cur_diff = GetPredictionCostCrossColorRed(
+          argb, stride, tile_width, tile_height, prev_x, prev_y,
+          green_to_red_cur, accumulated_red_histo);
+      if (cur_diff < best_diff) {
+        best_diff = cur_diff;
+        green_to_red_best = green_to_red_cur;
+      }
+    }
+  }
+  best_tx->green_to_red_ = green_to_red_best;
+}
+
+static float GetPredictionCostCrossColorBlue(
+    const uint32_t* argb, int stride, int tile_width, int tile_height,
+    VP8LMultipliers prev_x, VP8LMultipliers prev_y,
+    int green_to_blue, int red_to_blue, const int accumulated_blue_histo[256]) {
+  int histo[256] = { 0 };
+  float cur_diff;
+
+  VP8LCollectColorBlueTransforms(argb, stride, tile_width, tile_height,
+                                 green_to_blue, red_to_blue, histo);
+
+  cur_diff = PredictionCostCrossColor(accumulated_blue_histo, histo);
+  if ((uint8_t)green_to_blue == prev_x.green_to_blue_) {
+    cur_diff -= 3;  // favor keeping the areas locally similar
+  }
+  if ((uint8_t)green_to_blue == prev_y.green_to_blue_) {
+    cur_diff -= 3;  // favor keeping the areas locally similar
+  }
+  if ((uint8_t)red_to_blue == prev_x.red_to_blue_) {
+    cur_diff -= 3;  // favor keeping the areas locally similar
+  }
+  if ((uint8_t)red_to_blue == prev_y.red_to_blue_) {
+    cur_diff -= 3;  // favor keeping the areas locally similar
+  }
+  if (green_to_blue == 0) {
+    cur_diff -= 3;
+  }
+  if (red_to_blue == 0) {
+    cur_diff -= 3;
+  }
+  return cur_diff;
+}
+
+#define kGreenRedToBlueNumAxis 8
+#define kGreenRedToBlueMaxIters 7
+static void GetBestGreenRedToBlue(
+    const uint32_t* argb, int stride, int tile_width, int tile_height,
+    VP8LMultipliers prev_x, VP8LMultipliers prev_y, int quality,
+    const int accumulated_blue_histo[256],
+    VP8LMultipliers* const best_tx) {
+  const int8_t offset[kGreenRedToBlueNumAxis][2] =
+      {{0, -1}, {0, 1}, {-1, 0}, {1, 0}, {-1, -1}, {-1, 1}, {1, -1}, {1, 1}};
+  const int8_t delta_lut[kGreenRedToBlueMaxIters] = { 16, 16, 8, 4, 2, 2, 2 };
+  const int iters =
+      (quality < 25) ? 1 : (quality > 50) ? kGreenRedToBlueMaxIters : 4;
+  int green_to_blue_best = 0;
+  int red_to_blue_best = 0;
+  int iter;
+  // Initial value at origin:
+  float best_diff = GetPredictionCostCrossColorBlue(
+      argb, stride, tile_width, tile_height, prev_x, prev_y,
+      green_to_blue_best, red_to_blue_best, accumulated_blue_histo);
+  for (iter = 0; iter < iters; ++iter) {
+    const int delta = delta_lut[iter];
+    int axis;
+    for (axis = 0; axis < kGreenRedToBlueNumAxis; ++axis) {
+      const int green_to_blue_cur =
+          offset[axis][0] * delta + green_to_blue_best;
+      const int red_to_blue_cur = offset[axis][1] * delta + red_to_blue_best;
+      const float cur_diff = GetPredictionCostCrossColorBlue(
+          argb, stride, tile_width, tile_height, prev_x, prev_y,
+          green_to_blue_cur, red_to_blue_cur, accumulated_blue_histo);
+      if (cur_diff < best_diff) {
+        best_diff = cur_diff;
+        green_to_blue_best = green_to_blue_cur;
+        red_to_blue_best = red_to_blue_cur;
+      }
+      if (quality < 25 && iter == 4) {
+        // Only axis aligned diffs for lower quality.
+        break;  // next iter.
+      }
+    }
+    if (delta == 2 && green_to_blue_best == 0 && red_to_blue_best == 0) {
+      // Further iterations would not help.
+      break;  // out of iter-loop.
+    }
+  }
+  best_tx->green_to_blue_ = green_to_blue_best;
+  best_tx->red_to_blue_ = red_to_blue_best;
+}
+#undef kGreenRedToBlueMaxIters
+#undef kGreenRedToBlueNumAxis
+
+static VP8LMultipliers GetBestColorTransformForTile(
+    int tile_x, int tile_y, int bits,
+    VP8LMultipliers prev_x,
+    VP8LMultipliers prev_y,
+    int quality, int xsize, int ysize,
+    const int accumulated_red_histo[256],
+    const int accumulated_blue_histo[256],
+    const uint32_t* const argb) {
+  const int max_tile_size = 1 << bits;
+  const int tile_y_offset = tile_y * max_tile_size;
+  const int tile_x_offset = tile_x * max_tile_size;
+  const int all_x_max = GetMin(tile_x_offset + max_tile_size, xsize);
+  const int all_y_max = GetMin(tile_y_offset + max_tile_size, ysize);
+  const int tile_width = all_x_max - tile_x_offset;
+  const int tile_height = all_y_max - tile_y_offset;
+  const uint32_t* const tile_argb = argb + tile_y_offset * xsize
+                                  + tile_x_offset;
+  VP8LMultipliers best_tx;
+  MultipliersClear(&best_tx);
+
+  GetBestGreenToRed(tile_argb, xsize, tile_width, tile_height,
+                    prev_x, prev_y, quality, accumulated_red_histo, &best_tx);
+  GetBestGreenRedToBlue(tile_argb, xsize, tile_width, tile_height,
+                        prev_x, prev_y, quality, accumulated_blue_histo,
+                        &best_tx);
+  return best_tx;
+}
+
+static void CopyTileWithColorTransform(int xsize, int ysize,
+                                       int tile_x, int tile_y,
+                                       int max_tile_size,
+                                       VP8LMultipliers color_transform,
+                                       uint32_t* argb) {
+  const int xscan = GetMin(max_tile_size, xsize - tile_x);
+  int yscan = GetMin(max_tile_size, ysize - tile_y);
+  argb += tile_y * xsize + tile_x;
+  while (yscan-- > 0) {
+    VP8LTransformColor(&color_transform, argb, xscan);
+    argb += xsize;
+  }
+}
+
+void VP8LColorSpaceTransform(int width, int height, int bits, int quality,
+                             uint32_t* const argb, uint32_t* image) {
+  const int max_tile_size = 1 << bits;
+  const int tile_xsize = VP8LSubSampleSize(width, bits);
+  const int tile_ysize = VP8LSubSampleSize(height, bits);
+  int accumulated_red_histo[256] = { 0 };
+  int accumulated_blue_histo[256] = { 0 };
+  int tile_x, tile_y;
+  VP8LMultipliers prev_x, prev_y;
+  MultipliersClear(&prev_y);
+  MultipliersClear(&prev_x);
+  for (tile_y = 0; tile_y < tile_ysize; ++tile_y) {
+    for (tile_x = 0; tile_x < tile_xsize; ++tile_x) {
+      int y;
+      const int tile_x_offset = tile_x * max_tile_size;
+      const int tile_y_offset = tile_y * max_tile_size;
+      const int all_x_max = GetMin(tile_x_offset + max_tile_size, width);
+      const int all_y_max = GetMin(tile_y_offset + max_tile_size, height);
+      const int offset = tile_y * tile_xsize + tile_x;
+      if (tile_y != 0) {
+        ColorCodeToMultipliers(image[offset - tile_xsize], &prev_y);
+      }
+      prev_x = GetBestColorTransformForTile(tile_x, tile_y, bits,
+                                            prev_x, prev_y,
+                                            quality, width, height,
+                                            accumulated_red_histo,
+                                            accumulated_blue_histo,
+                                            argb);
+      image[offset] = MultipliersToColorCode(&prev_x);
+      CopyTileWithColorTransform(width, height, tile_x_offset, tile_y_offset,
+                                 max_tile_size, prev_x, argb);
+
+      // Gather accumulated histogram data.
+      for (y = tile_y_offset; y < all_y_max; ++y) {
+        int ix = y * width + tile_x_offset;
+        const int ix_end = ix + all_x_max - tile_x_offset;
+        for (; ix < ix_end; ++ix) {
+          const uint32_t pix = argb[ix];
+          if (ix >= 2 &&
+              pix == argb[ix - 2] &&
+              pix == argb[ix - 1]) {
+            continue;  // repeated pixels are handled by backward references
+          }
+          if (ix >= width + 2 &&
+              argb[ix - 2] == argb[ix - width - 2] &&
+              argb[ix - 1] == argb[ix - width - 1] &&
+              pix == argb[ix - width]) {
+            continue;  // repeated pixels are handled by backward references
+          }
+          ++accumulated_red_histo[(pix >> 16) & 0xff];
+          ++accumulated_blue_histo[(pix >> 0) & 0xff];
+        }
+      }
+    }
+  }
+}
diff --git a/src/3rdparty/libwebp/src/enc/quant.c b/src/3rdparty/libwebp/src/enc/quant_enc.c
index 549ad26..b118fb2 100644
--- a/src/3rdparty/libwebp/src/enc/quant.c
+++ b/src/3rdparty/libwebp/src/enc/quant_enc.c
@@ -15,8 +15,8 @@
 #include <math.h>
 #include <stdlib.h>  // for abs()
 
-#include "./vp8enci.h"
-#include "./cost.h"
+#include "./vp8i_enc.h"
+#include "./cost_enc.h"
 
 #define DO_TRELLIS_I4  1
 #define DO_TRELLIS_I16 1   // not a huge gain, but ok at low bitrate.
@@ -278,7 +278,7 @@ static void SetupMatrices(VP8Encoder* enc) {
     CheckLambdaValue(&m->lambda_trellis_uv_);
     CheckLambdaValue(&m->tlambda_);
 
-    m->min_disto_ = 10 * m->y1_.q_[0];   // quantization-aware min disto
+    m->min_disto_ = 20 * m->y1_.q_[0];   // quantization-aware min disto
     m->max_edge_  = 0;
 
     m->i4_penalty_ = 1000 * q_i4 * q_i4;
@@ -643,6 +643,8 @@ static int TrellisQuantizeBlock(const VP8Encoder* const enc,
     const int sign = (in[j] < 0);
     const uint32_t coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
     int level0 = QUANTDIV(coeff0, iQ, B);
+    int thresh_level = QUANTDIV(coeff0, iQ, BIAS(0x80));
+    if (thresh_level > MAX_LEVEL) thresh_level = MAX_LEVEL;
     if (level0 > MAX_LEVEL) level0 = MAX_LEVEL;
 
     {   // Swap current and previous score states
@@ -657,23 +659,17 @@ static int TrellisQuantizeBlock(const VP8Encoder* const enc,
       int level = level0 + m;
       const int ctx = (level > 2) ? 2 : level;
       const int band = VP8EncBands[n + 1];
-      score_t base_score, last_pos_score;
+      score_t base_score;
       score_t best_cur_score = MAX_COST;
       int best_prev = 0;   // default, in case
 
       ss_cur[m].score = MAX_COST;
       ss_cur[m].costs = costs[n + 1][ctx];
-      if (level > MAX_LEVEL || level < 0) {   // node is dead?
+      if (level < 0 || level > thresh_level) {
+        // Node is dead.
         continue;
       }
 
-      // Compute extra rate cost if last coeff's position is < 15
-      {
-        const score_t last_pos_cost =
-            (n < 15) ? VP8BitCost(0, probas[band][ctx][0]) : 0;
-        last_pos_score = RDScoreTrellis(lambda, last_pos_cost, 0);
-      }
-
       {
         // Compute delta_error = how much coding this level will
         // subtract to max_error as distortion.
@@ -705,6 +701,9 @@ static int TrellisQuantizeBlock(const VP8Encoder* const enc,
 
       // Now, record best terminal node (and thus best entry in the graph).
       if (level != 0) {
+        const score_t last_pos_cost =
+            (n < 15) ? VP8BitCost(0, probas[band][ctx][0]) : 0;
+        const score_t last_pos_score = RDScoreTrellis(lambda, last_pos_cost, 0);
         const score_t score = best_cur_score + last_pos_score;
         if (score < best_score) {
           best_score = score;
@@ -874,9 +873,9 @@ static void StoreMaxDelta(VP8SegmentInfo* const dqm, const int16_t DCs[16]) {
   // We look at the first three AC coefficients to determine what is the average
   // delta between each sub-4x4 block.
   const int v0 = abs(DCs[1]);
-  const int v1 = abs(DCs[4]);
-  const int v2 = abs(DCs[5]);
-  int max_v = (v0 > v1) ? v1 : v0;
+  const int v1 = abs(DCs[2]);
+  const int v2 = abs(DCs[4]);
+  int max_v = (v1 > v0) ? v1 : v0;
   max_v = (v2 > max_v) ? v2 : max_v;
   if (max_v > dqm->max_edge_) dqm->max_edge_ = max_v;
 }
@@ -957,7 +956,7 @@ static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* rd) {
   // we have a blocky macroblock (only DCs are non-zero) with fairly high
   // distortion, record max delta so we can later adjust the minimal filtering
   // strength needed to smooth these blocks out.
-  if ((rd->nz & 0xffff) == 0 && rd->D > dqm->min_disto_) {
+  if ((rd->nz & 0x100ffff) == 0x1000000 && rd->D > dqm->min_disto_) {
     StoreMaxDelta(dqm, rd->y_dc_levels);
   }
 }
@@ -1155,7 +1154,8 @@ static void RefineUsingDistortion(VP8EncIterator* const it,
   const int lambda_d_uv = 120;
   score_t score_i4 = dqm->i4_penalty_;
   score_t i4_bit_sum = 0;
-  const score_t bit_limit = it->enc_->mb_header_limit_;
+  const score_t bit_limit = try_both_modes ? it->enc_->mb_header_limit_
+                                           : MAX_COST;  // no early-out allowed
 
   if (is_i16) {   // First, evaluate Intra16 distortion
     int best_mode = -1;
diff --git a/src/3rdparty/libwebp/src/enc/syntax.c b/src/3rdparty/libwebp/src/enc/syntax_enc.c
index a0e79ef..90665bd 100644
--- a/src/3rdparty/libwebp/src/enc/syntax.c
+++ b/src/3rdparty/libwebp/src/enc/syntax_enc.c
@@ -16,7 +16,7 @@
 #include "../utils/utils.h"
 #include "../webp/format_constants.h"  // RIFF constants
 #include "../webp/mux_types.h"         // ALPHA_FLAG
-#include "./vp8enci.h"
+#include "./vp8i_enc.h"
 
 //------------------------------------------------------------------------------
 // Helper functions
@@ -362,8 +362,7 @@ int VP8EncWrite(VP8Encoder* const enc) {
   for (p = 0; p < enc->num_parts_; ++p) {
     const uint8_t* const buf = VP8BitWriterBuf(enc->parts_ + p);
     const size_t size = VP8BitWriterSize(enc->parts_ + p);
-    if (size)
-      ok = ok && pic->writer(buf, size, pic);
+    if (size) ok = ok && pic->writer(buf, size, pic);
     VP8BitWriterWipeOut(enc->parts_ + p);    // will free the internal buffer.
     ok = ok && WebPReportProgress(pic, enc->percent_ + percent_per_part,
                                   &enc->percent_);
diff --git a/src/3rdparty/libwebp/src/enc/token.c b/src/3rdparty/libwebp/src/enc/token_enc.c
index e73256b..02a0d72 100644
--- a/src/3rdparty/libwebp/src/enc/token.c
+++ b/src/3rdparty/libwebp/src/enc/token_enc.c
@@ -20,8 +20,8 @@
 #include <stdlib.h>
 #include <string.h>
 
-#include "./cost.h"
-#include "./vp8enci.h"
+#include "./cost_enc.h"
+#include "./vp8i_enc.h"
 #include "../utils/utils.h"
 
 #if !defined(DISABLE_TOKEN_BUFFER)
@@ -87,14 +87,16 @@ static int TBufferNewPage(VP8TBuffer* const b) {
 #define TOKEN_ID(t, b, ctx) \
     (NUM_PROBAS * ((ctx) + NUM_CTX * ((b) + NUM_BANDS * (t))))
 
-static WEBP_INLINE uint32_t AddToken(VP8TBuffer* const b,
-                                     uint32_t bit, uint32_t proba_idx) {
+static WEBP_INLINE uint32_t AddToken(VP8TBuffer* const b, uint32_t bit,
+                                     uint32_t proba_idx,
+                                     proba_t* const stats) {
   assert(proba_idx < FIXED_PROBA_BIT);
   assert(bit <= 1);
   if (b->left_ > 0 || TBufferNewPage(b)) {
     const int slot = --b->left_;
     b->tokens_[slot] = (bit << 15) | proba_idx;
   }
+  VP8RecordStats(bit, stats);
   return bit;
 }
 
@@ -108,13 +110,16 @@ static WEBP_INLINE void AddConstantToken(VP8TBuffer* const b,
   }
 }
 
-int VP8RecordCoeffTokens(const int ctx, const int coeff_type,
-                         int first, int last,
-                         const int16_t* const coeffs,
+int VP8RecordCoeffTokens(int ctx, const struct VP8Residual* const res,
                          VP8TBuffer* const tokens) {
-  int n = first;
+  const int16_t* const coeffs = res->coeffs;
+  const int coeff_type = res->coeff_type;
+  const int last = res->last;
+  int n = res->first;
   uint32_t base_id = TOKEN_ID(coeff_type, n, ctx);
-  if (!AddToken(tokens, last >= 0, base_id + 0)) {
+  // should be stats[VP8EncBands[n]], but it's equivalent for n=0 or 1
+  proba_t* s = res->stats[n][ctx];
+  if (!AddToken(tokens, last >= 0, base_id + 0, s + 0)) {
     return 0;
   }
 
@@ -122,18 +127,21 @@ int VP8RecordCoeffTokens(const int ctx, const int coeff_type,
     const int c = coeffs[n++];
     const int sign = c < 0;
     const uint32_t v = sign ? -c : c;
-    if (!AddToken(tokens, v != 0, base_id + 1)) {
+    if (!AddToken(tokens, v != 0, base_id + 1, s + 1)) {
       base_id = TOKEN_ID(coeff_type, VP8EncBands[n], 0);  // ctx=0
+      s = res->stats[VP8EncBands[n]][0];
       continue;
     }
-    if (!AddToken(tokens, v > 1, base_id + 2)) {
+    if (!AddToken(tokens, v > 1, base_id + 2, s + 2)) {
       base_id = TOKEN_ID(coeff_type, VP8EncBands[n], 1);  // ctx=1
+      s = res->stats[VP8EncBands[n]][1];
     } else {
-      if (!AddToken(tokens, v > 4, base_id + 3)) {
-        if (AddToken(tokens, v != 2, base_id + 4))
-          AddToken(tokens, v == 4, base_id + 5);
-      } else if (!AddToken(tokens, v > 10, base_id + 6)) {
-        if (!AddToken(tokens, v > 6, base_id + 7)) {
+      if (!AddToken(tokens, v > 4, base_id + 3, s + 3)) {
+        if (AddToken(tokens, v != 2, base_id + 4, s + 4)) {
+          AddToken(tokens, v == 4, base_id + 5, s + 5);
+        }
+      } else if (!AddToken(tokens, v > 10, base_id + 6, s + 6)) {
+        if (!AddToken(tokens, v > 6, base_id + 7, s + 7)) {
           AddConstantToken(tokens, v == 6, 159);
         } else {
           AddConstantToken(tokens, v >= 9, 165);
@@ -144,26 +152,26 @@ int VP8RecordCoeffTokens(const int ctx, const int coeff_type,
         const uint8_t* tab;
         uint32_t residue = v - 3;
         if (residue < (8 << 1)) {          // VP8Cat3  (3b)
-          AddToken(tokens, 0, base_id + 8);
-          AddToken(tokens, 0, base_id + 9);
+          AddToken(tokens, 0, base_id + 8, s + 8);
+          AddToken(tokens, 0, base_id + 9, s + 9);
           residue -= (8 << 0);
           mask = 1 << 2;
           tab = VP8Cat3;
         } else if (residue < (8 << 2)) {   // VP8Cat4  (4b)
-          AddToken(tokens, 0, base_id + 8);
-          AddToken(tokens, 1, base_id + 9);
+          AddToken(tokens, 0, base_id + 8, s + 8);
+          AddToken(tokens, 1, base_id + 9, s + 9);
           residue -= (8 << 1);
           mask = 1 << 3;
           tab = VP8Cat4;
         } else if (residue < (8 << 3)) {   // VP8Cat5  (5b)
-          AddToken(tokens, 1, base_id + 8);
-          AddToken(tokens, 0, base_id + 10);
+          AddToken(tokens, 1, base_id + 8, s + 8);
+          AddToken(tokens, 0, base_id + 10, s + 9);
           residue -= (8 << 2);
           mask = 1 << 4;
           tab = VP8Cat5;
         } else {                         // VP8Cat6 (11b)
-          AddToken(tokens, 1, base_id + 8);
-          AddToken(tokens, 1, base_id + 10);
+          AddToken(tokens, 1, base_id + 8, s + 8);
+          AddToken(tokens, 1, base_id + 10, s + 9);
           residue -= (8 << 3);
           mask = 1 << 10;
           tab = VP8Cat6;
@@ -174,9 +182,10 @@ int VP8RecordCoeffTokens(const int ctx, const int coeff_type,
         }
       }
       base_id = TOKEN_ID(coeff_type, VP8EncBands[n], 2);  // ctx=2
+      s = res->stats[VP8EncBands[n]][2];
     }
     AddConstantToken(tokens, sign, 128);
-    if (n == 16 || !AddToken(tokens, n <= last, base_id + 0)) {
+    if (n == 16 || !AddToken(tokens, n <= last, base_id + 0, s + 0)) {
       return 1;   // EOB
     }
   }
diff --git a/src/3rdparty/libwebp/src/enc/tree.c b/src/3rdparty/libwebp/src/enc/tree_enc.c
index f141006..2c40fe7 100644
--- a/src/3rdparty/libwebp/src/enc/tree.c
+++ b/src/3rdparty/libwebp/src/enc/tree_enc.c
@@ -11,7 +11,7 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include "./vp8enci.h"
+#include "./vp8i_enc.h"
 
 //------------------------------------------------------------------------------
 // Default probabilities
diff --git a/src/3rdparty/libwebp/src/enc/vp8enci.h b/src/3rdparty/libwebp/src/enc/vp8i_enc.h
index c1fbd76..93c95ec 100644
--- a/src/3rdparty/libwebp/src/enc/vp8enci.h
+++ b/src/3rdparty/libwebp/src/enc/vp8i_enc.h
@@ -15,10 +15,10 @@
 #define WEBP_ENC_VP8ENCI_H_
 
 #include <string.h>     // for memcpy()
-#include "../dec/common.h"
+#include "../dec/common_dec.h"
 #include "../dsp/dsp.h"
-#include "../utils/bit_writer.h"
-#include "../utils/thread.h"
+#include "../utils/bit_writer_utils.h"
+#include "../utils/thread_utils.h"
 #include "../utils/utils.h"
 #include "../webp/encode.h"
 
@@ -31,8 +31,8 @@ extern "C" {
 
 // version numbers
 #define ENC_MAJ_VERSION 0
-#define ENC_MIN_VERSION 5
-#define ENC_REV_VERSION 1
+#define ENC_MIN_VERSION 6
+#define ENC_REV_VERSION 0
 
 enum { MAX_LF_LEVELS = 64,       // Maximum loop filter level
        MAX_VARIABLE_LEVEL = 67,  // last (inclusive) level with variable cost
@@ -219,7 +219,6 @@ typedef struct {
 // right neighbouring data (samples, predictions, contexts, ...)
 typedef struct {
   int x_, y_;                      // current macroblock
-  int y_stride_, uv_stride_;       // respective strides
   uint8_t*      yuv_in_;           // input samples
   uint8_t*      yuv_out_;          // output samples
   uint8_t*      yuv_out2_;         // secondary buffer swapped with yuv_out_.
@@ -325,9 +324,7 @@ int VP8EmitTokens(VP8TBuffer* const b, VP8BitWriter* const bw,
                   const uint8_t* const probas, int final_pass);
 
 // record the coding of coefficients without knowing the probabilities yet
-int VP8RecordCoeffTokens(const int ctx, const int coeff_type,
-                         int first, int last,
-                         const int16_t* const coeffs,
+int VP8RecordCoeffTokens(int ctx, const struct VP8Residual* const res,
                          VP8TBuffer* const tokens);
 
 // Estimate the final coded size given a set of 'probas'.
@@ -476,14 +473,6 @@ int VP8EncStartAlpha(VP8Encoder* const enc);    // start alpha coding process
 int VP8EncFinishAlpha(VP8Encoder* const enc);   // finalize compressed data
 int VP8EncDeleteAlpha(VP8Encoder* const enc);   // delete compressed data
 
-  // in filter.c
-void VP8SSIMAddStats(const VP8DistoStats* const src, VP8DistoStats* const dst);
-void VP8SSIMAccumulatePlane(const uint8_t* src1, int stride1,
-                            const uint8_t* src2, int stride2,
-                            int W, int H, VP8DistoStats* const stats);
-double VP8SSIMGet(const VP8DistoStats* const stats);
-double VP8SSIMGetSquaredError(const VP8DistoStats* const stats);
-
 // autofilter
 void VP8InitFilter(VP8EncIterator* const it);
 void VP8StoreFilterStats(VP8EncIterator* const it);
diff --git a/src/3rdparty/libwebp/src/enc/vp8l.c b/src/3rdparty/libwebp/src/enc/vp8l_enc.c
index c16e256..b1a793d 100644
--- a/src/3rdparty/libwebp/src/enc/vp8l.c
+++ b/src/3rdparty/libwebp/src/enc/vp8l_enc.c
@@ -15,17 +15,18 @@
 #include <assert.h>
 #include <stdlib.h>
 
-#include "./backward_references.h"
-#include "./histogram.h"
-#include "./vp8enci.h"
-#include "./vp8li.h"
+#include "./backward_references_enc.h"
+#include "./histogram_enc.h"
+#include "./vp8i_enc.h"
+#include "./vp8li_enc.h"
 #include "../dsp/lossless.h"
-#include "../utils/bit_writer.h"
-#include "../utils/huffman_encode.h"
+#include "../dsp/lossless_common.h"
+#include "../utils/bit_writer_utils.h"
+#include "../utils/huffman_encode_utils.h"
 #include "../utils/utils.h"
 #include "../webp/format_constants.h"
 
-#include "./delta_palettization.h"
+#include "./delta_palettization_enc.h"
 
 #define PALETTE_KEY_RIGHT_SHIFT   22  // Key for 1K buffer.
 // Maximum number of histogram images (sub-blocks).
@@ -34,8 +35,8 @@
 // Palette reordering for smaller sum of deltas (and for smaller storage).
 
 static int PaletteCompareColorsForQsort(const void* p1, const void* p2) {
-  const uint32_t a = WebPMemToUint32(p1);
-  const uint32_t b = WebPMemToUint32(p2);
+  const uint32_t a = WebPMemToUint32((uint8_t*)p1);
+  const uint32_t b = WebPMemToUint32((uint8_t*)p2);
   assert(a != b);
   return (a < b) ? -1 : 1;
 }
@@ -163,18 +164,25 @@ typedef enum {
   kHistoTotal  // Must be last.
 } HistoIx;
 
-static void AddSingleSubGreen(uint32_t p, uint32_t* r, uint32_t* b) {
-  const uint32_t green = p >> 8;  // The upper bits are masked away later.
+static void AddSingleSubGreen(int p, uint32_t* const r, uint32_t* const b) {
+  const int green = p >> 8;  // The upper bits are masked away later.
   ++r[((p >> 16) - green) & 0xff];
-  ++b[(p - green) & 0xff];
+  ++b[((p >>  0) - green) & 0xff];
 }
 
 static void AddSingle(uint32_t p,
-                      uint32_t* a, uint32_t* r, uint32_t* g, uint32_t* b) {
-  ++a[p >> 24];
+                      uint32_t* const a, uint32_t* const r,
+                      uint32_t* const g, uint32_t* const b) {
+  ++a[(p >> 24) & 0xff];
   ++r[(p >> 16) & 0xff];
-  ++g[(p >> 8) & 0xff];
-  ++b[(p & 0xff)];
+  ++g[(p >>  8) & 0xff];
+  ++b[(p >>  0) & 0xff];
+}
+
+static WEBP_INLINE uint32_t HashPix(uint32_t pix) {
+  // Note that masking with 0xffffffffu is for preventing an
+  // 'unsigned int overflow' warning. Doesn't impact the compiled code.
+  return ((((uint64_t)pix + (pix >> 19)) * 0x39c5fba7ull) & 0xffffffffu) >> 24;
 }
 
 static int AnalyzeEntropy(const uint32_t* argb,
@@ -214,8 +222,8 @@ static int AnalyzeEntropy(const uint32_t* argb,
                           &histo[kHistoBluePredSubGreen * 256]);
         {
           // Approximate the palette by the entropy of the multiplicative hash.
-          const int hash = ((pix + (pix >> 19)) * 0x39c5fba7) >> 24;
-          ++histo[kHistoPalette * 256 + (hash & 0xff)];
+          const uint32_t hash = HashPix(pix);
+          ++histo[kHistoPalette * 256 + hash];
         }
       }
       prev_row = curr_row;
@@ -224,9 +232,8 @@ static int AnalyzeEntropy(const uint32_t* argb,
     {
       double entropy_comp[kHistoTotal];
       double entropy[kNumEntropyIx];
-      EntropyIx k;
-      EntropyIx last_mode_to_analyze =
-          use_palette ? kPalette : kSpatialSubGreen;
+      int k;
+      int last_mode_to_analyze = use_palette ? kPalette : kSpatialSubGreen;
       int j;
       // Let's add one zero to the predicted histograms. The zeros are removed
       // too efficiently by the pix_diff == 0 comparison, at least one of the
@@ -263,7 +270,7 @@ static int AnalyzeEntropy(const uint32_t* argb,
       *min_entropy_ix = kDirect;
       for (k = kDirect + 1; k <= last_mode_to_analyze; ++k) {
         if (entropy[*min_entropy_ix] > entropy[k]) {
-          *min_entropy_ix = k;
+          *min_entropy_ix = (EntropyIx)k;
         }
       }
       *red_and_blue_always_zero = 1;
@@ -312,7 +319,10 @@ static int GetHistoBits(int method, int use_palette, int width, int height) {
 
 static int GetTransformBits(int method, int histo_bits) {
   const int max_transform_bits = (method < 4) ? 6 : (method > 4) ? 4 : 5;
-  return (histo_bits > max_transform_bits) ? max_transform_bits : histo_bits;
+  const int res =
+      (histo_bits > max_transform_bits) ? max_transform_bits : histo_bits;
+  assert(res <= MAX_TRANSFORM_BITS);
+  return res;
 }
 
 static int AnalyzeAndInit(VP8LEncoder* const enc) {
@@ -697,7 +707,7 @@ static WebPEncodingError EncodeImageNoHuffman(VP8LBitWriter* const bw,
                                               VP8LHashChain* const hash_chain,
                                               VP8LBackwardRefs refs_array[2],
                                               int width, int height,
-                                              int quality) {
+                                              int quality, int low_effort) {
   int i;
   int max_tokens = 0;
   WebPEncodingError err = VP8_ENC_OK;
@@ -715,7 +725,8 @@ static WebPEncodingError EncodeImageNoHuffman(VP8LBitWriter* const bw,
   }
 
   // Calculate backward references from ARGB image.
-  if (VP8LHashChainFill(hash_chain, quality, argb, width, height) == 0) {
+  if (!VP8LHashChainFill(hash_chain, quality, argb, width, height,
+                         low_effort)) {
     err = VP8_ENC_ERROR_OUT_OF_MEMORY;
     goto Error;
   }
@@ -815,11 +826,18 @@ static WebPEncodingError EncodeImageInternal(VP8LBitWriter* const bw,
     goto Error;
   }
 
-  *cache_bits = use_cache ? MAX_COLOR_CACHE_BITS : 0;
+  if (use_cache) {
+    // If the value is different from zero, it has been set during the
+    // palette analysis.
+    if (*cache_bits == 0) *cache_bits = MAX_COLOR_CACHE_BITS;
+  } else {
+    *cache_bits = 0;
+  }
   // 'best_refs' is the reference to the best backward refs and points to one
   // of refs_array[0] or refs_array[1].
   // Calculate backward references from ARGB image.
-  if (VP8LHashChainFill(hash_chain, quality, argb, width, height) == 0) {
+  if (!VP8LHashChainFill(hash_chain, quality, argb, width, height,
+                         low_effort)) {
     err = VP8_ENC_ERROR_OUT_OF_MEMORY;
     goto Error;
   }
@@ -900,7 +918,7 @@ static WebPEncodingError EncodeImageInternal(VP8LBitWriter* const bw,
       err = EncodeImageNoHuffman(bw, histogram_argb, hash_chain, refs_array,
                                  VP8LSubSampleSize(width, histogram_bits),
                                  VP8LSubSampleSize(height, histogram_bits),
-                                 quality);
+                                 quality, low_effort);
       WebPSafeFree(histogram_argb);
       if (err != VP8_ENC_OK) goto Error;
     }
@@ -991,12 +1009,12 @@ static WebPEncodingError ApplyPredictFilter(const VP8LEncoder* const enc,
                               (VP8LHashChain*)&enc->hash_chain_,
                               (VP8LBackwardRefs*)enc->refs_,  // cast const away
                               transform_width, transform_height,
-                              quality);
+                              quality, low_effort);
 }
 
 static WebPEncodingError ApplyCrossColorFilter(const VP8LEncoder* const enc,
                                                int width, int height,
-                                               int quality,
+                                               int quality, int low_effort,
                                                VP8LBitWriter* const bw) {
   const int ccolor_transform_bits = enc->transform_bits_;
   const int transform_width = VP8LSubSampleSize(width, ccolor_transform_bits);
@@ -1012,7 +1030,7 @@ static WebPEncodingError ApplyCrossColorFilter(const VP8LEncoder* const enc,
                               (VP8LHashChain*)&enc->hash_chain_,
                               (VP8LBackwardRefs*)enc->refs_,  // cast const away
                               transform_width, transform_height,
-                              quality);
+                              quality, low_effort);
 }
 
 // -----------------------------------------------------------------------------
@@ -1157,7 +1175,8 @@ static WebPEncodingError MakeInputImageCopy(VP8LEncoder* const enc) {
 
 // -----------------------------------------------------------------------------
 
-static int SearchColor(const uint32_t sorted[], uint32_t color, int hi) {
+static WEBP_INLINE int SearchColorNoIdx(const uint32_t sorted[], uint32_t color,
+                                        int hi) {
   int low = 0;
   if (sorted[low] == color) return low;  // loop invariant: sorted[low] != color
   while (1) {
@@ -1172,35 +1191,68 @@ static int SearchColor(const uint32_t sorted[], uint32_t color, int hi) {
   }
 }
 
+#define APPLY_PALETTE_GREEDY_MAX 4
+
+static WEBP_INLINE uint32_t SearchColorGreedy(const uint32_t palette[],
+                                              int palette_size,
+                                              uint32_t color) {
+  (void)palette_size;
+  assert(palette_size < APPLY_PALETTE_GREEDY_MAX);
+  assert(3 == APPLY_PALETTE_GREEDY_MAX - 1);
+  if (color == palette[0]) return 0;
+  if (color == palette[1]) return 1;
+  if (color == palette[2]) return 2;
+  return 3;
+}
+
+static WEBP_INLINE uint32_t ApplyPaletteHash0(uint32_t color) {
+  // Focus on the green color.
+  return (color >> 8) & 0xff;
+}
+
+#define PALETTE_INV_SIZE_BITS 11
+#define PALETTE_INV_SIZE (1 << PALETTE_INV_SIZE_BITS)
+
+static WEBP_INLINE uint32_t ApplyPaletteHash1(uint32_t color) {
+  // Forget about alpha.
+  return ((color & 0x00ffffffu) * 4222244071u) >> (32 - PALETTE_INV_SIZE_BITS);
+}
+
+static WEBP_INLINE uint32_t ApplyPaletteHash2(uint32_t color) {
+  // Forget about alpha.
+  return (color & 0x00ffffffu) * ((1u << 31) - 1) >>
+         (32 - PALETTE_INV_SIZE_BITS);
+}
+
 // Sort palette in increasing order and prepare an inverse mapping array.
 static void PrepareMapToPalette(const uint32_t palette[], int num_colors,
-                                uint32_t sorted[], int idx_map[]) {
+                                uint32_t sorted[], uint32_t idx_map[]) {
   int i;
   memcpy(sorted, palette, num_colors * sizeof(*sorted));
   qsort(sorted, num_colors, sizeof(*sorted), PaletteCompareColorsForQsort);
   for (i = 0; i < num_colors; ++i) {
-    idx_map[SearchColor(sorted, palette[i], num_colors)] = i;
+    idx_map[SearchColorNoIdx(sorted, palette[i], num_colors)] = i;
   }
 }
 
-static void MapToPalette(const uint32_t sorted_palette[], int num_colors,
-                         uint32_t* const last_pix, int* const last_idx,
-                         const int idx_map[],
-                         const uint32_t* src, uint8_t* dst, int width) {
-  int x;
-  int prev_idx = *last_idx;
-  uint32_t prev_pix = *last_pix;
-  for (x = 0; x < width; ++x) {
-    const uint32_t pix = src[x];
-    if (pix != prev_pix) {
-      prev_idx = idx_map[SearchColor(sorted_palette, pix, num_colors)];
-      prev_pix = pix;
-    }
-    dst[x] = prev_idx;
-  }
-  *last_idx = prev_idx;
-  *last_pix = prev_pix;
-}
+// Use 1 pixel cache for ARGB pixels.
+#define APPLY_PALETTE_FOR(COLOR_INDEX) do {         \
+  uint32_t prev_pix = palette[0];                   \
+  uint32_t prev_idx = 0;                            \
+  for (y = 0; y < height; ++y) {                    \
+    for (x = 0; x < width; ++x) {                   \
+      const uint32_t pix = src[x];                  \
+      if (pix != prev_pix) {                        \
+        prev_idx = COLOR_INDEX;                     \
+        prev_pix = pix;                             \
+      }                                             \
+      tmp_row[x] = prev_idx;                        \
+    }                                               \
+    VP8LBundleColorMap(tmp_row, width, xbits, dst); \
+    src += src_stride;                              \
+    dst += dst_stride;                              \
+  }                                                 \
+} while (0)
 
 // Remap argb values in src[] to packed palettes entries in dst[]
 // using 'row' as a temporary buffer of size 'width'.
@@ -1213,52 +1265,59 @@ static WebPEncodingError ApplyPalette(const uint32_t* src, uint32_t src_stride,
   // TODO(skal): this tmp buffer is not needed if VP8LBundleColorMap() can be
   // made to work in-place.
   uint8_t* const tmp_row = (uint8_t*)WebPSafeMalloc(width, sizeof(*tmp_row));
-  int i, x, y;
-  int use_LUT = 1;
+  int x, y;
 
   if (tmp_row == NULL) return VP8_ENC_ERROR_OUT_OF_MEMORY;
-  for (i = 0; i < palette_size; ++i) {
-    if ((palette[i] & 0xffff00ffu) != 0) {
-      use_LUT = 0;
-      break;
-    }
-  }
 
-  if (use_LUT) {
-    uint8_t inv_palette[MAX_PALETTE_SIZE] = { 0 };
-    for (i = 0; i < palette_size; ++i) {
-      const int color = (palette[i] >> 8) & 0xff;
-      inv_palette[color] = i;
-    }
-    for (y = 0; y < height; ++y) {
-      for (x = 0; x < width; ++x) {
-        const int color = (src[x] >> 8) & 0xff;
-        tmp_row[x] = inv_palette[color];
+  if (palette_size < APPLY_PALETTE_GREEDY_MAX) {
+    APPLY_PALETTE_FOR(SearchColorGreedy(palette, palette_size, pix));
+  } else {
+    int i, j;
+    uint16_t buffer[PALETTE_INV_SIZE];
+    uint32_t (*const hash_functions[])(uint32_t) = {
+        ApplyPaletteHash0, ApplyPaletteHash1, ApplyPaletteHash2
+    };
+
+    // Try to find a perfect hash function able to go from a color to an index
+    // within 1 << PALETTE_INV_SIZE_BITS in order to build a hash map to go
+    // from color to index in palette.
+    for (i = 0; i < 3; ++i) {
+      int use_LUT = 1;
+      // Set each element in buffer to max uint16_t.
+      memset(buffer, 0xff, sizeof(buffer));
+      for (j = 0; j < palette_size; ++j) {
+        const uint32_t ind = hash_functions[i](palette[j]);
+        if (buffer[ind] != 0xffffu) {
+          use_LUT = 0;
+          break;
+        } else {
+          buffer[ind] = j;
+        }
       }
-      VP8LBundleColorMap(tmp_row, width, xbits, dst);
-      src += src_stride;
-      dst += dst_stride;
+      if (use_LUT) break;
     }
-  } else {
-    // Use 1 pixel cache for ARGB pixels.
-    uint32_t last_pix;
-    int last_idx;
-    uint32_t sorted[MAX_PALETTE_SIZE];
-    int idx_map[MAX_PALETTE_SIZE];
-    PrepareMapToPalette(palette, palette_size, sorted, idx_map);
-    last_pix = palette[0];
-    last_idx = 0;
-    for (y = 0; y < height; ++y) {
-      MapToPalette(sorted, palette_size, &last_pix, &last_idx,
-                   idx_map, src, tmp_row, width);
-      VP8LBundleColorMap(tmp_row, width, xbits, dst);
-      src += src_stride;
-      dst += dst_stride;
+
+    if (i == 0) {
+      APPLY_PALETTE_FOR(buffer[ApplyPaletteHash0(pix)]);
+    } else if (i == 1) {
+      APPLY_PALETTE_FOR(buffer[ApplyPaletteHash1(pix)]);
+    } else if (i == 2) {
+      APPLY_PALETTE_FOR(buffer[ApplyPaletteHash2(pix)]);
+    } else {
+      uint32_t idx_map[MAX_PALETTE_SIZE];
+      uint32_t palette_sorted[MAX_PALETTE_SIZE];
+      PrepareMapToPalette(palette, palette_size, palette_sorted, idx_map);
+      APPLY_PALETTE_FOR(
+          idx_map[SearchColorNoIdx(palette_sorted, pix, palette_size)]);
     }
   }
   WebPSafeFree(tmp_row);
   return VP8_ENC_OK;
 }
+#undef APPLY_PALETTE_FOR
+#undef PALETTE_INV_SIZE_BITS
+#undef PALETTE_INV_SIZE
+#undef APPLY_PALETTE_GREEDY_MAX
 
 // Note: Expects "enc->palette_" to be set properly.
 static WebPEncodingError MapImageFromPalette(VP8LEncoder* const enc,
@@ -1291,7 +1350,7 @@ static WebPEncodingError MapImageFromPalette(VP8LEncoder* const enc,
 }
 
 // Save palette_[] to bitstream.
-static WebPEncodingError EncodePalette(VP8LBitWriter* const bw,
+static WebPEncodingError EncodePalette(VP8LBitWriter* const bw, int low_effort,
                                        VP8LEncoder* const enc) {
   int i;
   uint32_t tmp_palette[MAX_PALETTE_SIZE];
@@ -1306,13 +1365,14 @@ static WebPEncodingError EncodePalette(VP8LBitWriter* const bw,
   }
   tmp_palette[0] = palette[0];
   return EncodeImageNoHuffman(bw, tmp_palette, &enc->hash_chain_, enc->refs_,
-                              palette_size, 1, 20 /* quality */);
+                              palette_size, 1, 20 /* quality */, low_effort);
 }
 
 #ifdef WEBP_EXPERIMENTAL_FEATURES
 
 static WebPEncodingError EncodeDeltaPalettePredictorImage(
-    VP8LBitWriter* const bw, VP8LEncoder* const enc, int quality) {
+    VP8LBitWriter* const bw, VP8LEncoder* const enc, int quality,
+    int low_effort) {
   const WebPPicture* const pic = enc->pic_;
   const int width = pic->width;
   const int height = pic->height;
@@ -1343,7 +1403,7 @@ static WebPEncodingError EncodeDeltaPalettePredictorImage(
   err = EncodeImageNoHuffman(bw, predictors, &enc->hash_chain_,
                              (VP8LBackwardRefs*)enc->refs_,  // cast const away
                              transform_width, transform_height,
-                             quality);
+                             quality, low_effort);
   WebPSafeFree(predictors);
   return err;
 }
@@ -1394,7 +1454,7 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
   int use_near_lossless = 0;
   int hdr_size = 0;
   int data_size = 0;
-  int use_delta_palettization = 0;
+  int use_delta_palette = 0;
 
   if (enc == NULL) {
     err = VP8_ENC_ERROR_OUT_OF_MEMORY;
@@ -1421,7 +1481,7 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
   }
 
 #ifdef WEBP_EXPERIMENTAL_FEATURES
-  if (config->delta_palettization) {
+  if (config->use_delta_palette) {
     enc->use_predict_ = 1;
     enc->use_cross_color_ = 0;
     enc->use_subtract_green_ = 0;
@@ -1433,21 +1493,25 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
     if (enc->use_palette_) {
       err = AllocateTransformBuffer(enc, width, height);
       if (err != VP8_ENC_OK) goto Error;
-      err = EncodeDeltaPalettePredictorImage(bw, enc, quality);
+      err = EncodeDeltaPalettePredictorImage(bw, enc, quality, low_effort);
       if (err != VP8_ENC_OK) goto Error;
-      use_delta_palettization = 1;
+      use_delta_palette = 1;
     }
   }
 #endif  // WEBP_EXPERIMENTAL_FEATURES
 
   // Encode palette
   if (enc->use_palette_) {
-    err = EncodePalette(bw, enc);
+    err = EncodePalette(bw, low_effort, enc);
     if (err != VP8_ENC_OK) goto Error;
-    err = MapImageFromPalette(enc, use_delta_palettization);
+    err = MapImageFromPalette(enc, use_delta_palette);
     if (err != VP8_ENC_OK) goto Error;
+    // If using a color cache, do not have it bigger than the number of colors.
+    if (use_cache && enc->palette_size_ < (1 << MAX_COLOR_CACHE_BITS)) {
+      enc->cache_bits_ = BitsLog2Floor(enc->palette_size_) + 1;
+    }
   }
-  if (!use_delta_palettization) {
+  if (!use_delta_palette) {
     // In case image is not packed.
     if (enc->argb_ == NULL) {
       err = MakeInputImageCopy(enc);
@@ -1469,7 +1533,7 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
 
     if (enc->use_cross_color_) {
       err = ApplyCrossColorFilter(enc, enc->current_width_,
-                                  height, quality, bw);
+                                  height, quality, low_effort, bw);
       if (err != VP8_ENC_OK) goto Error;
     }
   }
diff --git a/src/3rdparty/libwebp/src/enc/vp8li.h b/src/3rdparty/libwebp/src/enc/vp8li_enc.h
index 371e276..8c5fbcb 100644
--- a/src/3rdparty/libwebp/src/enc/vp8li.h
+++ b/src/3rdparty/libwebp/src/enc/vp8li_enc.h
@@ -14,9 +14,9 @@
 #ifndef WEBP_ENC_VP8LI_H_
 #define WEBP_ENC_VP8LI_H_
 
-#include "./backward_references.h"
-#include "./histogram.h"
-#include "../utils/bit_writer.h"
+#include "./backward_references_enc.h"
+#include "./histogram_enc.h"
+#include "../utils/bit_writer_utils.h"
 #include "../webp/encode.h"
 #include "../webp/format_constants.h"
 
@@ -24,6 +24,9 @@
 extern "C" {
 #endif
 
+// maximum value of transform_bits_ in VP8LEncoder.
+#define MAX_TRANSFORM_BITS 6
+
 typedef struct {
   const WebPConfig* config_;      // user configuration and parameters
   const WebPPicture* pic_;        // input picture.
@@ -39,7 +42,7 @@ typedef struct {
 
   // Encoding parameters derived from quality parameter.
   int histo_bits_;
-  int transform_bits_;
+  int transform_bits_;    // <= MAX_TRANSFORM_BITS.
   int cache_bits_;        // If equal to 0, don't use color cache.
 
   // Encoding parameters derived from image characteristics.
@@ -73,6 +76,17 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
                                    VP8LBitWriter* const bw, int use_cache);
 
 //------------------------------------------------------------------------------
+// Image transforms in predictor.c.
+
+void VP8LResidualImage(int width, int height, int bits, int low_effort,
+                       uint32_t* const argb, uint32_t* const argb_scratch,
+                       uint32_t* const image, int near_lossless, int exact,
+                       int used_subtract_green);
+
+void VP8LColorSpaceTransform(int width, int height, int bits, int quality,
+                             uint32_t* const argb, uint32_t* image);
+
+//------------------------------------------------------------------------------
 
 #ifdef __cplusplus
 }    // extern "C"
diff --git a/src/3rdparty/libwebp/src/enc/webpenc.c b/src/3rdparty/libwebp/src/enc/webp_enc.c
index a7d04ea..f18461e 100644
--- a/src/3rdparty/libwebp/src/enc/webpenc.c
+++ b/src/3rdparty/libwebp/src/enc/webp_enc.c
@@ -16,9 +16,9 @@
 #include <string.h>
 #include <math.h>
 
-#include "./cost.h"
-#include "./vp8enci.h"
-#include "./vp8li.h"
+#include "./cost_enc.h"
+#include "./vp8i_enc.h"
+#include "./vp8li_enc.h"
 #include "../utils/utils.h"
 
 // #define PRINT_MEMORY_INFO
@@ -75,7 +75,7 @@ static void ResetBoundaryPredictions(VP8Encoder* const enc) {
 //-------------------+---+---+---+---+---+---+---+
 // dynamic proba     | ~ | x | x | x | x | x | x |
 //-------------------+---+---+---+---+---+---+---+
-// fast mode analysis|   |   |   |   | x | x | x |
+// fast mode analysis|[x]|[x]|   |   | x | x | x |
 //-------------------+---+---+---+---+---+---+---+
 // basic rd-opt      |   |   |   | x | x | x | x |
 //-------------------+---+---+---+---+---+---+---+
@@ -315,18 +315,21 @@ int WebPReportProgress(const WebPPicture* const pic,
 
 int WebPEncode(const WebPConfig* config, WebPPicture* pic) {
   int ok = 0;
+  if (pic == NULL) return 0;
 
-  if (pic == NULL)
-    return 0;
   WebPEncodingSetError(pic, VP8_ENC_OK);  // all ok so far
-  if (config == NULL)  // bad params
+  if (config == NULL) {  // bad params
     return WebPEncodingSetError(pic, VP8_ENC_ERROR_NULL_PARAMETER);
-  if (!WebPValidateConfig(config))
+  }
+  if (!WebPValidateConfig(config)) {
     return WebPEncodingSetError(pic, VP8_ENC_ERROR_INVALID_CONFIGURATION);
-  if (pic->width <= 0 || pic->height <= 0)
+  }
+  if (pic->width <= 0 || pic->height <= 0) {
     return WebPEncodingSetError(pic, VP8_ENC_ERROR_BAD_DIMENSION);
-  if (pic->width > WEBP_MAX_DIMENSION || pic->height > WEBP_MAX_DIMENSION)
+  }
+  if (pic->width > WEBP_MAX_DIMENSION || pic->height > WEBP_MAX_DIMENSION) {
     return WebPEncodingSetError(pic, VP8_ENC_ERROR_BAD_DIMENSION);
+  }
 
   if (pic->stats != NULL) memset(pic->stats, 0, sizeof(*pic->stats));
 
@@ -339,8 +342,8 @@ int WebPEncode(const WebPConfig* config, WebPPicture* pic) {
 
     if (pic->use_argb || pic->y == NULL || pic->u == NULL || pic->v == NULL) {
       // Make sure we have YUVA samples.
-      if (config->preprocessing & 4) {
-        if (!WebPPictureSmartARGBToYUVA(pic)) {
+      if (config->use_sharp_yuv || (config->preprocessing & 4)) {
+        if (!WebPPictureSharpARGBToYUVA(pic)) {
           return 0;
         }
       } else {