27 files changed, 3040 insertions, 2011 deletions
diff --git a/src/3rdparty/libwebp/src/enc/alpha.c b/src/3rdparty/libwebp/src/enc/alpha.c
index 79cb94d..3c970b0 100644
--- a/src/3rdparty/libwebp/src/enc/alpha.c
+++ b/src/3rdparty/libwebp/src/enc/alpha.c
@@ -15,6 +15,7 @@
 #include <stdlib.h>
 
 #include "./vp8enci.h"
+#include "../dsp/dsp.h"
 #include "../utils/filters.h"
 #include "../utils/quant_levels.h"
 #include "../utils/utils.h"
@@ -61,21 +62,16 @@ static int EncodeLossless(const uint8_t* const data, int width, int height,
   if (!WebPPictureAlloc(&picture)) return 0;
 
   // Transfer the alpha values to the green channel.
-  {
-    int i, j;
-    uint32_t* dst = picture.argb;
-    const uint8_t* src = data;
-    for (j = 0; j < picture.height; ++j) {
-      for (i = 0; i < picture.width; ++i) {
-        dst[i] = src[i] << 8;  // we leave A/R/B channels zero'd.
-      }
-      src += width;
-      dst += picture.argb_stride;
-    }
-  }
+  WebPDispatchAlphaToGreen(data, width, picture.width, picture.height,
+                           picture.argb, picture.argb_stride);
 
   WebPConfigInit(&config);
   config.lossless = 1;
+  // Enable exact, or it would alter RGB values of transparent alpha, which is
+  // normally OK but not here since we are not encoding the input image but an
+  // internal encoding-related image containing necessary exact information in
+  // RGB channels.
+  config.exact = 1;
   config.method = effort_level;  // impact is very small
   // Set a low default quality for encoding alpha. Ensure that Alpha quality at
   // lower methods (3 and below) is less than the threshold for triggering
@@ -87,11 +83,10 @@ static int EncodeLossless(const uint8_t* const data, int width, int height,
   WebPPictureFree(&picture);
   ok = ok && !bw->error_;
   if (!ok) {
-    VP8LBitWriterDestroy(bw);
+    VP8LBitWriterWipeOut(bw);
     return 0;
   }
   return 1;
-
 }
 
 // -----------------------------------------------------------------------------
@@ -143,10 +138,10 @@ static int EncodeAlphaInternal(const uint8_t* const data, int width, int height,
       if (output_size > data_size) {
         // compressed size is larger than source! Revert to uncompressed mode.
         method = ALPHA_NO_COMPRESSION;
-        VP8LBitWriterDestroy(&tmp_bw);
+        VP8LBitWriterWipeOut(&tmp_bw);
       }
     } else {
-      VP8LBitWriterDestroy(&tmp_bw);
+      VP8LBitWriterWipeOut(&tmp_bw);
       return 0;
     }
   }
@@ -166,7 +161,7 @@ static int EncodeAlphaInternal(const uint8_t* const data, int width, int height,
   ok = ok && VP8BitWriterAppend(&result->bw, output, output_size);
 
   if (method != ALPHA_NO_COMPRESSION) {
-    VP8LBitWriterDestroy(&tmp_bw);
+    VP8LBitWriterWipeOut(&tmp_bw);
   }
   ok = ok && !result->bw.error_;
   result->score = VP8BitWriterSize(&result->bw);
@@ -175,16 +170,6 @@ static int EncodeAlphaInternal(const uint8_t* const data, int width, int height,
 
 // -----------------------------------------------------------------------------
 
-// TODO(skal): move to dsp/ ?
-static void CopyPlane(const uint8_t* src, int src_stride,
-                      uint8_t* dst, int dst_stride, int width, int height) {
-  while (height-- > 0) {
-    memcpy(dst, src, width);
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
 static int GetNumColors(const uint8_t* data, int width, int height,
                         int stride) {
   int j;
@@ -218,8 +203,9 @@ static uint32_t GetFilterMap(const uint8_t* alpha, int width, int height,
     const int kMaxColorsForFilterNone = 192;
     const int num_colors = GetNumColors(alpha, width, height, width);
     // For low number of colors, NONE yields better compression.
-    filter = (num_colors <= kMinColorsForFilterNone) ? WEBP_FILTER_NONE :
-             EstimateBestFilter(alpha, width, height, width);
+    filter = (num_colors <= kMinColorsForFilterNone)
+        ? WEBP_FILTER_NONE
+        : WebPEstimateBestFilter(alpha, width, height, width);
     bit_map |= 1 << filter;
     // For large number of colors, try FILTER_NONE in addition to the best
     // filter as well.
@@ -250,6 +236,7 @@ static int ApplyFiltersAndEncode(const uint8_t* alpha, int width, int height,
   uint32_t try_map =
       GetFilterMap(alpha, width, height, filter, effort_level);
   InitFilterTrial(&best);
+
   if (try_map != FILTER_TRY_NONE) {
     uint8_t* filtered_alpha =  (uint8_t*)WebPSafeMalloc(1ULL, data_size);
     if (filtered_alpha == NULL) return 0;
@@ -274,7 +261,16 @@ static int ApplyFiltersAndEncode(const uint8_t* alpha, int width, int height,
                              reduce_levels, effort_level, NULL, &best);
   }
   if (ok) {
-    if (stats != NULL) *stats = best.stats;
+    if (stats != NULL) {
+      stats->lossless_features = best.stats.lossless_features;
+      stats->histogram_bits = best.stats.histogram_bits;
+      stats->transform_bits = best.stats.transform_bits;
+      stats->cache_bits = best.stats.cache_bits;
+      stats->palette_size = best.stats.palette_size;
+      stats->lossless_size = best.stats.lossless_size;
+      stats->lossless_hdr_size = best.stats.lossless_hdr_size;
+      stats->lossless_data_size = best.stats.lossless_data_size;
+    }
     *output_size = VP8BitWriterSize(&best.bw);
     *output = VP8BitWriterBuf(&best.bw);
   } else {
@@ -324,7 +320,7 @@ static int EncodeAlpha(VP8Encoder* const enc,
   }
 
   // Extract alpha data (width x height) from raw_data (stride x height).
-  CopyPlane(pic->a, pic->a_stride, quant_alpha, width, width, height);
+  WebPCopyPlane(pic->a, pic->a_stride, quant_alpha, width, width, height);
 
   if (reduce_levels) {  // No Quantization required for 'quality = 100'.
     // 16 alpha levels gives quite a low MSE w.r.t original alpha plane hence
@@ -336,6 +332,7 @@ static int EncodeAlpha(VP8Encoder* const enc,
   }
 
   if (ok) {
+    VP8FiltersInit();
     ok = ApplyFiltersAndEncode(quant_alpha, width, height, data_size, method,
                                filter, reduce_levels, effort_level, output,
                                output_size, pic->stats);
@@ -376,6 +373,7 @@ static int CompressAlphaJob(VP8Encoder* const enc, void* dummy) {
 }
 
 void VP8EncInitAlpha(VP8Encoder* const enc) {
+  WebPInitAlphaProcessing();
   enc->has_alpha_ = WebPPictureHasTransparency(enc->pic_);
   enc->alpha_data_ = NULL;
   enc->alpha_data_size_ = 0;
@@ -430,4 +428,3 @@ int VP8EncDeleteAlpha(VP8Encoder* const enc) {
   enc->has_alpha_ = 0;
   return ok;
 }
-
diff --git a/src/3rdparty/libwebp/src/enc/analysis.c b/src/3rdparty/libwebp/src/enc/analysis.c
index e019465..b55128f 100644
--- a/src/3rdparty/libwebp/src/enc/analysis.c
+++ b/src/3rdparty/libwebp/src/enc/analysis.c
@@ -111,28 +111,28 @@ static int FinalAlphaValue(int alpha) {
 }
 
 static int GetAlpha(const VP8Histogram* const histo) {
-  int max_value = 0, last_non_zero = 1;
-  int k;
-  int alpha;
-  for (k = 0; k <= MAX_COEFF_THRESH; ++k) {
-    const int value = histo->distribution[k];
-    if (value > 0) {
-      if (value > max_value) max_value = value;
-      last_non_zero = k;
-    }
-  }
   // 'alpha' will later be clipped to [0..MAX_ALPHA] range, clamping outer
   // values which happen to be mostly noise. This leaves the maximum precision
   // for handling the useful small values which contribute most.
-  alpha = (max_value > 1) ? ALPHA_SCALE * last_non_zero / max_value : 0;
+  const int max_value = histo->max_value;
+  const int last_non_zero = histo->last_non_zero;
+  const int alpha =
+      (max_value > 1) ? ALPHA_SCALE * last_non_zero / max_value : 0;
   return alpha;
 }
 
+static void InitHistogram(VP8Histogram* const histo) {
+  histo->max_value = 0;
+  histo->last_non_zero = 1;
+}
+
 static void MergeHistograms(const VP8Histogram* const in,
                             VP8Histogram* const out) {
-  int i;
-  for (i = 0; i <= MAX_COEFF_THRESH; ++i) {
-    out->distribution[i] += in->distribution[i];
+  if (in->max_value > out->max_value) {
+    out->max_value = in->max_value;
+  }
+  if (in->last_non_zero > out->last_non_zero) {
+    out->last_non_zero = in->last_non_zero;
   }
 }
 
@@ -245,10 +245,11 @@ static int MBAnalyzeBestIntra16Mode(VP8EncIterator* const it) {
 
   VP8MakeLuma16Preds(it);
   for (mode = 0; mode < max_mode; ++mode) {
-    VP8Histogram histo = { { 0 } };
+    VP8Histogram histo;
     int alpha;
 
-    VP8CollectHistogram(it->yuv_in_ + Y_OFF,
+    InitHistogram(&histo);
+    VP8CollectHistogram(it->yuv_in_ + Y_OFF_ENC,
                         it->yuv_p_ + VP8I16ModeOffsets[mode],
                         0, 16, &histo);
     alpha = GetAlpha(&histo);
@@ -266,21 +267,22 @@ static int MBAnalyzeBestIntra4Mode(VP8EncIterator* const it,
   uint8_t modes[16];
   const int max_mode = MAX_INTRA4_MODE;
   int i4_alpha;
-  VP8Histogram total_histo = { { 0 } };
+  VP8Histogram total_histo;
   int cur_histo = 0;
+  InitHistogram(&total_histo);
 
   VP8IteratorStartI4(it);
   do {
     int mode;
     int best_mode_alpha = DEFAULT_ALPHA;
     VP8Histogram histos[2];
-    const uint8_t* const src = it->yuv_in_ + Y_OFF + VP8Scan[it->i4_];
+    const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
 
     VP8MakeIntra4Preds(it);
     for (mode = 0; mode < max_mode; ++mode) {
       int alpha;
 
-      memset(&histos[cur_histo], 0, sizeof(histos[cur_histo]));
+      InitHistogram(&histos[cur_histo]);
       VP8CollectHistogram(src, it->yuv_p_ + VP8I4ModeOffsets[mode],
                           0, 1, &histos[cur_histo]);
       alpha = GetAlpha(&histos[cur_histo]);
@@ -293,7 +295,7 @@ static int MBAnalyzeBestIntra4Mode(VP8EncIterator* const it,
     // accumulate best histogram
     MergeHistograms(&histos[cur_histo ^ 1], &total_histo);
     // Note: we reuse the original samples for predictors
-  } while (VP8IteratorRotateI4(it, it->yuv_in_ + Y_OFF));
+  } while (VP8IteratorRotateI4(it, it->yuv_in_ + Y_OFF_ENC));
 
   i4_alpha = GetAlpha(&total_histo);
   if (IS_BETTER_ALPHA(i4_alpha, best_alpha)) {
@@ -311,9 +313,10 @@ static int MBAnalyzeBestUVMode(VP8EncIterator* const it) {
 
   VP8MakeChroma8Preds(it);
   for (mode = 0; mode < max_mode; ++mode) {
-    VP8Histogram histo = { { 0 } };
+    VP8Histogram histo;
     int alpha;
-    VP8CollectHistogram(it->yuv_in_ + U_OFF,
+    InitHistogram(&histo);
+    VP8CollectHistogram(it->yuv_in_ + U_OFF_ENC,
                         it->yuv_p_ + VP8UVModeOffsets[mode],
                         16, 16 + 4 + 4, &histo);
     alpha = GetAlpha(&histo);
@@ -402,8 +405,8 @@ typedef struct {
 static int DoSegmentsJob(SegmentJob* const job, VP8EncIterator* const it) {
   int ok = 1;
   if (!VP8IteratorIsDone(it)) {
-    uint8_t tmp[32 + ALIGN_CST];
-    uint8_t* const scratch = (uint8_t*)DO_ALIGN(tmp);
+    uint8_t tmp[32 + WEBP_ALIGN_CST];
+    uint8_t* const scratch = (uint8_t*)WEBP_ALIGN(tmp);
     do {
       // Let's pretend we have perfect lossless reconstruction.
       VP8IteratorImport(it, scratch);
diff --git a/src/3rdparty/libwebp/src/enc/backward_references.c b/src/3rdparty/libwebp/src/enc/backward_references.c
index a3c30aa..c39437d 100644
--- a/src/3rdparty/libwebp/src/enc/backward_references.c
+++ b/src/3rdparty/libwebp/src/enc/backward_references.c
@@ -16,13 +16,12 @@
 #include "./backward_references.h"
 #include "./histogram.h"
 #include "../dsp/lossless.h"
+#include "../dsp/dsp.h"
 #include "../utils/color_cache.h"
 #include "../utils/utils.h"
 
 #define VALUES_IN_BYTE 256
 
-#define HASH_MULTIPLIER (0xc6a4a7935bd1e995ULL)
-
 #define MIN_BLOCK_SIZE 256  // minimum block size for backward references
 
 #define MAX_ENTROPY    (1e30f)
@@ -58,10 +57,28 @@ static int DistanceToPlaneCode(int xsize, int dist) {
   return dist + 120;
 }
 
+// Returns the first index at which array1 and array2 differ when that index
+// is strictly greater than best_len_match; otherwise the result is at most
+// best_len_match. Returns max_limit if the first max_limit elements all match.
 static WEBP_INLINE int FindMatchLength(const uint32_t* const array1,
                                        const uint32_t* const array2,
-                                       const int max_limit) {
-  int match_len = 0;
+                                       int best_len_match,
+                                       int max_limit) {
+  int match_len;
+
+  // Before 'expensive' linear match, check if the two arrays match at the
+  // current best length index.
+  if (array1[best_len_match] != array2[best_len_match]) return 0;
+
+#if defined(WEBP_USE_SSE2)
+  // Check if anything is different up to best_len_match excluded.
+  // memcmp seems to be slower on ARM so it is disabled for now.
+  if (memcmp(array1, array2, best_len_match * sizeof(*array1))) return 0;
+  match_len = best_len_match + 1;
+#else
+  match_len = 0;
+#endif
+
   while (match_len < max_limit && array1[match_len] == array2[match_len]) {
     ++match_len;
   }
@@ -178,15 +195,12 @@ int VP8LBackwardRefsCopy(const VP8LBackwardRefs* const src,
 // Hash chains
 
 // initialize as empty
-static void HashChainInit(VP8LHashChain* const p) {
-  int i;
+static void HashChainReset(VP8LHashChain* const p) {
   assert(p != NULL);
-  for (i = 0; i < p->size_; ++i) {
-    p->chain_[i] = -1;
-  }
-  for (i = 0; i < HASH_SIZE; ++i) {
-    p->hash_to_first_index_[i] = -1;
-  }
+  // Set the int32_t arrays to -1.
+  memset(p->chain_, 0xff, p->size_ * sizeof(*p->chain_));
+  memset(p->hash_to_first_index_, 0xff,
+         HASH_SIZE * sizeof(*p->hash_to_first_index_));
 }
 
 int VP8LHashChainInit(VP8LHashChain* const p, int size) {
@@ -196,7 +210,7 @@ int VP8LHashChainInit(VP8LHashChain* const p, int size) {
   p->chain_ = (int*)WebPSafeMalloc(size, sizeof(*p->chain_));
   if (p->chain_ == NULL) return 0;
   p->size_ = size;
-  HashChainInit(p);
+  HashChainReset(p);
   return 1;
 }
 
@@ -209,209 +223,212 @@ void VP8LHashChainClear(VP8LHashChain* const p) {
 
 // -----------------------------------------------------------------------------
 
-static WEBP_INLINE uint64_t GetPixPairHash64(const uint32_t* const argb) {
-  uint64_t key = ((uint64_t)argb[1] << 32) | argb[0];
-  key = (key * HASH_MULTIPLIER) >> (64 - HASH_BITS);
+#define HASH_MULTIPLIER_HI (0xc6a4a793U)
+#define HASH_MULTIPLIER_LO (0x5bd1e996U)
+
+static WEBP_INLINE uint32_t GetPixPairHash64(const uint32_t* const argb) {
+  uint32_t key;
+  key  = argb[1] * HASH_MULTIPLIER_HI;
+  key += argb[0] * HASH_MULTIPLIER_LO;
+  key = key >> (32 - HASH_BITS);
   return key;
 }
 
 // Insertion of two pixels at a time.
 static void HashChainInsert(VP8LHashChain* const p,
                             const uint32_t* const argb, int pos) {
-  const uint64_t hash_code = GetPixPairHash64(argb);
+  const uint32_t hash_code = GetPixPairHash64(argb);
   p->chain_[pos] = p->hash_to_first_index_[hash_code];
   p->hash_to_first_index_[hash_code] = pos;
 }
 
-static void GetParamsForHashChainFindCopy(int quality, int xsize,
-                                          int cache_bits, int* window_size,
-                                          int* iter_pos, int* iter_limit) {
-  const int iter_mult = (quality < 27) ? 1 : 1 + ((quality - 27) >> 4);
-  const int iter_neg = -iter_mult * (quality >> 1);
-  // Limit the backward-ref window size for lower qualities.
-  const int max_window_size = (quality > 50) ? WINDOW_SIZE
-                            : (quality > 25) ? (xsize << 8)
+// Returns the maximum number of hash chain lookups to do for a
+// given compression quality. Return value in range [6, 86].
+static int GetMaxItersForQuality(int quality, int low_effort) {
+  return (low_effort ? 6 : 8) + (quality * quality) / 128;
+}
+
+static int GetWindowSizeForHashChain(int quality, int xsize) {
+  const int max_window_size = (quality > 75) ? WINDOW_SIZE
+                            : (quality > 50) ? (xsize << 8)
+                            : (quality > 25) ? (xsize << 6)
                             : (xsize << 4);
   assert(xsize > 0);
-  *window_size = (max_window_size > WINDOW_SIZE) ? WINDOW_SIZE
-               : max_window_size;
-  *iter_pos = 8 + (quality >> 3);
-  // For lower entropy images, the rigorous search loop in HashChainFindCopy
-  // can be relaxed.
-  *iter_limit = (cache_bits > 0) ? iter_neg : iter_neg / 2;
+  return (max_window_size > WINDOW_SIZE) ? WINDOW_SIZE : max_window_size;
+}
+
+static WEBP_INLINE int MaxFindCopyLength(int len) {
+  return (len < MAX_LENGTH) ? len : MAX_LENGTH;
+}
+
+static void HashChainFindOffset(const VP8LHashChain* const p, int base_position,
+                                const uint32_t* const argb, int len,
+                                int window_size, int* const distance_ptr) {
+  const uint32_t* const argb_start = argb + base_position;
+  const int min_pos =
+      (base_position > window_size) ? base_position - window_size : 0;
+  int pos;
+  assert(len <= MAX_LENGTH);
+  for (pos = p->hash_to_first_index_[GetPixPairHash64(argb_start)];
+       pos >= min_pos;
+       pos = p->chain_[pos]) {
+    const int curr_length =
+        FindMatchLength(argb + pos, argb_start, len - 1, len);
+    if (curr_length == len) break;
+  }
+  *distance_ptr = base_position - pos;
 }
 
 static int HashChainFindCopy(const VP8LHashChain* const p,
-                             int base_position, int xsize_signed,
+                             int base_position,
                              const uint32_t* const argb, int max_len,
-                             int window_size, int iter_pos, int iter_limit,
+                             int window_size, int iter_max,
                              int* const distance_ptr,
                              int* const length_ptr) {
   const uint32_t* const argb_start = argb + base_position;
-  uint64_t best_val = 0;
-  uint32_t best_length = 1;
-  uint32_t best_distance = 0;
-  const uint32_t xsize = (uint32_t)xsize_signed;
+  int iter = iter_max;
+  int best_length = 0;
+  int best_distance = 0;
   const int min_pos =
       (base_position > window_size) ? base_position - window_size : 0;
   int pos;
-  assert(xsize > 0);
-  if (max_len > MAX_LENGTH) {
-    max_len = MAX_LENGTH;
+  int length_max = 256;
+  if (max_len < length_max) {
+    length_max = max_len;
   }
   for (pos = p->hash_to_first_index_[GetPixPairHash64(argb_start)];
        pos >= min_pos;
        pos = p->chain_[pos]) {
-    uint64_t val;
-    uint32_t curr_length;
-    uint32_t distance;
-    const uint32_t* const ptr1 = (argb + pos + best_length - 1);
-    const uint32_t* const ptr2 = (argb_start + best_length - 1);
-
-    if (iter_pos < 0) {
-      if (iter_pos < iter_limit || best_val >= 0xff0000) {
-        break;
-      }
+    int curr_length;
+    int distance;
+    if (--iter < 0) {
+      break;
     }
-    --iter_pos;
-
-    // Before 'expensive' linear match, check if the two arrays match at the
-    // current best length index and also for the succeeding elements.
-    if (ptr1[0] != ptr2[0] || ptr1[1] != ptr2[1]) continue;
-
-    curr_length = FindMatchLength(argb + pos, argb_start, max_len);
-    if (curr_length < best_length) continue;
-
-    distance = (uint32_t)(base_position - pos);
-    val = curr_length << 16;
-    // Favoring 2d locality here gives savings for certain images.
-    if (distance < 9 * xsize) {
-      const uint32_t y = distance / xsize;
-      uint32_t x = distance % xsize;
-      if (x > (xsize >> 1)) {
-        x = xsize - x;
-      }
-      if (x <= 7) {
-        val += 9 * 9 + 9 * 9;
-        val -= y * y + x * x;
-      }
-    }
-    if (best_val < val) {
-      best_val = val;
+
+    curr_length = FindMatchLength(argb + pos, argb_start, best_length, max_len);
+    if (best_length < curr_length) {
+      distance = base_position - pos;
       best_length = curr_length;
       best_distance = distance;
-      if (curr_length >= (uint32_t)max_len) {
-        break;
-      }
-      if ((best_distance == 1 || distance == xsize) &&
-          best_length >= 128) {
+      if (curr_length >= length_max) {
         break;
       }
     }
   }
-  *distance_ptr = (int)best_distance;
+  *distance_ptr = best_distance;
   *length_ptr = best_length;
   return (best_length >= MIN_LENGTH);
 }
 
-static WEBP_INLINE void PushBackCopy(VP8LBackwardRefs* const refs, int length) {
-  while (length >= MAX_LENGTH) {
-    BackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(1, MAX_LENGTH));
-    length -= MAX_LENGTH;
-  }
-  if (length > 0) {
-    BackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(1, length));
+static WEBP_INLINE void AddSingleLiteral(uint32_t pixel, int use_color_cache,
+                                         VP8LColorCache* const hashers,
+                                         VP8LBackwardRefs* const refs) {
+  PixOrCopy v;
+  if (use_color_cache) {
+    const uint32_t key = VP8LColorCacheGetIndex(hashers, pixel);
+    if (VP8LColorCacheLookup(hashers, key) == pixel) {
+      v = PixOrCopyCreateCacheIdx(key);
+    } else {
+      v = PixOrCopyCreateLiteral(pixel);
+      VP8LColorCacheSet(hashers, key, pixel);
+    }
+  } else {
+    v = PixOrCopyCreateLiteral(pixel);
   }
+  BackwardRefsCursorAdd(refs, v);
 }
 
 static int BackwardReferencesRle(int xsize, int ysize,
                                  const uint32_t* const argb,
-                                 VP8LBackwardRefs* const refs) {
+                                 int cache_bits, VP8LBackwardRefs* const refs) {
   const int pix_count = xsize * ysize;
-  int match_len = 0;
-  int i;
+  int i, k;
+  const int use_color_cache = (cache_bits > 0);
+  VP8LColorCache hashers;
+
+  if (use_color_cache && !VP8LColorCacheInit(&hashers, cache_bits)) {
+    return 0;
+  }
   ClearBackwardRefs(refs);
-  PushBackCopy(refs, match_len);    // i=0 case
-  BackwardRefsCursorAdd(refs, PixOrCopyCreateLiteral(argb[0]));
-  for (i = 1; i < pix_count; ++i) {
-    if (argb[i] == argb[i - 1]) {
-      ++match_len;
+  // Add first pixel as literal.
+  AddSingleLiteral(argb[0], use_color_cache, &hashers, refs);
+  i = 1;
+  while (i < pix_count) {
+    const int max_len = MaxFindCopyLength(pix_count - i);
+    const int kMinLength = 4;
+    const int rle_len = FindMatchLength(argb + i, argb + i - 1, 0, max_len);
+    const int prev_row_len = (i < xsize) ? 0 :
+        FindMatchLength(argb + i, argb + i - xsize, 0, max_len);
+    if (rle_len >= prev_row_len && rle_len >= kMinLength) {
+      BackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(1, rle_len));
+      // We don't need to update the color cache here since it is always the
+      // same pixel being copied, and that does not change the color cache
+      // state.
+      i += rle_len;
+    } else if (prev_row_len >= kMinLength) {
+      BackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(xsize, prev_row_len));
+      if (use_color_cache) {
+        for (k = 0; k < prev_row_len; ++k) {
+          VP8LColorCacheInsert(&hashers, argb[i + k]);
+        }
+      }
+      i += prev_row_len;
     } else {
-      PushBackCopy(refs, match_len);
-      match_len = 0;
-      BackwardRefsCursorAdd(refs, PixOrCopyCreateLiteral(argb[i]));
+      AddSingleLiteral(argb[i], use_color_cache, &hashers, refs);
+      i++;
     }
   }
-  PushBackCopy(refs, match_len);
+  if (use_color_cache) VP8LColorCacheClear(&hashers);
   return !refs->error_;
 }
 
-static int BackwardReferencesHashChain(int xsize, int ysize,
-                                       const uint32_t* const argb,
-                                       int cache_bits, int quality,
-                                       VP8LHashChain* const hash_chain,
-                                       VP8LBackwardRefs* const refs) {
+static int BackwardReferencesLz77(int xsize, int ysize,
+                                  const uint32_t* const argb, int cache_bits,
+                                  int quality, int low_effort,
+                                  VP8LHashChain* const hash_chain,
+                                  VP8LBackwardRefs* const refs) {
   int i;
   int ok = 0;
   int cc_init = 0;
   const int use_color_cache = (cache_bits > 0);
   const int pix_count = xsize * ysize;
   VP8LColorCache hashers;
-  int window_size = WINDOW_SIZE;
-  int iter_pos = 1;
-  int iter_limit = -1;
+  int iter_max = GetMaxItersForQuality(quality, low_effort);
+  const int window_size = GetWindowSizeForHashChain(quality, xsize);
+  int min_matches = 32;
 
   if (use_color_cache) {
     cc_init = VP8LColorCacheInit(&hashers, cache_bits);
     if (!cc_init) goto Error;
   }
-
   ClearBackwardRefs(refs);
-  GetParamsForHashChainFindCopy(quality, xsize, cache_bits,
-                                &window_size, &iter_pos, &iter_limit);
-  HashChainInit(hash_chain);
-  for (i = 0; i < pix_count; ) {
+  HashChainReset(hash_chain);
+  for (i = 0; i < pix_count - 2; ) {
     // Alternative#1: Code the pixels starting at 'i' using backward reference.
     int offset = 0;
     int len = 0;
-    if (i < pix_count - 1) {  // FindCopy(i,..) reads pixels at [i] and [i + 1].
-      int max_len = pix_count - i;
-      HashChainFindCopy(hash_chain, i, xsize, argb, max_len,
-                        window_size, iter_pos, iter_limit,
-                        &offset, &len);
-    }
-    if (len >= MIN_LENGTH) {
-      // Alternative#2: Insert the pixel at 'i' as literal, and code the
-      // pixels starting at 'i + 1' using backward reference.
+    const int max_len = MaxFindCopyLength(pix_count - i);
+    HashChainFindCopy(hash_chain, i, argb, max_len, window_size,
+                      iter_max, &offset, &len);
+    if (len > MIN_LENGTH || (len == MIN_LENGTH && offset <= 512)) {
       int offset2 = 0;
       int len2 = 0;
       int k;
+      min_matches = 8;
       HashChainInsert(hash_chain, &argb[i], i);
-      if (i < pix_count - 2) {  // FindCopy(i+1,..) reads [i + 1] and [i + 2].
-        int max_len = pix_count - (i + 1);
-        HashChainFindCopy(hash_chain, i + 1, xsize, argb, max_len,
-                          window_size, iter_pos, iter_limit,
-                          &offset2, &len2);
+      if ((len < (max_len >> 2)) && !low_effort) {
+        // Evaluate Alternative#2: Insert the pixel at 'i' as literal, and code
+        // the pixels starting at 'i + 1' using backward reference.
+        HashChainFindCopy(hash_chain, i + 1, argb, max_len - 1,
+                          window_size, iter_max, &offset2,
+                          &len2);
         if (len2 > len + 1) {
-          const uint32_t pixel = argb[i];
-          // Alternative#2 is a better match. So push pixel at 'i' as literal.
-          PixOrCopy v;
-          if (use_color_cache && VP8LColorCacheContains(&hashers, pixel)) {
-            const int ix = VP8LColorCacheGetIndex(&hashers, pixel);
-            v = PixOrCopyCreateCacheIdx(ix);
-          } else {
-            if (use_color_cache) VP8LColorCacheInsert(&hashers, pixel);
-            v = PixOrCopyCreateLiteral(pixel);
-          }
-          BackwardRefsCursorAdd(refs, v);
+          AddSingleLiteral(argb[i], use_color_cache, &hashers, refs);
           i++;  // Backward reference to be done for next pixel.
           len = len2;
           offset = offset2;
         }
       }
-      if (len >= MAX_LENGTH) {
-        len = MAX_LENGTH - 1;
-      }
       BackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(offset, len));
       if (use_color_cache) {
         for (k = 0; k < len; ++k) {
@@ -419,33 +436,36 @@ static int BackwardReferencesHashChain(int xsize, int ysize,
         }
       }
       // Add to the hash_chain (but cannot add the last pixel).
-      {
+      if (offset >= 3 && offset != xsize) {
         const int last = (len < pix_count - 1 - i) ? len : pix_count - 1 - i;
-        for (k = 1; k < last; ++k) {
+        for (k = 2; k < last - 8; k += 2) {
+          HashChainInsert(hash_chain, &argb[i + k], i + k);
+        }
+        for (; k < last; ++k) {
           HashChainInsert(hash_chain, &argb[i + k], i + k);
         }
       }
       i += len;
     } else {
-      const uint32_t pixel = argb[i];
-      PixOrCopy v;
-      if (use_color_cache && VP8LColorCacheContains(&hashers, pixel)) {
-        // push pixel as a PixOrCopyCreateCacheIdx pixel
-        const int ix = VP8LColorCacheGetIndex(&hashers, pixel);
-        v = PixOrCopyCreateCacheIdx(ix);
-      } else {
-        if (use_color_cache) VP8LColorCacheInsert(&hashers, pixel);
-        v = PixOrCopyCreateLiteral(pixel);
-      }
-      BackwardRefsCursorAdd(refs, v);
-      if (i + 1 < pix_count) {
+      AddSingleLiteral(argb[i], use_color_cache, &hashers, refs);
+      HashChainInsert(hash_chain, &argb[i], i);
+      ++i;
+      --min_matches;
+      if (min_matches <= 0) {
+        AddSingleLiteral(argb[i], use_color_cache, &hashers, refs);
         HashChainInsert(hash_chain, &argb[i], i);
+        ++i;
       }
-      ++i;
     }
   }
+  while (i < pix_count) {
+    // Handle the last pixel(s).
+    AddSingleLiteral(argb[i], use_color_cache, &hashers, refs);
+    ++i;
+  }
+
   ok = !refs->error_;
-Error:
+ Error:
   if (cc_init) VP8LColorCacheClear(&hashers);
   return ok;
 }
@@ -455,15 +475,14 @@ Error:
 typedef struct {
   double alpha_[VALUES_IN_BYTE];
   double red_[VALUES_IN_BYTE];
-  double literal_[PIX_OR_COPY_CODES_MAX];
   double blue_[VALUES_IN_BYTE];
   double distance_[NUM_DISTANCE_CODES];
+  double* literal_;
 } CostModel;
 
 static int BackwardReferencesTraceBackwards(
-    int xsize, int ysize, int recursive_cost_model,
-    const uint32_t* const argb, int quality, int cache_bits,
-    VP8LHashChain* const hash_chain,
+    int xsize, int ysize, const uint32_t* const argb, int quality,
+    int cache_bits, VP8LHashChain* const hash_chain,
     VP8LBackwardRefs* const refs);
 
 static void ConvertPopulationCountTableToBitEstimates(
@@ -487,28 +506,10 @@ static void ConvertPopulationCountTableToBitEstimates(
   }
 }
 
-static int CostModelBuild(CostModel* const m, int xsize, int ysize,
-                          int recursion_level, const uint32_t* const argb,
-                          int quality, int cache_bits,
-                          VP8LHashChain* const hash_chain,
+static int CostModelBuild(CostModel* const m, int cache_bits,
                           VP8LBackwardRefs* const refs) {
   int ok = 0;
-  VP8LHistogram* histo = NULL;
-
-  ClearBackwardRefs(refs);
-  if (recursion_level > 0) {
-    if (!BackwardReferencesTraceBackwards(xsize, ysize, recursion_level - 1,
-                                          argb, quality, cache_bits, hash_chain,
-                                          refs)) {
-      goto Error;
-    }
-  } else {
-    if (!BackwardReferencesHashChain(xsize, ysize, argb, cache_bits, quality,
-                                     hash_chain, refs)) {
-      goto Error;
-    }
-  }
-  histo = VP8LAllocateHistogram(cache_bits);
+  VP8LHistogram* const histo = VP8LAllocateHistogram(cache_bits);
   if (histo == NULL) goto Error;
 
   VP8LHistogramCreate(histo, refs, cache_bits);
@@ -557,10 +558,35 @@ static WEBP_INLINE double GetDistanceCost(const CostModel* const m,
   return m->distance_[code] + extra_bits;
 }
 
+static void AddSingleLiteralWithCostModel(
+    const uint32_t* const argb, VP8LHashChain* const hash_chain,
+    VP8LColorCache* const hashers, const CostModel* const cost_model, int idx,
+    int is_last, int use_color_cache, double prev_cost, float* const cost,
+    uint16_t* const dist_array) {
+  double cost_val = prev_cost;
+  const uint32_t color = argb[0];
+  if (!is_last) {
+    HashChainInsert(hash_chain, argb, idx);
+  }
+  if (use_color_cache && VP8LColorCacheContains(hashers, color)) {
+    const double mul0 = 0.68;
+    const int ix = VP8LColorCacheGetIndex(hashers, color);
+    cost_val += GetCacheCost(cost_model, ix) * mul0;
+  } else {
+    const double mul1 = 0.82;
+    if (use_color_cache) VP8LColorCacheInsert(hashers, color);
+    cost_val += GetLiteralCost(cost_model, color) * mul1;
+  }
+  if (cost[idx] > cost_val) {
+    cost[idx] = (float)cost_val;
+    dist_array[idx] = 1;  // only one is inserted.
+  }
+}
+
 static int BackwardReferencesHashChainDistanceOnly(
-    int xsize, int ysize, int recursive_cost_model, const uint32_t* const argb,
+    int xsize, int ysize, const uint32_t* const argb,
     int quality, int cache_bits, VP8LHashChain* const hash_chain,
-    VP8LBackwardRefs* const refs, uint32_t* const dist_array) {
+    VP8LBackwardRefs* const refs, uint16_t* const dist_array) {
   int i;
   int ok = 0;
   int cc_init = 0;
@@ -568,24 +594,27 @@ static int BackwardReferencesHashChainDistanceOnly(
   const int use_color_cache = (cache_bits > 0);
   float* const cost =
       (float*)WebPSafeMalloc(pix_count, sizeof(*cost));
-  CostModel* cost_model = (CostModel*)WebPSafeMalloc(1ULL, sizeof(*cost_model));
+  const size_t literal_array_size = sizeof(double) *
+      (NUM_LITERAL_CODES + NUM_LENGTH_CODES +
+       ((cache_bits > 0) ? (1 << cache_bits) : 0));
+  const size_t cost_model_size = sizeof(CostModel) + literal_array_size;
+  CostModel* const cost_model =
+      (CostModel*)WebPSafeMalloc(1ULL, cost_model_size);
   VP8LColorCache hashers;
-  const double mul0 = (recursive_cost_model != 0) ? 1.0 : 0.68;
-  const double mul1 = (recursive_cost_model != 0) ? 1.0 : 0.82;
-  const int min_distance_code = 2;  // TODO(vikasa): tune as function of quality
-  int window_size = WINDOW_SIZE;
-  int iter_pos = 1;
-  int iter_limit = -1;
+  const int skip_length = 32 + quality;
+  const int skip_min_distance_code = 2;
+  int iter_max = GetMaxItersForQuality(quality, 0);
+  const int window_size = GetWindowSizeForHashChain(quality, xsize);
 
   if (cost == NULL || cost_model == NULL) goto Error;
 
+  cost_model->literal_ = (double*)(cost_model + 1);
   if (use_color_cache) {
     cc_init = VP8LColorCacheInit(&hashers, cache_bits);
     if (!cc_init) goto Error;
   }
 
-  if (!CostModelBuild(cost_model, xsize, ysize, recursive_cost_model, argb,
-                      quality, cache_bits, hash_chain, refs)) {
+  if (!CostModelBuild(cost_model, cache_bits, refs)) {
     goto Error;
   }
 
@@ -594,85 +623,80 @@ static int BackwardReferencesHashChainDistanceOnly(
   // We loop one pixel at a time, but store all currently best points to
   // non-processed locations from this point.
   dist_array[0] = 0;
-  GetParamsForHashChainFindCopy(quality, xsize, cache_bits,
-                                &window_size, &iter_pos, &iter_limit);
-  HashChainInit(hash_chain);
-  for (i = 0; i < pix_count; ++i) {
-    double prev_cost = 0.0;
-    int shortmax;
-    if (i > 0) {
-      prev_cost = cost[i - 1];
-    }
-    for (shortmax = 0; shortmax < 2; ++shortmax) {
-      int offset = 0;
-      int len = 0;
-      if (i < pix_count - 1) {  // FindCopy reads pixels at [i] and [i + 1].
-        int max_len = shortmax ? 2 : pix_count - i;
-        HashChainFindCopy(hash_chain, i, xsize, argb, max_len,
-                          window_size, iter_pos, iter_limit,
-                          &offset, &len);
+  HashChainReset(hash_chain);
+  // Add first pixel as literal.
+  AddSingleLiteralWithCostModel(argb + 0, hash_chain, &hashers, cost_model, 0,
+                                0, use_color_cache, 0.0, cost, dist_array);
+  for (i = 1; i < pix_count - 1; ++i) {
+    int offset = 0;
+    int len = 0;
+    double prev_cost = cost[i - 1];
+    const int max_len = MaxFindCopyLength(pix_count - i);
+    HashChainFindCopy(hash_chain, i, argb, max_len, window_size,
+                      iter_max, &offset, &len);
+    if (len >= MIN_LENGTH) {
+      const int code = DistanceToPlaneCode(xsize, offset);
+      const double distance_cost =
+          prev_cost + GetDistanceCost(cost_model, code);
+      int k;
+      for (k = 1; k < len; ++k) {
+        const double cost_val = distance_cost + GetLengthCost(cost_model, k);
+        if (cost[i + k] > cost_val) {
+          cost[i + k] = (float)cost_val;
+          dist_array[i + k] = k + 1;
+        }
       }
-      if (len >= MIN_LENGTH) {
-        const int code = DistanceToPlaneCode(xsize, offset);
-        const double distance_cost =
-            prev_cost + GetDistanceCost(cost_model, code);
-        int k;
-        for (k = 1; k < len; ++k) {
-          const double cost_val = distance_cost + GetLengthCost(cost_model, k);
-          if (cost[i + k] > cost_val) {
-            cost[i + k] = (float)cost_val;
-            dist_array[i + k] = k + 1;
+      // This if is for speedup only. It roughly doubles the speed, and
+      // makes compression worse by .1 %.
+      if (len >= skip_length && code <= skip_min_distance_code) {
+        // Long copy for short distances, let's skip the middle
+        // lookups for better copies.
+        // 1) insert the hashes.
+        if (use_color_cache) {
+          for (k = 0; k < len; ++k) {
+            VP8LColorCacheInsert(&hashers, argb[i + k]);
           }
         }
-        // This if is for speedup only. It roughly doubles the speed, and
-        // makes compression worse by .1 %.
-        if (len >= 128 && code <= min_distance_code) {
-          // Long copy for short distances, let's skip the middle
-          // lookups for better copies.
-          // 1) insert the hashes.
-          if (use_color_cache) {
-            for (k = 0; k < len; ++k) {
-              VP8LColorCacheInsert(&hashers, argb[i + k]);
-            }
-          }
-          // 2) Add to the hash_chain (but cannot add the last pixel)
-          {
-            const int last = (len + i < pix_count - 1) ? len + i
-                                                       : pix_count - 1;
-            for (k = i; k < last; ++k) {
-              HashChainInsert(hash_chain, &argb[k], k);
-            }
+        // 2) Add to the hash_chain (but cannot add the last pixel)
+        {
+          const int last = (len + i < pix_count - 1) ? len + i
+                                                     : pix_count - 1;
+          for (k = i; k < last; ++k) {
+            HashChainInsert(hash_chain, &argb[k], k);
           }
-          // 3) jump.
-          i += len - 1;  // for loop does ++i, thus -1 here.
-          goto next_symbol;
         }
+        // 3) jump.
+        i += len - 1;  // for loop does ++i, thus -1 here.
+        goto next_symbol;
       }
-    }
-    if (i < pix_count - 1) {
-      HashChainInsert(hash_chain, &argb[i], i);
-    }
-    {
-      // inserting a literal pixel
-      double cost_val = prev_cost;
-      if (use_color_cache && VP8LColorCacheContains(&hashers, argb[i])) {
-        const int ix = VP8LColorCacheGetIndex(&hashers, argb[i]);
-        cost_val += GetCacheCost(cost_model, ix) * mul0;
-      } else {
-        if (use_color_cache) VP8LColorCacheInsert(&hashers, argb[i]);
-        cost_val += GetLiteralCost(cost_model, argb[i]) * mul1;
-      }
-      if (cost[i] > cost_val) {
-        cost[i] = (float)cost_val;
-        dist_array[i] = 1;  // only one is inserted.
+      if (len != MIN_LENGTH) {
+        int code_min_length;
+        double cost_total;
+        HashChainFindOffset(hash_chain, i, argb, MIN_LENGTH, window_size,
+                            &offset);
+        code_min_length = DistanceToPlaneCode(xsize, offset);
+        cost_total = prev_cost +
+            GetDistanceCost(cost_model, code_min_length) +
+            GetLengthCost(cost_model, 1);
+        if (cost[i + 1] > cost_total) {
+          cost[i + 1] = (float)cost_total;
+          dist_array[i + 1] = 2;
+        }
       }
     }
+    AddSingleLiteralWithCostModel(argb + i, hash_chain, &hashers, cost_model, i,
+                                  0, use_color_cache, prev_cost, cost,
+                                  dist_array);
  next_symbol: ;
   }
-  // Last pixel still to do, it can only be a single step if not reached
-  // through cheaper means already.
+  // Handle the last pixel.
+  if (i == (pix_count - 1)) {
+    AddSingleLiteralWithCostModel(argb + i, hash_chain, &hashers, cost_model, i,
+                                  1, use_color_cache, cost[pix_count - 2], cost,
+                                  dist_array);
+  }
   ok = !refs->error_;
-Error:
+ Error:
   if (cc_init) VP8LColorCacheClear(&hashers);
   WebPSafeFree(cost_model);
   WebPSafeFree(cost);
@@ -682,12 +706,12 @@ Error:
 // We pack the path at the end of *dist_array and return
 // a pointer to this part of the array. Example:
 // dist_array = [1x2xx3x2] => packed [1x2x1232], chosen_path = [1232]
-static void TraceBackwards(uint32_t* const dist_array,
+static void TraceBackwards(uint16_t* const dist_array,
                            int dist_array_size,
-                           uint32_t** const chosen_path,
+                           uint16_t** const chosen_path,
                            int* const chosen_path_size) {
-  uint32_t* path = dist_array + dist_array_size;
-  uint32_t* cur = dist_array + dist_array_size - 1;
+  uint16_t* path = dist_array + dist_array_size;
+  uint16_t* cur = dist_array + dist_array_size - 1;
   while (cur >= dist_array) {
     const int k = *cur;
     --path;
@@ -701,20 +725,16 @@ static void TraceBackwards(uint32_t* const dist_array,
 static int BackwardReferencesHashChainFollowChosenPath(
     int xsize, int ysize, const uint32_t* const argb,
     int quality, int cache_bits,
-    const uint32_t* const chosen_path, int chosen_path_size,
+    const uint16_t* const chosen_path, int chosen_path_size,
     VP8LHashChain* const hash_chain,
     VP8LBackwardRefs* const refs) {
   const int pix_count = xsize * ysize;
   const int use_color_cache = (cache_bits > 0);
-  int size = 0;
-  int i = 0;
-  int k;
   int ix;
+  int i = 0;
   int ok = 0;
   int cc_init = 0;
-  int window_size = WINDOW_SIZE;
-  int iter_pos = 1;
-  int iter_limit = -1;
+  const int window_size = GetWindowSizeForHashChain(quality, xsize);
   VP8LColorCache hashers;
 
   if (use_color_cache) {
@@ -723,18 +743,13 @@ static int BackwardReferencesHashChainFollowChosenPath(
   }
 
   ClearBackwardRefs(refs);
-  GetParamsForHashChainFindCopy(quality, xsize, cache_bits,
-                                &window_size, &iter_pos, &iter_limit);
-  HashChainInit(hash_chain);
-  for (ix = 0; ix < chosen_path_size; ++ix, ++size) {
+  HashChainReset(hash_chain);
+  for (ix = 0; ix < chosen_path_size; ++ix) {
     int offset = 0;
-    int len = 0;
-    int max_len = chosen_path[ix];
-    if (max_len != 1) {
-      HashChainFindCopy(hash_chain, i, xsize, argb, max_len,
-                        window_size, iter_pos, iter_limit,
-                        &offset, &len);
-      assert(len == max_len);
+    const int len = chosen_path[ix];
+    if (len != 1) {
+      int k;
+      HashChainFindOffset(hash_chain, i, argb, len, window_size, &offset);
       BackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(offset, len));
       if (use_color_cache) {
         for (k = 0; k < len; ++k) {
@@ -766,29 +781,28 @@ static int BackwardReferencesHashChainFollowChosenPath(
     }
   }
   ok = !refs->error_;
-Error:
+ Error:
   if (cc_init) VP8LColorCacheClear(&hashers);
   return ok;
 }
 
 // Returns 1 on success.
 static int BackwardReferencesTraceBackwards(int xsize, int ysize,
-                                            int recursive_cost_model,
                                             const uint32_t* const argb,
                                             int quality, int cache_bits,
                                             VP8LHashChain* const hash_chain,
                                             VP8LBackwardRefs* const refs) {
   int ok = 0;
   const int dist_array_size = xsize * ysize;
-  uint32_t* chosen_path = NULL;
+  uint16_t* chosen_path = NULL;
   int chosen_path_size = 0;
-  uint32_t* dist_array =
-      (uint32_t*)WebPSafeMalloc(dist_array_size, sizeof(*dist_array));
+  uint16_t* dist_array =
+      (uint16_t*)WebPSafeMalloc(dist_array_size, sizeof(*dist_array));
 
   if (dist_array == NULL) goto Error;
 
   if (!BackwardReferencesHashChainDistanceOnly(
-      xsize, ysize, recursive_cost_model, argb, quality, cache_bits, hash_chain,
+      xsize, ysize, argb, quality, cache_bits, hash_chain,
       refs, dist_array)) {
     goto Error;
   }
@@ -817,72 +831,10 @@ static void BackwardReferences2DLocality(int xsize,
   }
 }
 
-VP8LBackwardRefs* VP8LGetBackwardReferences(
-    int width, int height, const uint32_t* const argb, int quality,
-    int cache_bits, int use_2d_locality, VP8LHashChain* const hash_chain,
-    VP8LBackwardRefs refs_array[2]) {
-  int lz77_is_useful;
-  const int num_pix = width * height;
-  VP8LBackwardRefs* best = NULL;
-  VP8LBackwardRefs* const refs_lz77 = &refs_array[0];
-  VP8LBackwardRefs* const refs_rle = &refs_array[1];
-
-  if (!BackwardReferencesHashChain(width, height, argb, cache_bits, quality,
-                                   hash_chain, refs_lz77)) {
-    return NULL;
-  }
-  if (!BackwardReferencesRle(width, height, argb, refs_rle)) {
-    return NULL;
-  }
-
-  {
-    double bit_cost_lz77, bit_cost_rle;
-    VP8LHistogram* const histo = VP8LAllocateHistogram(cache_bits);
-    if (histo == NULL) return NULL;
-    // Evaluate LZ77 coding.
-    VP8LHistogramCreate(histo, refs_lz77, cache_bits);
-    bit_cost_lz77 = VP8LHistogramEstimateBits(histo);
-    // Evaluate RLE coding.
-    VP8LHistogramCreate(histo, refs_rle, cache_bits);
-    bit_cost_rle = VP8LHistogramEstimateBits(histo);
-    // Decide if LZ77 is useful.
-    lz77_is_useful = (bit_cost_lz77 < bit_cost_rle);
-    VP8LFreeHistogram(histo);
-  }
-
-  // Choose appropriate backward reference.
-  if (lz77_is_useful) {
-    // TraceBackwards is costly. Don't execute it at lower quality.
-    const int try_lz77_trace_backwards = (quality >= 25);
-    best = refs_lz77;   // default guess: lz77 is better
-    if (try_lz77_trace_backwards) {
-      // Set recursion level for large images using a color cache.
-      const int recursion_level =
-          (num_pix < 320 * 200) && (cache_bits > 0) ? 1 : 0;
-      VP8LBackwardRefs* const refs_trace = &refs_array[1];
-      ClearBackwardRefs(refs_trace);
-      if (BackwardReferencesTraceBackwards(width, height, recursion_level, argb,
-                                           quality, cache_bits, hash_chain,
-                                           refs_trace)) {
-        best = refs_trace;
-      }
-    }
-  } else {
-    best = refs_rle;
-  }
-
-  if (use_2d_locality) BackwardReferences2DLocality(width, best);
-
-  return best;
-}
-
 // Returns entropy for the given cache bits.
-static double ComputeCacheEntropy(const uint32_t* const argb,
-                                  int xsize, int ysize,
+static double ComputeCacheEntropy(const uint32_t* argb,
                                   const VP8LBackwardRefs* const refs,
                                   int cache_bits) {
-  int pixel_index = 0;
-  uint32_t k;
   const int use_color_cache = (cache_bits > 0);
   int cc_init = 0;
   double entropy = MAX_ENTROPY;
@@ -896,33 +848,40 @@ static double ComputeCacheEntropy(const uint32_t* const argb,
     cc_init = VP8LColorCacheInit(&hashers, cache_bits);
     if (!cc_init) goto Error;
   }
-
-  while (VP8LRefsCursorOk(&c)) {
-    const PixOrCopy* const v = c.cur_pos;
-    if (PixOrCopyIsLiteral(v)) {
-      if (use_color_cache &&
-          VP8LColorCacheContains(&hashers, argb[pixel_index])) {
-        // push pixel as a cache index
-        const int ix = VP8LColorCacheGetIndex(&hashers, argb[pixel_index]);
-        const PixOrCopy token = PixOrCopyCreateCacheIdx(ix);
-        VP8LHistogramAddSinglePixOrCopy(histo, &token);
-      } else {
-        VP8LHistogramAddSinglePixOrCopy(histo, v);
-      }
-    } else {
-      VP8LHistogramAddSinglePixOrCopy(histo, v);
+  if (!use_color_cache) {
+    while (VP8LRefsCursorOk(&c)) {
+      VP8LHistogramAddSinglePixOrCopy(histo, c.cur_pos);
+      VP8LRefsCursorNext(&c);
     }
-    if (use_color_cache) {
-      for (k = 0; k < PixOrCopyLength(v); ++k) {
-        VP8LColorCacheInsert(&hashers, argb[pixel_index + k]);
+  } else {
+    while (VP8LRefsCursorOk(&c)) {
+      const PixOrCopy* const v = c.cur_pos;
+      if (PixOrCopyIsLiteral(v)) {
+        const uint32_t pix = *argb++;
+        const uint32_t key = VP8LColorCacheGetIndex(&hashers, pix);
+        if (VP8LColorCacheLookup(&hashers, key) == pix) {
+          ++histo->literal_[NUM_LITERAL_CODES + NUM_LENGTH_CODES + key];
+        } else {
+          VP8LColorCacheSet(&hashers, key, pix);
+          ++histo->blue_[pix & 0xff];
+          ++histo->literal_[(pix >> 8) & 0xff];
+          ++histo->red_[(pix >> 16) & 0xff];
+          ++histo->alpha_[pix >> 24];
+        }
+      } else {
+        int len = PixOrCopyLength(v);
+        int code, extra_bits;
+        VP8LPrefixEncodeBits(len, &code, &extra_bits);
+        ++histo->literal_[NUM_LITERAL_CODES + code];
+        VP8LPrefixEncodeBits(PixOrCopyDistance(v), &code, &extra_bits);
+        ++histo->distance_[code];
+        do {
+          VP8LColorCacheInsert(&hashers, *argb++);
+        } while(--len != 0);
       }
+      VP8LRefsCursorNext(&c);
     }
-    pixel_index += PixOrCopyLength(v);
-    VP8LRefsCursorNext(&c);
   }
-  assert(pixel_index == xsize * ysize);
-  (void)xsize;  // xsize is not used in non-debug compilations otherwise.
-  (void)ysize;  // ysize is not used in non-debug compilations otherwise.
   entropy = VP8LHistogramEstimateBits(histo) +
       kSmallPenaltyForLargeCache * cache_bits;
  Error:
@@ -931,45 +890,204 @@ static double ComputeCacheEntropy(const uint32_t* const argb,
   return entropy;
 }
 
-// *best_cache_bits will contain how many bits are to be used for a color cache.
+// Evaluates the optimal cache bits for the local color cache. On input,
+// *best_cache_bits is the maximum to use (0 disables the cache); the cache is
+// also disabled for low quality (<= 25). *lz77_computed is set to 1 only if
+// 'refs' was filled with LZ77 references (computed with cache_bits = 0).
 // Returns 0 in case of memory error.
-int VP8LCalculateEstimateForCacheSize(const uint32_t* const argb,
-                                      int xsize, int ysize, int quality,
-                                      VP8LHashChain* const hash_chain,
-                                      VP8LBackwardRefs* const refs,
-                                      int* const best_cache_bits) {
+static int CalculateBestCacheSize(const uint32_t* const argb,
+                                  int xsize, int ysize, int quality,
+                                  VP8LHashChain* const hash_chain,
+                                  VP8LBackwardRefs* const refs,
+                                  int* const lz77_computed,
+                                  int* const best_cache_bits) {
   int eval_low = 1;
   int eval_high = 1;
   double entropy_low = MAX_ENTROPY;
   double entropy_high = MAX_ENTROPY;
+  const double cost_mul = 5e-4;
   int cache_bits_low = 0;
-  int cache_bits_high = MAX_COLOR_CACHE_BITS;
+  int cache_bits_high = (quality <= 25) ? 0 : *best_cache_bits;
 
-  if (!BackwardReferencesHashChain(xsize, ysize, argb, 0, quality, hash_chain,
-                                   refs)) {
+  assert(cache_bits_high <= MAX_COLOR_CACHE_BITS);
+
+  *lz77_computed = 0;
+  if (cache_bits_high == 0) {
+    *best_cache_bits = 0;
+    // Local color cache is disabled.
+    return 1;
+  }
+  if (!BackwardReferencesLz77(xsize, ysize, argb, cache_bits_low, quality, 0,
+                              hash_chain, refs)) {
     return 0;
   }
   // Do a binary search to find the optimal entropy for cache_bits.
-  while (cache_bits_high - cache_bits_low > 1) {
+  while (eval_low || eval_high) {
     if (eval_low) {
-      entropy_low =
-          ComputeCacheEntropy(argb, xsize, ysize, refs, cache_bits_low);
+      entropy_low = ComputeCacheEntropy(argb, refs, cache_bits_low);
+      entropy_low += entropy_low * cache_bits_low * cost_mul;
       eval_low = 0;
     }
     if (eval_high) {
-      entropy_high =
-          ComputeCacheEntropy(argb, xsize, ysize, refs, cache_bits_high);
+      entropy_high = ComputeCacheEntropy(argb, refs, cache_bits_high);
+      entropy_high += entropy_high * cache_bits_high * cost_mul;
       eval_high = 0;
     }
     if (entropy_high < entropy_low) {
+      const int prev_cache_bits_low = cache_bits_low;
       *best_cache_bits = cache_bits_high;
       cache_bits_low = (cache_bits_low + cache_bits_high) / 2;
-      eval_low = 1;
+      if (cache_bits_low != prev_cache_bits_low) eval_low = 1;
     } else {
       *best_cache_bits = cache_bits_low;
       cache_bits_high = (cache_bits_low + cache_bits_high) / 2;
-      eval_high = 1;
+      if (cache_bits_high != cache_bits_low) eval_high = 1;
     }
   }
+  *lz77_computed = 1;
   return 1;
 }
+
+// Update (in-place) backward references for specified cache_bits.
+static int BackwardRefsWithLocalCache(const uint32_t* const argb,
+                                      int cache_bits,
+                                      VP8LBackwardRefs* const refs) {
+  int pixel_index = 0;
+  VP8LColorCache hashers;
+  VP8LRefsCursor c = VP8LRefsCursorInit(refs);
+  if (!VP8LColorCacheInit(&hashers, cache_bits)) return 0;
+
+  while (VP8LRefsCursorOk(&c)) {
+    PixOrCopy* const v = c.cur_pos;
+    if (PixOrCopyIsLiteral(v)) {
+      const uint32_t argb_literal = v->argb_or_distance;
+      if (VP8LColorCacheContains(&hashers, argb_literal)) {
+        const int ix = VP8LColorCacheGetIndex(&hashers, argb_literal);
+        *v = PixOrCopyCreateCacheIdx(ix);
+      } else {
+        VP8LColorCacheInsert(&hashers, argb_literal);
+      }
+      ++pixel_index;
+    } else {
+      // refs was created without local cache, so it cannot have cache indexes.
+      int k;
+      assert(PixOrCopyIsCopy(v));
+      for (k = 0; k < v->len; ++k) {
+        VP8LColorCacheInsert(&hashers, argb[pixel_index++]);
+      }
+    }
+    VP8LRefsCursorNext(&c);
+  }
+  VP8LColorCacheClear(&hashers);
+  return 1;
+}
+
+static VP8LBackwardRefs* GetBackwardReferencesLowEffort(
+    int width, int height, const uint32_t* const argb, int quality,
+    int* const cache_bits, VP8LHashChain* const hash_chain,
+    VP8LBackwardRefs refs_array[2]) {
+  VP8LBackwardRefs* refs_lz77 = &refs_array[0];
+  *cache_bits = 0;
+  if (!BackwardReferencesLz77(width, height, argb, 0, quality,
+                              1 /* Low effort. */, hash_chain, refs_lz77)) {
+    return NULL;
+  }
+  BackwardReferences2DLocality(width, refs_lz77);
+  return refs_lz77;
+}
+
+static VP8LBackwardRefs* GetBackwardReferences(
+    int width, int height, const uint32_t* const argb, int quality,
+    int* const cache_bits, VP8LHashChain* const hash_chain,
+    VP8LBackwardRefs refs_array[2]) {
+  int lz77_is_useful;
+  int lz77_computed;
+  double bit_cost_lz77, bit_cost_rle;
+  VP8LBackwardRefs* best = NULL;
+  VP8LBackwardRefs* refs_lz77 = &refs_array[0];
+  VP8LBackwardRefs* refs_rle = &refs_array[1];
+  VP8LHistogram* histo = NULL;
+
+  if (!CalculateBestCacheSize(argb, width, height, quality, hash_chain,
+                              refs_lz77, &lz77_computed, cache_bits)) {
+    goto Error;
+  }
+
+  if (lz77_computed) {
+    // Transform refs_lz77 for the optimized cache_bits.
+    if (*cache_bits > 0) {
+      if (!BackwardRefsWithLocalCache(argb, *cache_bits, refs_lz77)) {
+        goto Error;
+      }
+    }
+  } else {
+    if (!BackwardReferencesLz77(width, height, argb, *cache_bits, quality,
+                                0 /* Low effort. */, hash_chain, refs_lz77)) {
+      goto Error;
+    }
+  }
+
+  if (!BackwardReferencesRle(width, height, argb, *cache_bits, refs_rle)) {
+    goto Error;
+  }
+
+  histo = VP8LAllocateHistogram(*cache_bits);
+  if (histo == NULL) goto Error;
+
+  {
+    // Evaluate LZ77 coding.
+    VP8LHistogramCreate(histo, refs_lz77, *cache_bits);
+    bit_cost_lz77 = VP8LHistogramEstimateBits(histo);
+    // Evaluate RLE coding.
+    VP8LHistogramCreate(histo, refs_rle, *cache_bits);
+    bit_cost_rle = VP8LHistogramEstimateBits(histo);
+    // Decide if LZ77 is useful.
+    lz77_is_useful = (bit_cost_lz77 < bit_cost_rle);
+  }
+
+  // Choose appropriate backward reference.
+  if (lz77_is_useful) {
+    // TraceBackwards is costly. Don't execute it at lower quality.
+    const int try_lz77_trace_backwards = (quality >= 25);
+    best = refs_lz77;   // default guess: lz77 is better
+    if (try_lz77_trace_backwards) {
+      VP8LBackwardRefs* const refs_trace = refs_rle;
+      if (!VP8LBackwardRefsCopy(refs_lz77, refs_trace)) {
+        best = NULL;
+        goto Error;
+      }
+      if (BackwardReferencesTraceBackwards(width, height, argb, quality,
+                                           *cache_bits, hash_chain,
+                                           refs_trace)) {
+        double bit_cost_trace;
+        // Evaluate the trace-backwards coding.
+        VP8LHistogramCreate(histo, refs_trace, *cache_bits);
+        bit_cost_trace = VP8LHistogramEstimateBits(histo);
+        if (bit_cost_trace < bit_cost_lz77) {
+          best = refs_trace;
+        }
+      }
+    }
+  } else {
+    best = refs_rle;
+  }
+
+  BackwardReferences2DLocality(width, best);
+
+ Error:
+  VP8LFreeHistogram(histo);
+  return best;
+}
+
+VP8LBackwardRefs* VP8LGetBackwardReferences(
+    int width, int height, const uint32_t* const argb, int quality,
+    int low_effort, int* const cache_bits, VP8LHashChain* const hash_chain,
+    VP8LBackwardRefs refs_array[2]) {
+  if (low_effort) {
+    return GetBackwardReferencesLowEffort(width, height, argb, quality,
+                                          cache_bits, hash_chain, refs_array);
+  } else {
+    return GetBackwardReferences(width, height, argb, quality, cache_bits,
+                                 hash_chain, refs_array);
+  }
+}
diff --git a/src/3rdparty/libwebp/src/enc/backward_references.h b/src/3rdparty/libwebp/src/enc/backward_references.h
index c2c81c5..daa084d 100644
--- a/src/3rdparty/libwebp/src/enc/backward_references.h
+++ b/src/3rdparty/libwebp/src/enc/backward_references.h
@@ -22,13 +22,8 @@
 extern "C" {
 #endif
 
-// The spec allows 11, we use 9 bits to reduce memory consumption in encoding.
-// Having 9 instead of 11 only removes about 0.25 % of compression density.
-#define MAX_COLOR_CACHE_BITS 9
-
-// Max ever number of codes we'll use:
-#define PIX_OR_COPY_CODES_MAX \
-    (NUM_LITERAL_CODES + NUM_LENGTH_CODES + (1 << MAX_COLOR_CACHE_BITS))
+// The maximum allowed limit is 11.
+#define MAX_COLOR_CACHE_BITS 10
 
 // -----------------------------------------------------------------------------
 // PixOrCopy
@@ -190,21 +185,16 @@ static WEBP_INLINE void VP8LRefsCursorNext(VP8LRefsCursor* const c) {
 // Main entry points
 
 // Evaluates best possible backward references for specified quality.
-// Further optimize for 2D locality if use_2d_locality flag is set.
+// On input, *cache_bits sets the maximum number of cache bits to use (passing
+// 0 disables the local color cache). On success, *cache_bits holds the number
+// of bits actually selected, which is always 0 when low_effort is set.
 // The return value is the pointer to the best of the two backward refs viz,
 // refs[0] or refs[1].
 VP8LBackwardRefs* VP8LGetBackwardReferences(
     int width, int height, const uint32_t* const argb, int quality,
-    int cache_bits, int use_2d_locality, VP8LHashChain* const hash_chain,
+    int low_effort, int* const cache_bits, VP8LHashChain* const hash_chain,
     VP8LBackwardRefs refs[2]);
 
-// Produce an estimate for a good color cache size for the image.
-int VP8LCalculateEstimateForCacheSize(const uint32_t* const argb,
-                                      int xsize, int ysize, int quality,
-                                      VP8LHashChain* const hash_chain,
-                                      VP8LBackwardRefs* const ref,
-                                      int* const best_cache_bits);
-
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/3rdparty/libwebp/src/enc/config.c b/src/3rdparty/libwebp/src/enc/config.c
index 53a3bb2..f9f7961 100644
--- a/src/3rdparty/libwebp/src/enc/config.c
+++ b/src/3rdparty/libwebp/src/enc/config.c
@@ -43,10 +43,15 @@ int WebPConfigInitInternal(WebPConfig* config,
   config->alpha_filtering = 1;
   config->alpha_quality = 100;
   config->lossless = 0;
+  config->exact = 0;
   config->image_hint = WEBP_HINT_DEFAULT;
   config->emulate_jpeg_size = 0;
   config->thread_level = 0;
   config->low_memory = 0;
+  config->near_lossless = 100;
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+  config->delta_palettization = 0;
+#endif // WEBP_EXPERIMENTAL_FEATURES
 
   // TODO(skal): tune.
   switch (preset) {
@@ -111,11 +116,7 @@ int WebPValidateConfig(const WebPConfig* config) {
     return 0;
   if (config->show_compressed < 0 || config->show_compressed > 1)
     return 0;
-#if WEBP_ENCODER_ABI_VERSION > 0x0204
   if (config->preprocessing < 0 || config->preprocessing > 7)
-#else
-  if (config->preprocessing < 0 || config->preprocessing > 3)
-#endif
     return 0;
   if (config->partitions < 0 || config->partitions > 3)
     return 0;
@@ -129,6 +130,8 @@ int WebPValidateConfig(const WebPConfig* config) {
     return 0;
   if (config->lossless < 0 || config->lossless > 1)
     return 0;
+  if (config->near_lossless < 0 || config->near_lossless > 100)
+    return 0;
   if (config->image_hint >= WEBP_HINT_LAST)
     return 0;
   if (config->emulate_jpeg_size < 0 || config->emulate_jpeg_size > 1)
@@ -137,12 +140,17 @@ int WebPValidateConfig(const WebPConfig* config) {
     return 0;
   if (config->low_memory < 0 || config->low_memory > 1)
     return 0;
+  if (config->exact < 0 || config->exact > 1)
+    return 0;
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+  if (config->delta_palettization < 0 || config->delta_palettization > 1)
+    return 0;
+#endif  // WEBP_EXPERIMENTAL_FEATURES
   return 1;
 }
 
 //------------------------------------------------------------------------------
 
-#if WEBP_ENCODER_ABI_VERSION > 0x0202
 #define MAX_LEVEL 9
 
 // Mapping between -z level and -m / -q parameter settings.
@@ -161,6 +169,5 @@ int WebPConfigLosslessPreset(WebPConfig* config, int level) {
   config->quality = kLosslessPresets[level].quality_;
   return 1;
 }
-#endif
 
 //------------------------------------------------------------------------------
diff --git a/src/3rdparty/libwebp/src/enc/cost.c b/src/3rdparty/libwebp/src/enc/cost.c
index 9d2cc01..ae7fe01 100644
--- a/src/3rdparty/libwebp/src/enc/cost.c
+++ b/src/3rdparty/libwebp/src/enc/cost.c
@@ -14,38 +14,6 @@
 #include "./cost.h"
 
 //------------------------------------------------------------------------------
-// Boolean-cost cost table
-
-const uint16_t VP8EntropyCost[256] = {
-  1792, 1792, 1792, 1536, 1536, 1408, 1366, 1280, 1280, 1216,
-  1178, 1152, 1110, 1076, 1061, 1024, 1024,  992,  968,  951,
-   939,  911,  896,  878,  871,  854,  838,  820,  811,  794,
-   786,  768,  768,  752,  740,  732,  720,  709,  704,  690,
-   683,  672,  666,  655,  647,  640,  631,  622,  615,  607,
-   598,  592,  586,  576,  572,  564,  559,  555,  547,  541,
-   534,  528,  522,  512,  512,  504,  500,  494,  488,  483,
-   477,  473,  467,  461,  458,  452,  448,  443,  438,  434,
-   427,  424,  419,  415,  410,  406,  403,  399,  394,  390,
-   384,  384,  377,  374,  370,  366,  362,  359,  355,  351,
-   347,  342,  342,  336,  333,  330,  326,  323,  320,  316,
-   312,  308,  305,  302,  299,  296,  293,  288,  287,  283,
-   280,  277,  274,  272,  268,  266,  262,  256,  256,  256,
-   251,  248,  245,  242,  240,  237,  234,  232,  228,  226,
-   223,  221,  218,  216,  214,  211,  208,  205,  203,  201,
-   198,  196,  192,  191,  188,  187,  183,  181,  179,  176,
-   175,  171,  171,  168,  165,  163,  160,  159,  156,  154,
-   152,  150,  148,  146,  144,  142,  139,  138,  135,  133,
-   131,  128,  128,  125,  123,  121,  119,  117,  115,  113,
-   111,  110,  107,  105,  103,  102,  100,   98,   96,   94,
-    92,   91,   89,   86,   86,   83,   82,   80,   77,   76,
-    74,   73,   71,   69,   67,   66,   64,   63,   61,   59,
-    57,   55,   54,   52,   51,   49,   47,   46,   44,   43,
-    41,   40,   38,   36,   35,   33,   32,   30,   29,   27,
-    25,   24,   22,   21,   19,   18,   16,   15,   13,   12,
-    10,    9,    7,    6,    4,    3
-};
-
-//------------------------------------------------------------------------------
 // Level cost tables
 
 // For each given level, the following table gives the pattern of contexts to
@@ -71,267 +39,6 @@ const uint16_t VP8LevelCodes[MAX_VARIABLE_LEVEL][2] = {
   {0x153, 0x053}, {0x153, 0x053}, {0x153, 0x053}, {0x153, 0x153}
 };
 
-// fixed costs for coding levels, deduce from the coding tree.
-// This is only the part that doesn't depend on the probability state.
-const uint16_t VP8LevelFixedCosts[MAX_LEVEL + 1] = {
-     0,  256,  256,  256,  256,  432,  618,  630,
-   731,  640,  640,  828,  901,  948, 1021, 1101,
-  1174, 1221, 1294, 1042, 1085, 1115, 1158, 1202,
-  1245, 1275, 1318, 1337, 1380, 1410, 1453, 1497,
-  1540, 1570, 1613, 1280, 1295, 1317, 1332, 1358,
-  1373, 1395, 1410, 1454, 1469, 1491, 1506, 1532,
-  1547, 1569, 1584, 1601, 1616, 1638, 1653, 1679,
-  1694, 1716, 1731, 1775, 1790, 1812, 1827, 1853,
-  1868, 1890, 1905, 1727, 1733, 1742, 1748, 1759,
-  1765, 1774, 1780, 1800, 1806, 1815, 1821, 1832,
-  1838, 1847, 1853, 1878, 1884, 1893, 1899, 1910,
-  1916, 1925, 1931, 1951, 1957, 1966, 1972, 1983,
-  1989, 1998, 2004, 2027, 2033, 2042, 2048, 2059,
-  2065, 2074, 2080, 2100, 2106, 2115, 2121, 2132,
-  2138, 2147, 2153, 2178, 2184, 2193, 2199, 2210,
-  2216, 2225, 2231, 2251, 2257, 2266, 2272, 2283,
-  2289, 2298, 2304, 2168, 2174, 2183, 2189, 2200,
-  2206, 2215, 2221, 2241, 2247, 2256, 2262, 2273,
-  2279, 2288, 2294, 2319, 2325, 2334, 2340, 2351,
-  2357, 2366, 2372, 2392, 2398, 2407, 2413, 2424,
-  2430, 2439, 2445, 2468, 2474, 2483, 2489, 2500,
-  2506, 2515, 2521, 2541, 2547, 2556, 2562, 2573,
-  2579, 2588, 2594, 2619, 2625, 2634, 2640, 2651,
-  2657, 2666, 2672, 2692, 2698, 2707, 2713, 2724,
-  2730, 2739, 2745, 2540, 2546, 2555, 2561, 2572,
-  2578, 2587, 2593, 2613, 2619, 2628, 2634, 2645,
-  2651, 2660, 2666, 2691, 2697, 2706, 2712, 2723,
-  2729, 2738, 2744, 2764, 2770, 2779, 2785, 2796,
-  2802, 2811, 2817, 2840, 2846, 2855, 2861, 2872,
-  2878, 2887, 2893, 2913, 2919, 2928, 2934, 2945,
-  2951, 2960, 2966, 2991, 2997, 3006, 3012, 3023,
-  3029, 3038, 3044, 3064, 3070, 3079, 3085, 3096,
-  3102, 3111, 3117, 2981, 2987, 2996, 3002, 3013,
-  3019, 3028, 3034, 3054, 3060, 3069, 3075, 3086,
-  3092, 3101, 3107, 3132, 3138, 3147, 3153, 3164,
-  3170, 3179, 3185, 3205, 3211, 3220, 3226, 3237,
-  3243, 3252, 3258, 3281, 3287, 3296, 3302, 3313,
-  3319, 3328, 3334, 3354, 3360, 3369, 3375, 3386,
-  3392, 3401, 3407, 3432, 3438, 3447, 3453, 3464,
-  3470, 3479, 3485, 3505, 3511, 3520, 3526, 3537,
-  3543, 3552, 3558, 2816, 2822, 2831, 2837, 2848,
-  2854, 2863, 2869, 2889, 2895, 2904, 2910, 2921,
-  2927, 2936, 2942, 2967, 2973, 2982, 2988, 2999,
-  3005, 3014, 3020, 3040, 3046, 3055, 3061, 3072,
-  3078, 3087, 3093, 3116, 3122, 3131, 3137, 3148,
-  3154, 3163, 3169, 3189, 3195, 3204, 3210, 3221,
-  3227, 3236, 3242, 3267, 3273, 3282, 3288, 3299,
-  3305, 3314, 3320, 3340, 3346, 3355, 3361, 3372,
-  3378, 3387, 3393, 3257, 3263, 3272, 3278, 3289,
-  3295, 3304, 3310, 3330, 3336, 3345, 3351, 3362,
-  3368, 3377, 3383, 3408, 3414, 3423, 3429, 3440,
-  3446, 3455, 3461, 3481, 3487, 3496, 3502, 3513,
-  3519, 3528, 3534, 3557, 3563, 3572, 3578, 3589,
-  3595, 3604, 3610, 3630, 3636, 3645, 3651, 3662,
-  3668, 3677, 3683, 3708, 3714, 3723, 3729, 3740,
-  3746, 3755, 3761, 3781, 3787, 3796, 3802, 3813,
-  3819, 3828, 3834, 3629, 3635, 3644, 3650, 3661,
-  3667, 3676, 3682, 3702, 3708, 3717, 3723, 3734,
-  3740, 3749, 3755, 3780, 3786, 3795, 3801, 3812,
-  3818, 3827, 3833, 3853, 3859, 3868, 3874, 3885,
-  3891, 3900, 3906, 3929, 3935, 3944, 3950, 3961,
-  3967, 3976, 3982, 4002, 4008, 4017, 4023, 4034,
-  4040, 4049, 4055, 4080, 4086, 4095, 4101, 4112,
-  4118, 4127, 4133, 4153, 4159, 4168, 4174, 4185,
-  4191, 4200, 4206, 4070, 4076, 4085, 4091, 4102,
-  4108, 4117, 4123, 4143, 4149, 4158, 4164, 4175,
-  4181, 4190, 4196, 4221, 4227, 4236, 4242, 4253,
-  4259, 4268, 4274, 4294, 4300, 4309, 4315, 4326,
-  4332, 4341, 4347, 4370, 4376, 4385, 4391, 4402,
-  4408, 4417, 4423, 4443, 4449, 4458, 4464, 4475,
-  4481, 4490, 4496, 4521, 4527, 4536, 4542, 4553,
-  4559, 4568, 4574, 4594, 4600, 4609, 4615, 4626,
-  4632, 4641, 4647, 3515, 3521, 3530, 3536, 3547,
-  3553, 3562, 3568, 3588, 3594, 3603, 3609, 3620,
-  3626, 3635, 3641, 3666, 3672, 3681, 3687, 3698,
-  3704, 3713, 3719, 3739, 3745, 3754, 3760, 3771,
-  3777, 3786, 3792, 3815, 3821, 3830, 3836, 3847,
-  3853, 3862, 3868, 3888, 3894, 3903, 3909, 3920,
-  3926, 3935, 3941, 3966, 3972, 3981, 3987, 3998,
-  4004, 4013, 4019, 4039, 4045, 4054, 4060, 4071,
-  4077, 4086, 4092, 3956, 3962, 3971, 3977, 3988,
-  3994, 4003, 4009, 4029, 4035, 4044, 4050, 4061,
-  4067, 4076, 4082, 4107, 4113, 4122, 4128, 4139,
-  4145, 4154, 4160, 4180, 4186, 4195, 4201, 4212,
-  4218, 4227, 4233, 4256, 4262, 4271, 4277, 4288,
-  4294, 4303, 4309, 4329, 4335, 4344, 4350, 4361,
-  4367, 4376, 4382, 4407, 4413, 4422, 4428, 4439,
-  4445, 4454, 4460, 4480, 4486, 4495, 4501, 4512,
-  4518, 4527, 4533, 4328, 4334, 4343, 4349, 4360,
-  4366, 4375, 4381, 4401, 4407, 4416, 4422, 4433,
-  4439, 4448, 4454, 4479, 4485, 4494, 4500, 4511,
-  4517, 4526, 4532, 4552, 4558, 4567, 4573, 4584,
-  4590, 4599, 4605, 4628, 4634, 4643, 4649, 4660,
-  4666, 4675, 4681, 4701, 4707, 4716, 4722, 4733,
-  4739, 4748, 4754, 4779, 4785, 4794, 4800, 4811,
-  4817, 4826, 4832, 4852, 4858, 4867, 4873, 4884,
-  4890, 4899, 4905, 4769, 4775, 4784, 4790, 4801,
-  4807, 4816, 4822, 4842, 4848, 4857, 4863, 4874,
-  4880, 4889, 4895, 4920, 4926, 4935, 4941, 4952,
-  4958, 4967, 4973, 4993, 4999, 5008, 5014, 5025,
-  5031, 5040, 5046, 5069, 5075, 5084, 5090, 5101,
-  5107, 5116, 5122, 5142, 5148, 5157, 5163, 5174,
-  5180, 5189, 5195, 5220, 5226, 5235, 5241, 5252,
-  5258, 5267, 5273, 5293, 5299, 5308, 5314, 5325,
-  5331, 5340, 5346, 4604, 4610, 4619, 4625, 4636,
-  4642, 4651, 4657, 4677, 4683, 4692, 4698, 4709,
-  4715, 4724, 4730, 4755, 4761, 4770, 4776, 4787,
-  4793, 4802, 4808, 4828, 4834, 4843, 4849, 4860,
-  4866, 4875, 4881, 4904, 4910, 4919, 4925, 4936,
-  4942, 4951, 4957, 4977, 4983, 4992, 4998, 5009,
-  5015, 5024, 5030, 5055, 5061, 5070, 5076, 5087,
-  5093, 5102, 5108, 5128, 5134, 5143, 5149, 5160,
-  5166, 5175, 5181, 5045, 5051, 5060, 5066, 5077,
-  5083, 5092, 5098, 5118, 5124, 5133, 5139, 5150,
-  5156, 5165, 5171, 5196, 5202, 5211, 5217, 5228,
-  5234, 5243, 5249, 5269, 5275, 5284, 5290, 5301,
-  5307, 5316, 5322, 5345, 5351, 5360, 5366, 5377,
-  5383, 5392, 5398, 5418, 5424, 5433, 5439, 5450,
-  5456, 5465, 5471, 5496, 5502, 5511, 5517, 5528,
-  5534, 5543, 5549, 5569, 5575, 5584, 5590, 5601,
-  5607, 5616, 5622, 5417, 5423, 5432, 5438, 5449,
-  5455, 5464, 5470, 5490, 5496, 5505, 5511, 5522,
-  5528, 5537, 5543, 5568, 5574, 5583, 5589, 5600,
-  5606, 5615, 5621, 5641, 5647, 5656, 5662, 5673,
-  5679, 5688, 5694, 5717, 5723, 5732, 5738, 5749,
-  5755, 5764, 5770, 5790, 5796, 5805, 5811, 5822,
-  5828, 5837, 5843, 5868, 5874, 5883, 5889, 5900,
-  5906, 5915, 5921, 5941, 5947, 5956, 5962, 5973,
-  5979, 5988, 5994, 5858, 5864, 5873, 5879, 5890,
-  5896, 5905, 5911, 5931, 5937, 5946, 5952, 5963,
-  5969, 5978, 5984, 6009, 6015, 6024, 6030, 6041,
-  6047, 6056, 6062, 6082, 6088, 6097, 6103, 6114,
-  6120, 6129, 6135, 6158, 6164, 6173, 6179, 6190,
-  6196, 6205, 6211, 6231, 6237, 6246, 6252, 6263,
-  6269, 6278, 6284, 6309, 6315, 6324, 6330, 6341,
-  6347, 6356, 6362, 6382, 6388, 6397, 6403, 6414,
-  6420, 6429, 6435, 3515, 3521, 3530, 3536, 3547,
-  3553, 3562, 3568, 3588, 3594, 3603, 3609, 3620,
-  3626, 3635, 3641, 3666, 3672, 3681, 3687, 3698,
-  3704, 3713, 3719, 3739, 3745, 3754, 3760, 3771,
-  3777, 3786, 3792, 3815, 3821, 3830, 3836, 3847,
-  3853, 3862, 3868, 3888, 3894, 3903, 3909, 3920,
-  3926, 3935, 3941, 3966, 3972, 3981, 3987, 3998,
-  4004, 4013, 4019, 4039, 4045, 4054, 4060, 4071,
-  4077, 4086, 4092, 3956, 3962, 3971, 3977, 3988,
-  3994, 4003, 4009, 4029, 4035, 4044, 4050, 4061,
-  4067, 4076, 4082, 4107, 4113, 4122, 4128, 4139,
-  4145, 4154, 4160, 4180, 4186, 4195, 4201, 4212,
-  4218, 4227, 4233, 4256, 4262, 4271, 4277, 4288,
-  4294, 4303, 4309, 4329, 4335, 4344, 4350, 4361,
-  4367, 4376, 4382, 4407, 4413, 4422, 4428, 4439,
-  4445, 4454, 4460, 4480, 4486, 4495, 4501, 4512,
-  4518, 4527, 4533, 4328, 4334, 4343, 4349, 4360,
-  4366, 4375, 4381, 4401, 4407, 4416, 4422, 4433,
-  4439, 4448, 4454, 4479, 4485, 4494, 4500, 4511,
-  4517, 4526, 4532, 4552, 4558, 4567, 4573, 4584,
-  4590, 4599, 4605, 4628, 4634, 4643, 4649, 4660,
-  4666, 4675, 4681, 4701, 4707, 4716, 4722, 4733,
-  4739, 4748, 4754, 4779, 4785, 4794, 4800, 4811,
-  4817, 4826, 4832, 4852, 4858, 4867, 4873, 4884,
-  4890, 4899, 4905, 4769, 4775, 4784, 4790, 4801,
-  4807, 4816, 4822, 4842, 4848, 4857, 4863, 4874,
-  4880, 4889, 4895, 4920, 4926, 4935, 4941, 4952,
-  4958, 4967, 4973, 4993, 4999, 5008, 5014, 5025,
-  5031, 5040, 5046, 5069, 5075, 5084, 5090, 5101,
-  5107, 5116, 5122, 5142, 5148, 5157, 5163, 5174,
-  5180, 5189, 5195, 5220, 5226, 5235, 5241, 5252,
-  5258, 5267, 5273, 5293, 5299, 5308, 5314, 5325,
-  5331, 5340, 5346, 4604, 4610, 4619, 4625, 4636,
-  4642, 4651, 4657, 4677, 4683, 4692, 4698, 4709,
-  4715, 4724, 4730, 4755, 4761, 4770, 4776, 4787,
-  4793, 4802, 4808, 4828, 4834, 4843, 4849, 4860,
-  4866, 4875, 4881, 4904, 4910, 4919, 4925, 4936,
-  4942, 4951, 4957, 4977, 4983, 4992, 4998, 5009,
-  5015, 5024, 5030, 5055, 5061, 5070, 5076, 5087,
-  5093, 5102, 5108, 5128, 5134, 5143, 5149, 5160,
-  5166, 5175, 5181, 5045, 5051, 5060, 5066, 5077,
-  5083, 5092, 5098, 5118, 5124, 5133, 5139, 5150,
-  5156, 5165, 5171, 5196, 5202, 5211, 5217, 5228,
-  5234, 5243, 5249, 5269, 5275, 5284, 5290, 5301,
-  5307, 5316, 5322, 5345, 5351, 5360, 5366, 5377,
-  5383, 5392, 5398, 5418, 5424, 5433, 5439, 5450,
-  5456, 5465, 5471, 5496, 5502, 5511, 5517, 5528,
-  5534, 5543, 5549, 5569, 5575, 5584, 5590, 5601,
-  5607, 5616, 5622, 5417, 5423, 5432, 5438, 5449,
-  5455, 5464, 5470, 5490, 5496, 5505, 5511, 5522,
-  5528, 5537, 5543, 5568, 5574, 5583, 5589, 5600,
-  5606, 5615, 5621, 5641, 5647, 5656, 5662, 5673,
-  5679, 5688, 5694, 5717, 5723, 5732, 5738, 5749,
-  5755, 5764, 5770, 5790, 5796, 5805, 5811, 5822,
-  5828, 5837, 5843, 5868, 5874, 5883, 5889, 5900,
-  5906, 5915, 5921, 5941, 5947, 5956, 5962, 5973,
-  5979, 5988, 5994, 5858, 5864, 5873, 5879, 5890,
-  5896, 5905, 5911, 5931, 5937, 5946, 5952, 5963,
-  5969, 5978, 5984, 6009, 6015, 6024, 6030, 6041,
-  6047, 6056, 6062, 6082, 6088, 6097, 6103, 6114,
-  6120, 6129, 6135, 6158, 6164, 6173, 6179, 6190,
-  6196, 6205, 6211, 6231, 6237, 6246, 6252, 6263,
-  6269, 6278, 6284, 6309, 6315, 6324, 6330, 6341,
-  6347, 6356, 6362, 6382, 6388, 6397, 6403, 6414,
-  6420, 6429, 6435, 5303, 5309, 5318, 5324, 5335,
-  5341, 5350, 5356, 5376, 5382, 5391, 5397, 5408,
-  5414, 5423, 5429, 5454, 5460, 5469, 5475, 5486,
-  5492, 5501, 5507, 5527, 5533, 5542, 5548, 5559,
-  5565, 5574, 5580, 5603, 5609, 5618, 5624, 5635,
-  5641, 5650, 5656, 5676, 5682, 5691, 5697, 5708,
-  5714, 5723, 5729, 5754, 5760, 5769, 5775, 5786,
-  5792, 5801, 5807, 5827, 5833, 5842, 5848, 5859,
-  5865, 5874, 5880, 5744, 5750, 5759, 5765, 5776,
-  5782, 5791, 5797, 5817, 5823, 5832, 5838, 5849,
-  5855, 5864, 5870, 5895, 5901, 5910, 5916, 5927,
-  5933, 5942, 5948, 5968, 5974, 5983, 5989, 6000,
-  6006, 6015, 6021, 6044, 6050, 6059, 6065, 6076,
-  6082, 6091, 6097, 6117, 6123, 6132, 6138, 6149,
-  6155, 6164, 6170, 6195, 6201, 6210, 6216, 6227,
-  6233, 6242, 6248, 6268, 6274, 6283, 6289, 6300,
-  6306, 6315, 6321, 6116, 6122, 6131, 6137, 6148,
-  6154, 6163, 6169, 6189, 6195, 6204, 6210, 6221,
-  6227, 6236, 6242, 6267, 6273, 6282, 6288, 6299,
-  6305, 6314, 6320, 6340, 6346, 6355, 6361, 6372,
-  6378, 6387, 6393, 6416, 6422, 6431, 6437, 6448,
-  6454, 6463, 6469, 6489, 6495, 6504, 6510, 6521,
-  6527, 6536, 6542, 6567, 6573, 6582, 6588, 6599,
-  6605, 6614, 6620, 6640, 6646, 6655, 6661, 6672,
-  6678, 6687, 6693, 6557, 6563, 6572, 6578, 6589,
-  6595, 6604, 6610, 6630, 6636, 6645, 6651, 6662,
-  6668, 6677, 6683, 6708, 6714, 6723, 6729, 6740,
-  6746, 6755, 6761, 6781, 6787, 6796, 6802, 6813,
-  6819, 6828, 6834, 6857, 6863, 6872, 6878, 6889,
-  6895, 6904, 6910, 6930, 6936, 6945, 6951, 6962,
-  6968, 6977, 6983, 7008, 7014, 7023, 7029, 7040,
-  7046, 7055, 7061, 7081, 7087, 7096, 7102, 7113,
-  7119, 7128, 7134, 6392, 6398, 6407, 6413, 6424,
-  6430, 6439, 6445, 6465, 6471, 6480, 6486, 6497,
-  6503, 6512, 6518, 6543, 6549, 6558, 6564, 6575,
-  6581, 6590, 6596, 6616, 6622, 6631, 6637, 6648,
-  6654, 6663, 6669, 6692, 6698, 6707, 6713, 6724,
-  6730, 6739, 6745, 6765, 6771, 6780, 6786, 6797,
-  6803, 6812, 6818, 6843, 6849, 6858, 6864, 6875,
-  6881, 6890, 6896, 6916, 6922, 6931, 6937, 6948,
-  6954, 6963, 6969, 6833, 6839, 6848, 6854, 6865,
-  6871, 6880, 6886, 6906, 6912, 6921, 6927, 6938,
-  6944, 6953, 6959, 6984, 6990, 6999, 7005, 7016,
-  7022, 7031, 7037, 7057, 7063, 7072, 7078, 7089,
-  7095, 7104, 7110, 7133, 7139, 7148, 7154, 7165,
-  7171, 7180, 7186, 7206, 7212, 7221, 7227, 7238,
-  7244, 7253, 7259, 7284, 7290, 7299, 7305, 7316,
-  7322, 7331, 7337, 7357, 7363, 7372, 7378, 7389,
-  7395, 7404, 7410, 7205, 7211, 7220, 7226, 7237,
-  7243, 7252, 7258, 7278, 7284, 7293, 7299, 7310,
-  7316, 7325, 7331, 7356, 7362, 7371, 7377, 7388,
-  7394, 7403, 7409, 7429, 7435, 7444, 7450, 7461,
-  7467, 7476, 7482, 7505, 7511, 7520, 7526, 7537,
-  7543, 7552, 7558, 7578, 7584, 7593, 7599, 7610,
-  7616, 7625, 7631, 7656, 7662, 7671, 7677, 7688,
-  7694, 7703, 7709, 7729, 7735, 7744, 7750, 7761
-};
-
 static int VariableLevelCost(int level, const uint8_t probas[NUM_PROBAS]) {
   int pattern = VP8LevelCodes[level - 1][0];
   int bits = VP8LevelCodes[level - 1][1];
@@ -350,12 +57,13 @@ static int VariableLevelCost(int level, const uint8_t probas[NUM_PROBAS]) {
 //------------------------------------------------------------------------------
 // Pre-calc level costs once for all
 
-void VP8CalculateLevelCosts(VP8Proba* const proba) {
+void VP8CalculateLevelCosts(VP8EncProba* const proba) {
   int ctype, band, ctx;
 
   if (!proba->dirty_) return;  // nothing to do.
 
   for (ctype = 0; ctype < NUM_TYPES; ++ctype) {
+    int n;
     for (band = 0; band < NUM_BANDS; ++band) {
       for (ctx = 0; ctx < NUM_CTX; ++ctx) {
         const uint8_t* const p = proba->coeffs_[ctype][band][ctx];
@@ -371,6 +79,12 @@ void VP8CalculateLevelCosts(VP8Proba* const proba) {
         // actually constant.
       }
     }
+    for (n = 0; n < 16; ++n) {    // replicate bands. We don't need to sentinel.
+      for (ctx = 0; ctx < NUM_CTX; ++ctx) {
+        proba->remapped_costs_[ctype][n][ctx] =
+            proba->level_cost_[ctype][VP8EncBands[n]][ctx];
+      }
+    }
   }
   proba->dirty_ = 0;
 }
@@ -487,66 +201,6 @@ const uint16_t VP8FixedCostsI4[NUM_BMODES][NUM_BMODES][NUM_BMODES] = {
 };
 
 //------------------------------------------------------------------------------
-// Mode costs
-
-static int GetResidualCost(int ctx0, const VP8Residual* const res) {
-  int n = res->first;
-  // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
-  const int p0 = res->prob[n][ctx0][0];
-  const uint16_t* t = res->cost[n][ctx0];
-  // bit_cost(1, p0) is already incorporated in t[] tables, but only if ctx != 0
-  // (as required by the syntax). For ctx0 == 0, we need to add it here or it'll
-  // be missing during the loop.
-  int cost = (ctx0 == 0) ? VP8BitCost(1, p0) : 0;
-
-  if (res->last < 0) {
-    return VP8BitCost(0, p0);
-  }
-  for (; n < res->last; ++n) {
-    const int v = abs(res->coeffs[n]);
-    const int b = VP8EncBands[n + 1];
-    const int ctx = (v >= 2) ? 2 : v;
-    cost += VP8LevelCost(t, v);
-    t = res->cost[b][ctx];
-  }
-  // Last coefficient is always non-zero
-  {
-    const int v = abs(res->coeffs[n]);
-    assert(v != 0);
-    cost += VP8LevelCost(t, v);
-    if (n < 15) {
-      const int b = VP8EncBands[n + 1];
-      const int ctx = (v == 1) ? 1 : 2;
-      const int last_p0 = res->prob[b][ctx][0];
-      cost += VP8BitCost(0, last_p0);
-    }
-  }
-  return cost;
-}
-
-//------------------------------------------------------------------------------
-// init function
-
-#if defined(WEBP_USE_MIPS32)
-extern int VP8GetResidualCostMIPS32(int ctx0, const VP8Residual* const res);
-#endif  // WEBP_USE_MIPS32
-
-// TODO(skal): this, and GetResidualCost(), should probably go somewhere
-// under src/dsp/ at some point.
-VP8GetResidualCostFunc VP8GetResidualCost;
-
-void VP8GetResidualCostInit(void) {
-  VP8GetResidualCost = GetResidualCost;
-  if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_MIPS32)
-    if (VP8GetCPUInfo(kMIPS32)) {
-      VP8GetResidualCost = VP8GetResidualCostMIPS32;
-    }
-#endif
-  }
-}
-
-//------------------------------------------------------------------------------
 // helper functions for residuals struct VP8Residual.
 
 void VP8InitResidual(int first, int coeff_type,
@@ -554,45 +208,10 @@ void VP8InitResidual(int first, int coeff_type,
   res->coeff_type = coeff_type;
   res->prob  = enc->proba_.coeffs_[coeff_type];
   res->stats = enc->proba_.stats_[coeff_type];
-  res->cost  = enc->proba_.level_cost_[coeff_type];
+  res->costs = enc->proba_.remapped_costs_[coeff_type];
   res->first = first;
 }
 
-static void SetResidualCoeffs(const int16_t* const coeffs,
-                              VP8Residual* const res) {
-  int n;
-  res->last = -1;
-  assert(res->first == 0 || coeffs[0] == 0);
-  for (n = 15; n >= 0; --n) {
-    if (coeffs[n]) {
-      res->last = n;
-      break;
-    }
-  }
-  res->coeffs = coeffs;
-}
-
-//------------------------------------------------------------------------------
-// init function
-
-#if defined(WEBP_USE_SSE2)
-extern void VP8SetResidualCoeffsSSE2(const int16_t* const coeffs,
-                                     VP8Residual* const res);
-#endif  // WEBP_USE_SSE2
-
-VP8SetResidualCoeffsFunc VP8SetResidualCoeffs;
-
-void VP8SetResidualCoeffsInit(void) {
-  VP8SetResidualCoeffs = SetResidualCoeffs;
-  if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
-    if (VP8GetCPUInfo(kSSE2)) {
-      VP8SetResidualCoeffs = VP8SetResidualCoeffsSSE2;
-    }
-#endif
-  }
-}
-
 //------------------------------------------------------------------------------
 // Mode costs
 
diff --git a/src/3rdparty/libwebp/src/enc/cost.h b/src/3rdparty/libwebp/src/enc/cost.h
index 4e55895..20960d6 100644
--- a/src/3rdparty/libwebp/src/enc/cost.h
+++ b/src/3rdparty/libwebp/src/enc/cost.h
@@ -24,46 +24,31 @@ extern "C" {
 
 // On-the-fly info about the current set of residuals. Handy to avoid
 // passing zillions of params.
-typedef struct {
+typedef struct VP8Residual VP8Residual;
+struct VP8Residual {
   int first;
   int last;
   const int16_t* coeffs;
 
   int coeff_type;
-  ProbaArray* prob;
-  StatsArray* stats;
-  CostArray*  cost;
-} VP8Residual;
+  ProbaArray*   prob;
+  StatsArray*   stats;
+  CostArrayPtr  costs;
+};
 
 void VP8InitResidual(int first, int coeff_type,
                      VP8Encoder* const enc, VP8Residual* const res);
 
-typedef void (*VP8SetResidualCoeffsFunc)(const int16_t* const coeffs,
-                                         VP8Residual* const res);
-extern VP8SetResidualCoeffsFunc VP8SetResidualCoeffs;
-
-void VP8SetResidualCoeffsInit(void);  // must be called first
-
 int VP8RecordCoeffs(int ctx, const VP8Residual* const res);
 
-// approximate cost per level:
-extern const uint16_t VP8LevelFixedCosts[MAX_LEVEL + 1];
-extern const uint16_t VP8EntropyCost[256];        // 8bit fixed-point log(p)
-
 // Cost of coding one event with probability 'proba'.
 static WEBP_INLINE int VP8BitCost(int bit, uint8_t proba) {
   return !bit ? VP8EntropyCost[proba] : VP8EntropyCost[255 - proba];
 }
 
-// Cost calculation function.
-typedef int (*VP8GetResidualCostFunc)(int ctx0, const VP8Residual* const res);
-extern VP8GetResidualCostFunc VP8GetResidualCost;
-
-void VP8GetResidualCostInit(void);  // must be called first
-
 // Level cost calculations
 extern const uint16_t VP8LevelCodes[MAX_VARIABLE_LEVEL][2];
-void VP8CalculateLevelCosts(VP8Proba* const proba);
+void VP8CalculateLevelCosts(VP8EncProba* const proba);
 static WEBP_INLINE int VP8LevelCost(const uint16_t* const table, int level) {
   return VP8LevelFixedCosts[level]
        + table[(level > MAX_VARIABLE_LEVEL) ? MAX_VARIABLE_LEVEL : level];
diff --git a/src/3rdparty/libwebp/src/enc/delta_palettization.c b/src/3rdparty/libwebp/src/enc/delta_palettization.c
new file mode 100644
index 0000000..062e588
--- /dev/null
+++ b/src/3rdparty/libwebp/src/enc/delta_palettization.c
@@ -0,0 +1,455 @@
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Author: Mislav Bradac (mislavm@google.com)
+//
+
+#include "./delta_palettization.h"
+
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+#include "../webp/types.h"
+#include "../dsp/lossless.h"
+// Packs 8-bit r, g, b into one 0x00RRGGBB value (no alpha bits are set).
+#define MK_COL(r, g, b) (((r) << 16) + ((g) << 8) + (b))
+
+// The format allows palettes of up to 256 entries, but more entries produce
+// bigger entropy. In the future it will probably be useful to add more entries
+// that are far from the origin of the palette, or to choose the remaining
+// entries dynamically.
+#define DELTA_PALETTE_SIZE 226
+
+// Palette used for delta_palettization. Entries are roughly sorted by distance
+// of their signed equivalents from the origin.
+static const uint32_t kDeltaPalette[DELTA_PALETTE_SIZE] = {
+  MK_COL(0u, 0u, 0u),
+  MK_COL(255u, 255u, 255u),
+  MK_COL(1u, 1u, 1u),
+  MK_COL(254u, 254u, 254u),
+  MK_COL(2u, 2u, 2u),
+  MK_COL(4u, 4u, 4u),
+  MK_COL(252u, 252u, 252u),
+  MK_COL(250u, 0u, 0u),
+  MK_COL(0u, 250u, 0u),
+  MK_COL(0u, 0u, 250u),
+  MK_COL(6u, 0u, 0u),
+  MK_COL(0u, 6u, 0u),
+  MK_COL(0u, 0u, 6u),
+  MK_COL(0u, 0u, 248u),
+  MK_COL(0u, 0u, 8u),
+  MK_COL(0u, 248u, 0u),
+  MK_COL(0u, 248u, 248u),
+  MK_COL(0u, 248u, 8u),
+  MK_COL(0u, 8u, 0u),
+  MK_COL(0u, 8u, 248u),
+  MK_COL(0u, 8u, 8u),
+  MK_COL(8u, 8u, 8u),
+  MK_COL(248u, 0u, 0u),
+  MK_COL(248u, 0u, 248u),
+  MK_COL(248u, 0u, 8u),
+  MK_COL(248u, 248u, 0u),
+  MK_COL(248u, 8u, 0u),
+  MK_COL(8u, 0u, 0u),
+  MK_COL(8u, 0u, 248u),
+  MK_COL(8u, 0u, 8u),
+  MK_COL(8u, 248u, 0u),
+  MK_COL(8u, 8u, 0u),
+  MK_COL(23u, 23u, 23u),
+  MK_COL(13u, 13u, 13u),
+  MK_COL(232u, 232u, 232u),
+  MK_COL(244u, 244u, 244u),
+  MK_COL(245u, 245u, 250u),
+  MK_COL(50u, 50u, 50u),
+  MK_COL(204u, 204u, 204u),
+  MK_COL(236u, 236u, 236u),
+  MK_COL(16u, 16u, 16u),
+  MK_COL(240u, 16u, 16u),
+  MK_COL(16u, 240u, 16u),
+  MK_COL(240u, 240u, 16u),
+  MK_COL(16u, 16u, 240u),
+  MK_COL(240u, 16u, 240u),
+  MK_COL(16u, 240u, 240u),
+  MK_COL(240u, 240u, 240u),
+  MK_COL(0u, 0u, 232u),
+  MK_COL(0u, 232u, 0u),
+  MK_COL(232u, 0u, 0u),
+  MK_COL(0u, 0u, 24u),
+  MK_COL(0u, 24u, 0u),
+  MK_COL(24u, 0u, 0u),
+  MK_COL(32u, 32u, 32u),
+  MK_COL(224u, 32u, 32u),
+  MK_COL(32u, 224u, 32u),
+  MK_COL(224u, 224u, 32u),
+  MK_COL(32u, 32u, 224u),
+  MK_COL(224u, 32u, 224u),
+  MK_COL(32u, 224u, 224u),
+  MK_COL(224u, 224u, 224u),
+  MK_COL(0u, 0u, 176u),
+  MK_COL(0u, 0u, 80u),
+  MK_COL(0u, 176u, 0u),
+  MK_COL(0u, 176u, 176u),
+  MK_COL(0u, 176u, 80u),
+  MK_COL(0u, 80u, 0u),
+  MK_COL(0u, 80u, 176u),
+  MK_COL(0u, 80u, 80u),
+  MK_COL(176u, 0u, 0u),
+  MK_COL(176u, 0u, 176u),
+  MK_COL(176u, 0u, 80u),
+  MK_COL(176u, 176u, 0u),
+  MK_COL(176u, 80u, 0u),
+  MK_COL(80u, 0u, 0u),
+  MK_COL(80u, 0u, 176u),
+  MK_COL(80u, 0u, 80u),
+  MK_COL(80u, 176u, 0u),
+  MK_COL(80u, 80u, 0u),
+  MK_COL(0u, 0u, 152u),
+  MK_COL(0u, 0u, 104u),
+  MK_COL(0u, 152u, 0u),
+  MK_COL(0u, 152u, 152u),
+  MK_COL(0u, 152u, 104u),
+  MK_COL(0u, 104u, 0u),
+  MK_COL(0u, 104u, 152u),
+  MK_COL(0u, 104u, 104u),
+  MK_COL(152u, 0u, 0u),
+  MK_COL(152u, 0u, 152u),
+  MK_COL(152u, 0u, 104u),
+  MK_COL(152u, 152u, 0u),
+  MK_COL(152u, 104u, 0u),
+  MK_COL(104u, 0u, 0u),
+  MK_COL(104u, 0u, 152u),
+  MK_COL(104u, 0u, 104u),
+  MK_COL(104u, 152u, 0u),
+  MK_COL(104u, 104u, 0u),
+  MK_COL(216u, 216u, 216u),
+  MK_COL(216u, 216u, 40u),
+  MK_COL(216u, 216u, 176u),
+  MK_COL(216u, 216u, 80u),
+  MK_COL(216u, 40u, 216u),
+  MK_COL(216u, 40u, 40u),
+  MK_COL(216u, 40u, 176u),
+  MK_COL(216u, 40u, 80u),
+  MK_COL(216u, 176u, 216u),
+  MK_COL(216u, 176u, 40u),
+  MK_COL(216u, 176u, 176u),
+  MK_COL(216u, 176u, 80u),
+  MK_COL(216u, 80u, 216u),
+  MK_COL(216u, 80u, 40u),
+  MK_COL(216u, 80u, 176u),
+  MK_COL(216u, 80u, 80u),
+  MK_COL(40u, 216u, 216u),
+  MK_COL(40u, 216u, 40u),
+  MK_COL(40u, 216u, 176u),
+  MK_COL(40u, 216u, 80u),
+  MK_COL(40u, 40u, 216u),
+  MK_COL(40u, 40u, 40u),
+  MK_COL(40u, 40u, 176u),
+  MK_COL(40u, 40u, 80u),
+  MK_COL(40u, 176u, 216u),
+  MK_COL(40u, 176u, 40u),
+  MK_COL(40u, 176u, 176u),
+  MK_COL(40u, 176u, 80u),
+  MK_COL(40u, 80u, 216u),
+  MK_COL(40u, 80u, 40u),
+  MK_COL(40u, 80u, 176u),
+  MK_COL(40u, 80u, 80u),
+  MK_COL(80u, 216u, 216u),
+  MK_COL(80u, 216u, 40u),
+  MK_COL(80u, 216u, 176u),
+  MK_COL(80u, 216u, 80u),
+  MK_COL(80u, 40u, 216u),
+  MK_COL(80u, 40u, 40u),
+  MK_COL(80u, 40u, 176u),
+  MK_COL(80u, 40u, 80u),
+  MK_COL(80u, 176u, 216u),
+  MK_COL(80u, 176u, 40u),
+  MK_COL(80u, 176u, 176u),
+  MK_COL(80u, 176u, 80u),
+  MK_COL(80u, 80u, 216u),
+  MK_COL(80u, 80u, 40u),
+  MK_COL(80u, 80u, 176u),
+  MK_COL(80u, 80u, 80u),
+  MK_COL(0u, 0u, 192u),
+  MK_COL(0u, 0u, 64u),
+  MK_COL(0u, 0u, 128u),
+  MK_COL(0u, 192u, 0u),
+  MK_COL(0u, 192u, 192u),
+  MK_COL(0u, 192u, 64u),
+  MK_COL(0u, 192u, 128u),
+  MK_COL(0u, 64u, 0u),
+  MK_COL(0u, 64u, 192u),
+  MK_COL(0u, 64u, 64u),
+  MK_COL(0u, 64u, 128u),
+  MK_COL(0u, 128u, 0u),
+  MK_COL(0u, 128u, 192u),
+  MK_COL(0u, 128u, 64u),
+  MK_COL(0u, 128u, 128u),
+  MK_COL(176u, 216u, 216u),
+  MK_COL(176u, 216u, 40u),
+  MK_COL(176u, 216u, 176u),
+  MK_COL(176u, 216u, 80u),
+  MK_COL(176u, 40u, 216u),
+  MK_COL(176u, 40u, 40u),
+  MK_COL(176u, 40u, 176u),
+  MK_COL(176u, 40u, 80u),
+  MK_COL(176u, 176u, 216u),
+  MK_COL(176u, 176u, 40u),
+  MK_COL(176u, 176u, 176u),
+  MK_COL(176u, 176u, 80u),
+  MK_COL(176u, 80u, 216u),
+  MK_COL(176u, 80u, 40u),
+  MK_COL(176u, 80u, 176u),
+  MK_COL(176u, 80u, 80u),
+  MK_COL(192u, 0u, 0u),
+  MK_COL(192u, 0u, 192u),
+  MK_COL(192u, 0u, 64u),
+  MK_COL(192u, 0u, 128u),
+  MK_COL(192u, 192u, 0u),
+  MK_COL(192u, 192u, 192u),
+  MK_COL(192u, 192u, 64u),
+  MK_COL(192u, 192u, 128u),
+  MK_COL(192u, 64u, 0u),
+  MK_COL(192u, 64u, 192u),
+  MK_COL(192u, 64u, 64u),
+  MK_COL(192u, 64u, 128u),
+  MK_COL(192u, 128u, 0u),
+  MK_COL(192u, 128u, 192u),
+  MK_COL(192u, 128u, 64u),
+  MK_COL(192u, 128u, 128u),
+  MK_COL(64u, 0u, 0u),
+  MK_COL(64u, 0u, 192u),
+  MK_COL(64u, 0u, 64u),
+  MK_COL(64u, 0u, 128u),
+  MK_COL(64u, 192u, 0u),
+  MK_COL(64u, 192u, 192u),
+  MK_COL(64u, 192u, 64u),
+  MK_COL(64u, 192u, 128u),
+  MK_COL(64u, 64u, 0u),
+  MK_COL(64u, 64u, 192u),
+  MK_COL(64u, 64u, 64u),
+  MK_COL(64u, 64u, 128u),
+  MK_COL(64u, 128u, 0u),
+  MK_COL(64u, 128u, 192u),
+  MK_COL(64u, 128u, 64u),
+  MK_COL(64u, 128u, 128u),
+  MK_COL(128u, 0u, 0u),
+  MK_COL(128u, 0u, 192u),
+  MK_COL(128u, 0u, 64u),
+  MK_COL(128u, 0u, 128u),
+  MK_COL(128u, 192u, 0u),
+  MK_COL(128u, 192u, 192u),
+  MK_COL(128u, 192u, 64u),
+  MK_COL(128u, 192u, 128u),
+  MK_COL(128u, 64u, 0u),
+  MK_COL(128u, 64u, 192u),
+  MK_COL(128u, 64u, 64u),
+  MK_COL(128u, 64u, 128u),
+  MK_COL(128u, 128u, 0u),
+  MK_COL(128u, 128u, 192u),
+  MK_COL(128u, 128u, 64u),
+  MK_COL(128u, 128u, 128u),
+};
+
+#undef MK_COL
+
+//------------------------------------------------------------------------------
+// TODO(skal): move the functions to dsp/lossless.c when the correct
+// granularity is found. For now, we'll just copy-paste some useful bits
+// here instead.
+
+// Adds 'b' to '*a' channel by channel; each 8-bit lane wraps modulo 256.
+static WEBP_INLINE void AddPixelsEq(uint32_t* a, uint32_t b) {
+  const uint32_t rb_sum = (b & 0x00ff00ffu) + (*a & 0x00ff00ffu);
+  const uint32_t ag_sum = (b & 0xff00ff00u) + (*a & 0xff00ff00u);
+  *a = (rb_sum & 0x00ff00ffu) | (ag_sum & 0xff00ff00u);
+}
+
+// Clips 'a' to [0, 255], where 'a' is a signed quantity carried in a uint32_t.
+// Values below 256 pass through. Otherwise ~a >> 24 yields 0 when the top byte
+// of 'a' is 0xff (a wrapped negative) and 255 when that byte is zero.
+static WEBP_INLINE uint32_t Clip255(uint32_t a) {
+  const int in_range = (a < 256);
+  if (in_range) return a;
+  return ~a >> 24;
+}
+
+// Delta palettization functions.
+// Returns x * x, computed in plain int arithmetic; callers pass small
+// differences (see CalcDist).
+static WEBP_INLINE int Square(int x) { return x * x; }
+
+// Weighted sum 30*R + 59*G + 11*B of the color channels; alpha is ignored.
+// The weights add up to 100, i.e. the result is 100 times the intensity.
+static WEBP_INLINE uint32_t Intensity(uint32_t a) {
+  const uint32_t r = (a >> 16) & 0xff, g = (a >> 8) & 0xff, b = a & 0xff;
+  return 30 * r + 59 * g + 11 * b;
+}
+
+// Squared error between 'actual_value' and 'predicted_value + palette_entry'.
+static uint32_t CalcDist(uint32_t predicted_value, uint32_t actual_value,
+                         uint32_t palette_entry) {
+  int shift;
+  uint32_t distance = 0;
+  AddPixelsEq(&predicted_value, palette_entry);
+  for (shift = 0; shift < 32; shift += 8) {
+    const int32_t av = (actual_value >> shift) & 0xff;
+    const int32_t pv = (predicted_value >> shift) & 0xff;
+    distance += Square(pv - av);
+  }
+  // Colors get weight 1000: Intensity() is 100x the real intensity and its
+  // squared difference counts 10x. Signed casts keep a negative diff portable.
+  distance *= 1000u;
+  distance += Square((int)Intensity(predicted_value)
+                     - (int)Intensity(actual_value));
+  return distance;
+}
+
+// Predicts a pixel as the per-channel average of image[x] and image[x - 1].
+static uint32_t Predict(int x, int y, uint32_t* image) {
+  uint32_t top, left, avg = 0;
+  int shift;
+  if (x == 0) return (y == 0) ? ARGB_BLACK : image[0];
+  if (y == 0) return image[x - 1];
+  top = image[x];
+  left = image[x - 1];
+  for (shift = 0; shift < 32; shift += 8) {
+    avg += ((((top >> shift) & 0xff) + ((left >> shift) & 0xff)) / 2) << shift;
+  }
+  return avg;
+}
+
+// Returns a + ((b - c) >> 2), saturated to 8 bits by Clip255().
+static WEBP_INLINE int AddSubtractComponentFullWithCoefficient(
+    int a, int b, int c) {
+  return Clip255(a + ((b - c) >> 2));
+}
+
+// Applies AddSubtractComponentFullWithCoefficient() to each of the four 8-bit
+// channels of c0, c1 and c2, and repacks the results into one ARGB value.
+static WEBP_INLINE uint32_t ClampedAddSubtractFullWithCoefficient(
+    uint32_t c0, uint32_t c1, uint32_t c2) {
+  uint32_t out = 0;
+  int shift;
+  for (shift = 24; shift >= 0; shift -= 8) {
+    const int v = AddSubtractComponentFullWithCoefficient(
+        (c0 >> shift) & 0xff, (c1 >> shift) & 0xff, (c2 >> shift) & 0xff);
+    out |= (uint32_t)v << shift;
+  }
+  return out;
+}
+
+//------------------------------------------------------------------------------
+
+// Returns the index of the entry in 'palette' which, once added to
+// 'predicted_value', lands closest to 'src' according to CalcDist().
+// Error propagation and the write to the reconstructed row are done
+// separately, in ApplyBestPaletteEntry().
+static int FindBestPaletteEntry(uint32_t src, uint32_t predicted_value,
+                                const uint32_t palette[], int palette_size) {
+  int best_idx = 0;
+  int k = 0;
+  uint32_t lowest = CalcDist(predicted_value, src, palette[0]);
+  while (++k < palette_size) {
+    const uint32_t d = CalcDist(predicted_value, src, palette[k]);
+    // Only a strictly smaller distance replaces the best: earlier entries win.
+    if (d >= lowest) continue;
+    lowest = d;
+    best_idx = k;
+  }
+  return best_idx;
+}
+
+// Stores new_value + palette_value (mod 256 per channel) in new_image[x], and
+// shifts the left / top pixels of 'src' by a clipped (that value - src[x]) / 4.
+static void ApplyBestPaletteEntry(int x, int y, uint32_t new_value,
+                                  uint32_t palette_value, uint32_t* src,
+                                  int src_stride, uint32_t* new_image) {
+  AddPixelsEq(&new_value, palette_value);
+  if (x > 0) {
+    uint32_t* const left = &src[x - 1];
+    *left = ClampedAddSubtractFullWithCoefficient(*left, new_value, src[x]);
+  }
+  if (y > 0) {
+    uint32_t* const top = &src[x - src_stride];
+    *top = ClampedAddSubtractFullWithCoefficient(*top, new_value, src[x]);
+  }
+  new_image[x] = new_value;
+}
+
+//------------------------------------------------------------------------------
+// Main entry point
+
+// Each pass maps every pixel of 'src' to its best 'palette' entry and stores
+// it in 'dst'; 'src' is altered by error propagation. Fails only on OOM.
+static WebPEncodingError ApplyDeltaPalette(uint32_t* src, uint32_t* dst,
+                                           uint32_t src_stride,
+                                           uint32_t dst_stride,
+                                           const uint32_t* palette,
+                                           int palette_size, int width,
+                                           int height, int num_passes) {
+  int x, y, pass;
+  WebPEncodingError err = VP8_ENC_OK;
+  uint32_t* new_image = (uint32_t*)WebPSafeMalloc(width, sizeof(*new_image));
+  uint8_t* const tmp_row = (uint8_t*)WebPSafeMalloc(width, sizeof(*tmp_row));
+  if (new_image == NULL || tmp_row == NULL) {
+    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+    goto Error;
+  }
+  for (pass = 0; pass < num_passes; ++pass) {  // no pass if num_passes <= 0
+    uint32_t* cur_src = src;
+    uint32_t* cur_dst = dst;
+    for (y = 0; y < height; ++y) {
+      for (x = 0; x < width; ++x) {
+        const uint32_t predicted_value = Predict(x, y, new_image);
+        tmp_row[x] = FindBestPaletteEntry(cur_src[x], predicted_value,
+                                          palette, palette_size);
+        ApplyBestPaletteEntry(x, y, predicted_value, palette[tmp_row[x]],
+                              cur_src, src_stride, new_image);
+      }
+      for (x = 0; x < width; ++x) {
+        cur_dst[x] = palette[tmp_row[x]];
+      }
+      cur_src += src_stride;
+      cur_dst += dst_stride;
+    }
+  }
+ Error:
+  WebPSafeFree(new_image);
+  WebPSafeFree(tmp_row);
+  return err;
+}
+
+// Loads the fixed kDeltaPalette into enc->palette_[] and runs two
+// delta-palettization passes that fill enc->argb_ with palette values.
+// The pixels of enc->pic_->argb are modified by the error propagation.
+// Returns VP8_ENC_OK, or the error reported by ApplyDeltaPalette().
+WebPEncodingError WebPSearchOptimalDeltaPalette(VP8LEncoder* const enc) {
+  const WebPPicture* const picture = enc->pic_;
+  uint32_t* const argb_in = picture->argb;
+  uint32_t* const argb_out = enc->argb_;
+  const int kNumPasses = 2;
+
+  memcpy(enc->palette_, kDeltaPalette, sizeof(kDeltaPalette));
+  // The last palette slot is replaced by the first picture pixel minus
+  // 0xff000000 (i.e. with 0xff removed from its alpha byte, modulo 256).
+  enc->palette_[DELTA_PALETTE_SIZE - 1] = argb_in[0] - 0xff000000u;
+  enc->palette_size_ = DELTA_PALETTE_SIZE;
+
+  return ApplyDeltaPalette(argb_in, argb_out,
+                           picture->argb_stride, enc->current_width_,
+                           enc->palette_, enc->palette_size_,
+                           picture->width, picture->height, kNumPasses);
+}
+
+#else  // !WEBP_EXPERIMENTAL_FEATURES
+// Stub used when WEBP_EXPERIMENTAL_FEATURES is not defined: always fails.
+WebPEncodingError WebPSearchOptimalDeltaPalette(VP8LEncoder* const enc) {
+  (void)enc;
+  return VP8_ENC_ERROR_INVALID_CONFIGURATION;
+}
+
+#endif  // WEBP_EXPERIMENTAL_FEATURES
diff --git a/src/3rdparty/libwebp/src/enc/delta_palettization.h b/src/3rdparty/libwebp/src/enc/delta_palettization.h
new file mode 100644
index 0000000..e41c0c5
--- /dev/null
+++ b/src/3rdparty/libwebp/src/enc/delta_palettization.h
@@ -0,0 +1,25 @@
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Author: Mislav Bradac (mislavm@google.com)
+//
+
+#ifndef WEBP_ENC_DELTA_PALETTIZATION_H_
+#define WEBP_ENC_DELTA_PALETTIZATION_H_
+
+#include "../webp/encode.h"
+#include "../enc/vp8li.h"
+
+// Replaces enc->argb_[] input by a palettizable approximation of it,
+// and generates optimal enc->palette_[].
+// NOTE: the current implementation does not modify enc->use_palette_ or
+// enc->use_predict_; it only reports failure through the returned error code.
+WebPEncodingError WebPSearchOptimalDeltaPalette(VP8LEncoder* const enc);
+
+#endif  // WEBP_ENC_DELTA_PALETTIZATION_H_
diff --git a/src/3rdparty/libwebp/src/enc/filter.c b/src/3rdparty/libwebp/src/enc/filter.c
index 11db4bd..41813cf 100644
--- a/src/3rdparty/libwebp/src/enc/filter.c
+++ b/src/3rdparty/libwebp/src/enc/filter.c
@@ -85,12 +85,12 @@ static void DoFilter(const VP8EncIterator* const it, int level) {
   const int ilevel = GetILevel(enc->config_->filter_sharpness, level);
   const int limit = 2 * level + ilevel;
 
-  uint8_t* const y_dst = it->yuv_out2_ + Y_OFF;
-  uint8_t* const u_dst = it->yuv_out2_ + U_OFF;
-  uint8_t* const v_dst = it->yuv_out2_ + V_OFF;
+  uint8_t* const y_dst = it->yuv_out2_ + Y_OFF_ENC;
+  uint8_t* const u_dst = it->yuv_out2_ + U_OFF_ENC;
+  uint8_t* const v_dst = it->yuv_out2_ + V_OFF_ENC;
 
   // copy current block to yuv_out2_
-  memcpy(y_dst, it->yuv_out_, YUV_SIZE * sizeof(uint8_t));
+  memcpy(y_dst, it->yuv_out_, YUV_SIZE_ENC * sizeof(uint8_t));
 
   if (enc->filter_hdr_.simple_ == 1) {   // simple
     VP8SimpleHFilter16i(y_dst, BPS, limit);
@@ -195,13 +195,16 @@ static double GetMBSSIM(const uint8_t* yuv1, const uint8_t* yuv2) {
   // compute SSIM in a 10 x 10 window
   for (x = 3; x < 13; x++) {
     for (y = 3; y < 13; y++) {
-      VP8SSIMAccumulate(yuv1 + Y_OFF, BPS, yuv2 + Y_OFF, BPS, x, y, 16, 16, &s);
+      VP8SSIMAccumulate(yuv1 + Y_OFF_ENC, BPS, yuv2 + Y_OFF_ENC, BPS,
+                        x, y, 16, 16, &s);
     }
   }
   for (x = 1; x < 7; x++) {
     for (y = 1; y < 7; y++) {
-      VP8SSIMAccumulate(yuv1 + U_OFF, BPS, yuv2 + U_OFF, BPS, x, y, 8, 8, &s);
-      VP8SSIMAccumulate(yuv1 + V_OFF, BPS, yuv2 + V_OFF, BPS, x, y, 8, 8, &s);
+      VP8SSIMAccumulate(yuv1 + U_OFF_ENC, BPS, yuv2 + U_OFF_ENC, BPS,
+                        x, y, 8, 8, &s);
+      VP8SSIMAccumulate(yuv1 + V_OFF_ENC, BPS, yuv2 + V_OFF_ENC, BPS,
+                        x, y, 8, 8, &s);
     }
   }
   return VP8SSIMGet(&s);
@@ -226,7 +229,7 @@ void VP8StoreFilterStats(VP8EncIterator* const it) {
   int d;
   VP8Encoder* const enc = it->enc_;
   const int s = it->mb_->segment_;
-  const int level0 = enc->dqm_[s].fstrength_;  // TODO: ref_lf_delta[]
+  const int level0 = enc->dqm_[s].fstrength_;
 
   // explore +/-quant range of values around level0
   const int delta_min = -enc->dqm_[s].quant_;
diff --git a/src/3rdparty/libwebp/src/enc/frame.c b/src/3rdparty/libwebp/src/enc/frame.c
index cdf1dab..5b7a40b 100644
--- a/src/3rdparty/libwebp/src/enc/frame.c
+++ b/src/3rdparty/libwebp/src/enc/frame.c
@@ -14,8 +14,9 @@
 #include <string.h>
 #include <math.h>
 
-#include "./vp8enci.h"
 #include "./cost.h"
+#include "./vp8enci.h"
+#include "../dsp/dsp.h"
 #include "../webp/format_constants.h"  // RIFF constants
 
 #define SEGMENT_VISU 0
@@ -81,11 +82,6 @@ static float ComputeNextQ(PassStats* const s) {
 //------------------------------------------------------------------------------
 // Tables for level coding
 
-const uint8_t VP8EncBands[16 + 1] = {
-  0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7,
-  0  // sentinel
-};
-
 const uint8_t VP8Cat3[] = { 173, 148, 140 };
 const uint8_t VP8Cat4[] = { 176, 155, 140, 135 };
 const uint8_t VP8Cat5[] = { 180, 157, 141, 134, 130 };
@@ -96,7 +92,7 @@ const uint8_t VP8Cat6[] =
 // Reset the statistics about: number of skips, token proba, level cost,...
 
 static void ResetStats(VP8Encoder* const enc) {
-  VP8Proba* const proba = &enc->proba_;
+  VP8EncProba* const proba = &enc->proba_;
   VP8CalculateLevelCosts(proba);
   proba->nb_skip_ = 0;
 }
@@ -112,7 +108,7 @@ static int CalcSkipProba(uint64_t nb, uint64_t total) {
 
 // Returns the bit-cost for coding the skip probability.
 static int FinalizeSkipProba(VP8Encoder* const enc) {
-  VP8Proba* const proba = &enc->proba_;
+  VP8EncProba* const proba = &enc->proba_;
   const int nb_mbs = enc->mb_w_ * enc->mb_h_;
   const int nb_events = proba->nb_skip_;
   int size;
@@ -140,11 +136,11 @@ static int BranchCost(int nb, int total, int proba) {
 }
 
 static void ResetTokenStats(VP8Encoder* const enc) {
-  VP8Proba* const proba = &enc->proba_;
+  VP8EncProba* const proba = &enc->proba_;
   memset(proba->stats_, 0, sizeof(proba->stats_));
 }
 
-static int FinalizeTokenProbas(VP8Proba* const proba) {
+static int FinalizeTokenProbas(VP8EncProba* const proba) {
   int has_changed = 0;
   int size = 0;
   int t, b, c, p;
@@ -476,9 +472,9 @@ static void StoreSSE(const VP8EncIterator* const it) {
   const uint8_t* const in = it->yuv_in_;
   const uint8_t* const out = it->yuv_out_;
   // Note: not totally accurate at boundary. And doesn't include in-loop filter.
-  enc->sse_[0] += VP8SSE16x16(in + Y_OFF, out + Y_OFF);
-  enc->sse_[1] += VP8SSE8x8(in + U_OFF, out + U_OFF);
-  enc->sse_[2] += VP8SSE8x8(in + V_OFF, out + V_OFF);
+  enc->sse_[0] += VP8SSE16x16(in + Y_OFF_ENC, out + Y_OFF_ENC);
+  enc->sse_[1] += VP8SSE8x8(in + U_OFF_ENC, out + U_OFF_ENC);
+  enc->sse_[2] += VP8SSE8x8(in + V_OFF_ENC, out + V_OFF_ENC);
   enc->sse_count_ += 16 * 16;
 }
 
@@ -511,9 +507,9 @@ static void StoreSideInfo(const VP8EncIterator* const it) {
     }
   }
 #if SEGMENT_VISU  // visualize segments and prediction modes
-  SetBlock(it->yuv_out_ + Y_OFF, mb->segment_ * 64, 16);
-  SetBlock(it->yuv_out_ + U_OFF, it->preds_[0] * 64, 8);
-  SetBlock(it->yuv_out_ + V_OFF, mb->uv_mode_ * 64, 8);
+  SetBlock(it->yuv_out_ + Y_OFF_ENC, mb->segment_ * 64, 16);
+  SetBlock(it->yuv_out_ + U_OFF_ENC, it->preds_[0] * 64, 8);
+  SetBlock(it->yuv_out_ + V_OFF_ENC, mb->uv_mode_ * 64, 8);
 #endif
 }
 
@@ -743,7 +739,7 @@ int VP8EncTokenLoop(VP8Encoder* const enc) {
   int num_pass_left = enc->config_->pass;
   const int do_search = enc->do_search_;
   VP8EncIterator it;
-  VP8Proba* const proba = &enc->proba_;
+  VP8EncProba* const proba = &enc->proba_;
   const VP8RDLevel rd_opt = enc->rd_opt_level_;
   const uint64_t pixel_count = enc->mb_w_ * enc->mb_h_ * 384;
   PassStats stats;
diff --git a/src/3rdparty/libwebp/src/enc/histogram.c b/src/3rdparty/libwebp/src/enc/histogram.c
index a2266b4..869882d 100644
--- a/src/3rdparty/libwebp/src/enc/histogram.c
+++ b/src/3rdparty/libwebp/src/enc/histogram.c
@@ -20,9 +20,6 @@
 #include "../dsp/lossless.h"
 #include "../utils/utils.h"
 
-#define ALIGN_CST 15
-#define DO_ALIGN(PTR) ((uintptr_t)((PTR) + ALIGN_CST) & ~ALIGN_CST)
-
 #define MAX_COST 1.e38
 
 // Number of partitions for the three dominant (literal, red and blue) symbol
@@ -30,6 +27,8 @@
 #define NUM_PARTITIONS 4
 // The size of the bin-hash corresponding to the three dominant costs.
 #define BIN_SIZE (NUM_PARTITIONS * NUM_PARTITIONS * NUM_PARTITIONS)
+// Maximum number of histograms allowed in greedy combining algorithm.
+#define MAX_HISTO_GREEDY 100
 
 static void HistogramClear(VP8LHistogram* const p) {
   uint32_t* const literal = p->literal_;
@@ -40,6 +39,13 @@ static void HistogramClear(VP8LHistogram* const p) {
   p->literal_ = literal;
 }
 
+// Exchanges the pointers *A and *B; the histograms themselves are not copied.
+static void HistogramSwap(VP8LHistogram** const A, VP8LHistogram** const B) {
+  VP8LHistogram* const tmp = *A;
+  *A = *B;
+  *B = tmp;
+}
+
 static void HistogramCopy(const VP8LHistogram* const src,
                           VP8LHistogram* const dst) {
   uint32_t* const dst_literal = dst->literal_;
@@ -106,7 +112,8 @@ VP8LHistogramSet* VP8LAllocateHistogramSet(int size, int cache_bits) {
   VP8LHistogramSet* set;
   const int histo_size = VP8LGetHistogramSize(cache_bits);
   const size_t total_size =
-      sizeof(*set) + size * (sizeof(*set->histograms) + histo_size + ALIGN_CST);
+      sizeof(*set) + size * (sizeof(*set->histograms) +
+      histo_size + WEBP_ALIGN_CST);
   uint8_t* memory = (uint8_t*)WebPSafeMalloc(total_size, sizeof(*memory));
   if (memory == NULL) return NULL;
 
@@ -117,7 +124,7 @@ VP8LHistogramSet* VP8LAllocateHistogramSet(int size, int cache_bits) {
   set->max_size = size;
   set->size = size;
   for (i = 0; i < size; ++i) {
-    memory = (uint8_t*)DO_ALIGN(memory);
+    memory = (uint8_t*)WEBP_ALIGN(memory);
     set->histograms[i] = (VP8LHistogram*)memory;
     // literal_ won't necessary be aligned.
     set->histograms[i]->literal_ = (uint32_t*)(memory + sizeof(VP8LHistogram));
@@ -149,24 +156,26 @@ void VP8LHistogramAddSinglePixOrCopy(VP8LHistogram* const histo,
   }
 }
 
-static WEBP_INLINE double BitsEntropyRefine(int nonzeros, int sum, int max_val,
-                                            double retval) {
+// -----------------------------------------------------------------------------
+// Entropy-related functions.
+
+static WEBP_INLINE double BitsEntropyRefine(const VP8LBitEntropy* entropy) {
   double mix;
-  if (nonzeros < 5) {
-    if (nonzeros <= 1) {
+  if (entropy->nonzeros < 5) {
+    if (entropy->nonzeros <= 1) {
       return 0;
     }
     // Two symbols, they will be 0 and 1 in a Huffman code.
     // Let's mix in a bit of entropy to favor good clustering when
     // distributions of these are combined.
-    if (nonzeros == 2) {
-      return 0.99 * sum + 0.01 * retval;
+    if (entropy->nonzeros == 2) {
+      return 0.99 * entropy->sum + 0.01 * entropy->entropy;
     }
     // No matter what the entropy says, we cannot be better than min_limit
     // with Huffman coding. I am mixing a bit of entropy into the
     // min_limit since it produces much better (~0.5 %) compression results
     // perhaps because of better entropy clustering.
-    if (nonzeros == 3) {
+    if (entropy->nonzeros == 3) {
       mix = 0.95;
     } else {
       mix = 0.7;  // nonzeros == 4.
@@ -176,52 +185,22 @@ static WEBP_INLINE double BitsEntropyRefine(int nonzeros, int sum, int max_val,
   }
 
   {
-    double min_limit = 2 * sum - max_val;
-    min_limit = mix * min_limit + (1.0 - mix) * retval;
-    return (retval < min_limit) ? min_limit : retval;
+    double min_limit = 2 * entropy->sum - entropy->max_val;
+    min_limit = mix * min_limit + (1.0 - mix) * entropy->entropy;
+    return (entropy->entropy < min_limit) ? min_limit : entropy->entropy;
   }
 }
 
-static double BitsEntropy(const uint32_t* const array, int n) {
-  double retval = 0.;
-  uint32_t sum = 0;
-  int nonzeros = 0;
-  uint32_t max_val = 0;
-  int i;
-  for (i = 0; i < n; ++i) {
-    if (array[i] != 0) {
-      sum += array[i];
-      ++nonzeros;
-      retval -= VP8LFastSLog2(array[i]);
-      if (max_val < array[i]) {
-        max_val = array[i];
-      }
-    }
+double VP8LBitsEntropy(const uint32_t* const array, int n,
+                       uint32_t* const trivial_symbol) {
+  VP8LBitEntropy entropy;
+  VP8LBitsEntropyUnrefined(array, n, &entropy);
+  if (trivial_symbol != NULL) {
+    *trivial_symbol =
+        (entropy.nonzeros == 1) ? entropy.nonzero_code : VP8L_NON_TRIVIAL_SYM;
   }
-  retval += VP8LFastSLog2(sum);
-  return BitsEntropyRefine(nonzeros, sum, max_val, retval);
-}
 
-static double BitsEntropyCombined(const uint32_t* const X,
-                                  const uint32_t* const Y, int n) {
-  double retval = 0.;
-  int sum = 0;
-  int nonzeros = 0;
-  int max_val = 0;
-  int i;
-  for (i = 0; i < n; ++i) {
-    const int xy = X[i] + Y[i];
-    if (xy != 0) {
-      sum += xy;
-      ++nonzeros;
-      retval -= VP8LFastSLog2(xy);
-      if (max_val < xy) {
-        max_val = xy;
-      }
-    }
-  }
-  retval += VP8LFastSLog2(sum);
-  return BitsEntropyRefine(nonzeros, sum, max_val, retval);
+  return BitsEntropyRefine(&entropy);
 }
 
 static double InitialHuffmanCost(void) {
@@ -242,47 +221,40 @@ static double FinalHuffmanCost(const VP8LStreaks* const stats) {
   return retval;
 }
 
-// Trampolines
-static double HuffmanCost(const uint32_t* const population, int length) {
-  const VP8LStreaks stats = VP8LHuffmanCostCount(population, length);
-  return FinalHuffmanCost(&stats);
-}
+// Get the symbol entropy for the distribution 'population'.
+// Set 'trivial_sym', if there's only one symbol present in the distribution.
+static double PopulationCost(const uint32_t* const population, int length,
+                             uint32_t* const trivial_sym) {
+  VP8LBitEntropy bit_entropy;
+  VP8LStreaks stats;
+  VP8LGetEntropyUnrefined(population, length, &bit_entropy, &stats);
+  if (trivial_sym != NULL) {
+    *trivial_sym = (bit_entropy.nonzeros == 1) ? bit_entropy.nonzero_code
+                                               : VP8L_NON_TRIVIAL_SYM;
+  }
 
-static double HuffmanCostCombined(const uint32_t* const X,
-                                  const uint32_t* const Y, int length) {
-  const VP8LStreaks stats = VP8LHuffmanCostCombinedCount(X, Y, length);
-  return FinalHuffmanCost(&stats);
+  return BitsEntropyRefine(&bit_entropy) + FinalHuffmanCost(&stats);
 }
 
-// Aggregated costs
-static double PopulationCost(const uint32_t* const population, int length) {
-  return BitsEntropy(population, length) + HuffmanCost(population, length);
-}
+static WEBP_INLINE double GetCombinedEntropy(const uint32_t* const X,
+                                             const uint32_t* const Y,
+                                             int length) {
+  VP8LBitEntropy bit_entropy;
+  VP8LStreaks stats;
+  VP8LGetCombinedEntropyUnrefined(X, Y, length, &bit_entropy, &stats);
 
-static double GetCombinedEntropy(const uint32_t* const X,
-                                 const uint32_t* const Y, int length) {
-  return BitsEntropyCombined(X, Y, length) + HuffmanCostCombined(X, Y, length);
+  return BitsEntropyRefine(&bit_entropy) + FinalHuffmanCost(&stats);
 }
 
 // Estimates the Entropy + Huffman + other block overhead size cost.
 double VP8LHistogramEstimateBits(const VP8LHistogram* const p) {
   return
-      PopulationCost(p->literal_, VP8LHistogramNumCodes(p->palette_code_bits_))
-      + PopulationCost(p->red_, NUM_LITERAL_CODES)
-      + PopulationCost(p->blue_, NUM_LITERAL_CODES)
-      + PopulationCost(p->alpha_, NUM_LITERAL_CODES)
-      + PopulationCost(p->distance_, NUM_DISTANCE_CODES)
-      + VP8LExtraCost(p->literal_ + NUM_LITERAL_CODES, NUM_LENGTH_CODES)
-      + VP8LExtraCost(p->distance_, NUM_DISTANCE_CODES);
-}
-
-double VP8LHistogramEstimateBitsBulk(const VP8LHistogram* const p) {
-  return
-      BitsEntropy(p->literal_, VP8LHistogramNumCodes(p->palette_code_bits_))
-      + BitsEntropy(p->red_, NUM_LITERAL_CODES)
-      + BitsEntropy(p->blue_, NUM_LITERAL_CODES)
-      + BitsEntropy(p->alpha_, NUM_LITERAL_CODES)
-      + BitsEntropy(p->distance_, NUM_DISTANCE_CODES)
+      PopulationCost(
+          p->literal_, VP8LHistogramNumCodes(p->palette_code_bits_), NULL)
+      + PopulationCost(p->red_, NUM_LITERAL_CODES, NULL)
+      + PopulationCost(p->blue_, NUM_LITERAL_CODES, NULL)
+      + PopulationCost(p->alpha_, NUM_LITERAL_CODES, NULL)
+      + PopulationCost(p->distance_, NUM_DISTANCE_CODES, NULL)
       + VP8LExtraCost(p->literal_ + NUM_LITERAL_CODES, NUM_LENGTH_CODES)
       + VP8LExtraCost(p->distance_, NUM_DISTANCE_CODES);
 }
@@ -313,8 +285,8 @@ static int GetCombinedHistogramEntropy(const VP8LHistogram* const a,
   if (*cost > cost_threshold) return 0;
 
   *cost += GetCombinedEntropy(a->distance_, b->distance_, NUM_DISTANCE_CODES);
-  *cost += VP8LExtraCostCombined(a->distance_, b->distance_,
-                                 NUM_DISTANCE_CODES);
+  *cost +=
+      VP8LExtraCostCombined(a->distance_, b->distance_, NUM_DISTANCE_CODES);
   if (*cost > cost_threshold) return 0;
 
   return 1;
@@ -338,6 +310,8 @@ static double HistogramAddEval(const VP8LHistogram* const a,
     VP8LHistogramAdd(a, b, out);
     out->bit_cost_ = cost;
     out->palette_code_bits_ = a->palette_code_bits_;
+    out->trivial_symbol_ = (a->trivial_symbol_ == b->trivial_symbol_) ?
+        a->trivial_symbol_ : VP8L_NON_TRIVIAL_SYM;
   }
 
   return cost - sum_cost;
@@ -389,18 +363,26 @@ static void UpdateDominantCostRange(
 }
 
 static void UpdateHistogramCost(VP8LHistogram* const h) {
-  const double alpha_cost = PopulationCost(h->alpha_, NUM_LITERAL_CODES);
+  uint32_t alpha_sym, red_sym, blue_sym;
+  const double alpha_cost =
+      PopulationCost(h->alpha_, NUM_LITERAL_CODES, &alpha_sym);
   const double distance_cost =
-      PopulationCost(h->distance_, NUM_DISTANCE_CODES) +
+      PopulationCost(h->distance_, NUM_DISTANCE_CODES, NULL) +
       VP8LExtraCost(h->distance_, NUM_DISTANCE_CODES);
   const int num_codes = VP8LHistogramNumCodes(h->palette_code_bits_);
-  h->literal_cost_ = PopulationCost(h->literal_, num_codes) +
+  h->literal_cost_ = PopulationCost(h->literal_, num_codes, NULL) +
                      VP8LExtraCost(h->literal_ + NUM_LITERAL_CODES,
                                    NUM_LENGTH_CODES);
-  h->red_cost_ = PopulationCost(h->red_, NUM_LITERAL_CODES);
-  h->blue_cost_ = PopulationCost(h->blue_, NUM_LITERAL_CODES);
+  h->red_cost_ = PopulationCost(h->red_, NUM_LITERAL_CODES, &red_sym);
+  h->blue_cost_ = PopulationCost(h->blue_, NUM_LITERAL_CODES, &blue_sym);
   h->bit_cost_ = h->literal_cost_ + h->red_cost_ + h->blue_cost_ +
                  alpha_cost + distance_cost;
+  if ((alpha_sym | red_sym | blue_sym) == VP8L_NON_TRIVIAL_SYM) {
+    h->trivial_symbol_ = VP8L_NON_TRIVIAL_SYM;
+  } else {
+    h->trivial_symbol_ =
+        ((uint32_t)alpha_sym << 24) | (red_sym << 16) | (blue_sym << 0);
+  }
 }
 
 static int GetBinIdForEntropy(double min, double max, double val) {
@@ -409,7 +391,14 @@ static int GetBinIdForEntropy(double min, double max, double val) {
   return (int)(NUM_PARTITIONS * delta / range);
 }
 
-// TODO(vikasa): Evaluate, if there's any correlation between red & blue.
+static int GetHistoBinIndexLowEffort(
+    const VP8LHistogram* const h, const DominantCostRange* const c) {
+  const int bin_id = GetBinIdForEntropy(c->literal_min_, c->literal_max_,
+                                        h->literal_cost_);
+  assert(bin_id < NUM_PARTITIONS);  // only the literal cost selects the bin
+  return bin_id;
+}
+
 static int GetHistoBinIndex(
     const VP8LHistogram* const h, const DominantCostRange* const c) {
   const int bin_id =
@@ -432,7 +421,6 @@ static void HistogramBuild(
   VP8LHistogram** const histograms = image_histo->histograms;
   VP8LRefsCursor c = VP8LRefsCursorInit(backward_refs);
   assert(histo_bits > 0);
-  // Construct the Histo from a given backward references.
   while (VP8LRefsCursorOk(&c)) {
     const PixOrCopy* const v = c.cur_pos;
     const int ix = (y >> histo_bits) * histo_xsize + (x >> histo_bits);
@@ -463,8 +451,8 @@ static void HistogramCopyAndAnalyze(
 
 // Partition histograms to different entropy bins for three dominant (literal,
 // red and blue) symbol costs and compute the histogram aggregate bit_cost.
-static void HistogramAnalyzeEntropyBin(
-    VP8LHistogramSet* const image_histo, int16_t* const bin_map) {
+static void HistogramAnalyzeEntropyBin(VP8LHistogramSet* const image_histo,
+                                       int16_t* const bin_map, int low_effort) {
   int i;
   VP8LHistogram** const histograms = image_histo->histograms;
   const int histo_size = image_histo->size;
@@ -483,7 +471,9 @@ static void HistogramAnalyzeEntropyBin(
   for (i = 0; i < histo_size; ++i) {
     int num_histos;
     VP8LHistogram* const histo = histograms[i];
-    const int16_t bin_id = (int16_t)GetHistoBinIndex(histo, &cost_range);
+    const int16_t bin_id = low_effort ?
+        (int16_t)GetHistoBinIndexLowEffort(histo, &cost_range) :
+        (int16_t)GetHistoBinIndex(histo, &cost_range);
     const int bin_offset = bin_id * bin_depth;
     // bin_map[n][0] for every bin 'n' maintains the counter for the number of
     // histograms in that bin.
@@ -495,64 +485,79 @@ static void HistogramAnalyzeEntropyBin(
   }
 }
 
-// Compact the histogram set by moving the valid one left in the set to the
-// head and moving the ones that have been merged to other histograms towards
-// the end.
-// TODO(vikasa): Evaluate if this method can be avoided by altering the code
-// logic of HistogramCombineEntropyBin main loop.
+// Compact the histogram set by removing unused entries.
 static void HistogramCompactBins(VP8LHistogramSet* const image_histo) {
-  int start = 0;
-  int end = image_histo->size - 1;
   VP8LHistogram** const histograms = image_histo->histograms;
-  while (start < end) {
-    while (start <= end && histograms[start] != NULL &&
-           histograms[start]->bit_cost_ != 0.) {
-      ++start;
-    }
-    while (start <= end && histograms[end]->bit_cost_ == 0.) {
-      histograms[end] = NULL;
-      --end;
-    }
-    if (start < end) {
-      assert(histograms[start] != NULL);
-      assert(histograms[end] != NULL);
-      HistogramCopy(histograms[end], histograms[start]);
-      histograms[end] = NULL;
-      --end;
+  int i, j;
+
+  for (i = 0, j = 0; i < image_histo->size; ++i) {
+    if (histograms[i] != NULL && histograms[i]->bit_cost_ != 0.) {
+      if (j < i) {
+        histograms[j] = histograms[i];
+        histograms[i] = NULL;
+      }
+      ++j;
     }
   }
-  image_histo->size = end + 1;
+  image_histo->size = j;
 }
 
-static void HistogramCombineEntropyBin(VP8LHistogramSet* const image_histo,
-                                       VP8LHistogram* const histos,
-                                       int16_t* const bin_map, int bin_depth,
-                                       double combine_cost_factor) {
+static VP8LHistogram* HistogramCombineEntropyBin(
+    VP8LHistogramSet* const image_histo,
+    VP8LHistogram* cur_combo,
+    int16_t* const bin_map, int bin_depth, int num_bins,
+    double combine_cost_factor, int low_effort) {
   int bin_id;
-  VP8LHistogram* cur_combo = histos;
   VP8LHistogram** const histograms = image_histo->histograms;
 
-  for (bin_id = 0; bin_id < BIN_SIZE; ++bin_id) {
+  for (bin_id = 0; bin_id < num_bins; ++bin_id) {
     const int bin_offset = bin_id * bin_depth;
     const int num_histos = bin_map[bin_offset];
     const int idx1 = bin_map[bin_offset + 1];
+    int num_combine_failures = 0;
     int n;
     for (n = 2; n <= num_histos; ++n) {
       const int idx2 = bin_map[bin_offset + n];
-      const double bit_cost_idx2 = histograms[idx2]->bit_cost_;
-      if (bit_cost_idx2 > 0.) {
-        const double bit_cost_thresh = -bit_cost_idx2 * combine_cost_factor;
-        const double curr_cost_diff =
-            HistogramAddEval(histograms[idx1], histograms[idx2],
-                             cur_combo, bit_cost_thresh);
-        if (curr_cost_diff < bit_cost_thresh) {
-          HistogramCopy(cur_combo, histograms[idx1]);
-          histograms[idx2]->bit_cost_ = 0.;
+      if (low_effort) {
+        // Merge all histograms with the same bin index, irrespective of cost of
+        // the merged histograms.
+        VP8LHistogramAdd(histograms[idx1], histograms[idx2], histograms[idx1]);
+        histograms[idx2]->bit_cost_ = 0.;
+      } else {
+        const double bit_cost_idx2 = histograms[idx2]->bit_cost_;
+        if (bit_cost_idx2 > 0.) {
+          const double bit_cost_thresh = -bit_cost_idx2 * combine_cost_factor;
+          const double curr_cost_diff =
+              HistogramAddEval(histograms[idx1], histograms[idx2],
+                               cur_combo, bit_cost_thresh);
+          if (curr_cost_diff < bit_cost_thresh) {
+            // Try to merge two histograms only if the combo is a trivial one or
+            // the two candidate histograms are already non-trivial.
+            // For some images, 'try_combine' turns out to be false for a lot of
+            // histogram pairs. In that case, we fallback to combining
+            // histograms as usual to avoid increasing the header size.
+            const int try_combine =
+                (cur_combo->trivial_symbol_ != VP8L_NON_TRIVIAL_SYM) ||
+                ((histograms[idx1]->trivial_symbol_ == VP8L_NON_TRIVIAL_SYM) &&
+                 (histograms[idx2]->trivial_symbol_ == VP8L_NON_TRIVIAL_SYM));
+            const int max_combine_failures = 32;
+            if (try_combine || (num_combine_failures >= max_combine_failures)) {
+              HistogramSwap(&cur_combo, &histograms[idx1]);
+              histograms[idx2]->bit_cost_ = 0.;
+            } else {
+              ++num_combine_failures;
+            }
+          }
         }
       }
     }
+    if (low_effort) {
+      // Update the bit_cost for the merged histograms (per bin index).
+      UpdateHistogramCost(histograms[idx1]);
+    }
   }
   HistogramCompactBins(image_histo);
+  return cur_combo;
 }
 
 static uint32_t MyRand(uint32_t *seed) {
@@ -563,8 +568,179 @@ static uint32_t MyRand(uint32_t *seed) {
   return *seed;
 }
 
-static void HistogramCombine(VP8LHistogramSet* const image_histo,
-                             VP8LHistogramSet* const histos, int quality) {
+// -----------------------------------------------------------------------------
+// Histogram pairs priority queue
+
+// Pair of histograms. Negative idx1 value means that pair is out-of-date.
+typedef struct {
+  int idx1;           // PreparePair() stores the smaller index here
+  int idx2;           // ... and the larger one here
+  double cost_diff;   // value returned by HistogramAddEval() for the pair
+  double cost_combo;  // bit_cost_ of the scratch histogram after that call
+} HistogramPair;
+
+typedef struct {
+  HistogramPair* queue;  // max_size + 1 entries, allocated by HistoQueueInit()
+  int size;              // number of pairs currently kept in 'queue'
+  int max_size;          // capacity, excluding the extra scratch slot
+} HistoQueue;
+
+static int HistoQueueInit(HistoQueue* const histo_queue, const int max_index) {
+  histo_queue->size = 0;
+  // Returns 1 on success, 0 on allocation failure (queue is NULL then).
+  // NOTE(review): max_index^2 is an int product; assumes max_index is small.
+  // max_index^2 entries suffice: if UpdateQueueFront always pushed data,
+  // HistogramCombineGreedy would insert at most max_index*(max_index-1)/2
+  // pairs in its first two for loops, then max_index - 1 in the last for
+  // loop at the first iteration of the while loop, max_index - 2 at the
+  // second ... i.e. max_index*(max_index-1)/2 overall too.
+  histo_queue->max_size = max_index * max_index;
+  // We allocate max_size + 1 because the last element at index "size" is
+  // used as temporary data (and it could be up to max_size).
+  histo_queue->queue = WebPSafeMalloc(histo_queue->max_size + 1,
+                                      sizeof(*histo_queue->queue));
+  return histo_queue->queue != NULL;
+}
+
+static void HistoQueueClear(HistoQueue* const histo_queue) {
+  assert(histo_queue != NULL);
+  WebPSafeFree(histo_queue->queue);  // 'queue' and 'size' are not reset
+}
+
+static void SwapHistogramPairs(HistogramPair *p1,
+                               HistogramPair *p2) {
+  const HistogramPair tmp = *p1;  // whole structs are exchanged by value
+  *p1 = *p2;
+  *p2 = tmp;
+}
+
+// Examines the candidate pair the caller stored at queue[size]. It is dropped
+// when its cost_diff is >= 0; otherwise it is kept (size is incremented) and
+// first swapped with queue[0] if its cost_diff is lower than the front's.
+static void UpdateQueueFront(HistoQueue* const histo_queue) {
+  if (histo_queue->queue[histo_queue->size].cost_diff >= 0) return;
+
+  if (histo_queue->queue[histo_queue->size].cost_diff <
+      histo_queue->queue[0].cost_diff) {
+    SwapHistogramPairs(histo_queue->queue,
+                       histo_queue->queue + histo_queue->size);
+  }
+  ++histo_queue->size;
+
+  // We cannot add more elements than the capacity.
+  // The allocation adds an extra element to the official capacity so that
+  // histo_queue->queue[histo_queue->max_size] is read/written within bound.
+  assert(histo_queue->size <= histo_queue->max_size);
+}
+
+// -----------------------------------------------------------------------------
+
+static void PreparePair(VP8LHistogram** histograms, int idx1, int idx2,
+                        HistogramPair* const pair,
+                        VP8LHistogram* const histos) {
+  if (idx1 > idx2) {  // order the indices so that idx1 <= idx2
+    const int tmp = idx2;
+    idx2 = idx1;
+    idx1 = tmp;
+  }
+  pair->idx1 = idx1;
+  pair->idx2 = idx2;
+  pair->cost_diff =
+      HistogramAddEval(histograms[idx1], histograms[idx2], histos, 0);
+  pair->cost_combo = histos->bit_cost_;  // 'histos' is the eval's output arg
+}
+
+// Combines histograms by repeatedly merging the queued pair with the highest
+// cost reduction. Returns 1 on success, 0 if a memory allocation failed.
+static int HistogramCombineGreedy(VP8LHistogramSet* const image_histo,
+                                  VP8LHistogram* const histos) {
+  int ok = 0;  // becomes 1 only once the merging has completed
+  int image_histo_size = image_histo->size;
+  int i, j;
+  VP8LHistogram** const histograms = image_histo->histograms;
+  // Indexes of remaining histograms.
+  int* const clusters = WebPSafeMalloc(image_histo_size, sizeof(*clusters));
+  // Priority queue of histogram pairs.
+  HistoQueue histo_queue;
+
+  if (!HistoQueueInit(&histo_queue, image_histo_size) || clusters == NULL) {
+    goto End;
+  }
+
+  for (i = 0; i < image_histo_size; ++i) {
+    // Initialize clusters indexes.
+    clusters[i] = i;
+    for (j = i + 1; j < image_histo_size; ++j) {
+      // Evaluate pair (i, j) in the scratch slot; keep it if cost_diff < 0.
+      PreparePair(histograms, i, j, &histo_queue.queue[histo_queue.size],
+                  histos);
+      UpdateQueueFront(&histo_queue);
+    }
+  }
+
+  while (image_histo_size > 1 && histo_queue.size > 0) {
+    HistogramPair* copy_to;
+    const int idx1 = histo_queue.queue[0].idx1;
+    const int idx2 = histo_queue.queue[0].idx2;
+    VP8LHistogramAdd(histograms[idx2], histograms[idx1], histograms[idx1]);
+    histograms[idx1]->bit_cost_ = histo_queue.queue[0].cost_combo;
+    // Remove merged histogram (idx2 was added into idx1).
+    for (i = 0; i + 1 < image_histo_size; ++i) {
+      if (clusters[i] >= idx2) {
+        clusters[i] = clusters[i + 1];
+      }
+    }
+    --image_histo_size;
+
+    // Remove pairs intersecting the just combined best pair. This will
+    // therefore pop the head of the queue.
+    copy_to = histo_queue.queue;
+    for (i = 0; i < histo_queue.size; ++i) {
+      HistogramPair* const p = histo_queue.queue + i;
+      if (p->idx1 == idx1 || p->idx2 == idx1 ||
+          p->idx1 == idx2 || p->idx2 == idx2) {
+        // Do not copy the invalid pair.
+        continue;
+      }
+      if (p->cost_diff < histo_queue.queue[0].cost_diff) {
+        // Replace the top of the queue if we found better.
+        SwapHistogramPairs(histo_queue.queue, p);
+      }
+      SwapHistogramPairs(copy_to, p);
+      ++copy_to;
+    }
+    histo_queue.size = (int)(copy_to - histo_queue.queue);
+
+    // Push new pairs formed with combined histogram to the queue.
+    for (i = 0; i < image_histo_size; ++i) {
+      if (clusters[i] != idx1) {
+        PreparePair(histograms, idx1, clusters[i],
+                    &histo_queue.queue[histo_queue.size], histos);
+        UpdateQueueFront(&histo_queue);
+      }
+    }
+  }
+  // Move remaining histograms to the beginning of the array.
+  for (i = 0; i < image_histo_size; ++i) {
+    if (i != clusters[i]) {  // swap the two histograms
+      HistogramSwap(&histograms[i], &histograms[clusters[i]]);
+    }
+  }
+
+  image_histo->size = image_histo_size;
+  ok = 1;
+
+ End:  // also reached on allocation failure, with NULL pointer(s)
+  WebPSafeFree(clusters);
+  HistoQueueClear(&histo_queue);
+  return ok;
+}
+
+static VP8LHistogram* HistogramCombineStochastic(
+    VP8LHistogramSet* const image_histo,
+    VP8LHistogram* tmp_histo,
+    VP8LHistogram* best_combo,
+    int quality, int min_cluster_size) {
   int iter;
   uint32_t seed = 0;
   int tries_with_no_success = 0;
@@ -573,12 +749,10 @@ static void HistogramCombine(VP8LHistogramSet* const image_histo,
   const int outer_iters = image_histo_size * iter_mult;
   const int num_pairs = image_histo_size / 2;
   const int num_tries_no_success = outer_iters / 2;
-  const int min_cluster_size = 2;
   VP8LHistogram** const histograms = image_histo->histograms;
-  VP8LHistogram* cur_combo = histos->histograms[0];   // trial histogram
-  VP8LHistogram* best_combo = histos->histograms[1];  // best histogram so far
 
   // Collapse similar histograms in 'image_histo'.
+  ++min_cluster_size;
   for (iter = 0;
        iter < outer_iters && image_histo_size >= min_cluster_size;
        ++iter) {
@@ -602,13 +776,9 @@ static void HistogramCombine(VP8LHistogramSet* const image_histo,
 
       // Calculate cost reduction on combining.
       curr_cost_diff = HistogramAddEval(histograms[idx1], histograms[idx2],
-                                        cur_combo, best_cost_diff);
+                                        tmp_histo, best_cost_diff);
       if (curr_cost_diff < best_cost_diff) {    // found a better pair?
-        {     // swap cur/best combo histograms
-          VP8LHistogram* const tmp_histo = cur_combo;
-          cur_combo = best_combo;
-          best_combo = tmp_histo;
-        }
+        HistogramSwap(&best_combo, &tmp_histo);
         best_cost_diff = curr_cost_diff;
         best_idx1 = idx1;
         best_idx2 = idx2;
@@ -616,11 +786,11 @@ static void HistogramCombine(VP8LHistogramSet* const image_histo,
     }
 
     if (best_idx1 >= 0) {
-      HistogramCopy(best_combo, histograms[best_idx1]);
+      HistogramSwap(&best_combo, &histograms[best_idx1]);
       // swap best_idx2 slot with last one (which is now unused)
       --image_histo_size;
       if (best_idx2 != image_histo_size) {
-        HistogramCopy(histograms[image_histo_size], histograms[best_idx2]);
+        HistogramSwap(&histograms[image_histo_size], &histograms[best_idx2]);
         histograms[image_histo_size] = NULL;
       }
       tries_with_no_success = 0;
@@ -630,6 +800,7 @@ static void HistogramCombine(VP8LHistogramSet* const image_histo,
     }
   }
   image_histo->size = image_histo_size;
+  return best_combo;
 }
 
 // -----------------------------------------------------------------------------
@@ -643,28 +814,37 @@ static void HistogramRemap(const VP8LHistogramSet* const orig_histo,
   int i;
   VP8LHistogram** const orig_histograms = orig_histo->histograms;
   VP8LHistogram** const histograms = image_histo->histograms;
-  for (i = 0; i < orig_histo->size; ++i) {
-    int best_out = 0;
-    double best_bits =
-        HistogramAddThresh(histograms[0], orig_histograms[i], MAX_COST);
-    int k;
-    for (k = 1; k < image_histo->size; ++k) {
-      const double cur_bits =
-          HistogramAddThresh(histograms[k], orig_histograms[i], best_bits);
-      if (cur_bits < best_bits) {
-        best_bits = cur_bits;
-        best_out = k;
+  const int orig_histo_size = orig_histo->size;
+  const int image_histo_size = image_histo->size;
+  if (image_histo_size > 1) {
+    for (i = 0; i < orig_histo_size; ++i) {
+      int best_out = 0;
+      double best_bits =
+          HistogramAddThresh(histograms[0], orig_histograms[i], MAX_COST);
+      int k;
+      for (k = 1; k < image_histo_size; ++k) {
+        const double cur_bits =
+            HistogramAddThresh(histograms[k], orig_histograms[i], best_bits);
+        if (cur_bits < best_bits) {
+          best_bits = cur_bits;
+          best_out = k;
+        }
       }
+      symbols[i] = best_out;
+    }
+  } else {
+    assert(image_histo_size == 1);
+    for (i = 0; i < orig_histo_size; ++i) {
+      symbols[i] = 0;
     }
-    symbols[i] = best_out;
   }
 
   // Recompute each out based on raw and symbols.
-  for (i = 0; i < image_histo->size; ++i) {
+  for (i = 0; i < image_histo_size; ++i) {
     HistogramClear(histograms[i]);
   }
 
-  for (i = 0; i < orig_histo->size; ++i) {
+  for (i = 0; i < orig_histo_size; ++i) {
     const int idx = symbols[i];
     VP8LHistogramAdd(orig_histograms[i], histograms[idx], histograms[idx]);
   }
@@ -672,44 +852,48 @@ static void HistogramRemap(const VP8LHistogramSet* const orig_histo,
 
 static double GetCombineCostFactor(int histo_size, int quality) {
   double combine_cost_factor = 0.16;
-  if (histo_size > 256) combine_cost_factor /= 2.;
-  if (histo_size > 512) combine_cost_factor /= 2.;
-  if (histo_size > 1024) combine_cost_factor /= 2.;
-  if (quality <= 50) combine_cost_factor /= 2.;
+  if (quality < 90) {
+    if (histo_size > 256) combine_cost_factor /= 2.;
+    if (histo_size > 512) combine_cost_factor /= 2.;
+    if (histo_size > 1024) combine_cost_factor /= 2.;
+    if (quality <= 50) combine_cost_factor /= 2.;
+  }
   return combine_cost_factor;
 }
 
 int VP8LGetHistoImageSymbols(int xsize, int ysize,
                              const VP8LBackwardRefs* const refs,
-                             int quality, int histo_bits, int cache_bits,
+                             int quality, int low_effort,
+                             int histo_bits, int cache_bits,
                              VP8LHistogramSet* const image_histo,
+                             VP8LHistogramSet* const tmp_histos,
                              uint16_t* const histogram_symbols) {
   int ok = 0;
   const int histo_xsize = histo_bits ? VP8LSubSampleSize(xsize, histo_bits) : 1;
   const int histo_ysize = histo_bits ? VP8LSubSampleSize(ysize, histo_bits) : 1;
   const int image_histo_raw_size = histo_xsize * histo_ysize;
+  const int entropy_combine_num_bins = low_effort ? NUM_PARTITIONS : BIN_SIZE;
 
   // The bin_map for every bin follows following semantics:
   // bin_map[n][0] = num_histo; // The number of histograms in that bin.
   // bin_map[n][1] = index of first histogram in that bin;
   // bin_map[n][num_histo] = index of last histogram in that bin;
-  // bin_map[n][num_histo + 1] ... bin_map[n][bin_depth - 1] = un-used indices.
+  // bin_map[n][num_histo + 1] ... bin_map[n][bin_depth - 1] = unused indices.
   const int bin_depth = image_histo_raw_size + 1;
   int16_t* bin_map = NULL;
-  VP8LHistogramSet* const histos = VP8LAllocateHistogramSet(2, cache_bits);
   VP8LHistogramSet* const orig_histo =
       VP8LAllocateHistogramSet(image_histo_raw_size, cache_bits);
+  VP8LHistogram* cur_combo;
+  const int entropy_combine = (orig_histo != NULL) && (quality < 100) &&
+      (orig_histo->size > entropy_combine_num_bins * 2);  // NULL checked below
 
-  if (orig_histo == NULL || histos == NULL) {
-    goto Error;
-  }
+  if (orig_histo == NULL) goto Error;
 
   // Don't attempt linear bin-partition heuristic for:
   // histograms of small sizes, as bin_map will be very sparse and;
-  // Higher qualities (> 90), to preserve the compression gains at those
-  // quality settings.
-  if (orig_histo->size > 2 * BIN_SIZE && quality < 90) {
-    const int bin_map_size = bin_depth * BIN_SIZE;
+  // Maximum quality (q==100), to preserve the compression gains at that level.
+  if (entropy_combine) {
+    const int bin_map_size = bin_depth * entropy_combine_num_bins;
     bin_map = (int16_t*)WebPSafeCalloc(bin_map_size, sizeof(*bin_map));
     if (bin_map == NULL) goto Error;
   }
@@ -719,18 +903,33 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
   // Copies the histograms and computes its bit_cost.
   HistogramCopyAndAnalyze(orig_histo, image_histo);
 
-  if (bin_map != NULL) {
+  cur_combo = tmp_histos->histograms[1];  // pick up working slot
+  if (entropy_combine) {
     const double combine_cost_factor =
         GetCombineCostFactor(image_histo_raw_size, quality);
-    HistogramAnalyzeEntropyBin(orig_histo, bin_map);
+    HistogramAnalyzeEntropyBin(orig_histo, bin_map, low_effort);
     // Collapse histograms with similar entropy.
-    HistogramCombineEntropyBin(image_histo, histos->histograms[0],
-                               bin_map, bin_depth, combine_cost_factor);
+    cur_combo = HistogramCombineEntropyBin(image_histo, cur_combo, bin_map,
+                                           bin_depth, entropy_combine_num_bins,
+                                           combine_cost_factor, low_effort);
   }
 
-  // Collapse similar histograms by random histogram-pair compares.
-  HistogramCombine(image_histo, histos, quality);
+  // Don't combine the histograms using stochastic and greedy heuristics for
+  // low-effort compression mode.
+  if (!low_effort || !entropy_combine) {
+    const float x = quality / 100.f;
+    // cubic ramp between 1 and MAX_HISTO_GREEDY:
+    const int threshold_size = (int)(1 + (x * x * x) * (MAX_HISTO_GREEDY - 1));
+    cur_combo = HistogramCombineStochastic(image_histo,
+                                           tmp_histos->histograms[0],
+                                           cur_combo, quality, threshold_size);
+    if ((image_histo->size <= threshold_size) &&
+        !HistogramCombineGreedy(image_histo, cur_combo)) {
+      goto Error;
+    }
+  }
 
+  // TODO(vikasa): Optimize HistogramRemap for low-effort compression mode also.
   // Find the optimal map from original histograms to the final ones.
   HistogramRemap(orig_histo, image_histo, histogram_symbols);
 
@@ -739,6 +938,5 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
  Error:
   WebPSafeFree(bin_map);
   VP8LFreeHistogramSet(orig_histo);
-  VP8LFreeHistogramSet(histos);
   return ok;
 }
diff --git a/src/3rdparty/libwebp/src/enc/histogram.h b/src/3rdparty/libwebp/src/enc/histogram.h
index 1cf4c54..d303d1d 100644
--- a/src/3rdparty/libwebp/src/enc/histogram.h
+++ b/src/3rdparty/libwebp/src/enc/histogram.h
@@ -14,10 +14,6 @@
 #ifndef WEBP_ENC_HISTOGRAM_H_
 #define WEBP_ENC_HISTOGRAM_H_
 
-#include <assert.h>
-#include <stddef.h>
-#include <stdlib.h>
-#include <stdio.h>
 #include <string.h>
 
 #include "./backward_references.h"
@@ -28,6 +24,9 @@
 extern "C" {
 #endif
 
+// Not a trivial literal symbol.
+#define VP8L_NON_TRIVIAL_SYM (0xffffffff)
+
 // A simple container for histograms of data.
 typedef struct {
   // literal_ contains green literal, palette-code and
@@ -39,9 +38,11 @@ typedef struct {
   // Backward reference prefix-code histogram.
   uint32_t distance_[NUM_DISTANCE_CODES];
   int palette_code_bits_;
-  double bit_cost_;      // cached value of VP8LHistogramEstimateBits(this)
-  double literal_cost_;  // Cached values of dominant entropy costs:
-  double red_cost_;      //   literal, red & blue.
+  uint32_t trivial_symbol_;  // True, if histograms for Red, Blue & Alpha
+                             // literal symbols are single valued.
+  double bit_cost_;          // cached value of bit cost.
+  double literal_cost_;      // Cached values of dominant entropy costs:
+  double red_cost_;          // literal, red & blue.
   double blue_cost_;
 } VP8LHistogram;
 
@@ -91,14 +92,6 @@ VP8LHistogram* VP8LAllocateHistogram(int cache_bits);
 void VP8LHistogramAddSinglePixOrCopy(VP8LHistogram* const histo,
                                      const PixOrCopy* const v);
 
-// Estimate how many bits the combined entropy of literals and distance
-// approximately maps to.
-double VP8LHistogramEstimateBits(const VP8LHistogram* const p);
-
-// This function estimates the cost in bits excluding the bits needed to
-// represent the entropy code itself.
-double VP8LHistogramEstimateBitsBulk(const VP8LHistogram* const p);
-
 static WEBP_INLINE int VP8LHistogramNumCodes(int palette_code_bits) {
   return NUM_LITERAL_CODES + NUM_LENGTH_CODES +
       ((palette_code_bits > 0) ? (1 << palette_code_bits) : 0);
@@ -107,10 +100,22 @@ static WEBP_INLINE int VP8LHistogramNumCodes(int palette_code_bits) {
 // Builds the histogram image.
 int VP8LGetHistoImageSymbols(int xsize, int ysize,
                              const VP8LBackwardRefs* const refs,
-                             int quality, int histogram_bits, int cache_bits,
+                             int quality, int low_effort,
+                             int histogram_bits, int cache_bits,
                              VP8LHistogramSet* const image_in,
+                             VP8LHistogramSet* const tmp_histos,
                              uint16_t* const histogram_symbols);
 
+// Returns the entropy for the symbols in the input array.
+// Also sets trivial_symbol to the code value, if the array has only one code
+// value. Otherwise, set it to VP8L_NON_TRIVIAL_SYM.
+double VP8LBitsEntropy(const uint32_t* const array, int n,
+                       uint32_t* const trivial_symbol);
+
+// Estimate how many bits the combined entropy of literals and distance
+// approximately maps to.
+double VP8LHistogramEstimateBits(const VP8LHistogram* const p);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/3rdparty/libwebp/src/enc/iterator.c b/src/3rdparty/libwebp/src/enc/iterator.c
index e42ad00..99d960a 100644
--- a/src/3rdparty/libwebp/src/enc/iterator.c
+++ b/src/3rdparty/libwebp/src/enc/iterator.c
@@ -70,13 +70,13 @@ void VP8IteratorInit(VP8Encoder* const enc, VP8EncIterator* const it) {
   it->enc_ = enc;
   it->y_stride_  = enc->pic_->y_stride;
   it->uv_stride_ = enc->pic_->uv_stride;
-  it->yuv_in_   = (uint8_t*)DO_ALIGN(it->yuv_mem_);
-  it->yuv_out_  = it->yuv_in_ + YUV_SIZE;
-  it->yuv_out2_ = it->yuv_out_ + YUV_SIZE;
-  it->yuv_p_    = it->yuv_out2_ + YUV_SIZE;
+  it->yuv_in_   = (uint8_t*)WEBP_ALIGN(it->yuv_mem_);
+  it->yuv_out_  = it->yuv_in_ + YUV_SIZE_ENC;
+  it->yuv_out2_ = it->yuv_out_ + YUV_SIZE_ENC;
+  it->yuv_p_    = it->yuv_out2_ + YUV_SIZE_ENC;
   it->lf_stats_ = enc->lf_stats_;
   it->percent0_ = enc->percent_;
-  it->y_left_ = (uint8_t*)DO_ALIGN(it->yuv_left_mem_ + 1);
+  it->y_left_ = (uint8_t*)WEBP_ALIGN(it->yuv_left_mem_ + 1);
   it->u_left_ = it->y_left_ + 16 + 16;
   it->v_left_ = it->u_left_ + 16;
   VP8IteratorReset(it);
@@ -136,9 +136,9 @@ void VP8IteratorImport(VP8EncIterator* const it, uint8_t* tmp_32) {
   const int uv_w = (w + 1) >> 1;
   const int uv_h = (h + 1) >> 1;
 
-  ImportBlock(ysrc, pic->y_stride,  it->yuv_in_ + Y_OFF, w, h, 16);
-  ImportBlock(usrc, pic->uv_stride, it->yuv_in_ + U_OFF, uv_w, uv_h, 8);
-  ImportBlock(vsrc, pic->uv_stride, it->yuv_in_ + V_OFF, uv_w, uv_h, 8);
+  ImportBlock(ysrc, pic->y_stride,  it->yuv_in_ + Y_OFF_ENC, w, h, 16);
+  ImportBlock(usrc, pic->uv_stride, it->yuv_in_ + U_OFF_ENC, uv_w, uv_h, 8);
+  ImportBlock(vsrc, pic->uv_stride, it->yuv_in_ + V_OFF_ENC, uv_w, uv_h, 8);
 
   if (tmp_32 == NULL) return;
 
@@ -185,9 +185,9 @@ void VP8IteratorExport(const VP8EncIterator* const it) {
   const VP8Encoder* const enc = it->enc_;
   if (enc->config_->show_compressed) {
     const int x = it->x_, y = it->y_;
-    const uint8_t* const ysrc = it->yuv_out_ + Y_OFF;
-    const uint8_t* const usrc = it->yuv_out_ + U_OFF;
-    const uint8_t* const vsrc = it->yuv_out_ + V_OFF;
+    const uint8_t* const ysrc = it->yuv_out_ + Y_OFF_ENC;
+    const uint8_t* const usrc = it->yuv_out_ + U_OFF_ENC;
+    const uint8_t* const vsrc = it->yuv_out_ + V_OFF_ENC;
     const WebPPicture* const pic = enc->pic_;
     uint8_t* const ydst = pic->y + (y * pic->y_stride + x) * 16;
     uint8_t* const udst = pic->u + (y * pic->uv_stride + x) * 8;
@@ -286,8 +286,8 @@ void VP8IteratorBytesToNz(VP8EncIterator* const it) {
 void VP8IteratorSaveBoundary(VP8EncIterator* const it) {
   VP8Encoder* const enc = it->enc_;
   const int x = it->x_, y = it->y_;
-  const uint8_t* const ysrc = it->yuv_out_ + Y_OFF;
-  const uint8_t* const uvsrc = it->yuv_out_ + U_OFF;
+  const uint8_t* const ysrc = it->yuv_out_ + Y_OFF_ENC;
+  const uint8_t* const uvsrc = it->yuv_out_ + U_OFF_ENC;
   if (x < enc->mb_w_ - 1) {   // left
     int i;
     for (i = 0; i < 16; ++i) {
diff --git a/src/3rdparty/libwebp/src/enc/near_lossless.c b/src/3rdparty/libwebp/src/enc/near_lossless.c
new file mode 100644
index 0000000..9bc0f0e
--- /dev/null
+++ b/src/3rdparty/libwebp/src/enc/near_lossless.c
@@ -0,0 +1,160 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Near-lossless image preprocessing adjusts pixel values to help
+// compressibility with a guarantee of maximum deviation between original and
+// resulting pixel values.
+//
+// Author: Jyrki Alakuijala (jyrki@google.com)
+// Converted to C by Aleksander Kramarz (akramarz@google.com)
+
+#include <stdlib.h>
+
+#include "../dsp/lossless.h"
+#include "../utils/utils.h"
+#include "./vp8enci.h"
+
+#define MIN_DIM_FOR_NEAR_LOSSLESS 64
+#define MAX_LIMIT_BITS             5
+
+// Quantizes 'initial' to '*val' and stores twice |a - *val| in '*distance'.
+static void GetValAndDistance(int a, int initial, int bits,
+                              int* const val, int* const distance) {
+  const int mask = ~((1 << bits) - 1);  // clears the 'bits' lowest bits
+  *val = (initial & mask) | (initial >> (8 - bits));  // low bits <- top bits
+  *distance = 2 * abs(a - *val);  // always even; callers may add a penalty
+}
+
+// Saturates 'val' to the 8-bit range [0, 255].
+static int Clamp8b(int val) {
+  if (val < 0) return 0;
+  if (val > 0xff) return 0xff;
+  return val;
+}
+
+// Quantizes values {a, a+(1<<bits), a-(1<<bits)} and returns the nearest one.
+static int FindClosestDiscretized(int a, int bits) {
+  int best_val = a, i;
+  int min_distance = 256;
+
+  for (i = -1; i <= 1; ++i) {
+    int candidate, distance;
+    const int val = Clamp8b(a + i * (1 << bits));  // a - step, a, a + step
+    GetValAndDistance(a, val, bits, &candidate, &distance);
+    if (i != 0) {
+      ++distance;  // tie-break penalty: 'distance' is even before this
+    }
+    // Smallest distance but favor i == 0 over i == -1 and i == 1
+    // since that keeps the overall intensity more constant in the
+    // images.
+    if (distance < min_distance) {
+      min_distance = distance;
+      best_val = candidate;
+    }
+  }
+  return best_val;
+}
+
+// Applies FindClosestDiscretized to all channels of pixel. Results are cast
+// to uint32_t before shifting so that 'x << 24' cannot overflow a signed int.
+static uint32_t ClosestDiscretizedArgb(uint32_t a, int bits) {
+  return ((uint32_t)FindClosestDiscretized(a >> 24, bits) << 24) |
+         ((uint32_t)FindClosestDiscretized((a >> 16) & 0xff, bits) << 16) |
+         ((uint32_t)FindClosestDiscretized((a >> 8) & 0xff, bits) << 8) |
+         ((uint32_t)FindClosestDiscretized(a & 0xff, bits));
+}
+
+// Returns 1 if, for each of the four 8-bit channels of pixels a and b, the
+// absolute difference is strictly below 'limit'; returns 0 otherwise.
+static int IsNear(uint32_t a, uint32_t b, int limit) {
+  int k;
+  for (k = 0; k < 4; ++k) {
+    const int delta =
+        (int)((a >> (k * 8)) & 0xff) - (int)((b >> (k * 8)) & 0xff);
+    if (delta >= limit || delta <= -limit) {
+      return 0;
+    }
+  }
+  return 1;
+}
+
+static int IsSmooth(const uint32_t* const prev_row,
+                    const uint32_t* const curr_row,
+                    const uint32_t* const next_row,
+                    int ix, int limit) {
+  // Nonzero iff curr_row[ix] is near (per IsNear) all 4-connected neighbors.
+  return (IsNear(curr_row[ix], curr_row[ix - 1], limit) &&  // left
+          IsNear(curr_row[ix], curr_row[ix + 1], limit) &&  // right
+          IsNear(curr_row[ix], prev_row[ix], limit) &&      // above
+          IsNear(curr_row[ix], next_row[ix], limit));       // below
+}
+
+// Replaces each non-smooth interior pixel of 'argb' by its discretized value.
+static void NearLossless(int xsize, int ysize, uint32_t* argb,
+                         int limit_bits, uint32_t* copy_buffer) {
+  int x, y;
+  const int limit = 1 << limit_bits;
+  uint32_t* prev_row = copy_buffer;  // scratch: three rows of xsize pixels
+  uint32_t* curr_row = prev_row + xsize;
+  uint32_t* next_row = curr_row + xsize;
+  memcpy(copy_buffer, argb, xsize * 2 * sizeof(argb[0]));  // rows 0 and 1
+
+  for (y = 1; y < ysize - 1; ++y) {  // first and last rows are left untouched
+    uint32_t* const curr_argb_row = argb + y * xsize;
+    uint32_t* const next_argb_row = curr_argb_row + xsize;
+    memcpy(next_row, next_argb_row, xsize * sizeof(argb[0]));
+    for (x = 1; x < xsize - 1; ++x) {  // first and last columns are skipped
+      if (!IsSmooth(prev_row, curr_row, next_row, x, limit)) {
+        curr_argb_row[x] = ClosestDiscretizedArgb(curr_row[x], limit_bits);
+      }
+    }
+    {
+      // Rotate the row buffers: the old prev_row becomes the next scratch row.
+      uint32_t* const temp = prev_row;
+      prev_row = curr_row;
+      curr_row = next_row;
+      next_row = temp;
+    }
+  }
+}
+
+static int QualityToLimitBits(int quality) {
+  // quality mapping (integer division by 20):
+  //   0..19 -> 5
+  //  20..39 -> 4
+  //  40..59 -> 3
+  //  60..79 -> 2
+  //  80..99 -> 1
+  //  100    -> 0
+  return MAX_LIMIT_BITS - quality / 20;
+}
+
+// Applies near-lossless preprocessing to 'argb' (xsize x ysize) in place.
+// Returns 0 if the scratch buffer cannot be allocated, 1 otherwise.
+int VP8ApplyNearLossless(int xsize, int ysize, uint32_t* argb, int quality) {
+  int i;
+  uint32_t* const copy_buffer =
+      (uint32_t*)WebPSafeMalloc(xsize * 3, sizeof(*copy_buffer));
+  const int limit_bits = QualityToLimitBits(quality);
+  assert(argb != NULL);
+  assert(limit_bits >= 0 && limit_bits <= MAX_LIMIT_BITS);
+  if (copy_buffer == NULL) return 0;
+  // Skip small icon images. Also skip images with fewer than 3 rows: they
+  // have no interior row, and NearLossless() always reads rows 0 and 1.
+  if ((xsize < MIN_DIM_FOR_NEAR_LOSSLESS &&
+       ysize < MIN_DIM_FOR_NEAR_LOSSLESS) || ysize < 3) {
+    WebPSafeFree(copy_buffer);
+    return 1;
+  }
+  for (i = limit_bits; i != 0; --i) {
+    NearLossless(xsize, ysize, argb, i, copy_buffer);
+  }
+  WebPSafeFree(copy_buffer);
+  return 1;
+}
diff --git a/src/3rdparty/libwebp/src/enc/picture.c b/src/3rdparty/libwebp/src/enc/picture.c
index 9a66fbe..26679a7 100644
--- a/src/3rdparty/libwebp/src/enc/picture.c
+++ b/src/3rdparty/libwebp/src/enc/picture.c
@@ -15,6 +15,7 @@
 #include <stdlib.h>
 
 #include "./vp8enci.h"
+#include "../dsp/dsp.h"
 #include "../utils/utils.h"
 
 //------------------------------------------------------------------------------
diff --git a/src/3rdparty/libwebp/src/enc/picture_csp.c b/src/3rdparty/libwebp/src/enc/picture_csp.c
index 7875f62..0ef5f9e 100644
--- a/src/3rdparty/libwebp/src/enc/picture_csp.c
+++ b/src/3rdparty/libwebp/src/enc/picture_csp.c
@@ -32,10 +32,6 @@ static const union {
 } test_endian = { 0xff000000u };
 #define ALPHA_IS_LAST (test_endian.bytes[3] == 0xff)
 
-static WEBP_INLINE uint32_t MakeARGB32(int a, int r, int g, int b) {
-  return (((uint32_t)a << 24) | (r << 16) | (g << 8) | b);
-}
-
 //------------------------------------------------------------------------------
 // Detection of non-trivial transparency
 
@@ -89,9 +85,9 @@ int WebPPictureHasTransparency(const WebPPicture* picture) {
 
 static int kLinearToGammaTab[kGammaTabSize + 1];
 static uint16_t kGammaToLinearTab[256];
-static int kGammaTablesOk = 0;
+static volatile int kGammaTablesOk = 0;
 
-static void InitGammaTables(void) {
+static WEBP_TSAN_IGNORE_FUNCTION void InitGammaTables(void) {
   if (!kGammaTablesOk) {
     int v;
     const double scale = (double)(1 << kGammaTabFix) / kGammaScale;
@@ -130,7 +126,7 @@ static WEBP_INLINE int LinearToGamma(uint32_t base_value, int shift) {
 
 #else
 
-static void InitGammaTables(void) {}
+static WEBP_TSAN_IGNORE_FUNCTION void InitGammaTables(void) {}
 static WEBP_INLINE uint32_t GammaToLinear(uint8_t v) { return v; }
 static WEBP_INLINE int LinearToGamma(uint32_t base_value, int shift) {
   return (int)(base_value << shift);
@@ -162,19 +158,15 @@ static int RGBToV(int r, int g, int b, VP8Random* const rg) {
 static const int kNumIterations = 6;
 static const int kMinDimensionIterativeConversion = 4;
 
-// We use a-priori a different precision for storing RGB and Y/W components
-// We could use YFIX=0 and only uint8_t for fixed_y_t, but it produces some
+// We could use SFIX=0 and only uint8_t for fixed_y_t, but it produces some
 // banding sometimes. Better use extra precision.
-// TODO(skal): cleanup once TFIX/YFIX values are fixed.
+#define SFIX 2                // fixed-point precision of RGB and Y/W
+typedef int16_t fixed_t;      // signed type with extra SFIX precision for UV
+typedef uint16_t fixed_y_t;   // unsigned type with extra SFIX precision for W
 
-typedef int16_t fixed_t;      // signed type with extra TFIX precision for UV
-typedef uint16_t fixed_y_t;   // unsigned type with extra YFIX precision for W
-#define TFIX 6   // fixed-point precision of RGB
-#define YFIX 2   // fixed point precision for Y/W
-
-#define THALF ((1 << TFIX) >> 1)
-#define MAX_Y_T ((256 << YFIX) - 1)
-#define TROUNDER (1 << (YUV_FIX + TFIX - 1))
+#define SHALF (1 << SFIX >> 1)
+#define MAX_Y_T ((256 << SFIX) - 1)
+#define SROUNDER (1 << (YUV_FIX + SFIX - 1))
 
 #if defined(USE_GAMMA_COMPRESSION)
 
@@ -184,9 +176,9 @@ typedef uint16_t fixed_y_t;   // unsigned type with extra YFIX precision for W
 #define kGammaF 2.2
 static float kGammaToLinearTabF[MAX_Y_T + 1];   // size scales with Y_FIX
 static float kLinearToGammaTabF[kGammaTabSize + 2];
-static int kGammaTablesFOk = 0;
+static volatile int kGammaTablesFOk = 0;
 
-static void InitGammaTablesF(void) {
+static WEBP_TSAN_IGNORE_FUNCTION void InitGammaTablesF(void) {
   if (!kGammaTablesFOk) {
     int v;
     const double norm = 1. / MAX_Y_T;
@@ -207,52 +199,31 @@ static WEBP_INLINE float GammaToLinearF(int v) {
   return kGammaToLinearTabF[v];
 }
 
-static WEBP_INLINE float LinearToGammaF(float value) {
+static WEBP_INLINE int LinearToGammaF(float value) {
   const float v = value * kGammaTabSize;
   const int tab_pos = (int)v;
   const float x = v - (float)tab_pos;      // fractional part
   const float v0 = kLinearToGammaTabF[tab_pos + 0];
   const float v1 = kLinearToGammaTabF[tab_pos + 1];
   const float y = v1 * x + v0 * (1.f - x);  // interpolate
-  return y;
+  return (int)(y + .5);
 }
 
 #else
 
-static void InitGammaTablesF(void) {}
+static WEBP_TSAN_IGNORE_FUNCTION void InitGammaTablesF(void) {}
 static WEBP_INLINE float GammaToLinearF(int v) {
   const float norm = 1.f / MAX_Y_T;
   return norm * v;
 }
-static WEBP_INLINE float LinearToGammaF(float value) {
-  return MAX_Y_T * value;
+static WEBP_INLINE int LinearToGammaF(float value) {
+  return (int)(MAX_Y_T * value + .5);
 }
 
 #endif    // USE_GAMMA_COMPRESSION
 
 //------------------------------------------------------------------------------
 
-// precision: YFIX -> TFIX
-static WEBP_INLINE int FixedYToW(int v) {
-#if TFIX == YFIX
-  return v;
-#elif TFIX >= YFIX
-  return v << (TFIX - YFIX);
-#else
-  return v >> (YFIX - TFIX);
-#endif
-}
-
-static WEBP_INLINE int FixedWToY(int v) {
-#if TFIX == YFIX
-  return v;
-#elif YFIX >= TFIX
-  return v << (YFIX - TFIX);
-#else
-  return v >> (TFIX - YFIX);
-#endif
-}
-
 static uint8_t clip_8b(fixed_t v) {
   return (!(v & ~0xff)) ? (uint8_t)v : (v < 0) ? 0u : 255u;
 }
@@ -261,13 +232,6 @@ static fixed_y_t clip_y(int y) {
   return (!(y & ~MAX_Y_T)) ? (fixed_y_t)y : (y < 0) ? 0 : MAX_Y_T;
 }
 
-// precision: TFIX -> YFIX
-static fixed_y_t clip_fixed_t(fixed_t v) {
-  const int y = FixedWToY(v);
-  const fixed_y_t w = clip_y(y);
-  return w;
-}
-
 //------------------------------------------------------------------------------
 
 static int RGBToGray(int r, int g, int b) {
@@ -279,7 +243,7 @@ static float RGBToGrayF(float r, float g, float b) {
   return 0.299f * r + 0.587f * g + 0.114f * b;
 }
 
-static float ScaleDown(int a, int b, int c, int d) {
+static int ScaleDown(int a, int b, int c, int d) {
   const float A = GammaToLinearF(a);
   const float B = GammaToLinearF(b);
   const float C = GammaToLinearF(c);
@@ -293,30 +257,36 @@ static WEBP_INLINE void UpdateW(const fixed_y_t* src, fixed_y_t* dst, int len) {
     const float G = GammaToLinearF(src[1]);
     const float B = GammaToLinearF(src[2]);
     const float Y = RGBToGrayF(R, G, B);
-    *dst++ = (fixed_y_t)(LinearToGammaF(Y) + .5);
+    *dst++ = (fixed_y_t)LinearToGammaF(Y);
     src += 3;
   }
 }
 
-static WEBP_INLINE void UpdateChroma(const fixed_y_t* src1,
-                                     const fixed_y_t* src2,
-                                     fixed_t* dst, fixed_y_t* tmp, int len) {
+static int UpdateChroma(const fixed_y_t* src1,
+                        const fixed_y_t* src2,
+                        fixed_t* dst, fixed_y_t* tmp, int len) {
+  int diff = 0;
   while (len--> 0) {
-    const float r = ScaleDown(src1[0], src1[3], src2[0], src2[3]);
-    const float g = ScaleDown(src1[1], src1[4], src2[1], src2[4]);
-    const float b = ScaleDown(src1[2], src1[5], src2[2], src2[5]);
-    const float W = RGBToGrayF(r, g, b);
-    dst[0] = (fixed_t)FixedYToW((int)(r - W));
-    dst[1] = (fixed_t)FixedYToW((int)(g - W));
-    dst[2] = (fixed_t)FixedYToW((int)(b - W));
+    const int r = ScaleDown(src1[0], src1[3], src2[0], src2[3]);
+    const int g = ScaleDown(src1[1], src1[4], src2[1], src2[4]);
+    const int b = ScaleDown(src1[2], src1[5], src2[2], src2[5]);
+    const int W = RGBToGray(r, g, b);
+    const int r_avg = (src1[0] + src1[3] + src2[0] + src2[3] + 2) >> 2;
+    const int g_avg = (src1[1] + src1[4] + src2[1] + src2[4] + 2) >> 2;
+    const int b_avg = (src1[2] + src1[5] + src2[2] + src2[5] + 2) >> 2;
+    dst[0] = (fixed_t)(r - W);
+    dst[1] = (fixed_t)(g - W);
+    dst[2] = (fixed_t)(b - W);
     dst += 3;
     src1 += 6;
     src2 += 6;
     if (tmp != NULL) {
-      tmp[0] = tmp[1] = clip_y((int)(W + .5));
+      tmp[0] = tmp[1] = clip_y(W);
       tmp += 2;
     }
+    diff += abs(RGBToGray(r_avg, g_avg, b_avg) - W);
   }
+  return diff;
 }
 
 //------------------------------------------------------------------------------
@@ -336,9 +306,8 @@ static WEBP_INLINE int Filter2(int A, int B) { return (A * 3 + B + 2) >> 2; }
 
 //------------------------------------------------------------------------------
 
-// 8bit -> YFIX
-static WEBP_INLINE fixed_y_t UpLift(uint8_t a) {
-  return ((fixed_y_t)a << YFIX) | (1 << (YFIX - 1));
+static WEBP_INLINE fixed_y_t UpLift(uint8_t a) {  // 8bit -> SFIX
+  return ((fixed_y_t)a << SFIX) | SHALF;
 }
 
 static void ImportOneRow(const uint8_t* const r_ptr,
@@ -368,50 +337,48 @@ static void InterpolateTwoRows(const fixed_y_t* const best_y,
                                fixed_y_t* const out2) {
   int i, k;
   {  // special boundary case for i==0
-    const int W0 = FixedYToW(best_y[0]);
-    const int W1 = FixedYToW(best_y[w]);
+    const int W0 = best_y[0];
+    const int W1 = best_y[w];
     for (k = 0; k <= 2; ++k) {
-      out1[k] = clip_fixed_t(Filter2(cur_uv[k], prev_uv[k]) + W0);
-      out2[k] = clip_fixed_t(Filter2(cur_uv[k], next_uv[k]) + W1);
+      out1[k] = clip_y(Filter2(cur_uv[k], prev_uv[k]) + W0);
+      out2[k] = clip_y(Filter2(cur_uv[k], next_uv[k]) + W1);
     }
   }
   for (i = 1; i < w - 1; ++i) {
-    const int W0 = FixedYToW(best_y[i + 0]);
-    const int W1 = FixedYToW(best_y[i + w]);
+    const int W0 = best_y[i + 0];
+    const int W1 = best_y[i + w];
     const int off = 3 * (i >> 1);
     for (k = 0; k <= 2; ++k) {
       const int tmp0 = Filter(cur_uv + off + k, prev_uv + off + k, i & 1);
       const int tmp1 = Filter(cur_uv + off + k, next_uv + off + k, i & 1);
-      out1[3 * i + k] = clip_fixed_t(tmp0 + W0);
-      out2[3 * i + k] = clip_fixed_t(tmp1 + W1);
+      out1[3 * i + k] = clip_y(tmp0 + W0);
+      out2[3 * i + k] = clip_y(tmp1 + W1);
     }
   }
   {  // special boundary case for i == w - 1
-    const int W0 = FixedYToW(best_y[i + 0]);
-    const int W1 = FixedYToW(best_y[i + w]);
+    const int W0 = best_y[i + 0];
+    const int W1 = best_y[i + w];
     const int off = 3 * (i >> 1);
     for (k = 0; k <= 2; ++k) {
-      out1[3 * i + k] =
-          clip_fixed_t(Filter2(cur_uv[off + k], prev_uv[off + k]) + W0);
-      out2[3 * i + k] =
-          clip_fixed_t(Filter2(cur_uv[off + k], next_uv[off + k]) + W1);
+      out1[3 * i + k] = clip_y(Filter2(cur_uv[off + k], prev_uv[off + k]) + W0);
+      out2[3 * i + k] = clip_y(Filter2(cur_uv[off + k], next_uv[off + k]) + W1);
     }
   }
 }
 
 static WEBP_INLINE uint8_t ConvertRGBToY(int r, int g, int b) {
-  const int luma = 16839 * r + 33059 * g + 6420 * b + TROUNDER;
-  return clip_8b(16 + (luma >> (YUV_FIX + TFIX)));
+  const int luma = 16839 * r + 33059 * g + 6420 * b + SROUNDER;
+  return clip_8b(16 + (luma >> (YUV_FIX + SFIX)));
 }
 
 static WEBP_INLINE uint8_t ConvertRGBToU(int r, int g, int b) {
-  const int u =  -9719 * r - 19081 * g + 28800 * b + TROUNDER;
-  return clip_8b(128 + (u >> (YUV_FIX + TFIX)));
+  const int u =  -9719 * r - 19081 * g + 28800 * b + SROUNDER;
+  return clip_8b(128 + (u >> (YUV_FIX + SFIX)));
 }
 
 static WEBP_INLINE uint8_t ConvertRGBToV(int r, int g, int b) {
-  const int v = +28800 * r - 24116 * g -  4684 * b + TROUNDER;
-  return clip_8b(128 + (v >> (YUV_FIX + TFIX)));
+  const int v = +28800 * r - 24116 * g -  4684 * b + SROUNDER;
+  return clip_8b(128 + (v >> (YUV_FIX + SFIX)));
 }
 
 static int ConvertWRGBToYUV(const fixed_y_t* const best_y,
@@ -426,7 +393,7 @@ static int ConvertWRGBToYUV(const fixed_y_t* const best_y,
     for (i = 0; i < picture->width; ++i) {
       const int off = 3 * ((i >> 1) + (j >> 1) * uv_w);
       const int off2 = i + j * picture->y_stride;
-      const int W = FixedYToW(best_y[i + j * w]);
+      const int W = best_y[i + j * w];
       const int r = best_uv[off + 0] + W;
       const int g = best_uv[off + 1] + W;
       const int b = best_uv[off + 2] + W;
@@ -475,6 +442,10 @@ static int PreprocessARGB(const uint8_t* const r_ptr,
   fixed_t* const target_uv = SAFE_ALLOC(uv_w * 3, uv_h, fixed_t);
   fixed_t* const best_rgb_uv = SAFE_ALLOC(uv_w * 3, 1, fixed_t);
   int ok;
+  int diff_sum = 0;
+  const int first_diff_threshold = (int)(2.5 * w * h);
+  const int min_improvement = 5;   // stop if improvement is below this %
+  const int min_first_improvement = 80;
 
   if (best_y == NULL || best_uv == NULL ||
       target_y == NULL || target_uv == NULL ||
@@ -507,7 +478,7 @@ static int PreprocessARGB(const uint8_t* const r_ptr,
     }
     UpdateW(src1, target_y + (j + 0) * w, w);
     UpdateW(src2, target_y + (j + 1) * w, w);
-    UpdateChroma(src1, src2, target_uv + uv_off, dst_y, uv_w);
+    diff_sum += UpdateChroma(src1, src2, target_uv + uv_off, dst_y, uv_w);
     memcpy(best_uv + uv_off, target_uv + uv_off, 3 * uv_w * sizeof(*best_uv));
     memcpy(dst_y + w, dst_y, w * sizeof(*dst_y));
   }
@@ -517,10 +488,11 @@ static int PreprocessARGB(const uint8_t* const r_ptr,
     int k;
     const fixed_t* cur_uv = best_uv;
     const fixed_t* prev_uv = best_uv;
+    const int old_diff_sum = diff_sum;
+    diff_sum = 0;
     for (j = 0; j < h; j += 2) {
       fixed_y_t* const src1 = tmp_buffer;
       fixed_y_t* const src2 = tmp_buffer + 3 * w;
-
       {
         const fixed_t* const next_uv = cur_uv + ((j < h - 2) ? 3 * uv_w : 0);
         InterpolateTwoRows(best_y + j * w, prev_uv, cur_uv, next_uv,
@@ -531,7 +503,7 @@ static int PreprocessARGB(const uint8_t* const r_ptr,
 
       UpdateW(src1, best_rgb_y + 0 * w, w);
       UpdateW(src2, best_rgb_y + 1 * w, w);
-      UpdateChroma(src1, src2, best_rgb_uv, NULL, uv_w);
+      diff_sum += UpdateChroma(src1, src2, best_rgb_uv, NULL, uv_w);
 
       // update two rows of Y and one row of RGB
       for (i = 0; i < 2 * w; ++i) {
@@ -553,7 +525,23 @@ static int PreprocessARGB(const uint8_t* const r_ptr,
         }
       }
     }
-    // TODO(skal): add early-termination criterion
+    // test exit condition
+    if (diff_sum > 0) {
+      // % change vs. previous pass (64-bit: 100 * delta can overflow 'int').
+      const int64_t improvement =
+          100 * (int64_t)abs(diff_sum - old_diff_sum) / diff_sum;
+      // Stop after the first pass if it is already good and made no big jump
+      // (a big jump means a few extra iterations are worth trying).
+      if (iter == 0 && diff_sum < first_diff_threshold &&
+          improvement < min_first_improvement) {
+        break;
+      }
+      // then, check if improvement is stalling.
+      if (improvement < min_improvement) {
+        break;
+      }
+    } else {
+      break;
+    }
   }
 
   // final reconstruction
@@ -762,23 +750,20 @@ static WEBP_INLINE void ConvertRowToY(const uint8_t* const r_ptr,
                                       int width,
                                       VP8Random* const rg) {
   int i, j;
-  for (i = 0, j = 0; i < width; ++i, j += step) {
+  for (i = 0, j = 0; i < width; i += 1, j += step) {
     dst_y[i] = RGBToY(r_ptr[j], g_ptr[j], b_ptr[j], rg);
   }
 }
 
-static WEBP_INLINE void ConvertRowsToUVWithAlpha(const uint8_t* const r_ptr,
-                                                 const uint8_t* const g_ptr,
-                                                 const uint8_t* const b_ptr,
-                                                 const uint8_t* const a_ptr,
-                                                 int rgb_stride,
-                                                 uint8_t* const dst_u,
-                                                 uint8_t* const dst_v,
-                                                 int width,
-                                                 VP8Random* const rg) {
+static WEBP_INLINE void AccumulateRGBA(const uint8_t* const r_ptr,
+                                       const uint8_t* const g_ptr,
+                                       const uint8_t* const b_ptr,
+                                       const uint8_t* const a_ptr,
+                                       int rgb_stride,
+                                       uint16_t* dst, int width) {
   int i, j;
-  // we loop over 2x2 blocks and produce one U/V value for each.
-  for (i = 0, j = 0; i < (width >> 1); ++i, j += 2 * sizeof(uint32_t)) {
+  // we loop over 2x2 blocks and produce one R/G/B/A value for each.
+  for (i = 0, j = 0; i < (width >> 1); i += 1, j += 2 * 4, dst += 4) {
     const uint32_t a = SUM4ALPHA(a_ptr + j);
     int r, g, b;
     if (a == 4 * 0xff || a == 0) {
@@ -790,8 +775,10 @@ static WEBP_INLINE void ConvertRowsToUVWithAlpha(const uint8_t* const r_ptr,
       g = LinearToGammaWeighted(g_ptr + j, a_ptr + j, a, 4, rgb_stride);
       b = LinearToGammaWeighted(b_ptr + j, a_ptr + j, a, 4, rgb_stride);
     }
-    dst_u[i] = RGBToU(r, g, b, rg);
-    dst_v[i] = RGBToV(r, g, b, rg);
+    dst[0] = r;
+    dst[1] = g;
+    dst[2] = b;
+    dst[3] = a;
   }
   if (width & 1) {
     const uint32_t a = 2u * SUM2ALPHA(a_ptr + j);
@@ -805,31 +792,39 @@ static WEBP_INLINE void ConvertRowsToUVWithAlpha(const uint8_t* const r_ptr,
       g = LinearToGammaWeighted(g_ptr + j, a_ptr + j, a, 0, rgb_stride);
       b = LinearToGammaWeighted(b_ptr + j, a_ptr + j, a, 0, rgb_stride);
     }
-    dst_u[i] = RGBToU(r, g, b, rg);
-    dst_v[i] = RGBToV(r, g, b, rg);
+    dst[0] = r;
+    dst[1] = g;
+    dst[2] = b;
+    dst[3] = a;
+  }
+}
+
+static WEBP_INLINE void AccumulateRGB(const uint8_t* const r_ptr,
+                                      const uint8_t* const g_ptr,
+                                      const uint8_t* const b_ptr,
+                                      int step, int rgb_stride,  // NOTE(review): rgb_stride is not named below; presumably read by SUM4/SUM2 - confirm
+                                      uint16_t* dst, int width) {
+  int i, j;  // i: output sample index, j: offset into the r/g/b source rows
+  for (i = 0, j = 0; i < (width >> 1); i += 1, j += 2 * step, dst += 4) {  // 4 slots per output sample
+    dst[0] = SUM4(r_ptr + j, step);
+    dst[1] = SUM4(g_ptr + j, step);
+    dst[2] = SUM4(b_ptr + j, step);  // dst[3] is left unwritten by this function
+  }
+  if (width & 1) {  // odd width: one trailing sample, handled with SUM2
+    dst[0] = SUM2(r_ptr + j);
+    dst[1] = SUM2(g_ptr + j);
+    dst[2] = SUM2(b_ptr + j);
   }
 }
 
-static WEBP_INLINE void ConvertRowsToUV(const uint8_t* const r_ptr,
-                                        const uint8_t* const g_ptr,
-                                        const uint8_t* const b_ptr,
-                                        int step, int rgb_stride,
+static WEBP_INLINE void ConvertRowsToUV(const uint16_t* rgb,
                                         uint8_t* const dst_u,
                                         uint8_t* const dst_v,
                                         int width,
                                         VP8Random* const rg) {
-  int i, j;
-  for (i = 0, j = 0; i < (width >> 1); ++i, j += 2 * step) {
-    const int r = SUM4(r_ptr + j, step);
-    const int g = SUM4(g_ptr + j, step);
-    const int b = SUM4(b_ptr + j, step);
-    dst_u[i] = RGBToU(r, g, b, rg);
-    dst_v[i] = RGBToV(r, g, b, rg);
-  }
-  if (width & 1) {
-    const int r = SUM2(r_ptr + j);
-    const int g = SUM2(g_ptr + j);
-    const int b = SUM2(b_ptr + j);
+  int i;
+  for (i = 0; i < width; i += 1, rgb += 4) {
+    const int r = rgb[0], g = rgb[1], b = rgb[2];
     dst_u[i] = RGBToU(r, g, b, rg);
     dst_v[i] = RGBToV(r, g, b, rg);
   }
@@ -848,6 +843,7 @@ static int ImportYUVAFromRGBA(const uint8_t* const r_ptr,
   const int width = picture->width;
   const int height = picture->height;
   const int has_alpha = CheckNonOpaque(a_ptr, width, height, step, rgb_stride);
+  const int is_rgb = (r_ptr < b_ptr);  // otherwise it's bgr
 
   picture->colorspace = has_alpha ? WEBP_YUV420A : WEBP_YUV420;
   picture->use_argb = 0;
@@ -864,7 +860,7 @@ static int ImportYUVAFromRGBA(const uint8_t* const r_ptr,
   if (has_alpha) {
     WebPInitAlphaProcessing();
     assert(step == 4);
-#if defined(USE_INVERSE_ALPHA_TABLE)
+#if defined(USE_GAMMA_COMPRESSION) && defined(USE_INVERSE_ALPHA_TABLE)
     assert(kAlphaFix + kGammaFix <= 31);
 #endif
   }
@@ -879,6 +875,11 @@ static int ImportYUVAFromRGBA(const uint8_t* const r_ptr,
                        picture->a, picture->a_stride);
     }
   } else {
+    const int uv_width = (width + 1) >> 1;
+    int use_dsp = (step == 3);  // use special function in this case
+    // temporary storage for accumulated R/G/B values during conversion to U/V
+    uint16_t* const tmp_rgb =
+        (uint16_t*)WebPSafeMalloc(4 * uv_width, sizeof(*tmp_rgb));
     uint8_t* dst_y = picture->y;
     uint8_t* dst_u = picture->u;
     uint8_t* dst_v = picture->v;
@@ -889,19 +890,32 @@ static int ImportYUVAFromRGBA(const uint8_t* const r_ptr,
     if (dithering > 0.) {
       VP8InitRandom(&base_rg, dithering);
       rg = &base_rg;
+      use_dsp = 0;   // can't use dsp in this case
     }
-
+    WebPInitConvertARGBToYUV();
     InitGammaTables();
 
+    if (tmp_rgb == NULL) return 0;  // malloc error
+
     // Downsample Y/U/V planes, two rows at a time
     for (y = 0; y < (height >> 1); ++y) {
       int rows_have_alpha = has_alpha;
       const int off1 = (2 * y + 0) * rgb_stride;
       const int off2 = (2 * y + 1) * rgb_stride;
-      ConvertRowToY(r_ptr + off1, g_ptr + off1, b_ptr + off1, step,
-                    dst_y, width, rg);
-      ConvertRowToY(r_ptr + off2, g_ptr + off2, b_ptr + off2, step,
-                    dst_y + picture->y_stride, width, rg);
+      if (use_dsp) {
+        if (is_rgb) {
+          WebPConvertRGB24ToY(r_ptr + off1, dst_y, width);
+          WebPConvertRGB24ToY(r_ptr + off2, dst_y + picture->y_stride, width);
+        } else {
+          WebPConvertBGR24ToY(b_ptr + off1, dst_y, width);
+          WebPConvertBGR24ToY(b_ptr + off2, dst_y + picture->y_stride, width);
+        }
+      } else {
+        ConvertRowToY(r_ptr + off1, g_ptr + off1, b_ptr + off1, step,
+                      dst_y, width, rg);
+        ConvertRowToY(r_ptr + off2, g_ptr + off2, b_ptr + off2, step,
+                      dst_y + picture->y_stride, width, rg);
+      }
       dst_y += 2 * picture->y_stride;
       if (has_alpha) {
         rows_have_alpha &= !WebPExtractAlpha(a_ptr + off1, rgb_stride,
@@ -909,13 +923,19 @@ static int ImportYUVAFromRGBA(const uint8_t* const r_ptr,
                                              dst_a, picture->a_stride);
         dst_a += 2 * picture->a_stride;
       }
+      // Collect averaged R/G/B(/A)
       if (!rows_have_alpha) {
-        ConvertRowsToUV(r_ptr + off1, g_ptr + off1, b_ptr + off1,
-                        step, rgb_stride, dst_u, dst_v, width, rg);
+        AccumulateRGB(r_ptr + off1, g_ptr + off1, b_ptr + off1,
+                      step, rgb_stride, tmp_rgb, width);
+      } else {
+        AccumulateRGBA(r_ptr + off1, g_ptr + off1, b_ptr + off1, a_ptr + off1,
+                       rgb_stride, tmp_rgb, width);
+      }
+      // Convert to U/V
+      if (rg == NULL) {
+        WebPConvertRGBA32ToUV(tmp_rgb, dst_u, dst_v, uv_width);
       } else {
-        ConvertRowsToUVWithAlpha(r_ptr + off1, g_ptr + off1, b_ptr + off1,
-                                 a_ptr + off1, rgb_stride,
-                                 dst_u, dst_v, width, rg);
+        ConvertRowsToUV(tmp_rgb, dst_u, dst_v, uv_width, rg);
       }
       dst_u += picture->uv_stride;
       dst_v += picture->uv_stride;
@@ -923,20 +943,35 @@ static int ImportYUVAFromRGBA(const uint8_t* const r_ptr,
     if (height & 1) {    // extra last row
       const int off = 2 * y * rgb_stride;
       int row_has_alpha = has_alpha;
-      ConvertRowToY(r_ptr + off, g_ptr + off, b_ptr + off, step,
-                    dst_y, width, rg);
+      if (use_dsp) {  // 3-byte pixels without dithering: use the row converters
+        if (is_rgb) {  // same cached RGB-vs-BGR test as the two-row loop
+          WebPConvertRGB24ToY(r_ptr + off, dst_y, width);
+        } else {
+          WebPConvertBGR24ToY(b_ptr + off, dst_y, width);
+        }
+      } else {
+        ConvertRowToY(r_ptr + off, g_ptr + off, b_ptr + off, step,
+                      dst_y, width, rg);
+      }
       if (row_has_alpha) {
         row_has_alpha &= !WebPExtractAlpha(a_ptr + off, 0, width, 1, dst_a, 0);
       }
+      // Collect averaged R/G/B(/A)
       if (!row_has_alpha) {
-        ConvertRowsToUV(r_ptr + off, g_ptr + off, b_ptr + off,
-                        step, 0, dst_u, dst_v, width, rg);
+        // Collect averaged R/G/B
+        AccumulateRGB(r_ptr + off, g_ptr + off, b_ptr + off,
+                      step, /* rgb_stride = */ 0, tmp_rgb, width);
+      } else {
+        AccumulateRGBA(r_ptr + off, g_ptr + off, b_ptr + off, a_ptr + off,
+                       /* rgb_stride = */ 0, tmp_rgb, width);
+      }
+      if (rg == NULL) {
+        WebPConvertRGBA32ToUV(tmp_rgb, dst_u, dst_v, uv_width);
       } else {
-        ConvertRowsToUVWithAlpha(r_ptr + off, g_ptr + off, b_ptr + off,
-                                 a_ptr + off, 0,
-                                 dst_u, dst_v, width, rg);
+        ConvertRowsToUV(tmp_rgb, dst_u, dst_v, uv_width, rg);
       }
     }
+    WebPSafeFree(tmp_rgb);
   }
   return 1;
 }
@@ -978,11 +1013,9 @@ int WebPPictureARGBToYUVA(WebPPicture* picture, WebPEncCSP colorspace) {
   return PictureARGBToYUVA(picture, colorspace, 0.f, 0);
 }
 
-#if WEBP_ENCODER_ABI_VERSION > 0x0204
 int WebPPictureSmartARGBToYUVA(WebPPicture* picture) {
   return PictureARGBToYUVA(picture, WEBP_YUV420, 0.f, 1);
 }
-#endif
 
 //------------------------------------------------------------------------------
 // call for YUVA -> ARGB conversion
@@ -1066,14 +1099,23 @@ static int Import(WebPPicture* const picture,
   }
   if (!WebPPictureAlloc(picture)) return 0;
 
-  assert(step >= (import_alpha ? 4 : 3));
-  for (y = 0; y < height; ++y) {
-    uint32_t* const dst = &picture->argb[y * picture->argb_stride];
-    int x;
-    for (x = 0; x < width; ++x) {
-      const int offset = step * x + y * rgb_stride;
-      dst[x] = MakeARGB32(import_alpha ? a_ptr[offset] : 0xff,
-                          r_ptr[offset], g_ptr[offset], b_ptr[offset]);
+  VP8EncDspARGBInit();
+
+  if (import_alpha) {
+    assert(step == 4);
+    for (y = 0; y < height; ++y) {
+      uint32_t* const dst = &picture->argb[y * picture->argb_stride];
+      const int offset = y * rgb_stride;
+      VP8PackARGB(a_ptr + offset, r_ptr + offset, g_ptr + offset,
+                  b_ptr + offset, width, dst);
+    }
+  } else {
+    assert(step >= 3);
+    for (y = 0; y < height; ++y) {
+      uint32_t* const dst = &picture->argb[y * picture->argb_stride];
+      const int offset = y * rgb_stride;
+      VP8PackRGB(r_ptr + offset, g_ptr + offset, b_ptr + offset,
+                 width, step, dst);
     }
   }
   return 1;
diff --git a/src/3rdparty/libwebp/src/enc/picture_psnr.c b/src/3rdparty/libwebp/src/enc/picture_psnr.c
index 2254b7e..40214ef 100644
--- a/src/3rdparty/libwebp/src/enc/picture_psnr.c
+++ b/src/3rdparty/libwebp/src/enc/picture_psnr.c
@@ -12,8 +12,10 @@
 // Author: Skal (pascal.massimino@gmail.com)
 
 #include <math.h>
+#include <stdlib.h>
 
 #include "./vp8enci.h"
+#include "../utils/utils.h"
 
 //------------------------------------------------------------------------------
 // local-min distortion
@@ -23,9 +25,9 @@
 
 #define RADIUS 2  // search radius. Shouldn't be too large.
 
-static float AccumulateLSIM(const uint8_t* src, int src_stride,
-                            const uint8_t* ref, int ref_stride,
-                            int w, int h) {
+static void AccumulateLSIM(const uint8_t* src, int src_stride,
+                           const uint8_t* ref, int ref_stride,
+                           int w, int h, DistoStats* stats) {
   int x, y;
   double total_sse = 0.;
   for (y = 0; y < h; ++y) {
@@ -38,16 +40,22 @@ static float AccumulateLSIM(const uint8_t* src, int src_stride,
       const double value = (double)ref[y * ref_stride + x];
       int i, j;
       for (j = y_0; j < y_1; ++j) {
-        const uint8_t* s = src + j * src_stride;
+        const uint8_t* const s = src + j * src_stride;
         for (i = x_0; i < x_1; ++i) {
-          const double sse = (double)(s[i] - value) * (s[i] - value);
+          const double diff = s[i] - value;
+          const double sse = diff * diff;
           if (sse < best_sse) best_sse = sse;
         }
       }
       total_sse += best_sse;
     }
   }
-  return (float)total_sse;
+  stats->w = w * h;
+  stats->xm = 0;
+  stats->ym = 0;
+  stats->xxm = total_sse;  // summed best-match squared error is reported here
+  stats->yym = 0;
+  stats->xym = 0;  // cross term; must not overwrite xxm
 }
 #undef RADIUS
 
@@ -64,73 +72,90 @@ static float GetPSNR(const double v) {
 int WebPPictureDistortion(const WebPPicture* src, const WebPPicture* ref,
                           int type, float result[5]) {
   DistoStats stats[5];
-  int has_alpha;
-  int uv_w, uv_h;
+  int w, h;
+
+  memset(stats, 0, sizeof(stats));
 
   if (src == NULL || ref == NULL ||
       src->width != ref->width || src->height != ref->height ||
-      src->y == NULL || ref->y == NULL ||
-      src->u == NULL || ref->u == NULL ||
-      src->v == NULL || ref->v == NULL ||
-      result == NULL) {
-    return 0;
-  }
-  // TODO(skal): provide distortion for ARGB too.
-  if (src->use_argb == 1 || src->use_argb != ref->use_argb) {
-    return 0;
-  }
-
-  has_alpha = !!(src->colorspace & WEBP_CSP_ALPHA_BIT);
-  if (has_alpha != !!(ref->colorspace & WEBP_CSP_ALPHA_BIT) ||
-      (has_alpha && (src->a == NULL || ref->a == NULL))) {
+      src->use_argb != ref->use_argb || result == NULL) {
     return 0;
   }
+  w = src->width;
+  h = src->height;
 
-  memset(stats, 0, sizeof(stats));
+  if (src->use_argb == 1) {
+    if (src->argb == NULL || ref->argb == NULL) {
+      return 0;
+    } else {
+      int i, j, c;
+      uint8_t* tmp1, *tmp2;
+      uint8_t* const tmp_plane =
+          (uint8_t*)WebPSafeMalloc(2ULL * w * h, sizeof(*tmp_plane));
+      if (tmp_plane == NULL) return 0;
+      tmp1 = tmp_plane;
+      tmp2 = tmp_plane + w * h;
+      for (c = 0; c < 4; ++c) {
+        for (j = 0; j < h; ++j) {
+          for (i = 0; i < w; ++i) {
+            tmp1[j * w + i] = src->argb[i + j * src->argb_stride] >> (c * 8);
+            tmp2[j * w + i] = ref->argb[i + j * ref->argb_stride] >> (c * 8);
+          }
+        }
+        if (type >= 2) {
+          AccumulateLSIM(tmp1, w, tmp2, w, w, h, &stats[c]);
+        } else {
+          VP8SSIMAccumulatePlane(tmp1, w, tmp2, w, w, h, &stats[c]);
+        }
+      }
+      WebPSafeFree(tmp_plane);  // pairs with the WebPSafeMalloc() above
+    }
+  } else {
+    int has_alpha, uv_w, uv_h;
+    if (src->y == NULL || ref->y == NULL ||
+        src->u == NULL || ref->u == NULL ||
+        src->v == NULL || ref->v == NULL) {
+      return 0;
+    }
+    has_alpha = !!(src->colorspace & WEBP_CSP_ALPHA_BIT);
+    if (has_alpha != !!(ref->colorspace & WEBP_CSP_ALPHA_BIT) ||
+        (has_alpha && (src->a == NULL || ref->a == NULL))) {
+      return 0;
+    }
 
-  uv_w = (src->width + 1) >> 1;
-  uv_h = (src->height + 1) >> 1;
-  if (type >= 2) {
-    float sse[4];
-    sse[0] = AccumulateLSIM(src->y, src->y_stride,
-                            ref->y, ref->y_stride, src->width, src->height);
-    sse[1] = AccumulateLSIM(src->u, src->uv_stride,
-                            ref->u, ref->uv_stride, uv_w, uv_h);
-    sse[2] = AccumulateLSIM(src->v, src->uv_stride,
-                            ref->v, ref->uv_stride, uv_w, uv_h);
-    sse[3] = has_alpha ? AccumulateLSIM(src->a, src->a_stride,
-                                        ref->a, ref->a_stride,
-                                        src->width, src->height)
-                       : 0.f;
-    result[0] = GetPSNR(sse[0] / (src->width * src->height));
-    result[1] = GetPSNR(sse[1] / (uv_w * uv_h));
-    result[2] = GetPSNR(sse[2] / (uv_w * uv_h));
-    result[3] = GetPSNR(sse[3] / (src->width * src->height));
-    {
-      double total_sse = sse[0] + sse[1] + sse[2];
-      int total_pixels = src->width * src->height + 2 * uv_w * uv_h;
+    uv_w = (src->width + 1) >> 1;
+    uv_h = (src->height + 1) >> 1;
+    if (type >= 2) {
+      AccumulateLSIM(src->y, src->y_stride, ref->y, ref->y_stride,
+                     w, h, &stats[0]);
+      AccumulateLSIM(src->u, src->uv_stride, ref->u, ref->uv_stride,
+                     uv_w, uv_h, &stats[1]);
+      AccumulateLSIM(src->v, src->uv_stride, ref->v, ref->uv_stride,
+                     uv_w, uv_h, &stats[2]);
       if (has_alpha) {
-        total_pixels += src->width * src->height;
-        total_sse += sse[3];
+        AccumulateLSIM(src->a, src->a_stride, ref->a, ref->a_stride,
+                       w, h, &stats[3]);
+      }
+    } else {
+      VP8SSIMAccumulatePlane(src->y, src->y_stride,
+                             ref->y, ref->y_stride,
+                             w, h, &stats[0]);
+      VP8SSIMAccumulatePlane(src->u, src->uv_stride,
+                             ref->u, ref->uv_stride,
+                             uv_w, uv_h, &stats[1]);
+      VP8SSIMAccumulatePlane(src->v, src->uv_stride,
+                             ref->v, ref->uv_stride,
+                             uv_w, uv_h, &stats[2]);
+      if (has_alpha) {
+        VP8SSIMAccumulatePlane(src->a, src->a_stride,
+                               ref->a, ref->a_stride,
+                               w, h, &stats[3]);
       }
-      result[4] = GetPSNR(total_sse / total_pixels);
     }
-  } else {
+  }
+  // Final stat calculations.
+  {
     int c;
-    VP8SSIMAccumulatePlane(src->y, src->y_stride,
-                           ref->y, ref->y_stride,
-                           src->width, src->height, &stats[0]);
-    VP8SSIMAccumulatePlane(src->u, src->uv_stride,
-                           ref->u, ref->uv_stride,
-                           uv_w, uv_h, &stats[1]);
-    VP8SSIMAccumulatePlane(src->v, src->uv_stride,
-                           ref->v, ref->uv_stride,
-                           uv_w, uv_h, &stats[2]);
-    if (has_alpha) {
-      VP8SSIMAccumulatePlane(src->a, src->a_stride,
-                             ref->a, ref->a_stride,
-                             src->width, src->height, &stats[3]);
-    }
     for (c = 0; c <= 4; ++c) {
       if (type == 1) {
         const double v = VP8SSIMGet(&stats[c]);
diff --git a/src/3rdparty/libwebp/src/enc/picture_rescale.c b/src/3rdparty/libwebp/src/enc/picture_rescale.c
index 9e45551..9f19e8e 100644
--- a/src/3rdparty/libwebp/src/enc/picture_rescale.c
+++ b/src/3rdparty/libwebp/src/enc/picture_rescale.c
@@ -30,16 +30,6 @@ static void PictureGrabSpecs(const WebPPicture* const src,
 }
 
 //------------------------------------------------------------------------------
-// Picture copying
-
-static void CopyPlane(const uint8_t* src, int src_stride,
-                      uint8_t* dst, int dst_stride, int width, int height) {
-  while (height-- > 0) {
-    memcpy(dst, src, width);
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
 
 // Adjust top-left corner to chroma sample position.
 static void SnapTopLeftPosition(const WebPPicture* const pic,
@@ -70,20 +60,20 @@ int WebPPictureCopy(const WebPPicture* src, WebPPicture* dst) {
   if (!WebPPictureAlloc(dst)) return 0;
 
   if (!src->use_argb) {
-    CopyPlane(src->y, src->y_stride,
-              dst->y, dst->y_stride, dst->width, dst->height);
-    CopyPlane(src->u, src->uv_stride,
-              dst->u, dst->uv_stride, HALVE(dst->width), HALVE(dst->height));
-    CopyPlane(src->v, src->uv_stride,
-              dst->v, dst->uv_stride, HALVE(dst->width), HALVE(dst->height));
+    WebPCopyPlane(src->y, src->y_stride,
+                  dst->y, dst->y_stride, dst->width, dst->height);
+    WebPCopyPlane(src->u, src->uv_stride, dst->u, dst->uv_stride,
+                  HALVE(dst->width), HALVE(dst->height));
+    WebPCopyPlane(src->v, src->uv_stride, dst->v, dst->uv_stride,
+                  HALVE(dst->width), HALVE(dst->height));
     if (dst->a != NULL)  {
-      CopyPlane(src->a, src->a_stride,
-                dst->a, dst->a_stride, dst->width, dst->height);
+      WebPCopyPlane(src->a, src->a_stride,
+                    dst->a, dst->a_stride, dst->width, dst->height);
     }
   } else {
-    CopyPlane((const uint8_t*)src->argb, 4 * src->argb_stride,
-              (uint8_t*)dst->argb, 4 * dst->argb_stride,
-              4 * dst->width, dst->height);
+    WebPCopyPlane((const uint8_t*)src->argb, 4 * src->argb_stride,
+                  (uint8_t*)dst->argb, 4 * dst->argb_stride,
+                  4 * dst->width, dst->height);
   }
   return 1;
 }
@@ -144,24 +134,23 @@ int WebPPictureCrop(WebPPicture* pic,
   if (!pic->use_argb) {
     const int y_offset = top * pic->y_stride + left;
     const int uv_offset = (top / 2) * pic->uv_stride + left / 2;
-    CopyPlane(pic->y + y_offset, pic->y_stride,
-              tmp.y, tmp.y_stride, width, height);
-    CopyPlane(pic->u + uv_offset, pic->uv_stride,
-              tmp.u, tmp.uv_stride, HALVE(width), HALVE(height));
-    CopyPlane(pic->v + uv_offset, pic->uv_stride,
-              tmp.v, tmp.uv_stride, HALVE(width), HALVE(height));
+    WebPCopyPlane(pic->y + y_offset, pic->y_stride,
+                  tmp.y, tmp.y_stride, width, height);
+    WebPCopyPlane(pic->u + uv_offset, pic->uv_stride,
+                  tmp.u, tmp.uv_stride, HALVE(width), HALVE(height));
+    WebPCopyPlane(pic->v + uv_offset, pic->uv_stride,
+                  tmp.v, tmp.uv_stride, HALVE(width), HALVE(height));
 
     if (tmp.a != NULL) {
       const int a_offset = top * pic->a_stride + left;
-      CopyPlane(pic->a + a_offset, pic->a_stride,
-                tmp.a, tmp.a_stride, width, height);
+      WebPCopyPlane(pic->a + a_offset, pic->a_stride,
+                    tmp.a, tmp.a_stride, width, height);
     }
   } else {
     const uint8_t* const src =
         (const uint8_t*)(pic->argb + top * pic->argb_stride + left);
-    CopyPlane(src, pic->argb_stride * 4,
-              (uint8_t*)tmp.argb, tmp.argb_stride * 4,
-              width * 4, height);
+    WebPCopyPlane(src, pic->argb_stride * 4, (uint8_t*)tmp.argb,
+                  tmp.argb_stride * 4, width * 4, height);
   }
   WebPPictureFree(pic);
   *pic = tmp;
@@ -210,16 +199,10 @@ int WebPPictureRescale(WebPPicture* pic, int width, int height) {
   if (pic == NULL) return 0;
   prev_width = pic->width;
   prev_height = pic->height;
-  // if width is unspecified, scale original proportionally to height ratio.
-  if (width == 0) {
-    width = (prev_width * height + prev_height / 2) / prev_height;
+  if (!WebPRescalerGetScaledDimensions(
+          prev_width, prev_height, &width, &height)) {
+    return 0;
   }
-  // if height is unspecified, scale original proportionally to width ratio.
-  if (height == 0) {
-    height = (prev_height * width + prev_width / 2) / prev_width;
-  }
-  // Check if the overall dimensions still make sense.
-  if (width <= 0 || height <= 0) return 0;
 
   PictureGrabSpecs(pic, &tmp);
   tmp.width = width;
diff --git a/src/3rdparty/libwebp/src/enc/picture_tools.c b/src/3rdparty/libwebp/src/enc/picture_tools.c
index 7c73646..bf97af8 100644
--- a/src/3rdparty/libwebp/src/enc/picture_tools.c
+++ b/src/3rdparty/libwebp/src/enc/picture_tools.c
@@ -11,6 +11,8 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
+#include <assert.h>
+
 #include "./vp8enci.h"
 #include "../dsp/yuv.h"
 
@@ -120,6 +122,24 @@ void WebPCleanupTransparentArea(WebPPicture* pic) {
 #undef SIZE
 #undef SIZE2
 
+void WebPCleanupTransparentAreaLossless(WebPPicture* const pic) {
+  int x, y, w, h;
+  uint32_t* argb;
+  assert(pic != NULL && pic->use_argb);  // precondition is checked by assert only
+  w = pic->width;
+  h = pic->height;
+  argb = pic->argb;  // start of the current row
+
+  for (y = 0; y < h; ++y) {
+    for (x = 0; x < w; ++x) {
+      if ((argb[x] & 0xff000000) == 0) {  // top byte is zero (presumably alpha == 0 - confirm)
+        argb[x] = 0x00000000;  // clear the remaining 24 bits as well
+      }
+    }
+    argb += pic->argb_stride;  // next row; stride counts uint32_t pixels
+  }
+}
+
 //------------------------------------------------------------------------------
 // Blend color and remove transparency info
 
diff --git a/src/3rdparty/libwebp/src/enc/quant.c b/src/3rdparty/libwebp/src/enc/quant.c
index 9130a41..dd6885a 100644
--- a/src/3rdparty/libwebp/src/enc/quant.c
+++ b/src/3rdparty/libwebp/src/enc/quant.c
@@ -30,7 +30,7 @@
 #define SNS_TO_DQ 0.9     // Scaling constant between the sns value and the QP
                           // power-law modulation. Must be strictly less than 1.
 
-#define I4_PENALTY 4000   // Rate-penalty for quick i4/i16 decision
+#define I4_PENALTY 14000  // Rate-penalty for quick i4/i16 decision
 
 // number of non-zero coeffs below which we consider the block very flat
 // (and apply a penalty to complex predictions)
@@ -41,6 +41,8 @@
 
 #define MULT_8B(a, b) (((a) * (b) + 128) >> 8)
 
+#define RD_DISTO_MULT      256  // distortion multiplier (equivalent of lambda)
+
 // #define DEBUG_BLOCK
 
 //------------------------------------------------------------------------------
@@ -54,15 +56,37 @@ static void PrintBlockInfo(const VP8EncIterator* const it,
                            const VP8ModeScore* const rd) {
   int i, j;
   const int is_i16 = (it->mb_->type_ == 1);
+  const uint8_t* const y_in = it->yuv_in_ + Y_OFF_ENC;
+  const uint8_t* const y_out = it->yuv_out_ + Y_OFF_ENC;
+  const uint8_t* const uv_in = it->yuv_in_ + U_OFF_ENC;
+  const uint8_t* const uv_out = it->yuv_out_ + U_OFF_ENC;
   printf("SOURCE / OUTPUT / ABS DELTA\n");
-  for (j = 0; j < 24; ++j) {
-    if (j == 16) printf("\n");   // newline before the U/V block
-    for (i = 0; i < 16; ++i) printf("%3d ", it->yuv_in_[i + j * BPS]);
+  for (j = 0; j < 16; ++j) {
+    for (i = 0; i < 16; ++i) printf("%3d ", y_in[i + j * BPS]);
     printf("     ");
-    for (i = 0; i < 16; ++i) printf("%3d ", it->yuv_out_[i + j * BPS]);
+    for (i = 0; i < 16; ++i) printf("%3d ", y_out[i + j * BPS]);
     printf("     ");
     for (i = 0; i < 16; ++i) {
-      printf("%1d ", abs(it->yuv_out_[i + j * BPS] - it->yuv_in_[i + j * BPS]));
+      printf("%1d ", abs(y_in[i + j * BPS] - y_out[i + j * BPS]));
+    }
+    printf("\n");
+  }
+  printf("\n");   // newline before the U/V block
+  for (j = 0; j < 8; ++j) {
+    for (i = 0; i < 8; ++i) printf("%3d ", uv_in[i + j * BPS]);
+    printf(" ");
+    for (i = 8; i < 16; ++i) printf("%3d ", uv_in[i + j * BPS]);
+    printf("    ");
+    for (i = 0; i < 8; ++i) printf("%3d ", uv_out[i + j * BPS]);
+    printf(" ");
+    for (i = 8; i < 16; ++i) printf("%3d ", uv_out[i + j * BPS]);
+    printf("   ");
+    for (i = 0; i < 8; ++i) {
+      printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
+    }
+    printf(" ");
+    for (i = 8; i < 16; ++i) {
+      printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
     }
     printf("\n");
   }
@@ -444,15 +468,12 @@ void VP8MakeIntra4Preds(const VP8EncIterator* const it) {
 // Quantize
 
 // Layout:
-// +----+
-// |YYYY| 0
-// |YYYY| 4
-// |YYYY| 8
-// |YYYY| 12
-// +----+
-// |UUVV| 16
-// |UUVV| 20
-// +----+
+// +----+----+
+// |YYYY|UUVV| 0
+// |YYYY|UUVV| 4
+// |YYYY|....| 8
+// |YYYY|....| 12
+// +----+----+
 
 const int VP8Scan[16] = {  // Luma
   0 +  0 * BPS,  4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS,
@@ -538,13 +559,12 @@ typedef struct {
 #define SCORE_STATE(n, l) (score_states[n][(l) + MIN_DELTA])
 
 static WEBP_INLINE void SetRDScore(int lambda, VP8ModeScore* const rd) {
-  // TODO: incorporate the "* 256" in the tables?
-  rd->score = (rd->R + rd->H) * lambda + 256 * (rd->D + rd->SD);
+  rd->score = (rd->R + rd->H) * lambda + RD_DISTO_MULT * (rd->D + rd->SD);
 }
 
 static WEBP_INLINE score_t RDScoreTrellis(int lambda, score_t rate,
                                           score_t distortion) {
-  return rate * lambda + 256 * distortion;
+  return rate * lambda + RD_DISTO_MULT * distortion;
 }
 
 static int TrellisQuantizeBlock(const VP8Encoder* const enc,
@@ -553,7 +573,8 @@ static int TrellisQuantizeBlock(const VP8Encoder* const enc,
                                 const VP8Matrix* const mtx,
                                 int lambda) {
   const ProbaArray* const probas = enc->proba_.coeffs_[coeff_type];
-  const CostArray* const costs = enc->proba_.level_cost_[coeff_type];
+  CostArrayPtr const costs =
+      (CostArrayPtr)enc->proba_.remapped_costs_[coeff_type];
   const int first = (coeff_type == 0) ? 1 : 0;
   Node nodes[16][NUM_NODES];
   ScoreState score_states[2][NUM_NODES];
@@ -590,7 +611,7 @@ static int TrellisQuantizeBlock(const VP8Encoder* const enc,
     for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
       const score_t rate = (ctx0 == 0) ? VP8BitCost(1, last_proba) : 0;
       ss_cur[m].score = RDScoreTrellis(lambda, rate, 0);
-      ss_cur[m].costs = costs[VP8EncBands[first]][ctx0];
+      ss_cur[m].costs = costs[first][ctx0];
     }
   }
 
@@ -624,7 +645,7 @@ static int TrellisQuantizeBlock(const VP8Encoder* const enc,
       int best_prev = 0;   // default, in case
 
       ss_cur[m].score = MAX_COST;
-      ss_cur[m].costs = costs[band][ctx];
+      ss_cur[m].costs = costs[n + 1][ctx];
       if (level > MAX_LEVEL || level < 0) {   // node is dead?
         continue;
       }
@@ -719,14 +740,14 @@ static int ReconstructIntra16(VP8EncIterator* const it,
                               int mode) {
   const VP8Encoder* const enc = it->enc_;
   const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
-  const uint8_t* const src = it->yuv_in_ + Y_OFF;
+  const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
   const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
   int nz = 0;
   int n;
   int16_t tmp[16][16], dc_tmp[16];
 
-  for (n = 0; n < 16; ++n) {
-    VP8FTransform(src + VP8Scan[n], ref + VP8Scan[n], tmp[n]);
+  for (n = 0; n < 16; n += 2) {
+    VP8FTransform2(src + VP8Scan[n], ref + VP8Scan[n], tmp[n]);
   }
   VP8FTransformWHT(tmp[0], dc_tmp);
   nz |= VP8EncQuantizeBlockWHT(dc_tmp, rd->y_dc_levels, &dqm->y2_) << 24;
@@ -746,12 +767,13 @@ static int ReconstructIntra16(VP8EncIterator* const it,
       }
     }
   } else {
-    for (n = 0; n < 16; ++n) {
+    for (n = 0; n < 16; n += 2) {
       // Zero-out the first coeff, so that: a) nz is correct below, and
       // b) finding 'last' non-zero coeffs in SetResidualCoeffs() is simplified.
-      tmp[n][0] = 0;
-      nz |= VP8EncQuantizeBlock(tmp[n], rd->y_ac_levels[n], &dqm->y1_) << n;
-      assert(rd->y_ac_levels[n][0] == 0);
+      tmp[n][0] = tmp[n + 1][0] = 0;
+      nz |= VP8EncQuantize2Blocks(tmp[n], rd->y_ac_levels[n], &dqm->y1_) << n;
+      assert(rd->y_ac_levels[n + 0][0] == 0);
+      assert(rd->y_ac_levels[n + 1][0] == 0);
     }
   }
 
@@ -792,14 +814,14 @@ static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
                          uint8_t* const yuv_out, int mode) {
   const VP8Encoder* const enc = it->enc_;
   const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
-  const uint8_t* const src = it->yuv_in_ + U_OFF;
+  const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
   const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
   int nz = 0;
   int n;
   int16_t tmp[8][16];
 
-  for (n = 0; n < 8; ++n) {
-    VP8FTransform(src + VP8ScanUV[n], ref + VP8ScanUV[n], tmp[n]);
+  for (n = 0; n < 8; n += 2) {
+    VP8FTransform2(src + VP8ScanUV[n], ref + VP8ScanUV[n], tmp[n]);
   }
   if (DO_TRELLIS_UV && it->do_trellis_) {
     int ch, x, y;
@@ -816,8 +838,8 @@ static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
       }
     }
   } else {
-    for (n = 0; n < 8; ++n) {
-      nz |= VP8EncQuantizeBlock(tmp[n], rd->uv_levels[n], &dqm->uv_) << n;
+    for (n = 0; n < 8; n += 2) {
+      nz |= VP8EncQuantize2Blocks(tmp[n], rd->uv_levels[n], &dqm->uv_) << n;
     }
   }
 
@@ -842,6 +864,12 @@ static void StoreMaxDelta(VP8SegmentInfo* const dqm, const int16_t DCs[16]) {
   if (max_v > dqm->max_edge_) dqm->max_edge_ = max_v;
 }
 
+static void SwapModeScore(VP8ModeScore** a, VP8ModeScore** b) {
+  VP8ModeScore* const tmp = *a;
+  *a = *b;
+  *b = tmp;
+}
+
 static void SwapPtr(uint8_t** a, uint8_t** b) {
   uint8_t* const tmp = *a;
   *a = *b;
@@ -865,46 +893,47 @@ static score_t IsFlat(const int16_t* levels, int num_blocks, score_t thresh) {
   return 1;
 }
 
-static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* const rd) {
+static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* rd) {
   const int kNumBlocks = 16;
   VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
   const int lambda = dqm->lambda_i16_;
   const int tlambda = dqm->tlambda_;
-  const uint8_t* const src = it->yuv_in_ + Y_OFF;
-  VP8ModeScore rd16;
+  const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
+  VP8ModeScore rd_tmp;
+  VP8ModeScore* rd_cur = &rd_tmp;
+  VP8ModeScore* rd_best = rd;
   int mode;
 
   rd->mode_i16 = -1;
   for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
-    uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF;  // scratch buffer
-    int nz;
+    uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC;  // scratch buffer
+    rd_cur->mode_i16 = mode;
 
     // Reconstruct
-    nz = ReconstructIntra16(it, &rd16, tmp_dst, mode);
+    rd_cur->nz = ReconstructIntra16(it, rd_cur, tmp_dst, mode);
 
     // Measure RD-score
-    rd16.D = VP8SSE16x16(src, tmp_dst);
-    rd16.SD = tlambda ? MULT_8B(tlambda, VP8TDisto16x16(src, tmp_dst, kWeightY))
-            : 0;
-    rd16.H = VP8FixedCostsI16[mode];
-    rd16.R = VP8GetCostLuma16(it, &rd16);
+    rd_cur->D = VP8SSE16x16(src, tmp_dst);
+    rd_cur->SD =
+        tlambda ? MULT_8B(tlambda, VP8TDisto16x16(src, tmp_dst, kWeightY)) : 0;
+    rd_cur->H = VP8FixedCostsI16[mode];
+    rd_cur->R = VP8GetCostLuma16(it, rd_cur);
     if (mode > 0 &&
-        IsFlat(rd16.y_ac_levels[0], kNumBlocks, FLATNESS_LIMIT_I16)) {
+        IsFlat(rd_cur->y_ac_levels[0], kNumBlocks, FLATNESS_LIMIT_I16)) {
       // penalty to avoid flat area to be mispredicted by complex mode
-      rd16.R += FLATNESS_PENALTY * kNumBlocks;
+      rd_cur->R += FLATNESS_PENALTY * kNumBlocks;
     }
 
     // Since we always examine Intra16 first, we can overwrite *rd directly.
-    SetRDScore(lambda, &rd16);
-    if (mode == 0 || rd16.score < rd->score) {
-      CopyScore(rd, &rd16);
-      rd->mode_i16 = mode;
-      rd->nz = nz;
-      memcpy(rd->y_ac_levels, rd16.y_ac_levels, sizeof(rd16.y_ac_levels));
-      memcpy(rd->y_dc_levels, rd16.y_dc_levels, sizeof(rd16.y_dc_levels));
+    SetRDScore(lambda, rd_cur);
+    if (mode == 0 || rd_cur->score < rd_best->score) {
+      SwapModeScore(&rd_cur, &rd_best);
       SwapOut(it);
     }
   }
+  if (rd_best != rd) {
+    memcpy(rd, rd_best, sizeof(*rd));
+  }
   SetRDScore(dqm->lambda_mode_, rd);   // finalize score for mode decision.
   VP8SetIntra16Mode(it, rd->mode_i16);
 
@@ -933,8 +962,8 @@ static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
   const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
   const int lambda = dqm->lambda_i4_;
   const int tlambda = dqm->tlambda_;
-  const uint8_t* const src0 = it->yuv_in_ + Y_OFF;
-  uint8_t* const best_blocks = it->yuv_out2_ + Y_OFF;
+  const uint8_t* const src0 = it->yuv_in_ + Y_OFF_ENC;
+  uint8_t* const best_blocks = it->yuv_out2_ + Y_OFF_ENC;
   int total_header_bits = 0;
   VP8ModeScore rd_best;
 
@@ -972,17 +1001,28 @@ static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
           tlambda ? MULT_8B(tlambda, VP8TDisto4x4(src, tmp_dst, kWeightY))
                   : 0;
       rd_tmp.H = mode_costs[mode];
-      rd_tmp.R = VP8GetCostLuma4(it, tmp_levels);
+
+      // Add flatness penalty
       if (mode > 0 && IsFlat(tmp_levels, kNumBlocks, FLATNESS_LIMIT_I4)) {
-        rd_tmp.R += FLATNESS_PENALTY * kNumBlocks;
+        rd_tmp.R = FLATNESS_PENALTY * kNumBlocks;
+      } else {
+        rd_tmp.R = 0;
       }
 
+      // early-out check
       SetRDScore(lambda, &rd_tmp);
+      if (best_mode >= 0 && rd_tmp.score >= rd_i4.score) continue;
+
+      // finish computing score
+      rd_tmp.R += VP8GetCostLuma4(it, tmp_levels);
+      SetRDScore(lambda, &rd_tmp);
+
       if (best_mode < 0 || rd_tmp.score < rd_i4.score) {
         CopyScore(&rd_i4, &rd_tmp);
         best_mode = mode;
         SwapPtr(&tmp_dst, &best_block);
-        memcpy(rd_best.y_ac_levels[it->i4_], tmp_levels, sizeof(tmp_levels));
+        memcpy(rd_best.y_ac_levels[it->i4_], tmp_levels,
+               sizeof(rd_best.y_ac_levels[it->i4_]));
       }
     }
     SetRDScore(dqm->lambda_mode_, &rd_i4);
@@ -1016,9 +1056,10 @@ static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
   const int kNumBlocks = 8;
   const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
   const int lambda = dqm->lambda_uv_;
-  const uint8_t* const src = it->yuv_in_ + U_OFF;
-  uint8_t* const tmp_dst = it->yuv_out2_ + U_OFF;  // scratch buffer
-  uint8_t* const dst0 = it->yuv_out_ + U_OFF;
+  const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
+  uint8_t* tmp_dst = it->yuv_out2_ + U_OFF_ENC;  // scratch buffer
+  uint8_t* dst0 = it->yuv_out_ + U_OFF_ENC;
+  uint8_t* dst = dst0;
   VP8ModeScore rd_best;
   int mode;
 
@@ -1032,7 +1073,7 @@ static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
 
     // Compute RD-score
     rd_uv.D  = VP8SSE16x8(src, tmp_dst);
-    rd_uv.SD = 0;    // TODO: should we call TDisto? it tends to flatten areas.
+    rd_uv.SD = 0;    // not calling TDisto here: it tends to flatten areas.
     rd_uv.H  = VP8FixedCostsUV[mode];
     rd_uv.R  = VP8GetCostUV(it, &rd_uv);
     if (mode > 0 && IsFlat(rd_uv.uv_levels[0], kNumBlocks, FLATNESS_LIMIT_UV)) {
@@ -1044,11 +1085,14 @@ static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
       CopyScore(&rd_best, &rd_uv);
       rd->mode_uv = mode;
       memcpy(rd->uv_levels, rd_uv.uv_levels, sizeof(rd->uv_levels));
-      memcpy(dst0, tmp_dst, UV_SIZE);   //  TODO: SwapUVOut() ?
+      SwapPtr(&dst, &tmp_dst);
     }
   }
   VP8SetIntraUVMode(it, rd->mode_uv);
   AddScore(rd, &rd_best);
+  if (dst != dst0) {   // copy 16x8 block if needed
+    VP8Copy16x8(dst, dst0);
+  }
 }
 
 //------------------------------------------------------------------------------
@@ -1060,35 +1104,41 @@ static void SimpleQuantize(VP8EncIterator* const it, VP8ModeScore* const rd) {
   int nz = 0;
 
   if (is_i16) {
-    nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF, it->preds_[0]);
+    nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF_ENC, it->preds_[0]);
   } else {
     VP8IteratorStartI4(it);
     do {
       const int mode =
           it->preds_[(it->i4_ & 3) + (it->i4_ >> 2) * enc->preds_w_];
-      const uint8_t* const src = it->yuv_in_ + Y_OFF + VP8Scan[it->i4_];
-      uint8_t* const dst = it->yuv_out_ + Y_OFF + VP8Scan[it->i4_];
+      const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
+      uint8_t* const dst = it->yuv_out_ + Y_OFF_ENC + VP8Scan[it->i4_];
       VP8MakeIntra4Preds(it);
       nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
                               src, dst, mode) << it->i4_;
-    } while (VP8IteratorRotateI4(it, it->yuv_out_ + Y_OFF));
+    } while (VP8IteratorRotateI4(it, it->yuv_out_ + Y_OFF_ENC));
   }
 
-  nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF, it->mb_->uv_mode_);
+  nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF_ENC, it->mb_->uv_mode_);
   rd->nz = nz;
 }
 
 // Refine intra16/intra4 sub-modes based on distortion only (not rate).
-static void DistoRefine(VP8EncIterator* const it, int try_both_i4_i16) {
-  const int is_i16 = (it->mb_->type_ == 1);
+static void RefineUsingDistortion(VP8EncIterator* const it,
+                                  int try_both_modes, int refine_uv_mode,
+                                  VP8ModeScore* const rd) {
   score_t best_score = MAX_COST;
+  score_t score_i4 = (score_t)I4_PENALTY;
+  int16_t tmp_levels[16][16];
+  uint8_t modes_i4[16];
+  int nz = 0;
+  int mode;
+  int is_i16 = try_both_modes || (it->mb_->type_ == 1);
 
-  if (try_both_i4_i16 || is_i16) {
-    int mode;
+  if (is_i16) {   // First, evaluate Intra16 distortion
     int best_mode = -1;
+    const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
     for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
       const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
-      const uint8_t* const src = it->yuv_in_ + Y_OFF;
       const score_t score = VP8SSE16x16(src, ref);
       if (score < best_score) {
         best_mode = mode;
@@ -1096,39 +1146,72 @@ static void DistoRefine(VP8EncIterator* const it, int try_both_i4_i16) {
       }
     }
     VP8SetIntra16Mode(it, best_mode);
+    // we'll reconstruct later, if i16 mode actually gets selected
   }
-  if (try_both_i4_i16 || !is_i16) {
-    uint8_t modes_i4[16];
+
+  // Next, evaluate Intra4
+  if (try_both_modes || !is_i16) {
     // We don't evaluate the rate here, but just account for it through a
     // constant penalty (i4 mode usually needs more bits compared to i16).
-    score_t score_i4 = (score_t)I4_PENALTY;
-
+    is_i16 = 0;
     VP8IteratorStartI4(it);
     do {
-      int mode;
-      int best_sub_mode = -1;
-      score_t best_sub_score = MAX_COST;
-      const uint8_t* const src = it->yuv_in_ + Y_OFF + VP8Scan[it->i4_];
+      int best_i4_mode = -1;
+      score_t best_i4_score = MAX_COST;
+      const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
 
-      // TODO(skal): we don't really need the prediction pixels here,
-      // but just the distortion against 'src'.
       VP8MakeIntra4Preds(it);
       for (mode = 0; mode < NUM_BMODES; ++mode) {
         const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
         const score_t score = VP8SSE4x4(src, ref);
-        if (score < best_sub_score) {
-          best_sub_mode = mode;
-          best_sub_score = score;
+        if (score < best_i4_score) {
+          best_i4_mode = mode;
+          best_i4_score = score;
         }
       }
-      modes_i4[it->i4_] = best_sub_mode;
-      score_i4 += best_sub_score;
-      if (score_i4 >= best_score) break;
-    } while (VP8IteratorRotateI4(it, it->yuv_in_ + Y_OFF));
-    if (score_i4 < best_score) {
-      VP8SetIntra4Mode(it, modes_i4);
+      modes_i4[it->i4_] = best_i4_mode;
+      score_i4 += best_i4_score;
+      if (score_i4 >= best_score) {
+        // Intra4 won't be better than Intra16. Bail out and pick Intra16.
+        is_i16 = 1;
+        break;
+      } else {  // reconstruct partial block inside yuv_out2_ buffer
+        uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC + VP8Scan[it->i4_];
+        nz |= ReconstructIntra4(it, tmp_levels[it->i4_],
+                                src, tmp_dst, best_i4_mode) << it->i4_;
+      }
+    } while (VP8IteratorRotateI4(it, it->yuv_out2_ + Y_OFF_ENC));
+  }
+
+  // Final reconstruction, depending on which mode is selected.
+  if (!is_i16) {
+    VP8SetIntra4Mode(it, modes_i4);
+    memcpy(rd->y_ac_levels, tmp_levels, sizeof(tmp_levels));
+    SwapOut(it);
+    best_score = score_i4;
+  } else {
+    nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF_ENC, it->preds_[0]);
+  }
+
+  // ... and UV!
+  if (refine_uv_mode) {
+    int best_mode = -1;
+    score_t best_uv_score = MAX_COST;
+    const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
+    for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
+      const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
+      const score_t score = VP8SSE16x8(src, ref);
+      if (score < best_uv_score) {
+        best_mode = mode;
+        best_uv_score = score;
+      }
     }
+    VP8SetIntraUVMode(it, best_mode);
   }
+  nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF_ENC, it->mb_->uv_mode_);
+
+  rd->nz = nz;
+  rd->score = best_score;
 }
 
 //------------------------------------------------------------------------------
@@ -1158,13 +1241,13 @@ int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd,
       SimpleQuantize(it, rd);
     }
   } else {
-    // For method == 2, pick the best intra4/intra16 based on SSE (~tad slower).
-    // For method <= 1, we refine intra4 or intra16 (but don't re-examine mode).
-    DistoRefine(it, (method >= 2));
-    SimpleQuantize(it, rd);
+    // At this point we have heuristically decided intra16 / intra4.
+    // For method >= 2, pick the best intra4/intra16 based on SSE (~tad slower).
+    // For method <= 1, we don't re-examine the decision but just go ahead with
+    // quantization/reconstruction.
+    RefineUsingDistortion(it, (method >= 2), (method >= 1), rd);
   }
   is_skipped = (rd->nz == 0);
   VP8SetSkip(it, is_skipped);
   return is_skipped;
 }
-
diff --git a/src/3rdparty/libwebp/src/enc/syntax.c b/src/3rdparty/libwebp/src/enc/syntax.c
index d1ff0a5..a0e79ef 100644
--- a/src/3rdparty/libwebp/src/enc/syntax.c
+++ b/src/3rdparty/libwebp/src/enc/syntax.c
@@ -186,8 +186,8 @@ static int PutWebPHeaders(const VP8Encoder* const enc, size_t size0,
 // Segmentation header
 static void PutSegmentHeader(VP8BitWriter* const bw,
                              const VP8Encoder* const enc) {
-  const VP8SegmentHeader* const hdr = &enc->segment_hdr_;
-  const VP8Proba* const proba = &enc->proba_;
+  const VP8EncSegmentHeader* const hdr = &enc->segment_hdr_;
+  const VP8EncProba* const proba = &enc->proba_;
   if (VP8PutBitUniform(bw, (hdr->num_segments_ > 1))) {
     // We always 'update' the quant and filter strength values
     const int update_data = 1;
@@ -197,16 +197,16 @@ static void PutSegmentHeader(VP8BitWriter* const bw,
       // we always use absolute values, not relative ones
       VP8PutBitUniform(bw, 1);   // (segment_feature_mode = 1. Paragraph 9.3.)
       for (s = 0; s < NUM_MB_SEGMENTS; ++s) {
-        VP8PutSignedValue(bw, enc->dqm_[s].quant_, 7);
+        VP8PutSignedBits(bw, enc->dqm_[s].quant_, 7);
       }
       for (s = 0; s < NUM_MB_SEGMENTS; ++s) {
-        VP8PutSignedValue(bw, enc->dqm_[s].fstrength_, 6);
+        VP8PutSignedBits(bw, enc->dqm_[s].fstrength_, 6);
       }
     }
     if (hdr->update_map_) {
       for (s = 0; s < 3; ++s) {
         if (VP8PutBitUniform(bw, (proba->segments_[s] != 255u))) {
-          VP8PutValue(bw, proba->segments_[s], 8);
+          VP8PutBits(bw, proba->segments_[s], 8);
         }
       }
     }
@@ -215,20 +215,20 @@ static void PutSegmentHeader(VP8BitWriter* const bw,
 
 // Filtering parameters header
 static void PutFilterHeader(VP8BitWriter* const bw,
-                            const VP8FilterHeader* const hdr) {
+                            const VP8EncFilterHeader* const hdr) {
   const int use_lf_delta = (hdr->i4x4_lf_delta_ != 0);
   VP8PutBitUniform(bw, hdr->simple_);
-  VP8PutValue(bw, hdr->level_, 6);
-  VP8PutValue(bw, hdr->sharpness_, 3);
+  VP8PutBits(bw, hdr->level_, 6);
+  VP8PutBits(bw, hdr->sharpness_, 3);
   if (VP8PutBitUniform(bw, use_lf_delta)) {
     // '0' is the default value for i4x4_lf_delta_ at frame #0.
     const int need_update = (hdr->i4x4_lf_delta_ != 0);
     if (VP8PutBitUniform(bw, need_update)) {
       // we don't use ref_lf_delta => emit four 0 bits
-      VP8PutValue(bw, 0, 4);
+      VP8PutBits(bw, 0, 4);
       // we use mode_lf_delta for i4x4
-      VP8PutSignedValue(bw, hdr->i4x4_lf_delta_, 6);
-      VP8PutValue(bw, 0, 3);    // all others unused
+      VP8PutSignedBits(bw, hdr->i4x4_lf_delta_, 6);
+      VP8PutBits(bw, 0, 3);    // all others unused
     }
   }
 }
@@ -236,12 +236,12 @@ static void PutFilterHeader(VP8BitWriter* const bw,
 // Nominal quantization parameters
 static void PutQuant(VP8BitWriter* const bw,
                      const VP8Encoder* const enc) {
-  VP8PutValue(bw, enc->base_quant_, 7);
-  VP8PutSignedValue(bw, enc->dq_y1_dc_, 4);
-  VP8PutSignedValue(bw, enc->dq_y2_dc_, 4);
-  VP8PutSignedValue(bw, enc->dq_y2_ac_, 4);
-  VP8PutSignedValue(bw, enc->dq_uv_dc_, 4);
-  VP8PutSignedValue(bw, enc->dq_uv_ac_, 4);
+  VP8PutBits(bw, enc->base_quant_, 7);
+  VP8PutSignedBits(bw, enc->dq_y1_dc_, 4);
+  VP8PutSignedBits(bw, enc->dq_y2_dc_, 4);
+  VP8PutSignedBits(bw, enc->dq_y2_ac_, 4);
+  VP8PutSignedBits(bw, enc->dq_uv_dc_, 4);
+  VP8PutSignedBits(bw, enc->dq_uv_ac_, 4);
 }
 
 // Partition sizes
@@ -277,9 +277,9 @@ static int GeneratePartition0(VP8Encoder* const enc) {
 
   PutSegmentHeader(bw, enc);
   PutFilterHeader(bw, &enc->filter_hdr_);
-  VP8PutValue(bw, enc->num_parts_ == 8 ? 3 :
-                  enc->num_parts_ == 4 ? 2 :
-                  enc->num_parts_ == 2 ? 1 : 0, 2);
+  VP8PutBits(bw, enc->num_parts_ == 8 ? 3 :
+                 enc->num_parts_ == 4 ? 2 :
+                 enc->num_parts_ == 2 ? 1 : 0, 2);
   PutQuant(bw, enc);
   VP8PutBitUniform(bw, 0);   // no proba update
   VP8WriteProbas(bw, &enc->proba_);
diff --git a/src/3rdparty/libwebp/src/enc/token.c b/src/3rdparty/libwebp/src/enc/token.c
index 8af13a0..e73256b 100644
--- a/src/3rdparty/libwebp/src/enc/token.c
+++ b/src/3rdparty/libwebp/src/enc/token.c
@@ -30,15 +30,15 @@
 #define MIN_PAGE_SIZE 8192          // minimum number of token per page
 #define FIXED_PROBA_BIT (1u << 14)
 
-typedef uint16_t token_t;  // bit#15: bit
-                           // bit #14: constant proba or idx
-                           // bits 0..13: slot or constant proba
+typedef uint16_t token_t;  // bit #15: bit value
+                           // bit #14: flags for constant proba or idx
+                           // bits #0..13: slot or constant proba
 struct VP8Tokens {
   VP8Tokens* next_;        // pointer to next page
 };
 // Token data is located in memory just after the next_ field.
 // This macro is used to return their address and hide the trick.
-#define TOKEN_DATA(p) ((token_t*)&(p)[1])
+#define TOKEN_DATA(p) ((const token_t*)&(p)[1])
 
 //------------------------------------------------------------------------------
 
@@ -53,10 +53,10 @@ void VP8TBufferInit(VP8TBuffer* const b, int page_size) {
 
 void VP8TBufferClear(VP8TBuffer* const b) {
   if (b != NULL) {
-    const VP8Tokens* p = b->pages_;
+    VP8Tokens* p = b->pages_;
     while (p != NULL) {
-      const VP8Tokens* const next = p->next_;
-      WebPSafeFree((void*)p);
+      VP8Tokens* const next = p->next_;
+      WebPSafeFree(p);
       p = next;
     }
     VP8TBufferInit(b, b->page_size_);
@@ -65,8 +65,8 @@ void VP8TBufferClear(VP8TBuffer* const b) {
 
 static int TBufferNewPage(VP8TBuffer* const b) {
   VP8Tokens* page = NULL;
-  const size_t size = sizeof(*page) + b->page_size_ * sizeof(token_t);
   if (!b->error_) {
+    const size_t size = sizeof(*page) + b->page_size_ * sizeof(token_t);
     page = (VP8Tokens*)WebPSafeMalloc(1ULL, size);
   }
   if (page == NULL) {
@@ -78,19 +78,19 @@ static int TBufferNewPage(VP8TBuffer* const b) {
   *b->last_page_ = page;
   b->last_page_ = &page->next_;
   b->left_ = b->page_size_;
-  b->tokens_ = TOKEN_DATA(page);
+  b->tokens_ = (token_t*)TOKEN_DATA(page);
   return 1;
 }
 
 //------------------------------------------------------------------------------
 
-#define TOKEN_ID(t, b, ctx, p) \
-    ((p) + NUM_PROBAS * ((ctx) + NUM_CTX * ((b) + NUM_BANDS * (t))))
+#define TOKEN_ID(t, b, ctx) \
+    (NUM_PROBAS * ((ctx) + NUM_CTX * ((b) + NUM_BANDS * (t))))
 
-static WEBP_INLINE int AddToken(VP8TBuffer* const b,
-                                int bit, uint32_t proba_idx) {
+static WEBP_INLINE uint32_t AddToken(VP8TBuffer* const b,
+                                     uint32_t bit, uint32_t proba_idx) {
   assert(proba_idx < FIXED_PROBA_BIT);
-  assert(bit == 0 || bit == 1);
+  assert(bit <= 1);
   if (b->left_ > 0 || TBufferNewPage(b)) {
     const int slot = --b->left_;
     b->tokens_[slot] = (bit << 15) | proba_idx;
@@ -99,20 +99,21 @@ static WEBP_INLINE int AddToken(VP8TBuffer* const b,
 }
 
 static WEBP_INLINE void AddConstantToken(VP8TBuffer* const b,
-                                         int bit, int proba) {
+                                         uint32_t bit, uint32_t proba) {
   assert(proba < 256);
-  assert(bit == 0 || bit == 1);
+  assert(bit <= 1);
   if (b->left_ > 0 || TBufferNewPage(b)) {
     const int slot = --b->left_;
     b->tokens_[slot] = (bit << 15) | FIXED_PROBA_BIT | proba;
   }
 }
 
-int VP8RecordCoeffTokens(int ctx, int coeff_type, int first, int last,
+int VP8RecordCoeffTokens(const int ctx, const int coeff_type,
+                         int first, int last,
                          const int16_t* const coeffs,
                          VP8TBuffer* const tokens) {
   int n = first;
-  uint32_t base_id = TOKEN_ID(coeff_type, n, ctx, 0);
+  uint32_t base_id = TOKEN_ID(coeff_type, n, ctx);
   if (!AddToken(tokens, last >= 0, base_id + 0)) {
     return 0;
   }
@@ -120,14 +121,13 @@ int VP8RecordCoeffTokens(int ctx, int coeff_type, int first, int last,
   while (n < 16) {
     const int c = coeffs[n++];
     const int sign = c < 0;
-    int v = sign ? -c : c;
+    const uint32_t v = sign ? -c : c;
     if (!AddToken(tokens, v != 0, base_id + 1)) {
-      ctx = 0;
-      base_id = TOKEN_ID(coeff_type, VP8EncBands[n], ctx, 0);
+      base_id = TOKEN_ID(coeff_type, VP8EncBands[n], 0);  // ctx=0
       continue;
     }
     if (!AddToken(tokens, v > 1, base_id + 2)) {
-      ctx = 1;
+      base_id = TOKEN_ID(coeff_type, VP8EncBands[n], 1);  // ctx=1
     } else {
       if (!AddToken(tokens, v > 4, base_id + 3)) {
         if (AddToken(tokens, v != 2, base_id + 4))
@@ -142,40 +142,40 @@ int VP8RecordCoeffTokens(int ctx, int coeff_type, int first, int last,
       } else {
         int mask;
         const uint8_t* tab;
-        if (v < 3 + (8 << 1)) {          // VP8Cat3  (3b)
+        uint32_t residue = v - 3;
+        if (residue < (8 << 1)) {          // VP8Cat3  (3b)
           AddToken(tokens, 0, base_id + 8);
           AddToken(tokens, 0, base_id + 9);
-          v -= 3 + (8 << 0);
+          residue -= (8 << 0);
           mask = 1 << 2;
           tab = VP8Cat3;
-        } else if (v < 3 + (8 << 2)) {   // VP8Cat4  (4b)
+        } else if (residue < (8 << 2)) {   // VP8Cat4  (4b)
           AddToken(tokens, 0, base_id + 8);
           AddToken(tokens, 1, base_id + 9);
-          v -= 3 + (8 << 1);
+          residue -= (8 << 1);
           mask = 1 << 3;
           tab = VP8Cat4;
-        } else if (v < 3 + (8 << 3)) {   // VP8Cat5  (5b)
+        } else if (residue < (8 << 3)) {   // VP8Cat5  (5b)
           AddToken(tokens, 1, base_id + 8);
           AddToken(tokens, 0, base_id + 10);
-          v -= 3 + (8 << 2);
+          residue -= (8 << 2);
           mask = 1 << 4;
           tab = VP8Cat5;
         } else {                         // VP8Cat6 (11b)
           AddToken(tokens, 1, base_id + 8);
           AddToken(tokens, 1, base_id + 10);
-          v -= 3 + (8 << 3);
+          residue -= (8 << 3);
           mask = 1 << 10;
           tab = VP8Cat6;
         }
         while (mask) {
-          AddConstantToken(tokens, !!(v & mask), *tab++);
+          AddConstantToken(tokens, !!(residue & mask), *tab++);
           mask >>= 1;
         }
       }
-      ctx = 2;
+      base_id = TOKEN_ID(coeff_type, VP8EncBands[n], 2);  // ctx=2
     }
     AddConstantToken(tokens, sign, 128);
-    base_id = TOKEN_ID(coeff_type, VP8EncBands[n], ctx, 0);
     if (n == 16 || !AddToken(tokens, n <= last, base_id + 0)) {
       return 1;   // EOB
     }
@@ -224,7 +224,6 @@ void VP8TokenToStats(const VP8TBuffer* const b, proba_t* const stats) {
 int VP8EmitTokens(VP8TBuffer* const b, VP8BitWriter* const bw,
                   const uint8_t* const probas, int final_pass) {
   const VP8Tokens* p = b->pages_;
-  (void)final_pass;
   assert(!b->error_);
   while (p != NULL) {
     const VP8Tokens* const next = p->next_;
diff --git a/src/3rdparty/libwebp/src/enc/tree.c b/src/3rdparty/libwebp/src/enc/tree.c
index e5d05e5..f141006 100644
--- a/src/3rdparty/libwebp/src/enc/tree.c
+++ b/src/3rdparty/libwebp/src/enc/tree.c
@@ -154,7 +154,7 @@ const uint8_t
 };
 
 void VP8DefaultProbas(VP8Encoder* const enc) {
-  VP8Proba* const probas = &enc->proba_;
+  VP8EncProba* const probas = &enc->proba_;
   probas->use_skip_proba_ = 0;
   memset(probas->segments_, 255u, sizeof(probas->segments_));
   memcpy(probas->coeffs_, VP8CoeffsProba0, sizeof(VP8CoeffsProba0));
@@ -482,7 +482,7 @@ const uint8_t
   }
 };
 
-void VP8WriteProbas(VP8BitWriter* const bw, const VP8Proba* const probas) {
+void VP8WriteProbas(VP8BitWriter* const bw, const VP8EncProba* const probas) {
   int t, b, c, p;
   for (t = 0; t < NUM_TYPES; ++t) {
     for (b = 0; b < NUM_BANDS; ++b) {
@@ -491,14 +491,14 @@ void VP8WriteProbas(VP8BitWriter* const bw, const VP8Proba* const probas) {
           const uint8_t p0 = probas->coeffs_[t][b][c][p];
           const int update = (p0 != VP8CoeffsProba0[t][b][c][p]);
           if (VP8PutBit(bw, update, VP8CoeffsUpdateProba[t][b][c][p])) {
-            VP8PutValue(bw, p0, 8);
+            VP8PutBits(bw, p0, 8);
           }
         }
       }
     }
   }
   if (VP8PutBitUniform(bw, probas->use_skip_proba_)) {
-    VP8PutValue(bw, probas->skip_proba_, 8);
+    VP8PutBits(bw, probas->skip_proba_, 8);
   }
 }
 
diff --git a/src/3rdparty/libwebp/src/enc/vp8enci.h b/src/3rdparty/libwebp/src/enc/vp8enci.h
index 20f58c6..b2cc8d1 100644
--- a/src/3rdparty/libwebp/src/enc/vp8enci.h
+++ b/src/3rdparty/libwebp/src/enc/vp8enci.h
@@ -15,10 +15,16 @@
 #define WEBP_ENC_VP8ENCI_H_
 
 #include <string.h>     // for memcpy()
-#include "../webp/encode.h"
+#include "../dec/common.h"
 #include "../dsp/dsp.h"
 #include "../utils/bit_writer.h"
 #include "../utils/thread.h"
+#include "../utils/utils.h"
+#include "../webp/encode.h"
+
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+#include "./vp8li.h"
+#endif  // WEBP_EXPERIMENTAL_FEATURES
 
 #ifdef __cplusplus
 extern "C" {
@@ -29,35 +35,10 @@ extern "C" {
 
 // version numbers
 #define ENC_MAJ_VERSION 0
-#define ENC_MIN_VERSION 4
-#define ENC_REV_VERSION 4
-
-// intra prediction modes
-enum { B_DC_PRED = 0,   // 4x4 modes
-       B_TM_PRED = 1,
-       B_VE_PRED = 2,
-       B_HE_PRED = 3,
-       B_RD_PRED = 4,
-       B_VR_PRED = 5,
-       B_LD_PRED = 6,
-       B_VL_PRED = 7,
-       B_HD_PRED = 8,
-       B_HU_PRED = 9,
-       NUM_BMODES = B_HU_PRED + 1 - B_DC_PRED,  // = 10
-
-       // Luma16 or UV modes
-       DC_PRED = B_DC_PRED, V_PRED = B_VE_PRED,
-       H_PRED = B_HE_PRED, TM_PRED = B_TM_PRED,
-       NUM_PRED_MODES = 4
-     };
+#define ENC_MIN_VERSION 5
+#define ENC_REV_VERSION 0
 
-enum { NUM_MB_SEGMENTS = 4,
-       MAX_NUM_PARTITIONS = 8,
-       NUM_TYPES = 4,   // 0: i16-AC,  1: i16-DC,  2:chroma-AC,  3:i4-AC
-       NUM_BANDS = 8,
-       NUM_CTX = 3,
-       NUM_PROBAS = 11,
-       MAX_LF_LEVELS = 64,       // Maximum loop filter level
+enum { MAX_LF_LEVELS = 64,       // Maximum loop filter level
        MAX_VARIABLE_LEVEL = 67,  // last (inclusive) level with variable cost
        MAX_LEVEL = 2047          // max level (note: max codable is 2047 + 67)
      };
@@ -69,66 +50,34 @@ typedef enum {   // Rate-distortion optimization levels
   RD_OPT_TRELLIS_ALL = 3   // trellis-quant for every scoring (much slower)
 } VP8RDLevel;
 
-// YUV-cache parameters. Cache is 16-pixels wide.
-// The original or reconstructed samples can be accessed using VP8Scan[]
+// YUV-cache parameters. Cache is 32-bytes wide (= one cacheline).
+// The original or reconstructed samples can be accessed using VP8Scan[].
 // The predicted blocks can be accessed using offsets to yuv_p_ and
-// the arrays VP8*ModeOffsets[];
-//         +----+      YUV Samples area. See VP8Scan[] for accessing the blocks.
-//  Y_OFF  |YYYY| <- original samples  ('yuv_in_')
-//         |YYYY|
-//         |YYYY|
-//         |YYYY|
-//  U_OFF  |UUVV| V_OFF  (=U_OFF + 8)
-//         |UUVV|
-//         +----+
-//  Y_OFF  |YYYY| <- compressed/decoded samples  ('yuv_out_')
-//         |YYYY|    There are two buffers like this ('yuv_out_'/'yuv_out2_')
-//         |YYYY|
-//         |YYYY|
-//  U_OFF  |UUVV| V_OFF
-//         |UUVV|
-//          x2 (for yuv_out2_)
-//         +----+     Prediction area ('yuv_p_', size = PRED_SIZE)
-// I16DC16 |YYYY|  Intra16 predictions (16x16 block each)
-//         |YYYY|
-//         |YYYY|
-//         |YYYY|
-// I16TM16 |YYYY|
-//         |YYYY|
-//         |YYYY|
-//         |YYYY|
-// I16VE16 |YYYY|
-//         |YYYY|
-//         |YYYY|
-//         |YYYY|
-// I16HE16 |YYYY|
-//         |YYYY|
-//         |YYYY|
-//         |YYYY|
-//         +----+  Chroma U/V predictions (16x8 block each)
-// C8DC8   |UUVV|
-//         |UUVV|
-// C8TM8   |UUVV|
-//         |UUVV|
-// C8VE8   |UUVV|
-//         |UUVV|
-// C8HE8   |UUVV|
-//         |UUVV|
-//         +----+  Intra 4x4 predictions (4x4 block each)
-//         |YYYY| I4DC4 I4TM4 I4VE4 I4HE4
-//         |YYYY| I4RD4 I4VR4 I4LD4 I4VL4
-//         |YY..| I4HD4 I4HU4 I4TMP
-//         +----+
-#define BPS       16   // this is the common stride
-#define Y_SIZE   (BPS * 16)
-#define UV_SIZE  (BPS * 8)
-#define YUV_SIZE (Y_SIZE + UV_SIZE)
-#define PRED_SIZE (6 * 16 * BPS + 12 * BPS)
-#define Y_OFF    (0)
-#define U_OFF    (Y_SIZE)
-#define V_OFF    (U_OFF + 8)
-#define ALIGN_CST 15
-#define DO_ALIGN(PTR) ((uintptr_t)((PTR) + ALIGN_CST) & ~ALIGN_CST)
+// the arrays VP8*ModeOffsets[].
+// * YUV Samples area (yuv_in_/yuv_out_/yuv_out2_)
+//   (see VP8Scan[] for accessing the blocks, along with
+//   Y_OFF_ENC/U_OFF_ENC/V_OFF_ENC):
+//             +----+----+
+//  Y_OFF_ENC  |YYYY|UUVV|
+//  U_OFF_ENC  |YYYY|UUVV|
+//  V_OFF_ENC  |YYYY|....| <- 25% wasted U/V area
+//             |YYYY|....|
+//             +----+----+
+// * Prediction area ('yuv_p_', size = PRED_SIZE_ENC)
+//   Intra16 predictions (16x16 block each, two per row):
+//         |I16DC16|I16TM16|
+//         |I16VE16|I16HE16|
+//   Chroma U/V predictions (16x8 block each, two per row):
+//         |C8DC8|C8TM8|
+//         |C8VE8|C8HE8|
+//   Intra 4x4 predictions (4x4 block each)
+//         |I4DC4 I4TM4 I4VE4 I4HE4|I4RD4 I4VR4 I4LD4 I4VL4|
+//         |I4HD4 I4HU4 I4TMP .....|.......................| <- ~31% wasted
+#define YUV_SIZE_ENC (BPS * 16)
+#define PRED_SIZE_ENC (32 * BPS + 16 * BPS + 8 * BPS)   // I16+Chroma+I4 preds
+#define Y_OFF_ENC    (0)
+#define U_OFF_ENC    (16)
+#define V_OFF_ENC    (16 + 8)
 
 extern const int VP8Scan[16];           // in quant.c
 extern const int VP8UVModeOffsets[4];   // in analyze.c
@@ -138,26 +87,26 @@ extern const int VP8I4ModeOffsets[NUM_BMODES];
 // Layout of prediction blocks
 // intra 16x16
 #define I16DC16 (0 * 16 * BPS)
-#define I16TM16 (1 * 16 * BPS)
-#define I16VE16 (2 * 16 * BPS)
-#define I16HE16 (3 * 16 * BPS)
+#define I16TM16 (I16DC16 + 16)
+#define I16VE16 (1 * 16 * BPS)
+#define I16HE16 (I16VE16 + 16)
 // chroma 8x8, two U/V blocks side by side (hence: 16x8 each)
-#define C8DC8 (4 * 16 * BPS)
-#define C8TM8 (4 * 16 * BPS + 8 * BPS)
-#define C8VE8 (5 * 16 * BPS)
-#define C8HE8 (5 * 16 * BPS + 8 * BPS)
+#define C8DC8 (2 * 16 * BPS)
+#define C8TM8 (C8DC8 + 1 * 16)
+#define C8VE8 (2 * 16 * BPS + 8 * BPS)
+#define C8HE8 (C8VE8 + 1 * 16)
 // intra 4x4
-#define I4DC4 (6 * 16 * BPS +  0)
-#define I4TM4 (6 * 16 * BPS +  4)
-#define I4VE4 (6 * 16 * BPS +  8)
-#define I4HE4 (6 * 16 * BPS + 12)
-#define I4RD4 (6 * 16 * BPS + 4 * BPS +  0)
-#define I4VR4 (6 * 16 * BPS + 4 * BPS +  4)
-#define I4LD4 (6 * 16 * BPS + 4 * BPS +  8)
-#define I4VL4 (6 * 16 * BPS + 4 * BPS + 12)
-#define I4HD4 (6 * 16 * BPS + 8 * BPS +  0)
-#define I4HU4 (6 * 16 * BPS + 8 * BPS +  4)
-#define I4TMP (6 * 16 * BPS + 8 * BPS +  8)
+#define I4DC4 (3 * 16 * BPS +  0)
+#define I4TM4 (I4DC4 +  4)
+#define I4VE4 (I4DC4 +  8)
+#define I4HE4 (I4DC4 + 12)
+#define I4RD4 (I4DC4 + 16)
+#define I4VR4 (I4DC4 + 20)
+#define I4LD4 (I4DC4 + 24)
+#define I4VL4 (I4DC4 + 28)
+#define I4HD4 (3 * 16 * BPS + 4 * BPS)
+#define I4HU4 (I4HD4 + 4)
+#define I4TMP (I4HD4 + 8)
 
 typedef int64_t score_t;     // type used for scores, rate, distortion
 // Note that MAX_COST is not the maximum allowed by sizeof(score_t),
@@ -172,14 +121,6 @@ static WEBP_INLINE int QUANTDIV(uint32_t n, uint32_t iQ, uint32_t B) {
   return (int)((n * iQ + B) >> QFIX);
 }
 
-// size of histogram used by CollectHistogram.
-#define MAX_COEFF_THRESH   31
-typedef struct VP8Histogram VP8Histogram;
-struct VP8Histogram {
-  // TODO(skal): we only need to store the max_value and last_non_zero actually.
-  int distribution[MAX_COEFF_THRESH + 1];
-};
-
 // Uncomment the following to remove token-buffer code:
 // #define DISABLE_TOKEN_BUFFER
 
@@ -190,6 +131,8 @@ typedef uint32_t proba_t;   // 16b + 16b
 typedef uint8_t ProbaArray[NUM_CTX][NUM_PROBAS];
 typedef proba_t StatsArray[NUM_CTX][NUM_PROBAS];
 typedef uint16_t CostArray[NUM_CTX][MAX_VARIABLE_LEVEL + 1];
+typedef const uint16_t* (*CostArrayPtr)[NUM_CTX];   // for easy casting
+typedef const uint16_t* CostArrayMap[16][NUM_CTX];
 typedef double LFStats[NUM_MB_SEGMENTS][MAX_LF_LEVELS];  // filter stats
 
 typedef struct VP8Encoder VP8Encoder;
@@ -200,7 +143,7 @@ typedef struct {
   int update_map_;        // whether to update the segment map or not.
                           // must be 0 if there's only 1 segment.
   int size_;              // bit-cost for transmitting the segment map
-} VP8SegmentHeader;
+} VP8EncSegmentHeader;
 
 // Struct collecting all frame-persistent probabilities.
 typedef struct {
@@ -209,10 +152,11 @@ typedef struct {
   ProbaArray coeffs_[NUM_TYPES][NUM_BANDS];      // 1056 bytes
   StatsArray stats_[NUM_TYPES][NUM_BANDS];       // 4224 bytes
   CostArray level_cost_[NUM_TYPES][NUM_BANDS];   // 13056 bytes
+  CostArrayMap remapped_costs_[NUM_TYPES];       // 1536 bytes
   int dirty_;               // if true, need to call VP8CalculateLevelCosts()
   int use_skip_proba_;      // Note: we always use skip_proba for now.
   int nb_skip_;             // number of skipped blocks
-} VP8Proba;
+} VP8EncProba;
 
 // Filter parameters. Not actually used in the code (we don't perform
 // the in-loop filtering), but filled from user's config
@@ -221,7 +165,7 @@ typedef struct {
   int level_;              // base filter level [0..63]
   int sharpness_;          // [0..7]
   int i4x4_lf_delta_;      // delta filter level for i4x4 relative to i16x16
-} VP8FilterHeader;
+} VP8EncFilterHeader;
 
 //------------------------------------------------------------------------------
 // Informations about the macroblocks.
@@ -307,9 +251,10 @@ typedef struct {
   uint8_t* y_top_;     // top luma samples at position 'x_'
   uint8_t* uv_top_;    // top u/v samples at position 'x_', packed as 16 bytes
 
-  // memory for storing y/u/v_left_ and yuv_in_/out_*
-  uint8_t yuv_left_mem_[17 + 16 + 16 + 8 + ALIGN_CST];     // memory for *_left_
-  uint8_t yuv_mem_[3 * YUV_SIZE + PRED_SIZE + ALIGN_CST];  // memory for yuv_*
+  // memory for storing y/u/v_left_
+  uint8_t yuv_left_mem_[17 + 16 + 16 + 8 + WEBP_ALIGN_CST];
+  // memory for yuv_*
+  uint8_t yuv_mem_[3 * YUV_SIZE_ENC + PRED_SIZE_ENC + WEBP_ALIGN_CST];
 } VP8EncIterator;
 
   // in iterator.c
@@ -381,7 +326,8 @@ int VP8EmitTokens(VP8TBuffer* const b, VP8BitWriter* const bw,
                   const uint8_t* const probas, int final_pass);
 
 // record the coding of coefficients without knowing the probabilities yet
-int VP8RecordCoeffTokens(int ctx, int coeff_type, int first, int last,
+int VP8RecordCoeffTokens(const int ctx, const int coeff_type,
+                         int first, int last,
                          const int16_t* const coeffs,
                          VP8TBuffer* const tokens);
 
@@ -401,8 +347,8 @@ struct VP8Encoder {
   WebPPicture* pic_;            // input / output picture
 
   // headers
-  VP8FilterHeader   filter_hdr_;     // filtering information
-  VP8SegmentHeader  segment_hdr_;    // segment information
+  VP8EncFilterHeader   filter_hdr_;     // filtering information
+  VP8EncSegmentHeader  segment_hdr_;    // segment information
 
   int profile_;                      // VP8's profile, deduced from Config.
 
@@ -438,12 +384,12 @@ struct VP8Encoder {
   int dq_uv_dc_, dq_uv_ac_;
 
   // probabilities and statistics
-  VP8Proba proba_;
-  uint64_t sse_[4];        // sum of Y/U/V/A squared errors for all macroblocks
-  uint64_t sse_count_;     // pixel count for the sse_[] stats
-  int      coded_size_;
-  int      residual_bytes_[3][4];
-  int      block_count_[3];
+  VP8EncProba proba_;
+  uint64_t    sse_[4];      // sum of Y/U/V/A squared errors for all macroblocks
+  uint64_t    sse_count_;   // pixel count for the sse_[] stats
+  int         coded_size_;
+  int         residual_bytes_[3][4];
+  int         block_count_[3];
 
   // quality/speed settings
   int method_;               // 0=fastest, 6=best/slowest.
@@ -473,7 +419,7 @@ extern const uint8_t
 // Reset the token probabilities to their initial (default) values
 void VP8DefaultProbas(VP8Encoder* const enc);
 // Write the token probabilities
-void VP8WriteProbas(VP8BitWriter* const bw, const VP8Proba* const probas);
+void VP8WriteProbas(VP8BitWriter* const bw, const VP8EncProba* const probas);
 // Writes the partition #0 modes (that is: all intra modes)
 void VP8CodeIntraModes(VP8Encoder* const enc);
 
@@ -486,7 +432,6 @@ int VP8EncWrite(VP8Encoder* const enc);
 void VP8EncFreeBitWriters(VP8Encoder* const enc);
 
   // in frame.c
-extern const uint8_t VP8EncBands[16 + 1];
 extern const uint8_t VP8Cat3[];
 extern const uint8_t VP8Cat4[];
 extern const uint8_t VP8Cat5[];
@@ -569,12 +514,21 @@ int WebPPictureAllocARGB(WebPPicture* const picture, int width, int height);
 // Returns false in case of error (invalid param, out-of-memory).
 int WebPPictureAllocYUVA(WebPPicture* const picture, int width, int height);
 
+// Clean-up the RGB samples under fully transparent area, to help lossless
+// compressibility (no guarantee, though). Assumes that pic->use_argb is true.
+void WebPCleanupTransparentAreaLossless(WebPPicture* const pic);
+
+  // in near_lossless.c
+// Near lossless preprocessing in RGB color-space.
+int VP8ApplyNearLossless(int xsize, int ysize, uint32_t* argb, int quality);
+// Near lossless adjustment for predictors.
+void VP8ApplyNearLosslessPredict(int xsize, int ysize, int pred_bits,
+                                 const uint32_t* argb_orig,
+                                 uint32_t* argb, uint32_t* argb_scratch,
+                                 const uint32_t* const transform_data,
+                                 int quality, int subtract_green);
 //------------------------------------------------------------------------------
 
-#if WEBP_ENCODER_ABI_VERSION <= 0x0203
-void WebPMemoryWriterClear(WebPMemoryWriter* writer);
-#endif
-
 #ifdef __cplusplus
 }    // extern "C"
 #endif
diff --git a/src/3rdparty/libwebp/src/enc/vp8l.c b/src/3rdparty/libwebp/src/enc/vp8l.c
index c2bb13d..db94e78 100644
--- a/src/3rdparty/libwebp/src/enc/vp8l.c
+++ b/src/3rdparty/libwebp/src/enc/vp8l.c
@@ -13,10 +13,10 @@
 //
 
 #include <assert.h>
-#include <stdio.h>
 #include <stdlib.h>
 
 #include "./backward_references.h"
+#include "./histogram.h"
 #include "./vp8enci.h"
 #include "./vp8li.h"
 #include "../dsp/lossless.h"
@@ -25,23 +25,105 @@
 #include "../utils/utils.h"
 #include "../webp/format_constants.h"
 
+#include "./delta_palettization.h"
+
 #define PALETTE_KEY_RIGHT_SHIFT   22  // Key for 1K buffer.
-#define MAX_HUFF_IMAGE_SIZE       (16 * 1024 * 1024)
-#define MAX_COLORS_FOR_GRAPH      64
+// Maximum number of histogram images (sub-blocks).
+#define MAX_HUFF_IMAGE_SIZE       2600
 
-// -----------------------------------------------------------------------------
-// Palette
+// Palette reordering for smaller sum of deltas (and for smaller storage).
 
-static int CompareColors(const void* p1, const void* p2) {
-  const uint32_t a = *(const uint32_t*)p1;
-  const uint32_t b = *(const uint32_t*)p2;
+static int PaletteCompareColorsForQsort(const void* p1, const void* p2) {
+  const uint32_t a = WebPMemToUint32(p1);
+  const uint32_t b = WebPMemToUint32(p2);
   assert(a != b);
   return (a < b) ? -1 : 1;
 }
 
+static WEBP_INLINE uint32_t PaletteComponentDistance(uint32_t v) {
+  return (v <= 128) ? v : (256 - v);
+}
+
+// Computes a score related to the entropy created by the palette entry diff:
+// the distances of the three low bytes of the diff are summed and weighted
+// (x9), then the distance of the top (alpha) byte is added unweighted.
+// Note that the final '& 0xff' (on diff >> 24) is a no-op on a 32-bit value;
+// most compilers remove it and it is kept only for regularity of the code.
+static WEBP_INLINE uint32_t PaletteColorDistance(uint32_t col1, uint32_t col2) {
+  const uint32_t diff = VP8LSubPixels(col1, col2);
+  const int kMoreWeightForRGBThanForAlpha = 9;
+  uint32_t score;
+  score =  PaletteComponentDistance((diff >>  0) & 0xff);
+  score += PaletteComponentDistance((diff >>  8) & 0xff);
+  score += PaletteComponentDistance((diff >> 16) & 0xff);
+  score *= kMoreWeightForRGBThanForAlpha;
+  score += PaletteComponentDistance((diff >> 24) & 0xff);
+  return score;
+}
+
+static WEBP_INLINE void SwapColor(uint32_t* const col1, uint32_t* const col2) {
+  const uint32_t tmp = *col1;
+  *col1 = *col2;
+  *col2 = tmp;
+}
+
+static void GreedyMinimizeDeltas(uint32_t palette[], int num_colors) {
+  // Find greedily always the closest color of the predicted color to minimize
+  // deltas in the palette. This reduces storage needs since the
+  // palette is stored with delta encoding.
+  uint32_t predict = 0x00000000;
+  int i, k;
+  for (i = 0; i < num_colors; ++i) {
+    int best_ix = i;
+    uint32_t best_score = ~0U;
+    for (k = i; k < num_colors; ++k) {
+      const uint32_t cur_score = PaletteColorDistance(palette[k], predict);
+      if (best_score > cur_score) {
+        best_score = cur_score;
+        best_ix = k;
+      }
+    }
+    SwapColor(&palette[best_ix], &palette[i]);
+    predict = palette[i];
+  }
+}
+
+// The palette has been sorted by alpha. This function checks whether the
+// red, green and blue components develop monotonically with regard to the
+// position in the palette. If all of them do, there is no benefit in
+// re-organizing the palette greedily. A monotonic development
+// would be spotted in green-only situations (like lossy alpha) or gray-scale
+// images. Returns non-zero if some component has deltas of both signs.
+static int PaletteHasNonMonotonousDeltas(uint32_t palette[], int num_colors) {
+  uint32_t predict = 0x000000;
+  int i;
+  uint8_t sign_found = 0x00;
+  for (i = 0; i < num_colors; ++i) {
+    const uint32_t diff = VP8LSubPixels(palette[i], predict);
+    const uint8_t rd = (diff >> 16) & 0xff;
+    const uint8_t gd = (diff >>  8) & 0xff;
+    const uint8_t bd = (diff >>  0) & 0xff;
+    if (rd != 0x00) {
+      sign_found |= (rd < 0x80) ? 1 : 2;
+    }
+    if (gd != 0x00) {
+      sign_found |= (gd < 0x80) ? 8 : 16;
+    }
+    if (bd != 0x00) {
+      sign_found |= (bd < 0x80) ? 64 : 128;
+    }
+    predict = palette[i];
+  }
+  return (sign_found & (sign_found << 1)) != 0;  // both signs in a channel.
+}
+
+// -----------------------------------------------------------------------------
+// Palette
+
 // If number of colors in the image is less than or equal to MAX_PALETTE_SIZE,
 // creates a palette and returns true, else returns false.
 static int AnalyzeAndCreatePalette(const WebPPicture* const pic,
+                                   int low_effort,
                                    uint32_t palette[MAX_PALETTE_SIZE],
                                    int* const palette_size) {
   int i, x, y, key;
@@ -92,84 +174,240 @@ static int AnalyzeAndCreatePalette(const WebPPicture* const pic,
       ++num_colors;
     }
   }
-
-  qsort(palette, num_colors, sizeof(*palette), CompareColors);
   *palette_size = num_colors;
+  qsort(palette, num_colors, sizeof(*palette), PaletteCompareColorsForQsort);
+  if (!low_effort && PaletteHasNonMonotonousDeltas(palette, num_colors)) {
+    GreedyMinimizeDeltas(palette, num_colors);
+  }
   return 1;
 }
 
-static int AnalyzeEntropy(const uint32_t* argb,
-                          int width, int height, int argb_stride,
-                          double* const nonpredicted_bits,
-                          double* const predicted_bits) {
-  int x, y;
-  const uint32_t* last_line = NULL;
-  uint32_t last_pix = argb[0];    // so we're sure that pix_diff == 0
+// These five modes are evaluated and their respective entropy is computed.
+typedef enum {
+  kDirect = 0,
+  kSpatial = 1,
+  kSubGreen = 2,
+  kSpatialSubGreen = 3,
+  kPalette = 4,
+  kNumEntropyIx = 5
+} EntropyIx;
+
+typedef enum {
+  kHistoAlpha = 0,
+  kHistoAlphaPred,
+  kHistoGreen,
+  kHistoGreenPred,
+  kHistoRed,
+  kHistoRedPred,
+  kHistoBlue,
+  kHistoBluePred,
+  kHistoRedSubGreen,
+  kHistoRedPredSubGreen,
+  kHistoBlueSubGreen,
+  kHistoBluePredSubGreen,
+  kHistoPalette,
+  kHistoTotal  // Must be last.
+} HistoIx;
+
+static void AddSingleSubGreen(uint32_t p, uint32_t* r, uint32_t* b) {
+  const uint32_t green = p >> 8;  // The upper bits are masked away later.
+  ++r[((p >> 16) - green) & 0xff];
+  ++b[(p - green) & 0xff];
+}
 
-  VP8LHistogramSet* const histo_set = VP8LAllocateHistogramSet(2, 0);
-  if (histo_set == NULL) return 0;
+static void AddSingle(uint32_t p,
+                      uint32_t* a, uint32_t* r, uint32_t* g, uint32_t* b) {
+  ++a[p >> 24];
+  ++r[(p >> 16) & 0xff];
+  ++g[(p >> 8) & 0xff];
+  ++b[(p & 0xff)];
+}
 
-  for (y = 0; y < height; ++y) {
-    for (x = 0; x < width; ++x) {
-      const uint32_t pix = argb[x];
-      const uint32_t pix_diff = VP8LSubPixels(pix, last_pix);
-      if (pix_diff == 0) continue;
-      if (last_line != NULL && pix == last_line[x]) {
-        continue;
+static int AnalyzeEntropy(const uint32_t* argb,
+                          int width, int height, int argb_stride,
+                          int use_palette,
+                          EntropyIx* const min_entropy_ix,
+                          int* const red_and_blue_always_zero) {
+  // One zero-initialized 256-bin histogram per HistoIx entry (flat array).
+  uint32_t* const histo =
+      (uint32_t*)WebPSafeCalloc(kHistoTotal, sizeof(*histo) * 256);
+  if (histo != NULL) {
+    int i, x, y;
+    const uint32_t* prev_row = argb;
+    const uint32_t* curr_row = argb + argb_stride;
+    for (y = 1; y < height; ++y) {
+      uint32_t prev_pix = curr_row[0];
+      for (x = 1; x < width; ++x) {
+        const uint32_t pix = curr_row[x];
+        const uint32_t pix_diff = VP8LSubPixels(pix, prev_pix);
+        if ((pix_diff == 0) || (pix == prev_row[x])) continue;
+        prev_pix = pix;
+        AddSingle(pix,
+                  &histo[kHistoAlpha * 256],
+                  &histo[kHistoRed * 256],
+                  &histo[kHistoGreen * 256],
+                  &histo[kHistoBlue * 256]);
+        AddSingle(pix_diff,
+                  &histo[kHistoAlphaPred * 256],
+                  &histo[kHistoRedPred * 256],
+                  &histo[kHistoGreenPred * 256],
+                  &histo[kHistoBluePred * 256]);
+        AddSingleSubGreen(pix,
+                          &histo[kHistoRedSubGreen * 256],
+                          &histo[kHistoBlueSubGreen * 256]);
+        AddSingleSubGreen(pix_diff,
+                          &histo[kHistoRedPredSubGreen * 256],
+                          &histo[kHistoBluePredSubGreen * 256]);
+        {
+          // Approximate the palette by the entropy of the multiplicative hash.
+          const int hash = ((pix + (pix >> 19)) * 0x39c5fba7) >> 24;
+          ++histo[kHistoPalette * 256 + (hash & 0xff)];
+        }
+      }
+      prev_row = curr_row;
+      curr_row += argb_stride;
+    }
+    {
+      double entropy_comp[kHistoTotal];
+      double entropy[kNumEntropyIx];
+      EntropyIx k;
+      EntropyIx last_mode_to_analyze =
+          use_palette ? kPalette : kSpatialSubGreen;
+      int j;
+      // Let's add one zero to the predicted histograms. The zeros are removed
+      // too efficiently by the pix_diff == 0 comparison, at least one of the
+      // zeros is likely to exist.
+      ++histo[kHistoRedPredSubGreen * 256];
+      ++histo[kHistoBluePredSubGreen * 256];
+      ++histo[kHistoRedPred * 256];
+      ++histo[kHistoGreenPred * 256];
+      ++histo[kHistoBluePred * 256];
+      ++histo[kHistoAlphaPred * 256];
+
+      for (j = 0; j < kHistoTotal; ++j) {
+        entropy_comp[j] = VP8LBitsEntropy(&histo[j * 256], 256, NULL);
       }
-      last_pix = pix;
+      entropy[kDirect] = entropy_comp[kHistoAlpha] +
+          entropy_comp[kHistoRed] +
+          entropy_comp[kHistoGreen] +
+          entropy_comp[kHistoBlue];
+      entropy[kSpatial] = entropy_comp[kHistoAlphaPred] +
+          entropy_comp[kHistoRedPred] +
+          entropy_comp[kHistoGreenPred] +
+          entropy_comp[kHistoBluePred];
+      entropy[kSubGreen] = entropy_comp[kHistoAlpha] +
+          entropy_comp[kHistoRedSubGreen] +
+          entropy_comp[kHistoGreen] +
+          entropy_comp[kHistoBlueSubGreen];
+      entropy[kSpatialSubGreen] = entropy_comp[kHistoAlphaPred] +
+          entropy_comp[kHistoRedPredSubGreen] +
+          entropy_comp[kHistoGreenPred] +
+          entropy_comp[kHistoBluePredSubGreen];
+      // Palette mode seems more efficient in a breakeven case. Bias with 1.0.
+      entropy[kPalette] = entropy_comp[kHistoPalette] - 1.0;
+
+      *min_entropy_ix = kDirect;
+      for (k = kDirect + 1; k <= last_mode_to_analyze; ++k) {
+        if (entropy[*min_entropy_ix] > entropy[k]) {
+          *min_entropy_ix = k;
+        }
+      }
+      *red_and_blue_always_zero = 1;
+      // Let's check if the histogram of the chosen entropy mode has
+      // non-zero red and blue values. If all are zero, we can later skip
+      // the cross color optimization.
       {
-        const PixOrCopy pix_token = PixOrCopyCreateLiteral(pix);
-        const PixOrCopy pix_diff_token = PixOrCopyCreateLiteral(pix_diff);
-        VP8LHistogramAddSinglePixOrCopy(histo_set->histograms[0], &pix_token);
-        VP8LHistogramAddSinglePixOrCopy(histo_set->histograms[1],
-                                        &pix_diff_token);
+        static const uint8_t kHistoPairs[5][2] = {
+          { kHistoRed, kHistoBlue },
+          { kHistoRedPred, kHistoBluePred },
+          { kHistoRedSubGreen, kHistoBlueSubGreen },
+          { kHistoRedPredSubGreen, kHistoBluePredSubGreen },
+          { kHistoRed, kHistoBlue }
+        };
+        const uint32_t* const red_histo =
+            &histo[256 * kHistoPairs[*min_entropy_ix][0]];
+        const uint32_t* const blue_histo =
+            &histo[256 * kHistoPairs[*min_entropy_ix][1]];
+        for (i = 1; i < 256; ++i) {
+          if ((red_histo[i] | blue_histo[i]) != 0) {
+            *red_and_blue_always_zero = 0;
+            break;
+          }
+        }
       }
     }
-    last_line = argb;
-    argb += argb_stride;
+    WebPSafeFree(histo);  // pairs with WebPSafeCalloc() above
+    return 1;
+  } else {
+    return 0;
   }
-  *nonpredicted_bits = VP8LHistogramEstimateBitsBulk(histo_set->histograms[0]);
-  *predicted_bits = VP8LHistogramEstimateBitsBulk(histo_set->histograms[1]);
-  VP8LFreeHistogramSet(histo_set);
-  return 1;
 }
 
-static int AnalyzeAndInit(VP8LEncoder* const enc, WebPImageHint image_hint) {
+static int GetHistoBits(int method, int use_palette, int width, int height) {
+  // Make tile size a function of encoding method (Range: 0 to 6).
+  int histo_bits = (use_palette ? 9 : 7) - method;
+  while (1) {
+    const int huff_image_size = VP8LSubSampleSize(width, histo_bits) *
+                                VP8LSubSampleSize(height, histo_bits);
+    if (huff_image_size <= MAX_HUFF_IMAGE_SIZE) break;
+    ++histo_bits;
+  }
+  return (histo_bits < MIN_HUFFMAN_BITS) ? MIN_HUFFMAN_BITS :
+         (histo_bits > MAX_HUFFMAN_BITS) ? MAX_HUFFMAN_BITS : histo_bits;
+}
+
+static int GetTransformBits(int method, int histo_bits) {
+  const int max_transform_bits = (method < 4) ? 6 : (method > 4) ? 4 : 5;
+  return (histo_bits > max_transform_bits) ? max_transform_bits : histo_bits;
+}
+
+static int AnalyzeAndInit(VP8LEncoder* const enc) {
   const WebPPicture* const pic = enc->pic_;
   const int width = pic->width;
   const int height = pic->height;
   const int pix_cnt = width * height;
+  const WebPConfig* const config = enc->config_;
+  const int method = config->method;
+  const int low_effort = (config->method == 0);
   // we round the block size up, so we're guaranteed to have
   // at max MAX_REFS_BLOCK_PER_IMAGE blocks used:
   int refs_block_size = (pix_cnt - 1) / MAX_REFS_BLOCK_PER_IMAGE + 1;
   assert(pic != NULL && pic->argb != NULL);
 
+  enc->use_cross_color_ = 0;
+  enc->use_predict_ = 0;
+  enc->use_subtract_green_ = 0;
   enc->use_palette_ =
-      AnalyzeAndCreatePalette(pic, enc->palette_, &enc->palette_size_);
+      AnalyzeAndCreatePalette(pic, low_effort,
+                              enc->palette_, &enc->palette_size_);
 
-  if (image_hint == WEBP_HINT_GRAPH) {
-    if (enc->use_palette_ && enc->palette_size_ < MAX_COLORS_FOR_GRAPH) {
-      enc->use_palette_ = 0;
-    }
-  }
+  // TODO(jyrki): replace the decision to be based on an actual estimate
+  // of entropy, or even spatial variance of entropy.
+  enc->histo_bits_ = GetHistoBits(method, enc->use_palette_,
+                                  pic->width, pic->height);
+  enc->transform_bits_ = GetTransformBits(method, enc->histo_bits_);
 
-  if (!enc->use_palette_) {
-    if (image_hint == WEBP_HINT_PHOTO) {
-      enc->use_predict_ = 1;
-      enc->use_cross_color_ = 1;
-    } else {
-      double non_pred_entropy, pred_entropy;
-      if (!AnalyzeEntropy(pic->argb, width, height, pic->argb_stride,
-                          &non_pred_entropy, &pred_entropy)) {
-        return 0;
-      }
-      if (pred_entropy < 0.95 * non_pred_entropy) {
-        enc->use_predict_ = 1;
-        enc->use_cross_color_ = 1;
-      }
+  if (low_effort) {
+    // AnalyzeEntropy is somewhat slow.
+    enc->use_predict_ = !enc->use_palette_;
+    enc->use_subtract_green_ = !enc->use_palette_;
+    enc->use_cross_color_ = 0;
+  } else {
+    int red_and_blue_always_zero;
+    EntropyIx min_entropy_ix;
+    if (!AnalyzeEntropy(pic->argb, width, height, pic->argb_stride,
+                        enc->use_palette_, &min_entropy_ix,
+                        &red_and_blue_always_zero)) {
+      return 0;
     }
+    enc->use_palette_ = (min_entropy_ix == kPalette);
+    enc->use_subtract_green_ =
+        (min_entropy_ix == kSubGreen) || (min_entropy_ix == kSpatialSubGreen);
+    enc->use_predict_ =
+        (min_entropy_ix == kSpatial) || (min_entropy_ix == kSpatialSubGreen);
+    enc->use_cross_color_ = red_and_blue_always_zero ? 0 : enc->use_predict_;
   }
+
   if (!VP8LHashChainInit(&enc->hash_chain_, pix_cnt)) return 0;
 
   // palette-friendly input typically uses less literals
@@ -271,9 +509,9 @@ static void StoreHuffmanTreeOfHuffmanTreeToBitMask(
       break;
     }
   }
-  VP8LWriteBits(bw, 4, codes_to_store - 4);
+  VP8LPutBits(bw, codes_to_store - 4, 4);
   for (i = 0; i < codes_to_store; ++i) {
-    VP8LWriteBits(bw, 3, code_length_bitdepth[kStorageOrder[i]]);
+    VP8LPutBits(bw, code_length_bitdepth[kStorageOrder[i]], 3);
   }
 }
 
@@ -301,16 +539,16 @@ static void StoreHuffmanTreeToBitMask(
   for (i = 0; i < num_tokens; ++i) {
     const int ix = tokens[i].code;
     const int extra_bits = tokens[i].extra_bits;
-    VP8LWriteBits(bw, huffman_code->code_lengths[ix], huffman_code->codes[ix]);
+    VP8LPutBits(bw, huffman_code->codes[ix], huffman_code->code_lengths[ix]);
     switch (ix) {
       case 16:
-        VP8LWriteBits(bw, 2, extra_bits);
+        VP8LPutBits(bw, extra_bits, 2);
         break;
       case 17:
-        VP8LWriteBits(bw, 3, extra_bits);
+        VP8LPutBits(bw, extra_bits, 3);
         break;
       case 18:
-        VP8LWriteBits(bw, 7, extra_bits);
+        VP8LPutBits(bw, extra_bits, 7);
         break;
     }
   }
@@ -330,7 +568,7 @@ static void StoreFullHuffmanCode(VP8LBitWriter* const bw,
   huffman_code.code_lengths = code_length_bitdepth;
   huffman_code.codes = code_length_bitdepth_symbols;
 
-  VP8LWriteBits(bw, 1, 0);
+  VP8LPutBits(bw, 0, 1);
   num_tokens = VP8LCreateCompressedHuffmanTree(tree, tokens, max_tokens);
   {
     uint32_t histogram[CODE_LENGTH_CODES] = { 0 };
@@ -367,13 +605,13 @@ static void StoreFullHuffmanCode(VP8LBitWriter* const bw,
     }
     write_trimmed_length = (trimmed_length > 1 && trailing_zero_bits > 12);
     length = write_trimmed_length ? trimmed_length : num_tokens;
-    VP8LWriteBits(bw, 1, write_trimmed_length);
+    VP8LPutBits(bw, write_trimmed_length, 1);
     if (write_trimmed_length) {
       const int nbits = VP8LBitsLog2Ceiling(trimmed_length - 1);
       const int nbitpairs = (nbits == 0) ? 1 : (nbits + 1) / 2;
-      VP8LWriteBits(bw, 3, nbitpairs - 1);
+      VP8LPutBits(bw, nbitpairs - 1, 3);
       assert(trimmed_length >= 2);
-      VP8LWriteBits(bw, nbitpairs * 2, trimmed_length - 2);
+      VP8LPutBits(bw, trimmed_length - 2, nbitpairs * 2);
     }
     StoreHuffmanTreeToBitMask(bw, tokens, length, &huffman_code);
   }
@@ -400,31 +638,42 @@ static void StoreHuffmanCode(VP8LBitWriter* const bw,
 
   if (count == 0) {   // emit minimal tree for empty cases
     // bits: small tree marker: 1, count-1: 0, large 8-bit code: 0, code: 0
-    VP8LWriteBits(bw, 4, 0x01);
+    VP8LPutBits(bw, 0x01, 4);
   } else if (count <= 2 && symbols[0] < kMaxSymbol && symbols[1] < kMaxSymbol) {
-    VP8LWriteBits(bw, 1, 1);  // Small tree marker to encode 1 or 2 symbols.
-    VP8LWriteBits(bw, 1, count - 1);
+    VP8LPutBits(bw, 1, 1);  // Small tree marker to encode 1 or 2 symbols.
+    VP8LPutBits(bw, count - 1, 1);
     if (symbols[0] <= 1) {
-      VP8LWriteBits(bw, 1, 0);  // Code bit for small (1 bit) symbol value.
-      VP8LWriteBits(bw, 1, symbols[0]);
+      VP8LPutBits(bw, 0, 1);  // Code bit for small (1 bit) symbol value.
+      VP8LPutBits(bw, symbols[0], 1);
     } else {
-      VP8LWriteBits(bw, 1, 1);
-      VP8LWriteBits(bw, 8, symbols[0]);
+      VP8LPutBits(bw, 1, 1);
+      VP8LPutBits(bw, symbols[0], 8);
     }
     if (count == 2) {
-      VP8LWriteBits(bw, 8, symbols[1]);
+      VP8LPutBits(bw, symbols[1], 8);
     }
   } else {
     StoreFullHuffmanCode(bw, huff_tree, tokens, huffman_code);
   }
 }
 
-static void WriteHuffmanCode(VP8LBitWriter* const bw,
+static WEBP_INLINE void WriteHuffmanCode(VP8LBitWriter* const bw,
                              const HuffmanTreeCode* const code,
                              int code_index) {
   const int depth = code->code_lengths[code_index];
   const int symbol = code->codes[code_index];
-  VP8LWriteBits(bw, depth, symbol);
+  VP8LPutBits(bw, symbol, depth);
+}
+
+static WEBP_INLINE void WriteHuffmanCodeWithExtraBits(
+    VP8LBitWriter* const bw,
+    const HuffmanTreeCode* const code,
+    int code_index,
+    int bits,
+    int n_bits) {
+  const int depth = code->code_lengths[code_index];
+  const int symbol = code->codes[code_index];
+  VP8LPutBits(bw, (bits << depth) | symbol, depth + n_bits);
 }
 
 static WebPEncodingError StoreImageToBitMask(
@@ -432,40 +681,51 @@ static WebPEncodingError StoreImageToBitMask(
     VP8LBackwardRefs* const refs,
     const uint16_t* histogram_symbols,
     const HuffmanTreeCode* const huffman_codes) {
+  const int histo_xsize = histo_bits ? VP8LSubSampleSize(width, histo_bits) : 1;
+  const int tile_mask = (histo_bits == 0) ? 0 : -(1 << histo_bits);
   // x and y trace the position in the image.
   int x = 0;
   int y = 0;
-  const int histo_xsize = histo_bits ? VP8LSubSampleSize(width, histo_bits) : 1;
+  int tile_x = x & tile_mask;
+  int tile_y = y & tile_mask;
+  int histogram_ix = histogram_symbols[0];
+  const HuffmanTreeCode* codes = huffman_codes + 5 * histogram_ix;
   VP8LRefsCursor c = VP8LRefsCursorInit(refs);
   while (VP8LRefsCursorOk(&c)) {
     const PixOrCopy* const v = c.cur_pos;
-    const int histogram_ix = histogram_symbols[histo_bits ?
-                                               (y >> histo_bits) * histo_xsize +
-                                               (x >> histo_bits) : 0];
-    const HuffmanTreeCode* const codes = huffman_codes + 5 * histogram_ix;
-    if (PixOrCopyIsCacheIdx(v)) {
-      const int code = PixOrCopyCacheIdx(v);
-      const int literal_ix = 256 + NUM_LENGTH_CODES + code;
-      WriteHuffmanCode(bw, codes, literal_ix);
-    } else if (PixOrCopyIsLiteral(v)) {
+    if ((tile_x != (x & tile_mask)) || (tile_y != (y & tile_mask))) {
+      tile_x = x & tile_mask;
+      tile_y = y & tile_mask;
+      histogram_ix = histogram_symbols[(y >> histo_bits) * histo_xsize +
+                                       (x >> histo_bits)];
+      codes = huffman_codes + 5 * histogram_ix;
+    }
+    if (PixOrCopyIsLiteral(v)) {
       static const int order[] = { 1, 2, 0, 3 };
       int k;
       for (k = 0; k < 4; ++k) {
         const int code = PixOrCopyLiteral(v, order[k]);
         WriteHuffmanCode(bw, codes + k, code);
       }
+    } else if (PixOrCopyIsCacheIdx(v)) {
+      const int code = PixOrCopyCacheIdx(v);
+      const int literal_ix = 256 + NUM_LENGTH_CODES + code;
+      WriteHuffmanCode(bw, codes, literal_ix);
     } else {
       int bits, n_bits;
-      int code, distance;
+      int code;
 
+      const int distance = PixOrCopyDistance(v);
       VP8LPrefixEncode(v->len, &code, &n_bits, &bits);
-      WriteHuffmanCode(bw, codes, 256 + code);
-      VP8LWriteBits(bw, n_bits, bits);
+      WriteHuffmanCodeWithExtraBits(bw, codes, 256 + code, bits, n_bits);
 
-      distance = PixOrCopyDistance(v);
+      // Don't merge the distance prefix code with its extra bits into a single
+      // write: the distance can need up to 18 extra bits and the prefix code up
+      // to 15 bits, totaling 33 bits, but VP8LPutBits() supports at most 32.
+      // TODO(jyrki): optimize this further.
       VP8LPrefixEncode(distance, &code, &n_bits, &bits);
       WriteHuffmanCode(bw, codes + 4, code);
-      VP8LWriteBits(bw, n_bits, bits);
+      VP8LPutBits(bw, bits, n_bits);
     }
     x += PixOrCopyLength(v);
     while (x >= width) {
@@ -491,21 +751,28 @@ static WebPEncodingError EncodeImageNoHuffman(VP8LBitWriter* const bw,
   HuffmanTreeToken* tokens = NULL;
   HuffmanTreeCode huffman_codes[5] = { { 0, NULL, NULL } };
   const uint16_t histogram_symbols[1] = { 0 };    // only one tree, one symbol
-  VP8LHistogramSet* const histogram_image = VP8LAllocateHistogramSet(1, 0);
+  int cache_bits = 0;
+  VP8LHistogramSet* histogram_image = NULL;
   HuffmanTree* const huff_tree = (HuffmanTree*)WebPSafeMalloc(
         3ULL * CODE_LENGTH_CODES, sizeof(*huff_tree));
-  if (histogram_image == NULL || huff_tree == NULL) {
+  if (huff_tree == NULL) {
     err = VP8_ENC_ERROR_OUT_OF_MEMORY;
     goto Error;
   }
 
   // Calculate backward references from ARGB image.
-  refs = VP8LGetBackwardReferences(width, height, argb, quality, 0, 1,
+  refs = VP8LGetBackwardReferences(width, height, argb, quality, 0, &cache_bits,
                                    hash_chain, refs_array);
   if (refs == NULL) {
     err = VP8_ENC_ERROR_OUT_OF_MEMORY;
     goto Error;
   }
+  histogram_image = VP8LAllocateHistogramSet(1, cache_bits);
+  if (histogram_image == NULL) {
+    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+    goto Error;
+  }
+
   // Build histogram image and symbols from backward references.
   VP8LHistogramStoreRefs(refs, histogram_image->histograms[0]);
 
@@ -517,7 +784,7 @@ static WebPEncodingError EncodeImageNoHuffman(VP8LBitWriter* const bw,
   }
 
   // No color cache, no Huffman image.
-  VP8LWriteBits(bw, 1, 0);
+  VP8LPutBits(bw, 0, 1);
 
   // Find maximum number of symbols for the huffman tree-set.
   for (i = 0; i < 5; ++i) {
@@ -557,16 +824,17 @@ static WebPEncodingError EncodeImageInternal(VP8LBitWriter* const bw,
                                              VP8LHashChain* const hash_chain,
                                              VP8LBackwardRefs refs_array[2],
                                              int width, int height, int quality,
-                                             int cache_bits,
-                                             int histogram_bits) {
+                                             int low_effort, int* cache_bits,
+                                             int histogram_bits,
+                                             size_t init_byte_position,
+                                             int* const hdr_size,
+                                             int* const data_size) {
   WebPEncodingError err = VP8_ENC_OK;
-  const int use_2d_locality = 1;
-  const int use_color_cache = (cache_bits > 0);
   const uint32_t histogram_image_xysize =
       VP8LSubSampleSize(width, histogram_bits) *
       VP8LSubSampleSize(height, histogram_bits);
-  VP8LHistogramSet* histogram_image =
-      VP8LAllocateHistogramSet(histogram_image_xysize, cache_bits);
+  VP8LHistogramSet* histogram_image = NULL;
+  VP8LHistogramSet* tmp_histos = NULL;
   int histogram_image_size = 0;
   size_t bit_array_size = 0;
   HuffmanTree* huff_tree = NULL;
@@ -579,28 +847,39 @@ static WebPEncodingError EncodeImageInternal(VP8LBitWriter* const bw,
                                 sizeof(*histogram_symbols));
   assert(histogram_bits >= MIN_HUFFMAN_BITS);
   assert(histogram_bits <= MAX_HUFFMAN_BITS);
+  assert(hdr_size != NULL);
+  assert(data_size != NULL);
 
   VP8LBackwardRefsInit(&refs, refs_array[0].block_size_);
-  if (histogram_image == NULL || histogram_symbols == NULL) {
-    VP8LFreeHistogramSet(histogram_image);
-    WebPSafeFree(histogram_symbols);
-    return 0;
+  if (histogram_symbols == NULL) {
+    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+    goto Error;
   }
 
+  *cache_bits = MAX_COLOR_CACHE_BITS;
   // 'best_refs' is the reference to the best backward refs and points to one
   // of refs_array[0] or refs_array[1].
   // Calculate backward references from ARGB image.
   best_refs = VP8LGetBackwardReferences(width, height, argb, quality,
-                                        cache_bits, use_2d_locality,
-                                        hash_chain, refs_array);
+                                        low_effort, cache_bits, hash_chain,
+                                        refs_array);
   if (best_refs == NULL || !VP8LBackwardRefsCopy(best_refs, &refs)) {
+    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
     goto Error;
   }
+  histogram_image =
+      VP8LAllocateHistogramSet(histogram_image_xysize, *cache_bits);
+  tmp_histos = VP8LAllocateHistogramSet(2, *cache_bits);
+  if (histogram_image == NULL || tmp_histos == NULL) {
+    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+    goto Error;
+  }
+
   // Build histogram image and symbols from backward references.
-  if (!VP8LGetHistoImageSymbols(width, height, &refs,
-                                quality, histogram_bits, cache_bits,
-                                histogram_image,
-                                histogram_symbols)) {
+  if (!VP8LGetHistoImageSymbols(width, height, &refs, quality, low_effort,
+                                histogram_bits, *cache_bits, histogram_image,
+                                tmp_histos, histogram_symbols)) {
+    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
     goto Error;
   }
   // Create Huffman bit lengths and codes for each histogram image.
@@ -608,41 +887,53 @@ static WebPEncodingError EncodeImageInternal(VP8LBitWriter* const bw,
   bit_array_size = 5 * histogram_image_size;
   huffman_codes = (HuffmanTreeCode*)WebPSafeCalloc(bit_array_size,
                                                    sizeof(*huffman_codes));
+  // Note: some histogram_image entries may point to tmp_histos[], so the latter
+  // need to outlive the following call to GetHuffBitLengthsAndCodes().
   if (huffman_codes == NULL ||
       !GetHuffBitLengthsAndCodes(histogram_image, huffman_codes)) {
+    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
     goto Error;
   }
   // Free combined histograms.
   VP8LFreeHistogramSet(histogram_image);
   histogram_image = NULL;
 
+  // Free scratch histograms.
+  VP8LFreeHistogramSet(tmp_histos);
+  tmp_histos = NULL;
+
   // Color Cache parameters.
-  VP8LWriteBits(bw, 1, use_color_cache);
-  if (use_color_cache) {
-    VP8LWriteBits(bw, 4, cache_bits);
+  if (*cache_bits > 0) {
+    VP8LPutBits(bw, 1, 1);
+    VP8LPutBits(bw, *cache_bits, 4);
+  } else {
+    VP8LPutBits(bw, 0, 1);
   }
 
   // Huffman image + meta huffman.
   {
     const int write_histogram_image = (histogram_image_size > 1);
-    VP8LWriteBits(bw, 1, write_histogram_image);
+    VP8LPutBits(bw, write_histogram_image, 1);
     if (write_histogram_image) {
       uint32_t* const histogram_argb =
           (uint32_t*)WebPSafeMalloc(histogram_image_xysize,
                                     sizeof(*histogram_argb));
       int max_index = 0;
       uint32_t i;
-      if (histogram_argb == NULL) goto Error;
+      if (histogram_argb == NULL) {
+        err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+        goto Error;
+      }
       for (i = 0; i < histogram_image_xysize; ++i) {
         const int symbol_index = histogram_symbols[i] & 0xffff;
-        histogram_argb[i] = 0xff000000 | (symbol_index << 8);
+        histogram_argb[i] = (symbol_index << 8);
         if (symbol_index >= max_index) {
           max_index = symbol_index + 1;
         }
       }
       histogram_image_size = max_index;
 
-      VP8LWriteBits(bw, 3, histogram_bits - 2);
+      VP8LPutBits(bw, histogram_bits - 2, 3);
       err = EncodeImageNoHuffman(bw, histogram_argb, hash_chain, refs_array,
                                  VP8LSubSampleSize(width, histogram_bits),
                                  VP8LSubSampleSize(height, histogram_bits),
@@ -658,7 +949,10 @@ static WebPEncodingError EncodeImageInternal(VP8LBitWriter* const bw,
     int max_tokens = 0;
     huff_tree = (HuffmanTree*)WebPSafeMalloc(3ULL * CODE_LENGTH_CODES,
                                              sizeof(*huff_tree));
-    if (huff_tree == NULL) goto Error;
+    if (huff_tree == NULL) {
+      err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+      goto Error;
+    }
     // Find maximum number of symbols for the huffman tree-set.
     for (i = 0; i < 5 * histogram_image_size; ++i) {
       HuffmanTreeCode* const codes = &huffman_codes[i];
@@ -668,7 +962,10 @@ static WebPEncodingError EncodeImageInternal(VP8LBitWriter* const bw,
     }
     tokens = (HuffmanTreeToken*)WebPSafeMalloc(max_tokens,
                                                sizeof(*tokens));
-    if (tokens == NULL) goto Error;
+    if (tokens == NULL) {
+      err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+      goto Error;
+    }
     for (i = 0; i < 5 * histogram_image_size; ++i) {
       HuffmanTreeCode* const codes = &huffman_codes[i];
       StoreHuffmanCode(bw, huff_tree, tokens, codes);
@@ -676,14 +973,18 @@ static WebPEncodingError EncodeImageInternal(VP8LBitWriter* const bw,
     }
   }
 
+  *hdr_size = (int)(VP8LBitWriterNumBytes(bw) - init_byte_position);
   // Store actual literals.
   err = StoreImageToBitMask(bw, width, histogram_bits, &refs,
                             histogram_symbols, huffman_codes);
+  *data_size =
+        (int)(VP8LBitWriterNumBytes(bw) - init_byte_position - *hdr_size);
 
  Error:
   WebPSafeFree(tokens);
   WebPSafeFree(huff_tree);
   VP8LFreeHistogramSet(histogram_image);
+  VP8LFreeHistogramSet(tmp_histos);
   VP8LBackwardRefsClear(&refs);
   if (huffman_codes != NULL) {
     WebPSafeFree(huffman_codes->codes);
@@ -696,59 +997,28 @@ static WebPEncodingError EncodeImageInternal(VP8LBitWriter* const bw,
 // -----------------------------------------------------------------------------
 // Transforms
 
-// Check if it would be a good idea to subtract green from red and blue. We
-// only impact entropy in red/blue components, don't bother to look at others.
-static WebPEncodingError EvalAndApplySubtractGreen(VP8LEncoder* const enc,
-                                                   int width, int height,
-                                                   VP8LBitWriter* const bw) {
-  if (!enc->use_palette_) {
-    int i;
-    const uint32_t* const argb = enc->argb_;
-    double bit_cost_before, bit_cost_after;
-    // Allocate histogram with cache_bits = 1.
-    VP8LHistogram* const histo = VP8LAllocateHistogram(1);
-    if (histo == NULL) return VP8_ENC_ERROR_OUT_OF_MEMORY;
-    for (i = 0; i < width * height; ++i) {
-      const uint32_t c = argb[i];
-      ++histo->red_[(c >> 16) & 0xff];
-      ++histo->blue_[(c >> 0) & 0xff];
-    }
-    bit_cost_before = VP8LHistogramEstimateBits(histo);
-
-    VP8LHistogramInit(histo, 1);
-    for (i = 0; i < width * height; ++i) {
-      const uint32_t c = argb[i];
-      const int green = (c >> 8) & 0xff;
-      ++histo->red_[((c >> 16) - green) & 0xff];
-      ++histo->blue_[((c >> 0) - green) & 0xff];
-    }
-    bit_cost_after = VP8LHistogramEstimateBits(histo);
-    VP8LFreeHistogram(histo);
-
-    // Check if subtracting green yields low entropy.
-    enc->use_subtract_green_ = (bit_cost_after < bit_cost_before);
-    if (enc->use_subtract_green_) {
-      VP8LWriteBits(bw, 1, TRANSFORM_PRESENT);
-      VP8LWriteBits(bw, 2, SUBTRACT_GREEN);
-      VP8LSubtractGreenFromBlueAndRed(enc->argb_, width * height);
-    }
-  }
-  return VP8_ENC_OK;
+static void ApplySubtractGreen(VP8LEncoder* const enc, int width, int height,
+                               VP8LBitWriter* const bw) {
+  VP8LPutBits(bw, TRANSFORM_PRESENT, 1);
+  VP8LPutBits(bw, SUBTRACT_GREEN, 2);
+  VP8LSubtractGreenFromBlueAndRed(enc->argb_, width * height);
 }
 
 static WebPEncodingError ApplyPredictFilter(const VP8LEncoder* const enc,
-                                            int width, int height, int quality,
+                                            int width, int height,
+                                            int quality, int low_effort,
                                             VP8LBitWriter* const bw) {
   const int pred_bits = enc->transform_bits_;
   const int transform_width = VP8LSubSampleSize(width, pred_bits);
   const int transform_height = VP8LSubSampleSize(height, pred_bits);
 
-  VP8LResidualImage(width, height, pred_bits, enc->argb_, enc->argb_scratch_,
-                    enc->transform_data_);
-  VP8LWriteBits(bw, 1, TRANSFORM_PRESENT);
-  VP8LWriteBits(bw, 2, PREDICTOR_TRANSFORM);
+  VP8LResidualImage(width, height, pred_bits, low_effort, enc->argb_,
+                    enc->argb_scratch_, enc->transform_data_,
+                    enc->config_->exact);
+  VP8LPutBits(bw, TRANSFORM_PRESENT, 1);
+  VP8LPutBits(bw, PREDICTOR_TRANSFORM, 2);
   assert(pred_bits >= 2);
-  VP8LWriteBits(bw, 3, pred_bits - 2);
+  VP8LPutBits(bw, pred_bits - 2, 3);
   return EncodeImageNoHuffman(bw, enc->transform_data_,
                               (VP8LHashChain*)&enc->hash_chain_,
                               (VP8LBackwardRefs*)enc->refs_,  // cast const away
@@ -766,10 +1036,10 @@ static WebPEncodingError ApplyCrossColorFilter(const VP8LEncoder* const enc,
 
   VP8LColorSpaceTransform(width, height, ccolor_transform_bits, quality,
                           enc->argb_, enc->transform_data_);
-  VP8LWriteBits(bw, 1, TRANSFORM_PRESENT);
-  VP8LWriteBits(bw, 2, CROSS_COLOR_TRANSFORM);
+  VP8LPutBits(bw, TRANSFORM_PRESENT, 1);
+  VP8LPutBits(bw, CROSS_COLOR_TRANSFORM, 2);
   assert(ccolor_transform_bits >= 2);
-  VP8LWriteBits(bw, 3, ccolor_transform_bits - 2);
+  VP8LPutBits(bw, ccolor_transform_bits - 2, 3);
   return EncodeImageNoHuffman(bw, enc->transform_data_,
                               (VP8LHashChain*)&enc->hash_chain_,
                               (VP8LBackwardRefs*)enc->refs_,  // cast const away
@@ -799,14 +1069,14 @@ static int WriteImageSize(const WebPPicture* const pic,
   const int height = pic->height - 1;
   assert(width < WEBP_MAX_DIMENSION && height < WEBP_MAX_DIMENSION);
 
-  VP8LWriteBits(bw, VP8L_IMAGE_SIZE_BITS, width);
-  VP8LWriteBits(bw, VP8L_IMAGE_SIZE_BITS, height);
+  VP8LPutBits(bw, width, VP8L_IMAGE_SIZE_BITS);
+  VP8LPutBits(bw, height, VP8L_IMAGE_SIZE_BITS);
   return !bw->error_;
 }
 
 static int WriteRealAlphaAndVersion(VP8LBitWriter* const bw, int has_alpha) {
-  VP8LWriteBits(bw, 1, has_alpha);
-  VP8LWriteBits(bw, VP8L_VERSION_BITS, VP8L_VERSION);
+  VP8LPutBits(bw, has_alpha, 1);
+  VP8LPutBits(bw, VP8L_VERSION, VP8L_VERSION_BITS);
   return !bw->error_;
 }
 
@@ -846,39 +1116,107 @@ static WebPEncodingError WriteImage(const WebPPicture* const pic,
 
 // Allocates the memory for argb (W x H) buffer, 2 rows of context for
 // prediction and transform data.
+// Flags influencing the memory allocated:
+//  enc->transform_bits_
+//  enc->use_predict_, enc->use_cross_color_
 static WebPEncodingError AllocateTransformBuffer(VP8LEncoder* const enc,
                                                  int width, int height) {
   WebPEncodingError err = VP8_ENC_OK;
-  const int tile_size = 1 << enc->transform_bits_;
-  const uint64_t image_size = width * height;
-  const uint64_t argb_scratch_size = tile_size * width + width;
-  const int transform_data_size =
-      VP8LSubSampleSize(width, enc->transform_bits_) *
-      VP8LSubSampleSize(height, enc->transform_bits_);
-  const uint64_t total_size =
-      image_size + argb_scratch_size + (uint64_t)transform_data_size;
-  uint32_t* mem = (uint32_t*)WebPSafeMalloc(total_size, sizeof(*mem));
-  if (mem == NULL) {
-    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
-    goto Error;
+  if (enc->argb_ == NULL) {
+    const int tile_size = 1 << enc->transform_bits_;
+    const uint64_t image_size = width * height;
+    // Ensure enough size for tiles, as well as for two scanlines and two
+    // extra pixels for CopyImageWithPrediction.
+    const uint64_t argb_scratch_size =
+        enc->use_predict_ ? tile_size * width + width + 2 : 0;
+    const int transform_data_size =
+        (enc->use_predict_ || enc->use_cross_color_)
+            ? VP8LSubSampleSize(width, enc->transform_bits_) *
+              VP8LSubSampleSize(height, enc->transform_bits_)
+            : 0;
+    const uint64_t total_size =
+        image_size + WEBP_ALIGN_CST +
+        argb_scratch_size + WEBP_ALIGN_CST +
+        (uint64_t)transform_data_size;
+    uint32_t* mem = (uint32_t*)WebPSafeMalloc(total_size, sizeof(*mem));
+    if (mem == NULL) {
+      err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+      goto Error;
+    }
+    enc->argb_ = mem;
+    mem = (uint32_t*)WEBP_ALIGN(mem + image_size);
+    enc->argb_scratch_ = mem;
+    mem = (uint32_t*)WEBP_ALIGN(mem + argb_scratch_size);
+    enc->transform_data_ = mem;
+    enc->current_width_ = width;
   }
-  enc->argb_ = mem;
-  mem += image_size;
-  enc->argb_scratch_ = mem;
-  mem += argb_scratch_size;
-  enc->transform_data_ = mem;
-  enc->current_width_ = width;
-
  Error:
   return err;
 }
 
-static void ApplyPalette(uint32_t* src, uint32_t* dst,
-                         uint32_t src_stride, uint32_t dst_stride,
-                         const uint32_t* palette, int palette_size,
-                         int width, int height, int xbits, uint8_t* row) {
+static void ClearTransformBuffer(VP8LEncoder* const enc) {
+  WebPSafeFree(enc->argb_);
+  enc->argb_ = NULL;
+}
+
+static WebPEncodingError MakeInputImageCopy(VP8LEncoder* const enc) {
+  WebPEncodingError err = VP8_ENC_OK;
+  const WebPPicture* const picture = enc->pic_;
+  const int width = picture->width;
+  const int height = picture->height;
+  int y;
+  err = AllocateTransformBuffer(enc, width, height);
+  if (err != VP8_ENC_OK) return err;
+  for (y = 0; y < height; ++y) {
+    memcpy(enc->argb_ + y * width,
+           picture->argb + y * picture->argb_stride,
+           width * sizeof(*enc->argb_));
+  }
+  assert(enc->current_width_ == width);
+  return VP8_ENC_OK;
+}
+
+// -----------------------------------------------------------------------------
+
+static void MapToPalette(const uint32_t palette[], int num_colors,
+                         uint32_t* const last_pix, int* const last_idx,
+                         const uint32_t* src, uint8_t* dst, int width) {
+  int x;
+  int prev_idx = *last_idx;
+  uint32_t prev_pix = *last_pix;
+  for (x = 0; x < width; ++x) {
+    const uint32_t pix = src[x];
+    if (pix != prev_pix) {
+      int i;
+      for (i = 0; i < num_colors; ++i) {
+        if (pix == palette[i]) {
+          prev_idx = i;
+          prev_pix = pix;
+          break;
+        }
+      }
+    }
+    dst[x] = prev_idx;
+  }
+  *last_idx = prev_idx;
+  *last_pix = prev_pix;
+}
+
+// Remap argb values in src[] to packed palette entries in dst[], using an
+// internally allocated temporary row of 'width' bytes (returns
+// VP8_ENC_ERROR_OUT_OF_MEMORY if that allocation fails). We assume that all
+// src[] values have a palette entry. Note: src[] can be the same as dst[].
+static WebPEncodingError ApplyPalette(const uint32_t* src, uint32_t src_stride,
+                                      uint32_t* dst, uint32_t dst_stride,
+                                      const uint32_t* palette, int palette_size,
+                                      int width, int height, int xbits) {
+  // TODO(skal): this tmp buffer is not needed if VP8LBundleColorMap() can be
+  // made to work in-place.
+  uint8_t* const tmp_row = (uint8_t*)WebPSafeMalloc(width, sizeof(*tmp_row));
   int i, x, y;
   int use_LUT = 1;
+
+  if (tmp_row == NULL) return VP8_ENC_ERROR_OUT_OF_MEMORY;
   for (i = 0; i < palette_size; ++i) {
     if ((palette[i] & 0xffff00ffu) != 0) {
       use_LUT = 0;
@@ -895,9 +1233,9 @@ static void ApplyPalette(uint32_t* src, uint32_t* dst,
     for (y = 0; y < height; ++y) {
       for (x = 0; x < width; ++x) {
         const int color = (src[x] >> 8) & 0xff;
-        row[x] = inv_palette[color];
+        tmp_row[x] = inv_palette[color];
       }
-      VP8LBundleColorMap(row, width, xbits, dst);
+      VP8LBundleColorMap(tmp_row, width, xbits, dst);
       src += src_stride;
       dst += dst_stride;
     }
@@ -906,41 +1244,28 @@ static void ApplyPalette(uint32_t* src, uint32_t* dst,
     uint32_t last_pix = palette[0];
     int last_idx = 0;
     for (y = 0; y < height; ++y) {
-      for (x = 0; x < width; ++x) {
-        const uint32_t pix = src[x];
-        if (pix != last_pix) {
-          for (i = 0; i < palette_size; ++i) {
-            if (pix == palette[i]) {
-              last_idx = i;
-              last_pix = pix;
-              break;
-            }
-          }
-        }
-        row[x] = last_idx;
-      }
-      VP8LBundleColorMap(row, width, xbits, dst);
+      MapToPalette(palette, palette_size, &last_pix, &last_idx,
+                   src, tmp_row, width);
+      VP8LBundleColorMap(tmp_row, width, xbits, dst);
       src += src_stride;
       dst += dst_stride;
     }
   }
+  WebPSafeFree(tmp_row);
+  return VP8_ENC_OK;
 }
 
 // Note: Expects "enc->palette_" to be set properly.
-// Also, "enc->palette_" will be modified after this call and should not be used
-// later.
-static WebPEncodingError EncodePalette(VP8LBitWriter* const bw,
-                                       VP8LEncoder* const enc, int quality) {
+static WebPEncodingError MapImageFromPalette(VP8LEncoder* const enc,
+                                             int in_place) {
   WebPEncodingError err = VP8_ENC_OK;
-  int i;
   const WebPPicture* const pic = enc->pic_;
-  uint32_t* src = pic->argb;
-  uint32_t* dst;
   const int width = pic->width;
   const int height = pic->height;
-  uint32_t* const palette = enc->palette_;
+  const uint32_t* const palette = enc->palette_;
+  const uint32_t* src = in_place ? enc->argb_ : pic->argb;
+  const int src_stride = in_place ? enc->current_width_ : pic->argb_stride;
   const int palette_size = enc->palette_size_;
-  uint8_t* row = NULL;
   int xbits;
 
   // Replace each input pixel by corresponding palette index.
@@ -952,67 +1277,74 @@ static WebPEncodingError EncodePalette(VP8LBitWriter* const bw,
   }
 
   err = AllocateTransformBuffer(enc, VP8LSubSampleSize(width, xbits), height);
-  if (err != VP8_ENC_OK) goto Error;
-  dst = enc->argb_;
-
-  row = (uint8_t*)WebPSafeMalloc(width, sizeof(*row));
-  if (row == NULL) return VP8_ENC_ERROR_OUT_OF_MEMORY;
+  if (err != VP8_ENC_OK) return err;
 
-  ApplyPalette(src, dst, pic->argb_stride, enc->current_width_,
-               palette, palette_size, width, height, xbits, row);
+  err = ApplyPalette(src, src_stride,
+                     enc->argb_, enc->current_width_,
+                     palette, palette_size, width, height, xbits);
+  return err;
+}
 
-  // Save palette to bitstream.
-  VP8LWriteBits(bw, 1, TRANSFORM_PRESENT);
-  VP8LWriteBits(bw, 2, COLOR_INDEXING_TRANSFORM);
-  assert(palette_size >= 1);
-  VP8LWriteBits(bw, 8, palette_size - 1);
+// Save palette_[] to bitstream.
+static WebPEncodingError EncodePalette(VP8LBitWriter* const bw,
+                                       VP8LEncoder* const enc) {
+  int i;
+  uint32_t tmp_palette[MAX_PALETTE_SIZE];
+  const int palette_size = enc->palette_size_;
+  const uint32_t* const palette = enc->palette_;
+  VP8LPutBits(bw, TRANSFORM_PRESENT, 1);
+  VP8LPutBits(bw, COLOR_INDEXING_TRANSFORM, 2);
+  assert(palette_size >= 1 && palette_size <= MAX_PALETTE_SIZE);
+  VP8LPutBits(bw, palette_size - 1, 8);
   for (i = palette_size - 1; i >= 1; --i) {
-    palette[i] = VP8LSubPixels(palette[i], palette[i - 1]);
+    tmp_palette[i] = VP8LSubPixels(palette[i], palette[i - 1]);
   }
-  err = EncodeImageNoHuffman(bw, palette, &enc->hash_chain_, enc->refs_,
-                             palette_size, 1, quality);
-
- Error:
-  WebPSafeFree(row);
-  return err;
+  tmp_palette[0] = palette[0];
+  return EncodeImageNoHuffman(bw, tmp_palette, &enc->hash_chain_, enc->refs_,
+                              palette_size, 1, 20 /* quality */);
 }
 
-// -----------------------------------------------------------------------------
+#ifdef WEBP_EXPERIMENTAL_FEATURES
 
-static int GetHistoBits(int method, int use_palette, int width, int height) {
-  const int hist_size = VP8LGetHistogramSize(MAX_COLOR_CACHE_BITS);
-  // Make tile size a function of encoding method (Range: 0 to 6).
-  int histo_bits = (use_palette ? 9 : 7) - method;
-  while (1) {
-    const int huff_image_size = VP8LSubSampleSize(width, histo_bits) *
-                                VP8LSubSampleSize(height, histo_bits);
-    if ((uint64_t)huff_image_size * hist_size <= MAX_HUFF_IMAGE_SIZE) break;
-    ++histo_bits;
-  }
-  return (histo_bits < MIN_HUFFMAN_BITS) ? MIN_HUFFMAN_BITS :
-         (histo_bits > MAX_HUFFMAN_BITS) ? MAX_HUFFMAN_BITS : histo_bits;
-}
+static WebPEncodingError EncodeDeltaPalettePredictorImage(
+    VP8LBitWriter* const bw, VP8LEncoder* const enc, int quality) {
+  const WebPPicture* const pic = enc->pic_;
+  const int width = pic->width;
+  const int height = pic->height;
 
-static int GetTransformBits(int method, int histo_bits) {
-  const int max_transform_bits = (method < 4) ? 6 : (method > 4) ? 4 : 5;
-  return (histo_bits > max_transform_bits) ? max_transform_bits : histo_bits;
-}
+  const int pred_bits = 5;
+  const int transform_width = VP8LSubSampleSize(width, pred_bits);
+  const int transform_height = VP8LSubSampleSize(height, pred_bits);
+  const int pred = 7;   // default is Predictor7 (Top/Left Average)
+  const int tiles_per_row = VP8LSubSampleSize(width, pred_bits);
+  const int tiles_per_col = VP8LSubSampleSize(height, pred_bits);
+  uint32_t* predictors;
+  int tile_x, tile_y;
+  WebPEncodingError err = VP8_ENC_OK;
 
-static int GetCacheBits(float quality) {
-  return (quality <= 25.f) ? 0 : 7;
-}
+  predictors = (uint32_t*)WebPSafeMalloc(tiles_per_col * tiles_per_row,
+                                         sizeof(*predictors));
+  if (predictors == NULL) return VP8_ENC_ERROR_OUT_OF_MEMORY;
 
-static void FinishEncParams(VP8LEncoder* const enc) {
-  const WebPConfig* const config = enc->config_;
-  const WebPPicture* const pic = enc->pic_;
-  const int method = config->method;
-  const float quality = config->quality;
-  const int use_palette = enc->use_palette_;
-  enc->histo_bits_ = GetHistoBits(method, use_palette, pic->width, pic->height);
-  enc->transform_bits_ = GetTransformBits(method, enc->histo_bits_);
-  enc->cache_bits_ = GetCacheBits(quality);
+  for (tile_y = 0; tile_y < tiles_per_col; ++tile_y) {
+    for (tile_x = 0; tile_x < tiles_per_row; ++tile_x) {
+      predictors[tile_y * tiles_per_row + tile_x] = 0xff000000u | (pred << 8);
+    }
+  }
+
+  VP8LPutBits(bw, TRANSFORM_PRESENT, 1);
+  VP8LPutBits(bw, PREDICTOR_TRANSFORM, 2);
+  VP8LPutBits(bw, pred_bits - 2, 3);
+  err = EncodeImageNoHuffman(bw, predictors, &enc->hash_chain_,
+                             (VP8LBackwardRefs*)enc->refs_,  // cast const away
+                             transform_width, transform_height,
+                             quality);
+  WebPSafeFree(predictors);
+  return err;
 }
 
+#endif // WEBP_EXPERIMENTAL_FEATURES
+
 // -----------------------------------------------------------------------------
 // VP8LEncoder
 
@@ -1026,7 +1358,7 @@ static VP8LEncoder* VP8LEncoderNew(const WebPConfig* const config,
   enc->config_ = config;
   enc->pic_ = picture;
 
-  VP8LDspInit();
+  VP8LEncDspInit();
 
   return enc;
 }
@@ -1036,7 +1368,7 @@ static void VP8LEncoderDelete(VP8LEncoder* enc) {
     VP8LHashChainClear(&enc->hash_chain_);
     VP8LBackwardRefsClear(&enc->refs_[0]);
     VP8LBackwardRefsClear(&enc->refs_[1]);
-    WebPSafeFree(enc->argb_);
+    ClearTransformBuffer(enc);
     WebPSafeFree(enc);
   }
 }
@@ -1049,10 +1381,15 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
                                    VP8LBitWriter* const bw) {
   WebPEncodingError err = VP8_ENC_OK;
   const int quality = (int)config->quality;
+  const int low_effort = (config->method == 0);
   const int width = picture->width;
   const int height = picture->height;
   VP8LEncoder* const enc = VP8LEncoderNew(config, picture);
   const size_t byte_position = VP8LBitWriterNumBytes(bw);
+  int use_near_lossless = 0;
+  int hdr_size = 0;
+  int data_size = 0;
+  int use_delta_palettization = 0;
 
   if (enc == NULL) {
     err = VP8_ENC_ERROR_OUT_OF_MEMORY;
@@ -1062,70 +1399,83 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
   // ---------------------------------------------------------------------------
   // Analyze image (entropy, num_palettes etc)
 
-  if (!AnalyzeAndInit(enc, config->image_hint)) {
+  if (!AnalyzeAndInit(enc)) {
     err = VP8_ENC_ERROR_OUT_OF_MEMORY;
     goto Error;
   }
 
-  FinishEncParams(enc);
-
-  if (enc->use_palette_) {
-    err = EncodePalette(bw, enc, quality);
-    if (err != VP8_ENC_OK) goto Error;
-    // Color cache is disabled for palette.
-    enc->cache_bits_ = 0;
+  // Apply near-lossless preprocessing.
+  use_near_lossless = !enc->use_palette_ && (config->near_lossless < 100);
+  if (use_near_lossless) {
+    if (!VP8ApplyNearLossless(width, height, picture->argb,
+                              config->near_lossless)) {
+      err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+      goto Error;
+    }
   }
 
-  // In case image is not packed.
-  if (enc->argb_ == NULL) {
-    int y;
-    err = AllocateTransformBuffer(enc, width, height);
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+  if (config->delta_palettization) {
+    enc->use_predict_ = 1;
+    enc->use_cross_color_ = 0;
+    enc->use_subtract_green_ = 0;
+    enc->use_palette_ = 1;
+    err = MakeInputImageCopy(enc);
     if (err != VP8_ENC_OK) goto Error;
-    assert(enc->argb_ != NULL);
-    for (y = 0; y < height; ++y) {
-      memcpy(enc->argb_ + y * width,
-             picture->argb + y * picture->argb_stride,
-             width * sizeof(*enc->argb_));
+    err = WebPSearchOptimalDeltaPalette(enc);
+    if (err != VP8_ENC_OK) goto Error;
+    if (enc->use_palette_) {
+      err = AllocateTransformBuffer(enc, width, height);
+      if (err != VP8_ENC_OK) goto Error;
+      err = EncodeDeltaPalettePredictorImage(bw, enc, quality);
+      if (err != VP8_ENC_OK) goto Error;
+      use_delta_palettization = 1;
     }
-    enc->current_width_ = width;
   }
+#endif  // WEBP_EXPERIMENTAL_FEATURES
 
-  // ---------------------------------------------------------------------------
-  // Apply transforms and write transform data.
-
-  err = EvalAndApplySubtractGreen(enc, enc->current_width_, height, bw);
-  if (err != VP8_ENC_OK) goto Error;
-
-  if (enc->use_predict_) {
-    err = ApplyPredictFilter(enc, enc->current_width_, height, quality, bw);
+  // Encode palette
+  if (enc->use_palette_) {
+    err = EncodePalette(bw, enc);
     if (err != VP8_ENC_OK) goto Error;
-  }
-
-  if (enc->use_cross_color_) {
-    err = ApplyCrossColorFilter(enc, enc->current_width_, height, quality, bw);
+    err = MapImageFromPalette(enc, use_delta_palettization);
     if (err != VP8_ENC_OK) goto Error;
   }
+  if (!use_delta_palettization) {
+    // In case image is not packed.
+    if (enc->argb_ == NULL) {
+      err = MakeInputImageCopy(enc);
+      if (err != VP8_ENC_OK) goto Error;
+    }
 
-  VP8LWriteBits(bw, 1, !TRANSFORM_PRESENT);  // No more transforms.
+    // -------------------------------------------------------------------------
+    // Apply transforms and write transform data.
 
-  // ---------------------------------------------------------------------------
-  // Estimate the color cache size.
+    if (enc->use_subtract_green_) {
+      ApplySubtractGreen(enc, enc->current_width_, height, bw);
+    }
 
-  if (enc->cache_bits_ > 0) {
-    if (!VP8LCalculateEstimateForCacheSize(enc->argb_, enc->current_width_,
-                                           height, quality, &enc->hash_chain_,
-                                           &enc->refs_[0], &enc->cache_bits_)) {
-      err = VP8_ENC_ERROR_OUT_OF_MEMORY;
-      goto Error;
+    if (enc->use_predict_) {
+      err = ApplyPredictFilter(enc, enc->current_width_, height, quality,
+                               low_effort, bw);
+      if (err != VP8_ENC_OK) goto Error;
+    }
+
+    if (enc->use_cross_color_) {
+      err = ApplyCrossColorFilter(enc, enc->current_width_,
+                                  height, quality, bw);
+      if (err != VP8_ENC_OK) goto Error;
     }
   }
 
+  VP8LPutBits(bw, !TRANSFORM_PRESENT, 1);  // No more transforms.
+
   // ---------------------------------------------------------------------------
   // Encode and write the transformed image.
-
   err = EncodeImageInternal(bw, enc->argb_, &enc->hash_chain_, enc->refs_,
-                            enc->current_width_, height, quality,
-                            enc->cache_bits_, enc->histo_bits_);
+                            enc->current_width_, height, quality, low_effort,
+                            &enc->cache_bits_, enc->histo_bits_, byte_position,
+                            &hdr_size, &data_size);
   if (err != VP8_ENC_OK) goto Error;
 
   if (picture->stats != NULL) {
@@ -1140,6 +1490,8 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
     stats->cache_bits = enc->cache_bits_;
     stats->palette_size = enc->palette_size_;
     stats->lossless_size = (int)(VP8LBitWriterNumBytes(bw) - byte_position);
+    stats->lossless_hdr_size = hdr_size;
+    stats->lossless_data_size = data_size;
   }
 
  Error:
@@ -1170,7 +1522,7 @@ int VP8LEncodeImage(const WebPConfig* const config,
   // Initialize BitWriter with size corresponding to 16 bpp to photo images and
   // 8 bpp for graphical images.
   initial_size = (config->image_hint == WEBP_HINT_GRAPH) ?
-                 width * height : width * height * 2;
+      width * height : width * height * 2;
   if (!VP8LBitWriterInit(&bw, initial_size)) {
     err = VP8_ENC_ERROR_OUT_OF_MEMORY;
     goto Error;
@@ -1234,7 +1586,7 @@ int VP8LEncodeImage(const WebPConfig* const config,
 
  Error:
   if (bw.error_) err = VP8_ENC_ERROR_OUT_OF_MEMORY;
-  VP8LBitWriterDestroy(&bw);
+  VP8LBitWriterWipeOut(&bw);
   if (err != VP8_ENC_OK) {
     WebPEncodingSetError(picture, err);
     return 0;
diff --git a/src/3rdparty/libwebp/src/enc/webpenc.c b/src/3rdparty/libwebp/src/enc/webpenc.c
index ca85e0b..fece736 100644
--- a/src/3rdparty/libwebp/src/enc/webpenc.c
+++ b/src/3rdparty/libwebp/src/enc/webpenc.c
@@ -16,9 +16,9 @@
 #include <string.h>
 #include <math.h>
 
+#include "./cost.h"
 #include "./vp8enci.h"
 #include "./vp8li.h"
-#include "./cost.h"
 #include "../utils/utils.h"
 
 // #define PRINT_MEMORY_INFO
@@ -38,14 +38,14 @@ int WebPGetEncoderVersion(void) {
 //------------------------------------------------------------------------------
 
 static void ResetSegmentHeader(VP8Encoder* const enc) {
-  VP8SegmentHeader* const hdr = &enc->segment_hdr_;
+  VP8EncSegmentHeader* const hdr = &enc->segment_hdr_;
   hdr->num_segments_ = enc->config_->segments;
   hdr->update_map_  = (hdr->num_segments_ > 1);
   hdr->size_ = 0;
 }
 
 static void ResetFilterHeader(VP8Encoder* const enc) {
-  VP8FilterHeader* const hdr = &enc->filter_hdr_;
+  VP8EncFilterHeader* const hdr = &enc->filter_hdr_;
   hdr->simple_ = 1;
   hdr->level_ = 0;
   hdr->sharpness_ = 0;
@@ -79,7 +79,9 @@ static void ResetBoundaryPredictions(VP8Encoder* const enc) {
 //-------------------+---+---+---+---+---+---+---+
 // basic rd-opt      |   |   |   | x | x | x | x |
 //-------------------+---+---+---+---+---+---+---+
-// disto-score i4/16 |   |   | x |   |   |   |   |
+// disto-refine i4/16| x | x | x |   |   |   |   |
+//-------------------+---+---+---+---+---+---+---+
+// disto-refine uv   |   | x | x |   |   |   |   |
 //-------------------+---+---+---+---+---+---+---+
 // rd-opt i4/16      |   |   | ~ | x | x | x | x |
 //-------------------+---+---+---+---+---+---+---+
@@ -131,35 +133,36 @@ static void MapConfigToTools(VP8Encoder* const enc) {
 //       VP8EncIterator: 3360
 //         VP8ModeScore: 872
 //       VP8SegmentInfo: 732
-//             VP8Proba: 18352
+//          VP8EncProba: 18352
 //              LFStats: 2048
 // Picture size (yuv): 419328
 
 static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
                                   WebPPicture* const picture) {
+  VP8Encoder* enc;
   const int use_filter =
       (config->filter_strength > 0) || (config->autofilter > 0);
   const int mb_w = (picture->width + 15) >> 4;
   const int mb_h = (picture->height + 15) >> 4;
   const int preds_w = 4 * mb_w + 1;
   const int preds_h = 4 * mb_h + 1;
-  const size_t preds_size = preds_w * preds_h * sizeof(uint8_t);
+  const size_t preds_size = preds_w * preds_h * sizeof(*enc->preds_);
   const int top_stride = mb_w * 16;
-  const size_t nz_size = (mb_w + 1) * sizeof(uint32_t) + ALIGN_CST;
-  const size_t info_size = mb_w * mb_h * sizeof(VP8MBInfo);
-  const size_t samples_size = 2 * top_stride * sizeof(uint8_t)  // top-luma/u/v
-                            + ALIGN_CST;                        // align all
+  const size_t nz_size = (mb_w + 1) * sizeof(*enc->nz_) + WEBP_ALIGN_CST;
+  const size_t info_size = mb_w * mb_h * sizeof(*enc->mb_info_);
+  const size_t samples_size =
+      2 * top_stride * sizeof(*enc->y_top_)  // top-luma/u/v
+      + WEBP_ALIGN_CST;                      // align all
   const size_t lf_stats_size =
-      config->autofilter ? sizeof(LFStats) + ALIGN_CST : 0;
-  VP8Encoder* enc;
+      config->autofilter ? sizeof(*enc->lf_stats_) + WEBP_ALIGN_CST : 0;
   uint8_t* mem;
-  const uint64_t size = (uint64_t)sizeof(VP8Encoder)   // main struct
-                      + ALIGN_CST                      // cache alignment
-                      + info_size                      // modes info
-                      + preds_size                     // prediction modes
-                      + samples_size                   // top/left samples
-                      + nz_size                        // coeff context bits
-                      + lf_stats_size;                 // autofilter stats
+  const uint64_t size = (uint64_t)sizeof(*enc)   // main struct
+                      + WEBP_ALIGN_CST           // cache alignment
+                      + info_size                // modes info
+                      + preds_size               // prediction modes
+                      + samples_size             // top/left samples
+                      + nz_size                  // coeff context bits
+                      + lf_stats_size;           // autofilter stats
 
 #ifdef PRINT_MEMORY_INFO
   printf("===================================\n");
@@ -171,16 +174,16 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
          "            non-zero: %ld\n"
          "            lf-stats: %ld\n"
          "               total: %ld\n",
-         sizeof(VP8Encoder) + ALIGN_CST, info_size,
+         sizeof(*enc) + WEBP_ALIGN_CST, info_size,
          preds_size, samples_size, nz_size, lf_stats_size, size);
   printf("Transient object sizes:\n"
          "      VP8EncIterator: %ld\n"
          "        VP8ModeScore: %ld\n"
          "      VP8SegmentInfo: %ld\n"
-         "            VP8Proba: %ld\n"
+         "         VP8EncProba: %ld\n"
          "             LFStats: %ld\n",
          sizeof(VP8EncIterator), sizeof(VP8ModeScore),
-         sizeof(VP8SegmentInfo), sizeof(VP8Proba),
+         sizeof(VP8SegmentInfo), sizeof(VP8EncProba),
          sizeof(LFStats));
   printf("Picture size (yuv): %ld\n",
          mb_w * mb_h * 384 * sizeof(uint8_t));
@@ -192,7 +195,7 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
     return NULL;
   }
   enc = (VP8Encoder*)mem;
-  mem = (uint8_t*)DO_ALIGN(mem + sizeof(*enc));
+  mem = (uint8_t*)WEBP_ALIGN(mem + sizeof(*enc));
   memset(enc, 0, sizeof(*enc));
   enc->num_parts_ = 1 << config->partitions;
   enc->mb_w_ = mb_w;
@@ -201,14 +204,14 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
   enc->mb_info_ = (VP8MBInfo*)mem;
   mem += info_size;
   enc->preds_ = ((uint8_t*)mem) + 1 + enc->preds_w_;
-  mem += preds_w * preds_h * sizeof(uint8_t);
-  enc->nz_ = 1 + (uint32_t*)DO_ALIGN(mem);
+  mem += preds_size;
+  enc->nz_ = 1 + (uint32_t*)WEBP_ALIGN(mem);
   mem += nz_size;
-  enc->lf_stats_ = lf_stats_size ? (LFStats*)DO_ALIGN(mem) : NULL;
+  enc->lf_stats_ = lf_stats_size ? (LFStats*)WEBP_ALIGN(mem) : NULL;
   mem += lf_stats_size;
 
   // top samples (all 16-aligned)
-  mem = (uint8_t*)DO_ALIGN(mem);
+  mem = (uint8_t*)WEBP_ALIGN(mem);
   enc->y_top_ = (uint8_t*)mem;
   enc->uv_top_ = enc->y_top_ + top_stride;
   mem += 2 * top_stride;
@@ -225,8 +228,7 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
   ResetSegmentHeader(enc);
   ResetFilterHeader(enc);
   ResetBoundaryPredictions(enc);
-  VP8GetResidualCostInit();
-  VP8SetResidualCoeffsInit();
+  VP8EncDspCostInit();
   VP8EncInitAlpha(enc);
 
   // lower quality means smaller output -> we modulate a little the page
@@ -326,14 +328,17 @@ int WebPEncode(const WebPConfig* config, WebPPicture* pic) {
 
   if (!config->lossless) {
     VP8Encoder* enc = NULL;
+
+    if (!config->exact) {
+      WebPCleanupTransparentArea(pic);
+    }
+
     if (pic->use_argb || pic->y == NULL || pic->u == NULL || pic->v == NULL) {
       // Make sure we have YUVA samples.
       if (config->preprocessing & 4) {
-#if WEBP_ENCODER_ABI_VERSION > 0x0204
         if (!WebPPictureSmartARGBToYUVA(pic)) {
           return 0;
         }
-#endif
       } else {
         float dithering = 0.f;
         if (config->preprocessing & 2) {
@@ -375,6 +380,10 @@ int WebPEncode(const WebPConfig* config, WebPPicture* pic) {
       return 0;
     }
 
+    if (!config->exact) {
+      WebPCleanupTransparentAreaLossless(pic);
+    }
+
     ok = VP8LEncodeImage(config, pic);  // Sets pic->error in case of problem.
   }