libwebp: update to 0.5.0

This commit imports libwebp 0.5.0, including the AUTHORS, COPYING, ChangeLog, NEWS, PATENTS and README files and the src directory. In src, only header and source files are included. The patches required to build it in Qt will follow in separate commit(s). Change-Id: I96b4961ba63c75cc7fbab158c36a0f403f254c14 Reviewed-by: aavit <eirik.aavitsland@theqtcompany.com>
author: Liang Qi <liang.qi@theqtcompany.com> 2016-03-09 10:22:13 +0100
committer: Liang Qi <liang.qi@theqtcompany.com> 2016-03-11 20:05:19 +0000
commit: b114e552211456fbde3ff6ca2da21cbc8d1e90e2 (patch)
tree: 9c033ea7bcc9cc7314eaa8aff57356b2ae301257 /src/3rdparty/libwebp/src/enc
parent: 1d4f24820c0fff474d524e006d715e13e409a4b8 (diff)
27 files changed, 3040 insertions, 2011 deletions
diff --git a/src/3rdparty/libwebp/src/enc/alpha.c b/src/3rdparty/libwebp/src/enc/alpha.c
index 79cb94d..3c970b0 100644
--- a/src/3rdparty/libwebp/src/enc/alpha.c
+++ b/src/3rdparty/libwebp/src/enc/alpha.c
@@ -15,6 +15,7 @@
 #include <stdlib.h>
 
 #include "./vp8enci.h"
+#include "../dsp/dsp.h"
 #include "../utils/filters.h"
 #include "../utils/quant_levels.h"
 #include "../utils/utils.h"
@@ -61,21 +62,16 @@ static int EncodeLossless(const uint8_t* const data, int width, int height,
   if (!WebPPictureAlloc(&picture)) return 0;
 
   // Transfer the alpha values to the green channel.
-  {
-    int i, j;
-    uint32_t* dst = picture.argb;
-    const uint8_t* src = data;
-    for (j = 0; j < picture.height; ++j) {
-      for (i = 0; i < picture.width; ++i) {
-        dst[i] = src[i] << 8;  // we leave A/R/B channels zero'd.
-      }
-      src += width;
-      dst += picture.argb_stride;
-    }
-  }
+  WebPDispatchAlphaToGreen(data, width, picture.width, picture.height,
+                           picture.argb, picture.argb_stride);
 
   WebPConfigInit(&config);
   config.lossless = 1;
+  // Enable exact, or it would alter RGB values of transparent alpha, which is
+  // normally OK but not here since we are not encoding the input image but an
+  // internal encoding-related image containing necessary exact information in
+  // RGB channels.
+  config.exact = 1;
   config.method = effort_level;  // impact is very small
   // Set a low default quality for encoding alpha. Ensure that Alpha quality at
   // lower methods (3 and below) is less than the threshold for triggering
@@ -87,11 +83,10 @@ static int EncodeLossless(const uint8_t* const data, int width, int height,
   WebPPictureFree(&picture);
   ok = ok && !bw->error_;
   if (!ok) {
-    VP8LBitWriterDestroy(bw);
+    VP8LBitWriterWipeOut(bw);
     return 0;
   }
   return 1;
-
 }
 
 // -----------------------------------------------------------------------------
@@ -143,10 +138,10 @@ static int EncodeAlphaInternal(const uint8_t* const data, int width, int height,
       if (output_size > data_size) {
         // compressed size is larger than source! Revert to uncompressed mode.
         method = ALPHA_NO_COMPRESSION;
-        VP8LBitWriterDestroy(&tmp_bw);
+        VP8LBitWriterWipeOut(&tmp_bw);
       }
     } else {
-      VP8LBitWriterDestroy(&tmp_bw);
+      VP8LBitWriterWipeOut(&tmp_bw);
       return 0;
     }
   }
@@ -166,7 +161,7 @@ static int EncodeAlphaInternal(const uint8_t* const data, int width, int height,
   ok = ok && VP8BitWriterAppend(&result->bw, output, output_size);
 
   if (method != ALPHA_NO_COMPRESSION) {
-    VP8LBitWriterDestroy(&tmp_bw);
+    VP8LBitWriterWipeOut(&tmp_bw);
   }
   ok = ok && !result->bw.error_;
   result->score = VP8BitWriterSize(&result->bw);
@@ -175,16 +170,6 @@ static int EncodeAlphaInternal(const uint8_t* const data, int width, int height,
 
 // -----------------------------------------------------------------------------
 
-// TODO(skal): move to dsp/ ?
-static void CopyPlane(const uint8_t* src, int src_stride,
-                      uint8_t* dst, int dst_stride, int width, int height) {
-  while (height-- > 0) {
-    memcpy(dst, src, width);
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
 static int GetNumColors(const uint8_t* data, int width, int height,
                         int stride) {
   int j;
@@ -218,8 +203,9 @@ static uint32_t GetFilterMap(const uint8_t* alpha, int width, int height,
     const int kMaxColorsForFilterNone = 192;
     const int num_colors = GetNumColors(alpha, width, height, width);
     // For low number of colors, NONE yields better compression.
-    filter = (num_colors <= kMinColorsForFilterNone) ? WEBP_FILTER_NONE :
-             EstimateBestFilter(alpha, width, height, width);
+    filter = (num_colors <= kMinColorsForFilterNone)
+        ? WEBP_FILTER_NONE
+        : WebPEstimateBestFilter(alpha, width, height, width);
     bit_map |= 1 << filter;
     // For large number of colors, try FILTER_NONE in addition to the best
     // filter as well.
@@ -250,6 +236,7 @@ static int ApplyFiltersAndEncode(const uint8_t* alpha, int width, int height,
   uint32_t try_map =
       GetFilterMap(alpha, width, height, filter, effort_level);
   InitFilterTrial(&best);
+
   if (try_map != FILTER_TRY_NONE) {
     uint8_t* filtered_alpha =  (uint8_t*)WebPSafeMalloc(1ULL, data_size);
     if (filtered_alpha == NULL) return 0;
@@ -274,7 +261,16 @@ static int ApplyFiltersAndEncode(const uint8_t* alpha, int width, int height,
                              reduce_levels, effort_level, NULL, &best);
   }
   if (ok) {
-    if (stats != NULL) *stats = best.stats;
+    if (stats != NULL) {
+      stats->lossless_features = best.stats.lossless_features;
+      stats->histogram_bits = best.stats.histogram_bits;
+      stats->transform_bits = best.stats.transform_bits;
+      stats->cache_bits = best.stats.cache_bits;
+      stats->palette_size = best.stats.palette_size;
+      stats->lossless_size = best.stats.lossless_size;
+      stats->lossless_hdr_size = best.stats.lossless_hdr_size;
+      stats->lossless_data_size = best.stats.lossless_data_size;
+    }
     *output_size = VP8BitWriterSize(&best.bw);
     *output = VP8BitWriterBuf(&best.bw);
   } else {
@@ -324,7 +320,7 @@ static int EncodeAlpha(VP8Encoder* const enc,
   }
 
   // Extract alpha data (width x height) from raw_data (stride x height).
-  CopyPlane(pic->a, pic->a_stride, quant_alpha, width, width, height);
+  WebPCopyPlane(pic->a, pic->a_stride, quant_alpha, width, width, height);
 
   if (reduce_levels) {  // No Quantization required for 'quality = 100'.
     // 16 alpha levels gives quite a low MSE w.r.t original alpha plane hence
@@ -336,6 +332,7 @@ static int EncodeAlpha(VP8Encoder* const enc,
   }
 
   if (ok) {
+    VP8FiltersInit();
     ok = ApplyFiltersAndEncode(quant_alpha, width, height, data_size, method,
                                filter, reduce_levels, effort_level, output,
                                output_size, pic->stats);
@@ -376,6 +373,7 @@ static int CompressAlphaJob(VP8Encoder* const enc, void* dummy) {
 }
 
 void VP8EncInitAlpha(VP8Encoder* const enc) {
+  WebPInitAlphaProcessing();
   enc->has_alpha_ = WebPPictureHasTransparency(enc->pic_);
   enc->alpha_data_ = NULL;
   enc->alpha_data_size_ = 0;
@@ -430,4 +428,3 @@ int VP8EncDeleteAlpha(VP8Encoder* const enc) {
   enc->has_alpha_ = 0;
   return ok;
 }
-
diff --git a/src/3rdparty/libwebp/src/enc/analysis.c b/src/3rdparty/libwebp/src/enc/analysis.c
index e019465..b55128f 100644
--- a/src/3rdparty/libwebp/src/enc/analysis.c
+++ b/src/3rdparty/libwebp/src/enc/analysis.c
@@ -111,28 +111,28 @@ static int FinalAlphaValue(int alpha) {
 }
 
 static int GetAlpha(const VP8Histogram* const histo) {
-  int max_value = 0, last_non_zero = 1;
-  int k;
-  int alpha;
-  for (k = 0; k <= MAX_COEFF_THRESH; ++k) {
-    const int value = histo->distribution[k];
-    if (value > 0) {
-      if (value > max_value) max_value = value;
-      last_non_zero = k;
-    }
-  }
   // 'alpha' will later be clipped to [0..MAX_ALPHA] range, clamping outer
   // values which happen to be mostly noise. This leaves the maximum precision
   // for handling the useful small values which contribute most.
-  alpha = (max_value > 1) ? ALPHA_SCALE * last_non_zero / max_value : 0;
+  const int max_value = histo->max_value;
+  const int last_non_zero = histo->last_non_zero;
+  const int alpha =
+      (max_value > 1) ? ALPHA_SCALE * last_non_zero / max_value : 0;
   return alpha;
 }
 
+static void InitHistogram(VP8Histogram* const histo) {
+  histo->max_value = 0;
+  histo->last_non_zero = 1;
+}
+
 static void MergeHistograms(const VP8Histogram* const in,
                             VP8Histogram* const out) {
-  int i;
-  for (i = 0; i <= MAX_COEFF_THRESH; ++i) {
-    out->distribution[i] += in->distribution[i];
+  if (in->max_value > out->max_value) {
+    out->max_value = in->max_value;
+  }
+  if (in->last_non_zero > out->last_non_zero) {
+    out->last_non_zero = in->last_non_zero;
   }
 }
 
@@ -245,10 +245,11 @@ static int MBAnalyzeBestIntra16Mode(VP8EncIterator* const it) {
 
   VP8MakeLuma16Preds(it);
   for (mode = 0; mode < max_mode; ++mode) {
-    VP8Histogram histo = { { 0 } };
+    VP8Histogram histo;
     int alpha;
 
-    VP8CollectHistogram(it->yuv_in_ + Y_OFF,
+    InitHistogram(&histo);
+    VP8CollectHistogram(it->yuv_in_ + Y_OFF_ENC,
                         it->yuv_p_ + VP8I16ModeOffsets[mode],
                         0, 16, &histo);
     alpha = GetAlpha(&histo);
@@ -266,21 +267,22 @@ static int MBAnalyzeBestIntra4Mode(VP8EncIterator* const it,
   uint8_t modes[16];
   const int max_mode = MAX_INTRA4_MODE;
   int i4_alpha;
-  VP8Histogram total_histo = { { 0 } };
+  VP8Histogram total_histo;
   int cur_histo = 0;
+  InitHistogram(&total_histo);
 
   VP8IteratorStartI4(it);
   do {
     int mode;
     int best_mode_alpha = DEFAULT_ALPHA;
     VP8Histogram histos[2];
-    const uint8_t* const src = it->yuv_in_ + Y_OFF + VP8Scan[it->i4_];
+    const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
 
     VP8MakeIntra4Preds(it);
     for (mode = 0; mode < max_mode; ++mode) {
       int alpha;
 
-      memset(&histos[cur_histo], 0, sizeof(histos[cur_histo]));
+      InitHistogram(&histos[cur_histo]);
       VP8CollectHistogram(src, it->yuv_p_ + VP8I4ModeOffsets[mode],
                           0, 1, &histos[cur_histo]);
       alpha = GetAlpha(&histos[cur_histo]);
@@ -293,7 +295,7 @@ static int MBAnalyzeBestIntra4Mode(VP8EncIterator* const it,
     // accumulate best histogram
     MergeHistograms(&histos[cur_histo ^ 1], &total_histo);
     // Note: we reuse the original samples for predictors
-  } while (VP8IteratorRotateI4(it, it->yuv_in_ + Y_OFF));
+  } while (VP8IteratorRotateI4(it, it->yuv_in_ + Y_OFF_ENC));
 
   i4_alpha = GetAlpha(&total_histo);
   if (IS_BETTER_ALPHA(i4_alpha, best_alpha)) {
@@ -311,9 +313,10 @@ static int MBAnalyzeBestUVMode(VP8EncIterator* const it) {
 
   VP8MakeChroma8Preds(it);
   for (mode = 0; mode < max_mode; ++mode) {
-    VP8Histogram histo = { { 0 } };
+    VP8Histogram histo;
     int alpha;
-    VP8CollectHistogram(it->yuv_in_ + U_OFF,
+    InitHistogram(&histo);
+    VP8CollectHistogram(it->yuv_in_ + U_OFF_ENC,
                         it->yuv_p_ + VP8UVModeOffsets[mode],
                         16, 16 + 4 + 4, &histo);
     alpha = GetAlpha(&histo);
@@ -402,8 +405,8 @@ typedef struct {
 static int DoSegmentsJob(SegmentJob* const job, VP8EncIterator* const it) {
   int ok = 1;
   if (!VP8IteratorIsDone(it)) {
-    uint8_t tmp[32 + ALIGN_CST];
-    uint8_t* const scratch = (uint8_t*)DO_ALIGN(tmp);
+    uint8_t tmp[32 + WEBP_ALIGN_CST];
+    uint8_t* const scratch = (uint8_t*)WEBP_ALIGN(tmp);
     do {
       // Let's pretend we have perfect lossless reconstruction.
       VP8IteratorImport(it, scratch);
diff --git a/src/3rdparty/libwebp/src/enc/backward_references.c b/src/3rdparty/libwebp/src/enc/backward_references.c
index a3c30aa..c39437d 100644
--- a/src/3rdparty/libwebp/src/enc/backward_references.c
+++ b/src/3rdparty/libwebp/src/enc/backward_references.c
@@ -16,13 +16,12 @@
 #include "./backward_references.h"
 #include "./histogram.h"
 #include "../dsp/lossless.h"
+#include "../dsp/dsp.h"
 #include "../utils/color_cache.h"
 #include "../utils/utils.h"
 
 #define VALUES_IN_BYTE 256
 
-#define HASH_MULTIPLIER (0xc6a4a7935bd1e995ULL)
-
 #define MIN_BLOCK_SIZE 256  // minimum block size for backward references
 
 #define MAX_ENTROPY    (1e30f)
@@ -58,10 +57,28 @@ static int DistanceToPlaneCode(int xsize, int dist) {
   return dist + 120;
 }
 
+// Returns the exact index where array1 and array2 first differ if this
+// index is strictly greater than best_len_match. Otherwise, it returns 0.
+// If all max_limit elements are the same, it returns max_limit.
 static WEBP_INLINE int FindMatchLength(const uint32_t* const array1,
                                        const uint32_t* const array2,
-                                       const int max_limit) {
-  int match_len = 0;
+                                       int best_len_match,
+                                       int max_limit) {
+  int match_len;
+
+  // Before 'expensive' linear match, check if the two arrays match at the
+  // current best length index.
+  if (array1[best_len_match] != array2[best_len_match]) return 0;
+
+#if defined(WEBP_USE_SSE2)
+  // Check if anything is different up to best_len_match excluded.
+  // memcmp seems to be slower on ARM so it is disabled for now.
+  if (memcmp(array1, array2, best_len_match * sizeof(*array1))) return 0;
+  match_len = best_len_match + 1;
+#else
+  match_len = 0;
+#endif
+
   while (match_len < max_limit && array1[match_len] == array2[match_len]) {
     ++match_len;
   }
@@ -178,15 +195,12 @@ int VP8LBackwardRefsCopy(const VP8LBackwardRefs* const src,
 // Hash chains
 
 // initialize as empty
-static void HashChainInit(VP8LHashChain* const p) {
-  int i;
+static void HashChainReset(VP8LHashChain* const p) {
   assert(p != NULL);
-  for (i = 0; i < p->size_; ++i) {
-    p->chain_[i] = -1;
-  }
-  for (i = 0; i < HASH_SIZE; ++i) {
-    p->hash_to_first_index_[i] = -1;
-  }
+  // Set the int32_t arrays to -1.
+  memset(p->chain_, 0xff, p->size_ * sizeof(*p->chain_));
+  memset(p->hash_to_first_index_, 0xff,
+         HASH_SIZE * sizeof(*p->hash_to_first_index_));
 }
 
 int VP8LHashChainInit(VP8LHashChain* const p, int size) {
@@ -196,7 +210,7 @@ int VP8LHashChainInit(VP8LHashChain* const p, int size) {
   p->chain_ = (int*)WebPSafeMalloc(size, sizeof(*p->chain_));
   if (p->chain_ == NULL) return 0;
   p->size_ = size;
-  HashChainInit(p);
+  HashChainReset(p);
   return 1;
 }
 
@@ -209,209 +223,212 @@ void VP8LHashChainClear(VP8LHashChain* const p) {
 
 // -----------------------------------------------------------------------------
 
-static WEBP_INLINE uint64_t GetPixPairHash64(const uint32_t* const argb) {
-  uint64_t key = ((uint64_t)argb[1] << 32) | argb[0];
-  key = (key * HASH_MULTIPLIER) >> (64 - HASH_BITS);
+#define HASH_MULTIPLIER_HI (0xc6a4a793U)
+#define HASH_MULTIPLIER_LO (0x5bd1e996U)
+
+static WEBP_INLINE uint32_t GetPixPairHash64(const uint32_t* const argb) {
+  uint32_t key;
+  key  = argb[1] * HASH_MULTIPLIER_HI;
+  key += argb[0] * HASH_MULTIPLIER_LO;
+  key = key >> (32 - HASH_BITS);
   return key;
 }
 
 // Insertion of two pixels at a time.
 static void HashChainInsert(VP8LHashChain* const p,
                             const uint32_t* const argb, int pos) {
-  const uint64_t hash_code = GetPixPairHash64(argb);
+  const uint32_t hash_code = GetPixPairHash64(argb);
   p->chain_[pos] = p->hash_to_first_index_[hash_code];
   p->hash_to_first_index_[hash_code] = pos;
 }
 
-static void GetParamsForHashChainFindCopy(int quality, int xsize,
-                                          int cache_bits, int* window_size,
-                                          int* iter_pos, int* iter_limit) {
-  const int iter_mult = (quality < 27) ? 1 : 1 + ((quality - 27) >> 4);
-  const int iter_neg = -iter_mult * (quality >> 1);
-  // Limit the backward-ref window size for lower qualities.
-  const int max_window_size = (quality > 50) ? WINDOW_SIZE
-                            : (quality > 25) ? (xsize << 8)
+// Returns the maximum number of hash chain lookups to do for a
+// given compression quality. Return value in range [6, 86].
+static int GetMaxItersForQuality(int quality, int low_effort) {
+  return (low_effort ? 6 : 8) + (quality * quality) / 128;
+}
+
+static int GetWindowSizeForHashChain(int quality, int xsize) {
+  const int max_window_size = (quality > 75) ? WINDOW_SIZE
+                            : (quality > 50) ? (xsize << 8)
+                            : (quality > 25) ? (xsize << 6)
                             : (xsize << 4);
   assert(xsize > 0);
-  *window_size = (max_window_size > WINDOW_SIZE) ? WINDOW_SIZE
-               : max_window_size;
-  *iter_pos = 8 + (quality >> 3);
-  // For lower entropy images, the rigorous search loop in HashChainFindCopy
-  // can be relaxed.
-  *iter_limit = (cache_bits > 0) ? iter_neg : iter_neg / 2;
+  return (max_window_size > WINDOW_SIZE) ? WINDOW_SIZE : max_window_size;
+}
+
+static WEBP_INLINE int MaxFindCopyLength(int len) {
+  return (len < MAX_LENGTH) ? len : MAX_LENGTH;
+}
+
+static void HashChainFindOffset(const VP8LHashChain* const p, int base_position,
+                                const uint32_t* const argb, int len,
+                                int window_size, int* const distance_ptr) {
+  const uint32_t* const argb_start = argb + base_position;
+  const int min_pos =
+      (base_position > window_size) ? base_position - window_size : 0;
+  int pos;
+  assert(len <= MAX_LENGTH);
+  for (pos = p->hash_to_first_index_[GetPixPairHash64(argb_start)];
+       pos >= min_pos;
+       pos = p->chain_[pos]) {
+    const int curr_length =
+        FindMatchLength(argb + pos, argb_start, len - 1, len);
+    if (curr_length == len) break;
+  }
+  *distance_ptr = base_position - pos;
 }
 
 static int HashChainFindCopy(const VP8LHashChain* const p,
-                             int base_position, int xsize_signed,
+                             int base_position,
                              const uint32_t* const argb, int max_len,
-                             int window_size, int iter_pos, int iter_limit,
+                             int window_size, int iter_max,
                              int* const distance_ptr,
                              int* const length_ptr) {
   const uint32_t* const argb_start = argb + base_position;
-  uint64_t best_val = 0;
-  uint32_t best_length = 1;
-  uint32_t best_distance = 0;
-  const uint32_t xsize = (uint32_t)xsize_signed;
+  int iter = iter_max;
+  int best_length = 0;
+  int best_distance = 0;
   const int min_pos =
       (base_position > window_size) ? base_position - window_size : 0;
   int pos;
-  assert(xsize > 0);
-  if (max_len > MAX_LENGTH) {
-    max_len = MAX_LENGTH;
+  int length_max = 256;
+  if (max_len < length_max) {
+    length_max = max_len;
   }
   for (pos = p->hash_to_first_index_[GetPixPairHash64(argb_start)];
        pos >= min_pos;
        pos = p->chain_[pos]) {
-    uint64_t val;
-    uint32_t curr_length;
-    uint32_t distance;
-    const uint32_t* const ptr1 = (argb + pos + best_length - 1);
-    const uint32_t* const ptr2 = (argb_start + best_length - 1);
-
-    if (iter_pos < 0) {
-      if (iter_pos < iter_limit || best_val >= 0xff0000) {
-        break;
-      }
+    int curr_length;
+    int distance;
+    if (--iter < 0) {
+      break;
     }
-    --iter_pos;
-
-    // Before 'expensive' linear match, check if the two arrays match at the
-    // current best length index and also for the succeeding elements.
-    if (ptr1[0] != ptr2[0] || ptr1[1] != ptr2[1]) continue;
-
-    curr_length = FindMatchLength(argb + pos, argb_start, max_len);
-    if (curr_length < best_length) continue;
-
-    distance = (uint32_t)(base_position - pos);
-    val = curr_length << 16;
-    // Favoring 2d locality here gives savings for certain images.
-    if (distance < 9 * xsize) {
-      const uint32_t y = distance / xsize;
-      uint32_t x = distance % xsize;
-      if (x > (xsize >> 1)) {
-        x = xsize - x;
-      }
-      if (x <= 7) {
-        val += 9 * 9 + 9 * 9;
-        val -= y * y + x * x;
-      }
-    }
-    if (best_val < val) {
-      best_val = val;
+
+    curr_length = FindMatchLength(argb + pos, argb_start, best_length, max_len);
+    if (best_length < curr_length) {
+      distance = base_position - pos;
       best_length = curr_length;
       best_distance = distance;
-      if (curr_length >= (uint32_t)max_len) {
-        break;
-      }
-      if ((best_distance == 1 || distance == xsize) &&
-          best_length >= 128) {
+      if (curr_length >= length_max) {
         break;
       }
     }
   }
-  *distance_ptr = (int)best_distance;
+  *distance_ptr = best_distance;
   *length_ptr = best_length;
   return (best_length >= MIN_LENGTH);
 }
 
-static WEBP_INLINE void PushBackCopy(VP8LBackwardRefs* const refs, int length) {
-  while (length >= MAX_LENGTH) {
-    BackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(1, MAX_LENGTH));
-    length -= MAX_LENGTH;
-  }
-  if (length > 0) {
-    BackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(1, length));
+static WEBP_INLINE void AddSingleLiteral(uint32_t pixel, int use_color_cache,
+                                         VP8LColorCache* const hashers,
+                                         VP8LBackwardRefs* const refs) {
+  PixOrCopy v;
+  if (use_color_cache) {
+    const uint32_t key = VP8LColorCacheGetIndex(hashers, pixel);
+    if (VP8LColorCacheLookup(hashers, key) == pixel) {
+      v = PixOrCopyCreateCacheIdx(key);
+    } else {
+      v = PixOrCopyCreateLiteral(pixel);
+      VP8LColorCacheSet(hashers, key, pixel);
+    }
+  } else {
+    v = PixOrCopyCreateLiteral(pixel);
   }
+  BackwardRefsCursorAdd(refs, v);
 }
 
 static int BackwardReferencesRle(int xsize, int ysize,
                                  const uint32_t* const argb,
-                                 VP8LBackwardRefs* const refs) {
+                                 int cache_bits, VP8LBackwardRefs* const refs) {
   const int pix_count = xsize * ysize;
-  int match_len = 0;
-  int i;
+  int i, k;
+  const int use_color_cache = (cache_bits > 0);
+  VP8LColorCache hashers;
+
+  if (use_color_cache && !VP8LColorCacheInit(&hashers, cache_bits)) {
+    return 0;
+  }
   ClearBackwardRefs(refs);
-  PushBackCopy(refs, match_len);    // i=0 case
-  BackwardRefsCursorAdd(refs, PixOrCopyCreateLiteral(argb[0]));
-  for (i = 1; i < pix_count; ++i) {
-    if (argb[i] == argb[i - 1]) {
-      ++match_len;
+  // Add first pixel as literal.
+  AddSingleLiteral(argb[0], use_color_cache, &hashers, refs);
+  i = 1;
+  while (i < pix_count) {
+    const int max_len = MaxFindCopyLength(pix_count - i);
+    const int kMinLength = 4;
+    const int rle_len = FindMatchLength(argb + i, argb + i - 1, 0, max_len);
+    const int prev_row_len = (i < xsize) ? 0 :
+        FindMatchLength(argb + i, argb + i - xsize, 0, max_len);
+    if (rle_len >= prev_row_len && rle_len >= kMinLength) {
+      BackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(1, rle_len));
+      // We don't need to update the color cache here since it is always the
+      // same pixel being copied, and that does not change the color cache
+      // state.
+      i += rle_len;
+    } else if (prev_row_len >= kMinLength) {
+      BackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(xsize, prev_row_len));
+      if (use_color_cache) {
+        for (k = 0; k < prev_row_len; ++k) {
+          VP8LColorCacheInsert(&hashers, argb[i + k]);
+        }
+      }
+      i += prev_row_len;
     } else {
-      PushBackCopy(refs, match_len);
-      match_len = 0;
-      BackwardRefsCursorAdd(refs, PixOrCopyCreateLiteral(argb[i]));
+      AddSingleLiteral(argb[i], use_color_cache, &hashers, refs);
+      i++;
     }
   }
-  PushBackCopy(refs, match_len);
+  if (use_color_cache) VP8LColorCacheClear(&hashers);
   return !refs->error_;
 }
 
-static int BackwardReferencesHashChain(int xsize, int ysize,
-                                       const uint32_t* const argb,
-                                       int cache_bits, int quality,
-                                       VP8LHashChain* const hash_chain,
-                                       VP8LBackwardRefs* const refs) {
+static int BackwardReferencesLz77(int xsize, int ysize,
+                                  const uint32_t* const argb, int cache_bits,
+                                  int quality, int low_effort,
+                                  VP8LHashChain* const hash_chain,
+                                  VP8LBackwardRefs* const refs) {
   int i;
   int ok = 0;
   int cc_init = 0;
   const int use_color_cache = (cache_bits > 0);
   const int pix_count = xsize * ysize;
   VP8LColorCache hashers;
-  int window_size = WINDOW_SIZE;
-  int iter_pos = 1;
-  int iter_limit = -1;
+  int iter_max = GetMaxItersForQuality(quality, low_effort);
+  const int window_size = GetWindowSizeForHashChain(quality, xsize);
+  int min_matches = 32;
 
   if (use_color_cache) {
     cc_init = VP8LColorCacheInit(&hashers, cache_bits);
     if (!cc_init) goto Error;
   }
-
   ClearBackwardRefs(refs);
-  GetParamsForHashChainFindCopy(quality, xsize, cache_bits,
-                                &window_size, &iter_pos, &iter_limit);
-  HashChainInit(hash_chain);
-  for (i = 0; i < pix_count; ) {
+  HashChainReset(hash_chain);
+  for (i = 0; i < pix_count - 2; ) {
     // Alternative#1: Code the pixels starting at 'i' using backward reference.
     int offset = 0;
     int len = 0;
-    if (i < pix_count - 1) {  // FindCopy(i,..) reads pixels at [i] and [i + 1].
-      int max_len = pix_count - i;
-      HashChainFindCopy(hash_chain, i, xsize, argb, max_len,
-                        window_size, iter_pos, iter_limit,
-                        &offset, &len);
-    }
-    if (len >= MIN_LENGTH) {
-      // Alternative#2: Insert the pixel at 'i' as literal, and code the
-      // pixels starting at 'i + 1' using backward reference.
+    const int max_len = MaxFindCopyLength(pix_count - i);
+    HashChainFindCopy(hash_chain, i, argb, max_len, window_size,
+                      iter_max, &offset, &len);
+    if (len > MIN_LENGTH || (len == MIN_LENGTH && offset <= 512)) {
       int offset2 = 0;
       int len2 = 0;
       int k;
+      min_matches = 8;
       HashChainInsert(hash_chain, &argb[i], i);
-      if (i < pix_count - 2) {  // FindCopy(i+1,..) reads [i + 1] and [i + 2].
-        int max_len = pix_count - (i + 1);
-        HashChainFindCopy(hash_chain, i + 1, xsize, argb, max_len,
-                          window_size, iter_pos, iter_limit,
-                          &offset2, &len2);
+      if ((len < (max_len >> 2)) && !low_effort) {
+        // Evaluate Alternative#2: Insert the pixel at 'i' as literal, and code
+        // the pixels starting at 'i + 1' using backward reference.
+        HashChainFindCopy(hash_chain, i + 1, argb, max_len - 1,
+                          window_size, iter_max, &offset2,
+                          &len2);
         if (len2 > len + 1) {
-          const uint32_t pixel = argb[i];
-          // Alternative#2 is a better match. So push pixel at 'i' as literal.
-          PixOrCopy v;
-          if (use_color_cache && VP8LColorCacheContains(&hashers, pixel)) {
-            const int ix = VP8LColorCacheGetIndex(&hashers, pixel);
-            v = PixOrCopyCreateCacheIdx(ix);
-          } else {
-            if (use_color_cache) VP8LColorCacheInsert(&hashers, pixel);
-            v = PixOrCopyCreateLiteral(pixel);
-          }
-          BackwardRefsCursorAdd(refs, v);
+          AddSingleLiteral(argb[i], use_color_cache, &hashers, refs);
           i++;  // Backward reference to be done for next pixel.
           len = len2;
           offset = offset2;
         }
       }
-      if (len >= MAX_LENGTH) {
-        len = MAX_LENGTH - 1;
-      }
       BackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(offset, len));
       if (use_color_cache) {
         for (k = 0; k < len; ++k) {
@@ -419,33 +436,36 @@ static int BackwardReferencesHashChain(int xsize, int ysize,
         }
       }
       // Add to the hash_chain (but cannot add the last pixel).
-      {
+      if (offset >= 3 && offset != xsize) {
         const int last = (len < pix_count - 1 - i) ? len : pix_count - 1 - i;
-        for (k = 1; k < last; ++k) {
+        for (k = 2; k < last - 8; k += 2) {
+          HashChainInsert(hash_chain, &argb[i + k], i + k);
+        }
+        for (; k < last; ++k) {
           HashChainInsert(hash_chain, &argb[i + k], i + k);
         }
       }
       i += len;
     } else {
-      const uint32_t pixel = argb[i];
-      PixOrCopy v;
-      if (use_color_cache && VP8LColorCacheContains(&hashers, pixel)) {
-        // push pixel as a PixOrCopyCreateCacheIdx pixel
-        const int ix = VP8LColorCacheGetIndex(&hashers, pixel);
-        v = PixOrCopyCreateCacheIdx(ix);
-      } else {
-        if (use_color_cache) VP8LColorCacheInsert(&hashers, pixel);
-        v = PixOrCopyCreateLiteral(pixel);
-      }
-      BackwardRefsCursorAdd(refs, v);
-      if (i + 1 < pix_count) {
+      AddSingleLiteral(argb[i], use_color_cache, &hashers, refs);
+      HashChainInsert(hash_chain, &argb[i], i);
+      ++i;
+      --min_matches;
+      if (min_matches <= 0) {
+        AddSingleLiteral(argb[i], use_color_cache, &hashers, refs);
         HashChainInsert(hash_chain, &argb[i], i);
+        ++i;
       }
-      ++i;
     }
   }
+  while (i < pix_count) {
+    // Handle the last pixel(s).
+    AddSingleLiteral(argb[i], use_color_cache, &hashers, refs);
+    ++i;
+  }
+
   ok = !refs->error_;
-Error:
+ Error:
   if (cc_init) VP8LColorCacheClear(&hashers);
   return ok;
 }
@@ -455,15 +475,14 @@ Error:
 typedef struct {
   double alpha_[VALUES_IN_BYTE];
   double red_[VALUES_IN_BYTE];
-  double literal_[PIX_OR_COPY_CODES_MAX];
   double blue_[VALUES_IN_BYTE];
   double distance_[NUM_DISTANCE_CODES];
+  double* literal_;
 } CostModel;
 
 static int BackwardReferencesTraceBackwards(
-    int xsize, int ysize, int recursive_cost_model,
-    const uint32_t* const argb, int quality, int cache_bits,
-    VP8LHashChain* const hash_chain,
+    int xsize, int ysize, const uint32_t* const argb, int quality,
+    int cache_bits, VP8LHashChain* const hash_chain,
     VP8LBackwardRefs* const refs);
 
 static void ConvertPopulationCountTableToBitEstimates(
@@ -487,28 +506,10 @@ static void ConvertPopulationCountTableToBitEstimates(
   }
 }
 
-static int CostModelBuild(CostModel* const m, int xsize, int ysize,
-                          int recursion_level, const uint32_t* const argb,
-                          int quality, int cache_bits,
-                          VP8LHashChain* const hash_chain,
+static int CostModelBuild(CostModel* const m, int cache_bits,
                           VP8LBackwardRefs* const refs) {
   int ok = 0;
-  VP8LHistogram* histo = NULL;
-
-  ClearBackwardRefs(refs);
-  if (recursion_level > 0) {
-    if (!BackwardReferencesTraceBackwards(xsize, ysize, recursion_level - 1,
-                                          argb, quality, cache_bits, hash_chain,
-                                          refs)) {
-      goto Error;
-    }
-  } else {
-    if (!BackwardReferencesHashChain(xsize, ysize, argb, cache_bits, quality,
-                                     hash_chain, refs)) {
-      goto Error;
-    }
-  }
-  histo = VP8LAllocateHistogram(cache_bits);
+  VP8LHistogram* const histo = VP8LAllocateHistogram(cache_bits);
   if (histo == NULL) goto Error;
 
   VP8LHistogramCreate(histo, refs, cache_bits);
@@ -557,10 +558,35 @@ static WEBP_INLINE double GetDistanceCost(const CostModel* const m,
   return m->distance_[code] + extra_bits;
 }
 
+static void AddSingleLiteralWithCostModel(
+    const uint32_t* const argb, VP8LHashChain* const hash_chain,
+    VP8LColorCache* const hashers, const CostModel* const cost_model, int idx,
+    int is_last, int use_color_cache, double prev_cost, float* const cost,
+    uint16_t* const dist_array) {
+  double cost_val = prev_cost;
+  const uint32_t color = argb[0];
+  if (!is_last) {
+    HashChainInsert(hash_chain, argb, idx);
+  }
+  if (use_color_cache && VP8LColorCacheContains(hashers, color)) {
+    const double mul0 = 0.68;
+    const int ix = VP8LColorCacheGetIndex(hashers, color);
+    cost_val += GetCacheCost(cost_model, ix) * mul0;
+  } else {
+    const double mul1 = 0.82;
+    if (use_color_cache) VP8LColorCacheInsert(hashers, color);
+    cost_val += GetLiteralCost(cost_model, color) * mul1;
+  }
+  if (cost[idx] > cost_val) {
+    cost[idx] = (float)cost_val;
+    dist_array[idx] = 1;  // only one is inserted.
+  }
+}
+
 static int BackwardReferencesHashChainDistanceOnly(
-    int xsize, int ysize, int recursive_cost_model, const uint32_t* const argb,
+    int xsize, int ysize, const uint32_t* const argb,
     int quality, int cache_bits, VP8LHashChain* const hash_chain,
-    VP8LBackwardRefs* const refs, uint32_t* const dist_array) {
+    VP8LBackwardRefs* const refs, uint16_t* const dist_array) {
   int i;
   int ok = 0;
   int cc_init = 0;
@@ -568,24 +594,27 @@ static int BackwardReferencesHashChainDistanceOnly(
   const int use_color_cache = (cache_bits > 0);
   float* const cost =
       (float*)WebPSafeMalloc(pix_count, sizeof(*cost));
-  CostModel* cost_model = (CostModel*)WebPSafeMalloc(1ULL, sizeof(*cost_model));
+  const size_t literal_array_size = sizeof(double) *
+      (NUM_LITERAL_CODES + NUM_LENGTH_CODES +
+       ((cache_bits > 0) ? (1 << cache_bits) : 0));
+  const size_t cost_model_size = sizeof(CostModel) + literal_array_size;
+  CostModel* const cost_model =
+      (CostModel*)WebPSafeMalloc(1ULL, cost_model_size);
   VP8LColorCache hashers;
-  const double mul0 = (recursive_cost_model != 0) ? 1.0 : 0.68;
-  const double mul1 = (recursive_cost_model != 0) ? 1.0 : 0.82;
-  const int min_distance_code = 2;  // TODO(vikasa): tune as function of quality
-  int window_size = WINDOW_SIZE;
-  int iter_pos = 1;
-  int iter_limit = -1;
+  const int skip_length = 32 + quality;
+  const int skip_min_distance_code = 2;
+  int iter_max = GetMaxItersForQuality(quality, 0);
+  const int window_size = GetWindowSizeForHashChain(quality, xsize);
 
   if (cost == NULL || cost_model == NULL) goto Error;
 
+  cost_model->literal_ = (double*)(cost_model + 1);
   if (use_color_cache) {
     cc_init = VP8LColorCacheInit(&hashers, cache_bits);
     if (!cc_init) goto Error;
   }
 
-  if (!CostModelBuild(cost_model, xsize, ysize, recursive_cost_model, argb,
-                      quality, cache_bits, hash_chain, refs)) {
+  if (!CostModelBuild(cost_model, cache_bits, refs)) {
     goto Error;
   }
 
@@ -594,85 +623,80 @@ static int BackwardReferencesHashChainDistanceOnly(
   // We loop one pixel at a time, but store all currently best points to
   // non-processed locations from this point.
   dist_array[0] = 0;
-  GetParamsForHashChainFindCopy(quality, xsize, cache_bits,
-                                &window_size, &iter_pos, &iter_limit);
-  HashChainInit(hash_chain);
-  for (i = 0; i < pix_count; ++i) {
-    double prev_cost = 0.0;
-    int shortmax;
-    if (i > 0) {
-      prev_cost = cost[i - 1];
-    }
-    for (shortmax = 0; shortmax < 2; ++shortmax) {
-      int offset = 0;
-      int len = 0;
-      if (i < pix_count - 1) {  // FindCopy reads pixels at [i] and [i + 1].
-        int max_len = shortmax ? 2 : pix_count - i;
-        HashChainFindCopy(hash_chain, i, xsize, argb, max_len,
-                          window_size, iter_pos, iter_limit,
-                          &offset, &len);
+  HashChainReset(hash_chain);
+  // Add first pixel as literal.
+  AddSingleLiteralWithCostModel(argb + 0, hash_chain, &hashers, cost_model, 0,
+                                0, use_color_cache, 0.0, cost, dist_array);
+  for (i = 1; i < pix_count - 1; ++i) {
+    int offset = 0;
+    int len = 0;
+    double prev_cost = cost[i - 1];
+    const int max_len = MaxFindCopyLength(pix_count - i);
+    HashChainFindCopy(hash_chain, i, argb, max_len, window_size,
+                      iter_max, &offset, &len);
+    if (len >= MIN_LENGTH) {
+      const int code = DistanceToPlaneCode(xsize, offset);
+      const double distance_cost =
+          prev_cost + GetDistanceCost(cost_model, code);
+      int k;
+      for (k = 1; k < len; ++k) {
+        const double cost_val = distance_cost + GetLengthCost(cost_model, k);
+        if (cost[i + k] > cost_val) {
+          cost[i + k] = (float)cost_val;
+          dist_array[i + k] = k + 1;
+        }
       }
-      if (len >= MIN_LENGTH) {
-        const int code = DistanceToPlaneCode(xsize, offset);
-        const double distance_cost =
-            prev_cost + GetDistanceCost(cost_model, code);
-        int k;
-        for (k = 1; k < len; ++k) {
-          const double cost_val = distance_cost + GetLengthCost(cost_model, k);
-          if (cost[i + k] > cost_val) {
-            cost[i + k] = (float)cost_val;
-            dist_array[i + k] = k + 1;
+      // This if is for speedup only. It roughly doubles the speed, and
+      // makes compression worse by .1 %.
+      if (len >= skip_length && code <= skip_min_distance_code) {
+        // Long copy for short distances, let's skip the middle
+        // lookups for better copies.
+        // 1) insert the hashes.
+        if (use_color_cache) {
+          for (k = 0; k < len; ++k) {
+            VP8LColorCacheInsert(&hashers, argb[i + k]);
           }
         }
-        // This if is for speedup only. It roughly doubles the speed, and
-        // makes compression worse by .1 %.
-        if (len >= 128 && code <= min_distance_code) {
-          // Long copy for short distances, let's skip the middle
-          // lookups for better copies.
-          // 1) insert the hashes.
-          if (use_color_cache) {
-            for (k = 0; k < len; ++k) {
-              VP8LColorCacheInsert(&hashers, argb[i + k]);
-            }
-          }
-          // 2) Add to the hash_chain (but cannot add the last pixel)
-          {
-            const int last = (len + i < pix_count - 1) ? len + i
-                                                       : pix_count - 1;
-            for (k = i; k < last; ++k) {
-              HashChainInsert(hash_chain, &argb[k], k);
-            }
+        // 2) Add to the hash_chain (but cannot add the last pixel)
+        {
+          const int last = (len + i < pix_count - 1) ? len + i
+                                                     : pix_count - 1;
+          for (k = i; k < last; ++k) {
+            HashChainInsert(hash_chain, &argb[k], k);
           }
-          // 3) jump.
-          i += len - 1;  // for loop does ++i, thus -1 here.
-          goto next_symbol;
         }
+        // 3) jump.
+        i += len - 1;  // for loop does ++i, thus -1 here.
+        goto next_symbol;
       }
-    }
-    if (i < pix_count - 1) {
-      HashChainInsert(hash_chain, &argb[i], i);
-    }
-    {
-      // inserting a literal pixel
-      double cost_val = prev_cost;
-      if (use_color_cache && VP8LColorCacheContains(&hashers, argb[i])) {
-        const int ix = VP8LColorCacheGetIndex(&hashers, argb[i]);
-        cost_val += GetCacheCost(cost_model, ix) * mul0;
-      } else {
-        if (use_color_cache) VP8LColorCacheInsert(&hashers, argb[i]);
-        cost_val += GetLiteralCost(cost_model, argb[i]) * mul1;
-      }
-      if (cost[i] > cost_val) {
-        cost[i] = (float)cost_val;
-        dist_array[i] = 1;  // only one is inserted.
+      if (len != MIN_LENGTH) {
+        int code_min_length;
+        double cost_total;
+        HashChainFindOffset(hash_chain, i, argb, MIN_LENGTH, window_size,
+                            &offset);
+        code_min_length = DistanceToPlaneCode(xsize, offset);
+        cost_total = prev_cost +
+            GetDistanceCost(cost_model, code_min_length) +
+            GetLengthCost(cost_model, 1);
+        if (cost[i + 1] > cost_total) {
+          cost[i + 1] = (float)cost_total;
+          dist_array[i + 1] = 2;
+        }
       }
     }
+    AddSingleLiteralWithCostModel(argb + i, hash_chain, &hashers, cost_model, i,
+                                  0, use_color_cache, prev_cost, cost,
+                                  dist_array);
  next_symbol: ;
   }
-  // Last pixel still to do, it can only be a single step if not reached
-  // through cheaper means already.
+  // Handle the last pixel.
+  if (i == (pix_count - 1)) {
+    AddSingleLiteralWithCostModel(argb + i, hash_chain, &hashers, cost_model, i,
+                                  1, use_color_cache, cost[pix_count - 2], cost,
+                                  dist_array);
+  }
   ok = !refs->error_;
-Error:
+ Error:
   if (cc_init) VP8LColorCacheClear(&hashers);
   WebPSafeFree(cost_model);
   WebPSafeFree(cost);
@@ -682,12 +706,12 @@ Error:
 // We pack the path at the end of *dist_array and return
 // a pointer to this part of the array. Example:
 // dist_array = [1x2xx3x2] => packed [1x2x1232], chosen_path = [1232]
-static void TraceBackwards(uint32_t* const dist_array,
+static void TraceBackwards(uint16_t* const dist_array,
                            int dist_array_size,
-                           uint32_t** const chosen_path,
+                           uint16_t** const chosen_path,
                            int* const chosen_path_size) {
-  uint32_t* path = dist_array + dist_array_size;
-  uint32_t* cur = dist_array + dist_array_size - 1;
+  uint16_t* path = dist_array + dist_array_size;
+  uint16_t* cur = dist_array + dist_array_size - 1;
   while (cur >= dist_array) {
     const int k = *cur;
     --path;
@@ -701,20 +725,16 @@ static void TraceBackwards(uint32_t* const dist_array,
 static int BackwardReferencesHashChainFollowChosenPath(
     int xsize, int ysize, const uint32_t* const argb,
     int quality, int cache_bits,
-    const uint32_t* const chosen_path, int chosen_path_size,
+    const uint16_t* const chosen_path, int chosen_path_size,
     VP8LHashChain* const hash_chain,
     VP8LBackwardRefs* const refs) {
   const int pix_count = xsize * ysize;
   const int use_color_cache = (cache_bits > 0);
-  int size = 0;
-  int i = 0;
-  int k;
   int ix;
+  int i = 0;
   int ok = 0;
   int cc_init = 0;
-  int window_size = WINDOW_SIZE;
-  int iter_pos = 1;
-  int iter_limit = -1;
+  const int window_size = GetWindowSizeForHashChain(quality, xsize);
   VP8LColorCache hashers;
 
   if (use_color_cache) {
@@ -723,18 +743,13 @@ static int BackwardReferencesHashChainFollowChosenPath(
   }
 
   ClearBackwardRefs(refs);
-  GetParamsForHashChainFindCopy(quality, xsize, cache_bits,
-                                &window_size, &iter_pos, &iter_limit);
-  HashChainInit(hash_chain);
-  for (ix = 0; ix < chosen_path_size; ++ix, ++size) {
+  HashChainReset(hash_chain);
+  for (ix = 0; ix < chosen_path_size; ++ix) {
     int offset = 0;
-    int len = 0;
-    int max_len = chosen_path[ix];
-    if (max_len != 1) {
-      HashChainFindCopy(hash_chain, i, xsize, argb, max_len,
-                        window_size, iter_pos, iter_limit,
-                        &offset, &len);
-      assert(len == max_len);
+    const int len = chosen_path[ix];
+    if (len != 1) {
+      int k;
+      HashChainFindOffset(hash_chain, i, argb, len, window_size, &offset);
       BackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(offset, len));
       if (use_color_cache) {
         for (k = 0; k < len; ++k) {
@@ -766,29 +781,28 @@ static int BackwardReferencesHashChainFollowChosenPath(
     }
   }
   ok = !refs->error_;
-Error:
+ Error:
   if (cc_init) VP8LColorCacheClear(&hashers);
   return ok;
 }
 
 // Returns 1 on success.
 static int BackwardReferencesTraceBackwards(int xsize, int ysize,
-                                            int recursive_cost_model,
                                             const uint32_t* const argb,
                                             int quality, int cache_bits,
                                             VP8LHashChain* const hash_chain,
                                             VP8LBackwardRefs* const refs) {
   int ok = 0;
   const int dist_array_size = xsize * ysize;
-  uint32_t* chosen_path = NULL;
+  uint16_t* chosen_path = NULL;
   int chosen_path_size = 0;
-  uint32_t* dist_array =
-      (uint32_t*)WebPSafeMalloc(dist_array_size, sizeof(*dist_array));
+  uint16_t* dist_array =
+      (uint16_t*)WebPSafeMalloc(dist_array_size, sizeof(*dist_array));
 
   if (dist_array == NULL) goto Error;
 
   if (!BackwardReferencesHashChainDistanceOnly(
-      xsize, ysize, recursive_cost_model, argb, quality, cache_bits, hash_chain,
+      xsize, ysize, argb, quality, cache_bits, hash_chain,
       refs, dist_array)) {
     goto Error;
   }
@@ -817,72 +831,10 @@ static void BackwardReferences2DLocality(int xsize,
   }
 }
 
-VP8LBackwardRefs* VP8LGetBackwardReferences(
-    int width, int height, const uint32_t* const argb, int quality,
-    int cache_bits, int use_2d_locality, VP8LHashChain* const hash_chain,
-    VP8LBackwardRefs refs_array[2]) {
-  int lz77_is_useful;
-  const int num_pix = width * height;
-  VP8LBackwardRefs* best = NULL;
-  VP8LBackwardRefs* const refs_lz77 = &refs_array[0];
-  VP8LBackwardRefs* const refs_rle = &refs_array[1];
-
-  if (!BackwardReferencesHashChain(width, height, argb, cache_bits, quality,
-                                   hash_chain, refs_lz77)) {
-    return NULL;
-  }
-  if (!BackwardReferencesRle(width, height, argb, refs_rle)) {
-    return NULL;
-  }
-
-  {
-    double bit_cost_lz77, bit_cost_rle;
-    VP8LHistogram* const histo = VP8LAllocateHistogram(cache_bits);
-    if (histo == NULL) return NULL;
-    // Evaluate LZ77 coding.
-    VP8LHistogramCreate(histo, refs_lz77, cache_bits);
-    bit_cost_lz77 = VP8LHistogramEstimateBits(histo);
-    // Evaluate RLE coding.
-    VP8LHistogramCreate(histo, refs_rle, cache_bits);
-    bit_cost_rle = VP8LHistogramEstimateBits(histo);
-    // Decide if LZ77 is useful.
-    lz77_is_useful = (bit_cost_lz77 < bit_cost_rle);
-    VP8LFreeHistogram(histo);
-  }
-
-  // Choose appropriate backward reference.
-  if (lz77_is_useful) {
-    // TraceBackwards is costly. Don't execute it at lower quality.
-    const int try_lz77_trace_backwards = (quality >= 25);
-    best = refs_lz77;   // default guess: lz77 is better
-    if (try_lz77_trace_backwards) {
-      // Set recursion level for large images using a color cache.
-      const int recursion_level =
-          (num_pix < 320 * 200) && (cache_bits > 0) ? 1 : 0;
-      VP8LBackwardRefs* const refs_trace = &refs_array[1];
-      ClearBackwardRefs(refs_trace);
-      if (BackwardReferencesTraceBackwards(width, height, recursion_level, argb,
-                                           quality, cache_bits, hash_chain,
-                                           refs_trace)) {
-        best = refs_trace;
-      }
-    }
-  } else {
-    best = refs_rle;
-  }
-
-  if (use_2d_locality) BackwardReferences2DLocality(width, best);
-
-  return best;
-}
-
 // Returns entropy for the given cache bits.
-static double ComputeCacheEntropy(const uint32_t* const argb,
-                                  int xsize, int ysize,
+static double ComputeCacheEntropy(const uint32_t* argb,
                                   const VP8LBackwardRefs* const refs,
                                   int cache_bits) {
-  int pixel_index = 0;
-  uint32_t k;
   const int use_color_cache = (cache_bits > 0);
   int cc_init = 0;
   double entropy = MAX_ENTROPY;
@@ -896,33 +848,40 @@ static double ComputeCacheEntropy(const uint32_t* const argb,
     cc_init = VP8LColorCacheInit(&hashers, cache_bits);
     if (!cc_init) goto Error;
   }
-
-  while (VP8LRefsCursorOk(&c)) {
-    const PixOrCopy* const v = c.cur_pos;
-    if (PixOrCopyIsLiteral(v)) {
-      if (use_color_cache &&
-          VP8LColorCacheContains(&hashers, argb[pixel_index])) {
-        // push pixel as a cache index
-        const int ix = VP8LColorCacheGetIndex(&hashers, argb[pixel_index]);
-        const PixOrCopy token = PixOrCopyCreateCacheIdx(ix);
-        VP8LHistogramAddSinglePixOrCopy(histo, &token);
-      } else {
-        VP8LHistogramAddSinglePixOrCopy(histo, v);
-      }
-    } else {
-      VP8LHistogramAddSinglePixOrCopy(histo, v);
+  if (!use_color_cache) {
+    while (VP8LRefsCursorOk(&c)) {
+      VP8LHistogramAddSinglePixOrCopy(histo, c.cur_pos);
+      VP8LRefsCursorNext(&c);
     }
-    if (use_color_cache) {
-      for (k = 0; k < PixOrCopyLength(v); ++k) {
-        VP8LColorCacheInsert(&hashers, argb[pixel_index + k]);
+  } else {
+    while (VP8LRefsCursorOk(&c)) {
+      const PixOrCopy* const v = c.cur_pos;
+      if (PixOrCopyIsLiteral(v)) {
+        const uint32_t pix = *argb++;
+        const uint32_t key = VP8LColorCacheGetIndex(&hashers, pix);
+        if (VP8LColorCacheLookup(&hashers, key) == pix) {
+          ++histo->literal_[NUM_LITERAL_CODES + NUM_LENGTH_CODES + key];
+        } else {
+          VP8LColorCacheSet(&hashers, key, pix);
+          ++histo->blue_[pix & 0xff];
+          ++histo->literal_[(pix >> 8) & 0xff];
+          ++histo->red_[(pix >> 16) & 0xff];
+          ++histo->alpha_[pix >> 24];
+        }
+      } else {
+        int len = PixOrCopyLength(v);
+        int code, extra_bits;
+        VP8LPrefixEncodeBits(len, &code, &extra_bits);
+        ++histo->literal_[NUM_LITERAL_CODES + code];
+        VP8LPrefixEncodeBits(PixOrCopyDistance(v), &code, &extra_bits);
+        ++histo->distance_[code];
+        do {
+          VP8LColorCacheInsert(&hashers, *argb++);
+        } while(--len != 0);
       }
+      VP8LRefsCursorNext(&c);
     }
-    pixel_index += PixOrCopyLength(v);
-    VP8LRefsCursorNext(&c);
   }
-  assert(pixel_index == xsize * ysize);
-  (void)xsize;  // xsize is not used in non-debug compilations otherwise.
-  (void)ysize;  // ysize is not used in non-debug compilations otherwise.
   entropy = VP8LHistogramEstimateBits(histo) +
       kSmallPenaltyForLargeCache * cache_bits;
  Error:
@@ -931,45 +890,204 @@ static double ComputeCacheEntropy(const uint32_t* const argb,
   return entropy;
 }
 
-// *best_cache_bits will contain how many bits are to be used for a color cache.
+// Evaluate optimal cache bits for the local color cache.
+// The input *best_cache_bits sets the maximum cache bits to use (passing 0
+// implies disabling the local color cache). The local color cache is also
+// disabled at lower quality settings (<= 25).
 // Returns 0 in case of memory error.
-int VP8LCalculateEstimateForCacheSize(const uint32_t* const argb,
-                                      int xsize, int ysize, int quality,
-                                      VP8LHashChain* const hash_chain,
-                                      VP8LBackwardRefs* const refs,
-                                      int* const best_cache_bits) {
+static int CalculateBestCacheSize(const uint32_t* const argb,
+                                  int xsize, int ysize, int quality,
+                                  VP8LHashChain* const hash_chain,
+                                  VP8LBackwardRefs* const refs,
+                                  int* const lz77_computed,
+                                  int* const best_cache_bits) {
   int eval_low = 1;
   int eval_high = 1;
   double entropy_low = MAX_ENTROPY;
   double entropy_high = MAX_ENTROPY;
+  const double cost_mul = 5e-4;
   int cache_bits_low = 0;
-  int cache_bits_high = MAX_COLOR_CACHE_BITS;
+  int cache_bits_high = (quality <= 25) ? 0 : *best_cache_bits;
 
-  if (!BackwardReferencesHashChain(xsize, ysize, argb, 0, quality, hash_chain,
-                                   refs)) {
+  assert(cache_bits_high <= MAX_COLOR_CACHE_BITS);
+
+  *lz77_computed = 0;
+  if (cache_bits_high == 0) {
+    *best_cache_bits = 0;
+    // Local color cache is disabled.
+    return 1;
+  }
+  if (!BackwardReferencesLz77(xsize, ysize, argb, cache_bits_low, quality, 0,
+                              hash_chain, refs)) {
     return 0;
   }
   // Do a binary search to find the optimal entropy for cache_bits.
-  while (cache_bits_high - cache_bits_low > 1) {
+  while (eval_low || eval_high) {
     if (eval_low) {
-      entropy_low =
-          ComputeCacheEntropy(argb, xsize, ysize, refs, cache_bits_low);
+      entropy_low = ComputeCacheEntropy(argb, refs, cache_bits_low);
+      entropy_low += entropy_low * cache_bits_low * cost_mul;
       eval_low = 0;
     }
     if (eval_high) {
-      entropy_high =
-          ComputeCacheEntropy(argb, xsize, ysize, refs, cache_bits_high);
+      entropy_high = ComputeCacheEntropy(argb, refs, cache_bits_high);
+      entropy_high += entropy_high * cache_bits_high * cost_mul;
       eval_high = 0;
     }
     if (entropy_high < entropy_low) {
+      const int prev_cache_bits_low = cache_bits_low;
       *best_cache_bits = cache_bits_high;
       cache_bits_low = (cache_bits_low + cache_bits_high) / 2;
-      eval_low = 1;
+      if (cache_bits_low != prev_cache_bits_low) eval_low = 1;
     } else {
       *best_cache_bits = cache_bits_low;
       cache_bits_high = (cache_bits_low + cache_bits_high) / 2;
-      eval_high = 1;
+      if (cache_bits_high != cache_bits_low) eval_high = 1;
     }
   }
+  *lz77_computed = 1;
   return 1;
 }
+
+// Update (in-place) backward references for specified cache_bits.
+static int BackwardRefsWithLocalCache(const uint32_t* const argb,
+                                      int cache_bits,
+                                      VP8LBackwardRefs* const refs) {
+  int pixel_index = 0;
+  VP8LColorCache hashers;
+  VP8LRefsCursor c = VP8LRefsCursorInit(refs);
+  if (!VP8LColorCacheInit(&hashers, cache_bits)) return 0;
+
+  while (VP8LRefsCursorOk(&c)) {
+    PixOrCopy* const v = c.cur_pos;
+    if (PixOrCopyIsLiteral(v)) {
+      const uint32_t argb_literal = v->argb_or_distance;
+      if (VP8LColorCacheContains(&hashers, argb_literal)) {
+        const int ix = VP8LColorCacheGetIndex(&hashers, argb_literal);
+        *v = PixOrCopyCreateCacheIdx(ix);
+      } else {
+        VP8LColorCacheInsert(&hashers, argb_literal);
+      }
+      ++pixel_index;
+    } else {
+      // refs was created without local cache, so it cannot have cache indexes.
+      int k;
+      assert(PixOrCopyIsCopy(v));
+      for (k = 0; k < v->len; ++k) {
+        VP8LColorCacheInsert(&hashers, argb[pixel_index++]);
+      }
+    }
+    VP8LRefsCursorNext(&c);
+  }
+  VP8LColorCacheClear(&hashers);
+  return 1;
+}
+
+static VP8LBackwardRefs* GetBackwardReferencesLowEffort(
+    int width, int height, const uint32_t* const argb, int quality,
+    int* const cache_bits, VP8LHashChain* const hash_chain,
+    VP8LBackwardRefs refs_array[2]) {
+  VP8LBackwardRefs* refs_lz77 = &refs_array[0];
+  *cache_bits = 0;
+  if (!BackwardReferencesLz77(width, height, argb, 0, quality,
+                              1 /* Low effort. */, hash_chain, refs_lz77)) {
+    return NULL;
+  }
+  BackwardReferences2DLocality(width, refs_lz77);
+  return refs_lz77;
+}
+
+static VP8LBackwardRefs* GetBackwardReferences(
+    int width, int height, const uint32_t* const argb, int quality,
+    int* const cache_bits, VP8LHashChain* const hash_chain,
+    VP8LBackwardRefs refs_array[2]) {
+  int lz77_is_useful;
+  int lz77_computed;
+  double bit_cost_lz77, bit_cost_rle;
+  VP8LBackwardRefs* best = NULL;
+  VP8LBackwardRefs* refs_lz77 = &refs_array[0];
+  VP8LBackwardRefs* refs_rle = &refs_array[1];
+  VP8LHistogram* histo = NULL;
+
+  if (!CalculateBestCacheSize(argb, width, height, quality, hash_chain,
+                              refs_lz77, &lz77_computed, cache_bits)) {
+    goto Error;
+  }
+
+  if (lz77_computed) {
+    // Transform refs_lz77 for the optimized cache_bits.
+    if (*cache_bits > 0) {
+      if (!BackwardRefsWithLocalCache(argb, *cache_bits, refs_lz77)) {
+        goto Error;
+      }
+    }
+  } else {
+    if (!BackwardReferencesLz77(width, height, argb, *cache_bits, quality,
+                                0 /* Low effort. */, hash_chain, refs_lz77)) {
+      goto Error;
+    }
+  }
+
+  if (!BackwardReferencesRle(width, height, argb, *cache_bits, refs_rle)) {
+    goto Error;
+  }
+
+  histo = VP8LAllocateHistogram(*cache_bits);
+  if (histo == NULL) goto Error;
+
+  {
+    // Evaluate LZ77 coding.
+    VP8LHistogramCreate(histo, refs_lz77, *cache_bits);
+    bit_cost_lz77 = VP8LHistogramEstimateBits(histo);
+    // Evaluate RLE coding.
+    VP8LHistogramCreate(histo, refs_rle, *cache_bits);
+    bit_cost_rle = VP8LHistogramEstimateBits(histo);
+    // Decide if LZ77 is useful.
+    lz77_is_useful = (bit_cost_lz77 < bit_cost_rle);
+  }
+
+  // Choose appropriate backward reference.
+  if (lz77_is_useful) {
+    // TraceBackwards is costly. Don't execute it at lower quality.
+    const int try_lz77_trace_backwards = (quality >= 25);
+    best = refs_lz77;   // default guess: lz77 is better
+    if (try_lz77_trace_backwards) {
+      VP8LBackwardRefs* const refs_trace = refs_rle;
+      if (!VP8LBackwardRefsCopy(refs_lz77, refs_trace)) {
+        best = NULL;
+        goto Error;
+      }
+      if (BackwardReferencesTraceBackwards(width, height, argb, quality,
+                                           *cache_bits, hash_chain,
+                                           refs_trace)) {
+        double bit_cost_trace;
+        // Evaluate the trace-backwards coding.
+        VP8LHistogramCreate(histo, refs_trace, *cache_bits);
+        bit_cost_trace = VP8LHistogramEstimateBits(histo);
+        if (bit_cost_trace < bit_cost_lz77) {
+          best = refs_trace;
+        }
+      }
+    }
+  } else {
+    best = refs_rle;
+  }
+
+  BackwardReferences2DLocality(width, best);
+
+ Error:
+  VP8LFreeHistogram(histo);
+  return best;
+}
+
+VP8LBackwardRefs* VP8LGetBackwardReferences(
+    int width, int height, const uint32_t* const argb, int quality,
+    int low_effort, int* const cache_bits, VP8LHashChain* const hash_chain,
+    VP8LBackwardRefs refs_array[2]) {
+  if (low_effort) {
+    return GetBackwardReferencesLowEffort(width, height, argb, quality,
+                                          cache_bits, hash_chain, refs_array);
+  } else {
+    return GetBackwardReferences(width, height, argb, quality, cache_bits,
+                                 hash_chain, refs_array);
+  }
+}
diff --git a/src/3rdparty/libwebp/src/enc/backward_references.h b/src/3rdparty/libwebp/src/enc/backward_references.h
index c2c81c5..daa084d 100644
--- a/src/3rdparty/libwebp/src/enc/backward_references.h
+++ b/src/3rdparty/libwebp/src/enc/backward_references.h
@@ -22,13 +22,8 @@
 extern "C" {
 #endif
 
-// The spec allows 11, we use 9 bits to reduce memory consumption in encoding.
-// Having 9 instead of 11 only removes about 0.25 % of compression density.
-#define MAX_COLOR_CACHE_BITS 9
-
-// Max ever number of codes we'll use:
-#define PIX_OR_COPY_CODES_MAX \
-    (NUM_LITERAL_CODES + NUM_LENGTH_CODES + (1 << MAX_COLOR_CACHE_BITS))
+// The format allows up to 11 cache bits; this encoder limits itself to 10.
+#define MAX_COLOR_CACHE_BITS 10
 
 // -----------------------------------------------------------------------------
 // PixOrCopy
@@ -190,21 +185,16 @@ static WEBP_INLINE void VP8LRefsCursorNext(VP8LRefsCursor* const c) {
 // Main entry points
 
 // Evaluates best possible backward references for specified quality.
-// Further optimize for 2D locality if use_2d_locality flag is set.
+// The input cache_bits to 'VP8LGetBackwardReferences' sets the maximum cache
+// bits to use (passing 0 implies disabling the local color cache).
+// The optimal number of cache bits is computed and written back to *cache_bits.
 // The return value is the pointer to the best of the two backward refs viz,
 // refs[0] or refs[1].
 VP8LBackwardRefs* VP8LGetBackwardReferences(
     int width, int height, const uint32_t* const argb, int quality,
-    int cache_bits, int use_2d_locality, VP8LHashChain* const hash_chain,
+    int low_effort, int* const cache_bits, VP8LHashChain* const hash_chain,
     VP8LBackwardRefs refs[2]);
 
-// Produce an estimate for a good color cache size for the image.
-int VP8LCalculateEstimateForCacheSize(const uint32_t* const argb,
-                                      int xsize, int ysize, int quality,
-                                      VP8LHashChain* const hash_chain,
-                                      VP8LBackwardRefs* const ref,
-                                      int* const best_cache_bits);
-
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/3rdparty/libwebp/src/enc/config.c b/src/3rdparty/libwebp/src/enc/config.c
index 53a3bb2..f9f7961 100644
--- a/src/3rdparty/libwebp/src/enc/config.c
+++ b/src/3rdparty/libwebp/src/enc/config.c
@@ -43,10 +43,15 @@ int WebPConfigInitInternal(WebPConfig* config,
   config->alpha_filtering = 1;
   config->alpha_quality = 100;
   config->lossless = 0;
+  config->exact = 0;
   config->image_hint = WEBP_HINT_DEFAULT;
   config->emulate_jpeg_size = 0;
   config->thread_level = 0;
   config->low_memory = 0;
+  config->near_lossless = 100;
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+  config->delta_palettization = 0;
+#endif // WEBP_EXPERIMENTAL_FEATURES
 
   // TODO(skal): tune.
   switch (preset) {
@@ -111,11 +116,7 @@ int WebPValidateConfig(const WebPConfig* config) {
     return 0;
   if (config->show_compressed < 0 || config->show_compressed > 1)
     return 0;
-#if WEBP_ENCODER_ABI_VERSION > 0x0204
   if (config->preprocessing < 0 || config->preprocessing > 7)
-#else
-  if (config->preprocessing < 0 || config->preprocessing > 3)
-#endif
     return 0;
   if (config->partitions < 0 || config->partitions > 3)
     return 0;
@@ -129,6 +130,8 @@ int WebPValidateConfig(const WebPConfig* config) {
     return 0;
   if (config->lossless < 0 || config->lossless > 1)
     return 0;
+  if (config->near_lossless < 0 || config->near_lossless > 100)
+    return 0;
   if (config->image_hint >= WEBP_HINT_LAST)
     return 0;
   if (config->emulate_jpeg_size < 0 || config->emulate_jpeg_size > 1)
@@ -137,12 +140,17 @@ int WebPValidateConfig(const WebPConfig* config) {
     return 0;
   if (config->low_memory < 0 || config->low_memory > 1)
     return 0;
+  if (config->exact < 0 || config->exact > 1)
+    return 0;
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+  if (config->delta_palettization < 0 || config->delta_palettization > 1)
+    return 0;
+#endif  // WEBP_EXPERIMENTAL_FEATURES
   return 1;
 }
 
 //------------------------------------------------------------------------------
 
-#if WEBP_ENCODER_ABI_VERSION > 0x0202
 #define MAX_LEVEL 9
 
 // Mapping between -z level and -m / -q parameter settings.
@@ -161,6 +169,5 @@ int WebPConfigLosslessPreset(WebPConfig* config, int level) {
   config->quality = kLosslessPresets[level].quality_;
   return 1;
 }
-#endif
 
 //------------------------------------------------------------------------------
diff --git a/src/3rdparty/libwebp/src/enc/cost.c b/src/3rdparty/libwebp/src/enc/cost.c
index 9d2cc01..ae7fe01 100644
--- a/src/3rdparty/libwebp/src/enc/cost.c
+++ b/src/3rdparty/libwebp/src/enc/cost.c
@@ -14,38 +14,6 @@
 #include "./cost.h"
 
 //------------------------------------------------------------------------------
-// Boolean-cost cost table
-
-const uint16_t VP8EntropyCost[256] = {
-  1792, 1792, 1792, 1536, 1536, 1408, 1366, 1280, 1280, 1216,
-  1178, 1152, 1110, 1076, 1061, 1024, 1024,  992,  968,  951,
-   939,  911,  896,  878,  871,  854,  838,  820,  811,  794,
-   786,  768,  768,  752,  740,  732,  720,  709,  704,  690,
-   683,  672,  666,  655,  647,  640,  631,  622,  615,  607,
-   598,  592,  586,  576,  572,  564,  559,  555,  547,  541,
-   534,  528,  522,  512,  512,  504,  500,  494,  488,  483,
-   477,  473,  467,  461,  458,  452,  448,  443,  438,  434,
-   427,  424,  419,  415,  410,  406,  403,  399,  394,  390,
-   384,  384,  377,  374,  370,  366,  362,  359,  355,  351,
-   347,  342,  342,  336,  333,  330,  326,  323,  320,  316,
-   312,  308,  305,  302,  299,  296,  293,  288,  287,  283,
-   280,  277,  274,  272,  268,  266,  262,  256,  256,  256,
-   251,  248,  245,  242,  240,  237,  234,  232,  228,  226,
-   223,  221,  218,  216,  214,  211,  208,  205,  203,  201,
-   198,  196,  192,  191,  188,  187,  183,  181,  179,  176,
-   175,  171,  171,  168,  165,  163,  160,  159,  156,  154,
-   152,  150,  148,  146,  144,  142,  139,  138,  135,  133,
-   131,  128,  128,  125,  123,  121,  119,  117,  115,  113,
-   111,  110,  107,  105,  103,  102,  100,   98,   96,   94,
-    92,   91,   89,   86,   86,   83,   82,   80,   77,   76,
-    74,   73,   71,   69,   67,   66,   64,   63,   61,   59,
-    57,   55,   54,   52,   51,   49,   47,   46,   44,   43,
-    41,   40,   38,   36,   35,   33,   32,   30,   29,   27,
-    25,   24,   22,   21,   19,   18,   16,   15,   13,   12,
-    10,    9,    7,    6,    4,    3
-};
-
-//------------------------------------------------------------------------------
 // Level cost tables
 
 // For each given level, the following table gives the pattern of contexts to
@@ -71,267 +39,6 @@ const uint16_t VP8LevelCodes[MAX_VARIABLE_LEVEL][2] = {
   {0x153, 0x053}, {0x153, 0x053}, {0x153, 0x053}, {0x153, 0x153}
 };
 
-// fixed costs for coding levels, deduce from the coding tree.
-// This is only the part that doesn't depend on the probability state.
-const uint16_t VP8LevelFixedCosts[MAX_LEVEL + 1] = {
-     0,  256,  256,  256,  256,  432,  618,  630,
-   731,  640,  640,  828,  901,  948, 1021, 1101,
-  1174, 1221, 1294, 1042, 1085, 1115, 1158, 1202,
-  1245, 1275, 1318, 1337, 1380, 1410, 1453, 1497,
-  1540, 1570, 1613, 1280, 1295, 1317, 1332, 1358,
-  1373, 1395, 1410, 1454, 1469, 1491, 1506, 1532,
-  1547, 1569, 1584, 1601, 1616, 1638, 1653, 1679,
-  1694, 1716, 1731, 1775, 1790, 1812, 1827, 1853,
-  1868, 1890, 1905, 1727, 1733, 1742, 1748, 1759,
-  1765, 1774, 1780, 1800, 1806, 1815, 1821, 1832,
-  1838, 1847, 1853, 1878, 1884, 1893, 1899, 1910,
-  1916, 1925, 1931, 1951, 1957, 1966, 1972, 1983,
-  1989, 1998, 2004, 2027, 2033, 2042, 2048, 2059,
-  2065, 2074, 2080, 2100, 2106, 2115, 2121, 2132,
-  2138, 2147, 2153, 2178, 2184, 2193, 2199, 2210,
-  2216, 2225, 2231, 2251, 2257, 2266, 2272, 2283,
-  2289, 2298, 2304, 2168, 2174, 2183, 2189, 2200,
-  2206, 2215, 2221, 2241, 2247, 2256, 2262, 2273,
-  2279, 2288, 2294, 2319, 2325, 2334, 2340, 2351,
-  2357, 2366, 2372, 2392, 2398, 2407, 2413, 2424,
-  2430, 2439, 2445, 2468, 2474, 2483, 2489, 2500,
-  2506, 2515, 2521, 2541, 2547, 2556, 2562, 2573,
-  2579, 2588, 2594, 2619, 2625, 2634, 2640, 2651,
-  2657, 2666, 2672, 2692, 2698, 2707, 2713, 2724,
-  2730, 2739, 2745, 2540, 2546, 2555, 2561, 2572,
-  2578, 2587, 2593, 2613, 2619, 2628, 2634, 2645,
-  2651, 2660, 2666, 2691, 2697, 2706, 2712, 2723,
-  2729, 2738, 2744, 2764, 2770, 2779, 2785, 2796,
-  2802, 2811, 2817, 2840, 2846, 2855, 2861, 2872,
-  2878, 2887, 2893, 2913, 2919, 2928, 2934, 2945,
-  2951, 2960, 2966, 2991, 2997, 3006, 3012, 3023,
-  3029, 3038, 3044, 3064, 3070, 3079, 3085, 3096,
-  3102, 3111, 3117, 2981, 2987, 2996, 3002, 3013,
-  3019, 3028, 3034, 3054, 3060, 3069, 3075, 3086,
-  3092, 3101, 3107, 3132, 3138, 3147, 3153, 3164,
-  3170, 3179, 3185, 3205, 3211, 3220, 3226, 3237,
-  3243, 3252, 3258, 3281, 3287, 3296, 3302, 3313,
-  3319, 3328, 3334, 3354, 3360, 3369, 3375, 3386,
-  3392, 3401, 3407, 3432, 3438, 3447, 3453, 3464,
-  3470, 3479, 3485, 3505, 3511, 3520, 3526, 3537,
-  3543, 3552, 3558, 2816, 2822, 2831, 2837, 2848,
-  2854, 2863, 2869, 2889, 2895, 2904, 2910, 2921,
-  2927, 2936, 2942, 2967, 2973, 2982, 2988, 2999,
-  3005, 3014, 3020, 3040, 3046, 3055, 3061, 3072,
-  3078, 3087, 3093, 3116, 3122, 3131, 3137, 3148,
-  3154, 3163, 3169, 3189, 3195, 3204, 3210, 3221,
-  3227, 3236, 3242, 3267, 3273, 3282, 3288, 3299,
-  3305, 3314, 3320, 3340, 3346, 3355, 3361, 3372,
-  3378, 3387, 3393, 3257, 3263, 3272, 3278, 3289,
-  3295, 3304, 3310, 3330, 3336, 3345, 3351, 3362,
-  3368, 3377, 3383, 3408, 3414, 3423, 3429, 3440,
-  3446, 3455, 3461, 3481, 3487, 3496, 3502, 3513,
-  3519, 3528, 3534, 3557, 3563, 3572, 3578, 3589,
-  3595, 3604, 3610, 3630, 3636, 3645, 3651, 3662,
-  3668, 3677, 3683, 3708, 3714, 3723, 3729, 3740,
-  3746, 3755, 3761, 3781, 3787, 3796, 3802, 3813,
-  3819, 3828, 3834, 3629, 3635, 3644, 3650, 3661,
-  3667, 3676, 3682, 3702, 3708, 3717, 3723, 3734,
-  3740, 3749, 3755, 3780, 3786, 3795, 3801, 3812,
-  3818, 3827, 3833, 3853, 3859, 3868, 3874, 3885,
-  3891, 3900, 3906, 3929, 3935, 3944, 3950, 3961,
-  3967, 3976, 3982, 4002, 4008, 4017, 4023, 4034,
-  4040, 4049, 4055, 4080, 4086, 4095, 4101, 4112,
-  4118, 4127, 4133, 4153, 4159, 4168, 4174, 4185,
-  4191, 4200, 4206, 4070, 4076, 4085, 4091, 4102,
-  4108, 4117, 4123, 4143, 4149, 4158, 4164, 4175,
-  4181, 4190, 4196, 4221, 4227, 4236, 4242, 4253,
-  4259, 4268, 4274, 4294, 4300, 4309, 4315, 4326,
-  4332, 4341, 4347, 4370, 4376, 4385, 4391, 4402,
-  4408, 4417, 4423, 4443, 4449, 4458, 4464, 4475,
-  4481, 4490, 4496, 4521, 4527, 4536, 4542, 4553,
-  4559, 4568, 4574, 4594, 4600, 4609, 4615, 4626,
-  4632, 4641, 4647, 3515, 3521, 3530, 3536, 3547,
-  3553, 3562, 3568, 3588, 3594, 3603, 3609, 3620,
-  3626, 3635, 3641, 3666, 3672, 3681, 3687, 3698,
-  3704, 3713, 3719, 3739, 3745, 3754, 3760, 3771,
-  3777, 3786, 3792, 3815, 3821, 3830, 3836, 3847,
-  3853, 3862, 3868, 3888, 3894, 3903, 3909, 3920,
-  3926, 3935, 3941, 3966, 3972, 3981, 3987, 3998,
-  4004, 4013, 4019, 4039, 4045, 4054, 4060, 4071,
-  4077, 4086, 4092, 3956, 3962, 3971, 3977, 3988,
-  3994, 4003, 4009, 4029, 4035, 4044, 4050, 4061,
-  4067, 4076, 4082, 4107, 4113, 4122, 4128, 4139,
-  4145, 4154, 4160, 4180, 4186, 4195, 4201, 4212,
-  4218, 4227, 4233, 4256, 4262, 4271, 4277, 4288,
-  4294, 4303, 4309, 4329, 4335, 4344, 4350, 4361,
-  4367, 4376, 4382, 4407, 4413, 4422, 4428, 4439,
-  4445, 4454, 4460, 4480, 4486, 4495, 4501, 4512,
-  4518, 4527, 4533, 4328, 4334, 4343, 4349, 4360,
-  4366, 4375, 4381, 4401, 4407, 4416, 4422, 4433,
-  4439, 4448, 4454, 4479, 4485, 4494, 4500, 4511,
-  4517, 4526, 4532, 4552, 4558, 4567, 4573, 4584,
-  4590, 4599, 4605, 4628, 4634, 4643, 4649, 4660,
-  4666, 4675, 4681, 4701, 4707, 4716, 4722, 4733,
-  4739, 4748, 4754, 4779, 4785, 4794, 4800, 4811,
-  4817, 4826, 4832, 4852, 4858, 4867, 4873, 4884,
-  4890, 4899, 4905, 4769, 4775, 4784, 4790, 4801,
-  4807, 4816, 4822, 4842, 4848, 4857, 4863, 4874,
-  4880, 4889, 4895, 4920, 4926, 4935, 4941, 4952,
-  4958, 4967, 4973, 4993, 4999, 5008, 5014, 5025,
-  5031, 5040, 5046, 5069, 5075, 5084, 5090, 5101,
-  5107, 5116, 5122, 5142, 5148, 5157, 5163, 5174,
-  5180, 5189, 5195, 5220, 5226, 5235, 5241, 5252,
-  5258, 5267, 5273, 5293, 5299, 5308, 5314, 5325,
-  5331, 5340, 5346, 4604, 4610, 4619, 4625, 4636,
-  4642, 4651, 4657, 4677, 4683, 4692, 4698, 4709,
-  4715, 4724, 4730, 4755, 4761, 4770, 4776, 4787,
-  4793, 4802, 4808, 4828, 4834, 4843, 4849, 4860,
-  4866, 4875, 4881, 4904, 4910, 4919, 4925, 4936,
-  4942, 4951, 4957, 4977, 4983, 4992, 4998, 5009,
-  5015, 5024, 5030, 5055, 5061, 5070, 5076, 5087,
-  5093, 5102, 5108, 5128, 5134, 5143, 5149, 5160,
-  5166, 5175, 5181, 5045, 5051, 5060, 5066, 5077,
-  5083, 5092, 5098, 5118, 5124, 5133, 5139, 5150,
-  5156, 5165, 5171, 5196, 5202, 5211, 5217, 5228,
-  5234, 5243, 5249, 5269, 5275, 5284, 5290, 5301,
-  5307, 5316, 5322, 5345, 5351, 5360, 5366, 5377,
-  5383, 5392, 5398, 5418, 5424, 5433, 5439, 5450,
-  5456, 5465, 5471, 5496, 5502, 5511, 5517, 5528,
-  5534, 5543, 5549, 5569, 5575, 5584, 5590, 5601,
-  5607, 5616, 5622, 5417, 5423, 5432, 5438, 5449,
-  5455, 5464, 5470, 5490, 5496, 5505, 5511, 5522,
-  5528, 5537, 5543, 5568, 5574, 5583, 5589, 5600,
-  5606, 5615, 5621, 5641, 5647, 5656, 5662, 5673,
-  5679, 5688, 5694, 5717, 5723, 5732, 5738, 5749,
-  5755, 5764, 5770, 5790, 5796, 5805, 5811, 5822,
-  5828, 5837, 5843, 5868, 5874, 5883, 5889, 5900,
-  5906, 5915, 5921, 5941, 5947, 5956, 5962, 5973,
-  5979, 5988, 5994, 5858, 5864, 5873, 5879, 5890,
-  5896, 5905, 5911, 5931, 5937, 5946, 5952, 5963,
-  5969, 5978, 5984, 6009, 6015, 6024, 6030, 6041,
-  6047, 6056, 6062, 6082, 6088, 6097, 6103, 6114,
-  6120, 6129, 6135, 6158, 6164, 6173, 6179, 6190,
-  6196, 6205, 6211, 6231, 6237, 6246, 6252, 6263,
-  6269, 6278, 6284, 6309, 6315, 6324, 6330, 6341,
-  6347, 6356, 6362, 6382, 6388, 6397, 6403, 6414,
-  6420, 6429, 6435, 3515, 3521, 3530, 3536, 3547,
-  3553, 3562, 3568, 3588, 3594, 3603, 3609, 3620,
-  3626, 3635, 3641, 3666, 3672, 3681, 3687, 3698,
-  3704, 3713, 3719, 3739, 3745, 3754, 3760, 3771,
-  3777, 3786, 3792, 3815, 3821, 3830, 3836, 3847,
-  3853, 3862, 3868, 3888, 3894, 3903, 3909, 3920,
-  3926, 3935, 3941, 3966, 3972, 3981, 3987, 3998,
-  4004, 4013, 4019, 4039, 4045, 4054, 4060, 4071,
-  4077, 4086, 4092, 3956, 3962, 3971, 3977, 3988,
-  3994, 4003, 4009, 4029, 4035, 4044, 4050, 4061,
-  4067, 4076, 4082, 4107, 4113, 4122, 4128, 4139,
-  4145, 4154, 4160, 4180, 4186, 4195, 4201, 4212,
-  4218, 4227, 4233, 4256, 4262, 4271, 4277, 4288,
-  4294, 4303, 4309, 4329, 4335, 4344, 4350, 4361,
-  4367, 4376, 4382, 4407, 4413, 4422, 4428, 4439,
-  4445, 4454, 4460, 4480, 4486, 4495, 4501, 4512,
-  4518, 4527, 4533, 4328, 4334, 4343, 4349, 4360,
-  4366, 4375, 4381, 4401, 4407, 4416, 4422, 4433,
-  4439, 4448, 4454, 4479, 4485, 4494, 4500, 4511,
-  4517, 4526, 4532, 4552, 4558, 4567, 4573, 4584,
-  4590, 4599, 4605, 4628, 4634, 4643, 4649, 4660,
-  4666, 4675, 4681, 4701, 4707, 4716, 4722, 4733,
-  4739, 4748, 4754, 4779, 4785, 4794, 4800, 4811,
-  4817, 4826, 4832, 4852, 4858, 4867, 4873, 4884,
-  4890, 4899, 4905, 4769, 4775, 4784, 4790, 4801,
-  4807, 4816, 4822, 4842, 4848, 4857, 4863, 4874,
-  4880, 4889, 4895, 4920, 4926, 4935, 4941, 4952,
-  4958, 4967, 4973, 4993, 4999, 5008, 5014, 5025,
-  5031, 5040, 5046, 5069, 5075, 5084, 5090, 5101,
-  5107, 5116, 5122, 5142, 5148, 5157, 5163, 5174,
-  5180, 5189, 5195, 5220, 5226, 5235, 5241, 5252,
-  5258, 5267, 5273, 5293, 5299, 5308, 5314, 5325,
-  5331, 5340, 5346, 4604, 4610, 4619, 4625, 4636,
-  4642, 4651, 4657, 4677, 4683, 4692, 4698, 4709,
-  4715, 4724, 4730, 4755, 4761, 4770, 4776, 4787,
-  4793, 4802, 4808, 4828, 4834, 4843, 4849, 4860,
-  4866, 4875, 4881, 4904, 4910, 4919, 4925, 4936,
-  4942, 4951, 4957, 4977, 4983, 4992, 4998, 5009,
-  5015, 5024, 5030, 5055, 5061, 5070, 5076, 5087,
-  5093, 5102, 5108, 5128, 5134, 5143, 5149, 5160,
-  5166, 5175, 5181, 5045, 5051, 5060, 5066, 5077,
-  5083, 5092, 5098, 5118, 5124, 5133, 5139, 5150,
-  5156, 5165, 5171, 5196, 5202, 5211, 5217, 5228,
-  5234, 5243, 5249, 5269, 5275, 5284, 5290, 5301,
-  5307, 5316, 5322, 5345, 5351, 5360, 5366, 5377,
-  5383, 5392, 5398, 5418, 5424, 5433, 5439, 5450,
-  5456, 5465, 5471, 5496, 5502, 5511, 5517, 5528,
-  5534, 5543, 5549, 5569, 5575, 5584, 5590, 5601,
-  5607, 5616, 5622, 5417, 5423, 5432, 5438, 5449,
-  5455, 5464, 5470, 5490, 5496, 5505, 5511, 5522,
-  5528, 5537, 5543, 5568, 5574, 5583, 5589, 5600,
-  5606, 5615, 5621, 5641, 5647, 5656, 5662, 5673,
-  5679, 5688, 5694, 5717, 5723, 5732, 5738, 5749,
-  5755, 5764, 5770, 5790, 5796, 5805, 5811, 5822,
-  5828, 5837, 5843, 5868, 5874, 5883, 5889, 5900,
-  5906, 5915, 5921, 5941, 5947, 5956, 5962, 5973,
-  5979, 5988, 5994, 5858, 5864, 5873, 5879, 5890,
-  5896, 5905, 5911, 5931, 5937, 5946, 5952, 5963,
-  5969, 5978, 5984, 6009, 6015, 6024, 6030, 6041,
-  6047, 6056, 6062, 6082, 6088, 6097, 6103, 6114,
-  6120, 6129, 6135, 6158, 6164, 6173, 6179, 6190,
-  6196, 6205, 6211, 6231, 6237, 6246, 6252, 6263,
-  6269, 6278, 6284, 6309, 6315, 6324, 6330, 6341,
-  6347, 6356, 6362, 6382, 6388, 6397, 6403, 6414,
-  6420, 6429, 6435, 5303, 5309, 5318, 5324, 5335,
-  5341, 5350, 5356, 5376, 5382, 5391, 5397, 5408,
-  5414, 5423, 5429, 5454, 5460, 5469, 5475, 5486,
-  5492, 5501, 5507, 5527, 5533, 5542, 5548, 5559,
-  5565, 5574, 5580, 5603, 5609, 5618, 5624, 5635,
-  5641, 5650, 5656, 5676, 5682, 5691, 5697, 5708,
-  5714, 5723, 5729, 5754, 5760, 5769, 5775, 5786,
-  5792, 5801, 5807, 5827, 5833, 5842, 5848, 5859,
-  5865, 5874, 5880, 5744, 5750, 5759, 5765, 5776,
-  5782, 5791, 5797, 5817, 5823, 5832, 5838, 5849,
-  5855, 5864, 5870, 5895, 5901, 5910, 5916, 5927,
-  5933, 5942, 5948, 5968, 5974, 5983, 5989, 6000,
-  6006, 6015, 6021, 6044, 6050, 6059, 6065, 6076,
-  6082, 6091, 6097, 6117, 6123, 6132, 6138, 6149,
-  6155, 6164, 6170, 6195, 6201, 6210, 6216, 6227,
-  6233, 6242, 6248, 6268, 6274, 6283, 6289, 6300,
-  6306, 6315, 6321, 6116, 6122, 6131, 6137, 6148,
-  6154, 6163, 6169, 6189, 6195, 6204, 6210, 6221,
-  6227, 6236, 6242, 6267, 6273, 6282, 6288, 6299,
-  6305, 6314, 6320, 6340, 6346, 6355, 6361, 6372,
-  6378, 6387, 6393, 6416, 6422, 6431, 6437, 6448,
-  6454, 6463, 6469, 6489, 6495, 6504, 6510, 6521,
-  6527, 6536, 6542, 6567, 6573, 6582, 6588, 6599,
-  6605, 6614, 6620, 6640, 6646, 6655, 6661, 6672,
-  6678, 6687, 6693, 6557, 6563, 6572, 6578, 6589,
-  6595, 6604, 6610, 6630, 6636, 6645, 6651, 6662,
-  6668, 6677, 6683, 6708, 6714, 6723, 6729, 6740,
-  6746, 6755, 6761, 6781, 6787, 6796, 6802, 6813,
-  6819, 6828, 6834, 6857, 6863, 6872, 6878, 6889,
-  6895, 6904, 6910, 6930, 6936, 6945, 6951, 6962,
-  6968, 6977, 6983, 7008, 7014, 7023, 7029, 7040,
-  7046, 7055, 7061, 7081, 7087, 7096, 7102, 7113,
-  7119, 7128, 7134, 6392, 6398, 6407, 6413, 6424,
-  6430, 6439, 6445, 6465, 6471, 6480, 6486, 6497,
-  6503, 6512, 6518, 6543, 6549, 6558, 6564, 6575,
-  6581, 6590, 6596, 6616, 6622, 6631, 6637, 6648,
-  6654, 6663, 6669, 6692, 6698, 6707, 6713, 6724,
-  6730, 6739, 6745, 6765, 6771, 6780, 6786, 6797,
-  6803, 6812, 6818, 6843, 6849, 6858, 6864, 6875,
-  6881, 6890, 6896, 6916, 6922, 6931, 6937, 6948,
-  6954, 6963, 6969, 6833, 6839, 6848, 6854, 6865,
-  6871, 6880, 6886, 6906, 6912, 6921, 6927, 6938,
-  6944, 6953, 6959, 6984, 6990, 6999, 7005, 7016,
-  7022, 7031, 7037, 7057, 7063, 7072, 7078, 7089,
-  7095, 7104, 7110, 7133, 7139, 7148, 7154, 7165,
-  7171, 7180, 7186, 7206, 7212, 7221, 7227, 7238,
-  7244, 7253, 7259, 7284, 7290, 7299, 7305, 7316,
-  7322, 7331, 7337, 7357, 7363, 7372, 7378, 7389,
-  7395, 7404, 7410, 7205, 7211, 7220, 7226, 7237,
-  7243, 7252, 7258, 7278, 7284, 7293, 7299, 7310,
-  7316, 7325, 7331, 7356, 7362, 7371, 7377, 7388,
-  7394, 7403, 7409, 7429, 7435, 7444, 7450, 7461,
-  7467, 7476, 7482, 7505, 7511, 7520, 7526, 7537,
-  7543, 7552, 7558, 7578, 7584, 7593, 7599, 7610,
-  7616, 7625, 7631, 7656, 7662, 7671, 7677, 7688,
-  7694, 7703, 7709, 7729, 7735, 7744, 7750, 7761
-};
-
 static int VariableLevelCost(int level, const uint8_t probas[NUM_PROBAS]) {
   int pattern = VP8LevelCodes[level - 1][0];
   int bits = VP8LevelCodes[level - 1][1];
@@ -350,12 +57,13 @@ static int VariableLevelCost(int level, const uint8_t probas[NUM_PROBAS]) {
 //------------------------------------------------------------------------------
 // Pre-calc level costs once for all
 
-void VP8CalculateLevelCosts(VP8Proba* const proba) {
+void VP8CalculateLevelCosts(VP8EncProba* const proba) {
   int ctype, band, ctx;
 
   if (!proba->dirty_) return;  // nothing to do.
 
   for (ctype = 0; ctype < NUM_TYPES; ++ctype) {
+    int n;
     for (band = 0; band < NUM_BANDS; ++band) {
       for (ctx = 0; ctx < NUM_CTX; ++ctx) {
         const uint8_t* const p = proba->coeffs_[ctype][band][ctx];
@@ -371,6 +79,12 @@ void VP8CalculateLevelCosts(VP8Proba* const proba) {
         // actually constant.
       }
     }
+    for (n = 0; n < 16; ++n) {    // replicate bands. We don't need to sentinel.
+      for (ctx = 0; ctx < NUM_CTX; ++ctx) {
+        proba->remapped_costs_[ctype][n][ctx] =
+            proba->level_cost_[ctype][VP8EncBands[n]][ctx];
+      }
+    }
   }
   proba->dirty_ = 0;
 }
@@ -487,66 +201,6 @@ const uint16_t VP8FixedCostsI4[NUM_BMODES][NUM_BMODES][NUM_BMODES] = {
 };
 
 //------------------------------------------------------------------------------
-// Mode costs
-
-static int GetResidualCost(int ctx0, const VP8Residual* const res) {
-  int n = res->first;
-  // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
-  const int p0 = res->prob[n][ctx0][0];
-  const uint16_t* t = res->cost[n][ctx0];
-  // bit_cost(1, p0) is already incorporated in t[] tables, but only if ctx != 0
-  // (as required by the syntax). For ctx0 == 0, we need to add it here or it'll
-  // be missing during the loop.
-  int cost = (ctx0 == 0) ? VP8BitCost(1, p0) : 0;
-
-  if (res->last < 0) {
-    return VP8BitCost(0, p0);
-  }
-  for (; n < res->last; ++n) {
-    const int v = abs(res->coeffs[n]);
-    const int b = VP8EncBands[n + 1];
-    const int ctx = (v >= 2) ? 2 : v;
-    cost += VP8LevelCost(t, v);
-    t = res->cost[b][ctx];
-  }
-  // Last coefficient is always non-zero
-  {
-    const int v = abs(res->coeffs[n]);
-    assert(v != 0);
-    cost += VP8LevelCost(t, v);
-    if (n < 15) {
-      const int b = VP8EncBands[n + 1];
-      const int ctx = (v == 1) ? 1 : 2;
-      const int last_p0 = res->prob[b][ctx][0];
-      cost += VP8BitCost(0, last_p0);
-    }
-  }
-  return cost;
-}
-
-//------------------------------------------------------------------------------
-// init function
-
-#if defined(WEBP_USE_MIPS32)
-extern int VP8GetResidualCostMIPS32(int ctx0, const VP8Residual* const res);
-#endif  // WEBP_USE_MIPS32
-
-// TODO(skal): this, and GetResidualCost(), should probably go somewhere
-// under src/dsp/ at some point.
-VP8GetResidualCostFunc VP8GetResidualCost;
-
-void VP8GetResidualCostInit(void) {
-  VP8GetResidualCost = GetResidualCost;
-  if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_MIPS32)
-    if (VP8GetCPUInfo(kMIPS32)) {
-      VP8GetResidualCost = VP8GetResidualCostMIPS32;
-    }
-#endif
-  }
-}
-
-//------------------------------------------------------------------------------
 // helper functions for residuals struct VP8Residual.
 
 void VP8InitResidual(int first, int coeff_type,
@@ -554,45 +208,10 @@ void VP8InitResidual(int first, int coeff_type,
   res->coeff_type = coeff_type;
   res->prob  = enc->proba_.coeffs_[coeff_type];
   res->stats = enc->proba_.stats_[coeff_type];
-  res->cost  = enc->proba_.level_cost_[coeff_type];
+  res->costs = enc->proba_.remapped_costs_[coeff_type];
   res->first = first;
 }
 
-static void SetResidualCoeffs(const int16_t* const coeffs,
-                              VP8Residual* const res) {
-  int n;
-  res->last = -1;
-  assert(res->first == 0 || coeffs[0] == 0);
-  for (n = 15; n >= 0; --n) {
-    if (coeffs[n]) {
-      res->last = n;
-      break;
-    }
-  }
-  res->coeffs = coeffs;
-}
-
-//------------------------------------------------------------------------------
-// init function
-
-#if defined(WEBP_USE_SSE2)
-extern void VP8SetResidualCoeffsSSE2(const int16_t* const coeffs,
-                                     VP8Residual* const res);
-#endif  // WEBP_USE_SSE2
-
-VP8SetResidualCoeffsFunc VP8SetResidualCoeffs;
-
-void VP8SetResidualCoeffsInit(void) {
-  VP8SetResidualCoeffs = SetResidualCoeffs;
-  if (VP8GetCPUInfo != NULL) {
-#if defined(WEBP_USE_SSE2)
-    if (VP8GetCPUInfo(kSSE2)) {
-      VP8SetResidualCoeffs = VP8SetResidualCoeffsSSE2;
-    }
-#endif
-  }
-}
-
 //------------------------------------------------------------------------------
 // Mode costs
 
diff --git a/src/3rdparty/libwebp/src/enc/cost.h b/src/3rdparty/libwebp/src/enc/cost.h
index 4e55895..20960d6 100644
--- a/src/3rdparty/libwebp/src/enc/cost.h
+++ b/src/3rdparty/libwebp/src/enc/cost.h
@@ -24,46 +24,31 @@ extern "C" {
 
 // On-the-fly info about the current set of residuals. Handy to avoid
 // passing zillions of params.
-typedef struct {
+typedef struct VP8Residual VP8Residual;
+struct VP8Residual {
   int first;
   int last;
   const int16_t* coeffs;
 
   int coeff_type;
-  ProbaArray* prob;
-  StatsArray* stats;
-  CostArray*  cost;
-} VP8Residual;
+  ProbaArray*   prob;
+  StatsArray*   stats;
+  CostArrayPtr  costs;
+};
 
 void VP8InitResidual(int first, int coeff_type,
                      VP8Encoder* const enc, VP8Residual* const res);
 
-typedef void (*VP8SetResidualCoeffsFunc)(const int16_t* const coeffs,
-                                         VP8Residual* const res);
-extern VP8SetResidualCoeffsFunc VP8SetResidualCoeffs;
-
-void VP8SetResidualCoeffsInit(void);  // must be called first
-
 int VP8RecordCoeffs(int ctx, const VP8Residual* const res);
 
-// approximate cost per level:
-extern const uint16_t VP8LevelFixedCosts[MAX_LEVEL + 1];
-extern const uint16_t VP8EntropyCost[256];        // 8bit fixed-point log(p)
-
 // Cost of coding one event with probability 'proba'.
 static WEBP_INLINE int VP8BitCost(int bit, uint8_t proba) {
   return !bit ? VP8EntropyCost[proba] : VP8EntropyCost[255 - proba];
 }
 
-// Cost calculation function.
-typedef int (*VP8GetResidualCostFunc)(int ctx0, const VP8Residual* const res);
-extern VP8GetResidualCostFunc VP8GetResidualCost;
-
-void VP8GetResidualCostInit(void);  // must be called first
-
 // Level cost calculations
 extern const uint16_t VP8LevelCodes[MAX_VARIABLE_LEVEL][2];
-void VP8CalculateLevelCosts(VP8Proba* const proba);
+void VP8CalculateLevelCosts(VP8EncProba* const proba);
 static WEBP_INLINE int VP8LevelCost(const uint16_t* const table, int level) {
   return VP8LevelFixedCosts[level]
        + table[(level > MAX_VARIABLE_LEVEL) ? MAX_VARIABLE_LEVEL : level];
diff --git a/src/3rdparty/libwebp/src/enc/delta_palettization.c b/src/3rdparty/libwebp/src/enc/delta_palettization.c
new file mode 100644
index 0000000..062e588
--- /dev/null
+++ b/src/3rdparty/libwebp/src/enc/delta_palettization.c
@@ -0,0 +1,455 @@
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Author: Mislav Bradac (mislavm@google.com)
+//
+
+#include "./delta_palettization.h"
+
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+#include "../webp/types.h"
+#include "../dsp/lossless.h"
+
+#define MK_COL(r, g, b) (((r) << 16) + ((g) << 8) + (b))
+
+// The format allows palettes of up to 256 entries, but more palette entries
+// produce higher entropy. In the future it will probably be useful to add more
+// entries that are far from the origin of the palette, or to choose the
+// remaining entries dynamically.
+#define DELTA_PALETTE_SIZE 226
+
+// Palette used for delta_palettization. Entries are roughly sorted by distance
+// of their signed equivalents from the origin.
+static const uint32_t kDeltaPalette[DELTA_PALETTE_SIZE] = {
+  MK_COL(0u, 0u, 0u),
+  MK_COL(255u, 255u, 255u),
+  MK_COL(1u, 1u, 1u),
+  MK_COL(254u, 254u, 254u),
+  MK_COL(2u, 2u, 2u),
+  MK_COL(4u, 4u, 4u),
+  MK_COL(252u, 252u, 252u),
+  MK_COL(250u, 0u, 0u),
+  MK_COL(0u, 250u, 0u),
+  MK_COL(0u, 0u, 250u),
+  MK_COL(6u, 0u, 0u),
+  MK_COL(0u, 6u, 0u),
+  MK_COL(0u, 0u, 6u),
+  MK_COL(0u, 0u, 248u),
+  MK_COL(0u, 0u, 8u),
+  MK_COL(0u, 248u, 0u),
+  MK_COL(0u, 248u, 248u),
+  MK_COL(0u, 248u, 8u),
+  MK_COL(0u, 8u, 0u),
+  MK_COL(0u, 8u, 248u),
+  MK_COL(0u, 8u, 8u),
+  MK_COL(8u, 8u, 8u),
+  MK_COL(248u, 0u, 0u),
+  MK_COL(248u, 0u, 248u),
+  MK_COL(248u, 0u, 8u),
+  MK_COL(248u, 248u, 0u),
+  MK_COL(248u, 8u, 0u),
+  MK_COL(8u, 0u, 0u),
+  MK_COL(8u, 0u, 248u),
+  MK_COL(8u, 0u, 8u),
+  MK_COL(8u, 248u, 0u),
+  MK_COL(8u, 8u, 0u),
+  MK_COL(23u, 23u, 23u),
+  MK_COL(13u, 13u, 13u),
+  MK_COL(232u, 232u, 232u),
+  MK_COL(244u, 244u, 244u),
+  MK_COL(245u, 245u, 250u),
+  MK_COL(50u, 50u, 50u),
+  MK_COL(204u, 204u, 204u),
+  MK_COL(236u, 236u, 236u),
+  MK_COL(16u, 16u, 16u),
+  MK_COL(240u, 16u, 16u),
+  MK_COL(16u, 240u, 16u),
+  MK_COL(240u, 240u, 16u),
+  MK_COL(16u, 16u, 240u),
+  MK_COL(240u, 16u, 240u),
+  MK_COL(16u, 240u, 240u),
+  MK_COL(240u, 240u, 240u),
+  MK_COL(0u, 0u, 232u),
+  MK_COL(0u, 232u, 0u),
+  MK_COL(232u, 0u, 0u),
+  MK_COL(0u, 0u, 24u),
+  MK_COL(0u, 24u, 0u),
+  MK_COL(24u, 0u, 0u),
+  MK_COL(32u, 32u, 32u),
+  MK_COL(224u, 32u, 32u),
+  MK_COL(32u, 224u, 32u),
+  MK_COL(224u, 224u, 32u),
+  MK_COL(32u, 32u, 224u),
+  MK_COL(224u, 32u, 224u),
+  MK_COL(32u, 224u, 224u),
+  MK_COL(224u, 224u, 224u),
+  MK_COL(0u, 0u, 176u),
+  MK_COL(0u, 0u, 80u),
+  MK_COL(0u, 176u, 0u),
+  MK_COL(0u, 176u, 176u),
+  MK_COL(0u, 176u, 80u),
+  MK_COL(0u, 80u, 0u),
+  MK_COL(0u, 80u, 176u),
+  MK_COL(0u, 80u, 80u),
+  MK_COL(176u, 0u, 0u),
+  MK_COL(176u, 0u, 176u),
+  MK_COL(176u, 0u, 80u),
+  MK_COL(176u, 176u, 0u),
+  MK_COL(176u, 80u, 0u),
+  MK_COL(80u, 0u, 0u),
+  MK_COL(80u, 0u, 176u),
+  MK_COL(80u, 0u, 80u),
+  MK_COL(80u, 176u, 0u),
+  MK_COL(80u, 80u, 0u),
+  MK_COL(0u, 0u, 152u),
+  MK_COL(0u, 0u, 104u),
+  MK_COL(0u, 152u, 0u),
+  MK_COL(0u, 152u, 152u),
+  MK_COL(0u, 152u, 104u),
+  MK_COL(0u, 104u, 0u),
+  MK_COL(0u, 104u, 152u),
+  MK_COL(0u, 104u, 104u),
+  MK_COL(152u, 0u, 0u),
+  MK_COL(152u, 0u, 152u),
+  MK_COL(152u, 0u, 104u),
+  MK_COL(152u, 152u, 0u),
+  MK_COL(152u, 104u, 0u),
+  MK_COL(104u, 0u, 0u),
+  MK_COL(104u, 0u, 152u),
+  MK_COL(104u, 0u, 104u),
+  MK_COL(104u, 152u, 0u),
+  MK_COL(104u, 104u, 0u),
+  MK_COL(216u, 216u, 216u),
+  MK_COL(216u, 216u, 40u),
+  MK_COL(216u, 216u, 176u),
+  MK_COL(216u, 216u, 80u),
+  MK_COL(216u, 40u, 216u),
+  MK_COL(216u, 40u, 40u),
+  MK_COL(216u, 40u, 176u),
+  MK_COL(216u, 40u, 80u),
+  MK_COL(216u, 176u, 216u),
+  MK_COL(216u, 176u, 40u),
+  MK_COL(216u, 176u, 176u),
+  MK_COL(216u, 176u, 80u),
+  MK_COL(216u, 80u, 216u),
+  MK_COL(216u, 80u, 40u),
+  MK_COL(216u, 80u, 176u),
+  MK_COL(216u, 80u, 80u),
+  MK_COL(40u, 216u, 216u),
+  MK_COL(40u, 216u, 40u),
+  MK_COL(40u, 216u, 176u),
+  MK_COL(40u, 216u, 80u),
+  MK_COL(40u, 40u, 216u),
+  MK_COL(40u, 40u, 40u),
+  MK_COL(40u, 40u, 176u),
+  MK_COL(40u, 40u, 80u),
+  MK_COL(40u, 176u, 216u),
+  MK_COL(40u, 176u, 40u),
+  MK_COL(40u, 176u, 176u),
+  MK_COL(40u, 176u, 80u),
+  MK_COL(40u, 80u, 216u),
+  MK_COL(40u, 80u, 40u),
+  MK_COL(40u, 80u, 176u),
+  MK_COL(40u, 80u, 80u),
+  MK_COL(80u, 216u, 216u),
+  MK_COL(80u, 216u, 40u),
+  MK_COL(80u, 216u, 176u),
+  MK_COL(80u, 216u, 80u),
+  MK_COL(80u, 40u, 216u),
+  MK_COL(80u, 40u, 40u),
+  MK_COL(80u, 40u, 176u),
+  MK_COL(80u, 40u, 80u),
+  MK_COL(80u, 176u, 216u),
+  MK_COL(80u, 176u, 40u),
+  MK_COL(80u, 176u, 176u),
+  MK_COL(80u, 176u, 80u),
+  MK_COL(80u, 80u, 216u),
+  MK_COL(80u, 80u, 40u),
+  MK_COL(80u, 80u, 176u),
+  MK_COL(80u, 80u, 80u),
+  MK_COL(0u, 0u, 192u),
+  MK_COL(0u, 0u, 64u),
+  MK_COL(0u, 0u, 128u),
+  MK_COL(0u, 192u, 0u),
+  MK_COL(0u, 192u, 192u),
+  MK_COL(0u, 192u, 64u),
+  MK_COL(0u, 192u, 128u),
+  MK_COL(0u, 64u, 0u),
+  MK_COL(0u, 64u, 192u),
+  MK_COL(0u, 64u, 64u),
+  MK_COL(0u, 64u, 128u),
+  MK_COL(0u, 128u, 0u),
+  MK_COL(0u, 128u, 192u),
+  MK_COL(0u, 128u, 64u),
+  MK_COL(0u, 128u, 128u),
+  MK_COL(176u, 216u, 216u),
+  MK_COL(176u, 216u, 40u),
+  MK_COL(176u, 216u, 176u),
+  MK_COL(176u, 216u, 80u),
+  MK_COL(176u, 40u, 216u),
+  MK_COL(176u, 40u, 40u),
+  MK_COL(176u, 40u, 176u),
+  MK_COL(176u, 40u, 80u),
+  MK_COL(176u, 176u, 216u),
+  MK_COL(176u, 176u, 40u),
+  MK_COL(176u, 176u, 176u),
+  MK_COL(176u, 176u, 80u),
+  MK_COL(176u, 80u, 216u),
+  MK_COL(176u, 80u, 40u),
+  MK_COL(176u, 80u, 176u),
+  MK_COL(176u, 80u, 80u),
+  MK_COL(192u, 0u, 0u),
+  MK_COL(192u, 0u, 192u),
+  MK_COL(192u, 0u, 64u),
+  MK_COL(192u, 0u, 128u),
+  MK_COL(192u, 192u, 0u),
+  MK_COL(192u, 192u, 192u),
+  MK_COL(192u, 192u, 64u),
+  MK_COL(192u, 192u, 128u),
+  MK_COL(192u, 64u, 0u),
+  MK_COL(192u, 64u, 192u),
+  MK_COL(192u, 64u, 64u),
+  MK_COL(192u, 64u, 128u),
+  MK_COL(192u, 128u, 0u),
+  MK_COL(192u, 128u, 192u),
+  MK_COL(192u, 128u, 64u),
+  MK_COL(192u, 128u, 128u),
+  MK_COL(64u, 0u, 0u),
+  MK_COL(64u, 0u, 192u),
+  MK_COL(64u, 0u, 64u),
+  MK_COL(64u, 0u, 128u),
+  MK_COL(64u, 192u, 0u),
+  MK_COL(64u, 192u, 192u),
+  MK_COL(64u, 192u, 64u),
+  MK_COL(64u, 192u, 128u),
+  MK_COL(64u, 64u, 0u),
+  MK_COL(64u, 64u, 192u),
+  MK_COL(64u, 64u, 64u),
+  MK_COL(64u, 64u, 128u),
+  MK_COL(64u, 128u, 0u),
+  MK_COL(64u, 128u, 192u),
+  MK_COL(64u, 128u, 64u),
+  MK_COL(64u, 128u, 128u),
+  MK_COL(128u, 0u, 0u),
+  MK_COL(128u, 0u, 192u),
+  MK_COL(128u, 0u, 64u),
+  MK_COL(128u, 0u, 128u),
+  MK_COL(128u, 192u, 0u),
+  MK_COL(128u, 192u, 192u),
+  MK_COL(128u, 192u, 64u),
+  MK_COL(128u, 192u, 128u),
+  MK_COL(128u, 64u, 0u),
+  MK_COL(128u, 64u, 192u),
+  MK_COL(128u, 64u, 64u),
+  MK_COL(128u, 64u, 128u),
+  MK_COL(128u, 128u, 0u),
+  MK_COL(128u, 128u, 192u),
+  MK_COL(128u, 128u, 64u),
+  MK_COL(128u, 128u, 128u),
+};
+
+#undef MK_COL
+
+//------------------------------------------------------------------------------
+// TODO(skal): move the functions to dsp/lossless.c when the correct
+// granularity is found. For now, we'll just copy-paste some useful bits
+// here instead.
+
+// In-place per-channel sum of *a and b, mod 256 (carries are masked off).
+static WEBP_INLINE void AddPixelsEq(uint32_t* a, uint32_t b) {
+  const uint32_t alpha_and_green = (*a & 0xff00ff00u) + (b & 0xff00ff00u);
+  const uint32_t red_and_blue = (*a & 0x00ff00ffu) + (b & 0x00ff00ffu);
+  *a = (alpha_and_green & 0xff00ff00u) | (red_and_blue & 0x00ff00ffu);
+}
+
+static WEBP_INLINE uint32_t Clip255(uint32_t a) {
+  if (a < 256) {  // already within [0, 255]
+    return a;
+  }
+  // Here a >= 256. A small negative int converted to uint32_t has its top
+  // byte set to 0xff (result 0); a positive value < 2^24 has it clear (255).
+  return ~a >> 24;
+}
+
+// Delta palettization functions.
+static WEBP_INLINE int Square(int x) {
+  return x * x;  // used by CalcDist() on channel and intensity differences
+}
+
+static WEBP_INLINE uint32_t Intensity(uint32_t a) {
+  return  // weighted channel sum; weights total 100, so the range is 0..25500
+      30 * ((a >> 16) & 0xff) +  // bits 16..23 (the 'r' slot of MK_COL)
+      59 * ((a >>  8) & 0xff) +  // bits  8..15 (the 'g' slot of MK_COL)
+      11 * ((a >>  0) & 0xff);   // bits  0..7  (the 'b' slot of MK_COL)
+}
+
+static uint32_t CalcDist(uint32_t predicted_value, uint32_t actual_value,
+                         uint32_t palette_entry) {
+  int i;
+  uint32_t distance = 0;
+  AddPixelsEq(&predicted_value, palette_entry);  // candidate reconstruction
+  for (i = 0; i < 32; i += 8) {  // each of the four 8-bit channels
+    const int32_t av = (actual_value >> i) & 0xff;
+    const int32_t pv = (predicted_value >> i) & 0xff;
+    distance += Square(pv - av);
+  }
+  // The squared intensity difference is meant to carry a factor of 10, but
+  // Intensity() returns 100 times the real intensity (so its square is scaled
+  // by 10000); hence the per-channel squared differences are scaled by 1000.
+  distance *= 1000u;
+  distance += Square(Intensity(predicted_value)
+                     - Intensity(actual_value));  // uint32_t diff, int param
+  return distance;
+}
+
+static uint32_t Predict(int x, int y, uint32_t* image) {
+  const uint32_t t = (y == 0) ? ARGB_BLACK : image[x];  // black on row 0
+  const uint32_t l = (x == 0) ? ARGB_BLACK : image[x - 1];  // black at x == 0
+  const uint32_t p =  // per-channel floor average of t and l
+      (((((t >> 24) & 0xff) + ((l >> 24) & 0xff)) / 2) << 24) +
+      (((((t >> 16) & 0xff) + ((l >> 16) & 0xff)) / 2) << 16) +
+      (((((t >>  8) & 0xff) + ((l >>  8) & 0xff)) / 2) <<  8) +
+      (((((t >>  0) & 0xff) + ((l >>  0) & 0xff)) / 2) <<  0);
+  if (x == 0 && y == 0) return ARGB_BLACK;  // origin: nothing to predict from
+  if (x == 0) return t;  // first column: only t is usable
+  if (y == 0) return l;  // first row: only l is usable
+  return p;  // NOTE(review): t/l look like top/left neighbors -- confirm
+}
+
+static WEBP_INLINE int AddSubtractComponentFullWithCoefficient(
+    int a, int b, int c) {  // a negative sum reaches Clip255() as uint32_t
+  return Clip255(a + ((b - c) >> 2));  // a plus (b - c) >> 2, then clamped
+}
+
+static WEBP_INLINE uint32_t ClampedAddSubtractFullWithCoefficient(
+    uint32_t c0, uint32_t c1, uint32_t c2) {  // per-channel c0 + ((c1 - c2) >> 2)
+  const int a = AddSubtractComponentFullWithCoefficient(
+      c0 >> 24, c1 >> 24, c2 >> 24);  // bits 24..31
+  const int r = AddSubtractComponentFullWithCoefficient((c0 >> 16) & 0xff,
+                                                       (c1 >> 16) & 0xff,
+                                                       (c2 >> 16) & 0xff);
+  const int g = AddSubtractComponentFullWithCoefficient((c0 >> 8) & 0xff,
+                                                       (c1 >> 8) & 0xff,
+                                                       (c2 >> 8) & 0xff);
+  const int b = AddSubtractComponentFullWithCoefficient(
+      c0 & 0xff, c1 & 0xff, c2 & 0xff);  // bits 0..7
+  return ((uint32_t)a << 24) | (r << 16) | (g << 8) | b;  // repack the channels
+}
+
+//------------------------------------------------------------------------------
+
+// Returns the index of the palette entry minimizing CalcDist() between the
+// actual pixel 'src' and 'predicted_value' + entry; the lowest index wins ties.
+// palette[0] is read unconditionally, so palette_size must be at least 1.
+// Error propagation and the new_image write are in ApplyBestPaletteEntry().
+static int FindBestPaletteEntry(uint32_t src, uint32_t predicted_value,
+                                const uint32_t palette[], int palette_size) {
+  int i;
+  int idx = 0;  // best index so far
+  uint32_t best_distance = CalcDist(predicted_value, src, palette[0]);
+  for (i = 1; i < palette_size; ++i) {  // linear scan over remaining entries
+    const uint32_t distance = CalcDist(predicted_value, src, palette[i]);
+    if (distance < best_distance) {  // strict '<' keeps the earliest minimum
+      best_distance = distance;
+      idx = i;
+    }
+  }
+  return idx;
+}
+
+static void ApplyBestPaletteEntry(int x, int y,
+                                  uint32_t new_value, uint32_t palette_value,
+                                  uint32_t* src, int src_stride,
+                                  uint32_t* new_image) {  // commits one pixel
+  AddPixelsEq(&new_value, palette_value);  // new_value += palette_value
+  if (x > 0) {  // adjust the left neighbor in src by (new_value - src[x]) >> 2
+    src[x - 1] = ClampedAddSubtractFullWithCoefficient(src[x - 1],
+                                                       new_value, src[x]);
+  }
+  if (y > 0) {  // same adjustment for the entry src_stride before src[x]
+    src[x - src_stride] =
+        ClampedAddSubtractFullWithCoefficient(src[x - src_stride],
+                                              new_value, src[x]);
+  }
+  new_image[x] = new_value;  // record the resulting value at column x
+}
+
+//------------------------------------------------------------------------------
+// Main entry point
+
+static WebPEncodingError ApplyDeltaPalette(uint32_t* src, uint32_t* dst,
+                                           uint32_t src_stride,
+                                           uint32_t dst_stride,
+                                           const uint32_t* palette,
+                                           int palette_size,
+                                           int width, int height,
+                                           int num_passes) {  // see notes below
+  int x, y;
+  WebPEncodingError err = VP8_ENC_OK;
+  uint32_t* new_image = (uint32_t*)WebPSafeMalloc(width, sizeof(*new_image));  // one row of values
+  uint8_t* const tmp_row = (uint8_t*)WebPSafeMalloc(width, sizeof(*tmp_row));  // one row of indices
+  if (new_image == NULL || tmp_row == NULL) {
+    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+    goto Error;  // both pointers are still passed to WebPSafeFree() below
+  }
+
+  while (num_passes--) {  // every pass restarts at the first row of src and dst
+    uint32_t* cur_src = src;
+    uint32_t* cur_dst = dst;
+    for (y = 0; y < height; ++y) {
+      for (x = 0; x < width; ++x) {
+        const uint32_t predicted_value = Predict(x, y, new_image);
+        tmp_row[x] = FindBestPaletteEntry(cur_src[x], predicted_value,  // index is narrowed
+                                          palette, palette_size);       // to uint8_t here
+        ApplyBestPaletteEntry(x, y, predicted_value, palette[tmp_row[x]],
+                              cur_src, src_stride, new_image);  // also writes into src
+      }
+      for (x = 0; x < width; ++x) {
+        cur_dst[x] = palette[tmp_row[x]];  // dst gets the palette value, not the index
+      }
+      cur_src += src_stride;
+      cur_dst += dst_stride;
+    }
+  }
+ Error:  // reached on success as well; releases both scratch rows
+  WebPSafeFree(new_image);
+  WebPSafeFree(tmp_row);
+  return err;  // VP8_ENC_OK unless an allocation failed
+}
+
+// Fills enc->palette_[] from kDeltaPalette and rewrites enc->argb_ with the
+// palette value chosen for each pixel of enc->pic_->argb (two passes).
+WebPEncodingError WebPSearchOptimalDeltaPalette(VP8LEncoder* const enc) {
+  const WebPPicture* const pic = enc->pic_;
+  uint32_t* src = pic->argb;  // written to by ApplyDeltaPalette()
+  uint32_t* dst = enc->argb_;
+  const int width = pic->width;
+  const int height = pic->height;
+
+  WebPEncodingError err = VP8_ENC_OK;
+  memcpy(enc->palette_, kDeltaPalette, sizeof(kDeltaPalette));
+  enc->palette_[DELTA_PALETTE_SIZE - 1] = src[0] - 0xff000000u;  // last entry comes from the first pixel
+  enc->palette_size_ = DELTA_PALETTE_SIZE;
+  err = ApplyDeltaPalette(src, dst, pic->argb_stride, enc->current_width_,  // dst stride: current_width_
+                          enc->palette_, enc->palette_size_,
+                          width, height, 2);  // 2 == num_passes
+  if (err != VP8_ENC_OK) goto Error;  // the label follows directly either way
+
+ Error:
+  return err;
+}
+
+#else  // !WEBP_EXPERIMENTAL_FEATURES
+
+WebPEncodingError WebPSearchOptimalDeltaPalette(VP8LEncoder* const enc) {  // stub for the #else branch
+  (void)enc;  // parameter intentionally unused
+  return VP8_ENC_ERROR_INVALID_CONFIGURATION;  // always fails in this build variant
+}
+
+#endif  // WEBP_EXPERIMENTAL_FEATURES
diff --git a/src/3rdparty/libwebp/src/enc/delta_palettization.h b/src/3rdparty/libwebp/src/enc/delta_palettization.h
new file mode 100644
index 0000000..e41c0c5
--- /dev/null
+++ b/src/3rdparty/libwebp/src/enc/delta_palettization.h
@@ -0,0 +1,25 @@
+// Copyright 2015 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Author: Mislav Bradac (mislavm@google.com)
+//
+
+#ifndef WEBP_ENC_DELTA_PALETTIZATION_H_
+#define WEBP_ENC_DELTA_PALETTIZATION_H_
+
+#include "../webp/encode.h"
+#include "../enc/vp8li.h"
+
+// Replaces enc->argb_[] input by a palettizable approximation of it,
+// and generates optimal enc->palette_[]. Returns VP8_ENC_OK on success.
+// NOTE(review): this was documented as possibly reverting enc->use_palette_ /
+// enc->use_predict_; the implementation in this patch does not -- confirm.
+WebPEncodingError WebPSearchOptimalDeltaPalette(VP8LEncoder* const enc);
+
+#endif  // WEBP_ENC_DELTA_PALETTIZATION_H_
diff --git a/src/3rdparty/libwebp/src/enc/filter.c b/src/3rdparty/libwebp/src/enc/filter.c
index 11db4bd..41813cf 100644
--- a/src/3rdparty/libwebp/src/enc/filter.c
+++ b/src/3rdparty/libwebp/src/enc/filter.c
@@ -85,12 +85,12 @@ static void DoFilter(const VP8EncIterator* const it, int level) {
   const int ilevel = GetILevel(enc->config_->filter_sharpness, level);
   const int limit = 2 * level + ilevel;
 
-  uint8_t* const y_dst = it->yuv_out2_ + Y_OFF;
-  uint8_t* const u_dst = it->yuv_out2_ + U_OFF;
-  uint8_t* const v_dst = it->yuv_out2_ + V_OFF;
+  uint8_t* const y_dst = it->yuv_out2_ + Y_OFF_ENC;
+  uint8_t* const u_dst = it->yuv_out2_ + U_OFF_ENC;
+  uint8_t* const v_dst = it->yuv_out2_ + V_OFF_ENC;
 
   // copy current block to yuv_out2_
-  memcpy(y_dst, it->yuv_out_, YUV_SIZE * sizeof(uint8_t));
+  memcpy(y_dst, it->yuv_out_, YUV_SIZE_ENC * sizeof(uint8_t));
 
   if (enc->filter_hdr_.simple_ == 1) {   // simple
     VP8SimpleHFilter16i(y_dst, BPS, limit);
@@ -195,13 +195,16 @@ static double GetMBSSIM(const uint8_t* yuv1, const uint8_t* yuv2) {
   // compute SSIM in a 10 x 10 window
   for (x = 3; x < 13; x++) {
     for (y = 3; y < 13; y++) {
-      VP8SSIMAccumulate(yuv1 + Y_OFF, BPS, yuv2 + Y_OFF, BPS, x, y, 16, 16, &s);
+      VP8SSIMAccumulate(yuv1 + Y_OFF_ENC, BPS, yuv2 + Y_OFF_ENC, BPS,
+                        x, y, 16, 16, &s);
     }
   }
   for (x = 1; x < 7; x++) {
     for (y = 1; y < 7; y++) {
-      VP8SSIMAccumulate(yuv1 + U_OFF, BPS, yuv2 + U_OFF, BPS, x, y, 8, 8, &s);
-      VP8SSIMAccumulate(yuv1 + V_OFF, BPS, yuv2 + V_OFF, BPS, x, y, 8, 8, &s);
+      VP8SSIMAccumulate(yuv1 + U_OFF_ENC, BPS, yuv2 + U_OFF_ENC, BPS,
+                        x, y, 8, 8, &s);
+      VP8SSIMAccumulate(yuv1 + V_OFF_ENC, BPS, yuv2 + V_OFF_ENC, BPS,
+                        x, y, 8, 8, &s);
     }
   }
   return VP8SSIMGet(&s);
@@ -226,7 +229,7 @@ void VP8StoreFilterStats(VP8EncIterator* const it) {
   int d;
   VP8Encoder* const enc = it->enc_;
   const int s = it->mb_->segment_;
-  const int level0 = enc->dqm_[s].fstrength_;  // TODO: ref_lf_delta[]
+  const int level0 = enc->dqm_[s].fstrength_;
 
   // explore +/-quant range of values around level0
   const int delta_min = -enc->dqm_[s].quant_;
diff --git a/src/3rdparty/libwebp/src/enc/frame.c b/src/3rdparty/libwebp/src/enc/frame.c
index cdf1dab..5b7a40b 100644
--- a/src/3rdparty/libwebp/src/enc/frame.c
+++ b/src/3rdparty/libwebp/src/enc/frame.c
@@ -14,8 +14,9 @@
 #include <string.h>
 #include <math.h>
 
-#include "./vp8enci.h"
 #include "./cost.h"
+#include "./vp8enci.h"
+#include "../dsp/dsp.h"
 #include "../webp/format_constants.h"  // RIFF constants
 
 #define SEGMENT_VISU 0
@@ -81,11 +82,6 @@ static float ComputeNextQ(PassStats* const s) {
 //------------------------------------------------------------------------------
 // Tables for level coding
 
-const uint8_t VP8EncBands[16 + 1] = {
-  0, 1, 2, 3, 6, 4, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7,
-  0  // sentinel
-};
-
 const uint8_t VP8Cat3[] = { 173, 148, 140 };
 const uint8_t VP8Cat4[] = { 176, 155, 140, 135 };
 const uint8_t VP8Cat5[] = { 180, 157, 141, 134, 130 };
@@ -96,7 +92,7 @@ const uint8_t VP8Cat6[] =
 // Reset the statistics about: number of skips, token proba, level cost,...
 
 static void ResetStats(VP8Encoder* const enc) {
-  VP8Proba* const proba = &enc->proba_;
+  VP8EncProba* const proba = &enc->proba_;
   VP8CalculateLevelCosts(proba);
   proba->nb_skip_ = 0;
 }
@@ -112,7 +108,7 @@ static int CalcSkipProba(uint64_t nb, uint64_t total) {
 
 // Returns the bit-cost for coding the skip probability.
 static int FinalizeSkipProba(VP8Encoder* const enc) {
-  VP8Proba* const proba = &enc->proba_;
+  VP8EncProba* const proba = &enc->proba_;
   const int nb_mbs = enc->mb_w_ * enc->mb_h_;
   const int nb_events = proba->nb_skip_;
   int size;
@@ -140,11 +136,11 @@ static int BranchCost(int nb, int total, int proba) {
 }
 
 static void ResetTokenStats(VP8Encoder* const enc) {
-  VP8Proba* const proba = &enc->proba_;
+  VP8EncProba* const proba = &enc->proba_;
   memset(proba->stats_, 0, sizeof(proba->stats_));
 }
 
-static int FinalizeTokenProbas(VP8Proba* const proba) {
+static int FinalizeTokenProbas(VP8EncProba* const proba) {
   int has_changed = 0;
   int size = 0;
   int t, b, c, p;
@@ -476,9 +472,9 @@ static void StoreSSE(const VP8EncIterator* const it) {
   const uint8_t* const in = it->yuv_in_;
   const uint8_t* const out = it->yuv_out_;
   // Note: not totally accurate at boundary. And doesn't include in-loop filter.
-  enc->sse_[0] += VP8SSE16x16(in + Y_OFF, out + Y_OFF);
-  enc->sse_[1] += VP8SSE8x8(in + U_OFF, out + U_OFF);
-  enc->sse_[2] += VP8SSE8x8(in + V_OFF, out + V_OFF);
+  enc->sse_[0] += VP8SSE16x16(in + Y_OFF_ENC, out + Y_OFF_ENC);
+  enc->sse_[1] += VP8SSE8x8(in + U_OFF_ENC, out + U_OFF_ENC);
+  enc->sse_[2] += VP8SSE8x8(in + V_OFF_ENC, out + V_OFF_ENC);
   enc->sse_count_ += 16 * 16;
 }
 
@@ -511,9 +507,9 @@ static void StoreSideInfo(const VP8EncIterator* const it) {
     }
   }
 #if SEGMENT_VISU  // visualize segments and prediction modes
-  SetBlock(it->yuv_out_ + Y_OFF, mb->segment_ * 64, 16);
-  SetBlock(it->yuv_out_ + U_OFF, it->preds_[0] * 64, 8);
-  SetBlock(it->yuv_out_ + V_OFF, mb->uv_mode_ * 64, 8);
+  SetBlock(it->yuv_out_ + Y_OFF_ENC, mb->segment_ * 64, 16);
+  SetBlock(it->yuv_out_ + U_OFF_ENC, it->preds_[0] * 64, 8);
+  SetBlock(it->yuv_out_ + V_OFF_ENC, mb->uv_mode_ * 64, 8);
 #endif
 }
 
@@ -743,7 +739,7 @@ int VP8EncTokenLoop(VP8Encoder* const enc) {
   int num_pass_left = enc->config_->pass;
   const int do_search = enc->do_search_;
   VP8EncIterator it;
-  VP8Proba* const proba = &enc->proba_;
+  VP8EncProba* const proba = &enc->proba_;
   const VP8RDLevel rd_opt = enc->rd_opt_level_;
   const uint64_t pixel_count = enc->mb_w_ * enc->mb_h_ * 384;
   PassStats stats;
diff --git a/src/3rdparty/libwebp/src/enc/histogram.c b/src/3rdparty/libwebp/src/enc/histogram.c
index a2266b4..869882d 100644
--- a/src/3rdparty/libwebp/src/enc/histogram.c
+++ b/src/3rdparty/libwebp/src/enc/histogram.c
@@ -20,9 +20,6 @@
 #include "../dsp/lossless.h"
 #include "../utils/utils.h"
 
-#define ALIGN_CST 15
-#define DO_ALIGN(PTR) ((uintptr_t)((PTR) + ALIGN_CST) & ~ALIGN_CST)
-
 #define MAX_COST 1.e38
 
 // Number of partitions for the three dominant (literal, red and blue) symbol
@@ -30,6 +27,8 @@
 #define NUM_PARTITIONS 4
 // The size of the bin-hash corresponding to the three dominant costs.
 #define BIN_SIZE (NUM_PARTITIONS * NUM_PARTITIONS * NUM_PARTITIONS)
+// Maximum number of histograms allowed in greedy combining algorithm.
+#define MAX_HISTO_GREEDY 100
 
 static void HistogramClear(VP8LHistogram* const p) {
   uint32_t* const literal = p->literal_;
@@ -40,6 +39,13 @@ static void HistogramClear(VP8LHistogram* const p) {
   p->literal_ = literal;
 }
 
+// Swaps the pointers *A and *B; the histograms they point to are not copied.
+static void HistogramSwap(VP8LHistogram** const A, VP8LHistogram** const B) {
+  VP8LHistogram* const tmp = *A;
+  *A = *B;
+  *B = tmp;
+}
+
 static void HistogramCopy(const VP8LHistogram* const src,
                           VP8LHistogram* const dst) {
   uint32_t* const dst_literal = dst->literal_;
@@ -106,7 +112,8 @@ VP8LHistogramSet* VP8LAllocateHistogramSet(int size, int cache_bits) {
   VP8LHistogramSet* set;
   const int histo_size = VP8LGetHistogramSize(cache_bits);
   const size_t total_size =
-      sizeof(*set) + size * (sizeof(*set->histograms) + histo_size + ALIGN_CST);
+      sizeof(*set) + size * (sizeof(*set->histograms) +
+      histo_size + WEBP_ALIGN_CST);
   uint8_t* memory = (uint8_t*)WebPSafeMalloc(total_size, sizeof(*memory));
   if (memory == NULL) return NULL;
 
@@ -117,7 +124,7 @@ VP8LHistogramSet* VP8LAllocateHistogramSet(int size, int cache_bits) {
   set->max_size = size;
   set->size = size;
   for (i = 0; i < size; ++i) {
-    memory = (uint8_t*)DO_ALIGN(memory);
+    memory = (uint8_t*)WEBP_ALIGN(memory);
     set->histograms[i] = (VP8LHistogram*)memory;
     // literal_ won't necessary be aligned.
     set->histograms[i]->literal_ = (uint32_t*)(memory + sizeof(VP8LHistogram));
@@ -149,24 +156,26 @@ void VP8LHistogramAddSinglePixOrCopy(VP8LHistogram* const histo,
   }
 }
 
-static WEBP_INLINE double BitsEntropyRefine(int nonzeros, int sum, int max_val,
-                                            double retval) {
+// -----------------------------------------------------------------------------
+// Entropy-related functions.
+
+static WEBP_INLINE double BitsEntropyRefine(const VP8LBitEntropy* entropy) {
   double mix;
-  if (nonzeros < 5) {
-    if (nonzeros <= 1) {
+  if (entropy->nonzeros < 5) {
+    if (entropy->nonzeros <= 1) {
       return 0;
     }
     // Two symbols, they will be 0 and 1 in a Huffman code.
     // Let's mix in a bit of entropy to favor good clustering when
     // distributions of these are combined.
-    if (nonzeros == 2) {
-      return 0.99 * sum + 0.01 * retval;
+    if (entropy->nonzeros == 2) {
+      return 0.99 * entropy->sum + 0.01 * entropy->entropy;
     }
     // No matter what the entropy says, we cannot be better than min_limit
     // with Huffman coding. I am mixing a bit of entropy into the
     // min_limit since it produces much better (~0.5 %) compression results
     // perhaps because of better entropy clustering.
-    if (nonzeros == 3) {
+    if (entropy->nonzeros == 3) {
       mix = 0.95;
     } else {
       mix = 0.7;  // nonzeros == 4.
@@ -176,52 +185,22 @@ static WEBP_INLINE double BitsEntropyRefine(int nonzeros, int sum, int max_val,
   }
 
   {
-    double min_limit = 2 * sum - max_val;
-    min_limit = mix * min_limit + (1.0 - mix) * retval;
-    return (retval < min_limit) ? min_limit : retval;
+    double min_limit = 2 * entropy->sum - entropy->max_val;
+    min_limit = mix * min_limit + (1.0 - mix) * entropy->entropy;
+    return (entropy->entropy < min_limit) ? min_limit : entropy->entropy;
   }
 }
 
-static double BitsEntropy(const uint32_t* const array, int n) {
-  double retval = 0.;
-  uint32_t sum = 0;
-  int nonzeros = 0;
-  uint32_t max_val = 0;
-  int i;
-  for (i = 0; i < n; ++i) {
-    if (array[i] != 0) {
-      sum += array[i];
-      ++nonzeros;
-      retval -= VP8LFastSLog2(array[i]);
-      if (max_val < array[i]) {
-        max_val = array[i];
-      }
-    }
+double VP8LBitsEntropy(const uint32_t* const array, int n,
+                       uint32_t* const trivial_symbol) {
+  VP8LBitEntropy entropy;
+  VP8LBitsEntropyUnrefined(array, n, &entropy);
+  if (trivial_symbol != NULL) {
+    *trivial_symbol =
+        (entropy.nonzeros == 1) ? entropy.nonzero_code : VP8L_NON_TRIVIAL_SYM;
   }
-  retval += VP8LFastSLog2(sum);
-  return BitsEntropyRefine(nonzeros, sum, max_val, retval);
-}
 
-static double BitsEntropyCombined(const uint32_t* const X,
-                                  const uint32_t* const Y, int n) {
-  double retval = 0.;
-  int sum = 0;
-  int nonzeros = 0;
-  int max_val = 0;
-  int i;
-  for (i = 0; i < n; ++i) {
-    const int xy = X[i] + Y[i];
-    if (xy != 0) {
-      sum += xy;
-      ++nonzeros;
-      retval -= VP8LFastSLog2(xy);
-      if (max_val < xy) {
-        max_val = xy;
-      }
-    }
-  }
-  retval += VP8LFastSLog2(sum);
-  return BitsEntropyRefine(nonzeros, sum, max_val, retval);
+  return BitsEntropyRefine(&entropy);
 }
 
 static double InitialHuffmanCost(void) {
@@ -242,47 +221,40 @@ static double FinalHuffmanCost(const VP8LStreaks* const stats) {
   return retval;
 }
 
-// Trampolines
-static double HuffmanCost(const uint32_t* const population, int length) {
-  const VP8LStreaks stats = VP8LHuffmanCostCount(population, length);
-  return FinalHuffmanCost(&stats);
-}
+// Get the symbol entropy for the distribution 'population'.
+// Set 'trivial_sym', if there's only one symbol present in the distribution.
+static double PopulationCost(const uint32_t* const population, int length,
+                             uint32_t* const trivial_sym) {
+  VP8LBitEntropy bit_entropy;
+  VP8LStreaks stats;
+  VP8LGetEntropyUnrefined(population, length, &bit_entropy, &stats);
+  if (trivial_sym != NULL) {
+    *trivial_sym = (bit_entropy.nonzeros == 1) ? bit_entropy.nonzero_code
+                                               : VP8L_NON_TRIVIAL_SYM;
+  }
 
-static double HuffmanCostCombined(const uint32_t* const X,
-                                  const uint32_t* const Y, int length) {
-  const VP8LStreaks stats = VP8LHuffmanCostCombinedCount(X, Y, length);
-  return FinalHuffmanCost(&stats);
+  return BitsEntropyRefine(&bit_entropy) + FinalHuffmanCost(&stats);
 }
 
-// Aggregated costs
-static double PopulationCost(const uint32_t* const population, int length) {
-  return BitsEntropy(population, length) + HuffmanCost(population, length);
-}
+static WEBP_INLINE double GetCombinedEntropy(const uint32_t* const X,
+                                             const uint32_t* const Y,
+                                             int length) {
+  VP8LBitEntropy bit_entropy;
+  VP8LStreaks stats;
+  VP8LGetCombinedEntropyUnrefined(X, Y, length, &bit_entropy, &stats);
 
-static double GetCombinedEntropy(const uint32_t* const X,
-                                 const uint32_t* const Y, int length) {
-  return BitsEntropyCombined(X, Y, length) + HuffmanCostCombined(X, Y, length);
+  return BitsEntropyRefine(&bit_entropy) + FinalHuffmanCost(&stats);
 }
 
 // Estimates the Entropy + Huffman + other block overhead size cost.
 double VP8LHistogramEstimateBits(const VP8LHistogram* const p) {
   return
-      PopulationCost(p->literal_, VP8LHistogramNumCodes(p->palette_code_bits_))
-      + PopulationCost(p->red_, NUM_LITERAL_CODES)
-      + PopulationCost(p->blue_, NUM_LITERAL_CODES)
-      + PopulationCost(p->alpha_, NUM_LITERAL_CODES)
-      + PopulationCost(p->distance_, NUM_DISTANCE_CODES)
-      + VP8LExtraCost(p->literal_ + NUM_LITERAL_CODES, NUM_LENGTH_CODES)
-      + VP8LExtraCost(p->distance_, NUM_DISTANCE_CODES);
-}
-
-double VP8LHistogramEstimateBitsBulk(const VP8LHistogram* const p) {
-  return
-      BitsEntropy(p->literal_, VP8LHistogramNumCodes(p->palette_code_bits_))
-      + BitsEntropy(p->red_, NUM_LITERAL_CODES)
-      + BitsEntropy(p->blue_, NUM_LITERAL_CODES)
-      + BitsEntropy(p->alpha_, NUM_LITERAL_CODES)
-      + BitsEntropy(p->distance_, NUM_DISTANCE_CODES)
+      PopulationCost(
+          p->literal_, VP8LHistogramNumCodes(p->palette_code_bits_), NULL)
+      + PopulationCost(p->red_, NUM_LITERAL_CODES, NULL)
+      + PopulationCost(p->blue_, NUM_LITERAL_CODES, NULL)
+      + PopulationCost(p->alpha_, NUM_LITERAL_CODES, NULL)
+      + PopulationCost(p->distance_, NUM_DISTANCE_CODES, NULL)
       + VP8LExtraCost(p->literal_ + NUM_LITERAL_CODES, NUM_LENGTH_CODES)
       + VP8LExtraCost(p->distance_, NUM_DISTANCE_CODES);
 }
@@ -313,8 +285,8 @@ static int GetCombinedHistogramEntropy(const VP8LHistogram* const a,
   if (*cost > cost_threshold) return 0;
 
   *cost += GetCombinedEntropy(a->distance_, b->distance_, NUM_DISTANCE_CODES);
-  *cost += VP8LExtraCostCombined(a->distance_, b->distance_,
-                                 NUM_DISTANCE_CODES);
+  *cost +=
+      VP8LExtraCostCombined(a->distance_, b->distance_, NUM_DISTANCE_CODES);
   if (*cost > cost_threshold) return 0;
 
   return 1;
@@ -338,6 +310,8 @@ static double HistogramAddEval(const VP8LHistogram* const a,
     VP8LHistogramAdd(a, b, out);
     out->bit_cost_ = cost;
     out->palette_code_bits_ = a->palette_code_bits_;
+    out->trivial_symbol_ = (a->trivial_symbol_ == b->trivial_symbol_) ?
+        a->trivial_symbol_ : VP8L_NON_TRIVIAL_SYM;
   }
 
   return cost - sum_cost;
@@ -389,18 +363,26 @@ static void UpdateDominantCostRange(
 }
 
 static void UpdateHistogramCost(VP8LHistogram* const h) {
-  const double alpha_cost = PopulationCost(h->alpha_, NUM_LITERAL_CODES);
+  uint32_t alpha_sym, red_sym, blue_sym;  // filled in by PopulationCost() below
+  const double alpha_cost =
+      PopulationCost(h->alpha_, NUM_LITERAL_CODES, &alpha_sym);
   const double distance_cost =
-      PopulationCost(h->distance_, NUM_DISTANCE_CODES) +
+      PopulationCost(h->distance_, NUM_DISTANCE_CODES, NULL) +  // NULL: symbol not needed
       VP8LExtraCost(h->distance_, NUM_DISTANCE_CODES);
   const int num_codes = VP8LHistogramNumCodes(h->palette_code_bits_);
-  h->literal_cost_ = PopulationCost(h->literal_, num_codes) +
+  h->literal_cost_ = PopulationCost(h->literal_, num_codes, NULL) +
                      VP8LExtraCost(h->literal_ + NUM_LITERAL_CODES,
                                    NUM_LENGTH_CODES);
-  h->red_cost_ = PopulationCost(h->red_, NUM_LITERAL_CODES);
-  h->blue_cost_ = PopulationCost(h->blue_, NUM_LITERAL_CODES);
+  h->red_cost_ = PopulationCost(h->red_, NUM_LITERAL_CODES, &red_sym);
+  h->blue_cost_ = PopulationCost(h->blue_, NUM_LITERAL_CODES, &blue_sym);
   h->bit_cost_ = h->literal_cost_ + h->red_cost_ + h->blue_cost_ +
                  alpha_cost + distance_cost;
+  if ((alpha_sym | red_sym | blue_sym) == VP8L_NON_TRIVIAL_SYM) {  // presumably all-ones sentinel; confirm
+    h->trivial_symbol_ = VP8L_NON_TRIVIAL_SYM;
+  } else {  // pack the three single symbols; bits 8..15 are left zero
+    h->trivial_symbol_ =
+        ((uint32_t)alpha_sym << 24) | (red_sym << 16) | (blue_sym << 0);
+  }
 }
 
 static int GetBinIdForEntropy(double min, double max, double val) {
@@ -409,7 +391,14 @@ static int GetBinIdForEntropy(double min, double max, double val) {
   return (int)(NUM_PARTITIONS * delta / range);
 }
 
-// TODO(vikasa): Evaluate, if there's any correlation between red & blue.
+static int GetHistoBinIndexLowEffort(  // bin index derived from literal cost only
+    const VP8LHistogram* const h, const DominantCostRange* const c) {
+  const int bin_id = GetBinIdForEntropy(c->literal_min_, c->literal_max_,
+                                        h->literal_cost_);
+  assert(bin_id < NUM_PARTITIONS);  // checked in debug builds only
+  return bin_id;
+}
+
 static int GetHistoBinIndex(
     const VP8LHistogram* const h, const DominantCostRange* const c) {
   const int bin_id =
@@ -432,7 +421,6 @@ static void HistogramBuild(
   VP8LHistogram** const histograms = image_histo->histograms;
   VP8LRefsCursor c = VP8LRefsCursorInit(backward_refs);
   assert(histo_bits > 0);
-  // Construct the Histo from a given backward references.
   while (VP8LRefsCursorOk(&c)) {
     const PixOrCopy* const v = c.cur_pos;
     const int ix = (y >> histo_bits) * histo_xsize + (x >> histo_bits);
@@ -463,8 +451,8 @@ static void HistogramCopyAndAnalyze(
 
 // Partition histograms to different entropy bins for three dominant (literal,
 // red and blue) symbol costs and compute the histogram aggregate bit_cost.
-static void HistogramAnalyzeEntropyBin(
-    VP8LHistogramSet* const image_histo, int16_t* const bin_map) {
+static void HistogramAnalyzeEntropyBin(VP8LHistogramSet* const image_histo,
+                                       int16_t* const bin_map, int low_effort) {
   int i;
   VP8LHistogram** const histograms = image_histo->histograms;
   const int histo_size = image_histo->size;
@@ -483,7 +471,9 @@ static void HistogramAnalyzeEntropyBin(
   for (i = 0; i < histo_size; ++i) {
     int num_histos;
     VP8LHistogram* const histo = histograms[i];
-    const int16_t bin_id = (int16_t)GetHistoBinIndex(histo, &cost_range);
+    const int16_t bin_id = low_effort ?
+        (int16_t)GetHistoBinIndexLowEffort(histo, &cost_range) :
+        (int16_t)GetHistoBinIndex(histo, &cost_range);
     const int bin_offset = bin_id * bin_depth;
     // bin_map[n][0] for every bin 'n' maintains the counter for the number of
     // histograms in that bin.
@@ -495,64 +485,79 @@ static void HistogramAnalyzeEntropyBin(
   }
 }
 
-// Compact the histogram set by moving the valid one left in the set to the
-// head and moving the ones that have been merged to other histograms towards
-// the end.
-// TODO(vikasa): Evaluate if this method can be avoided by altering the code
-// logic of HistogramCombineEntropyBin main loop.
+// Compact the histogram set by removing unused entries (NULL or bit_cost_ 0).
 static void HistogramCompactBins(VP8LHistogramSet* const image_histo) {
-  int start = 0;
-  int end = image_histo->size - 1;
   VP8LHistogram** const histograms = image_histo->histograms;
-  while (start < end) {
-    while (start <= end && histograms[start] != NULL &&
-           histograms[start]->bit_cost_ != 0.) {
-      ++start;
-    }
-    while (start <= end && histograms[end]->bit_cost_ == 0.) {
-      histograms[end] = NULL;
-      --end;
-    }
-    if (start < end) {
-      assert(histograms[start] != NULL);
-      assert(histograms[end] != NULL);
-      HistogramCopy(histograms[end], histograms[start]);
-      histograms[end] = NULL;
-      --end;
+  int i, j;  // i: read position, j: next write position
+
+  for (i = 0, j = 0; i < image_histo->size; ++i) {
+    if (histograms[i] != NULL && histograms[i]->bit_cost_ != 0.) {  // entry is kept
+      if (j < i) {
+        histograms[j] = histograms[i];  // moves the pointer; relative order is kept
+        histograms[i] = NULL;
+      }
+      ++j;
     }
   }
-  image_histo->size = end + 1;
+  image_histo->size = j;  // number of entries kept
 }
 
-static void HistogramCombineEntropyBin(VP8LHistogramSet* const image_histo,
-                                       VP8LHistogram* const histos,
-                                       int16_t* const bin_map, int bin_depth,
-                                       double combine_cost_factor) {
+static VP8LHistogram* HistogramCombineEntropyBin(
+    VP8LHistogramSet* const image_histo,
+    VP8LHistogram* cur_combo,  // scratch histogram; may be swapped into the set
+    int16_t* const bin_map, int bin_depth, int num_bins,
+    double combine_cost_factor, int low_effort) {  // merges histograms per bin
   int bin_id;
-  VP8LHistogram* cur_combo = histos;
   VP8LHistogram** const histograms = image_histo->histograms;
 
-  for (bin_id = 0; bin_id < BIN_SIZE; ++bin_id) {
+  for (bin_id = 0; bin_id < num_bins; ++bin_id) {
     const int bin_offset = bin_id * bin_depth;
     const int num_histos = bin_map[bin_offset];
     const int idx1 = bin_map[bin_offset + 1];
+    int num_combine_failures = 0;  // merges rejected so far in this bin
     int n;
     for (n = 2; n <= num_histos; ++n) {
       const int idx2 = bin_map[bin_offset + n];
-      const double bit_cost_idx2 = histograms[idx2]->bit_cost_;
-      if (bit_cost_idx2 > 0.) {
-        const double bit_cost_thresh = -bit_cost_idx2 * combine_cost_factor;
-        const double curr_cost_diff =
-            HistogramAddEval(histograms[idx1], histograms[idx2],
-                             cur_combo, bit_cost_thresh);
-        if (curr_cost_diff < bit_cost_thresh) {
-          HistogramCopy(cur_combo, histograms[idx1]);
-          histograms[idx2]->bit_cost_ = 0.;
+      if (low_effort) {
+        // Merge all histograms with the same bin index, irrespective of cost of
+        // the merged histograms.
+        VP8LHistogramAdd(histograms[idx1], histograms[idx2], histograms[idx1]);
+        histograms[idx2]->bit_cost_ = 0.;  // zero cost marks idx2 as unused
+      } else {
+        const double bit_cost_idx2 = histograms[idx2]->bit_cost_;
+        if (bit_cost_idx2 > 0.) {
+          const double bit_cost_thresh = -bit_cost_idx2 * combine_cost_factor;
+          const double curr_cost_diff =
+              HistogramAddEval(histograms[idx1], histograms[idx2],
+                               cur_combo, bit_cost_thresh);
+          if (curr_cost_diff < bit_cost_thresh) {
+            // Try to merge two histograms only if the combo is a trivial one or
+            // the two candidate histograms are already non-trivial.
+            // For some images, 'try_combine' turns out to be false for a lot of
+            // histogram pairs. In that case, we fallback to combining
+            // histograms as usual to avoid increasing the header size.
+            const int try_combine =
+                (cur_combo->trivial_symbol_ != VP8L_NON_TRIVIAL_SYM) ||
+                ((histograms[idx1]->trivial_symbol_ == VP8L_NON_TRIVIAL_SYM) &&
+                 (histograms[idx2]->trivial_symbol_ == VP8L_NON_TRIVIAL_SYM));
+            const int max_combine_failures = 32;
+            if (try_combine || (num_combine_failures >= max_combine_failures)) {
+              HistogramSwap(&cur_combo, &histograms[idx1]);  // old idx1 becomes the scratch
+              histograms[idx2]->bit_cost_ = 0.;  // zero cost marks idx2 as unused
+            } else {
+              ++num_combine_failures;
+            }
+          }
         }
       }
     }
+    if (low_effort) {  // NOTE(review): also runs when num_histos is 0 -- confirm idx1 is valid then
+      // Update the bit_cost for the merged histograms (per bin index).
+      UpdateHistogramCost(histograms[idx1]);
+    }
   }
   HistogramCompactBins(image_histo);
+  return cur_combo;  // may differ from the pointer passed in, because of the swaps
 }
 
 static uint32_t MyRand(uint32_t *seed) {
@@ -563,8 +568,179 @@ static uint32_t MyRand(uint32_t *seed) {
   return *seed;
 }
 
-static void HistogramCombine(VP8LHistogramSet* const image_histo,
-                             VP8LHistogramSet* const histos, int quality) {
+// -----------------------------------------------------------------------------
+// Histogram pairs priority queue
+
+// Pair of histograms. Negative idx1 value means that pair is out-of-date.
+typedef struct {
+  int idx1;           // smaller index of the two (ordered by PreparePair)
+  int idx2;           // larger index of the two
+  double cost_diff;   // HistogramAddEval() result; only negatives are queued
+  double cost_combo;  // bit_cost_ of the evaluated combined histogram
+} HistogramPair;
+
+typedef struct {
+  HistogramPair* queue;  // max_size + 1 entries, allocated by HistoQueueInit
+  int size;              // number of queued pairs; queue[size] is scratch
+  int max_size;          // capacity, not counting the extra scratch entry
+} HistoQueue;
+
+static int HistoQueueInit(HistoQueue* const histo_queue, const int max_index) {
+  histo_queue->size = 0;  // the queue starts empty
+  // max_index^2 for the queue size is safe. If you look at
+  // HistogramCombineGreedy, and imagine that UpdateQueueFront always pushes
+  // data to the queue, you insert at most:
+  // - max_index*(max_index-1)/2 (the first two for loops)
+  // - max_index - 1 in the last for loop at the first iteration of the while
+  //   loop, max_index - 2 at the second iteration ... therefore
+  //   max_index*(max_index-1)/2 overall too
+  histo_queue->max_size = max_index * max_index;
+  // We allocate max_size + 1 because the last element at index "size" is
+  // used as temporary data (and it could be up to max_size).
+  histo_queue->queue = WebPSafeMalloc(histo_queue->max_size + 1,
+                                      sizeof(*histo_queue->queue));
+  return histo_queue->queue != NULL;  // 0 if the allocation failed
+}
+
+static void HistoQueueClear(HistoQueue* const histo_queue) {
+  assert(histo_queue != NULL);
+  WebPSafeFree(histo_queue->queue);  // 'queue' and 'size' are not reset here
+}
+
+static void SwapHistogramPairs(HistogramPair *p1,
+                               HistogramPair *p2) {
+  const HistogramPair tmp = *p1;  // exchange the two pairs by value
+  *p1 = *p2;
+  *p2 = tmp;
+}
+
+// Pushes the candidate stored at queue[size] if its cost_diff is negative,
+// otherwise drops it. Only the front is kept ordered: queue[0] holds the
+// smallest cost_diff, the other entries are in no particular order.
+static void UpdateQueueFront(HistoQueue* const histo_queue) {
+  if (histo_queue->queue[histo_queue->size].cost_diff >= 0) return;
+
+  if (histo_queue->queue[histo_queue->size].cost_diff <
+      histo_queue->queue[0].cost_diff) {
+    SwapHistogramPairs(histo_queue->queue,
+                       histo_queue->queue + histo_queue->size);
+  }
+  ++histo_queue->size;  // the candidate slot is now part of the queue
+
+  // We cannot add more elements than the capacity.
+  // The allocation adds an extra element to the official capacity so that
+  // histo_queue->queue[histo_queue->max_size] is read/written within bound.
+  assert(histo_queue->size <= histo_queue->max_size);
+}
+
+// -----------------------------------------------------------------------------
+
+static void PreparePair(VP8LHistogram** histograms, int idx1, int idx2,
+                        HistogramPair* const pair,
+                        VP8LHistogram* const histos) {
+  if (idx1 > idx2) {  // store the pair with idx1 <= idx2
+    const int tmp = idx2;
+    idx2 = idx1;
+    idx1 = tmp;
+  }
+  pair->idx1 = idx1;
+  pair->idx2 = idx2;
+  pair->cost_diff =
+      HistogramAddEval(histograms[idx1], histograms[idx2], histos, 0);
+  pair->cost_combo = histos->bit_cost_;  // presumably set by HistogramAddEval
+}
+
+// Repeatedly merges the queued pair with the smallest (negative) cost_diff
+// until the queue is empty. 'histos' is scratch. Returns 0 if out of memory.
+static int HistogramCombineGreedy(VP8LHistogramSet* const image_histo,
+                                  VP8LHistogram* const histos) {
+  int ok = 0;  // set to 1 only after the merge loop has completed
+  int image_histo_size = image_histo->size;
+  int i, j;
+  VP8LHistogram** const histograms = image_histo->histograms;
+  // Indexes of remaining histograms.
+  int* const clusters = WebPSafeMalloc(image_histo_size, sizeof(*clusters));
+  // Priority queue of histogram pairs.
+  HistoQueue histo_queue;
+
+  if (!HistoQueueInit(&histo_queue, image_histo_size) || clusters == NULL) {
+    goto End;  // 'End' releases whichever allocation succeeded
+  }
+
+  for (i = 0; i < image_histo_size; ++i) {
+    // Initialize clusters indexes.
+    clusters[i] = i;
+    for (j = i + 1; j < image_histo_size; ++j) {
+      // Evaluate pair (i, j); it is queued only if its cost_diff is negative.
+      PreparePair(histograms, i, j, &histo_queue.queue[histo_queue.size],
+                  histos);
+      UpdateQueueFront(&histo_queue);
+    }
+  }
+
+  while (image_histo_size > 1 && histo_queue.size > 0) {
+    HistogramPair* copy_to;
+    const int idx1 = histo_queue.queue[0].idx1;  // front pair: best candidate
+    const int idx2 = histo_queue.queue[0].idx2;
+    VP8LHistogramAdd(histograms[idx2], histograms[idx1], histograms[idx1]);
+    histograms[idx1]->bit_cost_ = histo_queue.queue[0].cost_combo;
+    // Drop idx2 from 'clusters' by shifting the entries that follow it down.
+    for (i = 0; i + 1 < image_histo_size; ++i) {
+      if (clusters[i] >= idx2) {
+        clusters[i] = clusters[i + 1];
+      }
+    }
+    --image_histo_size;
+
+    // Remove pairs intersecting the just combined best pair. This will
+    // therefore pop the head of the queue.
+    copy_to = histo_queue.queue;
+    for (i = 0; i < histo_queue.size; ++i) {
+      HistogramPair* const p = histo_queue.queue + i;
+      if (p->idx1 == idx1 || p->idx2 == idx1 ||
+          p->idx1 == idx2 || p->idx2 == idx2) {
+        // Do not copy the invalid pair.
+        continue;
+      }
+      if (p->cost_diff < histo_queue.queue[0].cost_diff) {
+        // Replace the top of the queue if we found better.
+        SwapHistogramPairs(histo_queue.queue, p);
+      }
+      SwapHistogramPairs(copy_to, p);
+      ++copy_to;
+    }
+    histo_queue.size = (int)(copy_to - histo_queue.queue);
+
+    // Push new pairs formed with combined histogram to the queue.
+    for (i = 0; i < image_histo_size; ++i) {
+      if (clusters[i] != idx1) {
+        PreparePair(histograms, idx1, clusters[i],
+                    &histo_queue.queue[histo_queue.size], histos);
+        UpdateQueueFront(&histo_queue);
+      }
+    }
+  }
+  // Move remaining histograms to the beginning of the array.
+  for (i = 0; i < image_histo_size; ++i) {
+    if (i != clusters[i]) {  // swap the two histograms
+      HistogramSwap(&histograms[i], &histograms[clusters[i]]);
+    }
+  }
+
+  image_histo->size = image_histo_size;
+  ok = 1;
+
+ End:
+  WebPSafeFree(clusters);
+  HistoQueueClear(&histo_queue);
+  return ok;
+}
+
+static VP8LHistogram* HistogramCombineStochastic(
+    VP8LHistogramSet* const image_histo,
+    VP8LHistogram* tmp_histo,
+    VP8LHistogram* best_combo,
+    int quality, int min_cluster_size) {
   int iter;
   uint32_t seed = 0;
   int tries_with_no_success = 0;
@@ -573,12 +749,10 @@ static void HistogramCombine(VP8LHistogramSet* const image_histo,
   const int outer_iters = image_histo_size * iter_mult;
   const int num_pairs = image_histo_size / 2;
   const int num_tries_no_success = outer_iters / 2;
-  const int min_cluster_size = 2;
   VP8LHistogram** const histograms = image_histo->histograms;
-  VP8LHistogram* cur_combo = histos->histograms[0];   // trial histogram
-  VP8LHistogram* best_combo = histos->histograms[1];  // best histogram so far
 
   // Collapse similar histograms in 'image_histo'.
+  ++min_cluster_size;
   for (iter = 0;
        iter < outer_iters && image_histo_size >= min_cluster_size;
        ++iter) {
@@ -602,13 +776,9 @@ static void HistogramCombine(VP8LHistogramSet* const image_histo,
 
       // Calculate cost reduction on combining.
       curr_cost_diff = HistogramAddEval(histograms[idx1], histograms[idx2],
-                                        cur_combo, best_cost_diff);
+                                        tmp_histo, best_cost_diff);
       if (curr_cost_diff < best_cost_diff) {    // found a better pair?
-        {     // swap cur/best combo histograms
-          VP8LHistogram* const tmp_histo = cur_combo;
-          cur_combo = best_combo;
-          best_combo = tmp_histo;
-        }
+        HistogramSwap(&best_combo, &tmp_histo);
         best_cost_diff = curr_cost_diff;
         best_idx1 = idx1;
         best_idx2 = idx2;
@@ -616,11 +786,11 @@ static void HistogramCombine(VP8LHistogramSet* const image_histo,
     }
 
     if (best_idx1 >= 0) {
-      HistogramCopy(best_combo, histograms[best_idx1]);
+      HistogramSwap(&best_combo, &histograms[best_idx1]);
       // swap best_idx2 slot with last one (which is now unused)
       --image_histo_size;
       if (best_idx2 != image_histo_size) {
-        HistogramCopy(histograms[image_histo_size], histograms[best_idx2]);
+        HistogramSwap(&histograms[image_histo_size], &histograms[best_idx2]);
         histograms[image_histo_size] = NULL;
       }
       tries_with_no_success = 0;
@@ -630,6 +800,7 @@ static void HistogramCombine(VP8LHistogramSet* const image_histo,
     }
   }
   image_histo->size = image_histo_size;
+  return best_combo;
 }
 
 // -----------------------------------------------------------------------------
@@ -643,28 +814,37 @@ static void HistogramRemap(const VP8LHistogramSet* const orig_histo,
   int i;
   VP8LHistogram** const orig_histograms = orig_histo->histograms;
   VP8LHistogram** const histograms = image_histo->histograms;
-  for (i = 0; i < orig_histo->size; ++i) {
-    int best_out = 0;
-    double best_bits =
-        HistogramAddThresh(histograms[0], orig_histograms[i], MAX_COST);
-    int k;
-    for (k = 1; k < image_histo->size; ++k) {
-      const double cur_bits =
-          HistogramAddThresh(histograms[k], orig_histograms[i], best_bits);
-      if (cur_bits < best_bits) {
-        best_bits = cur_bits;
-        best_out = k;
+  const int orig_histo_size = orig_histo->size;
+  const int image_histo_size = image_histo->size;
+  if (image_histo_size > 1) {
+    for (i = 0; i < orig_histo_size; ++i) {
+      int best_out = 0;
+      double best_bits =
+          HistogramAddThresh(histograms[0], orig_histograms[i], MAX_COST);
+      int k;
+      for (k = 1; k < image_histo_size; ++k) {
+        const double cur_bits =
+            HistogramAddThresh(histograms[k], orig_histograms[i], best_bits);
+        if (cur_bits < best_bits) {
+          best_bits = cur_bits;
+          best_out = k;
+        }
       }
+      symbols[i] = best_out;
+    }
+  } else {
+    assert(image_histo_size == 1);
+    for (i = 0; i < orig_histo_size; ++i) {
+      symbols[i] = 0;
     }
-    symbols[i] = best_out;
   }
 
   // Recompute each out based on raw and symbols.
-  for (i = 0; i < image_histo->size; ++i) {
+  for (i = 0; i < image_histo_size; ++i) {
     HistogramClear(histograms[i]);
   }
 
-  for (i = 0; i < orig_histo->size; ++i) {
+  for (i = 0; i < orig_histo_size; ++i) {
     const int idx = symbols[i];
     VP8LHistogramAdd(orig_histograms[i], histograms[idx], histograms[idx]);
   }
@@ -672,44 +852,48 @@ static void HistogramRemap(const VP8LHistogramSet* const orig_histo,
 
 static double GetCombineCostFactor(int histo_size, int quality) {
   double combine_cost_factor = 0.16;
-  if (histo_size > 256) combine_cost_factor /= 2.;
-  if (histo_size > 512) combine_cost_factor /= 2.;
-  if (histo_size > 1024) combine_cost_factor /= 2.;
-  if (quality <= 50) combine_cost_factor /= 2.;
+  if (quality < 90) {
+    if (histo_size > 256) combine_cost_factor /= 2.;
+    if (histo_size > 512) combine_cost_factor /= 2.;
+    if (histo_size > 1024) combine_cost_factor /= 2.;
+    if (quality <= 50) combine_cost_factor /= 2.;
+  }
   return combine_cost_factor;
 }
 
 int VP8LGetHistoImageSymbols(int xsize, int ysize,
                              const VP8LBackwardRefs* const refs,
-                             int quality, int histo_bits, int cache_bits,
+                             int quality, int low_effort,
+                             int histo_bits, int cache_bits,
                              VP8LHistogramSet* const image_histo,
+                             VP8LHistogramSet* const tmp_histos,
                              uint16_t* const histogram_symbols) {
   int ok = 0;
   const int histo_xsize = histo_bits ? VP8LSubSampleSize(xsize, histo_bits) : 1;
   const int histo_ysize = histo_bits ? VP8LSubSampleSize(ysize, histo_bits) : 1;
   const int image_histo_raw_size = histo_xsize * histo_ysize;
+  const int entropy_combine_num_bins = low_effort ? NUM_PARTITIONS : BIN_SIZE;
 
   // The bin_map for every bin follows following semantics:
   // bin_map[n][0] = num_histo; // The number of histograms in that bin.
   // bin_map[n][1] = index of first histogram in that bin;
   // bin_map[n][num_histo] = index of last histogram in that bin;
-  // bin_map[n][num_histo + 1] ... bin_map[n][bin_depth - 1] = un-used indices.
+  // bin_map[n][num_histo + 1] ... bin_map[n][bin_depth - 1] = unused indices.
   const int bin_depth = image_histo_raw_size + 1;
   int16_t* bin_map = NULL;
-  VP8LHistogramSet* const histos = VP8LAllocateHistogramSet(2, cache_bits);
   VP8LHistogramSet* const orig_histo =
       VP8LAllocateHistogramSet(image_histo_raw_size, cache_bits);
+  VP8LHistogram* cur_combo;
+  const int entropy_combine =
+      (orig_histo->size > entropy_combine_num_bins * 2) && (quality < 100);
 
-  if (orig_histo == NULL || histos == NULL) {
-    goto Error;
-  }
+  if (orig_histo == NULL) goto Error;
 
   // Don't attempt linear bin-partition heuristic for:
   // histograms of small sizes, as bin_map will be very sparse and;
-  // Higher qualities (> 90), to preserve the compression gains at those
-  // quality settings.
-  if (orig_histo->size > 2 * BIN_SIZE && quality < 90) {
-    const int bin_map_size = bin_depth * BIN_SIZE;
+  // Maximum quality (q==100), to preserve the compression gains at that level.
+  if (entropy_combine) {
+    const int bin_map_size = bin_depth * entropy_combine_num_bins;
     bin_map = (int16_t*)WebPSafeCalloc(bin_map_size, sizeof(*bin_map));
     if (bin_map == NULL) goto Error;
   }
@@ -719,18 +903,33 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
   // Copies the histograms and computes its bit_cost.
   HistogramCopyAndAnalyze(orig_histo, image_histo);
 
-  if (bin_map != NULL) {
+  cur_combo = tmp_histos->histograms[1];  // pick up working slot
+  if (entropy_combine) {
     const double combine_cost_factor =
         GetCombineCostFactor(image_histo_raw_size, quality);
-    HistogramAnalyzeEntropyBin(orig_histo, bin_map);
+    HistogramAnalyzeEntropyBin(orig_histo, bin_map, low_effort);
     // Collapse histograms with similar entropy.
-    HistogramCombineEntropyBin(image_histo, histos->histograms[0],
-                               bin_map, bin_depth, combine_cost_factor);
+    cur_combo = HistogramCombineEntropyBin(image_histo, cur_combo, bin_map,
+                                           bin_depth, entropy_combine_num_bins,
+                                           combine_cost_factor, low_effort);
   }
 
-  // Collapse similar histograms by random histogram-pair compares.
-  HistogramCombine(image_histo, histos, quality);
+  // Don't combine the histograms using stochastic and greedy heuristics for
+  // low-effort compression mode.
+  if (!low_effort || !entropy_combine) {
+    const float x = quality / 100.f;
+    // cubic ramp between 1 and MAX_HISTO_GREEDY:
+    const int threshold_size = (int)(1 + (x * x * x) * (MAX_HISTO_GREEDY - 1));
+    cur_combo = HistogramCombineStochastic(image_histo,
+                                           tmp_histos->histograms[0],
+                                           cur_combo, quality, threshold_size);
+    if ((image_histo->size <= threshold_size) &&
+        !HistogramCombineGreedy(image_histo, cur_combo)) {
+      goto Error;
+    }
+  }
 
+  // TODO(vikasa): Optimize HistogramRemap for low-effort compression mode also.
   // Find the optimal map from original histograms to the final ones.
   HistogramRemap(orig_histo, image_histo, histogram_symbols);
 
@@ -739,6 +938,5 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
  Error:
   WebPSafeFree(bin_map);
   VP8LFreeHistogramSet(orig_histo);
-  VP8LFreeHistogramSet(histos);
   return ok;
 }
diff --git a/src/3rdparty/libwebp/src/enc/histogram.h b/src/3rdparty/libwebp/src/enc/histogram.h
index 1cf4c54..d303d1d 100644
--- a/src/3rdparty/libwebp/src/enc/histogram.h
+++ b/src/3rdparty/libwebp/src/enc/histogram.h
@@ -14,10 +14,6 @@
 #ifndef WEBP_ENC_HISTOGRAM_H_
 #define WEBP_ENC_HISTOGRAM_H_
 
-#include <assert.h>
-#include <stddef.h>
-#include <stdlib.h>
-#include <stdio.h>
 #include <string.h>
 
 #include "./backward_references.h"
@@ -28,6 +24,9 @@
 extern "C" {
 #endif
 
+// Not a trivial literal symbol.
+#define VP8L_NON_TRIVIAL_SYM (0xffffffff)
+
 // A simple container for histograms of data.
 typedef struct {
   // literal_ contains green literal, palette-code and
@@ -39,9 +38,11 @@ typedef struct {
   // Backward reference prefix-code histogram.
   uint32_t distance_[NUM_DISTANCE_CODES];
   int palette_code_bits_;
-  double bit_cost_;      // cached value of VP8LHistogramEstimateBits(this)
-  double literal_cost_;  // Cached values of dominant entropy costs:
-  double red_cost_;      //   literal, red & blue.
+  uint32_t trivial_symbol_;  // True, if histograms for Red, Blue & Alpha
+                             // literal symbols are single valued.
+  double bit_cost_;          // cached value of bit cost.
+  double literal_cost_;      // Cached values of dominant entropy costs:
+  double red_cost_;          // literal, red & blue.
   double blue_cost_;
 } VP8LHistogram;
 
@@ -91,14 +92,6 @@ VP8LHistogram* VP8LAllocateHistogram(int cache_bits);
 void VP8LHistogramAddSinglePixOrCopy(VP8LHistogram* const histo,
                                      const PixOrCopy* const v);
 
-// Estimate how many bits the combined entropy of literals and distance
-// approximately maps to.
-double VP8LHistogramEstimateBits(const VP8LHistogram* const p);
-
-// This function estimates the cost in bits excluding the bits needed to
-// represent the entropy code itself.
-double VP8LHistogramEstimateBitsBulk(const VP8LHistogram* const p);
-
 static WEBP_INLINE int VP8LHistogramNumCodes(int palette_code_bits) {
   return NUM_LITERAL_CODES + NUM_LENGTH_CODES +
       ((palette_code_bits > 0) ? (1 << palette_code_bits) : 0);
@@ -107,10 +100,22 @@ static WEBP_INLINE int VP8LHistogramNumCodes(int palette_code_bits) {
 // Builds the histogram image.
 int VP8LGetHistoImageSymbols(int xsize, int ysize,
                              const VP8LBackwardRefs* const refs,
-                             int quality, int histogram_bits, int cache_bits,
+                             int quality, int low_effort,
+                             int histogram_bits, int cache_bits,
                              VP8LHistogramSet* const image_in,
+                             VP8LHistogramSet* const tmp_histos,
                              uint16_t* const histogram_symbols);
 
+// Returns the entropy for the symbols in the input array.
+// Also sets trivial_symbol to the code value, if the array has only one code
+// value. Otherwise, set it to VP8L_NON_TRIVIAL_SYM.
+double VP8LBitsEntropy(const uint32_t* const array, int n,
+                       uint32_t* const trivial_symbol);
+
+// Estimate how many bits the combined entropy of literals and distance
+// approximately maps to.
+double VP8LHistogramEstimateBits(const VP8LHistogram* const p);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/3rdparty/libwebp/src/enc/iterator.c b/src/3rdparty/libwebp/src/enc/iterator.c
index e42ad00..99d960a 100644
--- a/src/3rdparty/libwebp/src/enc/iterator.c
+++ b/src/3rdparty/libwebp/src/enc/iterator.c
@@ -70,13 +70,13 @@ void VP8IteratorInit(VP8Encoder* const enc, VP8EncIterator* const it) {
   it->enc_ = enc;
   it->y_stride_  = enc->pic_->y_stride;
   it->uv_stride_ = enc->pic_->uv_stride;
-  it->yuv_in_   = (uint8_t*)DO_ALIGN(it->yuv_mem_);
-  it->yuv_out_  = it->yuv_in_ + YUV_SIZE;
-  it->yuv_out2_ = it->yuv_out_ + YUV_SIZE;
-  it->yuv_p_    = it->yuv_out2_ + YUV_SIZE;
+  it->yuv_in_   = (uint8_t*)WEBP_ALIGN(it->yuv_mem_);
+  it->yuv_out_  = it->yuv_in_ + YUV_SIZE_ENC;
+  it->yuv_out2_ = it->yuv_out_ + YUV_SIZE_ENC;
+  it->yuv_p_    = it->yuv_out2_ + YUV_SIZE_ENC;
   it->lf_stats_ = enc->lf_stats_;
   it->percent0_ = enc->percent_;
-  it->y_left_ = (uint8_t*)DO_ALIGN(it->yuv_left_mem_ + 1);
+  it->y_left_ = (uint8_t*)WEBP_ALIGN(it->yuv_left_mem_ + 1);
   it->u_left_ = it->y_left_ + 16 + 16;
   it->v_left_ = it->u_left_ + 16;
   VP8IteratorReset(it);
@@ -136,9 +136,9 @@ void VP8IteratorImport(VP8EncIterator* const it, uint8_t* tmp_32) {
   const int uv_w = (w + 1) >> 1;
   const int uv_h = (h + 1) >> 1;
 
-  ImportBlock(ysrc, pic->y_stride,  it->yuv_in_ + Y_OFF, w, h, 16);
-  ImportBlock(usrc, pic->uv_stride, it->yuv_in_ + U_OFF, uv_w, uv_h, 8);
-  ImportBlock(vsrc, pic->uv_stride, it->yuv_in_ + V_OFF, uv_w, uv_h, 8);
+  ImportBlock(ysrc, pic->y_stride,  it->yuv_in_ + Y_OFF_ENC, w, h, 16);
+  ImportBlock(usrc, pic->uv_stride, it->yuv_in_ + U_OFF_ENC, uv_w, uv_h, 8);
+  ImportBlock(vsrc, pic->uv_stride, it->yuv_in_ + V_OFF_ENC, uv_w, uv_h, 8);
 
   if (tmp_32 == NULL) return;
 
@@ -185,9 +185,9 @@ void VP8IteratorExport(const VP8EncIterator* const it) {
   const VP8Encoder* const enc = it->enc_;
   if (enc->config_->show_compressed) {
     const int x = it->x_, y = it->y_;
-    const uint8_t* const ysrc = it->yuv_out_ + Y_OFF;
-    const uint8_t* const usrc = it->yuv_out_ + U_OFF;
-    const uint8_t* const vsrc = it->yuv_out_ + V_OFF;
+    const uint8_t* const ysrc = it->yuv_out_ + Y_OFF_ENC;
+    const uint8_t* const usrc = it->yuv_out_ + U_OFF_ENC;
+    const uint8_t* const vsrc = it->yuv_out_ + V_OFF_ENC;
     const WebPPicture* const pic = enc->pic_;
     uint8_t* const ydst = pic->y + (y * pic->y_stride + x) * 16;
     uint8_t* const udst = pic->u + (y * pic->uv_stride + x) * 8;
@@ -286,8 +286,8 @@ void VP8IteratorBytesToNz(VP8EncIterator* const it) {
 void VP8IteratorSaveBoundary(VP8EncIterator* const it) {
   VP8Encoder* const enc = it->enc_;
   const int x = it->x_, y = it->y_;
-  const uint8_t* const ysrc = it->yuv_out_ + Y_OFF;
-  const uint8_t* const uvsrc = it->yuv_out_ + U_OFF;
+  const uint8_t* const ysrc = it->yuv_out_ + Y_OFF_ENC;
+  const uint8_t* const uvsrc = it->yuv_out_ + U_OFF_ENC;
   if (x < enc->mb_w_ - 1) {   // left
     int i;
     for (i = 0; i < 16; ++i) {
diff --git a/src/3rdparty/libwebp/src/enc/near_lossless.c b/src/3rdparty/libwebp/src/enc/near_lossless.c
new file mode 100644
index 0000000..9bc0f0e
--- /dev/null
+++ b/src/3rdparty/libwebp/src/enc/near_lossless.c
@@ -0,0 +1,160 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Near-lossless image preprocessing adjusts pixel values to help
+// compressibility with a guarantee of maximum deviation between original and
+// resulting pixel values.
+//
+// Author: Jyrki Alakuijala (jyrki@google.com)
+// Converted to C by Aleksander Kramarz (akramarz@google.com)
+
+#include <stdlib.h>
+
+#include "../dsp/lossless.h"
+#include "../utils/utils.h"
+#include "./vp8enci.h"
+
+#define MIN_DIM_FOR_NEAR_LOSSLESS 64
+#define MAX_LIMIT_BITS             5
+
+// Stores the quantized 'initial' in *val and 2 * |a - *val| in *distance.
+static void GetValAndDistance(int a, int initial, int bits,
+                              int* const val, int* const distance) {
+  const int mask = ~((1 << bits) - 1);  // clears the 'bits' lowest bits
+  *val = (initial & mask) | (initial >> (8 - bits));  // low bits <- top bits
+  *distance = 2 * abs(a - *val);  // always even; the caller may add 1
+}
+
+// Clamps the value to range [0, 255].
+static int Clamp8b(int val) {
+  const int min_val = 0;
+  const int max_val = 0xff;  // 255
+  return (val < min_val) ? min_val : (val > max_val) ? max_val : val;
+}
+
+// Quantizes values {a, a+(1<<bits), a-(1<<bits)} and returns the nearest one.
+static int FindClosestDiscretized(int a, int bits) {
+  int best_val = a, i;
+  int min_distance = 256;  // starting bound for the (doubled) distance
+
+  for (i = -1; i <= 1; ++i) {
+    int candidate, distance;
+    const int val = Clamp8b(a + i * (1 << bits));
+    GetValAndDistance(a, val, bits, &candidate, &distance);
+    if (i != 0) {
+      ++distance;  // makes it odd, so an equally distant i == 0 always wins
+    }
+    // Smallest distance but favor i == 0 over i == -1 and i == 1
+    // since that keeps the overall intensity more constant in the
+    // images.
+    if (distance < min_distance) {
+      min_distance = distance;
+      best_val = candidate;
+    }
+  }
+  return best_val;
+}
+
+// Applies FindClosestDiscretized to each 8-bit channel of pixel 'a'.
+static uint32_t ClosestDiscretizedArgb(uint32_t a, int bits) {
+  return  // widen before shifting: an 'int' >= 128 shifted by 24 overflows
+      ((uint32_t)FindClosestDiscretized(a >> 24, bits) << 24) |
+      ((uint32_t)FindClosestDiscretized((a >> 16) & 0xff, bits) << 16) |
+      ((uint32_t)FindClosestDiscretized((a >> 8) & 0xff, bits) << 8) |
+      ((uint32_t)FindClosestDiscretized(a & 0xff, bits));
+}
+
+// Returns 1 if every 8-bit channel of pixels a and b differs by strictly
+// less than 'limit', 0 otherwise.
+static int IsNear(uint32_t a, uint32_t b, int limit) {
+  int k;
+  for (k = 0; k < 4; ++k) {  // one iteration per 8-bit channel
+    const int delta =
+        (int)((a >> (k * 8)) & 0xff) - (int)((b >> (k * 8)) & 0xff);
+    if (delta >= limit || delta <= -limit) {
+      return 0;
+    }
+  }
+  return 1;
+}
+
+static int IsSmooth(const uint32_t* const prev_row,
+                    const uint32_t* const curr_row,
+                    const uint32_t* const next_row,
+                    int ix, int limit) {  // reads columns ix - 1 and ix + 1
+  // Smooth: the pixel is near its left, right, top and bottom neighbors.
+  return (IsNear(curr_row[ix], curr_row[ix - 1], limit) &&
+          IsNear(curr_row[ix], curr_row[ix + 1], limit) &&
+          IsNear(curr_row[ix], prev_row[ix], limit) &&
+          IsNear(curr_row[ix], next_row[ix], limit));
+}
+
+// Quantizes, in place, the interior pixels of 'argb' that are not smooth.
+static void NearLossless(int xsize, int ysize, uint32_t* argb,
+                         int limit_bits, uint32_t* copy_buffer) {
+  int x, y;
+  const int limit = 1 << limit_bits;
+  uint32_t* prev_row = copy_buffer;  // copy_buffer: 3 rows of xsize pixels
+  uint32_t* curr_row = prev_row + xsize;
+  uint32_t* next_row = curr_row + xsize;
+  memcpy(copy_buffer, argb, xsize * 2 * sizeof(argb[0]));  // needs ysize >= 2
+
+  for (y = 1; y < ysize - 1; ++y) {  // first and last rows are left as is
+    uint32_t* const curr_argb_row = argb + y * xsize;
+    uint32_t* const next_argb_row = curr_argb_row + xsize;
+    memcpy(next_row, next_argb_row, xsize * sizeof(argb[0]));
+    for (x = 1; x < xsize - 1; ++x) {  // first and last columns are left as is
+      if (!IsSmooth(prev_row, curr_row, next_row, x, limit)) {
+        curr_argb_row[x] = ClosestDiscretizedArgb(curr_row[x], limit_bits);
+      }
+    }
+    {
+      // Rotate the row copies: prev <- curr <- next; old prev gets reused.
+      uint32_t* const temp = prev_row;
+      prev_row = curr_row;
+      curr_row = next_row;
+      next_row = temp;
+    }
+  }
+}
+
+static int QualityToLimitBits(int quality) {
+  // quality mapping:
+  //   0..19 -> 5
+  //  20..39 -> 4
+  //  40..59 -> 3
+  //  60..79 -> 2
+  //  80..99 -> 1
+  //  100    -> 0
+  return MAX_LIMIT_BITS - quality / 20;
+}
+
+int VP8ApplyNearLossless(int xsize, int ysize, uint32_t* argb, int quality) {
+  int i;
+  uint32_t* copy_buffer;  // scratch space for three rows: prev, curr, next
+  const int limit_bits = QualityToLimitBits(quality);
+  assert(argb != NULL);
+  assert(limit_bits >= 0);
+  assert(limit_bits <= MAX_LIMIT_BITS);
+  // Skip small icons, and images with fewer than 3 rows: those have no
+  // interior row to adjust and NearLossless() always copies the first two rows
+  // of 'argb', which would read past the end of the buffer when ysize < 2.
+  if ((xsize < MIN_DIM_FOR_NEAR_LOSSLESS &&
+       ysize < MIN_DIM_FOR_NEAR_LOSSLESS) || ysize < 3) {
+    return 1;  // nothing to do, 'argb' is left untouched
+  }
+  copy_buffer = (uint32_t*)WebPSafeMalloc(xsize * 3, sizeof(*copy_buffer));
+  if (copy_buffer == NULL) return 0;  // out of memory
+  // One pass per limit, from the coarsest (limit_bits) down to 1 bit.
+  for (i = limit_bits; i != 0; --i) {
+    NearLossless(xsize, ysize, argb, i, copy_buffer);
+  }
+  WebPSafeFree(copy_buffer);
+  return 1;  // success
+}
diff --git a/src/3rdparty/libwebp/src/enc/picture.c b/src/3rdparty/libwebp/src/enc/picture.c
index 9a66fbe..26679a7 100644
--- a/src/3rdparty/libwebp/src/enc/picture.c
+++ b/src/3rdparty/libwebp/src/enc/picture.c
@@ -15,6 +15,7 @@
 #include <stdlib.h>
 
 #include "./vp8enci.h"
+#include "../dsp/dsp.h"
 #include "../utils/utils.h"
 
 //------------------------------------------------------------------------------
diff --git a/src/3rdparty/libwebp/src/enc/picture_csp.c b/src/3rdparty/libwebp/src/enc/picture_csp.c
index 7875f62..0ef5f9e 100644
--- a/src/3rdparty/libwebp/src/enc/picture_csp.c
+++ b/src/3rdparty/libwebp/src/enc/picture_csp.c
@@ -32,10 +32,6 @@ static const union {
 } test_endian = { 0xff000000u };
 #define ALPHA_IS_LAST (test_endian.bytes[3] == 0xff)
 
-static WEBP_INLINE uint32_t MakeARGB32(int a, int r, int g, int b) {
-  return (((uint32_t)a << 24) | (r << 16) | (g << 8) | b);
-}
-
 //------------------------------------------------------------------------------
 // Detection of non-trivial transparency
 
@@ -89,9 +85,9 @@ int WebPPictureHasTransparency(const WebPPicture* picture) {
 
 static int kLinearToGammaTab[kGammaTabSize + 1];
 static uint16_t kGammaToLinearTab[256];
-static int kGammaTablesOk = 0;
+static volatile int kGammaTablesOk = 0;
 
-static void InitGammaTables(void) {
+static WEBP_TSAN_IGNORE_FUNCTION void InitGammaTables(void) {
   if (!kGammaTablesOk) {
     int v;
     const double scale = (double)(1 << kGammaTabFix) / kGammaScale;
@@ -130,7 +126,7 @@ static WEBP_INLINE int LinearToGamma(uint32_t base_value, int shift) {
 
 #else
 
-static void InitGammaTables(void) {}
+static WEBP_TSAN_IGNORE_FUNCTION void InitGammaTables(void) {}
 static WEBP_INLINE uint32_t GammaToLinear(uint8_t v) { return v; }
 static WEBP_INLINE int LinearToGamma(uint32_t base_value, int shift) {
   return (int)(base_value << shift);
@@ -162,19 +158,15 @@ static int RGBToV(int r, int g, int b, VP8Random* const rg) {
 static const int kNumIterations = 6;
 static const int kMinDimensionIterativeConversion = 4;
 
-// We use a-priori a different precision for storing RGB and Y/W components
-// We could use YFIX=0 and only uint8_t for fixed_y_t, but it produces some
+// We could use SFIX=0 and only uint8_t for fixed_y_t, but it produces some
 // banding sometimes. Better use extra precision.
-// TODO(skal): cleanup once TFIX/YFIX values are fixed.
+#define SFIX 2                // fixed-point precision of RGB and Y/W
+typedef int16_t fixed_t;      // signed type with extra SFIX precision for UV
+typedef uint16_t fixed_y_t;   // unsigned type with extra SFIX precision for W
 
-typedef int16_t fixed_t;      // signed type with extra TFIX precision for UV
-typedef uint16_t fixed_y_t;   // unsigned type with extra YFIX precision for W
-#define TFIX 6   // fixed-point precision of RGB
-#define YFIX 2   // fixed point precision for Y/W
-
-#define THALF ((1 << TFIX) >> 1)
-#define MAX_Y_T ((256 << YFIX) - 1)
-#define TROUNDER (1 << (YUV_FIX + TFIX - 1))
+#define SHALF (1 << SFIX >> 1)
+#define MAX_Y_T ((256 << SFIX) - 1)
+#define SROUNDER (1 << (YUV_FIX + SFIX - 1))
 
 #if defined(USE_GAMMA_COMPRESSION)
 
@@ -184,9 +176,9 @@ typedef uint16_t fixed_y_t;   // unsigned type with extra YFIX precision for W
 #define kGammaF 2.2
 static float kGammaToLinearTabF[MAX_Y_T + 1];   // size scales with Y_FIX
 static float kLinearToGammaTabF[kGammaTabSize + 2];
-static int kGammaTablesFOk = 0;
+static volatile int kGammaTablesFOk = 0;
 
-static void InitGammaTablesF(void) {
+static WEBP_TSAN_IGNORE_FUNCTION void InitGammaTablesF(void) {
   if (!kGammaTablesFOk) {
     int v;
     const double norm = 1. / MAX_Y_T;
@@ -207,52 +199,31 @@ static WEBP_INLINE float GammaToLinearF(int v) {
   return kGammaToLinearTabF[v];
 }
 
-static WEBP_INLINE float LinearToGammaF(float value) {
+static WEBP_INLINE int LinearToGammaF(float value) {
   const float v = value * kGammaTabSize;
   const int tab_pos = (int)v;
   const float x = v - (float)tab_pos;      // fractional part
   const float v0 = kLinearToGammaTabF[tab_pos + 0];
   const float v1 = kLinearToGammaTabF[tab_pos + 1];
   const float y = v1 * x + v0 * (1.f - x);  // interpolate
-  return y;
+  return (int)(y + .5);
 }
 
 #else
 
-static void InitGammaTablesF(void) {}
+static WEBP_TSAN_IGNORE_FUNCTION void InitGammaTablesF(void) {}
 static WEBP_INLINE float GammaToLinearF(int v) {
   const float norm = 1.f / MAX_Y_T;
   return norm * v;
 }
-static WEBP_INLINE float LinearToGammaF(float value) {
-  return MAX_Y_T * value;
+static WEBP_INLINE int LinearToGammaF(float value) {
+  return (int)(MAX_Y_T * value + .5);
 }
 
 #endif    // USE_GAMMA_COMPRESSION
 
 //------------------------------------------------------------------------------
 
-// precision: YFIX -> TFIX
-static WEBP_INLINE int FixedYToW(int v) {
-#if TFIX == YFIX
-  return v;
-#elif TFIX >= YFIX
-  return v << (TFIX - YFIX);
-#else
-  return v >> (YFIX - TFIX);
-#endif
-}
-
-static WEBP_INLINE int FixedWToY(int v) {
-#if TFIX == YFIX
-  return v;
-#elif YFIX >= TFIX
-  return v << (YFIX - TFIX);
-#else
-  return v >> (TFIX - YFIX);
-#endif
-}
-
 static uint8_t clip_8b(fixed_t v) {
   return (!(v & ~0xff)) ? (uint8_t)v : (v < 0) ? 0u : 255u;
 }
@@ -261,13 +232,6 @@ static fixed_y_t clip_y(int y) {
   return (!(y & ~MAX_Y_T)) ? (fixed_y_t)y : (y < 0) ? 0 : MAX_Y_T;
 }
 
-// precision: TFIX -> YFIX
-static fixed_y_t clip_fixed_t(fixed_t v) {
-  const int y = FixedWToY(v);
-  const fixed_y_t w = clip_y(y);
-  return w;
-}
-
 //------------------------------------------------------------------------------
 
 static int RGBToGray(int r, int g, int b) {
@@ -279,7 +243,7 @@ static float RGBToGrayF(float r, float g, float b) {
   return 0.299f * r + 0.587f * g + 0.114f * b;
 }
 
-static float ScaleDown(int a, int b, int c, int d) {
+static int ScaleDown(int a, int b, int c, int d) {
   const float A = GammaToLinearF(a);
   const float B = GammaToLinearF(b);
   const float C = GammaToLinearF(c);
@@ -293,30 +257,36 @@ static WEBP_INLINE void UpdateW(const fixed_y_t* src, fixed_y_t* dst, int len) {
     const float G = GammaToLinearF(src[1]);
     const float B = GammaToLinearF(src[2]);
     const float Y = RGBToGrayF(R, G, B);
-    *dst++ = (fixed_y_t)(LinearToGammaF(Y) + .5);
+    *dst++ = (fixed_y_t)LinearToGammaF(Y);
     src += 3;
   }
 }
 
-static WEBP_INLINE void UpdateChroma(const fixed_y_t* src1,
-                                     const fixed_y_t* src2,
-                                     fixed_t* dst, fixed_y_t* tmp, int len) {
+static int UpdateChroma(const fixed_y_t* src1,
+                        const fixed_y_t* src2,
+                        fixed_t* dst, fixed_y_t* tmp, int len) {
+  int diff = 0;  // sum over the row of |RGBToGray(*_avg) - W|
   while (len--> 0) {
-    const float r = ScaleDown(src1[0], src1[3], src2[0], src2[3]);
-    const float g = ScaleDown(src1[1], src1[4], src2[1], src2[4]);
-    const float b = ScaleDown(src1[2], src1[5], src2[2], src2[5]);
-    const float W = RGBToGrayF(r, g, b);
-    dst[0] = (fixed_t)FixedYToW((int)(r - W));
-    dst[1] = (fixed_t)FixedYToW((int)(g - W));
-    dst[2] = (fixed_t)FixedYToW((int)(b - W));
+    const int r = ScaleDown(src1[0], src1[3], src2[0], src2[3]);
+    const int g = ScaleDown(src1[1], src1[4], src2[1], src2[4]);
+    const int b = ScaleDown(src1[2], src1[5], src2[2], src2[5]);
+    const int W = RGBToGray(r, g, b);  // gray level of the ScaleDown() results
+    const int r_avg = (src1[0] + src1[3] + src2[0] + src2[3] + 2) >> 2;
+    const int g_avg = (src1[1] + src1[4] + src2[1] + src2[4] + 2) >> 2;
+    const int b_avg = (src1[2] + src1[5] + src2[2] + src2[5] + 2) >> 2;
+    dst[0] = (fixed_t)(r - W);  // R/G/B are stored as offsets from W
+    dst[1] = (fixed_t)(g - W);
+    dst[2] = (fixed_t)(b - W);
     dst += 3;
     src1 += 6;
     src2 += 6;
     if (tmp != NULL) {
-      tmp[0] = tmp[1] = clip_y((int)(W + .5));
+      tmp[0] = tmp[1] = clip_y(W);  // same clipped W for both pixels of the pair
       tmp += 2;
     }
+    diff += abs(RGBToGray(r_avg, g_avg, b_avg) - W);
   }
+  return diff;  // callers accumulate this into diff_sum
 }
 
 //------------------------------------------------------------------------------
@@ -336,9 +306,8 @@ static WEBP_INLINE int Filter2(int A, int B) { return (A * 3 + B + 2) >> 2; }
 
 //------------------------------------------------------------------------------
 
-// 8bit -> YFIX
-static WEBP_INLINE fixed_y_t UpLift(uint8_t a) {
-  return ((fixed_y_t)a << YFIX) | (1 << (YFIX - 1));
+static WEBP_INLINE fixed_y_t UpLift(uint8_t a) {  // 8bit -> SFIX
+  return ((fixed_y_t)a << SFIX) | SHALF;
 }
 
 static void ImportOneRow(const uint8_t* const r_ptr,
@@ -368,50 +337,48 @@ static void InterpolateTwoRows(const fixed_y_t* const best_y,
                                fixed_y_t* const out2) {
   int i, k;
   {  // special boundary case for i==0
-    const int W0 = FixedYToW(best_y[0]);
-    const int W1 = FixedYToW(best_y[w]);
+    const int W0 = best_y[0];
+    const int W1 = best_y[w];
     for (k = 0; k <= 2; ++k) {
-      out1[k] = clip_fixed_t(Filter2(cur_uv[k], prev_uv[k]) + W0);
-      out2[k] = clip_fixed_t(Filter2(cur_uv[k], next_uv[k]) + W1);
+      out1[k] = clip_y(Filter2(cur_uv[k], prev_uv[k]) + W0);
+      out2[k] = clip_y(Filter2(cur_uv[k], next_uv[k]) + W1);
     }
   }
   for (i = 1; i < w - 1; ++i) {
-    const int W0 = FixedYToW(best_y[i + 0]);
-    const int W1 = FixedYToW(best_y[i + w]);
+    const int W0 = best_y[i + 0];
+    const int W1 = best_y[i + w];
     const int off = 3 * (i >> 1);
     for (k = 0; k <= 2; ++k) {
       const int tmp0 = Filter(cur_uv + off + k, prev_uv + off + k, i & 1);
       const int tmp1 = Filter(cur_uv + off + k, next_uv + off + k, i & 1);
-      out1[3 * i + k] = clip_fixed_t(tmp0 + W0);
-      out2[3 * i + k] = clip_fixed_t(tmp1 + W1);
+      out1[3 * i + k] = clip_y(tmp0 + W0);
+      out2[3 * i + k] = clip_y(tmp1 + W1);
     }
   }
   {  // special boundary case for i == w - 1
-    const int W0 = FixedYToW(best_y[i + 0]);
-    const int W1 = FixedYToW(best_y[i + w]);
+    const int W0 = best_y[i + 0];
+    const int W1 = best_y[i + w];
     const int off = 3 * (i >> 1);
     for (k = 0; k <= 2; ++k) {
-      out1[3 * i + k] =
-          clip_fixed_t(Filter2(cur_uv[off + k], prev_uv[off + k]) + W0);
-      out2[3 * i + k] =
-          clip_fixed_t(Filter2(cur_uv[off + k], next_uv[off + k]) + W1);
+      out1[3 * i + k] = clip_y(Filter2(cur_uv[off + k], prev_uv[off + k]) + W0);
+      out2[3 * i + k] = clip_y(Filter2(cur_uv[off + k], next_uv[off + k]) + W1);
     }
   }
 }
 
 static WEBP_INLINE uint8_t ConvertRGBToY(int r, int g, int b) {
-  const int luma = 16839 * r + 33059 * g + 6420 * b + TROUNDER;
-  return clip_8b(16 + (luma >> (YUV_FIX + TFIX)));
+  const int luma = 16839 * r + 33059 * g + 6420 * b + SROUNDER;
+  return clip_8b(16 + (luma >> (YUV_FIX + SFIX)));
 }
 
 static WEBP_INLINE uint8_t ConvertRGBToU(int r, int g, int b) {
-  const int u =  -9719 * r - 19081 * g + 28800 * b + TROUNDER;
-  return clip_8b(128 + (u >> (YUV_FIX + TFIX)));
+  const int u =  -9719 * r - 19081 * g + 28800 * b + SROUNDER;
+  return clip_8b(128 + (u >> (YUV_FIX + SFIX)));
 }
 
 static WEBP_INLINE uint8_t ConvertRGBToV(int r, int g, int b) {
-  const int v = +28800 * r - 24116 * g -  4684 * b + TROUNDER;
-  return clip_8b(128 + (v >> (YUV_FIX + TFIX)));
+  const int v = +28800 * r - 24116 * g -  4684 * b + SROUNDER;
+  return clip_8b(128 + (v >> (YUV_FIX + SFIX)));
 }
 
 static int ConvertWRGBToYUV(const fixed_y_t* const best_y,
@@ -426,7 +393,7 @@ static int ConvertWRGBToYUV(const fixed_y_t* const best_y,
     for (i = 0; i < picture->width; ++i) {
       const int off = 3 * ((i >> 1) + (j >> 1) * uv_w);
       const int off2 = i + j * picture->y_stride;
-      const int W = FixedYToW(best_y[i + j * w]);
+      const int W = best_y[i + j * w];
       const int r = best_uv[off + 0] + W;
       const int g = best_uv[off + 1] + W;
       const int b = best_uv[off + 2] + W;
@@ -475,6 +442,10 @@ static int PreprocessARGB(const uint8_t* const r_ptr,
   fixed_t* const target_uv = SAFE_ALLOC(uv_w * 3, uv_h, fixed_t);
   fixed_t* const best_rgb_uv = SAFE_ALLOC(uv_w * 3, 1, fixed_t);
   int ok;
+  int diff_sum = 0;
+  const int first_diff_threshold = (int)(2.5 * w * h);
+  const int min_improvement = 5;   // stop if improvement is below this %
+  const int min_first_improvement = 80;
 
   if (best_y == NULL || best_uv == NULL ||
       target_y == NULL || target_uv == NULL ||
@@ -507,7 +478,7 @@ static int PreprocessARGB(const uint8_t* const r_ptr,
     }
     UpdateW(src1, target_y + (j + 0) * w, w);
     UpdateW(src2, target_y + (j + 1) * w, w);
-    UpdateChroma(src1, src2, target_uv + uv_off, dst_y, uv_w);
+    diff_sum += UpdateChroma(src1, src2, target_uv + uv_off, dst_y, uv_w);
     memcpy(best_uv + uv_off, target_uv + uv_off, 3 * uv_w * sizeof(*best_uv));
     memcpy(dst_y + w, dst_y, w * sizeof(*dst_y));
   }
@@ -517,10 +488,11 @@ static int PreprocessARGB(const uint8_t* const r_ptr,
     int k;
     const fixed_t* cur_uv = best_uv;
     const fixed_t* prev_uv = best_uv;
+    const int old_diff_sum = diff_sum;
+    diff_sum = 0;
     for (j = 0; j < h; j += 2) {
       fixed_y_t* const src1 = tmp_buffer;
       fixed_y_t* const src2 = tmp_buffer + 3 * w;
-
       {
         const fixed_t* const next_uv = cur_uv + ((j < h - 2) ? 3 * uv_w : 0);
         InterpolateTwoRows(best_y + j * w, prev_uv, cur_uv, next_uv,
@@ -531,7 +503,7 @@ static int PreprocessARGB(const uint8_t* const r_ptr,
 
       UpdateW(src1, best_rgb_y + 0 * w, w);
       UpdateW(src2, best_rgb_y + 1 * w, w);
-      UpdateChroma(src1, src2, best_rgb_uv, NULL, uv_w);
+      diff_sum += UpdateChroma(src1, src2, best_rgb_uv, NULL, uv_w);
 
       // update two rows of Y and one row of RGB
       for (i = 0; i < 2 * w; ++i) {
@@ -553,7 +525,23 @@ static int PreprocessARGB(const uint8_t* const r_ptr,
         }
       }
     }
-    // TODO(skal): add early-termination criterion
+    // test exit condition
+    if (diff_sum > 0) {
+      const int improvement = 100 * abs(diff_sum - old_diff_sum) / diff_sum;
+      // Check whether the first iteration already gave a good result without
+      // a large jump in improvement (a large jump means we should try a few
+      // extra iterations, just to be sure).
+      if (iter == 0 && diff_sum < first_diff_threshold &&
+          improvement < min_first_improvement) {
+        break;
+      }
+      // then, check if improvement is stalling.
+      if (improvement < min_improvement) {
+        break;
+      }
+    } else {
+      break;
+    }
   }
 
   // final reconstruction
@@ -762,23 +750,20 @@ static WEBP_INLINE void ConvertRowToY(const uint8_t* const r_ptr,
                                       int width,
                                       VP8Random* const rg) {
   int i, j;
-  for (i = 0, j = 0; i < width; ++i, j += step) {
+  for (i = 0, j = 0; i < width; i += 1, j += step) {
     dst_y[i] = RGBToY(r_ptr[j], g_ptr[j], b_ptr[j], rg);
   }
 }
 
-static WEBP_INLINE void ConvertRowsToUVWithAlpha(const uint8_t* const r_ptr,
-                                                 const uint8_t* const g_ptr,
-                                                 const uint8_t* const b_ptr,
-                                                 const uint8_t* const a_ptr,
-                                                 int rgb_stride,
-                                                 uint8_t* const dst_u,
-                                                 uint8_t* const dst_v,
-                                                 int width,
-                                                 VP8Random* const rg) {
+static WEBP_INLINE void AccumulateRGBA(const uint8_t* const r_ptr,
+                                       const uint8_t* const g_ptr,
+                                       const uint8_t* const b_ptr,
+                                       const uint8_t* const a_ptr,
+                                       int rgb_stride,
+                                       uint16_t* dst, int width) {
   int i, j;
-  // we loop over 2x2 blocks and produce one U/V value for each.
-  for (i = 0, j = 0; i < (width >> 1); ++i, j += 2 * sizeof(uint32_t)) {
+  // we loop over 2x2 blocks and produce one R/G/B/A value for each.
+  for (i = 0, j = 0; i < (width >> 1); i += 1, j += 2 * 4, dst += 4) {
     const uint32_t a = SUM4ALPHA(a_ptr + j);
     int r, g, b;
     if (a == 4 * 0xff || a == 0) {
@@ -790,8 +775,10 @@ static WEBP_INLINE void ConvertRowsToUVWithAlpha(const uint8_t* const r_ptr,
       g = LinearToGammaWeighted(g_ptr + j, a_ptr + j, a, 4, rgb_stride);
       b = LinearToGammaWeighted(b_ptr + j, a_ptr + j, a, 4, rgb_stride);
     }
-    dst_u[i] = RGBToU(r, g, b, rg);
-    dst_v[i] = RGBToV(r, g, b, rg);
+    dst[0] = r;
+    dst[1] = g;
+    dst[2] = b;
+    dst[3] = a;
   }
   if (width & 1) {
     const uint32_t a = 2u * SUM2ALPHA(a_ptr + j);
@@ -805,31 +792,39 @@ static WEBP_INLINE void ConvertRowsToUVWithAlpha(const uint8_t* const r_ptr,
       g = LinearToGammaWeighted(g_ptr + j, a_ptr + j, a, 0, rgb_stride);
       b = LinearToGammaWeighted(b_ptr + j, a_ptr + j, a, 0, rgb_stride);
     }
-    dst_u[i] = RGBToU(r, g, b, rg);
-    dst_v[i] = RGBToV(r, g, b, rg);
+    dst[0] = r;
+    dst[1] = g;
+    dst[2] = b;
+    dst[3] = a;
+  }
+}
+
+static WEBP_INLINE void AccumulateRGB(const uint8_t* const r_ptr,
+                                      const uint8_t* const g_ptr,
+                                      const uint8_t* const b_ptr,
+                                      int step, int rgb_stride,
+                                      uint16_t* dst, int width) {
+  int i, j;  // i: output sample index, j: byte offset into the source rows
+  for (i = 0, j = 0; i < (width >> 1); i += 1, j += 2 * step, dst += 4) {
+    dst[0] = SUM4(r_ptr + j, step);  // one R/G/B triple per pair of columns
+    dst[1] = SUM4(g_ptr + j, step);
+    dst[2] = SUM4(b_ptr + j, step);  // dst[3] is not written by this function
+  }
+  if (width & 1) {  // odd width: the last column goes through SUM2 instead
+    dst[0] = SUM2(r_ptr + j);  // NOTE(review): rgb_stride is presumably used
+    dst[1] = SUM2(g_ptr + j);  // inside the SUM4/SUM2 macros - confirm there
+    dst[2] = SUM2(b_ptr + j);
+  }
+}
 
-static WEBP_INLINE void ConvertRowsToUV(const uint8_t* const r_ptr,
-                                        const uint8_t* const g_ptr,
-                                        const uint8_t* const b_ptr,
-                                        int step, int rgb_stride,
+static WEBP_INLINE void ConvertRowsToUV(const uint16_t* rgb,
                                         uint8_t* const dst_u,
                                         uint8_t* const dst_v,
                                         int width,
                                         VP8Random* const rg) {
-  int i, j;
-  for (i = 0, j = 0; i < (width >> 1); ++i, j += 2 * step) {
-    const int r = SUM4(r_ptr + j, step);
-    const int g = SUM4(g_ptr + j, step);
-    const int b = SUM4(b_ptr + j, step);
-    dst_u[i] = RGBToU(r, g, b, rg);
-    dst_v[i] = RGBToV(r, g, b, rg);
-  }
-  if (width & 1) {
-    const int r = SUM2(r_ptr + j);
-    const int g = SUM2(g_ptr + j);
-    const int b = SUM2(b_ptr + j);
+  int i;
+  for (i = 0; i < width; i += 1, rgb += 4) {
+    const int r = rgb[0], g = rgb[1], b = rgb[2];
     dst_u[i] = RGBToU(r, g, b, rg);
     dst_v[i] = RGBToV(r, g, b, rg);
   }
@@ -848,6 +843,7 @@ static int ImportYUVAFromRGBA(const uint8_t* const r_ptr,
   const int width = picture->width;
   const int height = picture->height;
   const int has_alpha = CheckNonOpaque(a_ptr, width, height, step, rgb_stride);
+  const int is_rgb = (r_ptr < b_ptr);  // otherwise it's bgr
 
   picture->colorspace = has_alpha ? WEBP_YUV420A : WEBP_YUV420;
   picture->use_argb = 0;
@@ -864,7 +860,7 @@ static int ImportYUVAFromRGBA(const uint8_t* const r_ptr,
   if (has_alpha) {
     WebPInitAlphaProcessing();
     assert(step == 4);
-#if defined(USE_INVERSE_ALPHA_TABLE)
+#if defined(USE_GAMMA_COMPRESSION) && defined(USE_INVERSE_ALPHA_TABLE)
     assert(kAlphaFix + kGammaFix <= 31);
 #endif
   }
@@ -879,6 +875,11 @@ static int ImportYUVAFromRGBA(const uint8_t* const r_ptr,
                        picture->a, picture->a_stride);
     }
   } else {
+    const int uv_width = (width + 1) >> 1;
+    int use_dsp = (step == 3);  // use special function in this case
+    // temporary storage for accumulated R/G/B values during conversion to U/V
+    uint16_t* const tmp_rgb =
+        (uint16_t*)WebPSafeMalloc(4 * uv_width, sizeof(*tmp_rgb));
     uint8_t* dst_y = picture->y;
     uint8_t* dst_u = picture->u;
     uint8_t* dst_v = picture->v;
@@ -889,19 +890,32 @@ static int ImportYUVAFromRGBA(const uint8_t* const r_ptr,
     if (dithering > 0.) {
       VP8InitRandom(&base_rg, dithering);
       rg = &base_rg;
+      use_dsp = 0;   // can't use dsp in this case
     }
-
+    WebPInitConvertARGBToYUV();
     InitGammaTables();
 
+    if (tmp_rgb == NULL) return 0;  // malloc error
+
     // Downsample Y/U/V planes, two rows at a time
     for (y = 0; y < (height >> 1); ++y) {
       int rows_have_alpha = has_alpha;
       const int off1 = (2 * y + 0) * rgb_stride;
       const int off2 = (2 * y + 1) * rgb_stride;
-      ConvertRowToY(r_ptr + off1, g_ptr + off1, b_ptr + off1, step,
-                    dst_y, width, rg);
-      ConvertRowToY(r_ptr + off2, g_ptr + off2, b_ptr + off2, step,
-                    dst_y + picture->y_stride, width, rg);
+      if (use_dsp) {
+        if (is_rgb) {
+          WebPConvertRGB24ToY(r_ptr + off1, dst_y, width);
+          WebPConvertRGB24ToY(r_ptr + off2, dst_y + picture->y_stride, width);
+        } else {
+          WebPConvertBGR24ToY(b_ptr + off1, dst_y, width);
+          WebPConvertBGR24ToY(b_ptr + off2, dst_y + picture->y_stride, width);
+        }
+      } else {
+        ConvertRowToY(r_ptr + off1, g_ptr + off1, b_ptr + off1, step,
+                      dst_y, width, rg);
+        ConvertRowToY(r_ptr + off2, g_ptr + off2, b_ptr + off2, step,
+                      dst_y + picture->y_stride, width, rg);
+      }
       dst_y += 2 * picture->y_stride;
       if (has_alpha) {
         rows_have_alpha &= !WebPExtractAlpha(a_ptr + off1, rgb_stride,
@@ -909,13 +923,19 @@ static int ImportYUVAFromRGBA(const uint8_t* const r_ptr,
                                              dst_a, picture->a_stride);
         dst_a += 2 * picture->a_stride;
       }
+      // Collect averaged R/G/B(/A)
       if (!rows_have_alpha) {
-        ConvertRowsToUV(r_ptr + off1, g_ptr + off1, b_ptr + off1,
-                        step, rgb_stride, dst_u, dst_v, width, rg);
+        AccumulateRGB(r_ptr + off1, g_ptr + off1, b_ptr + off1,
+                      step, rgb_stride, tmp_rgb, width);
+      } else {
+        AccumulateRGBA(r_ptr + off1, g_ptr + off1, b_ptr + off1, a_ptr + off1,
+                       rgb_stride, tmp_rgb, width);
+      }
+      // Convert to U/V
+      if (rg == NULL) {
+        WebPConvertRGBA32ToUV(tmp_rgb, dst_u, dst_v, uv_width);
       } else {
-        ConvertRowsToUVWithAlpha(r_ptr + off1, g_ptr + off1, b_ptr + off1,
-                                 a_ptr + off1, rgb_stride,
-                                 dst_u, dst_v, width, rg);
+        ConvertRowsToUV(tmp_rgb, dst_u, dst_v, uv_width, rg);
       }
       dst_u += picture->uv_stride;
       dst_v += picture->uv_stride;
@@ -923,20 +943,35 @@ static int ImportYUVAFromRGBA(const uint8_t* const r_ptr,
     if (height & 1) {    // extra last row
       const int off = 2 * y * rgb_stride;
       int row_has_alpha = has_alpha;
-      ConvertRowToY(r_ptr + off, g_ptr + off, b_ptr + off, step,
-                    dst_y, width, rg);
+      if (use_dsp) {
+        if (is_rgb) {  // same channel-order flag as the two-row loop above
+          WebPConvertRGB24ToY(r_ptr + off, dst_y, width);
+        } else {
+          WebPConvertBGR24ToY(b_ptr + off, dst_y, width);
+        }
+      } else {
+        ConvertRowToY(r_ptr + off, g_ptr + off, b_ptr + off, step,
+                      dst_y, width, rg);
+      }
       if (row_has_alpha) {
         row_has_alpha &= !WebPExtractAlpha(a_ptr + off, 0, width, 1, dst_a, 0);
       }
+      // Collect averaged R/G/B(/A)
       if (!row_has_alpha) {
-        ConvertRowsToUV(r_ptr + off, g_ptr + off, b_ptr + off,
-                        step, 0, dst_u, dst_v, width, rg);
+        // Collect averaged R/G/B
+        AccumulateRGB(r_ptr + off, g_ptr + off, b_ptr + off,
+                      step, /* rgb_stride = */ 0, tmp_rgb, width);
+      } else {
+        AccumulateRGBA(r_ptr + off, g_ptr + off, b_ptr + off, a_ptr + off,
+                       /* rgb_stride = */ 0, tmp_rgb, width);
+      }
+      if (rg == NULL) {
+        WebPConvertRGBA32ToUV(tmp_rgb, dst_u, dst_v, uv_width);
       } else {
-        ConvertRowsToUVWithAlpha(r_ptr + off, g_ptr + off, b_ptr + off,
-                                 a_ptr + off, 0,
-                                 dst_u, dst_v, width, rg);
+        ConvertRowsToUV(tmp_rgb, dst_u, dst_v, uv_width, rg);
       }
     }
+    WebPSafeFree(tmp_rgb);
   }
   return 1;
 }
@@ -978,11 +1013,9 @@ int WebPPictureARGBToYUVA(WebPPicture* picture, WebPEncCSP colorspace) {
   return PictureARGBToYUVA(picture, colorspace, 0.f, 0);
 }
 
-#if WEBP_ENCODER_ABI_VERSION > 0x0204
 int WebPPictureSmartARGBToYUVA(WebPPicture* picture) {
   return PictureARGBToYUVA(picture, WEBP_YUV420, 0.f, 1);
 }
-#endif
 
 //------------------------------------------------------------------------------
 // call for YUVA -> ARGB conversion
@@ -1066,14 +1099,23 @@ static int Import(WebPPicture* const picture,
   }
   if (!WebPPictureAlloc(picture)) return 0;
 
-  assert(step >= (import_alpha ? 4 : 3));
-  for (y = 0; y < height; ++y) {
-    uint32_t* const dst = &picture->argb[y * picture->argb_stride];
-    int x;
-    for (x = 0; x < width; ++x) {
-      const int offset = step * x + y * rgb_stride;
-      dst[x] = MakeARGB32(import_alpha ? a_ptr[offset] : 0xff,
-                          r_ptr[offset], g_ptr[offset], b_ptr[offset]);
+  VP8EncDspARGBInit();
+
+  if (import_alpha) {
+    assert(step == 4);
+    for (y = 0; y < height; ++y) {
+      uint32_t* const dst = &picture->argb[y * picture->argb_stride];
+      const int offset = y * rgb_stride;
+      VP8PackARGB(a_ptr + offset, r_ptr + offset, g_ptr + offset,
+                  b_ptr + offset, width, dst);
+    }
+  } else {
+    assert(step >= 3);
+    for (y = 0; y < height; ++y) {
+      uint32_t* const dst = &picture->argb[y * picture->argb_stride];
+      const int offset = y * rgb_stride;
+      VP8PackRGB(r_ptr + offset, g_ptr + offset, b_ptr + offset,
+                 width, step, dst);
     }
   }
   return 1;
diff --git a/src/3rdparty/libwebp/src/enc/picture_psnr.c b/src/3rdparty/libwebp/src/enc/picture_psnr.c
index 2254b7e..40214ef 100644
--- a/src/3rdparty/libwebp/src/enc/picture_psnr.c
+++ b/src/3rdparty/libwebp/src/enc/picture_psnr.c
@@ -12,8 +12,10 @@
 // Author: Skal (pascal.massimino@gmail.com)
 
 #include <math.h>
+#include <stdlib.h>
 
 #include "./vp8enci.h"
+#include "../utils/utils.h"
 
 //------------------------------------------------------------------------------
 // local-min distortion
@@ -23,9 +25,9 @@
 
 #define RADIUS 2  // search radius. Shouldn't be too large.
 
-static float AccumulateLSIM(const uint8_t* src, int src_stride,
-                            const uint8_t* ref, int ref_stride,
-                            int w, int h) {
+static void AccumulateLSIM(const uint8_t* src, int src_stride,
+                           const uint8_t* ref, int ref_stride,
+                           int w, int h, DistoStats* stats) {
   int x, y;
   double total_sse = 0.;
   for (y = 0; y < h; ++y) {
@@ -38,16 +40,22 @@ static float AccumulateLSIM(const uint8_t* src, int src_stride,
       const double value = (double)ref[y * ref_stride + x];
       int i, j;
       for (j = y_0; j < y_1; ++j) {
-        const uint8_t* s = src + j * src_stride;
+        const uint8_t* const s = src + j * src_stride;
         for (i = x_0; i < x_1; ++i) {
-          const double sse = (double)(s[i] - value) * (s[i] - value);
+          const double diff = s[i] - value;
+          const double sse = diff * diff;
           if (sse < best_sse) best_sse = sse;
         }
       }
       total_sse += best_sse;
     }
   }
-  return (float)total_sse;
+  stats->w = w * h;        // number of pixels visited by the loops above
+  stats->xm = 0;
+  stats->ym = 0;
+  stats->xxm = total_sse;  // sum of the per-pixel best squared differences
+  stats->yym = 0;
+  stats->xym = 0;          // was 'xxm = 0', which erased total_sse
 }
 #undef RADIUS
 
@@ -64,73 +72,90 @@ static float GetPSNR(const double v) {
 int WebPPictureDistortion(const WebPPicture* src, const WebPPicture* ref,
                           int type, float result[5]) {
   DistoStats stats[5];
-  int has_alpha;
-  int uv_w, uv_h;
+  int w, h;
+
+  memset(stats, 0, sizeof(stats));
 
   if (src == NULL || ref == NULL ||
       src->width != ref->width || src->height != ref->height ||
-      src->y == NULL || ref->y == NULL ||
-      src->u == NULL || ref->u == NULL ||
-      src->v == NULL || ref->v == NULL ||
-      result == NULL) {
-    return 0;
-  }
-  // TODO(skal): provide distortion for ARGB too.
-  if (src->use_argb == 1 || src->use_argb != ref->use_argb) {
-    return 0;
-  }
-
-  has_alpha = !!(src->colorspace & WEBP_CSP_ALPHA_BIT);
-  if (has_alpha != !!(ref->colorspace & WEBP_CSP_ALPHA_BIT) ||
-      (has_alpha && (src->a == NULL || ref->a == NULL))) {
+      src->use_argb != ref->use_argb || result == NULL) {
     return 0;
   }
+  w = src->width;
+  h = src->height;
 
-  memset(stats, 0, sizeof(stats));
+  if (src->use_argb == 1) {
+    if (src->argb == NULL || ref->argb == NULL) {
+      return 0;
+    } else {
+      int i, j, c;
+      uint8_t* tmp1, *tmp2;
+      uint8_t* const tmp_plane =
+          (uint8_t*)WebPSafeMalloc(2ULL * w * h, sizeof(*tmp_plane));
+      if (tmp_plane == NULL) return 0;
+      tmp1 = tmp_plane;
+      tmp2 = tmp_plane + w * h;
+      for (c = 0; c < 4; ++c) {
+        for (j = 0; j < h; ++j) {
+          for (i = 0; i < w; ++i) {
+            tmp1[j * w + i] = src->argb[i + j * src->argb_stride] >> (c * 8);
+            tmp2[j * w + i] = ref->argb[i + j * ref->argb_stride] >> (c * 8);
+          }
+        }
+        if (type >= 2) {
+          AccumulateLSIM(tmp1, w, tmp2, w, w, h, &stats[c]);
+        } else {
+          VP8SSIMAccumulatePlane(tmp1, w, tmp2, w, w, h, &stats[c]);
+        }
+      }
+      WebPSafeFree(tmp_plane);  // pairs with the WebPSafeMalloc() above
+    }
+  } else {
+    int has_alpha, uv_w, uv_h;
+    if (src->y == NULL || ref->y == NULL ||
+        src->u == NULL || ref->u == NULL ||
+        src->v == NULL || ref->v == NULL) {
+      return 0;
+    }
+    has_alpha = !!(src->colorspace & WEBP_CSP_ALPHA_BIT);
+    if (has_alpha != !!(ref->colorspace & WEBP_CSP_ALPHA_BIT) ||
+        (has_alpha && (src->a == NULL || ref->a == NULL))) {
+      return 0;
+    }
 
-  uv_w = (src->width + 1) >> 1;
-  uv_h = (src->height + 1) >> 1;
-  if (type >= 2) {
-    float sse[4];
-    sse[0] = AccumulateLSIM(src->y, src->y_stride,
-                            ref->y, ref->y_stride, src->width, src->height);
-    sse[1] = AccumulateLSIM(src->u, src->uv_stride,
-                            ref->u, ref->uv_stride, uv_w, uv_h);
-    sse[2] = AccumulateLSIM(src->v, src->uv_stride,
-                            ref->v, ref->uv_stride, uv_w, uv_h);
-    sse[3] = has_alpha ? AccumulateLSIM(src->a, src->a_stride,
-                                        ref->a, ref->a_stride,
-                                        src->width, src->height)
-                       : 0.f;
-    result[0] = GetPSNR(sse[0] / (src->width * src->height));
-    result[1] = GetPSNR(sse[1] / (uv_w * uv_h));
-    result[2] = GetPSNR(sse[2] / (uv_w * uv_h));
-    result[3] = GetPSNR(sse[3] / (src->width * src->height));
-    {
-      double total_sse = sse[0] + sse[1] + sse[2];
-      int total_pixels = src->width * src->height + 2 * uv_w * uv_h;
+    uv_w = (src->width + 1) >> 1;
+    uv_h = (src->height + 1) >> 1;
+    if (type >= 2) {
+      AccumulateLSIM(src->y, src->y_stride, ref->y, ref->y_stride,
+                     w, h, &stats[0]);
+      AccumulateLSIM(src->u, src->uv_stride, ref->u, ref->uv_stride,
+                     uv_w, uv_h, &stats[1]);
+      AccumulateLSIM(src->v, src->uv_stride, ref->v, ref->uv_stride,
+                     uv_w, uv_h, &stats[2]);
       if (has_alpha) {
-        total_pixels += src->width * src->height;
-        total_sse += sse[3];
+        AccumulateLSIM(src->a, src->a_stride, ref->a, ref->a_stride,
+                       w, h, &stats[3]);
+      }
+    } else {
+      VP8SSIMAccumulatePlane(src->y, src->y_stride,
+                             ref->y, ref->y_stride,
+                             w, h, &stats[0]);
+      VP8SSIMAccumulatePlane(src->u, src->uv_stride,
+                             ref->u, ref->uv_stride,
+                             uv_w, uv_h, &stats[1]);
+      VP8SSIMAccumulatePlane(src->v, src->uv_stride,
+                             ref->v, ref->uv_stride,
+                             uv_w, uv_h, &stats[2]);
+      if (has_alpha) {
+        VP8SSIMAccumulatePlane(src->a, src->a_stride,
+                               ref->a, ref->a_stride,
+                               w, h, &stats[3]);
       }
-      result[4] = GetPSNR(total_sse / total_pixels);
     }
-  } else {
+  }
+  // Final stat calculations.
+  {
     int c;
-    VP8SSIMAccumulatePlane(src->y, src->y_stride,
-                           ref->y, ref->y_stride,
-                           src->width, src->height, &stats[0]);
-    VP8SSIMAccumulatePlane(src->u, src->uv_stride,
-                           ref->u, ref->uv_stride,
-                           uv_w, uv_h, &stats[1]);
-    VP8SSIMAccumulatePlane(src->v, src->uv_stride,
-                           ref->v, ref->uv_stride,
-                           uv_w, uv_h, &stats[2]);
-    if (has_alpha) {
-      VP8SSIMAccumulatePlane(src->a, src->a_stride,
-                             ref->a, ref->a_stride,
-                             src->width, src->height, &stats[3]);
-    }
     for (c = 0; c <= 4; ++c) {
       if (type == 1) {
         const double v = VP8SSIMGet(&stats[c]);
diff --git a/src/3rdparty/libwebp/src/enc/picture_rescale.c b/src/3rdparty/libwebp/src/enc/picture_rescale.c
index 9e45551..9f19e8e 100644
--- a/src/3rdparty/libwebp/src/enc/picture_rescale.c
+++ b/src/3rdparty/libwebp/src/enc/picture_rescale.c
@@ -30,16 +30,6 @@ static void PictureGrabSpecs(const WebPPicture* const src,
 }
 
 //------------------------------------------------------------------------------
-// Picture copying
-
-static void CopyPlane(const uint8_t* src, int src_stride,
-                      uint8_t* dst, int dst_stride, int width, int height) {
-  while (height-- > 0) {
-    memcpy(dst, src, width);
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
 
 // Adjust top-left corner to chroma sample position.
 static void SnapTopLeftPosition(const WebPPicture* const pic,
@@ -70,20 +60,20 @@ int WebPPictureCopy(const WebPPicture* src, WebPPicture* dst) {
   if (!WebPPictureAlloc(dst)) return 0;
 
   if (!src->use_argb) {
-    CopyPlane(src->y, src->y_stride,
-              dst->y, dst->y_stride, dst->width, dst->height);
-    CopyPlane(src->u, src->uv_stride,
-              dst->u, dst->uv_stride, HALVE(dst->width), HALVE(dst->height));
-    CopyPlane(src->v, src->uv_stride,
-              dst->v, dst->uv_stride, HALVE(dst->width), HALVE(dst->height));
+    WebPCopyPlane(src->y, src->y_stride,
+                  dst->y, dst->y_stride, dst->width, dst->height);
+    WebPCopyPlane(src->u, src->uv_stride, dst->u, dst->uv_stride,
+                  HALVE(dst->width), HALVE(dst->height));
+    WebPCopyPlane(src->v, src->uv_stride, dst->v, dst->uv_stride,
+                  HALVE(dst->width), HALVE(dst->height));
     if (dst->a != NULL)  {
-      CopyPlane(src->a, src->a_stride,
-                dst->a, dst->a_stride, dst->width, dst->height);
+      WebPCopyPlane(src->a, src->a_stride,
+                    dst->a, dst->a_stride, dst->width, dst->height);
     }
   } else {
-    CopyPlane((const uint8_t*)src->argb, 4 * src->argb_stride,
-              (uint8_t*)dst->argb, 4 * dst->argb_stride,
-              4 * dst->width, dst->height);
+    WebPCopyPlane((const uint8_t*)src->argb, 4 * src->argb_stride,
+                  (uint8_t*)dst->argb, 4 * dst->argb_stride,
+                  4 * dst->width, dst->height);
   }
   return 1;
 }
@@ -144,24 +134,23 @@ int WebPPictureCrop(WebPPicture* pic,
   if (!pic->use_argb) {
     const int y_offset = top * pic->y_stride + left;
     const int uv_offset = (top / 2) * pic->uv_stride + left / 2;
-    CopyPlane(pic->y + y_offset, pic->y_stride,
-              tmp.y, tmp.y_stride, width, height);
-    CopyPlane(pic->u + uv_offset, pic->uv_stride,
-              tmp.u, tmp.uv_stride, HALVE(width), HALVE(height));
-    CopyPlane(pic->v + uv_offset, pic->uv_stride,
-              tmp.v, tmp.uv_stride, HALVE(width), HALVE(height));
+    WebPCopyPlane(pic->y + y_offset, pic->y_stride,
+                  tmp.y, tmp.y_stride, width, height);
+    WebPCopyPlane(pic->u + uv_offset, pic->uv_stride,
+                  tmp.u, tmp.uv_stride, HALVE(width), HALVE(height));
+    WebPCopyPlane(pic->v + uv_offset, pic->uv_stride,
+                  tmp.v, tmp.uv_stride, HALVE(width), HALVE(height));
 
     if (tmp.a != NULL) {
       const int a_offset = top * pic->a_stride + left;
-      CopyPlane(pic->a + a_offset, pic->a_stride,
-                tmp.a, tmp.a_stride, width, height);
+      WebPCopyPlane(pic->a + a_offset, pic->a_stride,
+                    tmp.a, tmp.a_stride, width, height);
     }
   } else {
     const uint8_t* const src =
         (const uint8_t*)(pic->argb + top * pic->argb_stride + left);
-    CopyPlane(src, pic->argb_stride * 4,
-              (uint8_t*)tmp.argb, tmp.argb_stride * 4,
-              width * 4, height);
+    WebPCopyPlane(src, pic->argb_stride * 4, (uint8_t*)tmp.argb,
+                  tmp.argb_stride * 4, width * 4, height);
   }
   WebPPictureFree(pic);
   *pic = tmp;
@@ -210,16 +199,10 @@ int WebPPictureRescale(WebPPicture* pic, int width, int height) {
   if (pic == NULL) return 0;
   prev_width = pic->width;
   prev_height = pic->height;
-  // if width is unspecified, scale original proportionally to height ratio.
-  if (width == 0) {
-    width = (prev_width * height + prev_height / 2) / prev_height;
+  if (!WebPRescalerGetScaledDimensions(
+          prev_width, prev_height, &width, &height)) {
+    return 0;
   }
-  // if height is unspecified, scale original proportionally to width ratio.
-  if (height == 0) {
-    height = (prev_height * width + prev_width / 2) / prev_width;
-  }
-  // Check if the overall dimensions still make sense.
-  if (width <= 0 || height <= 0) return 0;
 
   PictureGrabSpecs(pic, &tmp);
   tmp.width = width;
diff --git a/src/3rdparty/libwebp/src/enc/picture_tools.c b/src/3rdparty/libwebp/src/enc/picture_tools.c
index 7c73646..bf97af8 100644
--- a/src/3rdparty/libwebp/src/enc/picture_tools.c
+++ b/src/3rdparty/libwebp/src/enc/picture_tools.c
@@ -11,6 +11,8 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
+#include <assert.h>
+
 #include "./vp8enci.h"
 #include "../dsp/yuv.h"
 
@@ -120,6 +122,24 @@ void WebPCleanupTransparentArea(WebPPicture* pic) {
 #undef SIZE
 #undef SIZE2
 
+void WebPCleanupTransparentAreaLossless(WebPPicture* const pic) {
+  int x, y, w, h;
+  uint32_t* argb;
+  assert(pic != NULL && pic->use_argb);  // requires the ARGB representation
+  w = pic->width;
+  h = pic->height;
+  argb = pic->argb;  // start of the current row
+  // Clear, in place, every pixel whose top (alpha) byte is zero.
+  for (y = 0; y < h; ++y) {
+    for (x = 0; x < w; ++x) {
+      if ((argb[x] & 0xff000000) == 0) {  // top byte of the pixel is zero
+        argb[x] = 0x00000000;             // zero the remaining 24 bits too
+      }
+    }
+    argb += pic->argb_stride;  // next row; stride counts uint32_t pixels
+  }
+}
+
 //------------------------------------------------------------------------------
 // Blend color and remove transparency info
 
diff --git a/src/3rdparty/libwebp/src/enc/quant.c b/src/3rdparty/libwebp/src/enc/quant.c
index 9130a41..dd6885a 100644
--- a/src/3rdparty/libwebp/src/enc/quant.c
+++ b/src/3rdparty/libwebp/src/enc/quant.c
@@ -30,7 +30,7 @@
 #define SNS_TO_DQ 0.9     // Scaling constant between the sns value and the QP
                           // power-law modulation. Must be strictly less than 1.
 
-#define I4_PENALTY 4000   // Rate-penalty for quick i4/i16 decision
+#define I4_PENALTY 14000  // Rate-penalty for quick i4/i16 decision
 
 // number of non-zero coeffs below which we consider the block very flat
 // (and apply a penalty to complex predictions)
@@ -41,6 +41,8 @@
 
 #define MULT_8B(a, b) (((a) * (b) + 128) >> 8)
 
+#define RD_DISTO_MULT      256  // distortion multiplier (equivalent of lambda)
+
 // #define DEBUG_BLOCK
 
 //------------------------------------------------------------------------------
@@ -54,15 +56,37 @@ static void PrintBlockInfo(const VP8EncIterator* const it,
                            const VP8ModeScore* const rd) {
   int i, j;
   const int is_i16 = (it->mb_->type_ == 1);
+  const uint8_t* const y_in = it->yuv_in_ + Y_OFF_ENC;
+  const uint8_t* const y_out = it->yuv_out_ + Y_OFF_ENC;
+  const uint8_t* const uv_in = it->yuv_in_ + U_OFF_ENC;
+  const uint8_t* const uv_out = it->yuv_out_ + U_OFF_ENC;
   printf("SOURCE / OUTPUT / ABS DELTA\n");
-  for (j = 0; j < 24; ++j) {
-    if (j == 16) printf("\n");   // newline before the U/V block
-    for (i = 0; i < 16; ++i) printf("%3d ", it->yuv_in_[i + j * BPS]);
+  for (j = 0; j < 16; ++j) {
+    for (i = 0; i < 16; ++i) printf("%3d ", y_in[i + j * BPS]);
     printf("     ");
-    for (i = 0; i < 16; ++i) printf("%3d ", it->yuv_out_[i + j * BPS]);
+    for (i = 0; i < 16; ++i) printf("%3d ", y_out[i + j * BPS]);
     printf("     ");
     for (i = 0; i < 16; ++i) {
-      printf("%1d ", abs(it->yuv_out_[i + j * BPS] - it->yuv_in_[i + j * BPS]));
+      printf("%1d ", abs(y_in[i + j * BPS] - y_out[i + j * BPS]));
+    }
+    printf("\n");
+  }
+  printf("\n");   // newline before the U/V block
+  for (j = 0; j < 8; ++j) {
+    for (i = 0; i < 8; ++i) printf("%3d ", uv_in[i + j * BPS]);
+    printf(" ");
+    for (i = 8; i < 16; ++i) printf("%3d ", uv_in[i + j * BPS]);
+    printf("    ");
+    for (i = 0; i < 8; ++i) printf("%3d ", uv_out[i + j * BPS]);
+    printf(" ");
+    for (i = 8; i < 16; ++i) printf("%3d ", uv_out[i + j * BPS]);
+    printf("   ");
+    for (i = 0; i < 8; ++i) {
+      printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
+    }
+    printf(" ");
+    for (i = 8; i < 16; ++i) {
+      printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
     }
     printf("\n");
   }
@@ -444,15 +468,12 @@ void VP8MakeIntra4Preds(const VP8EncIterator* const it) {
 // Quantize
 
 // Layout:
-// +----+
-// |YYYY| 0
-// |YYYY| 4
-// |YYYY| 8
-// |YYYY| 12
-// +----+
-// |UUVV| 16
-// |UUVV| 20
-// +----+
+// +----+----+
+// |YYYY|UUVV| 0
+// |YYYY|UUVV| 4
+// |YYYY|....| 8
+// |YYYY|....| 12
+// +----+----+
 
 const int VP8Scan[16] = {  // Luma
   0 +  0 * BPS,  4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS,
@@ -538,13 +559,12 @@ typedef struct {
 #define SCORE_STATE(n, l) (score_states[n][(l) + MIN_DELTA])
 
 static WEBP_INLINE void SetRDScore(int lambda, VP8ModeScore* const rd) {
-  // TODO: incorporate the "* 256" in the tables?
-  rd->score = (rd->R + rd->H) * lambda + 256 * (rd->D + rd->SD);
+  rd->score = (rd->R + rd->H) * lambda + RD_DISTO_MULT * (rd->D + rd->SD);
 }
 
 static WEBP_INLINE score_t RDScoreTrellis(int lambda, score_t rate,
                                           score_t distortion) {
-  return rate * lambda + 256 * distortion;
+  return rate * lambda + RD_DISTO_MULT * distortion;
 }
 
 static int TrellisQuantizeBlock(const VP8Encoder* const enc,
@@ -553,7 +573,8 @@ static int TrellisQuantizeBlock(const VP8Encoder* const enc,
                                 const VP8Matrix* const mtx,
                                 int lambda) {
   const ProbaArray* const probas = enc->proba_.coeffs_[coeff_type];
-  const CostArray* const costs = enc->proba_.level_cost_[coeff_type];
+  CostArrayPtr const costs =
+      (CostArrayPtr)enc->proba_.remapped_costs_[coeff_type];
   const int first = (coeff_type == 0) ? 1 : 0;
   Node nodes[16][NUM_NODES];
   ScoreState score_states[2][NUM_NODES];
@@ -590,7 +611,7 @@ static int TrellisQuantizeBlock(const VP8Encoder* const enc,
     for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
       const score_t rate = (ctx0 == 0) ? VP8BitCost(1, last_proba) : 0;
       ss_cur[m].score = RDScoreTrellis(lambda, rate, 0);
-      ss_cur[m].costs = costs[VP8EncBands[first]][ctx0];
+      ss_cur[m].costs = costs[first][ctx0];
     }
   }
 
@@ -624,7 +645,7 @@ static int TrellisQuantizeBlock(const VP8Encoder* const enc,
       int best_prev = 0;   // default, in case
 
       ss_cur[m].score = MAX_COST;
-      ss_cur[m].costs = costs[band][ctx];
+      ss_cur[m].costs = costs[n + 1][ctx];
       if (level > MAX_LEVEL || level < 0) {   // node is dead?
         continue;
       }
@@ -719,14 +740,14 @@ static int ReconstructIntra16(VP8EncIterator* const it,
                               int mode) {
   const VP8Encoder* const enc = it->enc_;
   const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
-  const uint8_t* const src = it->yuv_in_ + Y_OFF;
+  const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
   const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
   int nz = 0;
   int n;
   int16_t tmp[16][16], dc_tmp[16];
 
-  for (n = 0; n < 16; ++n) {
-    VP8FTransform(src + VP8Scan[n], ref + VP8Scan[n], tmp[n]);
+  for (n = 0; n < 16; n += 2) {
+    VP8FTransform2(src + VP8Scan[n], ref + VP8Scan[n], tmp[n]);
   }
   VP8FTransformWHT(tmp[0], dc_tmp);
   nz |= VP8EncQuantizeBlockWHT(dc_tmp, rd->y_dc_levels, &dqm->y2_) << 24;
@@ -746,12 +767,13 @@ static int ReconstructIntra16(VP8EncIterator* const it,
       }
     }
   } else {
-    for (n = 0; n < 16; ++n) {
+    for (n = 0; n < 16; n += 2) {
       // Zero-out the first coeff, so that: a) nz is correct below, and
       // b) finding 'last' non-zero coeffs in SetResidualCoeffs() is simplified.
-      tmp[n][0] = 0;
-      nz |= VP8EncQuantizeBlock(tmp[n], rd->y_ac_levels[n], &dqm->y1_) << n;
-      assert(rd->y_ac_levels[n][0] == 0);
+      tmp[n][0] = tmp[n + 1][0] = 0;
+      nz |= VP8EncQuantize2Blocks(tmp[n], rd->y_ac_levels[n], &dqm->y1_) << n;
+      assert(rd->y_ac_levels[n + 0][0] == 0);
+      assert(rd->y_ac_levels[n + 1][0] == 0);
     }
   }
 
@@ -792,14 +814,14 @@ static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
                          uint8_t* const yuv_out, int mode) {
   const VP8Encoder* const enc = it->enc_;
   const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
-  const uint8_t* const src = it->yuv_in_ + U_OFF;
+  const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
   const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
   int nz = 0;
   int n;
   int16_t tmp[8][16];
 
-  for (n = 0; n < 8; ++n) {
-    VP8FTransform(src + VP8ScanUV[n], ref + VP8ScanUV[n], tmp[n]);
+  for (n = 0; n < 8; n += 2) {
+    VP8FTransform2(src + VP8ScanUV[n], ref + VP8ScanUV[n], tmp[n]);
   }
   if (DO_TRELLIS_UV && it->do_trellis_) {
     int ch, x, y;
@@ -816,8 +838,8 @@ static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
       }
     }
   } else {
-    for (n = 0; n < 8; ++n) {
-      nz |= VP8EncQuantizeBlock(tmp[n], rd->uv_levels[n], &dqm->uv_) << n;
+    for (n = 0; n < 8; n += 2) {
+      nz |= VP8EncQuantize2Blocks(tmp[n], rd->uv_levels[n], &dqm->uv_) << n;
     }
   }
 
@@ -842,6 +864,12 @@ static void StoreMaxDelta(VP8SegmentInfo* const dqm, const int16_t DCs[16]) {
   if (max_v > dqm->max_edge_) dqm->max_edge_ = max_v;
 }
 
+static void SwapModeScore(VP8ModeScore** a, VP8ModeScore** b) {
+  VP8ModeScore* const tmp = *a;  // exchanges the two pointers, not the structs
+  *a = *b;
+  *b = tmp;
+}
+
 static void SwapPtr(uint8_t** a, uint8_t** b) {
   uint8_t* const tmp = *a;
   *a = *b;
@@ -865,46 +893,47 @@ static score_t IsFlat(const int16_t* levels, int num_blocks, score_t thresh) {
   return 1;
 }
 
-static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* const rd) {
+static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* rd) {
   const int kNumBlocks = 16;
   VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
   const int lambda = dqm->lambda_i16_;
   const int tlambda = dqm->tlambda_;
-  const uint8_t* const src = it->yuv_in_ + Y_OFF;
-  VP8ModeScore rd16;
+  const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
+  VP8ModeScore rd_tmp;
+  VP8ModeScore* rd_cur = &rd_tmp;
+  VP8ModeScore* rd_best = rd;
   int mode;
 
   rd->mode_i16 = -1;
   for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
-    uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF;  // scratch buffer
-    int nz;
+    uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC;  // scratch buffer
+    rd_cur->mode_i16 = mode;
 
     // Reconstruct
-    nz = ReconstructIntra16(it, &rd16, tmp_dst, mode);
+    rd_cur->nz = ReconstructIntra16(it, rd_cur, tmp_dst, mode);
 
     // Measure RD-score
-    rd16.D = VP8SSE16x16(src, tmp_dst);
-    rd16.SD = tlambda ? MULT_8B(tlambda, VP8TDisto16x16(src, tmp_dst, kWeightY))
-            : 0;
-    rd16.H = VP8FixedCostsI16[mode];
-    rd16.R = VP8GetCostLuma16(it, &rd16);
+    rd_cur->D = VP8SSE16x16(src, tmp_dst);
+    rd_cur->SD =
+        tlambda ? MULT_8B(tlambda, VP8TDisto16x16(src, tmp_dst, kWeightY)) : 0;
+    rd_cur->H = VP8FixedCostsI16[mode];
+    rd_cur->R = VP8GetCostLuma16(it, rd_cur);
     if (mode > 0 &&
-        IsFlat(rd16.y_ac_levels[0], kNumBlocks, FLATNESS_LIMIT_I16)) {
+        IsFlat(rd_cur->y_ac_levels[0], kNumBlocks, FLATNESS_LIMIT_I16)) {
       // penalty to avoid flat area to be mispredicted by complex mode
-      rd16.R += FLATNESS_PENALTY * kNumBlocks;
+      rd_cur->R += FLATNESS_PENALTY * kNumBlocks;
     }
 
     // Since we always examine Intra16 first, we can overwrite *rd directly.
-    SetRDScore(lambda, &rd16);
-    if (mode == 0 || rd16.score < rd->score) {
-      CopyScore(rd, &rd16);
-      rd->mode_i16 = mode;
-      rd->nz = nz;
-      memcpy(rd->y_ac_levels, rd16.y_ac_levels, sizeof(rd16.y_ac_levels));
-      memcpy(rd->y_dc_levels, rd16.y_dc_levels, sizeof(rd16.y_dc_levels));
+    SetRDScore(lambda, rd_cur);
+    if (mode == 0 || rd_cur->score < rd_best->score) {
+      SwapModeScore(&rd_cur, &rd_best);
       SwapOut(it);
     }
   }
+  if (rd_best != rd) {
+    memcpy(rd, rd_best, sizeof(*rd));
+  }
   SetRDScore(dqm->lambda_mode_, rd);   // finalize score for mode decision.
   VP8SetIntra16Mode(it, rd->mode_i16);
 
@@ -933,8 +962,8 @@ static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
   const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
   const int lambda = dqm->lambda_i4_;
   const int tlambda = dqm->tlambda_;
-  const uint8_t* const src0 = it->yuv_in_ + Y_OFF;
-  uint8_t* const best_blocks = it->yuv_out2_ + Y_OFF;
+  const uint8_t* const src0 = it->yuv_in_ + Y_OFF_ENC;
+  uint8_t* const best_blocks = it->yuv_out2_ + Y_OFF_ENC;
   int total_header_bits = 0;
   VP8ModeScore rd_best;
 
@@ -972,17 +1001,28 @@ static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
           tlambda ? MULT_8B(tlambda, VP8TDisto4x4(src, tmp_dst, kWeightY))
                   : 0;
       rd_tmp.H = mode_costs[mode];
-      rd_tmp.R = VP8GetCostLuma4(it, tmp_levels);
+
+      // Add flatness penalty
       if (mode > 0 && IsFlat(tmp_levels, kNumBlocks, FLATNESS_LIMIT_I4)) {
-        rd_tmp.R += FLATNESS_PENALTY * kNumBlocks;
+        rd_tmp.R = FLATNESS_PENALTY * kNumBlocks;
+      } else {
+        rd_tmp.R = 0;
       }
 
+      // early-out check
       SetRDScore(lambda, &rd_tmp);
+      if (best_mode >= 0 && rd_tmp.score >= rd_i4.score) continue;
+
+      // finish computing score
+      rd_tmp.R += VP8GetCostLuma4(it, tmp_levels);
+      SetRDScore(lambda, &rd_tmp);
+
       if (best_mode < 0 || rd_tmp.score < rd_i4.score) {
         CopyScore(&rd_i4, &rd_tmp);
         best_mode = mode;
         SwapPtr(&tmp_dst, &best_block);
-        memcpy(rd_best.y_ac_levels[it->i4_], tmp_levels, sizeof(tmp_levels));
+        memcpy(rd_best.y_ac_levels[it->i4_], tmp_levels,
+               sizeof(rd_best.y_ac_levels[it->i4_]));
       }
     }
     SetRDScore(dqm->lambda_mode_, &rd_i4);
@@ -1016,9 +1056,10 @@ static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
   const int kNumBlocks = 8;
   const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
   const int lambda = dqm->lambda_uv_;
-  const uint8_t* const src = it->yuv_in_ + U_OFF;
-  uint8_t* const tmp_dst = it->yuv_out2_ + U_OFF;  // scratch buffer
-  uint8_t* const dst0 = it->yuv_out_ + U_OFF;
+  const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
+  uint8_t* tmp_dst = it->yuv_out2_ + U_OFF_ENC;  // scratch buffer
+  uint8_t* dst0 = it->yuv_out_ + U_OFF_ENC;
+  uint8_t* dst = dst0;
   VP8ModeScore rd_best;
   int mode;
 
@@ -1032,7 +1073,7 @@ static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
 
     // Compute RD-score
     rd_uv.D  = VP8SSE16x8(src, tmp_dst);
-    rd_uv.SD = 0;    // TODO: should we call TDisto? it tends to flatten areas.
+    rd_uv.SD = 0;    // not calling TDisto here: it tends to flatten areas.
     rd_uv.H  = VP8FixedCostsUV[mode];
     rd_uv.R  = VP8GetCostUV(it, &rd_uv);
     if (mode > 0 && IsFlat(rd_uv.uv_levels[0], kNumBlocks, FLATNESS_LIMIT_UV)) {
@@ -1044,11 +1085,14 @@ static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
       CopyScore(&rd_best, &rd_uv);
       rd->mode_uv = mode;
       memcpy(rd->uv_levels, rd_uv.uv_levels, sizeof(rd->uv_levels));
-      memcpy(dst0, tmp_dst, UV_SIZE);   //  TODO: SwapUVOut() ?
+      SwapPtr(&dst, &tmp_dst);
     }
   }
   VP8SetIntraUVMode(it, rd->mode_uv);
   AddScore(rd, &rd_best);
+  if (dst != dst0) {   // copy 16x8 block if needed
+    VP8Copy16x8(dst, dst0);
+  }
 }
 
 //------------------------------------------------------------------------------
@@ -1060,35 +1104,41 @@ static void SimpleQuantize(VP8EncIterator* const it, VP8ModeScore* const rd) {
   int nz = 0;
 
   if (is_i16) {
-    nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF, it->preds_[0]);
+    nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF_ENC, it->preds_[0]);
   } else {
     VP8IteratorStartI4(it);
     do {
       const int mode =
           it->preds_[(it->i4_ & 3) + (it->i4_ >> 2) * enc->preds_w_];
-      const uint8_t* const src = it->yuv_in_ + Y_OFF + VP8Scan[it->i4_];
-      uint8_t* const dst = it->yuv_out_ + Y_OFF + VP8Scan[it->i4_];
+      const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
+      uint8_t* const dst = it->yuv_out_ + Y_OFF_ENC + VP8Scan[it->i4_];
       VP8MakeIntra4Preds(it);
       nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
                               src, dst, mode) << it->i4_;
-    } while (VP8IteratorRotateI4(it, it->yuv_out_ + Y_OFF));
+    } while (VP8IteratorRotateI4(it, it->yuv_out_ + Y_OFF_ENC));
   }
 
-  nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF, it->mb_->uv_mode_);
+  nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF_ENC, it->mb_->uv_mode_);
   rd->nz = nz;
 }
 
 // Refine intra16/intra4 sub-modes based on distortion only (not rate).
-static void DistoRefine(VP8EncIterator* const it, int try_both_i4_i16) {
-  const int is_i16 = (it->mb_->type_ == 1);
+static void RefineUsingDistortion(VP8EncIterator* const it,
+                                  int try_both_modes, int refine_uv_mode,
+                                  VP8ModeScore* const rd) {
   score_t best_score = MAX_COST;
+  score_t score_i4 = (score_t)I4_PENALTY;
+  int16_t tmp_levels[16][16];
+  uint8_t modes_i4[16];
+  int nz = 0;
+  int mode;
+  int is_i16 = try_both_modes || (it->mb_->type_ == 1);
 
-  if (try_both_i4_i16 || is_i16) {
-    int mode;
+  if (is_i16) {   // First, evaluate Intra16 distortion
     int best_mode = -1;
+    const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
     for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
       const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
-      const uint8_t* const src = it->yuv_in_ + Y_OFF;
       const score_t score = VP8SSE16x16(src, ref);
       if (score < best_score) {
         best_mode = mode;
@@ -1096,39 +1146,72 @@ static void DistoRefine(VP8EncIterator* const it, int try_both_i4_i16) {
       }
     }
     VP8SetIntra16Mode(it, best_mode);
+    // we'll reconstruct later, if i16 mode actually gets selected
   }
-  if (try_both_i4_i16 || !is_i16) {
-    uint8_t modes_i4[16];
+
+  // Next, evaluate Intra4
+  if (try_both_modes || !is_i16) {
     // We don't evaluate the rate here, but just account for it through a
     // constant penalty (i4 mode usually needs more bits compared to i16).
-    score_t score_i4 = (score_t)I4_PENALTY;
-
+    is_i16 = 0;
     VP8IteratorStartI4(it);
     do {
-      int mode;
-      int best_sub_mode = -1;
-      score_t best_sub_score = MAX_COST;
-      const uint8_t* const src = it->yuv_in_ + Y_OFF + VP8Scan[it->i4_];
+      int best_i4_mode = -1;
+      score_t best_i4_score = MAX_COST;
+      const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
 
-      // TODO(skal): we don't really need the prediction pixels here,
-      // but just the distortion against 'src'.
       VP8MakeIntra4Preds(it);
       for (mode = 0; mode < NUM_BMODES; ++mode) {
         const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
         const score_t score = VP8SSE4x4(src, ref);
-        if (score < best_sub_score) {
-          best_sub_mode = mode;
-          best_sub_score = score;
+        if (score < best_i4_score) {
+          best_i4_mode = mode;
+          best_i4_score = score;
         }
       }
-      modes_i4[it->i4_] = best_sub_mode;
-      score_i4 += best_sub_score;
-      if (score_i4 >= best_score) break;
-    } while (VP8IteratorRotateI4(it, it->yuv_in_ + Y_OFF));
-    if (score_i4 < best_score) {
-      VP8SetIntra4Mode(it, modes_i4);
+      modes_i4[it->i4_] = best_i4_mode;
+      score_i4 += best_i4_score;
+      if (score_i4 >= best_score) {
+        // Intra4 won't be better than Intra16. Bail out and pick Intra16.
+        is_i16 = 1;
+        break;
+      } else {  // reconstruct partial block inside yuv_out2_ buffer
+        uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC + VP8Scan[it->i4_];
+        nz |= ReconstructIntra4(it, tmp_levels[it->i4_],
+                                src, tmp_dst, best_i4_mode) << it->i4_;
+      }
+    } while (VP8IteratorRotateI4(it, it->yuv_out2_ + Y_OFF_ENC));
+  }
+
+  // Final reconstruction, depending on which mode is selected.
+  if (!is_i16) {
+    VP8SetIntra4Mode(it, modes_i4);
+    memcpy(rd->y_ac_levels, tmp_levels, sizeof(tmp_levels));
+    SwapOut(it);
+    best_score = score_i4;
+  } else {
+    nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF_ENC, it->preds_[0]);
+  }
+
+  // ... and UV!
+  if (refine_uv_mode) {
+    int best_mode = -1;
+    score_t best_uv_score = MAX_COST;
+    const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
+    for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
+      const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
+      const score_t score = VP8SSE16x8(src, ref);
+      if (score < best_uv_score) {
+        best_mode = mode;
+        best_uv_score = score;
+      }
     }
+    VP8SetIntraUVMode(it, best_mode);
   }
+  nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF_ENC, it->mb_->uv_mode_);
+
+  rd->nz = nz;
+  rd->score = best_score;
 }
 
 //------------------------------------------------------------------------------
@@ -1158,13 +1241,13 @@ int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd,
       SimpleQuantize(it, rd);
     }
   } else {
-    // For method == 2, pick the best intra4/intra16 based on SSE (~tad slower).
-    // For method <= 1, we refine intra4 or intra16 (but don't re-examine mode).
-    DistoRefine(it, (method >= 2));
-    SimpleQuantize(it, rd);
+    // At this point we have heuristically decided intra16 / intra4.
+    // For method >= 2, pick the best intra4/intra16 based on SSE (~tad slower).
+    // For method <= 1, we don't re-examine the decision but just go ahead with
+    // quantization/reconstruction.
+    RefineUsingDistortion(it, (method >= 2), (method >= 1), rd);
   }
   is_skipped = (rd->nz == 0);
   VP8SetSkip(it, is_skipped);
   return is_skipped;
 }
-
diff --git a/src/3rdparty/libwebp/src/enc/syntax.c b/src/3rdparty/libwebp/src/enc/syntax.c
index d1ff0a5..a0e79ef 100644
--- a/src/3rdparty/libwebp/src/enc/syntax.c
+++ b/src/3rdparty/libwebp/src/enc/syntax.c
@@ -186,8 +186,8 @@ static int PutWebPHeaders(const VP8Encoder* const enc, size_t size0,
 // Segmentation header
 static void PutSegmentHeader(VP8BitWriter* const bw,
                              const VP8Encoder* const enc) {
-  const VP8SegmentHeader* const hdr = &enc->segment_hdr_;
-  const VP8Proba* const proba = &enc->proba_;
+  const VP8EncSegmentHeader* const hdr = &enc->segment_hdr_;
+  const VP8EncProba* const proba = &enc->proba_;
   if (VP8PutBitUniform(bw, (hdr->num_segments_ > 1))) {
     // We always 'update' the quant and filter strength values
     const int update_data = 1;
@@ -197,16 +197,16 @@ static void PutSegmentHeader(VP8BitWriter* const bw,
       // we always use absolute values, not relative ones
       VP8PutBitUniform(bw, 1);   // (segment_feature_mode = 1. Paragraph 9.3.)
       for (s = 0; s < NUM_MB_SEGMENTS; ++s) {
-        VP8PutSignedValue(bw, enc->dqm_[s].quant_, 7);
+        VP8PutSignedBits(bw, enc->dqm_[s].quant_, 7);
       }
       for (s = 0; s < NUM_MB_SEGMENTS; ++s) {
-        VP8PutSignedValue(bw, enc->dqm_[s].fstrength_, 6);
+        VP8PutSignedBits(bw, enc->dqm_[s].fstrength_, 6);
       }
     }
     if (hdr->update_map_) {
       for (s = 0; s < 3; ++s) {
         if (VP8PutBitUniform(bw, (proba->segments_[s] != 255u))) {
-          VP8PutValue(bw, proba->segments_[s], 8);
+          VP8PutBits(bw, proba->segments_[s], 8);
         }
       }
     }
@@ -215,20 +215,20 @@ static void PutSegmentHeader(VP8BitWriter* const bw,
 
 // Filtering parameters header
 static void PutFilterHeader(VP8BitWriter* const bw,
-                            const VP8FilterHeader* const hdr) {
+                            const VP8EncFilterHeader* const hdr) {
   const int use_lf_delta = (hdr->i4x4_lf_delta_ != 0);
   VP8PutBitUniform(bw, hdr->simple_);
-  VP8PutValue(bw, hdr->level_, 6);
-  VP8PutValue(bw, hdr->sharpness_, 3);
+  VP8PutBits(bw, hdr->level_, 6);  // filter level, size argument 6
+  VP8PutBits(bw, hdr->sharpness_, 3);  // sharpness, size argument 3
   if (VP8PutBitUniform(bw, use_lf_delta)) {
     // '0' is the default value for i4x4_lf_delta_ at frame #0.
     const int need_update = (hdr->i4x4_lf_delta_ != 0);
     if (VP8PutBitUniform(bw, need_update)) {
       // we don't use ref_lf_delta => emit four 0 bits
-      VP8PutValue(bw, 0, 4);
+      VP8PutBits(bw, 0, 4);
       // we use mode_lf_delta for i4x4
-      VP8PutSignedValue(bw, hdr->i4x4_lf_delta_, 6);
-      VP8PutValue(bw, 0, 3);    // all others unused
+      VP8PutSignedBits(bw, hdr->i4x4_lf_delta_, 6);
+      VP8PutBits(bw, 0, 3);    // all others unused
     }
   }
 }
@@ -236,12 +236,12 @@ static void PutFilterHeader(VP8BitWriter* const bw,
 // Nominal quantization parameters
 static void PutQuant(VP8BitWriter* const bw,
                      const VP8Encoder* const enc) {
-  VP8PutValue(bw, enc->base_quant_, 7);
-  VP8PutSignedValue(bw, enc->dq_y1_dc_, 4);
-  VP8PutSignedValue(bw, enc->dq_y2_dc_, 4);
-  VP8PutSignedValue(bw, enc->dq_y2_ac_, 4);
-  VP8PutSignedValue(bw, enc->dq_uv_dc_, 4);
-  VP8PutSignedValue(bw, enc->dq_uv_ac_, 4);
+  VP8PutBits(bw, enc->base_quant_, 7);  // base quantizer, size argument 7
+  VP8PutSignedBits(bw, enc->dq_y1_dc_, 4);  // dq_* use the signed writer
+  VP8PutSignedBits(bw, enc->dq_y2_dc_, 4);
+  VP8PutSignedBits(bw, enc->dq_y2_ac_, 4);
+  VP8PutSignedBits(bw, enc->dq_uv_dc_, 4);
+  VP8PutSignedBits(bw, enc->dq_uv_ac_, 4);
 }
 
 // Partition sizes
@@ -277,9 +277,9 @@ static int GeneratePartition0(VP8Encoder* const enc) {
 
   PutSegmentHeader(bw, enc);
   PutFilterHeader(bw, &enc->filter_hdr_);
-  VP8PutValue(bw, enc->num_parts_ == 8 ? 3 :
-                  enc->num_parts_ == 4 ? 2 :
-                  enc->num_parts_ == 2 ? 1 : 0, 2);
+  VP8PutBits(bw, enc->num_parts_ == 8 ? 3 :
+                 enc->num_parts_ == 4 ? 2 :
+                 enc->num_parts_ == 2 ? 1 : 0, 2);
   PutQuant(bw, enc);
   VP8PutBitUniform(bw, 0);   // no proba update
   VP8WriteProbas(bw, &enc->proba_);
diff --git a/src/3rdparty/libwebp/src/enc/token.c b/src/3rdparty/libwebp/src/enc/token.c
index 8af13a0..e73256b 100644
--- a/src/3rdparty/libwebp/src/enc/token.c
+++ b/src/3rdparty/libwebp/src/enc/token.c
@@ -30,15 +30,15 @@
 #define MIN_PAGE_SIZE 8192          // minimum number of token per page
 #define FIXED_PROBA_BIT (1u << 14)
 
-typedef uint16_t token_t;  // bit#15: bit
-                           // bit #14: constant proba or idx
-                           // bits 0..13: slot or constant proba
+typedef uint16_t token_t;  // bit #15: bit value
+                           // bit #14: flags for constant proba or idx
+                           // bits #0..13: slot or constant proba
 struct VP8Tokens {
   VP8Tokens* next_;        // pointer to next page
 };
 // Token data is located in memory just after the next_ field.
 // This macro is used to return their address and hide the trick.
-#define TOKEN_DATA(p) ((token_t*)&(p)[1])
+#define TOKEN_DATA(p) ((const token_t*)&(p)[1])
 
 //------------------------------------------------------------------------------
 
@@ -53,10 +53,10 @@ void VP8TBufferInit(VP8TBuffer* const b, int page_size) {
 
 void VP8TBufferClear(VP8TBuffer* const b) {
   if (b != NULL) {
-    const VP8Tokens* p = b->pages_;
+    VP8Tokens* p = b->pages_;
     while (p != NULL) {
-      const VP8Tokens* const next = p->next_;
-      WebPSafeFree((void*)p);
+      VP8Tokens* const next = p->next_;
+      WebPSafeFree(p);
       p = next;
     }
     VP8TBufferInit(b, b->page_size_);
@@ -65,8 +65,8 @@ void VP8TBufferClear(VP8TBuffer* const b) {
 
 static int TBufferNewPage(VP8TBuffer* const b) {
   VP8Tokens* page = NULL;
-  const size_t size = sizeof(*page) + b->page_size_ * sizeof(token_t);
   if (!b->error_) {
+    const size_t size = sizeof(*page) + b->page_size_ * sizeof(token_t);
     page = (VP8Tokens*)WebPSafeMalloc(1ULL, size);
   }
   if (page == NULL) {
@@ -78,19 +78,19 @@ static int TBufferNewPage(VP8TBuffer* const b) {
   *b->last_page_ = page;
   b->last_page_ = &page->next_;
   b->left_ = b->page_size_;
-  b->tokens_ = TOKEN_DATA(page);
+  b->tokens_ = (token_t*)TOKEN_DATA(page);
   return 1;
 }
 
 //------------------------------------------------------------------------------
 
-#define TOKEN_ID(t, b, ctx, p) \
-    ((p) + NUM_PROBAS * ((ctx) + NUM_CTX * ((b) + NUM_BANDS * (t))))
+#define TOKEN_ID(t, b, ctx) \
+    (NUM_PROBAS * ((ctx) + NUM_CTX * ((b) + NUM_BANDS * (t))))
 
-static WEBP_INLINE int AddToken(VP8TBuffer* const b,
-                                int bit, uint32_t proba_idx) {
+static WEBP_INLINE uint32_t AddToken(VP8TBuffer* const b,
+                                     uint32_t bit, uint32_t proba_idx) {
   assert(proba_idx < FIXED_PROBA_BIT);
-  assert(bit == 0 || bit == 1);
+  assert(bit <= 1);
   if (b->left_ > 0 || TBufferNewPage(b)) {
     const int slot = --b->left_;
     b->tokens_[slot] = (bit << 15) | proba_idx;
@@ -99,20 +99,21 @@ static WEBP_INLINE int AddToken(VP8TBuffer* const b,
 }
 
 static WEBP_INLINE void AddConstantToken(VP8TBuffer* const b,
-                                         int bit, int proba) {
+                                         uint32_t bit, uint32_t proba) {
   assert(proba < 256);
-  assert(bit == 0 || bit == 1);
+  assert(bit <= 1);  // 'bit' is unsigned here, so this means 0 or 1
   if (b->left_ > 0 || TBufferNewPage(b)) {
     const int slot = --b->left_;
     b->tokens_[slot] = (bit << 15) | FIXED_PROBA_BIT | proba;
   }
 }
 
-int VP8RecordCoeffTokens(int ctx, int coeff_type, int first, int last,
+int VP8RecordCoeffTokens(const int ctx, const int coeff_type,
+                         int first, int last,
                          const int16_t* const coeffs,
                          VP8TBuffer* const tokens) {
   int n = first;
-  uint32_t base_id = TOKEN_ID(coeff_type, n, ctx, 0);
+  uint32_t base_id = TOKEN_ID(coeff_type, n, ctx);
   if (!AddToken(tokens, last >= 0, base_id + 0)) {
     return 0;
   }
@@ -120,14 +121,13 @@ int VP8RecordCoeffTokens(int ctx, int coeff_type, int first, int last,
   while (n < 16) {
     const int c = coeffs[n++];
     const int sign = c < 0;
-    int v = sign ? -c : c;
+    const uint32_t v = sign ? -c : c;
     if (!AddToken(tokens, v != 0, base_id + 1)) {
-      ctx = 0;
-      base_id = TOKEN_ID(coeff_type, VP8EncBands[n], ctx, 0);
+      base_id = TOKEN_ID(coeff_type, VP8EncBands[n], 0);  // ctx=0
       continue;
     }
     if (!AddToken(tokens, v > 1, base_id + 2)) {
-      ctx = 1;
+      base_id = TOKEN_ID(coeff_type, VP8EncBands[n], 1);  // ctx=1
     } else {
       if (!AddToken(tokens, v > 4, base_id + 3)) {
         if (AddToken(tokens, v != 2, base_id + 4))
@@ -142,40 +142,40 @@ int VP8RecordCoeffTokens(int ctx, int coeff_type, int first, int last,
       } else {
         int mask;
         const uint8_t* tab;
-        if (v < 3 + (8 << 1)) {          // VP8Cat3  (3b)
+        uint32_t residue = v - 3;
+        if (residue < (8 << 1)) {          // VP8Cat3  (3b)
           AddToken(tokens, 0, base_id + 8);
           AddToken(tokens, 0, base_id + 9);
-          v -= 3 + (8 << 0);
+          residue -= (8 << 0);
           mask = 1 << 2;
           tab = VP8Cat3;
-        } else if (v < 3 + (8 << 2)) {   // VP8Cat4  (4b)
+        } else if (residue < (8 << 2)) {   // VP8Cat4  (4b)
           AddToken(tokens, 0, base_id + 8);
           AddToken(tokens, 1, base_id + 9);
-          v -= 3 + (8 << 1);
+          residue -= (8 << 1);
           mask = 1 << 3;
           tab = VP8Cat4;
-        } else if (v < 3 + (8 << 3)) {   // VP8Cat5  (5b)
+        } else if (residue < (8 << 3)) {   // VP8Cat5  (5b)
           AddToken(tokens, 1, base_id + 8);
           AddToken(tokens, 0, base_id + 10);
-          v -= 3 + (8 << 2);
+          residue -= (8 << 2);
           mask = 1 << 4;
           tab = VP8Cat5;
         } else {                         // VP8Cat6 (11b)
           AddToken(tokens, 1, base_id + 8);
           AddToken(tokens, 1, base_id + 10);
-          v -= 3 + (8 << 3);
+          residue -= (8 << 3);
           mask = 1 << 10;
           tab = VP8Cat6;
         }
         while (mask) {
-          AddConstantToken(tokens, !!(v & mask), *tab++);
+          AddConstantToken(tokens, !!(residue & mask), *tab++);
           mask >>= 1;
         }
       }
-      ctx = 2;
+      base_id = TOKEN_ID(coeff_type, VP8EncBands[n], 2);  // ctx=2
     }
     AddConstantToken(tokens, sign, 128);
-    base_id = TOKEN_ID(coeff_type, VP8EncBands[n], ctx, 0);
     if (n == 16 || !AddToken(tokens, n <= last, base_id + 0)) {
       return 1;   // EOB
     }
@@ -224,7 +224,6 @@ void VP8TokenToStats(const VP8TBuffer* const b, proba_t* const stats) {
 int VP8EmitTokens(VP8TBuffer* const b, VP8BitWriter* const bw,
                   const uint8_t* const probas, int final_pass) {
   const VP8Tokens* p = b->pages_;
-  (void)final_pass;
   assert(!b->error_);
   while (p != NULL) {
     const VP8Tokens* const next = p->next_;
diff --git a/src/3rdparty/libwebp/src/enc/tree.c b/src/3rdparty/libwebp/src/enc/tree.c
index e5d05e5..f141006 100644
--- a/src/3rdparty/libwebp/src/enc/tree.c
+++ b/src/3rdparty/libwebp/src/enc/tree.c
@@ -154,7 +154,7 @@ const uint8_t
 };
 
 void VP8DefaultProbas(VP8Encoder* const enc) {
-  VP8Proba* const probas = &enc->proba_;
+  VP8EncProba* const probas = &enc->proba_;
   probas->use_skip_proba_ = 0;
   memset(probas->segments_, 255u, sizeof(probas->segments_));
   memcpy(probas->coeffs_, VP8CoeffsProba0, sizeof(VP8CoeffsProba0));
@@ -482,7 +482,7 @@ const uint8_t
   }
 };
 
-void VP8WriteProbas(VP8BitWriter* const bw, const VP8Proba* const probas) {
+void VP8WriteProbas(VP8BitWriter* const bw, const VP8EncProba* const probas) {
   int t, b, c, p;
   for (t = 0; t < NUM_TYPES; ++t) {
     for (b = 0; b < NUM_BANDS; ++b) {
@@ -491,14 +491,14 @@ void VP8WriteProbas(VP8BitWriter* const bw, const VP8Proba* const probas) {
           const uint8_t p0 = probas->coeffs_[t][b][c][p];
           const int update = (p0 != VP8CoeffsProba0[t][b][c][p]);
           if (VP8PutBit(bw, update, VP8CoeffsUpdateProba[t][b][c][p])) {
-            VP8PutValue(bw, p0, 8);
+            VP8PutBits(bw, p0, 8);
           }
         }
       }
     }
   }
   if (VP8PutBitUniform(bw, probas->use_skip_proba_)) {
-    VP8PutValue(bw, probas->skip_proba_, 8);
+    VP8PutBits(bw, probas->skip_proba_, 8);
   }
 }
 
diff --git a/src/3rdparty/libwebp/src/enc/vp8enci.h b/src/3rdparty/libwebp/src/enc/vp8enci.h
index 20f58c6..b2cc8d1 100644
--- a/src/3rdparty/libwebp/src/enc/vp8enci.h
+++ b/src/3rdparty/libwebp/src/enc/vp8enci.h
@@ -15,10 +15,16 @@
 #define WEBP_ENC_VP8ENCI_H_
 
 #include <string.h>     // for memcpy()
-#include "../webp/encode.h"
+#include "../dec/common.h"
 #include "../dsp/dsp.h"
 #include "../utils/bit_writer.h"
 #include "../utils/thread.h"
+#include "../utils/utils.h"
+#include "../webp/encode.h"
+
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+#include "./vp8li.h"
+#endif  // WEBP_EXPERIMENTAL_FEATURES
 
 #ifdef __cplusplus
 extern "C" {
@@ -29,35 +35,10 @@ extern "C" {
 
 // version numbers
 #define ENC_MAJ_VERSION 0
-#define ENC_MIN_VERSION 4
-#define ENC_REV_VERSION 4
-
-// intra prediction modes
-enum { B_DC_PRED = 0,   // 4x4 modes
-       B_TM_PRED = 1,
-       B_VE_PRED = 2,
-       B_HE_PRED = 3,
-       B_RD_PRED = 4,
-       B_VR_PRED = 5,
-       B_LD_PRED = 6,
-       B_VL_PRED = 7,
-       B_HD_PRED = 8,
-       B_HU_PRED = 9,
-       NUM_BMODES = B_HU_PRED + 1 - B_DC_PRED,  // = 10
-
-       // Luma16 or UV modes
-       DC_PRED = B_DC_PRED, V_PRED = B_VE_PRED,
-       H_PRED = B_HE_PRED, TM_PRED = B_TM_PRED,
-       NUM_PRED_MODES = 4
-     };
+#define ENC_MIN_VERSION 5
+#define ENC_REV_VERSION 0
 
-enum { NUM_MB_SEGMENTS = 4,
-       MAX_NUM_PARTITIONS = 8,
-       NUM_TYPES = 4,   // 0: i16-AC,  1: i16-DC,  2:chroma-AC,  3:i4-AC
-       NUM_BANDS = 8,
-       NUM_CTX = 3,
-       NUM_PROBAS = 11,
-       MAX_LF_LEVELS = 64,       // Maximum loop filter level
+enum { MAX_LF_LEVELS = 64,       // Maximum loop filter level
        MAX_VARIABLE_LEVEL = 67,  // last (inclusive) level with variable cost
        MAX_LEVEL = 2047          // max level (note: max codable is 2047 + 67)
      };
@@ -69,66 +50,34 @@ typedef enum {   // Rate-distortion optimization levels
   RD_OPT_TRELLIS_ALL = 3   // trellis-quant for every scoring (much slower)
 } VP8RDLevel;
 
-// YUV-cache parameters. Cache is 16-pixels wide.
-// The original or reconstructed samples can be accessed using VP8Scan[]
+// YUV-cache parameters. Cache is 32-bytes wide (= one cacheline).
+// The original or reconstructed samples can be accessed using VP8Scan[].
 // The predicted blocks can be accessed using offsets to yuv_p_ and
-// the arrays VP8*ModeOffsets[];
-//         +----+      YUV Samples area. See VP8Scan[] for accessing the blocks.
-//  Y_OFF  |YYYY| <- original samples  ('yuv_in_')
-//         |YYYY|
-//         |YYYY|
-//         |YYYY|
-//  U_OFF  |UUVV| V_OFF  (=U_OFF + 8)
-//         |UUVV|
-//         +----+
-//  Y_OFF  |YYYY| <- compressed/decoded samples  ('yuv_out_')
-//         |YYYY|    There are two buffers like this ('yuv_out_'/'yuv_out2_')
-//         |YYYY|
-//         |YYYY|
-//  U_OFF  |UUVV| V_OFF
-//         |UUVV|
-//          x2 (for yuv_out2_)
-//         +----+     Prediction area ('yuv_p_', size = PRED_SIZE)
-// I16DC16 |YYYY|  Intra16 predictions (16x16 block each)
-//         |YYYY|
-//         |YYYY|
-//         |YYYY|
-// I16TM16 |YYYY|
-//         |YYYY|
-//         |YYYY|
-//         |YYYY|
-// I16VE16 |YYYY|
-//         |YYYY|
-//         |YYYY|
-//         |YYYY|
-// I16HE16 |YYYY|
-//         |YYYY|
-//         |YYYY|
-//         |YYYY|
-//         +----+  Chroma U/V predictions (16x8 block each)
-// C8DC8   |UUVV|
-//         |UUVV|
-// C8TM8   |UUVV|
-//         |UUVV|
-// C8VE8   |UUVV|
-//         |UUVV|
-// C8HE8   |UUVV|
-//         |UUVV|
-//         +----+  Intra 4x4 predictions (4x4 block each)
-//         |YYYY| I4DC4 I4TM4 I4VE4 I4HE4
-//         |YYYY| I4RD4 I4VR4 I4LD4 I4VL4
-//         |YY..| I4HD4 I4HU4 I4TMP
-//         +----+
-#define BPS       16   // this is the common stride
-#define Y_SIZE   (BPS * 16)
-#define UV_SIZE  (BPS * 8)
-#define YUV_SIZE (Y_SIZE + UV_SIZE)
-#define PRED_SIZE (6 * 16 * BPS + 12 * BPS)
-#define Y_OFF    (0)
-#define U_OFF    (Y_SIZE)
-#define V_OFF    (U_OFF + 8)
-#define ALIGN_CST 15
-#define DO_ALIGN(PTR) ((uintptr_t)((PTR) + ALIGN_CST) & ~ALIGN_CST)
+// the arrays VP8*ModeOffsets[].
+// * YUV Samples area (yuv_in_/yuv_out_/yuv_out2_)
+//   (see VP8Scan[] for accessing the blocks, along with
+//   Y_OFF_ENC/U_OFF_ENC/V_OFF_ENC):
+//             +----+----+
+//  Y_OFF_ENC  |YYYY|UUVV|
+//  U_OFF_ENC  |YYYY|UUVV|
+//  V_OFF_ENC  |YYYY|....| <- 25% wasted U/V area
+//             |YYYY|....|
+//             +----+----+
+// * Prediction area ('yuv_p_', size = PRED_SIZE_ENC)
+//   Intra16 predictions (16x16 block each, two per row):
+//         |I16DC16|I16TM16|
+//         |I16VE16|I16HE16|
+//   Chroma U/V predictions (16x8 block each, two per row):
+//         |C8DC8|C8TM8|
+//         |C8VE8|C8HE8|
+//   Intra 4x4 predictions (4x4 block each)
+//         |I4DC4 I4TM4 I4VE4 I4HE4|I4RD4 I4VR4 I4LD4 I4VL4|
+//         |I4HD4 I4HU4 I4TMP .....|.......................| <- ~31% wasted
+#define YUV_SIZE_ENC (BPS * 16)
+#define PRED_SIZE_ENC (32 * BPS + 16 * BPS + 8 * BPS)   // I16+Chroma+I4 preds
+#define Y_OFF_ENC    (0)
+#define U_OFF_ENC    (16)
+#define V_OFF_ENC    (16 + 8)
 
 extern const int VP8Scan[16];           // in quant.c
 extern const int VP8UVModeOffsets[4];   // in analyze.c
@@ -138,26 +87,26 @@ extern const int VP8I4ModeOffsets[NUM_BMODES];
 // Layout of prediction blocks
 // intra 16x16
 #define I16DC16 (0 * 16 * BPS)
-#define I16TM16 (1 * 16 * BPS)
-#define I16VE16 (2 * 16 * BPS)
-#define I16HE16 (3 * 16 * BPS)
+#define I16TM16 (I16DC16 + 16)
+#define I16VE16 (1 * 16 * BPS)
+#define I16HE16 (I16VE16 + 16)
 // chroma 8x8, two U/V blocks side by side (hence: 16x8 each)
-#define C8DC8 (4 * 16 * BPS)
-#define C8TM8 (4 * 16 * BPS + 8 * BPS)
-#define C8VE8 (5 * 16 * BPS)
-#define C8HE8 (5 * 16 * BPS + 8 * BPS)
+#define C8DC8 (2 * 16 * BPS)
+#define C8TM8 (C8DC8 + 1 * 16)
+#define C8VE8 (2 * 16 * BPS + 8 * BPS)
+#define C8HE8 (C8VE8 + 1 * 16)
 // intra 4x4
-#define I4DC4 (6 * 16 * BPS +  0)
-#define I4TM4 (6 * 16 * BPS +  4)
-#define I4VE4 (6 * 16 * BPS +  8)
-#define I4HE4 (6 * 16 * BPS + 12)
-#define I4RD4 (6 * 16 * BPS + 4 * BPS +  0)
-#define I4VR4 (6 * 16 * BPS + 4 * BPS +  4)
-#define I4LD4 (6 * 16 * BPS + 4 * BPS +  8)
-#define I4VL4 (6 * 16 * BPS + 4 * BPS + 12)
-#define I4HD4 (6 * 16 * BPS + 8 * BPS +  0)
-#define I4HU4 (6 * 16 * BPS + 8 * BPS +  4)
-#define I4TMP (6 * 16 * BPS + 8 * BPS +  8)
+#define I4DC4 (3 * 16 * BPS +  0)
+#define I4TM4 (I4DC4 +  4)
+#define I4VE4 (I4DC4 +  8)
+#define I4HE4 (I4DC4 + 12)
+#define I4RD4 (I4DC4 + 16)
+#define I4VR4 (I4DC4 + 20)
+#define I4LD4 (I4DC4 + 24)
+#define I4VL4 (I4DC4 + 28)
+#define I4HD4 (3 * 16 * BPS + 4 * BPS)
+#define I4HU4 (I4HD4 + 4)
+#define I4TMP (I4HD4 + 8)
 
 typedef int64_t score_t;     // type used for scores, rate, distortion
 // Note that MAX_COST is not the maximum allowed by sizeof(score_t),
@@ -172,14 +121,6 @@ static WEBP_INLINE int QUANTDIV(uint32_t n, uint32_t iQ, uint32_t B) {
   return (int)((n * iQ + B) >> QFIX);
 }
 
-// size of histogram used by CollectHistogram.
-#define MAX_COEFF_THRESH   31
-typedef struct VP8Histogram VP8Histogram;
-struct VP8Histogram {
-  // TODO(skal): we only need to store the max_value and last_non_zero actually.
-  int distribution[MAX_COEFF_THRESH + 1];
-};
-
 // Uncomment the following to remove token-buffer code:
 // #define DISABLE_TOKEN_BUFFER
 
@@ -190,6 +131,8 @@ typedef uint32_t proba_t;   // 16b + 16b
 typedef uint8_t ProbaArray[NUM_CTX][NUM_PROBAS];
 typedef proba_t StatsArray[NUM_CTX][NUM_PROBAS];
 typedef uint16_t CostArray[NUM_CTX][MAX_VARIABLE_LEVEL + 1];
+typedef const uint16_t* (*CostArrayPtr)[NUM_CTX];   // for easy casting
+typedef const uint16_t* CostArrayMap[16][NUM_CTX];
 typedef double LFStats[NUM_MB_SEGMENTS][MAX_LF_LEVELS];  // filter stats
 
 typedef struct VP8Encoder VP8Encoder;
@@ -200,7 +143,7 @@ typedef struct {
   int update_map_;        // whether to update the segment map or not.
                           // must be 0 if there's only 1 segment.
   int size_;              // bit-cost for transmitting the segment map
-} VP8SegmentHeader;
+} VP8EncSegmentHeader;
 
 // Struct collecting all frame-persistent probabilities.
 typedef struct {
@@ -209,10 +152,11 @@ typedef struct {
   ProbaArray coeffs_[NUM_TYPES][NUM_BANDS];      // 1056 bytes
   StatsArray stats_[NUM_TYPES][NUM_BANDS];       // 4224 bytes
   CostArray level_cost_[NUM_TYPES][NUM_BANDS];   // 13056 bytes
+  CostArrayMap remapped_costs_[NUM_TYPES];       // 1536 bytes
   int dirty_;               // if true, need to call VP8CalculateLevelCosts()
   int use_skip_proba_;      // Note: we always use skip_proba for now.
   int nb_skip_;             // number of skipped blocks
-} VP8Proba;
+} VP8EncProba;
 
 // Filter parameters. Not actually used in the code (we don't perform
 // the in-loop filtering), but filled from user's config
@@ -221,7 +165,7 @@ typedef struct {
   int level_;              // base filter level [0..63]
   int sharpness_;          // [0..7]
   int i4x4_lf_delta_;      // delta filter level for i4x4 relative to i16x16
-} VP8FilterHeader;
+} VP8EncFilterHeader;
 
 //------------------------------------------------------------------------------
 // Informations about the macroblocks.
@@ -307,9 +251,10 @@ typedef struct {
   uint8_t* y_top_;     // top luma samples at position 'x_'
   uint8_t* uv_top_;    // top u/v samples at position 'x_', packed as 16 bytes
 
-  // memory for storing y/u/v_left_ and yuv_in_/out_*
-  uint8_t yuv_left_mem_[17 + 16 + 16 + 8 + ALIGN_CST];     // memory for *_left_
-  uint8_t yuv_mem_[3 * YUV_SIZE + PRED_SIZE + ALIGN_CST];  // memory for yuv_*
+  // memory for storing y/u/v_left_
+  uint8_t yuv_left_mem_[17 + 16 + 16 + 8 + WEBP_ALIGN_CST];
+  // memory for yuv_*
+  uint8_t yuv_mem_[3 * YUV_SIZE_ENC + PRED_SIZE_ENC + WEBP_ALIGN_CST];
 } VP8EncIterator;
 
   // in iterator.c
@@ -381,7 +326,8 @@ int VP8EmitTokens(VP8TBuffer* const b, VP8BitWriter* const bw,
                   const uint8_t* const probas, int final_pass);
 
 // record the coding of coefficients without knowing the probabilities yet
-int VP8RecordCoeffTokens(int ctx, int coeff_type, int first, int last,
+int VP8RecordCoeffTokens(const int ctx, const int coeff_type,
+                         int first, int last,
                          const int16_t* const coeffs,
                          VP8TBuffer* const tokens);
 
@@ -401,8 +347,8 @@ struct VP8Encoder {
   WebPPicture* pic_;            // input / output picture
 
   // headers
-  VP8FilterHeader   filter_hdr_;     // filtering information
-  VP8SegmentHeader  segment_hdr_;    // segment information
+  VP8EncFilterHeader   filter_hdr_;     // filtering information
+  VP8EncSegmentHeader  segment_hdr_;    // segment information
 
   int profile_;                      // VP8's profile, deduced from Config.
 
@@ -438,12 +384,12 @@ struct VP8Encoder {
   int dq_uv_dc_, dq_uv_ac_;
 
   // probabilities and statistics
-  VP8Proba proba_;
-  uint64_t sse_[4];        // sum of Y/U/V/A squared errors for all macroblocks
-  uint64_t sse_count_;     // pixel count for the sse_[] stats
-  int      coded_size_;
-  int      residual_bytes_[3][4];
-  int      block_count_[3];
+  VP8EncProba proba_;
+  uint64_t    sse_[4];      // sum of Y/U/V/A squared errors for all macroblocks
+  uint64_t    sse_count_;   // pixel count for the sse_[] stats
+  int         coded_size_;
+  int         residual_bytes_[3][4];
+  int         block_count_[3];
 
   // quality/speed settings
   int method_;               // 0=fastest, 6=best/slowest.
@@ -473,7 +419,7 @@ extern const uint8_t
 // Reset the token probabilities to their initial (default) values
 void VP8DefaultProbas(VP8Encoder* const enc);
 // Write the token probabilities
-void VP8WriteProbas(VP8BitWriter* const bw, const VP8Proba* const probas);
+void VP8WriteProbas(VP8BitWriter* const bw, const VP8EncProba* const probas);
 // Writes the partition #0 modes (that is: all intra modes)
 void VP8CodeIntraModes(VP8Encoder* const enc);
 
@@ -486,7 +432,6 @@ int VP8EncWrite(VP8Encoder* const enc);
 void VP8EncFreeBitWriters(VP8Encoder* const enc);
 
   // in frame.c
-extern const uint8_t VP8EncBands[16 + 1];
 extern const uint8_t VP8Cat3[];
 extern const uint8_t VP8Cat4[];
 extern const uint8_t VP8Cat5[];
@@ -569,12 +514,21 @@ int WebPPictureAllocARGB(WebPPicture* const picture, int width, int height);
 // Returns false in case of error (invalid param, out-of-memory).
 int WebPPictureAllocYUVA(WebPPicture* const picture, int width, int height);
 
+// Clean up the RGB samples under fully transparent areas, to help lossless
+// compressibility (no guarantee, though). Assumes that pic->use_argb is true.
+void WebPCleanupTransparentAreaLossless(WebPPicture* const pic);
+
+  // in near_lossless.c
+// Near lossless preprocessing in RGB color-space.
+int VP8ApplyNearLossless(int xsize, int ysize, uint32_t* argb, int quality);
+// Near lossless adjustment for predictors.
+void VP8ApplyNearLosslessPredict(int xsize, int ysize, int pred_bits,
+                                 const uint32_t* argb_orig,
+                                 uint32_t* argb, uint32_t* argb_scratch,
+                                 const uint32_t* const transform_data,
+                                 int quality, int subtract_green);
 //------------------------------------------------------------------------------
 
-#if WEBP_ENCODER_ABI_VERSION <= 0x0203
-void WebPMemoryWriterClear(WebPMemoryWriter* writer);
-#endif
-
 #ifdef __cplusplus
 }    // extern "C"
 #endif
diff --git a/src/3rdparty/libwebp/src/enc/vp8l.c b/src/3rdparty/libwebp/src/enc/vp8l.c
index c2bb13d..db94e78 100644
--- a/src/3rdparty/libwebp/src/enc/vp8l.c
+++ b/src/3rdparty/libwebp/src/enc/vp8l.c
@@ -13,10 +13,10 @@
 //
 
 #include <assert.h>
-#include <stdio.h>
 #include <stdlib.h>
 
 #include "./backward_references.h"
+#include "./histogram.h"
 #include "./vp8enci.h"
 #include "./vp8li.h"
 #include "../dsp/lossless.h"
@@ -25,23 +25,105 @@
 #include "../utils/utils.h"
 #include "../webp/format_constants.h"
 
+#include "./delta_palettization.h"
+
 #define PALETTE_KEY_RIGHT_SHIFT   22  // Key for 1K buffer.
-#define MAX_HUFF_IMAGE_SIZE       (16 * 1024 * 1024)
-#define MAX_COLORS_FOR_GRAPH      64
+// Maximum number of histogram images (sub-blocks).
+#define MAX_HUFF_IMAGE_SIZE       2600
 
-// -----------------------------------------------------------------------------
-// Palette
+// Palette reordering for smaller sum of deltas (and for smaller storage).
 
-static int CompareColors(const void* p1, const void* p2) {
-  const uint32_t a = *(const uint32_t*)p1;
-  const uint32_t b = *(const uint32_t*)p2;
+static int PaletteCompareColorsForQsort(const void* p1, const void* p2) {
+  const uint32_t a = WebPMemToUint32(p1);
+  const uint32_t b = WebPMemToUint32(p2);
   assert(a != b);
   return (a < b) ? -1 : 1;
 }
 
+static WEBP_INLINE uint32_t PaletteComponentDistance(uint32_t v) {
+  return (v <= 128) ? v : (256 - v);
+}
+
+// Computes a value that is related to the entropy created by the
+// palette entry diff.
+//
+// Note that the final '& 0xff' below (on diff >> 24) is a no-operation that
+// most compilers remove; it is kept only for regularity of the code.
+static WEBP_INLINE uint32_t PaletteColorDistance(uint32_t col1, uint32_t col2) {
+  const uint32_t diff = VP8LSubPixels(col1, col2);
+  const int kMoreWeightForRGBThanForAlpha = 9;
+  uint32_t score;
+  score =  PaletteComponentDistance((diff >>  0) & 0xff);
+  score += PaletteComponentDistance((diff >>  8) & 0xff);
+  score += PaletteComponentDistance((diff >> 16) & 0xff);
+  score *= kMoreWeightForRGBThanForAlpha;
+  score += PaletteComponentDistance((diff >> 24) & 0xff);
+  return score;
+}
+
+static WEBP_INLINE void SwapColor(uint32_t* const col1, uint32_t* const col2) {
+  const uint32_t tmp = *col1;
+  *col1 = *col2;
+  *col2 = tmp;
+}
+
+static void GreedyMinimizeDeltas(uint32_t palette[], int num_colors) {
+  // At each step, greedily pick the color closest to the predicted color to
+  // minimize deltas in the palette. This reduces storage needs since the
+  // palette is stored with delta encoding.
+  uint32_t predict = 0x00000000;
+  int i, k;
+  for (i = 0; i < num_colors; ++i) {
+    int best_ix = i;
+    uint32_t best_score = ~0U;
+    for (k = i; k < num_colors; ++k) {
+      const uint32_t cur_score = PaletteColorDistance(palette[k], predict);
+      if (best_score > cur_score) {
+        best_score = cur_score;
+        best_ix = k;
+      }
+    }
+    SwapColor(&palette[best_ix], &palette[i]);
+    predict = palette[i];
+  }
+}
+
+// The palette has been sorted by alpha. This function checks if the other
+// components of the palette have a monotonic development with regard to
+// position in the palette. If all have monotonic development, there is
+// no benefit to re-organize them greedily. A monotonic development
+// would be spotted in green-only situations (like lossy alpha) or gray-scale
+// images.
+static int PaletteHasNonMonotonousDeltas(uint32_t palette[], int num_colors) {
+  uint32_t predict = 0x000000;
+  int i;
+  uint8_t sign_found = 0x00;
+  for (i = 0; i < num_colors; ++i) {
+    const uint32_t diff = VP8LSubPixels(palette[i], predict);
+    const uint8_t rd = (diff >> 16) & 0xff;
+    const uint8_t gd = (diff >>  8) & 0xff;
+    const uint8_t bd = (diff >>  0) & 0xff;
+    if (rd != 0x00) {
+      sign_found |= (rd < 0x80) ? 1 : 2;
+    }
+    if (gd != 0x00) {
+      sign_found |= (gd < 0x80) ? 8 : 16;
+    }
+    if (bd != 0x00) {
+      sign_found |= (bd < 0x80) ? 64 : 128;
+    }
+    predict = palette[i];
+  }
+  return (sign_found & (sign_found << 1)) != 0;  // two consecutive signs.
+}
+
+// -----------------------------------------------------------------------------
+// Palette
+
 // If number of colors in the image is less than or equal to MAX_PALETTE_SIZE,
 // creates a palette and returns true, else returns false.
 static int AnalyzeAndCreatePalette(const WebPPicture* const pic,
+                                   int low_effort,
                                    uint32_t palette[MAX_PALETTE_SIZE],
                                    int* const palette_size) {
   int i, x, y, key;
@@ -92,84 +174,240 @@ static int AnalyzeAndCreatePalette(const WebPPicture* const pic,
       ++num_colors;
     }
   }
-
-  qsort(palette, num_colors, sizeof(*palette), CompareColors);
   *palette_size = num_colors;
+  qsort(palette, num_colors, sizeof(*palette), PaletteCompareColorsForQsort);
+  if (!low_effort && PaletteHasNonMonotonousDeltas(palette, num_colors)) {
+    GreedyMinimizeDeltas(palette, num_colors);
+  }
   return 1;
 }
 
-static int AnalyzeEntropy(const uint32_t* argb,
-                          int width, int height, int argb_stride,
-                          double* const nonpredicted_bits,
-                          double* const predicted_bits) {
-  int x, y;
-  const uint32_t* last_line = NULL;
-  uint32_t last_pix = argb[0];    // so we're sure that pix_diff == 0
+// These five modes are evaluated and their respective entropy is computed.
+typedef enum {
+  kDirect = 0,
+  kSpatial = 1,
+  kSubGreen = 2,
+  kSpatialSubGreen = 3,
+  kPalette = 4,
+  kNumEntropyIx = 5
+} EntropyIx;
+
+typedef enum {
+  kHistoAlpha = 0,
+  kHistoAlphaPred,
+  kHistoGreen,
+  kHistoGreenPred,
+  kHistoRed,
+  kHistoRedPred,
+  kHistoBlue,
+  kHistoBluePred,
+  kHistoRedSubGreen,
+  kHistoRedPredSubGreen,
+  kHistoBlueSubGreen,
+  kHistoBluePredSubGreen,
+  kHistoPalette,
+  kHistoTotal  // Must be last.
+} HistoIx;
+
+static void AddSingleSubGreen(uint32_t p, uint32_t* r, uint32_t* b) {
+  const uint32_t green = p >> 8;  // The upper bits are masked away later.
+  ++r[((p >> 16) - green) & 0xff];
+  ++b[(p - green) & 0xff];
+}
 
-  VP8LHistogramSet* const histo_set = VP8LAllocateHistogramSet(2, 0);
-  if (histo_set == NULL) return 0;
+static void AddSingle(uint32_t p,
+                      uint32_t* a, uint32_t* r, uint32_t* g, uint32_t* b) {
+  ++a[p >> 24];
+  ++r[(p >> 16) & 0xff];
+  ++g[(p >> 8) & 0xff];
+  ++b[(p & 0xff)];
+}
 
-  for (y = 0; y < height; ++y) {
-    for (x = 0; x < width; ++x) {
-      const uint32_t pix = argb[x];
-      const uint32_t pix_diff = VP8LSubPixels(pix, last_pix);
-      if (pix_diff == 0) continue;
-      if (last_line != NULL && pix == last_line[x]) {
-        continue;
+static int AnalyzeEntropy(const uint32_t* argb,
+                          int width, int height, int argb_stride,
+                          int use_palette,
+                          EntropyIx* const min_entropy_ix,
+                          int* const red_and_blue_always_zero) {
+  // Allocate kHistoTotal histograms of 256 bins each.
+  uint32_t* const histo =
+      (uint32_t*)WebPSafeCalloc(kHistoTotal, sizeof(*histo) * 256);
+  if (histo != NULL) {
+    int i, x, y;
+    const uint32_t* prev_row = argb;
+    const uint32_t* curr_row = argb + argb_stride;
+    for (y = 1; y < height; ++y) {
+      uint32_t prev_pix = curr_row[0];
+      for (x = 1; x < width; ++x) {
+        const uint32_t pix = curr_row[x];
+        const uint32_t pix_diff = VP8LSubPixels(pix, prev_pix);
+        if ((pix_diff == 0) || (pix == prev_row[x])) continue;
+        prev_pix = pix;
+        AddSingle(pix,
+                  &histo[kHistoAlpha * 256],
+                  &histo[kHistoRed * 256],
+                  &histo[kHistoGreen * 256],
+                  &histo[kHistoBlue * 256]);
+        AddSingle(pix_diff,
+                  &histo[kHistoAlphaPred * 256],
+                  &histo[kHistoRedPred * 256],
+                  &histo[kHistoGreenPred * 256],
+                  &histo[kHistoBluePred * 256]);
+        AddSingleSubGreen(pix,
+                          &histo[kHistoRedSubGreen * 256],
+                          &histo[kHistoBlueSubGreen * 256]);
+        AddSingleSubGreen(pix_diff,
+                          &histo[kHistoRedPredSubGreen * 256],
+                          &histo[kHistoBluePredSubGreen * 256]);
+        {
+          // Approximate the palette by the entropy of the multiplicative hash.
+          const int hash = ((pix + (pix >> 19)) * 0x39c5fba7) >> 24;
+          ++histo[kHistoPalette * 256 + (hash & 0xff)];
+        }
+      }
+      prev_row = curr_row;
+      curr_row += argb_stride;
+    }
+    {
+      double entropy_comp[kHistoTotal];
+      double entropy[kNumEntropyIx];
+      EntropyIx k;
+      EntropyIx last_mode_to_analyze =
+          use_palette ? kPalette : kSpatialSubGreen;
+      int j;
+      // Let's add one zero to the predicted histograms. The zeros are removed
+      // too efficiently by the pix_diff == 0 comparison, yet at least one of
+      // the zeros is likely to exist.
+      ++histo[kHistoRedPredSubGreen * 256];
+      ++histo[kHistoBluePredSubGreen * 256];
+      ++histo[kHistoRedPred * 256];
+      ++histo[kHistoGreenPred * 256];
+      ++histo[kHistoBluePred * 256];
+      ++histo[kHistoAlphaPred * 256];
+
+      for (j = 0; j < kHistoTotal; ++j) {
+        entropy_comp[j] = VP8LBitsEntropy(&histo[j * 256], 256, NULL);
       }
-      last_pix = pix;
+      entropy[kDirect] = entropy_comp[kHistoAlpha] +
+          entropy_comp[kHistoRed] +
+          entropy_comp[kHistoGreen] +
+          entropy_comp[kHistoBlue];
+      entropy[kSpatial] = entropy_comp[kHistoAlphaPred] +
+          entropy_comp[kHistoRedPred] +
+          entropy_comp[kHistoGreenPred] +
+          entropy_comp[kHistoBluePred];
+      entropy[kSubGreen] = entropy_comp[kHistoAlpha] +
+          entropy_comp[kHistoRedSubGreen] +
+          entropy_comp[kHistoGreen] +
+          entropy_comp[kHistoBlueSubGreen];
+      entropy[kSpatialSubGreen] = entropy_comp[kHistoAlphaPred] +
+          entropy_comp[kHistoRedPredSubGreen] +
+          entropy_comp[kHistoGreenPred] +
+          entropy_comp[kHistoBluePredSubGreen];
+      // Palette mode seems more efficient in a breakeven case. Bias with 1.0.
+      entropy[kPalette] = entropy_comp[kHistoPalette] - 1.0;
+
+      *min_entropy_ix = kDirect;
+      for (k = kDirect + 1; k <= last_mode_to_analyze; ++k) {
+        if (entropy[*min_entropy_ix] > entropy[k]) {
+          *min_entropy_ix = k;
+        }
+      }
+      *red_and_blue_always_zero = 1;
+      // Let's check if the histogram of the chosen entropy mode has
+      // non-zero red and blue values. If all are zero, we can later skip
+      // the cross color optimization.
       {
-        const PixOrCopy pix_token = PixOrCopyCreateLiteral(pix);
-        const PixOrCopy pix_diff_token = PixOrCopyCreateLiteral(pix_diff);
-        VP8LHistogramAddSinglePixOrCopy(histo_set->histograms[0], &pix_token);
-        VP8LHistogramAddSinglePixOrCopy(histo_set->histograms[1],
-                                        &pix_diff_token);
+        static const uint8_t kHistoPairs[5][2] = {
+          { kHistoRed, kHistoBlue },
+          { kHistoRedPred, kHistoBluePred },
+          { kHistoRedSubGreen, kHistoBlueSubGreen },
+          { kHistoRedPredSubGreen, kHistoBluePredSubGreen },
+          { kHistoRed, kHistoBlue }
+        };
+        const uint32_t* const red_histo =
+            &histo[256 * kHistoPairs[*min_entropy_ix][0]];
+        const uint32_t* const blue_histo =
+            &histo[256 * kHistoPairs[*min_entropy_ix][1]];
+        for (i = 1; i < 256; ++i) {
+          if ((red_histo[i] | blue_histo[i]) != 0) {
+            *red_and_blue_always_zero = 0;
+            break;
+          }
+        }
       }
     }
-    last_line = argb;
-    argb += argb_stride;
+    free(histo);
+    return 1;
+  } else {
+    return 0;
   }
-  *nonpredicted_bits = VP8LHistogramEstimateBitsBulk(histo_set->histograms[0]);
-  *predicted_bits = VP8LHistogramEstimateBitsBulk(histo_set->histograms[1]);
-  VP8LFreeHistogramSet(histo_set);
-  return 1;
 }
 
-static int AnalyzeAndInit(VP8LEncoder* const enc, WebPImageHint image_hint) {
+static int GetHistoBits(int method, int use_palette, int width, int height) {
+  // Make tile size a function of encoding method (Range: 0 to 6).
+  int histo_bits = (use_palette ? 9 : 7) - method;
+  while (1) {
+    const int huff_image_size = VP8LSubSampleSize(width, histo_bits) *
+                                VP8LSubSampleSize(height, histo_bits);
+    if (huff_image_size <= MAX_HUFF_IMAGE_SIZE) break;
+    ++histo_bits;
+  }
+  return (histo_bits < MIN_HUFFMAN_BITS) ? MIN_HUFFMAN_BITS :
+         (histo_bits > MAX_HUFFMAN_BITS) ? MAX_HUFFMAN_BITS : histo_bits;
+}
+
+static int GetTransformBits(int method, int histo_bits) {
+  const int max_transform_bits = (method < 4) ? 6 : (method > 4) ? 4 : 5;
+  return (histo_bits > max_transform_bits) ? max_transform_bits : histo_bits;
+}
+
+static int AnalyzeAndInit(VP8LEncoder* const enc) {
   const WebPPicture* const pic = enc->pic_;
   const int width = pic->width;
   const int height = pic->height;
   const int pix_cnt = width * height;
+  const WebPConfig* const config = enc->config_;
+  const int method = config->method;
+  const int low_effort = (config->method == 0);
   // we round the block size up, so we're guaranteed to have
   // at max MAX_REFS_BLOCK_PER_IMAGE blocks used:
   int refs_block_size = (pix_cnt - 1) / MAX_REFS_BLOCK_PER_IMAGE + 1;
   assert(pic != NULL && pic->argb != NULL);
 
+  enc->use_cross_color_ = 0;
+  enc->use_predict_ = 0;
+  enc->use_subtract_green_ = 0;
   enc->use_palette_ =
-      AnalyzeAndCreatePalette(pic, enc->palette_, &enc->palette_size_);
+      AnalyzeAndCreatePalette(pic, low_effort,
+                              enc->palette_, &enc->palette_size_);
 
-  if (image_hint == WEBP_HINT_GRAPH) {
-    if (enc->use_palette_ && enc->palette_size_ < MAX_COLORS_FOR_GRAPH) {
-      enc->use_palette_ = 0;
-    }
-  }
+  // TODO(jyrki): replace the decision to be based on an actual estimate
+  // of entropy, or even spatial variance of entropy.
+  enc->histo_bits_ = GetHistoBits(method, enc->use_palette_,
+                                  pic->width, pic->height);
+  enc->transform_bits_ = GetTransformBits(method, enc->histo_bits_);
 
-  if (!enc->use_palette_) {
-    if (image_hint == WEBP_HINT_PHOTO) {
-      enc->use_predict_ = 1;
-      enc->use_cross_color_ = 1;
-    } else {
-      double non_pred_entropy, pred_entropy;
-      if (!AnalyzeEntropy(pic->argb, width, height, pic->argb_stride,
-                          &non_pred_entropy, &pred_entropy)) {
-        return 0;
-      }
-      if (pred_entropy < 0.95 * non_pred_entropy) {
-        enc->use_predict_ = 1;
-        enc->use_cross_color_ = 1;
-      }
+  if (low_effort) {
+    // AnalyzeEntropy is somewhat slow.
+    enc->use_predict_ = !enc->use_palette_;
+    enc->use_subtract_green_ = !enc->use_palette_;
+    enc->use_cross_color_ = 0;
+  } else {
+    int red_and_blue_always_zero;
+    EntropyIx min_entropy_ix;
+    if (!AnalyzeEntropy(pic->argb, width, height, pic->argb_stride,
+                        enc->use_palette_, &min_entropy_ix,
+                        &red_and_blue_always_zero)) {
+      return 0;
     }
+    enc->use_palette_ = (min_entropy_ix == kPalette);
+    enc->use_subtract_green_ =
+        (min_entropy_ix == kSubGreen) || (min_entropy_ix == kSpatialSubGreen);
+    enc->use_predict_ =
+        (min_entropy_ix == kSpatial) || (min_entropy_ix == kSpatialSubGreen);
+    enc->use_cross_color_ = red_and_blue_always_zero ? 0 : enc->use_predict_;
   }
+
   if (!VP8LHashChainInit(&enc->hash_chain_, pix_cnt)) return 0;
 
   // palette-friendly input typically uses less literals
@@ -271,9 +509,9 @@ static void StoreHuffmanTreeOfHuffmanTreeToBitMask(
       break;
     }
   }
-  VP8LWriteBits(bw, 4, codes_to_store - 4);
+  VP8LPutBits(bw, codes_to_store - 4, 4);
   for (i = 0; i < codes_to_store; ++i) {
-    VP8LWriteBits(bw, 3, code_length_bitdepth[kStorageOrder[i]]);
+    VP8LPutBits(bw, code_length_bitdepth[kStorageOrder[i]], 3);
   }
 }
 
@@ -301,16 +539,16 @@ static void StoreHuffmanTreeToBitMask(
   for (i = 0; i < num_tokens; ++i) {
     const int ix = tokens[i].code;
     const int extra_bits = tokens[i].extra_bits;
-    VP8LWriteBits(bw, huffman_code->code_lengths[ix], huffman_code->codes[ix]);
+    VP8LPutBits(bw, huffman_code->codes[ix], huffman_code->code_lengths[ix]);
     switch (ix) {
       case 16:
-        VP8LWriteBits(bw, 2, extra_bits);
+        VP8LPutBits(bw, extra_bits, 2);
         break;
       case 17:
-        VP8LWriteBits(bw, 3, extra_bits);
+        VP8LPutBits(bw, extra_bits, 3);
         break;
       case 18:
-        VP8LWriteBits(bw, 7, extra_bits);
+        VP8LPutBits(bw, extra_bits, 7);
         break;
     }
   }
@@ -330,7 +568,7 @@ static void StoreFullHuffmanCode(VP8LBitWriter* const bw,
   huffman_code.code_lengths = code_length_bitdepth;
   huffman_code.codes = code_length_bitdepth_symbols;
 
-  VP8LWriteBits(bw, 1, 0);
+  VP8LPutBits(bw, 0, 1);
   num_tokens = VP8LCreateCompressedHuffmanTree(tree, tokens, max_tokens);
   {
     uint32_t histogram[CODE_LENGTH_CODES] = { 0 };
@@ -367,13 +605,13 @@ static void StoreFullHuffmanCode(VP8LBitWriter* const bw,
     }
     write_trimmed_length = (trimmed_length > 1 && trailing_zero_bits > 12);
     length = write_trimmed_length ? trimmed_length : num_tokens;
-    VP8LWriteBits(bw, 1, write_trimmed_length);
+    VP8LPutBits(bw, write_trimmed_length, 1);
     if (write_trimmed_length) {
       const int nbits = VP8LBitsLog2Ceiling(trimmed_length - 1);
       const int nbitpairs = (nbits == 0) ? 1 : (nbits + 1) / 2;
-      VP8LWriteBits(bw, 3, nbitpairs - 1);
+      VP8LPutBits(bw, nbitpairs - 1, 3);
       assert(trimmed_length >= 2);
-      VP8LWriteBits(bw, nbitpairs * 2, trimmed_length - 2);
+      VP8LPutBits(bw, trimmed_length - 2, nbitpairs * 2);
     }
     StoreHuffmanTreeToBitMask(bw, tokens, length, &huffman_code);
   }
@@ -400,31 +638,42 @@ static void StoreHuffmanCode(VP8LBitWriter* const bw,
 
   if (count == 0) {   // emit minimal tree for empty cases
     // bits: small tree marker: 1, count-1: 0, large 8-bit code: 0, code: 0
-    VP8LWriteBits(bw, 4, 0x01);
+    VP8LPutBits(bw, 0x01, 4);
   } else if (count <= 2 && symbols[0] < kMaxSymbol && symbols[1] < kMaxSymbol) {
-    VP8LWriteBits(bw, 1, 1);  // Small tree marker to encode 1 or 2 symbols.
-    VP8LWriteBits(bw, 1, count - 1);
+    VP8LPutBits(bw, 1, 1);  // Small tree marker to encode 1 or 2 symbols.
+    VP8LPutBits(bw, count - 1, 1);
     if (symbols[0] <= 1) {
-      VP8LWriteBits(bw, 1, 0);  // Code bit for small (1 bit) symbol value.
-      VP8LWriteBits(bw, 1, symbols[0]);
+      VP8LPutBits(bw, 0, 1);  // Code bit for small (1 bit) symbol value.
+      VP8LPutBits(bw, symbols[0], 1);
     } else {
-      VP8LWriteBits(bw, 1, 1);
-      VP8LWriteBits(bw, 8, symbols[0]);
+      VP8LPutBits(bw, 1, 1);
+      VP8LPutBits(bw, symbols[0], 8);
     }
     if (count == 2) {
-      VP8LWriteBits(bw, 8, symbols[1]);
+      VP8LPutBits(bw, symbols[1], 8);
     }
   } else {
     StoreFullHuffmanCode(bw, huff_tree, tokens, huffman_code);
   }
 }
 
-static void WriteHuffmanCode(VP8LBitWriter* const bw,
+static WEBP_INLINE void WriteHuffmanCode(VP8LBitWriter* const bw,
                              const HuffmanTreeCode* const code,
                              int code_index) {
   const int depth = code->code_lengths[code_index];
   const int symbol = code->codes[code_index];
-  VP8LWriteBits(bw, depth, symbol);
+  VP8LPutBits(bw, symbol, depth);
+}
+
+static WEBP_INLINE void WriteHuffmanCodeWithExtraBits(
+    VP8LBitWriter* const bw,
+    const HuffmanTreeCode* const code,
+    int code_index,
+    int bits,
+    int n_bits) {
+  const int depth = code->code_lengths[code_index];
+  const int symbol = code->codes[code_index];
+  VP8LPutBits(bw, (bits << depth) | symbol, depth + n_bits);
 }
 
 static WebPEncodingError StoreImageToBitMask(
@@ -432,40 +681,51 @@ static WebPEncodingError StoreImageToBitMask(
     VP8LBackwardRefs* const refs,
     const uint16_t* histogram_symbols,
     const HuffmanTreeCode* const huffman_codes) {
+  const int histo_xsize = histo_bits ? VP8LSubSampleSize(width, histo_bits) : 1;
+  const int tile_mask = (histo_bits == 0) ? 0 : -(1 << histo_bits);
   // x and y trace the position in the image.
   int x = 0;
   int y = 0;
-  const int histo_xsize = histo_bits ? VP8LSubSampleSize(width, histo_bits) : 1;
+  int tile_x = x & tile_mask;
+  int tile_y = y & tile_mask;
+  int histogram_ix = histogram_symbols[0];
+  const HuffmanTreeCode* codes = huffman_codes + 5 * histogram_ix;
   VP8LRefsCursor c = VP8LRefsCursorInit(refs);
   while (VP8LRefsCursorOk(&c)) {
     const PixOrCopy* const v = c.cur_pos;
-    const int histogram_ix = histogram_symbols[histo_bits ?
-                                               (y >> histo_bits) * histo_xsize +
-                                               (x >> histo_bits) : 0];
-    const HuffmanTreeCode* const codes = huffman_codes + 5 * histogram_ix;
-    if (PixOrCopyIsCacheIdx(v)) {
-      const int code = PixOrCopyCacheIdx(v);
-      const int literal_ix = 256 + NUM_LENGTH_CODES + code;
-      WriteHuffmanCode(bw, codes, literal_ix);
-    } else if (PixOrCopyIsLiteral(v)) {
+    if ((tile_x != (x & tile_mask)) || (tile_y != (y & tile_mask))) {
+      tile_x = x & tile_mask;
+      tile_y = y & tile_mask;
+      histogram_ix = histogram_symbols[(y >> histo_bits) * histo_xsize +
+                                       (x >> histo_bits)];
+      codes = huffman_codes + 5 * histogram_ix;
+    }
+    if (PixOrCopyIsLiteral(v)) {
       static const int order[] = { 1, 2, 0, 3 };
       int k;
       for (k = 0; k < 4; ++k) {
         const int code = PixOrCopyLiteral(v, order[k]);
         WriteHuffmanCode(bw, codes + k, code);
       }
+    } else if (PixOrCopyIsCacheIdx(v)) {
+      const int code = PixOrCopyCacheIdx(v);
+      const int literal_ix = 256 + NUM_LENGTH_CODES + code;
+      WriteHuffmanCode(bw, codes, literal_ix);
     } else {
       int bits, n_bits;
-      int code, distance;
+      int code;
 
+      const int distance = PixOrCopyDistance(v);
       VP8LPrefixEncode(v->len, &code, &n_bits, &bits);
-      WriteHuffmanCode(bw, codes, 256 + code);
-      VP8LWriteBits(bw, n_bits, bits);
+      WriteHuffmanCodeWithExtraBits(bw, codes, 256 + code, bits, n_bits);
 
-      distance = PixOrCopyDistance(v);
+      // Don't fold the distance's extra bits into the prefix-code write:
+      // they can take up to 18 bits and the prefix code up to 15, i.e. 33 in
+      // total, whereas our PutBits only supports writes of up to 32 bits.
+      // TODO(jyrki): optimize this further.
       VP8LPrefixEncode(distance, &code, &n_bits, &bits);
       WriteHuffmanCode(bw, codes + 4, code);
-      VP8LWriteBits(bw, n_bits, bits);
+      VP8LPutBits(bw, bits, n_bits);
     }
     x += PixOrCopyLength(v);
     while (x >= width) {
@@ -491,21 +751,28 @@ static WebPEncodingError EncodeImageNoHuffman(VP8LBitWriter* const bw,
   HuffmanTreeToken* tokens = NULL;
   HuffmanTreeCode huffman_codes[5] = { { 0, NULL, NULL } };
   const uint16_t histogram_symbols[1] = { 0 };    // only one tree, one symbol
-  VP8LHistogramSet* const histogram_image = VP8LAllocateHistogramSet(1, 0);
+  int cache_bits = 0;
+  VP8LHistogramSet* histogram_image = NULL;
   HuffmanTree* const huff_tree = (HuffmanTree*)WebPSafeMalloc(
         3ULL * CODE_LENGTH_CODES, sizeof(*huff_tree));
-  if (histogram_image == NULL || huff_tree == NULL) {
+  if (huff_tree == NULL) {
     err = VP8_ENC_ERROR_OUT_OF_MEMORY;
     goto Error;
   }
 
   // Calculate backward references from ARGB image.
-  refs = VP8LGetBackwardReferences(width, height, argb, quality, 0, 1,
+  refs = VP8LGetBackwardReferences(width, height, argb, quality, 0, &cache_bits,
                                    hash_chain, refs_array);
   if (refs == NULL) {
     err = VP8_ENC_ERROR_OUT_OF_MEMORY;
     goto Error;
   }
+  histogram_image = VP8LAllocateHistogramSet(1, cache_bits);
+  if (histogram_image == NULL) {
+    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+    goto Error;
+  }
+
   // Build histogram image and symbols from backward references.
   VP8LHistogramStoreRefs(refs, histogram_image->histograms[0]);
 
@@ -517,7 +784,7 @@ static WebPEncodingError EncodeImageNoHuffman(VP8LBitWriter* const bw,
   }
 
   // No color cache, no Huffman image.
-  VP8LWriteBits(bw, 1, 0);
+  VP8LPutBits(bw, 0, 1);
 
   // Find maximum number of symbols for the huffman tree-set.
   for (i = 0; i < 5; ++i) {
@@ -557,16 +824,17 @@ static WebPEncodingError EncodeImageInternal(VP8LBitWriter* const bw,
                                              VP8LHashChain* const hash_chain,
                                              VP8LBackwardRefs refs_array[2],
                                              int width, int height, int quality,
-                                             int cache_bits,
-                                             int histogram_bits) {
+                                             int low_effort, int* cache_bits,
+                                             int histogram_bits,
+                                             size_t init_byte_position,
+                                             int* const hdr_size,
+                                             int* const data_size) {
   WebPEncodingError err = VP8_ENC_OK;
-  const int use_2d_locality = 1;
-  const int use_color_cache = (cache_bits > 0);
   const uint32_t histogram_image_xysize =
       VP8LSubSampleSize(width, histogram_bits) *
       VP8LSubSampleSize(height, histogram_bits);
-  VP8LHistogramSet* histogram_image =
-      VP8LAllocateHistogramSet(histogram_image_xysize, cache_bits);
+  VP8LHistogramSet* histogram_image = NULL;
+  VP8LHistogramSet* tmp_histos = NULL;
   int histogram_image_size = 0;
   size_t bit_array_size = 0;
   HuffmanTree* huff_tree = NULL;
@@ -579,28 +847,39 @@ static WebPEncodingError EncodeImageInternal(VP8LBitWriter* const bw,
                                 sizeof(*histogram_symbols));
   assert(histogram_bits >= MIN_HUFFMAN_BITS);
   assert(histogram_bits <= MAX_HUFFMAN_BITS);
+  assert(hdr_size != NULL);
+  assert(data_size != NULL);
 
   VP8LBackwardRefsInit(&refs, refs_array[0].block_size_);
-  if (histogram_image == NULL || histogram_symbols == NULL) {
-    VP8LFreeHistogramSet(histogram_image);
-    WebPSafeFree(histogram_symbols);
-    return 0;
+  if (histogram_symbols == NULL) {
+    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+    goto Error;
   }
 
+  *cache_bits = MAX_COLOR_CACHE_BITS;
   // 'best_refs' is the reference to the best backward refs and points to one
   // of refs_array[0] or refs_array[1].
   // Calculate backward references from ARGB image.
   best_refs = VP8LGetBackwardReferences(width, height, argb, quality,
-                                        cache_bits, use_2d_locality,
-                                        hash_chain, refs_array);
+                                        low_effort, cache_bits, hash_chain,
+                                        refs_array);
   if (best_refs == NULL || !VP8LBackwardRefsCopy(best_refs, &refs)) {
+    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
     goto Error;
   }
+  histogram_image =
+      VP8LAllocateHistogramSet(histogram_image_xysize, *cache_bits);
+  tmp_histos = VP8LAllocateHistogramSet(2, *cache_bits);
+  if (histogram_image == NULL || tmp_histos == NULL) {
+    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+    goto Error;
+  }
+
   // Build histogram image and symbols from backward references.
-  if (!VP8LGetHistoImageSymbols(width, height, &refs,
-                                quality, histogram_bits, cache_bits,
-                                histogram_image,
-                                histogram_symbols)) {
+  if (!VP8LGetHistoImageSymbols(width, height, &refs, quality, low_effort,
+                                histogram_bits, *cache_bits, histogram_image,
+                                tmp_histos, histogram_symbols)) {
+    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
     goto Error;
   }
   // Create Huffman bit lengths and codes for each histogram image.
@@ -608,41 +887,53 @@ static WebPEncodingError EncodeImageInternal(VP8LBitWriter* const bw,
   bit_array_size = 5 * histogram_image_size;
   huffman_codes = (HuffmanTreeCode*)WebPSafeCalloc(bit_array_size,
                                                    sizeof(*huffman_codes));
+  // Note: some histogram_image entries may point to tmp_histos[], so the latter
+  // need to outlive the following call to GetHuffBitLengthsAndCodes().
   if (huffman_codes == NULL ||
       !GetHuffBitLengthsAndCodes(histogram_image, huffman_codes)) {
+    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
     goto Error;
   }
   // Free combined histograms.
   VP8LFreeHistogramSet(histogram_image);
   histogram_image = NULL;
 
+  // Free scratch histograms.
+  VP8LFreeHistogramSet(tmp_histos);
+  tmp_histos = NULL;
+
   // Color Cache parameters.
-  VP8LWriteBits(bw, 1, use_color_cache);
-  if (use_color_cache) {
-    VP8LWriteBits(bw, 4, cache_bits);
+  if (*cache_bits > 0) {
+    VP8LPutBits(bw, 1, 1);
+    VP8LPutBits(bw, *cache_bits, 4);
+  } else {
+    VP8LPutBits(bw, 0, 1);
   }
 
   // Huffman image + meta huffman.
   {
     const int write_histogram_image = (histogram_image_size > 1);
-    VP8LWriteBits(bw, 1, write_histogram_image);
+    VP8LPutBits(bw, write_histogram_image, 1);
     if (write_histogram_image) {
       uint32_t* const histogram_argb =
           (uint32_t*)WebPSafeMalloc(histogram_image_xysize,
                                     sizeof(*histogram_argb));
       int max_index = 0;
       uint32_t i;
-      if (histogram_argb == NULL) goto Error;
+      if (histogram_argb == NULL) {
+        err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+        goto Error;
+      }
       for (i = 0; i < histogram_image_xysize; ++i) {
         const int symbol_index = histogram_symbols[i] & 0xffff;
-        histogram_argb[i] = 0xff000000 | (symbol_index << 8);
+        histogram_argb[i] = (symbol_index << 8);
         if (symbol_index >= max_index) {
           max_index = symbol_index + 1;
         }
       }
       histogram_image_size = max_index;
 
-      VP8LWriteBits(bw, 3, histogram_bits - 2);
+      VP8LPutBits(bw, histogram_bits - 2, 3);
       err = EncodeImageNoHuffman(bw, histogram_argb, hash_chain, refs_array,
                                  VP8LSubSampleSize(width, histogram_bits),
                                  VP8LSubSampleSize(height, histogram_bits),
@@ -658,7 +949,10 @@ static WebPEncodingError EncodeImageInternal(VP8LBitWriter* const bw,
     int max_tokens = 0;
     huff_tree = (HuffmanTree*)WebPSafeMalloc(3ULL * CODE_LENGTH_CODES,
                                              sizeof(*huff_tree));
-    if (huff_tree == NULL) goto Error;
+    if (huff_tree == NULL) {
+      err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+      goto Error;
+    }
     // Find maximum number of symbols for the huffman tree-set.
     for (i = 0; i < 5 * histogram_image_size; ++i) {
       HuffmanTreeCode* const codes = &huffman_codes[i];
@@ -668,7 +962,10 @@ static WebPEncodingError EncodeImageInternal(VP8LBitWriter* const bw,
     }
     tokens = (HuffmanTreeToken*)WebPSafeMalloc(max_tokens,
                                                sizeof(*tokens));
-    if (tokens == NULL) goto Error;
+    if (tokens == NULL) {
+      err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+      goto Error;
+    }
     for (i = 0; i < 5 * histogram_image_size; ++i) {
       HuffmanTreeCode* const codes = &huffman_codes[i];
       StoreHuffmanCode(bw, huff_tree, tokens, codes);
@@ -676,14 +973,18 @@ static WebPEncodingError EncodeImageInternal(VP8LBitWriter* const bw,
     }
   }
 
+  *hdr_size = (int)(VP8LBitWriterNumBytes(bw) - init_byte_position);
   // Store actual literals.
   err = StoreImageToBitMask(bw, width, histogram_bits, &refs,
                             histogram_symbols, huffman_codes);
+  *data_size =
+        (int)(VP8LBitWriterNumBytes(bw) - init_byte_position - *hdr_size);
 
  Error:
   WebPSafeFree(tokens);
   WebPSafeFree(huff_tree);
   VP8LFreeHistogramSet(histogram_image);
+  VP8LFreeHistogramSet(tmp_histos);
   VP8LBackwardRefsClear(&refs);
   if (huffman_codes != NULL) {
     WebPSafeFree(huffman_codes->codes);
@@ -696,59 +997,28 @@ static WebPEncodingError EncodeImageInternal(VP8LBitWriter* const bw,
 // -----------------------------------------------------------------------------
 // Transforms
 
-// Check if it would be a good idea to subtract green from red and blue. We
-// only impact entropy in red/blue components, don't bother to look at others.
-static WebPEncodingError EvalAndApplySubtractGreen(VP8LEncoder* const enc,
-                                                   int width, int height,
-                                                   VP8LBitWriter* const bw) {
-  if (!enc->use_palette_) {
-    int i;
-    const uint32_t* const argb = enc->argb_;
-    double bit_cost_before, bit_cost_after;
-    // Allocate histogram with cache_bits = 1.
-    VP8LHistogram* const histo = VP8LAllocateHistogram(1);
-    if (histo == NULL) return VP8_ENC_ERROR_OUT_OF_MEMORY;
-    for (i = 0; i < width * height; ++i) {
-      const uint32_t c = argb[i];
-      ++histo->red_[(c >> 16) & 0xff];
-      ++histo->blue_[(c >> 0) & 0xff];
-    }
-    bit_cost_before = VP8LHistogramEstimateBits(histo);
-
-    VP8LHistogramInit(histo, 1);
-    for (i = 0; i < width * height; ++i) {
-      const uint32_t c = argb[i];
-      const int green = (c >> 8) & 0xff;
-      ++histo->red_[((c >> 16) - green) & 0xff];
-      ++histo->blue_[((c >> 0) - green) & 0xff];
-    }
-    bit_cost_after = VP8LHistogramEstimateBits(histo);
-    VP8LFreeHistogram(histo);
-
-    // Check if subtracting green yields low entropy.
-    enc->use_subtract_green_ = (bit_cost_after < bit_cost_before);
-    if (enc->use_subtract_green_) {
-      VP8LWriteBits(bw, 1, TRANSFORM_PRESENT);
-      VP8LWriteBits(bw, 2, SUBTRACT_GREEN);
-      VP8LSubtractGreenFromBlueAndRed(enc->argb_, width * height);
-    }
-  }
-  return VP8_ENC_OK;
+static void ApplySubtractGreen(VP8LEncoder* const enc, int width, int height,
+                               VP8LBitWriter* const bw) {
+  VP8LPutBits(bw, TRANSFORM_PRESENT, 1);
+  VP8LPutBits(bw, SUBTRACT_GREEN, 2);
+  VP8LSubtractGreenFromBlueAndRed(enc->argb_, width * height);
 }
 
 static WebPEncodingError ApplyPredictFilter(const VP8LEncoder* const enc,
-                                            int width, int height, int quality,
+                                            int width, int height,
+                                            int quality, int low_effort,
                                             VP8LBitWriter* const bw) {
   const int pred_bits = enc->transform_bits_;
   const int transform_width = VP8LSubSampleSize(width, pred_bits);
   const int transform_height = VP8LSubSampleSize(height, pred_bits);
 
-  VP8LResidualImage(width, height, pred_bits, enc->argb_, enc->argb_scratch_,
-                    enc->transform_data_);
-  VP8LWriteBits(bw, 1, TRANSFORM_PRESENT);
-  VP8LWriteBits(bw, 2, PREDICTOR_TRANSFORM);
+  VP8LResidualImage(width, height, pred_bits, low_effort, enc->argb_,
+                    enc->argb_scratch_, enc->transform_data_,
+                    enc->config_->exact);
+  VP8LPutBits(bw, TRANSFORM_PRESENT, 1);
+  VP8LPutBits(bw, PREDICTOR_TRANSFORM, 2);
   assert(pred_bits >= 2);
-  VP8LWriteBits(bw, 3, pred_bits - 2);
+  VP8LPutBits(bw, pred_bits - 2, 3);
   return EncodeImageNoHuffman(bw, enc->transform_data_,
                               (VP8LHashChain*)&enc->hash_chain_,
                               (VP8LBackwardRefs*)enc->refs_,  // cast const away
@@ -766,10 +1036,10 @@ static WebPEncodingError ApplyCrossColorFilter(const VP8LEncoder* const enc,
 
   VP8LColorSpaceTransform(width, height, ccolor_transform_bits, quality,
                           enc->argb_, enc->transform_data_);
-  VP8LWriteBits(bw, 1, TRANSFORM_PRESENT);
-  VP8LWriteBits(bw, 2, CROSS_COLOR_TRANSFORM);
+  VP8LPutBits(bw, TRANSFORM_PRESENT, 1);
+  VP8LPutBits(bw, CROSS_COLOR_TRANSFORM, 2);
   assert(ccolor_transform_bits >= 2);
-  VP8LWriteBits(bw, 3, ccolor_transform_bits - 2);
+  VP8LPutBits(bw, ccolor_transform_bits - 2, 3);
   return EncodeImageNoHuffman(bw, enc->transform_data_,
                               (VP8LHashChain*)&enc->hash_chain_,
                               (VP8LBackwardRefs*)enc->refs_,  // cast const away
@@ -799,14 +1069,14 @@ static int WriteImageSize(const WebPPicture* const pic,
   const int height = pic->height - 1;
   assert(width < WEBP_MAX_DIMENSION && height < WEBP_MAX_DIMENSION);
 
-  VP8LWriteBits(bw, VP8L_IMAGE_SIZE_BITS, width);
-  VP8LWriteBits(bw, VP8L_IMAGE_SIZE_BITS, height);
+  VP8LPutBits(bw, width, VP8L_IMAGE_SIZE_BITS);
+  VP8LPutBits(bw, height, VP8L_IMAGE_SIZE_BITS);
   return !bw->error_;
 }
 
 static int WriteRealAlphaAndVersion(VP8LBitWriter* const bw, int has_alpha) {
-  VP8LWriteBits(bw, 1, has_alpha);
-  VP8LWriteBits(bw, VP8L_VERSION_BITS, VP8L_VERSION);
+  VP8LPutBits(bw, has_alpha, 1);
+  VP8LPutBits(bw, VP8L_VERSION, VP8L_VERSION_BITS);
   return !bw->error_;
 }
 
@@ -846,39 +1116,107 @@ static WebPEncodingError WriteImage(const WebPPicture* const pic,
 
 // Allocates the memory for argb (W x H) buffer, 2 rows of context for
 // prediction and transform data.
+// Flags influencing the memory allocated:
+//  enc->transform_bits_
+//  enc->use_predict_, enc->use_cross_color_
 static WebPEncodingError AllocateTransformBuffer(VP8LEncoder* const enc,
                                                  int width, int height) {
   WebPEncodingError err = VP8_ENC_OK;
-  const int tile_size = 1 << enc->transform_bits_;
-  const uint64_t image_size = width * height;
-  const uint64_t argb_scratch_size = tile_size * width + width;
-  const int transform_data_size =
-      VP8LSubSampleSize(width, enc->transform_bits_) *
-      VP8LSubSampleSize(height, enc->transform_bits_);
-  const uint64_t total_size =
-      image_size + argb_scratch_size + (uint64_t)transform_data_size;
-  uint32_t* mem = (uint32_t*)WebPSafeMalloc(total_size, sizeof(*mem));
-  if (mem == NULL) {
-    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
-    goto Error;
+  if (enc->argb_ == NULL) {
+    const int tile_size = 1 << enc->transform_bits_;
+    const uint64_t image_size = width * height;
+    // Ensure enough size for tiles, as well as for two scanlines and two
+    // extra pixels for CopyImageWithPrediction.
+    const uint64_t argb_scratch_size =
+        enc->use_predict_ ? tile_size * width + width + 2 : 0;
+    const int transform_data_size =
+        (enc->use_predict_ || enc->use_cross_color_)
+            ? VP8LSubSampleSize(width, enc->transform_bits_) *
+              VP8LSubSampleSize(height, enc->transform_bits_)
+            : 0;
+    const uint64_t total_size =
+        image_size + WEBP_ALIGN_CST +
+        argb_scratch_size + WEBP_ALIGN_CST +
+        (uint64_t)transform_data_size;
+    uint32_t* mem = (uint32_t*)WebPSafeMalloc(total_size, sizeof(*mem));
+    if (mem == NULL) {
+      err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+      goto Error;
+    }
+    enc->argb_ = mem;
+    mem = (uint32_t*)WEBP_ALIGN(mem + image_size);
+    enc->argb_scratch_ = mem;
+    mem = (uint32_t*)WEBP_ALIGN(mem + argb_scratch_size);
+    enc->transform_data_ = mem;
+    enc->current_width_ = width;
   }
-  enc->argb_ = mem;
-  mem += image_size;
-  enc->argb_scratch_ = mem;
-  mem += argb_scratch_size;
-  enc->transform_data_ = mem;
-  enc->current_width_ = width;
-
  Error:
   return err;
 }
 
-static void ApplyPalette(uint32_t* src, uint32_t* dst,
-                         uint32_t src_stride, uint32_t dst_stride,
-                         const uint32_t* palette, int palette_size,
-                         int width, int height, int xbits, uint8_t* row) {
+static void ClearTransformBuffer(VP8LEncoder* const enc) {
+  WebPSafeFree(enc->argb_);
+  enc->argb_ = NULL;
+}
+
+static WebPEncodingError MakeInputImageCopy(VP8LEncoder* const enc) {
+  WebPEncodingError err = VP8_ENC_OK;
+  const WebPPicture* const picture = enc->pic_;
+  const int width = picture->width;
+  const int height = picture->height;
+  int y;
+  err = AllocateTransformBuffer(enc, width, height);
+  if (err != VP8_ENC_OK) return err;
+  for (y = 0; y < height; ++y) {
+    memcpy(enc->argb_ + y * width,
+           picture->argb + y * picture->argb_stride,
+           width * sizeof(*enc->argb_));
+  }
+  assert(enc->current_width_ == width);
+  return VP8_ENC_OK;
+}
+
+// -----------------------------------------------------------------------------
+
+static void MapToPalette(const uint32_t palette[], int num_colors,
+                         uint32_t* const last_pix, int* const last_idx,
+                         const uint32_t* src, uint8_t* dst, int width) {
+  int x;
+  int prev_idx = *last_idx;
+  uint32_t prev_pix = *last_pix;
+  for (x = 0; x < width; ++x) {
+    const uint32_t pix = src[x];
+    if (pix != prev_pix) {
+      int i;
+      for (i = 0; i < num_colors; ++i) {
+        if (pix == palette[i]) {
+          prev_idx = i;
+          prev_pix = pix;
+          break;
+        }
+      }
+    }
+    dst[x] = prev_idx;
+  }
+  *last_idx = prev_idx;
+  *last_pix = prev_pix;
+}
+
+// Remap argb values in src[] to packed palette entries in dst[],
+// using an internally allocated temporary row buffer of 'width' bytes.
+// We assume that all src[] values have a corresponding entry in the palette.
+// Note: src[] can be the same as dst[]
+static WebPEncodingError ApplyPalette(const uint32_t* src, uint32_t src_stride,
+                                      uint32_t* dst, uint32_t dst_stride,
+                                      const uint32_t* palette, int palette_size,
+                                      int width, int height, int xbits) {
+  // TODO(skal): this tmp buffer is not needed if VP8LBundleColorMap() can be
+  // made to work in-place.
+  uint8_t* const tmp_row = (uint8_t*)WebPSafeMalloc(width, sizeof(*tmp_row));
   int i, x, y;
   int use_LUT = 1;
+
+  if (tmp_row == NULL) return VP8_ENC_ERROR_OUT_OF_MEMORY;
   for (i = 0; i < palette_size; ++i) {
     if ((palette[i] & 0xffff00ffu) != 0) {
       use_LUT = 0;
@@ -895,9 +1233,9 @@ static void ApplyPalette(uint32_t* src, uint32_t* dst,
     for (y = 0; y < height; ++y) {
       for (x = 0; x < width; ++x) {
         const int color = (src[x] >> 8) & 0xff;
-        row[x] = inv_palette[color];
+        tmp_row[x] = inv_palette[color];
       }
-      VP8LBundleColorMap(row, width, xbits, dst);
+      VP8LBundleColorMap(tmp_row, width, xbits, dst);
       src += src_stride;
       dst += dst_stride;
     }
@@ -906,41 +1244,28 @@ static void ApplyPalette(uint32_t* src, uint32_t* dst,
     uint32_t last_pix = palette[0];
     int last_idx = 0;
     for (y = 0; y < height; ++y) {
-      for (x = 0; x < width; ++x) {
-        const uint32_t pix = src[x];
-        if (pix != last_pix) {
-          for (i = 0; i < palette_size; ++i) {
-            if (pix == palette[i]) {
-              last_idx = i;
-              last_pix = pix;
-              break;
-            }
-          }
-        }
-        row[x] = last_idx;
-      }
-      VP8LBundleColorMap(row, width, xbits, dst);
+      MapToPalette(palette, palette_size, &last_pix, &last_idx,
+                   src, tmp_row, width);
+      VP8LBundleColorMap(tmp_row, width, xbits, dst);
       src += src_stride;
       dst += dst_stride;
     }
   }
+  WebPSafeFree(tmp_row);
+  return VP8_ENC_OK;
 }
 
 // Note: Expects "enc->palette_" to be set properly.
-// Also, "enc->palette_" will be modified after this call and should not be used
-// later.
-static WebPEncodingError EncodePalette(VP8LBitWriter* const bw,
-                                       VP8LEncoder* const enc, int quality) {
+static WebPEncodingError MapImageFromPalette(VP8LEncoder* const enc,
+                                             int in_place) {
   WebPEncodingError err = VP8_ENC_OK;
-  int i;
   const WebPPicture* const pic = enc->pic_;
-  uint32_t* src = pic->argb;
-  uint32_t* dst;
   const int width = pic->width;
   const int height = pic->height;
-  uint32_t* const palette = enc->palette_;
+  const uint32_t* const palette = enc->palette_;
+  const uint32_t* src = in_place ? enc->argb_ : pic->argb;
+  const int src_stride = in_place ? enc->current_width_ : pic->argb_stride;
   const int palette_size = enc->palette_size_;
-  uint8_t* row = NULL;
   int xbits;
 
   // Replace each input pixel by corresponding palette index.
@@ -952,67 +1277,74 @@ static WebPEncodingError EncodePalette(VP8LBitWriter* const bw,
   }
 
   err = AllocateTransformBuffer(enc, VP8LSubSampleSize(width, xbits), height);
-  if (err != VP8_ENC_OK) goto Error;
-  dst = enc->argb_;
-
-  row = (uint8_t*)WebPSafeMalloc(width, sizeof(*row));
-  if (row == NULL) return VP8_ENC_ERROR_OUT_OF_MEMORY;
+  if (err != VP8_ENC_OK) return err;
 
-  ApplyPalette(src, dst, pic->argb_stride, enc->current_width_,
-               palette, palette_size, width, height, xbits, row);
+  err = ApplyPalette(src, src_stride,
+                     enc->argb_, enc->current_width_,
+                     palette, palette_size, width, height, xbits);
+  return err;
+}
 
-  // Save palette to bitstream.
-  VP8LWriteBits(bw, 1, TRANSFORM_PRESENT);
-  VP8LWriteBits(bw, 2, COLOR_INDEXING_TRANSFORM);
-  assert(palette_size >= 1);
-  VP8LWriteBits(bw, 8, palette_size - 1);
+// Save palette_[] to bitstream.
+static WebPEncodingError EncodePalette(VP8LBitWriter* const bw,
+                                       VP8LEncoder* const enc) {
+  int i;
+  uint32_t tmp_palette[MAX_PALETTE_SIZE];
+  const int palette_size = enc->palette_size_;
+  const uint32_t* const palette = enc->palette_;
+  VP8LPutBits(bw, TRANSFORM_PRESENT, 1);
+  VP8LPutBits(bw, COLOR_INDEXING_TRANSFORM, 2);
+  assert(palette_size >= 1 && palette_size <= MAX_PALETTE_SIZE);
+  VP8LPutBits(bw, palette_size - 1, 8);
   for (i = palette_size - 1; i >= 1; --i) {
-    palette[i] = VP8LSubPixels(palette[i], palette[i - 1]);
+    tmp_palette[i] = VP8LSubPixels(palette[i], palette[i - 1]);
   }
-  err = EncodeImageNoHuffman(bw, palette, &enc->hash_chain_, enc->refs_,
-                             palette_size, 1, quality);
-
- Error:
-  WebPSafeFree(row);
-  return err;
+  tmp_palette[0] = palette[0];
+  return EncodeImageNoHuffman(bw, tmp_palette, &enc->hash_chain_, enc->refs_,
+                              palette_size, 1, 20 /* quality */);
 }
 
-// -----------------------------------------------------------------------------
+#ifdef WEBP_EXPERIMENTAL_FEATURES
 
-static int GetHistoBits(int method, int use_palette, int width, int height) {
-  const int hist_size = VP8LGetHistogramSize(MAX_COLOR_CACHE_BITS);
-  // Make tile size a function of encoding method (Range: 0 to 6).
-  int histo_bits = (use_palette ? 9 : 7) - method;
-  while (1) {
-    const int huff_image_size = VP8LSubSampleSize(width, histo_bits) *
-                                VP8LSubSampleSize(height, histo_bits);
-    if ((uint64_t)huff_image_size * hist_size <= MAX_HUFF_IMAGE_SIZE) break;
-    ++histo_bits;
-  }
-  return (histo_bits < MIN_HUFFMAN_BITS) ? MIN_HUFFMAN_BITS :
-         (histo_bits > MAX_HUFFMAN_BITS) ? MAX_HUFFMAN_BITS : histo_bits;
-}
+static WebPEncodingError EncodeDeltaPalettePredictorImage(
+    VP8LBitWriter* const bw, VP8LEncoder* const enc, int quality) {
+  const WebPPicture* const pic = enc->pic_;
+  const int width = pic->width;
+  const int height = pic->height;
 
-static int GetTransformBits(int method, int histo_bits) {
-  const int max_transform_bits = (method < 4) ? 6 : (method > 4) ? 4 : 5;
-  return (histo_bits > max_transform_bits) ? max_transform_bits : histo_bits;
-}
+  const int pred_bits = 5;
+  const int transform_width = VP8LSubSampleSize(width, pred_bits);
+  const int transform_height = VP8LSubSampleSize(height, pred_bits);
+  const int pred = 7;   // default is Predictor7 (Top/Left Average)
+  const int tiles_per_row = VP8LSubSampleSize(width, pred_bits);
+  const int tiles_per_col = VP8LSubSampleSize(height, pred_bits);
+  uint32_t* predictors;
+  int tile_x, tile_y;
+  WebPEncodingError err = VP8_ENC_OK;
 
-static int GetCacheBits(float quality) {
-  return (quality <= 25.f) ? 0 : 7;
-}
+  predictors = (uint32_t*)WebPSafeMalloc(tiles_per_col * tiles_per_row,
+                                         sizeof(*predictors));
+  if (predictors == NULL) return VP8_ENC_ERROR_OUT_OF_MEMORY;
 
-static void FinishEncParams(VP8LEncoder* const enc) {
-  const WebPConfig* const config = enc->config_;
-  const WebPPicture* const pic = enc->pic_;
-  const int method = config->method;
-  const float quality = config->quality;
-  const int use_palette = enc->use_palette_;
-  enc->histo_bits_ = GetHistoBits(method, use_palette, pic->width, pic->height);
-  enc->transform_bits_ = GetTransformBits(method, enc->histo_bits_);
-  enc->cache_bits_ = GetCacheBits(quality);
+  for (tile_y = 0; tile_y < tiles_per_col; ++tile_y) {
+    for (tile_x = 0; tile_x < tiles_per_row; ++tile_x) {
+      predictors[tile_y * tiles_per_row + tile_x] = 0xff000000u | (pred << 8);
+    }
+  }
+
+  VP8LPutBits(bw, TRANSFORM_PRESENT, 1);
+  VP8LPutBits(bw, PREDICTOR_TRANSFORM, 2);
+  VP8LPutBits(bw, pred_bits - 2, 3);
+  err = EncodeImageNoHuffman(bw, predictors, &enc->hash_chain_,
+                             (VP8LBackwardRefs*)enc->refs_,  // cast const away
+                             transform_width, transform_height,
+                             quality);
+  WebPSafeFree(predictors);
+  return err;
 }
 
+#endif // WEBP_EXPERIMENTAL_FEATURES
+
 // -----------------------------------------------------------------------------
 // VP8LEncoder
 
@@ -1026,7 +1358,7 @@ static VP8LEncoder* VP8LEncoderNew(const WebPConfig* const config,
   enc->config_ = config;
   enc->pic_ = picture;
 
-  VP8LDspInit();
+  VP8LEncDspInit();
 
   return enc;
 }
@@ -1036,7 +1368,7 @@ static void VP8LEncoderDelete(VP8LEncoder* enc) {
     VP8LHashChainClear(&enc->hash_chain_);
     VP8LBackwardRefsClear(&enc->refs_[0]);
     VP8LBackwardRefsClear(&enc->refs_[1]);
-    WebPSafeFree(enc->argb_);
+    ClearTransformBuffer(enc);
     WebPSafeFree(enc);
   }
 }
@@ -1049,10 +1381,15 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
                                    VP8LBitWriter* const bw) {
   WebPEncodingError err = VP8_ENC_OK;
   const int quality = (int)config->quality;
+  const int low_effort = (config->method == 0);
   const int width = picture->width;
   const int height = picture->height;
   VP8LEncoder* const enc = VP8LEncoderNew(config, picture);
   const size_t byte_position = VP8LBitWriterNumBytes(bw);
+  int use_near_lossless = 0;
+  int hdr_size = 0;
+  int data_size = 0;
+  int use_delta_palettization = 0;
 
   if (enc == NULL) {
     err = VP8_ENC_ERROR_OUT_OF_MEMORY;
@@ -1062,70 +1399,83 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
   // ---------------------------------------------------------------------------
   // Analyze image (entropy, num_palettes etc)
 
-  if (!AnalyzeAndInit(enc, config->image_hint)) {
+  if (!AnalyzeAndInit(enc)) {
     err = VP8_ENC_ERROR_OUT_OF_MEMORY;
     goto Error;
   }
 
-  FinishEncParams(enc);
-
-  if (enc->use_palette_) {
-    err = EncodePalette(bw, enc, quality);
-    if (err != VP8_ENC_OK) goto Error;
-    // Color cache is disabled for palette.
-    enc->cache_bits_ = 0;
+  // Apply near-lossless preprocessing.
+  use_near_lossless = !enc->use_palette_ && (config->near_lossless < 100);
+  if (use_near_lossless) {
+    if (!VP8ApplyNearLossless(width, height, picture->argb,
+                              config->near_lossless)) {
+      err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+      goto Error;
+    }
   }
 
-  // In case image is not packed.
-  if (enc->argb_ == NULL) {
-    int y;
-    err = AllocateTransformBuffer(enc, width, height);
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+  if (config->delta_palettization) {
+    enc->use_predict_ = 1;
+    enc->use_cross_color_ = 0;
+    enc->use_subtract_green_ = 0;
+    enc->use_palette_ = 1;
+    err = MakeInputImageCopy(enc);
     if (err != VP8_ENC_OK) goto Error;
-    assert(enc->argb_ != NULL);
-    for (y = 0; y < height; ++y) {
-      memcpy(enc->argb_ + y * width,
-             picture->argb + y * picture->argb_stride,
-             width * sizeof(*enc->argb_));
+    err = WebPSearchOptimalDeltaPalette(enc);
+    if (err != VP8_ENC_OK) goto Error;
+    if (enc->use_palette_) {
+      err = AllocateTransformBuffer(enc, width, height);
+      if (err != VP8_ENC_OK) goto Error;
+      err = EncodeDeltaPalettePredictorImage(bw, enc, quality);
+      if (err != VP8_ENC_OK) goto Error;
+      use_delta_palettization = 1;
     }
-    enc->current_width_ = width;
   }
+#endif  // WEBP_EXPERIMENTAL_FEATURES
 
-  // ---------------------------------------------------------------------------
-  // Apply transforms and write transform data.
-
-  err = EvalAndApplySubtractGreen(enc, enc->current_width_, height, bw);
-  if (err != VP8_ENC_OK) goto Error;
-
-  if (enc->use_predict_) {
-    err = ApplyPredictFilter(enc, enc->current_width_, height, quality, bw);
+  // Encode palette
+  if (enc->use_palette_) {
+    err = EncodePalette(bw, enc);
     if (err != VP8_ENC_OK) goto Error;
-  }
-
-  if (enc->use_cross_color_) {
-    err = ApplyCrossColorFilter(enc, enc->current_width_, height, quality, bw);
+    err = MapImageFromPalette(enc, use_delta_palettization);
     if (err != VP8_ENC_OK) goto Error;
   }
+  if (!use_delta_palettization) {
+    // In case image is not packed.
+    if (enc->argb_ == NULL) {
+      err = MakeInputImageCopy(enc);
+      if (err != VP8_ENC_OK) goto Error;
+    }
 
-  VP8LWriteBits(bw, 1, !TRANSFORM_PRESENT);  // No more transforms.
+    // -------------------------------------------------------------------------
+    // Apply transforms and write transform data.
 
-  // ---------------------------------------------------------------------------
-  // Estimate the color cache size.
+    if (enc->use_subtract_green_) {
+      ApplySubtractGreen(enc, enc->current_width_, height, bw);
+    }
 
-  if (enc->cache_bits_ > 0) {
-    if (!VP8LCalculateEstimateForCacheSize(enc->argb_, enc->current_width_,
-                                           height, quality, &enc->hash_chain_,
-                                           &enc->refs_[0], &enc->cache_bits_)) {
-      err = VP8_ENC_ERROR_OUT_OF_MEMORY;
-      goto Error;
+    if (enc->use_predict_) {
+      err = ApplyPredictFilter(enc, enc->current_width_, height, quality,
+                               low_effort, bw);
+      if (err != VP8_ENC_OK) goto Error;
+    }
+
+    if (enc->use_cross_color_) {
+      err = ApplyCrossColorFilter(enc, enc->current_width_,
+                                  height, quality, bw);
+      if (err != VP8_ENC_OK) goto Error;
     }
   }
 
+  VP8LPutBits(bw, !TRANSFORM_PRESENT, 1);  // No more transforms.
+
   // ---------------------------------------------------------------------------
   // Encode and write the transformed image.
-
   err = EncodeImageInternal(bw, enc->argb_, &enc->hash_chain_, enc->refs_,
-                            enc->current_width_, height, quality,
-                            enc->cache_bits_, enc->histo_bits_);
+                            enc->current_width_, height, quality, low_effort,
+                            &enc->cache_bits_, enc->histo_bits_, byte_position,
+                            &hdr_size, &data_size);
   if (err != VP8_ENC_OK) goto Error;
 
   if (picture->stats != NULL) {
@@ -1140,6 +1490,8 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
     stats->cache_bits = enc->cache_bits_;
     stats->palette_size = enc->palette_size_;
     stats->lossless_size = (int)(VP8LBitWriterNumBytes(bw) - byte_position);
+    stats->lossless_hdr_size = hdr_size;
+    stats->lossless_data_size = data_size;
   }
 
  Error:
@@ -1170,7 +1522,7 @@ int VP8LEncodeImage(const WebPConfig* const config,
   // Initialize BitWriter with size corresponding to 16 bpp to photo images and
   // 8 bpp for graphical images.
   initial_size = (config->image_hint == WEBP_HINT_GRAPH) ?
-                 width * height : width * height * 2;
+      width * height : width * height * 2;
   if (!VP8LBitWriterInit(&bw, initial_size)) {
     err = VP8_ENC_ERROR_OUT_OF_MEMORY;
     goto Error;
@@ -1234,7 +1586,7 @@ int VP8LEncodeImage(const WebPConfig* const config,
 
  Error:
   if (bw.error_) err = VP8_ENC_ERROR_OUT_OF_MEMORY;
-  VP8LBitWriterDestroy(&bw);
+  VP8LBitWriterWipeOut(&bw);
   if (err != VP8_ENC_OK) {
     WebPEncodingSetError(picture, err);
     return 0;
diff --git a/src/3rdparty/libwebp/src/enc/webpenc.c b/src/3rdparty/libwebp/src/enc/webpenc.c
index ca85e0b..fece736 100644
--- a/src/3rdparty/libwebp/src/enc/webpenc.c
+++ b/src/3rdparty/libwebp/src/enc/webpenc.c
@@ -16,9 +16,9 @@
 #include <string.h>
 #include <math.h>
 
+#include "./cost.h"
 #include "./vp8enci.h"
 #include "./vp8li.h"
-#include "./cost.h"
 #include "../utils/utils.h"
 
 // #define PRINT_MEMORY_INFO
@@ -38,14 +38,14 @@ int WebPGetEncoderVersion(void) {
 //------------------------------------------------------------------------------
 
 static void ResetSegmentHeader(VP8Encoder* const enc) {
-  VP8SegmentHeader* const hdr = &enc->segment_hdr_;
+  VP8EncSegmentHeader* const hdr = &enc->segment_hdr_;
   hdr->num_segments_ = enc->config_->segments;
   hdr->update_map_  = (hdr->num_segments_ > 1);
   hdr->size_ = 0;
 }
 
 static void ResetFilterHeader(VP8Encoder* const enc) {
-  VP8FilterHeader* const hdr = &enc->filter_hdr_;
+  VP8EncFilterHeader* const hdr = &enc->filter_hdr_;
   hdr->simple_ = 1;
   hdr->level_ = 0;
   hdr->sharpness_ = 0;
@@ -79,7 +79,9 @@ static void ResetBoundaryPredictions(VP8Encoder* const enc) {
 //-------------------+---+---+---+---+---+---+---+
 // basic rd-opt      |   |   |   | x | x | x | x |
 //-------------------+---+---+---+---+---+---+---+
-// disto-score i4/16 |   |   | x |   |   |   |   |
+// disto-refine i4/16| x | x | x |   |   |   |   |
+//-------------------+---+---+---+---+---+---+---+
+// disto-refine uv   |   | x | x |   |   |   |   |
 //-------------------+---+---+---+---+---+---+---+
 // rd-opt i4/16      |   |   | ~ | x | x | x | x |
 //-------------------+---+---+---+---+---+---+---+
@@ -131,35 +133,36 @@ static void MapConfigToTools(VP8Encoder* const enc) {
 //       VP8EncIterator: 3360
 //         VP8ModeScore: 872
 //       VP8SegmentInfo: 732
-//             VP8Proba: 18352
+//          VP8EncProba: 18352
 //              LFStats: 2048
 // Picture size (yuv): 419328
 
 static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
                                   WebPPicture* const picture) {
+  VP8Encoder* enc;
   const int use_filter =
       (config->filter_strength > 0) || (config->autofilter > 0);
   const int mb_w = (picture->width + 15) >> 4;
   const int mb_h = (picture->height + 15) >> 4;
   const int preds_w = 4 * mb_w + 1;
   const int preds_h = 4 * mb_h + 1;
-  const size_t preds_size = preds_w * preds_h * sizeof(uint8_t);
+  const size_t preds_size = preds_w * preds_h * sizeof(*enc->preds_);
   const int top_stride = mb_w * 16;
-  const size_t nz_size = (mb_w + 1) * sizeof(uint32_t) + ALIGN_CST;
-  const size_t info_size = mb_w * mb_h * sizeof(VP8MBInfo);
-  const size_t samples_size = 2 * top_stride * sizeof(uint8_t)  // top-luma/u/v
-                            + ALIGN_CST;                        // align all
+  const size_t nz_size = (mb_w + 1) * sizeof(*enc->nz_) + WEBP_ALIGN_CST;
+  const size_t info_size = mb_w * mb_h * sizeof(*enc->mb_info_);
+  const size_t samples_size =
+      2 * top_stride * sizeof(*enc->y_top_)  // top-luma/u/v
+      + WEBP_ALIGN_CST;                      // align all
   const size_t lf_stats_size =
-      config->autofilter ? sizeof(LFStats) + ALIGN_CST : 0;
-  VP8Encoder* enc;
+      config->autofilter ? sizeof(*enc->lf_stats_) + WEBP_ALIGN_CST : 0;
   uint8_t* mem;
-  const uint64_t size = (uint64_t)sizeof(VP8Encoder)   // main struct
-                      + ALIGN_CST                      // cache alignment
-                      + info_size                      // modes info
-                      + preds_size                     // prediction modes
-                      + samples_size                   // top/left samples
-                      + nz_size                        // coeff context bits
-                      + lf_stats_size;                 // autofilter stats
+  const uint64_t size = (uint64_t)sizeof(*enc)   // main struct
+                      + WEBP_ALIGN_CST           // cache alignment
+                      + info_size                // modes info
+                      + preds_size               // prediction modes
+                      + samples_size             // top/left samples
+                      + nz_size                  // coeff context bits
+                      + lf_stats_size;           // autofilter stats
 
 #ifdef PRINT_MEMORY_INFO
   printf("===================================\n");
@@ -171,16 +174,16 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
          "            non-zero: %ld\n"
          "            lf-stats: %ld\n"
          "               total: %ld\n",
-         sizeof(VP8Encoder) + ALIGN_CST, info_size,
+         sizeof(*enc) + WEBP_ALIGN_CST, info_size,
          preds_size, samples_size, nz_size, lf_stats_size, size);
   printf("Transient object sizes:\n"
          "      VP8EncIterator: %ld\n"
          "        VP8ModeScore: %ld\n"
          "      VP8SegmentInfo: %ld\n"
-         "            VP8Proba: %ld\n"
+         "         VP8EncProba: %ld\n"
          "             LFStats: %ld\n",
          sizeof(VP8EncIterator), sizeof(VP8ModeScore),
-         sizeof(VP8SegmentInfo), sizeof(VP8Proba),
+         sizeof(VP8SegmentInfo), sizeof(VP8EncProba),
          sizeof(LFStats));
   printf("Picture size (yuv): %ld\n",
          mb_w * mb_h * 384 * sizeof(uint8_t));
@@ -192,7 +195,7 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
     return NULL;
   }
   enc = (VP8Encoder*)mem;
-  mem = (uint8_t*)DO_ALIGN(mem + sizeof(*enc));
+  mem = (uint8_t*)WEBP_ALIGN(mem + sizeof(*enc));
   memset(enc, 0, sizeof(*enc));
   enc->num_parts_ = 1 << config->partitions;
   enc->mb_w_ = mb_w;
@@ -201,14 +204,14 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
   enc->mb_info_ = (VP8MBInfo*)mem;
   mem += info_size;
   enc->preds_ = ((uint8_t*)mem) + 1 + enc->preds_w_;
-  mem += preds_w * preds_h * sizeof(uint8_t);
-  enc->nz_ = 1 + (uint32_t*)DO_ALIGN(mem);
+  mem += preds_size;
+  enc->nz_ = 1 + (uint32_t*)WEBP_ALIGN(mem);
   mem += nz_size;
-  enc->lf_stats_ = lf_stats_size ? (LFStats*)DO_ALIGN(mem) : NULL;
+  enc->lf_stats_ = lf_stats_size ? (LFStats*)WEBP_ALIGN(mem) : NULL;
   mem += lf_stats_size;
 
   // top samples (all 16-aligned)
-  mem = (uint8_t*)DO_ALIGN(mem);
+  mem = (uint8_t*)WEBP_ALIGN(mem);
   enc->y_top_ = (uint8_t*)mem;
   enc->uv_top_ = enc->y_top_ + top_stride;
   mem += 2 * top_stride;
@@ -225,8 +228,7 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
   ResetSegmentHeader(enc);
   ResetFilterHeader(enc);
   ResetBoundaryPredictions(enc);
-  VP8GetResidualCostInit();
-  VP8SetResidualCoeffsInit();
+  VP8EncDspCostInit();
   VP8EncInitAlpha(enc);
 
   // lower quality means smaller output -> we modulate a little the page
@@ -326,14 +328,17 @@ int WebPEncode(const WebPConfig* config, WebPPicture* pic) {
 
   if (!config->lossless) {
     VP8Encoder* enc = NULL;
+
+    if (!config->exact) {
+      WebPCleanupTransparentArea(pic);
+    }
+
     if (pic->use_argb || pic->y == NULL || pic->u == NULL || pic->v == NULL) {
       // Make sure we have YUVA samples.
       if (config->preprocessing & 4) {
-#if WEBP_ENCODER_ABI_VERSION > 0x0204
         if (!WebPPictureSmartARGBToYUVA(pic)) {
           return 0;
         }
-#endif
       } else {
         float dithering = 0.f;
         if (config->preprocessing & 2) {
@@ -375,6 +380,10 @@ int WebPEncode(const WebPConfig* config, WebPPicture* pic) {
       return 0;
     }
 
+    if (!config->exact) {
+      WebPCleanupTransparentAreaLossless(pic);
+    }
+
     ok = VP8LEncodeImage(config, pic);  // Sets pic->error in case of problem.
   }
author	Liang Qi <liang.qi@theqtcompany.com>	2016-03-09 10:22:13 +0100
committer	Liang Qi <liang.qi@theqtcompany.com>	2016-03-11 20:05:19 +0000
commit	b114e552211456fbde3ff6ca2da21cbc8d1e90e2 (patch)
tree	9c033ea7bcc9cc7314eaa8aff57356b2ae301257 /src/3rdparty/libwebp/src/enc
parent	1d4f24820c0fff474d524e006d715e13e409a4b8 (diff)