24 files changed, 3692 insertions, 2629 deletions
diff --git a/src/3rdparty/libwebp/src/enc/alpha.c b/src/3rdparty/libwebp/src/enc/alpha.c
index 21d4b5c..79cb94d 100644
--- a/src/3rdparty/libwebp/src/enc/alpha.c
+++ b/src/3rdparty/libwebp/src/enc/alpha.c
@@ -17,6 +17,7 @@
 #include "./vp8enci.h"
 #include "../utils/filters.h"
 #include "../utils/quant_levels.h"
+#include "../utils/utils.h"
 #include "../webp/format_constants.h"
 
 // -----------------------------------------------------------------------------
@@ -34,7 +35,7 @@
 //
 // 'output' corresponds to the buffer containing compressed alpha data.
 //          This buffer is allocated by this method and caller should call
-//          free(*output) when done.
+//          WebPSafeFree(*output) when done.
 // 'output_size' corresponds to size of this compressed alpha buffer.
 //
 // Returns 1 on successfully encoding the alpha and
@@ -46,12 +47,11 @@
 
 static int EncodeLossless(const uint8_t* const data, int width, int height,
                           int effort_level,  // in [0..6] range
-                          VP8BitWriter* const bw,
+                          VP8LBitWriter* const bw,
                           WebPAuxStats* const stats) {
   int ok = 0;
   WebPConfig config;
   WebPPicture picture;
-  VP8LBitWriter tmp_bw;
 
   WebPPictureInit(&picture);
   picture.width = width;
@@ -83,16 +83,15 @@ static int EncodeLossless(const uint8_t* const data, int width, int height,
   config.quality = 8.f * effort_level;
   assert(config.quality >= 0 && config.quality <= 100.f);
 
-  ok = VP8LBitWriterInit(&tmp_bw, (width * height) >> 3);
-  ok = ok && (VP8LEncodeStream(&config, &picture, &tmp_bw) == VP8_ENC_OK);
+  ok = (VP8LEncodeStream(&config, &picture, bw) == VP8_ENC_OK);
   WebPPictureFree(&picture);
-  if (ok) {
-    const uint8_t* const buffer = VP8LBitWriterFinish(&tmp_bw);
-    const size_t buffer_size = VP8LBitWriterNumBytes(&tmp_bw);
-    VP8BitWriterAppend(bw, buffer, buffer_size);
+  ok = ok && !bw->error_;
+  if (!ok) {
+    VP8LBitWriterDestroy(bw);
+    return 0;
   }
-  VP8LBitWriterDestroy(&tmp_bw);
-  return ok && !bw->error_;
+  return 1;
+
 }
 
 // -----------------------------------------------------------------------------
@@ -114,8 +113,10 @@ static int EncodeAlphaInternal(const uint8_t* const data, int width, int height,
   const uint8_t* alpha_src;
   WebPFilterFunc filter_func;
   uint8_t header;
-  size_t expected_size;
   const size_t data_size = width * height;
+  const uint8_t* output = NULL;
+  size_t output_size = 0;
+  VP8LBitWriter tmp_bw;
 
   assert((uint64_t)data_size == (uint64_t)width * height);  // as per spec
   assert(filter >= 0 && filter < WEBP_FILTER_LAST);
@@ -124,15 +125,6 @@ static int EncodeAlphaInternal(const uint8_t* const data, int width, int height,
   assert(sizeof(header) == ALPHA_HEADER_LEN);
   // TODO(skal): have a common function and #define's to validate alpha params.
 
-  expected_size =
-      (method == ALPHA_NO_COMPRESSION) ? (ALPHA_HEADER_LEN + data_size)
-                                       : (data_size >> 5);
-  header = method | (filter << 2);
-  if (reduce_levels) header |= ALPHA_PREPROCESSED_LEVELS << 4;
-
-  VP8BitWriterInit(&result->bw, expected_size);
-  VP8BitWriterAppend(&result->bw, &header, ALPHA_HEADER_LEN);
-
   filter_func = WebPFilters[filter];
   if (filter_func != NULL) {
     filter_func(data, width, height, width, tmp_alpha);
@@ -141,14 +133,42 @@ static int EncodeAlphaInternal(const uint8_t* const data, int width, int height,
     alpha_src = data;
   }
 
+  if (method != ALPHA_NO_COMPRESSION) {
+    ok = VP8LBitWriterInit(&tmp_bw, data_size >> 3);
+    ok = ok && EncodeLossless(alpha_src, width, height, effort_level,
+                              &tmp_bw, &result->stats);
+    if (ok) {
+      output = VP8LBitWriterFinish(&tmp_bw);
+      output_size = VP8LBitWriterNumBytes(&tmp_bw);
+      if (output_size > data_size) {
+        // compressed size is larger than source! Revert to uncompressed mode.
+        method = ALPHA_NO_COMPRESSION;
+        VP8LBitWriterDestroy(&tmp_bw);
+      }
+    } else {
+      VP8LBitWriterDestroy(&tmp_bw);
+      return 0;
+    }
+  }
+
   if (method == ALPHA_NO_COMPRESSION) {
-    ok = VP8BitWriterAppend(&result->bw, alpha_src, width * height);
-    ok = ok && !result->bw.error_;
-  } else {
-    ok = EncodeLossless(alpha_src, width, height, effort_level,
-                        &result->bw, &result->stats);
-    VP8BitWriterFinish(&result->bw);
+    output = alpha_src;
+    output_size = data_size;
+    ok = 1;
+  }
+
+  // Emit final result.
+  header = method | (filter << 2);
+  if (reduce_levels) header |= ALPHA_PREPROCESSED_LEVELS << 4;
+
+  VP8BitWriterInit(&result->bw, ALPHA_HEADER_LEN + output_size);
+  ok = ok && VP8BitWriterAppend(&result->bw, &header, ALPHA_HEADER_LEN);
+  ok = ok && VP8BitWriterAppend(&result->bw, output, output_size);
+
+  if (method != ALPHA_NO_COMPRESSION) {
+    VP8LBitWriterDestroy(&tmp_bw);
   }
+  ok = ok && !result->bw.error_;
   result->score = VP8BitWriterSize(&result->bw);
   return ok;
 }
@@ -231,7 +251,7 @@ static int ApplyFiltersAndEncode(const uint8_t* alpha, int width, int height,
       GetFilterMap(alpha, width, height, filter, effort_level);
   InitFilterTrial(&best);
   if (try_map != FILTER_TRY_NONE) {
-    uint8_t* filtered_alpha =  (uint8_t*)malloc(data_size);
+    uint8_t* filtered_alpha =  (uint8_t*)WebPSafeMalloc(1ULL, data_size);
     if (filtered_alpha == NULL) return 0;
 
     for (filter = WEBP_FILTER_NONE; ok && try_map; ++filter, try_map >>= 1) {
@@ -248,7 +268,7 @@ static int ApplyFiltersAndEncode(const uint8_t* alpha, int width, int height,
         }
       }
     }
-    free(filtered_alpha);
+    WebPSafeFree(filtered_alpha);
   } else {
     ok = EncodeAlphaInternal(alpha, width, height, method, WEBP_FILTER_NONE,
                              reduce_levels, effort_level, NULL, &best);
@@ -298,7 +318,7 @@ static int EncodeAlpha(VP8Encoder* const enc,
     filter = WEBP_FILTER_NONE;
   }
 
-  quant_alpha = (uint8_t*)malloc(data_size);
+  quant_alpha = (uint8_t*)WebPSafeMalloc(1ULL, data_size);
   if (quant_alpha == NULL) {
     return 0;
   }
@@ -325,7 +345,7 @@ static int EncodeAlpha(VP8Encoder* const enc,
     }
   }
 
-  free(quant_alpha);
+  WebPSafeFree(quant_alpha);
   return ok;
 }
 
@@ -346,7 +366,7 @@ static int CompressAlphaJob(VP8Encoder* const enc, void* dummy) {
     return 0;
   }
   if (alpha_size != (uint32_t)alpha_size) {  // Sanity check.
-    free(alpha_data);
+    WebPSafeFree(alpha_data);
     return 0;
   }
   enc->alpha_data_size_ = (uint32_t)alpha_size;
@@ -361,7 +381,7 @@ void VP8EncInitAlpha(VP8Encoder* const enc) {
   enc->alpha_data_size_ = 0;
   if (enc->thread_level_ > 0) {
     WebPWorker* const worker = &enc->alpha_worker_;
-    WebPWorkerInit(worker);
+    WebPGetWorkerInterface()->Init(worker);
     worker->data1 = enc;
     worker->data2 = NULL;
     worker->hook = (WebPWorkerHook)CompressAlphaJob;
@@ -372,10 +392,11 @@ int VP8EncStartAlpha(VP8Encoder* const enc) {
   if (enc->has_alpha_) {
     if (enc->thread_level_ > 0) {
       WebPWorker* const worker = &enc->alpha_worker_;
-      if (!WebPWorkerReset(worker)) {    // Makes sure worker is good to go.
+      // Makes sure worker is good to go.
+      if (!WebPGetWorkerInterface()->Reset(worker)) {
         return 0;
       }
-      WebPWorkerLaunch(worker);
+      WebPGetWorkerInterface()->Launch(worker);
       return 1;
     } else {
       return CompressAlphaJob(enc, NULL);   // just do the job right away
@@ -388,7 +409,7 @@ int VP8EncFinishAlpha(VP8Encoder* const enc) {
   if (enc->has_alpha_) {
     if (enc->thread_level_ > 0) {
       WebPWorker* const worker = &enc->alpha_worker_;
-      if (!WebPWorkerSync(worker)) return 0;  // error
+      if (!WebPGetWorkerInterface()->Sync(worker)) return 0;  // error
     }
   }
   return WebPReportProgress(enc->pic_, enc->percent_ + 20, &enc->percent_);
@@ -398,10 +419,12 @@ int VP8EncDeleteAlpha(VP8Encoder* const enc) {
   int ok = 1;
   if (enc->thread_level_ > 0) {
     WebPWorker* const worker = &enc->alpha_worker_;
-    ok = WebPWorkerSync(worker);  // finish anything left in flight
-    WebPWorkerEnd(worker);  // still need to end the worker, even if !ok
+    // finish anything left in flight
+    ok = WebPGetWorkerInterface()->Sync(worker);
+    // still need to end the worker, even if !ok
+    WebPGetWorkerInterface()->End(worker);
   }
-  free(enc->alpha_data_);
+  WebPSafeFree(enc->alpha_data_);
   enc->alpha_data_ = NULL;
   enc->alpha_data_size_ = 0;
   enc->has_alpha_ = 0;
diff --git a/src/3rdparty/libwebp/src/enc/analysis.c b/src/3rdparty/libwebp/src/enc/analysis.c
index 7d4cfdc..e019465 100644
--- a/src/3rdparty/libwebp/src/enc/analysis.c
+++ b/src/3rdparty/libwebp/src/enc/analysis.c
@@ -30,7 +30,7 @@ static void SmoothSegmentMap(VP8Encoder* const enc) {
   const int w = enc->mb_w_;
   const int h = enc->mb_h_;
   const int majority_cnt_3_x_3_grid = 5;
-  uint8_t* const tmp = (uint8_t*)WebPSafeMalloc((uint64_t)w * h, sizeof(*tmp));
+  uint8_t* const tmp = (uint8_t*)WebPSafeMalloc(w * h, sizeof(*tmp));
   assert((uint64_t)(w * h) == (uint64_t)w * h);   // no overflow, as per spec
 
   if (tmp == NULL) return;
@@ -63,7 +63,7 @@ static void SmoothSegmentMap(VP8Encoder* const enc) {
       mb->segment_ = tmp[x + y * w];
     }
   }
-  free(tmp);
+  WebPSafeFree(tmp);
 }
 
 //------------------------------------------------------------------------------
@@ -141,7 +141,11 @@ static void MergeHistograms(const VP8Histogram* const in,
 
 static void AssignSegments(VP8Encoder* const enc,
                            const int alphas[MAX_ALPHA + 1]) {
-  const int nb = enc->segment_hdr_.num_segments_;
+  // 'num_segments_' was validated earlier and is <= NUM_MB_SEGMENTS, but an
+  // explicit check is needed to avoid a spurious warning about 'n + 1'
+  // exceeding the bounds of 'centers' with some compilers (seen with gcc-4.9).
+  const int nb = (enc->segment_hdr_.num_segments_ < NUM_MB_SEGMENTS) ?
+                 enc->segment_hdr_.num_segments_ : NUM_MB_SEGMENTS;
   int centers[NUM_MB_SEGMENTS];
   int weighted_average = 0;
   int map[MAX_ALPHA + 1];
@@ -151,6 +155,7 @@ static void AssignSegments(VP8Encoder* const enc,
   int accum[NUM_MB_SEGMENTS], dist_accum[NUM_MB_SEGMENTS];
 
   assert(nb >= 1);
+  assert(nb <= NUM_MB_SEGMENTS);
 
   // bracket the input
   for (n = 0; n <= MAX_ALPHA && alphas[n] == 0; ++n) {}
@@ -225,18 +230,15 @@ static void AssignSegments(VP8Encoder* const enc,
 // susceptibility and set best modes for this macroblock.
 // Segment assignment is done later.
 
-// Number of modes to inspect for alpha_ evaluation. For high-quality settings
-// (method >= FAST_ANALYSIS_METHOD) we don't need to test all the possible modes
-// during the analysis phase.
-#define FAST_ANALYSIS_METHOD 4  // method above which we do partial analysis
+// Number of modes to inspect for alpha_ evaluation. We don't need to test all
+// the possible modes during the analysis phase: we risk falling into a local
+// optimum, or being subject to boundary effects.
 #define MAX_INTRA16_MODE 2
 #define MAX_INTRA4_MODE  2
 #define MAX_UV_MODE      2
 
 static int MBAnalyzeBestIntra16Mode(VP8EncIterator* const it) {
-  const int max_mode =
-      (it->enc_->method_ >= FAST_ANALYSIS_METHOD) ? MAX_INTRA16_MODE
-                                                  : NUM_PRED_MODES;
+  const int max_mode = MAX_INTRA16_MODE;
   int mode;
   int best_alpha = DEFAULT_ALPHA;
   int best_mode = 0;
@@ -262,9 +264,7 @@ static int MBAnalyzeBestIntra16Mode(VP8EncIterator* const it) {
 static int MBAnalyzeBestIntra4Mode(VP8EncIterator* const it,
                                    int best_alpha) {
   uint8_t modes[16];
-  const int max_mode =
-      (it->enc_->method_ >= FAST_ANALYSIS_METHOD) ? MAX_INTRA4_MODE
-                                                  : NUM_BMODES;
+  const int max_mode = MAX_INTRA4_MODE;
   int i4_alpha;
   VP8Histogram total_histo = { { 0 } };
   int cur_histo = 0;
@@ -306,10 +306,9 @@ static int MBAnalyzeBestIntra4Mode(VP8EncIterator* const it,
 static int MBAnalyzeBestUVMode(VP8EncIterator* const it) {
   int best_alpha = DEFAULT_ALPHA;
   int best_mode = 0;
-  const int max_mode =
-      (it->enc_->method_ >= FAST_ANALYSIS_METHOD) ? MAX_UV_MODE
-                                                  : NUM_PRED_MODES;
+  const int max_mode = MAX_UV_MODE;
   int mode;
+
   VP8MakeChroma8Preds(it);
   for (mode = 0; mode < max_mode; ++mode) {
     VP8Histogram histo = { { 0 } };
@@ -425,7 +424,7 @@ static void MergeJobs(const SegmentJob* const src, SegmentJob* const dst) {
 // initialize the job struct with some TODOs
 static void InitSegmentJob(VP8Encoder* const enc, SegmentJob* const job,
                            int start_row, int end_row) {
-  WebPWorkerInit(&job->worker);
+  WebPGetWorkerInterface()->Init(&job->worker);
   job->worker.data1 = job;
   job->worker.data2 = &job->it;
   job->worker.hook = (WebPWorkerHook)DoSegmentsJob;
@@ -458,6 +457,8 @@ int VP8EncAnalyze(VP8Encoder* const enc) {
 #else
     const int do_mt = 0;
 #endif
+    const WebPWorkerInterface* const worker_interface =
+        WebPGetWorkerInterface();
     SegmentJob main_job;
     if (do_mt) {
       SegmentJob side_job;
@@ -467,23 +468,23 @@ int VP8EncAnalyze(VP8Encoder* const enc) {
       InitSegmentJob(enc, &side_job, split_row, last_row);
       // we don't need to call Reset() on main_job.worker, since we're calling
       // WebPWorkerExecute() on it
-      ok &= WebPWorkerReset(&side_job.worker);
+      ok &= worker_interface->Reset(&side_job.worker);
       // launch the two jobs in parallel
       if (ok) {
-        WebPWorkerLaunch(&side_job.worker);
-        WebPWorkerExecute(&main_job.worker);
-        ok &= WebPWorkerSync(&side_job.worker);
-        ok &= WebPWorkerSync(&main_job.worker);
+        worker_interface->Launch(&side_job.worker);
+        worker_interface->Execute(&main_job.worker);
+        ok &= worker_interface->Sync(&side_job.worker);
+        ok &= worker_interface->Sync(&main_job.worker);
       }
-      WebPWorkerEnd(&side_job.worker);
+      worker_interface->End(&side_job.worker);
       if (ok) MergeJobs(&side_job, &main_job);  // merge results together
     } else {
       // Even for single-thread case, we use the generic Worker tools.
       InitSegmentJob(enc, &main_job, 0, last_row);
-      WebPWorkerExecute(&main_job.worker);
-      ok &= WebPWorkerSync(&main_job.worker);
+      worker_interface->Execute(&main_job.worker);
+      ok &= worker_interface->Sync(&main_job.worker);
     }
-    WebPWorkerEnd(&main_job.worker);
+    worker_interface->End(&main_job.worker);
     if (ok) {
       enc->alpha_ = main_job.alpha / total_mb;
       enc->uv_alpha_ = main_job.uv_alpha / total_mb;
diff --git a/src/3rdparty/libwebp/src/enc/backward_references.c b/src/3rdparty/libwebp/src/enc/backward_references.c
index 77b4be7..a3c30aa 100644
--- a/src/3rdparty/libwebp/src/enc/backward_references.c
+++ b/src/3rdparty/libwebp/src/enc/backward_references.c
@@ -12,7 +12,6 @@
 
 #include <assert.h>
 #include <math.h>
-#include <stdio.h>
 
 #include "./backward_references.h"
 #include "./histogram.h"
@@ -22,10 +21,12 @@
 
 #define VALUES_IN_BYTE 256
 
-#define HASH_BITS 18
-#define HASH_SIZE (1 << HASH_BITS)
 #define HASH_MULTIPLIER (0xc6a4a7935bd1e995ULL)
 
+#define MIN_BLOCK_SIZE 256  // minimum block size for backward references
+
+#define MAX_ENTROPY    (1e30f)
+
 // 1M window (4M bytes) minus 120 special codes for short distances.
 #define WINDOW_SIZE ((1 << 20) - 120)
 
@@ -33,14 +34,6 @@
 #define MIN_LENGTH 2
 #define MAX_LENGTH 4096
 
-typedef struct {
-  // Stores the most recently added position with the given hash value.
-  int32_t hash_to_first_index_[HASH_SIZE];
-  // chain_[pos] stores the previous position with the same hash value
-  // for every pixel in the image.
-  int32_t* chain_;
-} HashChain;
-
 // -----------------------------------------------------------------------------
 
 static const uint8_t plane_to_code_lut[128] = {
@@ -78,65 +71,152 @@ static WEBP_INLINE int FindMatchLength(const uint32_t* const array1,
 // -----------------------------------------------------------------------------
 //  VP8LBackwardRefs
 
-void VP8LInitBackwardRefs(VP8LBackwardRefs* const refs) {
-  if (refs != NULL) {
-    refs->refs = NULL;
-    refs->size = 0;
-    refs->max_size = 0;
+struct PixOrCopyBlock {
+  PixOrCopyBlock* next_;   // next block in the chain (or NULL for the last)
+  PixOrCopy* start_;       // first element of the block's PixOrCopy storage
+  int size_;               // number of elements of start_[] currently in use
+};
+
+static void ClearBackwardRefs(VP8LBackwardRefs* const refs) {
+  assert(refs != NULL);
+  if (refs->tail_ != NULL) {
+    *refs->tail_ = refs->free_blocks_;  // free list goes after used blocks
   }
+  refs->free_blocks_ = refs->refs_;     // all blocks are now free-listed
+  refs->tail_ = &refs->refs_;           // next new block becomes the head
+  refs->last_block_ = NULL;
+  refs->refs_ = NULL;                   // nothing is freed here
 }
 
-void VP8LClearBackwardRefs(VP8LBackwardRefs* const refs) {
-  if (refs != NULL) {
-    free(refs->refs);
-    VP8LInitBackwardRefs(refs);
+void VP8LBackwardRefsClear(VP8LBackwardRefs* const refs) {
+  assert(refs != NULL);
+  ClearBackwardRefs(refs);   // first move every block to the free list...
+  while (refs->free_blocks_ != NULL) {   // ...then release them one by one
+    PixOrCopyBlock* const next = refs->free_blocks_->next_;
+    WebPSafeFree(refs->free_blocks_);
+    refs->free_blocks_ = next;
   }
 }
 
-int VP8LBackwardRefsAlloc(VP8LBackwardRefs* const refs, int max_size) {
+void VP8LBackwardRefsInit(VP8LBackwardRefs* const refs, int block_size) {
   assert(refs != NULL);
-  refs->size = 0;
-  refs->max_size = 0;
-  refs->refs = (PixOrCopy*)WebPSafeMalloc((uint64_t)max_size,
-                                          sizeof(*refs->refs));
-  if (refs->refs == NULL) return 0;
-  refs->max_size = max_size;
+  memset(refs, 0, sizeof(*refs));
+  refs->tail_ = &refs->refs_;
+  refs->block_size_ =
+      (block_size < MIN_BLOCK_SIZE) ? MIN_BLOCK_SIZE : block_size;
+}
+
+VP8LRefsCursor VP8LRefsCursorInit(const VP8LBackwardRefs* const refs) {
+  VP8LRefsCursor c;
+  c.cur_block_ = refs->refs_;   // start at the first block, if any
+  if (refs->refs_ != NULL) {
+    c.cur_pos = c.cur_block_->start_;
+    c.last_pos_ = c.cur_pos + c.cur_block_->size_;  // one past the last entry
+  } else {
+    c.cur_pos = NULL;     // no block: both positions are NULL
+    c.last_pos_ = NULL;
+  }
+  return c;
+}
+
+void VP8LRefsCursorNextBlock(VP8LRefsCursor* const c) {
+  PixOrCopyBlock* const b = c->cur_block_->next_;  // needs cur_block_ != NULL
+  c->cur_pos = (b == NULL) ? NULL : b->start_;
+  c->last_pos_ = (b == NULL) ? NULL : b->start_ + b->size_;
+  c->cur_block_ = b;   // NULL once past the last block
+}
+
+// Appends an empty block to 'refs', recycled from the free list when possible.
+static PixOrCopyBlock* BackwardRefsNewBlock(VP8LBackwardRefs* const refs) {
+  PixOrCopyBlock* b = refs->free_blocks_;
+  if (b == NULL) {   // allocate new memory chunk
+    const size_t total_size =
+        sizeof(*b) + refs->block_size_ * sizeof(*b->start_);  // header + slots
+    b = (PixOrCopyBlock*)WebPSafeMalloc(1ULL, total_size);
+    if (b == NULL) {
+      refs->error_ |= 1;   // callers test refs->error_ afterwards
+      return NULL;
+    }
+    b->start_ = (PixOrCopy*)((uint8_t*)b + sizeof(*b));  // not always aligned
+  } else {  // recycle from free-list
+    refs->free_blocks_ = b->next_;
+  }
+  *refs->tail_ = b;          // link the block at the end of the used chain
+  refs->tail_ = &b->next_;   // the following block will be linked after 'b'
+  refs->last_block_ = b;
+  b->next_ = NULL;
+  b->size_ = 0;              // recycled blocks restart empty
+  return b;
+}
+
+static WEBP_INLINE void BackwardRefsCursorAdd(VP8LBackwardRefs* const refs,
+                                              const PixOrCopy v) {
+  PixOrCopyBlock* b = refs->last_block_;
+  if (b == NULL || b->size_ == refs->block_size_) {   // no block yet, or full
+    b = BackwardRefsNewBlock(refs);
+    if (b == NULL) return;   // refs->error_ is set
+  }
+  b->start_[b->size_++] = v;   // append to the current block
+}
+
+int VP8LBackwardRefsCopy(const VP8LBackwardRefs* const src,
+                         VP8LBackwardRefs* const dst) {
+  const PixOrCopyBlock* b = src->refs_;
+  ClearBackwardRefs(dst);   // blocks held by 'dst' are recycled
+  assert(src->block_size_ == dst->block_size_);   // so a src block always fits
+  while (b != NULL) {
+    PixOrCopyBlock* const new_b = BackwardRefsNewBlock(dst);
+    if (new_b == NULL) return 0;   // dst->error_ is set
+    memcpy(new_b->start_, b->start_, b->size_ * sizeof(*b->start_));
+    new_b->size_ = b->size_;
+    b = b->next_;
+  }
   return 1;
 }
 
 // -----------------------------------------------------------------------------
 // Hash chains
 
-static WEBP_INLINE uint64_t GetPixPairHash64(const uint32_t* const argb) {
-  uint64_t key = ((uint64_t)(argb[1]) << 32) | argb[0];
-  key = (key * HASH_MULTIPLIER) >> (64 - HASH_BITS);
-  return key;
-}
-
-static int HashChainInit(HashChain* const p, int size) {
+// initialize as empty
+static void HashChainInit(VP8LHashChain* const p) {
   int i;
-  p->chain_ = (int*)WebPSafeMalloc((uint64_t)size, sizeof(*p->chain_));
-  if (p->chain_ == NULL) {
-    return 0;
-  }
-  for (i = 0; i < size; ++i) {
+  assert(p != NULL);
+  for (i = 0; i < p->size_; ++i) {
     p->chain_[i] = -1;
   }
   for (i = 0; i < HASH_SIZE; ++i) {
     p->hash_to_first_index_[i] = -1;
   }
+}
+
+int VP8LHashChainInit(VP8LHashChain* const p, int size) {
+  assert(p->size_ == 0);       // 'p' must not hold a chain already
+  assert(p->chain_ == NULL);
+  assert(size > 0);
+  p->chain_ = (int*)WebPSafeMalloc(size, sizeof(*p->chain_));
+  if (p->chain_ == NULL) return 0;   // allocation failed, size_ is still 0
+  p->size_ = size;
+  HashChainInit(p);   // set all entries to -1
   return 1;
 }
 
-static void HashChainDelete(HashChain* const p) {
-  if (p != NULL) {
-    free(p->chain_);
-    free(p);
-  }
+void VP8LHashChainClear(VP8LHashChain* const p) {
+  assert(p != NULL);
+  WebPSafeFree(p->chain_);   // 'p' itself is not freed
+  p->size_ = 0;       // back to the state VP8LHashChainInit() asserts on
+  p->chain_ = NULL;
+}
+
+// -----------------------------------------------------------------------------
+
+static WEBP_INLINE uint64_t GetPixPairHash64(const uint32_t* const argb) {
+  uint64_t key = ((uint64_t)argb[1] << 32) | argb[0];   // two pixels, one key
+  key = (key * HASH_MULTIPLIER) >> (64 - HASH_BITS);    // top HASH_BITS bits
+  return key;
+}
 
 // Insertion of two pixels at a time.
-static void HashChainInsert(HashChain* const p,
+static void HashChainInsert(VP8LHashChain* const p,
                             const uint32_t* const argb, int pos) {
   const uint64_t hash_code = GetPixPairHash64(argb);
   p->chain_[pos] = p->hash_to_first_index_[hash_code];
@@ -161,7 +241,7 @@ static void GetParamsForHashChainFindCopy(int quality, int xsize,
   *iter_limit = (cache_bits > 0) ? iter_neg : iter_neg / 2;
 }
 
-static int HashChainFindCopy(const HashChain* const p,
+static int HashChainFindCopy(const VP8LHashChain* const p,
                              int base_position, int xsize_signed,
                              const uint32_t* const argb, int max_len,
                              int window_size, int iter_pos, int iter_limit,
@@ -185,10 +265,8 @@ static int HashChainFindCopy(const HashChain* const p,
     uint64_t val;
     uint32_t curr_length;
     uint32_t distance;
-    const uint64_t* const ptr1 =
-        (const uint64_t*)(argb + pos + best_length - 1);
-    const uint64_t* const ptr2 =
-        (const uint64_t*)(argb_start + best_length - 1);
+    const uint32_t* const ptr1 = (argb + pos + best_length - 1);
+    const uint32_t* const ptr2 = (argb_start + best_length - 1);
 
     if (iter_pos < 0) {
       if (iter_pos < iter_limit || best_val >= 0xff0000) {
@@ -199,7 +277,7 @@ static int HashChainFindCopy(const HashChain* const p,
 
     // Before 'expensive' linear match, check if the two arrays match at the
     // current best length index and also for the succeeding elements.
-    if (*ptr1 != *ptr2) continue;
+    if (ptr1[0] != ptr2[0] || ptr1[1] != ptr2[1]) continue;
 
     curr_length = FindMatchLength(argb + pos, argb_start, max_len);
     if (curr_length < best_length) continue;
@@ -237,64 +315,61 @@ static int HashChainFindCopy(const HashChain* const p,
 }
 
 static WEBP_INLINE void PushBackCopy(VP8LBackwardRefs* const refs, int length) {
-  int size = refs->size;
   while (length >= MAX_LENGTH) {
-    refs->refs[size++] = PixOrCopyCreateCopy(1, MAX_LENGTH);
+    BackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(1, MAX_LENGTH));
     length -= MAX_LENGTH;
   }
   if (length > 0) {
-    refs->refs[size++] = PixOrCopyCreateCopy(1, length);
+    BackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(1, length));
   }
-  refs->size = size;
 }
 
-static void BackwardReferencesRle(int xsize, int ysize,
-                                  const uint32_t* const argb,
-                                  VP8LBackwardRefs* const refs) {
+static int BackwardReferencesRle(int xsize, int ysize,
+                                 const uint32_t* const argb,
+                                 VP8LBackwardRefs* const refs) {
   const int pix_count = xsize * ysize;
   int match_len = 0;
   int i;
-  refs->size = 0;
+  ClearBackwardRefs(refs);
   PushBackCopy(refs, match_len);    // i=0 case
-  refs->refs[refs->size++] = PixOrCopyCreateLiteral(argb[0]);
+  BackwardRefsCursorAdd(refs, PixOrCopyCreateLiteral(argb[0]));
   for (i = 1; i < pix_count; ++i) {
     if (argb[i] == argb[i - 1]) {
       ++match_len;
     } else {
       PushBackCopy(refs, match_len);
       match_len = 0;
-      refs->refs[refs->size++] = PixOrCopyCreateLiteral(argb[i]);
+      BackwardRefsCursorAdd(refs, PixOrCopyCreateLiteral(argb[i]));
     }
   }
   PushBackCopy(refs, match_len);
+  return !refs->error_;
 }
 
 static int BackwardReferencesHashChain(int xsize, int ysize,
                                        const uint32_t* const argb,
                                        int cache_bits, int quality,
+                                       VP8LHashChain* const hash_chain,
                                        VP8LBackwardRefs* const refs) {
   int i;
   int ok = 0;
   int cc_init = 0;
   const int use_color_cache = (cache_bits > 0);
   const int pix_count = xsize * ysize;
-  HashChain* const hash_chain = (HashChain*)malloc(sizeof(*hash_chain));
   VP8LColorCache hashers;
   int window_size = WINDOW_SIZE;
   int iter_pos = 1;
   int iter_limit = -1;
 
-  if (hash_chain == NULL) return 0;
   if (use_color_cache) {
     cc_init = VP8LColorCacheInit(&hashers, cache_bits);
     if (!cc_init) goto Error;
   }
 
-  if (!HashChainInit(hash_chain, pix_count)) goto Error;
-
-  refs->size = 0;
+  ClearBackwardRefs(refs);
   GetParamsForHashChainFindCopy(quality, xsize, cache_bits,
                                 &window_size, &iter_pos, &iter_limit);
+  HashChainInit(hash_chain);
   for (i = 0; i < pix_count; ) {
     // Alternative#1: Code the pixels starting at 'i' using backward reference.
     int offset = 0;
@@ -320,14 +395,15 @@ static int BackwardReferencesHashChain(int xsize, int ysize,
         if (len2 > len + 1) {
           const uint32_t pixel = argb[i];
           // Alternative#2 is a better match. So push pixel at 'i' as literal.
+          PixOrCopy v;
           if (use_color_cache && VP8LColorCacheContains(&hashers, pixel)) {
             const int ix = VP8LColorCacheGetIndex(&hashers, pixel);
-            refs->refs[refs->size] = PixOrCopyCreateCacheIdx(ix);
+            v = PixOrCopyCreateCacheIdx(ix);
           } else {
             if (use_color_cache) VP8LColorCacheInsert(&hashers, pixel);
-            refs->refs[refs->size] = PixOrCopyCreateLiteral(pixel);
+            v = PixOrCopyCreateLiteral(pixel);
           }
-          ++refs->size;
+          BackwardRefsCursorAdd(refs, v);
           i++;  // Backward reference to be done for next pixel.
           len = len2;
           offset = offset2;
@@ -336,7 +412,7 @@ static int BackwardReferencesHashChain(int xsize, int ysize,
       if (len >= MAX_LENGTH) {
         len = MAX_LENGTH - 1;
       }
-      refs->refs[refs->size++] = PixOrCopyCreateCopy(offset, len);
+      BackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(offset, len));
       if (use_color_cache) {
         for (k = 0; k < len; ++k) {
           VP8LColorCacheInsert(&hashers, argb[i + k]);
@@ -352,25 +428,25 @@ static int BackwardReferencesHashChain(int xsize, int ysize,
       i += len;
     } else {
       const uint32_t pixel = argb[i];
+      PixOrCopy v;
       if (use_color_cache && VP8LColorCacheContains(&hashers, pixel)) {
         // push pixel as a PixOrCopyCreateCacheIdx pixel
         const int ix = VP8LColorCacheGetIndex(&hashers, pixel);
-        refs->refs[refs->size] = PixOrCopyCreateCacheIdx(ix);
+        v = PixOrCopyCreateCacheIdx(ix);
       } else {
         if (use_color_cache) VP8LColorCacheInsert(&hashers, pixel);
-        refs->refs[refs->size] = PixOrCopyCreateLiteral(pixel);
+        v = PixOrCopyCreateLiteral(pixel);
       }
-      ++refs->size;
+      BackwardRefsCursorAdd(refs, v);
       if (i + 1 < pix_count) {
         HashChainInsert(hash_chain, &argb[i], i);
       }
       ++i;
     }
   }
-  ok = 1;
+  ok = !refs->error_;
 Error:
   if (cc_init) VP8LColorCacheClear(&hashers);
-  HashChainDelete(hash_chain);
   return ok;
 }
 
@@ -387,11 +463,12 @@ typedef struct {
 static int BackwardReferencesTraceBackwards(
     int xsize, int ysize, int recursive_cost_model,
     const uint32_t* const argb, int quality, int cache_bits,
+    VP8LHashChain* const hash_chain,
     VP8LBackwardRefs* const refs);
 
 static void ConvertPopulationCountTableToBitEstimates(
-    int num_symbols, const int population_counts[], double output[]) {
-  int sum = 0;
+    int num_symbols, const uint32_t population_counts[], double output[]) {
+  uint32_t sum = 0;
   int nonzeros = 0;
   int i;
   for (i = 0; i < num_symbols; ++i) {
@@ -412,39 +489,45 @@ static void ConvertPopulationCountTableToBitEstimates(
 
 static int CostModelBuild(CostModel* const m, int xsize, int ysize,
                           int recursion_level, const uint32_t* const argb,
-                          int quality, int cache_bits) {
+                          int quality, int cache_bits,
+                          VP8LHashChain* const hash_chain,
+                          VP8LBackwardRefs* const refs) {
   int ok = 0;
-  VP8LHistogram histo;
-  VP8LBackwardRefs refs;
-
-  if (!VP8LBackwardRefsAlloc(&refs, xsize * ysize)) goto Error;
+  VP8LHistogram* histo = NULL;
 
+  ClearBackwardRefs(refs);
   if (recursion_level > 0) {
     if (!BackwardReferencesTraceBackwards(xsize, ysize, recursion_level - 1,
-                                          argb, quality, cache_bits, &refs)) {
+                                          argb, quality, cache_bits, hash_chain,
+                                          refs)) {
       goto Error;
     }
   } else {
     if (!BackwardReferencesHashChain(xsize, ysize, argb, cache_bits, quality,
-                                     &refs)) {
+                                     hash_chain, refs)) {
       goto Error;
     }
   }
-  VP8LHistogramCreate(&histo, &refs, cache_bits);
+  histo = VP8LAllocateHistogram(cache_bits);
+  if (histo == NULL) goto Error;
+
+  VP8LHistogramCreate(histo, refs, cache_bits);
+
   ConvertPopulationCountTableToBitEstimates(
-      VP8LHistogramNumCodes(&histo), histo.literal_, m->literal_);
+      VP8LHistogramNumCodes(histo->palette_code_bits_),
+      histo->literal_, m->literal_);
   ConvertPopulationCountTableToBitEstimates(
-      VALUES_IN_BYTE, histo.red_, m->red_);
+      VALUES_IN_BYTE, histo->red_, m->red_);
   ConvertPopulationCountTableToBitEstimates(
-      VALUES_IN_BYTE, histo.blue_, m->blue_);
+      VALUES_IN_BYTE, histo->blue_, m->blue_);
   ConvertPopulationCountTableToBitEstimates(
-      VALUES_IN_BYTE, histo.alpha_, m->alpha_);
+      VALUES_IN_BYTE, histo->alpha_, m->alpha_);
   ConvertPopulationCountTableToBitEstimates(
-      NUM_DISTANCE_CODES, histo.distance_, m->distance_);
+      NUM_DISTANCE_CODES, histo->distance_, m->distance_);
   ok = 1;
 
  Error:
-  VP8LClearBackwardRefs(&refs);
+  VP8LFreeHistogram(histo);
   return ok;
 }
 
@@ -476,16 +559,16 @@ static WEBP_INLINE double GetDistanceCost(const CostModel* const m,
 
 static int BackwardReferencesHashChainDistanceOnly(
     int xsize, int ysize, int recursive_cost_model, const uint32_t* const argb,
-    int quality, int cache_bits, uint32_t* const dist_array) {
+    int quality, int cache_bits, VP8LHashChain* const hash_chain,
+    VP8LBackwardRefs* const refs, uint32_t* const dist_array) {
   int i;
   int ok = 0;
   int cc_init = 0;
   const int pix_count = xsize * ysize;
   const int use_color_cache = (cache_bits > 0);
   float* const cost =
-      (float*)WebPSafeMalloc((uint64_t)pix_count, sizeof(*cost));
-  CostModel* cost_model = (CostModel*)malloc(sizeof(*cost_model));
-  HashChain* hash_chain = (HashChain*)malloc(sizeof(*hash_chain));
+      (float*)WebPSafeMalloc(pix_count, sizeof(*cost));
+  CostModel* cost_model = (CostModel*)WebPSafeMalloc(1ULL, sizeof(*cost_model));
   VP8LColorCache hashers;
   const double mul0 = (recursive_cost_model != 0) ? 1.0 : 0.68;
   const double mul1 = (recursive_cost_model != 0) ? 1.0 : 0.82;
@@ -494,9 +577,7 @@ static int BackwardReferencesHashChainDistanceOnly(
   int iter_pos = 1;
   int iter_limit = -1;
 
-  if (cost == NULL || cost_model == NULL || hash_chain == NULL) goto Error;
-
-  if (!HashChainInit(hash_chain, pix_count)) goto Error;
+  if (cost == NULL || cost_model == NULL) goto Error;
 
   if (use_color_cache) {
     cc_init = VP8LColorCacheInit(&hashers, cache_bits);
@@ -504,7 +585,7 @@ static int BackwardReferencesHashChainDistanceOnly(
   }
 
   if (!CostModelBuild(cost_model, xsize, ysize, recursive_cost_model, argb,
-                      quality, cache_bits)) {
+                      quality, cache_bits, hash_chain, refs)) {
     goto Error;
   }
 
@@ -515,6 +596,7 @@ static int BackwardReferencesHashChainDistanceOnly(
   dist_array[0] = 0;
   GetParamsForHashChainFindCopy(quality, xsize, cache_bits,
                                 &window_size, &iter_pos, &iter_limit);
+  HashChainInit(hash_chain);
   for (i = 0; i < pix_count; ++i) {
     double prev_cost = 0.0;
     int shortmax;
@@ -589,12 +671,11 @@ static int BackwardReferencesHashChainDistanceOnly(
   }
   // Last pixel still to do, it can only be a single step if not reached
   // through cheaper means already.
-  ok = 1;
+  ok = !refs->error_;
 Error:
   if (cc_init) VP8LColorCacheClear(&hashers);
-  HashChainDelete(hash_chain);
-  free(cost_model);
-  free(cost);
+  WebPSafeFree(cost_model);
+  WebPSafeFree(cost);
   return ok;
 }
 
@@ -621,6 +702,7 @@ static int BackwardReferencesHashChainFollowChosenPath(
     int xsize, int ysize, const uint32_t* const argb,
     int quality, int cache_bits,
     const uint32_t* const chosen_path, int chosen_path_size,
+    VP8LHashChain* const hash_chain,
     VP8LBackwardRefs* const refs) {
   const int pix_count = xsize * ysize;
   const int use_color_cache = (cache_bits > 0);
@@ -633,20 +715,17 @@ static int BackwardReferencesHashChainFollowChosenPath(
   int window_size = WINDOW_SIZE;
   int iter_pos = 1;
   int iter_limit = -1;
-  HashChain* hash_chain = (HashChain*)malloc(sizeof(*hash_chain));
   VP8LColorCache hashers;
 
-  if (hash_chain == NULL || !HashChainInit(hash_chain, pix_count)) {
-    goto Error;
-  }
   if (use_color_cache) {
     cc_init = VP8LColorCacheInit(&hashers, cache_bits);
     if (!cc_init) goto Error;
   }
 
-  refs->size = 0;
+  ClearBackwardRefs(refs);
   GetParamsForHashChainFindCopy(quality, xsize, cache_bits,
                                 &window_size, &iter_pos, &iter_limit);
+  HashChainInit(hash_chain);
   for (ix = 0; ix < chosen_path_size; ++ix, ++size) {
     int offset = 0;
     int len = 0;
@@ -656,7 +735,7 @@ static int BackwardReferencesHashChainFollowChosenPath(
                         window_size, iter_pos, iter_limit,
                         &offset, &len);
       assert(len == max_len);
-      refs->refs[size] = PixOrCopyCreateCopy(offset, len);
+      BackwardRefsCursorAdd(refs, PixOrCopyCreateCopy(offset, len));
       if (use_color_cache) {
         for (k = 0; k < len; ++k) {
           VP8LColorCacheInsert(&hashers, argb[i + k]);
@@ -670,26 +749,25 @@ static int BackwardReferencesHashChainFollowChosenPath(
       }
       i += len;
     } else {
+      PixOrCopy v;
       if (use_color_cache && VP8LColorCacheContains(&hashers, argb[i])) {
         // push pixel as a color cache index
         const int idx = VP8LColorCacheGetIndex(&hashers, argb[i]);
-        refs->refs[size] = PixOrCopyCreateCacheIdx(idx);
+        v = PixOrCopyCreateCacheIdx(idx);
       } else {
         if (use_color_cache) VP8LColorCacheInsert(&hashers, argb[i]);
-        refs->refs[size] = PixOrCopyCreateLiteral(argb[i]);
+        v = PixOrCopyCreateLiteral(argb[i]);
       }
+      BackwardRefsCursorAdd(refs, v);
       if (i + 1 < pix_count) {
         HashChainInsert(hash_chain, &argb[i], i);
       }
       ++i;
     }
   }
-  assert(size <= refs->max_size);
-  refs->size = size;
-  ok = 1;
+  ok = !refs->error_;
 Error:
   if (cc_init) VP8LColorCacheClear(&hashers);
-  HashChainDelete(hash_chain);
   return ok;
 }
 
@@ -698,142 +776,129 @@ static int BackwardReferencesTraceBackwards(int xsize, int ysize,
                                             int recursive_cost_model,
                                             const uint32_t* const argb,
                                             int quality, int cache_bits,
+                                            VP8LHashChain* const hash_chain,
                                             VP8LBackwardRefs* const refs) {
   int ok = 0;
   const int dist_array_size = xsize * ysize;
   uint32_t* chosen_path = NULL;
   int chosen_path_size = 0;
   uint32_t* dist_array =
-      (uint32_t*)WebPSafeMalloc((uint64_t)dist_array_size, sizeof(*dist_array));
+      (uint32_t*)WebPSafeMalloc(dist_array_size, sizeof(*dist_array));
 
   if (dist_array == NULL) goto Error;
 
   if (!BackwardReferencesHashChainDistanceOnly(
-      xsize, ysize, recursive_cost_model, argb, quality, cache_bits,
-      dist_array)) {
+      xsize, ysize, recursive_cost_model, argb, quality, cache_bits, hash_chain,
+      refs, dist_array)) {
     goto Error;
   }
   TraceBackwards(dist_array, dist_array_size, &chosen_path, &chosen_path_size);
   if (!BackwardReferencesHashChainFollowChosenPath(
       xsize, ysize, argb, quality, cache_bits, chosen_path, chosen_path_size,
-      refs)) {
+      hash_chain, refs)) {
     goto Error;
   }
   ok = 1;
  Error:
-  free(dist_array);
+  WebPSafeFree(dist_array);
   return ok;
 }
 
 static void BackwardReferences2DLocality(int xsize,
-                                         VP8LBackwardRefs* const refs) {
-  int i;
-  for (i = 0; i < refs->size; ++i) {
-    if (PixOrCopyIsCopy(&refs->refs[i])) {
-      const int dist = refs->refs[i].argb_or_distance;
+                                         const VP8LBackwardRefs* const refs) {
+  VP8LRefsCursor c = VP8LRefsCursorInit(refs);
+  while (VP8LRefsCursorOk(&c)) {
+    if (PixOrCopyIsCopy(c.cur_pos)) {
+      const int dist = c.cur_pos->argb_or_distance;
       const int transformed_dist = DistanceToPlaneCode(xsize, dist);
-      refs->refs[i].argb_or_distance = transformed_dist;
+      c.cur_pos->argb_or_distance = transformed_dist;
     }
+    VP8LRefsCursorNext(&c);
   }
 }
 
-int VP8LGetBackwardReferences(int width, int height,
-                              const uint32_t* const argb,
-                              int quality, int cache_bits, int use_2d_locality,
-                              VP8LBackwardRefs* const best) {
-  int ok = 0;
+VP8LBackwardRefs* VP8LGetBackwardReferences(
+    int width, int height, const uint32_t* const argb, int quality,
+    int cache_bits, int use_2d_locality, VP8LHashChain* const hash_chain,
+    VP8LBackwardRefs refs_array[2]) {
   int lz77_is_useful;
-  VP8LBackwardRefs refs_rle, refs_lz77;
   const int num_pix = width * height;
-
-  VP8LBackwardRefsAlloc(&refs_rle, num_pix);
-  VP8LBackwardRefsAlloc(&refs_lz77, num_pix);
-  VP8LInitBackwardRefs(best);
-  if (refs_rle.refs == NULL || refs_lz77.refs == NULL) {
- Error1:
-    VP8LClearBackwardRefs(&refs_rle);
-    VP8LClearBackwardRefs(&refs_lz77);
-    goto End;
-  }
+  VP8LBackwardRefs* best = NULL;
+  VP8LBackwardRefs* const refs_lz77 = &refs_array[0];
+  VP8LBackwardRefs* const refs_rle = &refs_array[1];
 
   if (!BackwardReferencesHashChain(width, height, argb, cache_bits, quality,
-                                   &refs_lz77)) {
-    goto End;
+                                   hash_chain, refs_lz77)) {
+    return NULL;
+  }
+  if (!BackwardReferencesRle(width, height, argb, refs_rle)) {
+    return NULL;
   }
-  // Backward Reference using RLE only.
-  BackwardReferencesRle(width, height, argb, &refs_rle);
 
   {
     double bit_cost_lz77, bit_cost_rle;
-    VP8LHistogram* const histo = (VP8LHistogram*)malloc(sizeof(*histo));
-    if (histo == NULL) goto Error1;
-    // Evaluate lz77 coding
-    VP8LHistogramCreate(histo, &refs_lz77, cache_bits);
+    VP8LHistogram* const histo = VP8LAllocateHistogram(cache_bits);
+    if (histo == NULL) return NULL;
+    // Evaluate LZ77 coding.
+    VP8LHistogramCreate(histo, refs_lz77, cache_bits);
     bit_cost_lz77 = VP8LHistogramEstimateBits(histo);
-    // Evaluate RLE coding
-    VP8LHistogramCreate(histo, &refs_rle, cache_bits);
+    // Evaluate RLE coding.
+    VP8LHistogramCreate(histo, refs_rle, cache_bits);
     bit_cost_rle = VP8LHistogramEstimateBits(histo);
     // Decide if LZ77 is useful.
     lz77_is_useful = (bit_cost_lz77 < bit_cost_rle);
-    free(histo);
+    VP8LFreeHistogram(histo);
   }
 
   // Choose appropriate backward reference.
   if (lz77_is_useful) {
     // TraceBackwards is costly. Don't execute it at lower quality.
     const int try_lz77_trace_backwards = (quality >= 25);
-    *best = refs_lz77;   // default guess: lz77 is better
-    VP8LClearBackwardRefs(&refs_rle);
+    best = refs_lz77;   // default guess: lz77 is better
     if (try_lz77_trace_backwards) {
       // Set recursion level for large images using a color cache.
       const int recursion_level =
           (num_pix < 320 * 200) && (cache_bits > 0) ? 1 : 0;
-      VP8LBackwardRefs refs_trace;
-      if (!VP8LBackwardRefsAlloc(&refs_trace, num_pix)) {
-        goto End;
-      }
+      VP8LBackwardRefs* const refs_trace = &refs_array[1];
+      ClearBackwardRefs(refs_trace);
       if (BackwardReferencesTraceBackwards(width, height, recursion_level, argb,
-                                           quality, cache_bits, &refs_trace)) {
-        VP8LClearBackwardRefs(&refs_lz77);
-        *best = refs_trace;
+                                           quality, cache_bits, hash_chain,
+                                           refs_trace)) {
+        best = refs_trace;
       }
     }
   } else {
-    VP8LClearBackwardRefs(&refs_lz77);
-    *best = refs_rle;
+    best = refs_rle;
   }
 
   if (use_2d_locality) BackwardReferences2DLocality(width, best);
 
-  ok = 1;
-
- End:
-  if (!ok) {
-    VP8LClearBackwardRefs(best);
-  }
-  return ok;
+  return best;
 }
 
-// Returns 1 on success.
-static int ComputeCacheHistogram(const uint32_t* const argb,
-                                 int xsize, int ysize,
-                                 const VP8LBackwardRefs* const refs,
-                                 int cache_bits,
-                                 VP8LHistogram* const histo) {
+// Returns entropy for the given cache bits, or MAX_ENTROPY on failure.
+static double ComputeCacheEntropy(const uint32_t* const argb,
+                                  int xsize, int ysize,
+                                  const VP8LBackwardRefs* const refs,
+                                  int cache_bits) {
   int pixel_index = 0;
-  int i;
   uint32_t k;
-  VP8LColorCache hashers;
   const int use_color_cache = (cache_bits > 0);
   int cc_init = 0;
+  double entropy = MAX_ENTROPY;
+  const double kSmallPenaltyForLargeCache = 4.0;
+  VP8LColorCache hashers;
+  VP8LRefsCursor c = VP8LRefsCursorInit(refs);
+  VP8LHistogram* histo = VP8LAllocateHistogram(cache_bits);
+  if (histo == NULL) goto Error;
 
   if (use_color_cache) {
     cc_init = VP8LColorCacheInit(&hashers, cache_bits);
-    if (!cc_init) return 0;
+    if (!cc_init) goto Error;
   }
 
-  for (i = 0; i < refs->size; ++i) {
-    const PixOrCopy* const v = &refs->refs[i];
+  while (VP8LRefsCursorOk(&c)) {
+    const PixOrCopy* const v = c.cur_pos;
     if (PixOrCopyIsLiteral(v)) {
       if (use_color_cache &&
           VP8LColorCacheContains(&hashers, argb[pixel_index])) {
@@ -853,42 +918,58 @@ static int ComputeCacheHistogram(const uint32_t* const argb,
       }
     }
     pixel_index += PixOrCopyLength(v);
+    VP8LRefsCursorNext(&c);
   }
   assert(pixel_index == xsize * ysize);
   (void)xsize;  // xsize is not used in non-debug compilations otherwise.
   (void)ysize;  // ysize is not used in non-debug compilations otherwise.
+  entropy = VP8LHistogramEstimateBits(histo) +
+      kSmallPenaltyForLargeCache * cache_bits;
+ Error:
   if (cc_init) VP8LColorCacheClear(&hashers);
-  return 1;
+  VP8LFreeHistogram(histo);
+  return entropy;
 }
 
-// Returns how many bits are to be used for a color cache.
+// *best_cache_bits will contain how many bits are to be used for a color cache.
+// Returns 0 in case of memory error.
 int VP8LCalculateEstimateForCacheSize(const uint32_t* const argb,
-                                      int xsize, int ysize,
+                                      int xsize, int ysize, int quality,
+                                      VP8LHashChain* const hash_chain,
+                                      VP8LBackwardRefs* const refs,
                                       int* const best_cache_bits) {
-  int ok = 0;
-  int cache_bits;
-  double lowest_entropy = 1e99;
-  VP8LBackwardRefs refs;
-  static const double kSmallPenaltyForLargeCache = 4.0;
-  static const int quality = 30;
-  if (!VP8LBackwardRefsAlloc(&refs, xsize * ysize) ||
-      !BackwardReferencesHashChain(xsize, ysize, argb, 0, quality, &refs)) {
-    goto Error;
+  int eval_low = 1;
+  int eval_high = 1;
+  double entropy_low = MAX_ENTROPY;
+  double entropy_high = MAX_ENTROPY;
+  int cache_bits_low = 0;
+  int cache_bits_high = MAX_COLOR_CACHE_BITS;
+
+  if (!BackwardReferencesHashChain(xsize, ysize, argb, 0, quality, hash_chain,
+                                   refs)) {
+    return 0;
   }
-  for (cache_bits = 0; cache_bits <= MAX_COLOR_CACHE_BITS; ++cache_bits) {
-    double cur_entropy;
-    VP8LHistogram histo;
-    VP8LHistogramInit(&histo, cache_bits);
-    ComputeCacheHistogram(argb, xsize, ysize, &refs, cache_bits, &histo);
-    cur_entropy = VP8LHistogramEstimateBits(&histo) +
-        kSmallPenaltyForLargeCache * cache_bits;
-    if (cache_bits == 0 || cur_entropy < lowest_entropy) {
-      *best_cache_bits = cache_bits;
-      lowest_entropy = cur_entropy;
+  // Do a binary search to find the optimal entropy for cache_bits.
+  while (cache_bits_high - cache_bits_low > 1) {
+    if (eval_low) {
+      entropy_low =
+          ComputeCacheEntropy(argb, xsize, ysize, refs, cache_bits_low);
+      eval_low = 0;
+    }
+    if (eval_high) {
+      entropy_high =
+          ComputeCacheEntropy(argb, xsize, ysize, refs, cache_bits_high);
+      eval_high = 0;
+    }
+    if (entropy_high < entropy_low) {
+      *best_cache_bits = cache_bits_high;
+      cache_bits_low = (cache_bits_low + cache_bits_high) / 2;
+      eval_low = 1;
+    } else {
+      *best_cache_bits = cache_bits_low;
+      cache_bits_high = (cache_bits_low + cache_bits_high) / 2;
+      eval_high = 1;
     }
   }
-  ok = 1;
- Error:
-  VP8LClearBackwardRefs(&refs);
-  return ok;
+  return 1;
 }
diff --git a/src/3rdparty/libwebp/src/enc/backward_references.h b/src/3rdparty/libwebp/src/enc/backward_references.h
index e1c75f0..c2c81c5 100644
--- a/src/3rdparty/libwebp/src/enc/backward_references.h
+++ b/src/3rdparty/libwebp/src/enc/backward_references.h
@@ -113,36 +113,96 @@ static WEBP_INLINE uint32_t PixOrCopyDistance(const PixOrCopy* const p) {
 }
 
 // -----------------------------------------------------------------------------
-// VP8LBackwardRefs
+// VP8LHashChain
+
+#define HASH_BITS 18
+#define HASH_SIZE (1 << HASH_BITS)
+
+typedef struct VP8LHashChain VP8LHashChain;
+struct VP8LHashChain {
+  // Stores the most recently added position with the given hash value.
+  int32_t hash_to_first_index_[HASH_SIZE];
+  // chain_[pos] stores the previous position with the same hash value
+  // for every pixel in the image.
+  int32_t* chain_;
+  // This is the maximum size of the hash_chain that can be constructed.
+  // Typically this is the pixel count (width x height) for a given image.
+  int size_;
+};
 
-typedef struct {
-  PixOrCopy* refs;
-  int size;      // currently used
-  int max_size;  // maximum capacity
-} VP8LBackwardRefs;
+// Must be called first, to set size.
+int VP8LHashChainInit(VP8LHashChain* const p, int size);
+void VP8LHashChainClear(VP8LHashChain* const p);  // release memory
 
-// Initialize the object. Must be called first. 'refs' can be NULL.
-void VP8LInitBackwardRefs(VP8LBackwardRefs* const refs);
+// -----------------------------------------------------------------------------
+// VP8LBackwardRefs (block-based backward-references storage)
+
+// maximum number of reference blocks the image will be segmented into
+#define MAX_REFS_BLOCK_PER_IMAGE 16
+
+typedef struct PixOrCopyBlock PixOrCopyBlock;   // forward declaration
+typedef struct VP8LBackwardRefs VP8LBackwardRefs;
+
+// Container for blocks chain
+struct VP8LBackwardRefs {
+  int block_size_;               // common block-size
+  int error_;                    // set to true if some memory error occurred
+  PixOrCopyBlock* refs_;         // list of currently used blocks
+  PixOrCopyBlock** tail_;        // for list recycling
+  PixOrCopyBlock* free_blocks_;  // free-list
+  PixOrCopyBlock* last_block_;   // used for adding new refs (internal)
+};
 
-// Release memory and re-initialize the object. 'refs' can be NULL.
-void VP8LClearBackwardRefs(VP8LBackwardRefs* const refs);
+// Initialize the object. 'block_size' is the common block size to store
+// references (typically, width * height / MAX_REFS_BLOCK_PER_IMAGE).
+void VP8LBackwardRefsInit(VP8LBackwardRefs* const refs, int block_size);
+// Release memory for backward references.
+void VP8LBackwardRefsClear(VP8LBackwardRefs* const refs);
+// Copies the 'src' backward refs to the 'dst'. Returns 0 in case of error.
+int VP8LBackwardRefsCopy(const VP8LBackwardRefs* const src,
+                         VP8LBackwardRefs* const dst);
 
-// Allocate 'max_size' references. Returns false in case of memory error.
-int VP8LBackwardRefsAlloc(VP8LBackwardRefs* const refs, int max_size);
+// Cursor for iterating on references content
+typedef struct {
+  // public:
+  PixOrCopy* cur_pos;           // current position
+  // private:
+  PixOrCopyBlock* cur_block_;   // current block in the refs list
+  const PixOrCopy* last_pos_;   // sentinel for switching to next block
+} VP8LRefsCursor;
+
+// Returns a cursor positioned at the beginning of the references list.
+VP8LRefsCursor VP8LRefsCursorInit(const VP8LBackwardRefs* const refs);
+// Returns true if cursor is pointing at a valid position.
+static WEBP_INLINE int VP8LRefsCursorOk(const VP8LRefsCursor* const c) {
+  return (c->cur_pos != NULL);
+}
+// Move to next block of references. Internal, not to be called directly.
+void VP8LRefsCursorNextBlock(VP8LRefsCursor* const c);
+// Move to next position, or NULL. Should not be called if !VP8LRefsCursorOk().
+static WEBP_INLINE void VP8LRefsCursorNext(VP8LRefsCursor* const c) {
+  assert(c != NULL);
+  assert(VP8LRefsCursorOk(c));
+  if (++c->cur_pos == c->last_pos_) VP8LRefsCursorNextBlock(c);
+}
 
 // -----------------------------------------------------------------------------
 // Main entry points
 
 // Evaluates best possible backward references for specified quality.
 // Further optimize for 2D locality if use_2d_locality flag is set.
-int VP8LGetBackwardReferences(int width, int height,
-                              const uint32_t* const argb,
-                              int quality, int cache_bits, int use_2d_locality,
-                              VP8LBackwardRefs* const best);
+// Returns a pointer to the better of the two backward refs (refs[0] or
+// refs[1]), or NULL on failure.
+VP8LBackwardRefs* VP8LGetBackwardReferences(
+    int width, int height, const uint32_t* const argb, int quality,
+    int cache_bits, int use_2d_locality, VP8LHashChain* const hash_chain,
+    VP8LBackwardRefs refs[2]);
 
 // Produce an estimate for a good color cache size for the image.
 int VP8LCalculateEstimateForCacheSize(const uint32_t* const argb,
-                                      int xsize, int ysize,
+                                      int xsize, int ysize, int quality,
+                                      VP8LHashChain* const hash_chain,
+                                      VP8LBackwardRefs* const ref,
                                       int* const best_cache_bits);
 
 #ifdef __cplusplus
diff --git a/src/3rdparty/libwebp/src/enc/config.c b/src/3rdparty/libwebp/src/enc/config.c
index af7f0b0..53a3bb2 100644
--- a/src/3rdparty/libwebp/src/enc/config.c
+++ b/src/3rdparty/libwebp/src/enc/config.c
@@ -111,7 +111,11 @@ int WebPValidateConfig(const WebPConfig* config) {
     return 0;
   if (config->show_compressed < 0 || config->show_compressed > 1)
     return 0;
+#if WEBP_ENCODER_ABI_VERSION > 0x0204
+  if (config->preprocessing < 0 || config->preprocessing > 7)
+#else
   if (config->preprocessing < 0 || config->preprocessing > 3)
+#endif
     return 0;
   if (config->partitions < 0 || config->partitions > 3)
     return 0;
@@ -138,3 +142,25 @@ int WebPValidateConfig(const WebPConfig* config) {
 
 //------------------------------------------------------------------------------
 
+#if WEBP_ENCODER_ABI_VERSION > 0x0202
+#define MAX_LEVEL 9
+
+// Mapping between -z level and -m / -q parameter settings.
+static const struct {
+  uint8_t method_;
+  uint8_t quality_;
+} kLosslessPresets[MAX_LEVEL + 1] = {
+  { 0,  0 }, { 1, 20 }, { 2, 25 }, { 3, 30 }, { 3, 50 },
+  { 4, 50 }, { 4, 75 }, { 4, 90 }, { 5, 90 }, { 6, 100 }
+};
+
+int WebPConfigLosslessPreset(WebPConfig* config, int level) {
+  if (config == NULL || level < 0 || level > MAX_LEVEL) return 0;
+  config->lossless = 1;
+  config->method = kLosslessPresets[level].method_;
+  config->quality = kLosslessPresets[level].quality_;
+  return 1;
+}
+#endif
+
+//------------------------------------------------------------------------------
diff --git a/src/3rdparty/libwebp/src/enc/cost.c b/src/3rdparty/libwebp/src/enc/cost.c
index 09699f8..9d2cc01 100644
--- a/src/3rdparty/libwebp/src/enc/cost.c
+++ b/src/3rdparty/libwebp/src/enc/cost.c
@@ -360,9 +360,10 @@ void VP8CalculateLevelCosts(VP8Proba* const proba) {
       for (ctx = 0; ctx < NUM_CTX; ++ctx) {
         const uint8_t* const p = proba->coeffs_[ctype][band][ctx];
         uint16_t* const table = proba->level_cost_[ctype][band][ctx];
-        const int cost_base = VP8BitCost(1, p[1]);
+        const int cost0 = (ctx > 0) ? VP8BitCost(1, p[0]) : 0;
+        const int cost_base = VP8BitCost(1, p[1]) + cost0;
         int v;
-        table[0] = VP8BitCost(0, p[1]);
+        table[0] = VP8BitCost(0, p[1]) + cost0;
         for (v = 1; v <= MAX_VARIABLE_LEVEL; ++v) {
           table[v] = cost_base + VariableLevelCost(v, p);
         }
@@ -486,4 +487,249 @@ const uint16_t VP8FixedCostsI4[NUM_BMODES][NUM_BMODES][NUM_BMODES] = {
 };
 
 //------------------------------------------------------------------------------
+// Mode costs
 
+static int GetResidualCost(int ctx0, const VP8Residual* const res) {
+  int n = res->first;
+  // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
+  const int p0 = res->prob[n][ctx0][0];
+  const uint16_t* t = res->cost[n][ctx0];
+  // bit_cost(1, p0) is already incorporated in t[] tables, but only if ctx != 0
+  // (as required by the syntax). For ctx0 == 0, we need to add it here or it'll
+  // be missing during the loop.
+  int cost = (ctx0 == 0) ? VP8BitCost(1, p0) : 0;
+
+  if (res->last < 0) {
+    return VP8BitCost(0, p0);
+  }
+  for (; n < res->last; ++n) {
+    const int v = abs(res->coeffs[n]);
+    const int b = VP8EncBands[n + 1];
+    const int ctx = (v >= 2) ? 2 : v;
+    cost += VP8LevelCost(t, v);
+    t = res->cost[b][ctx];
+  }
+  // Last coefficient is always non-zero
+  {
+    const int v = abs(res->coeffs[n]);
+    assert(v != 0);
+    cost += VP8LevelCost(t, v);
+    if (n < 15) {
+      const int b = VP8EncBands[n + 1];
+      const int ctx = (v == 1) ? 1 : 2;
+      const int last_p0 = res->prob[b][ctx][0];
+      cost += VP8BitCost(0, last_p0);
+    }
+  }
+  return cost;
+}
+
+//------------------------------------------------------------------------------
+// init function
+
+#if defined(WEBP_USE_MIPS32)
+extern int VP8GetResidualCostMIPS32(int ctx0, const VP8Residual* const res);
+#endif  // WEBP_USE_MIPS32
+
+// TODO(skal): this, and GetResidualCost(), should probably go somewhere
+// under src/dsp/ at some point.
+VP8GetResidualCostFunc VP8GetResidualCost;
+
+void VP8GetResidualCostInit(void) {
+  VP8GetResidualCost = GetResidualCost;
+  if (VP8GetCPUInfo != NULL) {
+#if defined(WEBP_USE_MIPS32)
+    if (VP8GetCPUInfo(kMIPS32)) {
+      VP8GetResidualCost = VP8GetResidualCostMIPS32;
+    }
+#endif
+  }
+}
+
+//------------------------------------------------------------------------------
+// helper functions for residuals struct VP8Residual.
+
+void VP8InitResidual(int first, int coeff_type,
+                     VP8Encoder* const enc, VP8Residual* const res) {
+  res->coeff_type = coeff_type;
+  res->prob  = enc->proba_.coeffs_[coeff_type];
+  res->stats = enc->proba_.stats_[coeff_type];
+  res->cost  = enc->proba_.level_cost_[coeff_type];
+  res->first = first;
+}
+
+static void SetResidualCoeffs(const int16_t* const coeffs,
+                              VP8Residual* const res) {
+  int n;
+  res->last = -1;
+  assert(res->first == 0 || coeffs[0] == 0);
+  for (n = 15; n >= 0; --n) {
+    if (coeffs[n]) {
+      res->last = n;
+      break;
+    }
+  }
+  res->coeffs = coeffs;
+}
+
+//------------------------------------------------------------------------------
+// init function
+
+#if defined(WEBP_USE_SSE2)
+extern void VP8SetResidualCoeffsSSE2(const int16_t* const coeffs,
+                                     VP8Residual* const res);
+#endif  // WEBP_USE_SSE2
+
+VP8SetResidualCoeffsFunc VP8SetResidualCoeffs;
+
+void VP8SetResidualCoeffsInit(void) {
+  VP8SetResidualCoeffs = SetResidualCoeffs;
+  if (VP8GetCPUInfo != NULL) {
+#if defined(WEBP_USE_SSE2)
+    if (VP8GetCPUInfo(kSSE2)) {
+      VP8SetResidualCoeffs = VP8SetResidualCoeffsSSE2;
+    }
+#endif
+  }
+}
+
+//------------------------------------------------------------------------------
+// Mode costs
+
+int VP8GetCostLuma4(VP8EncIterator* const it, const int16_t levels[16]) {
+  const int x = (it->i4_ & 3), y = (it->i4_ >> 2);
+  VP8Residual res;
+  VP8Encoder* const enc = it->enc_;
+  int R = 0;
+  int ctx;
+
+  VP8InitResidual(0, 3, enc, &res);
+  ctx = it->top_nz_[x] + it->left_nz_[y];
+  VP8SetResidualCoeffs(levels, &res);
+  R += VP8GetResidualCost(ctx, &res);
+  return R;
+}
+
+int VP8GetCostLuma16(VP8EncIterator* const it, const VP8ModeScore* const rd) {
+  VP8Residual res;
+  VP8Encoder* const enc = it->enc_;
+  int x, y;
+  int R = 0;
+
+  VP8IteratorNzToBytes(it);   // re-import the non-zero context
+
+  // DC
+  VP8InitResidual(0, 1, enc, &res);
+  VP8SetResidualCoeffs(rd->y_dc_levels, &res);
+  R += VP8GetResidualCost(it->top_nz_[8] + it->left_nz_[8], &res);
+
+  // AC
+  VP8InitResidual(1, 0, enc, &res);
+  for (y = 0; y < 4; ++y) {
+    for (x = 0; x < 4; ++x) {
+      const int ctx = it->top_nz_[x] + it->left_nz_[y];
+      VP8SetResidualCoeffs(rd->y_ac_levels[x + y * 4], &res);
+      R += VP8GetResidualCost(ctx, &res);
+      it->top_nz_[x] = it->left_nz_[y] = (res.last >= 0);
+    }
+  }
+  return R;
+}
+
+int VP8GetCostUV(VP8EncIterator* const it, const VP8ModeScore* const rd) {
+  VP8Residual res;
+  VP8Encoder* const enc = it->enc_;
+  int ch, x, y;
+  int R = 0;
+
+  VP8IteratorNzToBytes(it);  // re-import the non-zero context
+
+  VP8InitResidual(0, 2, enc, &res);
+  for (ch = 0; ch <= 2; ch += 2) {
+    for (y = 0; y < 2; ++y) {
+      for (x = 0; x < 2; ++x) {
+        const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
+        VP8SetResidualCoeffs(rd->uv_levels[ch * 2 + x + y * 2], &res);
+        R += VP8GetResidualCost(ctx, &res);
+        it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] = (res.last >= 0);
+      }
+    }
+  }
+  return R;
+}
+
+
+//------------------------------------------------------------------------------
+// Recording of token probabilities.
+
+// Record proba context used
+static int Record(int bit, proba_t* const stats) {
+  proba_t p = *stats;
+  if (p >= 0xffff0000u) {               // an overflow is inbound.
+    p = ((p + 1u) >> 1) & 0x7fff7fffu;  // -> divide the stats by 2.
+  }
+  // record bit count (lower 16 bits) and increment total count (upper 16 bits).
+  p += 0x00010000u + bit;
+  *stats = p;
+  return bit;
+}
+
+// We keep the table-free variant around for reference, in case.
+#define USE_LEVEL_CODE_TABLE
+
+// Simulate block coding, but only record statistics.
+// Note: no need to record the fixed probas.
+int VP8RecordCoeffs(int ctx, const VP8Residual* const res) {
+  int n = res->first;
+  // should be stats[VP8EncBands[n]], but it's equivalent for n=0 or 1
+  proba_t* s = res->stats[n][ctx];
+  if (res->last  < 0) {
+    Record(0, s + 0);
+    return 0;
+  }
+  while (n <= res->last) {
+    int v;
+    Record(1, s + 0);  // order of record doesn't matter
+    while ((v = res->coeffs[n++]) == 0) {
+      Record(0, s + 1);
+      s = res->stats[VP8EncBands[n]][0];
+    }
+    Record(1, s + 1);
+    if (!Record(2u < (unsigned int)(v + 1), s + 2)) {  // v = -1 or 1
+      s = res->stats[VP8EncBands[n]][1];
+    } else {
+      v = abs(v);
+#if !defined(USE_LEVEL_CODE_TABLE)
+      if (!Record(v > 4, s + 3)) {
+        if (Record(v != 2, s + 4))
+          Record(v == 4, s + 5);
+      } else if (!Record(v > 10, s + 6)) {
+        Record(v > 6, s + 7);
+      } else if (!Record((v >= 3 + (8 << 2)), s + 8)) {
+        Record((v >= 3 + (8 << 1)), s + 9);
+      } else {
+        Record((v >= 3 + (8 << 3)), s + 10);
+      }
+#else
+      if (v > MAX_VARIABLE_LEVEL) {
+        v = MAX_VARIABLE_LEVEL;
+      }
+
+      {
+        const int bits = VP8LevelCodes[v - 1][1];
+        int pattern = VP8LevelCodes[v - 1][0];
+        int i;
+        for (i = 0; (pattern >>= 1) != 0; ++i) {
+          const int mask = 2 << i;
+          if (pattern & 1) Record(!!(bits & mask), s + 3 + i);
+        }
+      }
+#endif
+      s = res->stats[VP8EncBands[n]][2];
+    }
+  }
+  if (n < 16) Record(0, s + 0);
+  return 1;
+}
+
+//------------------------------------------------------------------------------
diff --git a/src/3rdparty/libwebp/src/enc/cost.h b/src/3rdparty/libwebp/src/enc/cost.h
index 3cbad1a..4e55895 100644
--- a/src/3rdparty/libwebp/src/enc/cost.h
+++ b/src/3rdparty/libwebp/src/enc/cost.h
@@ -14,12 +14,38 @@
 #ifndef WEBP_ENC_COST_H_
 #define WEBP_ENC_COST_H_
 
+#include <assert.h>
+#include <stdlib.h>
 #include "./vp8enci.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
+// On-the-fly info about the current set of residuals. Handy to avoid
+// passing zillions of params.
+typedef struct {
+  int first;
+  int last;
+  const int16_t* coeffs;
+
+  int coeff_type;
+  ProbaArray* prob;
+  StatsArray* stats;
+  CostArray*  cost;
+} VP8Residual;
+
+void VP8InitResidual(int first, int coeff_type,
+                     VP8Encoder* const enc, VP8Residual* const res);
+
+typedef void (*VP8SetResidualCoeffsFunc)(const int16_t* const coeffs,
+                                         VP8Residual* const res);
+extern VP8SetResidualCoeffsFunc VP8SetResidualCoeffs;
+
+void VP8SetResidualCoeffsInit(void);  // must be called first
+
+int VP8RecordCoeffs(int ctx, const VP8Residual* const res);
+
 // approximate cost per level:
 extern const uint16_t VP8LevelFixedCosts[MAX_LEVEL + 1];
 extern const uint16_t VP8EntropyCost[256];        // 8bit fixed-point log(p)
@@ -29,6 +55,12 @@ static WEBP_INLINE int VP8BitCost(int bit, uint8_t proba) {
   return !bit ? VP8EntropyCost[proba] : VP8EntropyCost[255 - proba];
 }
 
+// Cost calculation function.
+typedef int (*VP8GetResidualCostFunc)(int ctx0, const VP8Residual* const res);
+extern VP8GetResidualCostFunc VP8GetResidualCost;
+
+void VP8GetResidualCostInit(void);  // must be called first
+
 // Level cost calculations
 extern const uint16_t VP8LevelCodes[MAX_VARIABLE_LEVEL][2];
 void VP8CalculateLevelCosts(VP8Proba* const proba);
diff --git a/src/3rdparty/libwebp/src/enc/filter.c b/src/3rdparty/libwebp/src/enc/filter.c
index dd27804..11db4bd 100644
--- a/src/3rdparty/libwebp/src/enc/filter.c
+++ b/src/3rdparty/libwebp/src/enc/filter.c
@@ -13,6 +13,7 @@
 
 #include <assert.h>
 #include "./vp8enci.h"
+#include "../dsp/dsp.h"
 
 // This table gives, for a given sharpness, the filtering strength to be
 // used (at least) in order to filter a given edge step delta.
@@ -61,180 +62,6 @@ int VP8FilterStrengthFromDelta(int sharpness, int delta) {
   return kLevelsFromDelta[sharpness][pos];
 }
 
-// -----------------------------------------------------------------------------
-// NOTE: clip1, tables and InitTables are repeated entries of dsp.c
-static uint8_t abs0[255 + 255 + 1];     // abs(i)
-static uint8_t abs1[255 + 255 + 1];     // abs(i)>>1
-static int8_t sclip1[1020 + 1020 + 1];  // clips [-1020, 1020] to [-128, 127]
-static int8_t sclip2[112 + 112 + 1];    // clips [-112, 112] to [-16, 15]
-static uint8_t clip1[255 + 510 + 1];    // clips [-255,510] to [0,255]
-
-static int tables_ok = 0;
-
-static void InitTables(void) {
-  if (!tables_ok) {
-    int i;
-    for (i = -255; i <= 255; ++i) {
-      abs0[255 + i] = (i < 0) ? -i : i;
-      abs1[255 + i] = abs0[255 + i] >> 1;
-    }
-    for (i = -1020; i <= 1020; ++i) {
-      sclip1[1020 + i] = (i < -128) ? -128 : (i > 127) ? 127 : i;
-    }
-    for (i = -112; i <= 112; ++i) {
-      sclip2[112 + i] = (i < -16) ? -16 : (i > 15) ? 15 : i;
-    }
-    for (i = -255; i <= 255 + 255; ++i) {
-      clip1[255 + i] = (i < 0) ? 0 : (i > 255) ? 255 : i;
-    }
-    tables_ok = 1;
-  }
-}
-
-//------------------------------------------------------------------------------
-// Edge filtering functions
-
-// 4 pixels in, 2 pixels out
-static WEBP_INLINE void do_filter2(uint8_t* p, int step) {
-  const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
-  const int a = 3 * (q0 - p0) + sclip1[1020 + p1 - q1];
-  const int a1 = sclip2[112 + ((a + 4) >> 3)];
-  const int a2 = sclip2[112 + ((a + 3) >> 3)];
-  p[-step] = clip1[255 + p0 + a2];
-  p[    0] = clip1[255 + q0 - a1];
-}
-
-// 4 pixels in, 4 pixels out
-static WEBP_INLINE void do_filter4(uint8_t* p, int step) {
-  const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
-  const int a = 3 * (q0 - p0);
-  const int a1 = sclip2[112 + ((a + 4) >> 3)];
-  const int a2 = sclip2[112 + ((a + 3) >> 3)];
-  const int a3 = (a1 + 1) >> 1;
-  p[-2*step] = clip1[255 + p1 + a3];
-  p[-  step] = clip1[255 + p0 + a2];
-  p[      0] = clip1[255 + q0 - a1];
-  p[   step] = clip1[255 + q1 - a3];
-}
-
-// high edge-variance
-static WEBP_INLINE int hev(const uint8_t* p, int step, int thresh) {
-  const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
-  return (abs0[255 + p1 - p0] > thresh) || (abs0[255 + q1 - q0] > thresh);
-}
-
-static WEBP_INLINE int needs_filter(const uint8_t* p, int step, int thresh) {
-  const int p1 = p[-2*step], p0 = p[-step], q0 = p[0], q1 = p[step];
-  return (2 * abs0[255 + p0 - q0] + abs1[255 + p1 - q1]) <= thresh;
-}
-
-static WEBP_INLINE int needs_filter2(const uint8_t* p,
-                                     int step, int t, int it) {
-  const int p3 = p[-4*step], p2 = p[-3*step], p1 = p[-2*step], p0 = p[-step];
-  const int q0 = p[0], q1 = p[step], q2 = p[2*step], q3 = p[3*step];
-  if ((2 * abs0[255 + p0 - q0] + abs1[255 + p1 - q1]) > t)
-    return 0;
-  return abs0[255 + p3 - p2] <= it && abs0[255 + p2 - p1] <= it &&
-         abs0[255 + p1 - p0] <= it && abs0[255 + q3 - q2] <= it &&
-         abs0[255 + q2 - q1] <= it && abs0[255 + q1 - q0] <= it;
-}
-
-//------------------------------------------------------------------------------
-// Simple In-loop filtering (Paragraph 15.2)
-
-static void SimpleVFilter16(uint8_t* p, int stride, int thresh) {
-  int i;
-  for (i = 0; i < 16; ++i) {
-    if (needs_filter(p + i, stride, thresh)) {
-      do_filter2(p + i, stride);
-    }
-  }
-}
-
-static void SimpleHFilter16(uint8_t* p, int stride, int thresh) {
-  int i;
-  for (i = 0; i < 16; ++i) {
-    if (needs_filter(p + i * stride, 1, thresh)) {
-      do_filter2(p + i * stride, 1);
-    }
-  }
-}
-
-static void SimpleVFilter16i(uint8_t* p, int stride, int thresh) {
-  int k;
-  for (k = 3; k > 0; --k) {
-    p += 4 * stride;
-    SimpleVFilter16(p, stride, thresh);
-  }
-}
-
-static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) {
-  int k;
-  for (k = 3; k > 0; --k) {
-    p += 4;
-    SimpleHFilter16(p, stride, thresh);
-  }
-}
-
-//------------------------------------------------------------------------------
-// Complex In-loop filtering (Paragraph 15.3)
-
-static WEBP_INLINE void FilterLoop24(uint8_t* p,
-                                     int hstride, int vstride, int size,
-                                     int thresh, int ithresh, int hev_thresh) {
-  while (size-- > 0) {
-    if (needs_filter2(p, hstride, thresh, ithresh)) {
-      if (hev(p, hstride, hev_thresh)) {
-        do_filter2(p, hstride);
-      } else {
-        do_filter4(p, hstride);
-      }
-    }
-    p += vstride;
-  }
-}
-
-// on three inner edges
-static void VFilter16i(uint8_t* p, int stride,
-                       int thresh, int ithresh, int hev_thresh) {
-  int k;
-  for (k = 3; k > 0; --k) {
-    p += 4 * stride;
-    FilterLoop24(p, stride, 1, 16, thresh, ithresh, hev_thresh);
-  }
-}
-
-static void HFilter16i(uint8_t* p, int stride,
-                       int thresh, int ithresh, int hev_thresh) {
-  int k;
-  for (k = 3; k > 0; --k) {
-    p += 4;
-    FilterLoop24(p, 1, stride, 16, thresh, ithresh, hev_thresh);
-  }
-}
-
-static void VFilter8i(uint8_t* u, uint8_t* v, int stride,
-                      int thresh, int ithresh, int hev_thresh) {
-  FilterLoop24(u + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
-  FilterLoop24(v + 4 * stride, stride, 1, 8, thresh, ithresh, hev_thresh);
-}
-
-static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
-                      int thresh, int ithresh, int hev_thresh) {
-  FilterLoop24(u + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
-  FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh);
-}
-
-//------------------------------------------------------------------------------
-
-void (*VP8EncVFilter16i)(uint8_t*, int, int, int, int) = VFilter16i;
-void (*VP8EncHFilter16i)(uint8_t*, int, int, int, int) = HFilter16i;
-void (*VP8EncVFilter8i)(uint8_t*, uint8_t*, int, int, int, int) = VFilter8i;
-void (*VP8EncHFilter8i)(uint8_t*, uint8_t*, int, int, int, int) = HFilter8i;
-
-void (*VP8EncSimpleVFilter16i)(uint8_t*, int, int) = SimpleVFilter16i;
-void (*VP8EncSimpleHFilter16i)(uint8_t*, int, int) = SimpleHFilter16i;
-
 //------------------------------------------------------------------------------
 // Paragraph 15.4: compute the inner-edge filtering strength
 
@@ -266,14 +93,14 @@ static void DoFilter(const VP8EncIterator* const it, int level) {
   memcpy(y_dst, it->yuv_out_, YUV_SIZE * sizeof(uint8_t));
 
   if (enc->filter_hdr_.simple_ == 1) {   // simple
-    VP8EncSimpleHFilter16i(y_dst, BPS, limit);
-    VP8EncSimpleVFilter16i(y_dst, BPS, limit);
+    VP8SimpleHFilter16i(y_dst, BPS, limit);
+    VP8SimpleVFilter16i(y_dst, BPS, limit);
   } else {    // complex
     const int hev_thresh = (level >= 40) ? 2 : (level >= 15) ? 1 : 0;
-    VP8EncHFilter16i(y_dst, BPS, limit, ilevel, hev_thresh);
-    VP8EncHFilter8i(u_dst, v_dst, BPS, limit, ilevel, hev_thresh);
-    VP8EncVFilter16i(y_dst, BPS, limit, ilevel, hev_thresh);
-    VP8EncVFilter8i(u_dst, v_dst, BPS, limit, ilevel, hev_thresh);
+    VP8HFilter16i(y_dst, BPS, limit, ilevel, hev_thresh);
+    VP8HFilter8i(u_dst, v_dst, BPS, limit, ilevel, hev_thresh);
+    VP8VFilter16i(y_dst, BPS, limit, ilevel, hev_thresh);
+    VP8VFilter8i(u_dst, v_dst, BPS, limit, ilevel, hev_thresh);
   }
 }
 
@@ -387,7 +214,6 @@ static double GetMBSSIM(const uint8_t* yuv1, const uint8_t* yuv2) {
 void VP8InitFilter(VP8EncIterator* const it) {
   if (it->lf_stats_ != NULL) {
     int s, i;
-    InitTables();
     for (s = 0; s < NUM_MB_SEGMENTS; s++) {
       for (i = 0; i < MAX_LF_LEVELS; i++) {
         (*it->lf_stats_)[s][i] = 0;
@@ -468,4 +294,3 @@ void VP8AdjustFilterStrength(VP8EncIterator* const it) {
 }
 
 // -----------------------------------------------------------------------------
-
diff --git a/src/3rdparty/libwebp/src/enc/frame.c b/src/3rdparty/libwebp/src/enc/frame.c
index 2582244..cdf1dab 100644
--- a/src/3rdparty/libwebp/src/enc/frame.c
+++ b/src/3rdparty/libwebp/src/enc/frame.c
@@ -11,8 +11,6 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include <assert.h>
-#include <stdlib.h>
 #include <string.h>
 #include <math.h>
 
@@ -23,19 +21,6 @@
 #define SEGMENT_VISU 0
 #define DEBUG_SEARCH 0    // useful to track search convergence
 
-// On-the-fly info about the current set of residuals. Handy to avoid
-// passing zillions of params.
-typedef struct {
-  int first;
-  int last;
-  const int16_t* coeffs;
-
-  int coeff_type;
-  ProbaArray* prob;
-  StatsArray* stats;
-  CostArray*  cost;
-} VP8Residual;
-
 //------------------------------------------------------------------------------
 // multi-pass convergence
 
@@ -142,83 +127,6 @@ static int FinalizeSkipProba(VP8Encoder* const enc) {
   return size;
 }
 
-//------------------------------------------------------------------------------
-// Recording of token probabilities.
-
-static void ResetTokenStats(VP8Encoder* const enc) {
-  VP8Proba* const proba = &enc->proba_;
-  memset(proba->stats_, 0, sizeof(proba->stats_));
-}
-
-// Record proba context used
-static int Record(int bit, proba_t* const stats) {
-  proba_t p = *stats;
-  if (p >= 0xffff0000u) {               // an overflow is inbound.
-    p = ((p + 1u) >> 1) & 0x7fff7fffu;  // -> divide the stats by 2.
-  }
-  // record bit count (lower 16 bits) and increment total count (upper 16 bits).
-  p += 0x00010000u + bit;
-  *stats = p;
-  return bit;
-}
-
-// We keep the table free variant around for reference, in case.
-#define USE_LEVEL_CODE_TABLE
-
-// Simulate block coding, but only record statistics.
-// Note: no need to record the fixed probas.
-static int RecordCoeffs(int ctx, const VP8Residual* const res) {
-  int n = res->first;
-  // should be stats[VP8EncBands[n]], but it's equivalent for n=0 or 1
-  proba_t* s = res->stats[n][ctx];
-  if (res->last  < 0) {
-    Record(0, s + 0);
-    return 0;
-  }
-  while (n <= res->last) {
-    int v;
-    Record(1, s + 0);  // order of record doesn't matter
-    while ((v = res->coeffs[n++]) == 0) {
-      Record(0, s + 1);
-      s = res->stats[VP8EncBands[n]][0];
-    }
-    Record(1, s + 1);
-    if (!Record(2u < (unsigned int)(v + 1), s + 2)) {  // v = -1 or 1
-      s = res->stats[VP8EncBands[n]][1];
-    } else {
-      v = abs(v);
-#if !defined(USE_LEVEL_CODE_TABLE)
-      if (!Record(v > 4, s + 3)) {
-        if (Record(v != 2, s + 4))
-          Record(v == 4, s + 5);
-      } else if (!Record(v > 10, s + 6)) {
-        Record(v > 6, s + 7);
-      } else if (!Record((v >= 3 + (8 << 2)), s + 8)) {
-        Record((v >= 3 + (8 << 1)), s + 9);
-      } else {
-        Record((v >= 3 + (8 << 3)), s + 10);
-      }
-#else
-      if (v > MAX_VARIABLE_LEVEL)
-        v = MAX_VARIABLE_LEVEL;
-
-      {
-        const int bits = VP8LevelCodes[v - 1][1];
-        int pattern = VP8LevelCodes[v - 1][0];
-        int i;
-        for (i = 0; (pattern >>= 1) != 0; ++i) {
-          const int mask = 2 << i;
-          if (pattern & 1) Record(!!(bits & mask), s + 3 + i);
-        }
-      }
-#endif
-      s = res->stats[VP8EncBands[n]][2];
-    }
-  }
-  if (n < 16) Record(0, s + 0);
-  return 1;
-}
-
 // Collect statistics and deduce probabilities for next coding pass.
 // Return the total bit-cost for coding the probability updates.
 static int CalcTokenProba(int nb, int total) {
@@ -231,6 +139,11 @@ static int BranchCost(int nb, int total, int proba) {
   return nb * VP8BitCost(1, proba) + (total - nb) * VP8BitCost(0, proba);
 }
 
+static void ResetTokenStats(VP8Encoder* const enc) {
+  VP8Proba* const proba = &enc->proba_;
+  memset(proba->stats_, 0, sizeof(proba->stats_));
+}
+
 static int FinalizeTokenProbas(VP8Proba* const proba) {
   int has_changed = 0;
   int size = 0;
@@ -309,131 +222,6 @@ static void SetSegmentProbas(VP8Encoder* const enc) {
 }
 
 //------------------------------------------------------------------------------
-// helper functions for residuals struct VP8Residual.
-
-static void InitResidual(int first, int coeff_type,
-                         VP8Encoder* const enc, VP8Residual* const res) {
-  res->coeff_type = coeff_type;
-  res->prob  = enc->proba_.coeffs_[coeff_type];
-  res->stats = enc->proba_.stats_[coeff_type];
-  res->cost  = enc->proba_.level_cost_[coeff_type];
-  res->first = first;
-}
-
-static void SetResidualCoeffs(const int16_t* const coeffs,
-                              VP8Residual* const res) {
-  int n;
-  res->last = -1;
-  for (n = 15; n >= res->first; --n) {
-    if (coeffs[n]) {
-      res->last = n;
-      break;
-    }
-  }
-  res->coeffs = coeffs;
-}
-
-//------------------------------------------------------------------------------
-// Mode costs
-
-static int GetResidualCost(int ctx0, const VP8Residual* const res) {
-  int n = res->first;
-  // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
-  int p0 = res->prob[n][ctx0][0];
-  const uint16_t* t = res->cost[n][ctx0];
-  int cost;
-
-  if (res->last < 0) {
-    return VP8BitCost(0, p0);
-  }
-  cost = VP8BitCost(1, p0);
-  for (; n < res->last; ++n) {
-    const int v = abs(res->coeffs[n]);
-    const int b = VP8EncBands[n + 1];
-    const int ctx = (v >= 2) ? 2 : v;
-    cost += VP8LevelCost(t, v);
-    t = res->cost[b][ctx];
-    // the masking trick is faster than "if (v) cost += ..." with clang
-    cost += (v ? ~0U : 0) & VP8BitCost(1, res->prob[b][ctx][0]);
-  }
-  // Last coefficient is always non-zero
-  {
-    const int v = abs(res->coeffs[n]);
-    assert(v != 0);
-    cost += VP8LevelCost(t, v);
-    if (n < 15) {
-      const int b = VP8EncBands[n + 1];
-      const int ctx = (v == 1) ? 1 : 2;
-      const int last_p0 = res->prob[b][ctx][0];
-      cost += VP8BitCost(0, last_p0);
-    }
-  }
-  return cost;
-}
-
-int VP8GetCostLuma4(VP8EncIterator* const it, const int16_t levels[16]) {
-  const int x = (it->i4_ & 3), y = (it->i4_ >> 2);
-  VP8Residual res;
-  VP8Encoder* const enc = it->enc_;
-  int R = 0;
-  int ctx;
-
-  InitResidual(0, 3, enc, &res);
-  ctx = it->top_nz_[x] + it->left_nz_[y];
-  SetResidualCoeffs(levels, &res);
-  R += GetResidualCost(ctx, &res);
-  return R;
-}
-
-int VP8GetCostLuma16(VP8EncIterator* const it, const VP8ModeScore* const rd) {
-  VP8Residual res;
-  VP8Encoder* const enc = it->enc_;
-  int x, y;
-  int R = 0;
-
-  VP8IteratorNzToBytes(it);   // re-import the non-zero context
-
-  // DC
-  InitResidual(0, 1, enc, &res);
-  SetResidualCoeffs(rd->y_dc_levels, &res);
-  R += GetResidualCost(it->top_nz_[8] + it->left_nz_[8], &res);
-
-  // AC
-  InitResidual(1, 0, enc, &res);
-  for (y = 0; y < 4; ++y) {
-    for (x = 0; x < 4; ++x) {
-      const int ctx = it->top_nz_[x] + it->left_nz_[y];
-      SetResidualCoeffs(rd->y_ac_levels[x + y * 4], &res);
-      R += GetResidualCost(ctx, &res);
-      it->top_nz_[x] = it->left_nz_[y] = (res.last >= 0);
-    }
-  }
-  return R;
-}
-
-int VP8GetCostUV(VP8EncIterator* const it, const VP8ModeScore* const rd) {
-  VP8Residual res;
-  VP8Encoder* const enc = it->enc_;
-  int ch, x, y;
-  int R = 0;
-
-  VP8IteratorNzToBytes(it);  // re-import the non-zero context
-
-  InitResidual(0, 2, enc, &res);
-  for (ch = 0; ch <= 2; ch += 2) {
-    for (y = 0; y < 2; ++y) {
-      for (x = 0; x < 2; ++x) {
-        const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
-        SetResidualCoeffs(rd->uv_levels[ch * 2 + x + y * 2], &res);
-        R += GetResidualCost(ctx, &res);
-        it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] = (res.last >= 0);
-      }
-    }
-  }
-  return R;
-}
-
-//------------------------------------------------------------------------------
 // Coefficient coding
 
 static int PutCoeffs(VP8BitWriter* const bw, int ctx, const VP8Residual* res) {
@@ -521,32 +309,32 @@ static void CodeResiduals(VP8BitWriter* const bw, VP8EncIterator* const it,
 
   pos1 = VP8BitWriterPos(bw);
   if (i16) {
-    InitResidual(0, 1, enc, &res);
-    SetResidualCoeffs(rd->y_dc_levels, &res);
+    VP8InitResidual(0, 1, enc, &res);
+    VP8SetResidualCoeffs(rd->y_dc_levels, &res);
     it->top_nz_[8] = it->left_nz_[8] =
       PutCoeffs(bw, it->top_nz_[8] + it->left_nz_[8], &res);
-    InitResidual(1, 0, enc, &res);
+    VP8InitResidual(1, 0, enc, &res);
   } else {
-    InitResidual(0, 3, enc, &res);
+    VP8InitResidual(0, 3, enc, &res);
   }
 
   // luma-AC
   for (y = 0; y < 4; ++y) {
     for (x = 0; x < 4; ++x) {
       const int ctx = it->top_nz_[x] + it->left_nz_[y];
-      SetResidualCoeffs(rd->y_ac_levels[x + y * 4], &res);
+      VP8SetResidualCoeffs(rd->y_ac_levels[x + y * 4], &res);
       it->top_nz_[x] = it->left_nz_[y] = PutCoeffs(bw, ctx, &res);
     }
   }
   pos2 = VP8BitWriterPos(bw);
 
   // U/V
-  InitResidual(0, 2, enc, &res);
+  VP8InitResidual(0, 2, enc, &res);
   for (ch = 0; ch <= 2; ch += 2) {
     for (y = 0; y < 2; ++y) {
       for (x = 0; x < 2; ++x) {
         const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
-        SetResidualCoeffs(rd->uv_levels[ch * 2 + x + y * 2], &res);
+        VP8SetResidualCoeffs(rd->uv_levels[ch * 2 + x + y * 2], &res);
         it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] =
             PutCoeffs(bw, ctx, &res);
       }
@@ -571,33 +359,33 @@ static void RecordResiduals(VP8EncIterator* const it,
   VP8IteratorNzToBytes(it);
 
   if (it->mb_->type_ == 1) {   // i16x16
-    InitResidual(0, 1, enc, &res);
-    SetResidualCoeffs(rd->y_dc_levels, &res);
+    VP8InitResidual(0, 1, enc, &res);
+    VP8SetResidualCoeffs(rd->y_dc_levels, &res);
     it->top_nz_[8] = it->left_nz_[8] =
-      RecordCoeffs(it->top_nz_[8] + it->left_nz_[8], &res);
-    InitResidual(1, 0, enc, &res);
+      VP8RecordCoeffs(it->top_nz_[8] + it->left_nz_[8], &res);
+    VP8InitResidual(1, 0, enc, &res);
   } else {
-    InitResidual(0, 3, enc, &res);
+    VP8InitResidual(0, 3, enc, &res);
   }
 
   // luma-AC
   for (y = 0; y < 4; ++y) {
     for (x = 0; x < 4; ++x) {
       const int ctx = it->top_nz_[x] + it->left_nz_[y];
-      SetResidualCoeffs(rd->y_ac_levels[x + y * 4], &res);
-      it->top_nz_[x] = it->left_nz_[y] = RecordCoeffs(ctx, &res);
+      VP8SetResidualCoeffs(rd->y_ac_levels[x + y * 4], &res);
+      it->top_nz_[x] = it->left_nz_[y] = VP8RecordCoeffs(ctx, &res);
     }
   }
 
   // U/V
-  InitResidual(0, 2, enc, &res);
+  VP8InitResidual(0, 2, enc, &res);
   for (ch = 0; ch <= 2; ch += 2) {
     for (y = 0; y < 2; ++y) {
       for (x = 0; x < 2; ++x) {
         const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
-        SetResidualCoeffs(rd->uv_levels[ch * 2 + x + y * 2], &res);
+        VP8SetResidualCoeffs(rd->uv_levels[ch * 2 + x + y * 2], &res);
         it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] =
-            RecordCoeffs(ctx, &res);
+            VP8RecordCoeffs(ctx, &res);
       }
     }
   }
@@ -610,8 +398,8 @@ static void RecordResiduals(VP8EncIterator* const it,
 
 #if !defined(DISABLE_TOKEN_BUFFER)
 
-static void RecordTokens(VP8EncIterator* const it, const VP8ModeScore* const rd,
-                         VP8TBuffer* const tokens) {
+static int RecordTokens(VP8EncIterator* const it, const VP8ModeScore* const rd,
+                        VP8TBuffer* const tokens) {
   int x, y, ch;
   VP8Residual res;
   VP8Encoder* const enc = it->enc_;
@@ -619,44 +407,45 @@ static void RecordTokens(VP8EncIterator* const it, const VP8ModeScore* const rd,
   VP8IteratorNzToBytes(it);
   if (it->mb_->type_ == 1) {   // i16x16
     const int ctx = it->top_nz_[8] + it->left_nz_[8];
-    InitResidual(0, 1, enc, &res);
-    SetResidualCoeffs(rd->y_dc_levels, &res);
+    VP8InitResidual(0, 1, enc, &res);
+    VP8SetResidualCoeffs(rd->y_dc_levels, &res);
     it->top_nz_[8] = it->left_nz_[8] =
         VP8RecordCoeffTokens(ctx, 1,
                              res.first, res.last, res.coeffs, tokens);
-    RecordCoeffs(ctx, &res);
-    InitResidual(1, 0, enc, &res);
+    VP8RecordCoeffs(ctx, &res);
+    VP8InitResidual(1, 0, enc, &res);
   } else {
-    InitResidual(0, 3, enc, &res);
+    VP8InitResidual(0, 3, enc, &res);
   }
 
   // luma-AC
   for (y = 0; y < 4; ++y) {
     for (x = 0; x < 4; ++x) {
       const int ctx = it->top_nz_[x] + it->left_nz_[y];
-      SetResidualCoeffs(rd->y_ac_levels[x + y * 4], &res);
+      VP8SetResidualCoeffs(rd->y_ac_levels[x + y * 4], &res);
       it->top_nz_[x] = it->left_nz_[y] =
           VP8RecordCoeffTokens(ctx, res.coeff_type,
                                res.first, res.last, res.coeffs, tokens);
-      RecordCoeffs(ctx, &res);
+      VP8RecordCoeffs(ctx, &res);
     }
   }
 
   // U/V
-  InitResidual(0, 2, enc, &res);
+  VP8InitResidual(0, 2, enc, &res);
   for (ch = 0; ch <= 2; ch += 2) {
     for (y = 0; y < 2; ++y) {
       for (x = 0; x < 2; ++x) {
         const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
-        SetResidualCoeffs(rd->uv_levels[ch * 2 + x + y * 2], &res);
+        VP8SetResidualCoeffs(rd->uv_levels[ch * 2 + x + y * 2], &res);
         it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] =
             VP8RecordCoeffTokens(ctx, 2,
                                  res.first, res.last, res.coeffs, tokens);
-        RecordCoeffs(ctx, &res);
+        VP8RecordCoeffs(ctx, &res);
       }
     }
   }
   VP8IteratorBytesToNz(it);
+  return !tokens->error_;
 }
 
 #endif    // !DISABLE_TOKEN_BUFFER
@@ -719,7 +508,7 @@ static void StoreSideInfo(const VP8EncIterator* const it) {
       }
       case 7: *info = mb->alpha_; break;
       default: *info = 0; break;
-    };
+    }
   }
 #if SEGMENT_VISU  // visualize segments and prediction modes
   SetBlock(it->yuv_out_ + Y_OFF, mb->segment_ * 64, 16);
@@ -863,7 +652,10 @@ static int PreLoopInitialize(VP8Encoder* const enc) {
   for (p = 0; ok && p < enc->num_parts_; ++p) {
     ok = VP8BitWriterInit(enc->parts_ + p, bytes_per_parts);
   }
-  if (!ok) VP8EncFreeBitWriters(enc);  // malloc error occurred
+  if (!ok) {
+    VP8EncFreeBitWriters(enc);  // malloc error occurred
+    WebPEncodingSetError(enc->pic_, VP8_ENC_ERROR_OUT_OF_MEMORY);
+  }
   return ok;
 }
 
@@ -928,11 +720,6 @@ int VP8EncLoop(VP8Encoder* const enc) {
     } else {   // reset predictors after a skip
       ResetAfterSkip(&it);
     }
-#ifdef WEBP_EXPERIMENTAL_FEATURES
-    if (enc->use_layer_) {
-      VP8EncCodeLayerBlock(&it);
-    }
-#endif
     StoreSideInfo(&it);
     VP8StoreFilterStats(&it);
     VP8IteratorExport(&it);
@@ -997,14 +784,13 @@ int VP8EncTokenLoop(VP8Encoder* const enc) {
         cnt = max_count;
       }
       VP8Decimate(&it, &info, rd_opt);
-      RecordTokens(&it, &info, &enc->tokens_);
+      ok = RecordTokens(&it, &info, &enc->tokens_);
+      if (!ok) {
+        WebPEncodingSetError(enc->pic_, VP8_ENC_ERROR_OUT_OF_MEMORY);
+        break;
+      }
       size_p0 += info.H;
       distortion += info.D;
-#ifdef WEBP_EXPERIMENTAL_FEATURES
-      if (enc->use_layer_) {
-        VP8EncCodeLayerBlock(&it);
-      }
-#endif
       if (is_last_pass) {
         StoreSideInfo(&it);
         VP8StoreFilterStats(&it);
diff --git a/src/3rdparty/libwebp/src/enc/histogram.c b/src/3rdparty/libwebp/src/enc/histogram.c
index abd253b..7c6abb4 100644
--- a/src/3rdparty/libwebp/src/enc/histogram.c
+++ b/src/3rdparty/libwebp/src/enc/histogram.c
@@ -10,31 +10,64 @@
 // Author: Jyrki Alakuijala (jyrki@google.com)
 //
 #ifdef HAVE_CONFIG_H
-#include "config.h"
+#include "../webp/config.h"
 #endif
 
 #include <math.h>
-#include <stdio.h>
 
 #include "./backward_references.h"
 #include "./histogram.h"
 #include "../dsp/lossless.h"
 #include "../utils/utils.h"
 
+#define MAX_COST 1.e38
+
+// Number of partitions for the three dominant (literal, red and blue) symbol
+// costs.
+#define NUM_PARTITIONS 4
+// The size of the bin-hash corresponding to the three dominant costs.
+#define BIN_SIZE (NUM_PARTITIONS * NUM_PARTITIONS * NUM_PARTITIONS)
+
 static void HistogramClear(VP8LHistogram* const p) {
-  memset(p->literal_, 0, sizeof(p->literal_));
-  memset(p->red_, 0, sizeof(p->red_));
-  memset(p->blue_, 0, sizeof(p->blue_));
-  memset(p->alpha_, 0, sizeof(p->alpha_));
-  memset(p->distance_, 0, sizeof(p->distance_));
-  p->bit_cost_ = 0;
+  uint32_t* const literal = p->literal_;  // saved: the memset below wipes it
+  const int cache_bits = p->palette_code_bits_;  // saved for the same reason
+  const int histo_size = VP8LGetHistogramSize(cache_bits);
+  memset(p, 0, histo_size);  // assumes literal_ data directly follows *p
+  p->palette_code_bits_ = cache_bits;
+  p->literal_ = literal;
+}
+
+static void HistogramCopy(const VP8LHistogram* const src,
+                          VP8LHistogram* const dst) {
+  uint32_t* const dst_literal = dst->literal_;  // saved: memcpy overwrites it
+  const int dst_cache_bits = dst->palette_code_bits_;
+  const int histo_size = VP8LGetHistogramSize(dst_cache_bits);
+  assert(src->palette_code_bits_ == dst_cache_bits);  // sizes must match
+  memcpy(dst, src, histo_size);  // assumes literal_ data follows each struct
+  dst->literal_ = dst_literal;   // point dst back at its own storage
+}
+
+int VP8LGetHistogramSize(int cache_bits) {
+  const int literal_size = VP8LHistogramNumCodes(cache_bits);  // entry count
+  const size_t total_size = sizeof(VP8LHistogram) + sizeof(int) * literal_size;
+  assert(total_size <= (size_t)0x7fffffff);  // must fit the int return value
+  return (int)total_size;  // bytes for the struct plus its literal_ storage
+}
+
+void VP8LFreeHistogram(VP8LHistogram* const histo) {
+  WebPSafeFree(histo);  // struct and literal_ data share one allocation
+}
+
+void VP8LFreeHistogramSet(VP8LHistogramSet* const histo) {
+  WebPSafeFree(histo);  // set, pointer table and histograms: one allocation
+}
 
 void VP8LHistogramStoreRefs(const VP8LBackwardRefs* const refs,
                             VP8LHistogram* const histo) {
-  int i;
-  for (i = 0; i < refs->size; ++i) {
-    VP8LHistogramAddSinglePixOrCopy(histo, &refs->refs[i]);
+  VP8LRefsCursor c = VP8LRefsCursorInit(refs);
+  while (VP8LRefsCursorOk(&c)) {
+    VP8LHistogramAddSinglePixOrCopy(histo, c.cur_pos);
+    VP8LRefsCursorNext(&c);
   }
 }
 
@@ -53,13 +86,24 @@ void VP8LHistogramInit(VP8LHistogram* const p, int palette_code_bits) {
   HistogramClear(p);
 }
 
+VP8LHistogram* VP8LAllocateHistogram(int cache_bits) {
+  VP8LHistogram* histo = NULL;
+  const int total_size = VP8LGetHistogramSize(cache_bits);
+  uint8_t* const memory = (uint8_t*)WebPSafeMalloc(total_size, sizeof(*memory));
+  if (memory == NULL) return NULL;  // allocation failed
+  histo = (VP8LHistogram*)memory;
+  // literal_ lives right after the struct; it won't necessarily be aligned.
+  histo->literal_ = (uint32_t*)(memory + sizeof(VP8LHistogram));
+  VP8LHistogramInit(histo, cache_bits);
+  return histo;  // release with VP8LFreeHistogram()
+}
+
 VP8LHistogramSet* VP8LAllocateHistogramSet(int size, int cache_bits) {
   int i;
   VP8LHistogramSet* set;
-  VP8LHistogram* bulk;
-  const uint64_t total_size = sizeof(*set)
-                            + (uint64_t)size * sizeof(*set->histograms)
-                            + (uint64_t)size * sizeof(**set->histograms);
+  const size_t total_size = sizeof(*set)
+                            + sizeof(*set->histograms) * size
+                            + (size_t)VP8LGetHistogramSize(cache_bits) * size;
   uint8_t* memory = (uint8_t*)WebPSafeMalloc(total_size, sizeof(*memory));
   if (memory == NULL) return NULL;
 
@@ -67,12 +111,15 @@ VP8LHistogramSet* VP8LAllocateHistogramSet(int size, int cache_bits) {
   memory += sizeof(*set);
   set->histograms = (VP8LHistogram**)memory;
   memory += size * sizeof(*set->histograms);
-  bulk = (VP8LHistogram*)memory;
   set->max_size = size;
   set->size = size;
   for (i = 0; i < size; ++i) {
-    set->histograms[i] = bulk + i;
+    set->histograms[i] = (VP8LHistogram*)memory;
+    // literal_ won't necessarily be aligned.
+    set->histograms[i]->literal_ = (uint32_t*)(memory + sizeof(VP8LHistogram));
     VP8LHistogramInit(set->histograms[i], cache_bits);
+    // There's no padding/alignment between successive histograms.
+    memory += VP8LGetHistogramSize(cache_bits);
   }
   return set;
 }
@@ -87,36 +134,21 @@ void VP8LHistogramAddSinglePixOrCopy(VP8LHistogram* const histo,
     ++histo->literal_[PixOrCopyLiteral(v, 1)];
     ++histo->blue_[PixOrCopyLiteral(v, 0)];
   } else if (PixOrCopyIsCacheIdx(v)) {
-    int literal_ix = 256 + NUM_LENGTH_CODES + PixOrCopyCacheIdx(v);
+    const int literal_ix =
+        NUM_LITERAL_CODES + NUM_LENGTH_CODES + PixOrCopyCacheIdx(v);
     ++histo->literal_[literal_ix];
   } else {
     int code, extra_bits;
     VP8LPrefixEncodeBits(PixOrCopyLength(v), &code, &extra_bits);
-    ++histo->literal_[256 + code];
+    ++histo->literal_[NUM_LITERAL_CODES + code];
     VP8LPrefixEncodeBits(PixOrCopyDistance(v), &code, &extra_bits);
     ++histo->distance_[code];
   }
 }
 
-static double BitsEntropy(const int* const array, int n) {
-  double retval = 0.;
-  int sum = 0;
-  int nonzeros = 0;
-  int max_val = 0;
-  int i;
+static WEBP_INLINE double BitsEntropyRefine(int nonzeros, int sum, int max_val,
+                                            double retval) {
   double mix;
-  for (i = 0; i < n; ++i) {
-    if (array[i] != 0) {
-      sum += array[i];
-      ++nonzeros;
-      retval -= VP8LFastSLog2(array[i]);
-      if (max_val < array[i]) {
-        max_val = array[i];
-      }
-    }
-  }
-  retval += VP8LFastSLog2(sum);
-
   if (nonzeros < 5) {
     if (nonzeros <= 1) {
       return 0;
@@ -147,95 +179,142 @@ static double BitsEntropy(const int* const array, int n) {
   }
 }
 
-// Returns the cost encode the rle-encoded entropy code.
-// The constants in this function are experimental.
-static double HuffmanCost(const int* const population, int length) {
-  // Small bias because Huffman code length is typically not stored in
-  // full length.
-  static const int kHuffmanCodeOfHuffmanCodeSize = CODE_LENGTH_CODES * 3;
-  static const double kSmallBias = 9.1;
-  double retval = kHuffmanCodeOfHuffmanCodeSize - kSmallBias;
-  int streak = 0;
-  int i = 0;
-  for (; i < length - 1; ++i) {
-    ++streak;
-    if (population[i] == population[i + 1]) {
-      continue;
-    }
- last_streak_hack:
-    // population[i] points now to the symbol in the streak of same values.
-    if (streak > 3) {
-      if (population[i] == 0) {
-        retval += 1.5625 + 0.234375 * streak;
-      } else {
-        retval += 2.578125 + 0.703125 * streak;
-      }
-    } else {
-      if (population[i] == 0) {
-        retval += 1.796875 * streak;
-      } else {
-        retval += 3.28125 * streak;
+static double BitsEntropy(const uint32_t* const array, int n) {
+  double retval = 0.;
+  uint32_t sum = 0;
+  int nonzeros = 0;
+  uint32_t max_val = 0;
+  int i;
+  for (i = 0; i < n; ++i) {
+    if (array[i] != 0) {
+      sum += array[i];
+      ++nonzeros;
+      retval -= VP8LFastSLog2(array[i]);
+      if (max_val < array[i]) {
+        max_val = array[i];
       }
     }
-    streak = 0;
   }
-  if (i == length - 1) {
-    ++streak;
-    goto last_streak_hack;
+  retval += VP8LFastSLog2(sum);
+  return BitsEntropyRefine(nonzeros, sum, max_val, retval);
+}
+
+static double BitsEntropyCombined(const uint32_t* const X,
+                                  const uint32_t* const Y, int n) {
+  double retval = 0.;
+  int sum = 0;
+  int nonzeros = 0;
+  int max_val = 0;
+  int i;
+  for (i = 0; i < n; ++i) {
+    const int xy = X[i] + Y[i];
+    if (xy != 0) {
+      sum += xy;
+      ++nonzeros;
+      retval -= VP8LFastSLog2(xy);
+      if (max_val < xy) {
+        max_val = xy;
+      }
+    }
   }
+  retval += VP8LFastSLog2(sum);
+  return BitsEntropyRefine(nonzeros, sum, max_val, retval);
+}
+
+static double InitialHuffmanCost(void) {
+  // Small bias because Huffman code length is typically not stored in
+  // full length.
+  static const int kHuffmanCodeOfHuffmanCodeSize = CODE_LENGTH_CODES * 3;
+  static const double kSmallBias = 9.1;
+  return kHuffmanCodeOfHuffmanCodeSize - kSmallBias;
+}
+
+// Finalize the Huffman cost based on streak numbers and length type (<3 or >=3)
+static double FinalHuffmanCost(const VP8LStreaks* const stats) {
+  double retval = InitialHuffmanCost();
+  retval += stats->counts[0] * 1.5625 + 0.234375 * stats->streaks[0][1];
+  retval += stats->counts[1] * 2.578125 + 0.703125 * stats->streaks[1][1];
+  retval += 1.796875 * stats->streaks[0][0];
+  retval += 3.28125 * stats->streaks[1][0];
   return retval;
 }
 
-static double PopulationCost(const int* const population, int length) {
+// Trampolines
+static double HuffmanCost(const uint32_t* const population, int length) {
+  const VP8LStreaks stats = VP8LHuffmanCostCount(population, length);
+  return FinalHuffmanCost(&stats);
+}
+
+static double HuffmanCostCombined(const uint32_t* const X,
+                                  const uint32_t* const Y, int length) {
+  const VP8LStreaks stats = VP8LHuffmanCostCombinedCount(X, Y, length);
+  return FinalHuffmanCost(&stats);
+}
+
+// Aggregated costs
+static double PopulationCost(const uint32_t* const population, int length) {
   return BitsEntropy(population, length) + HuffmanCost(population, length);
 }
 
-static double ExtraCost(const int* const population, int length) {
-  int i;
-  double cost = 0.;
-  for (i = 2; i < length - 2; ++i) cost += (i >> 1) * population[i + 2];
-  return cost;
+static double GetCombinedEntropy(const uint32_t* const X,
+                                 const uint32_t* const Y, int length) {
+  return BitsEntropyCombined(X, Y, length) + HuffmanCostCombined(X, Y, length);
 }
 
 // Estimates the Entropy + Huffman + other block overhead size cost.
 double VP8LHistogramEstimateBits(const VP8LHistogram* const p) {
-  return PopulationCost(p->literal_, VP8LHistogramNumCodes(p))
-       + PopulationCost(p->red_, 256)
-       + PopulationCost(p->blue_, 256)
-       + PopulationCost(p->alpha_, 256)
-       + PopulationCost(p->distance_, NUM_DISTANCE_CODES)
-       + ExtraCost(p->literal_ + 256, NUM_LENGTH_CODES)
-       + ExtraCost(p->distance_, NUM_DISTANCE_CODES);
+  return
+      PopulationCost(p->literal_, VP8LHistogramNumCodes(p->palette_code_bits_))
+      + PopulationCost(p->red_, NUM_LITERAL_CODES)
+      + PopulationCost(p->blue_, NUM_LITERAL_CODES)
+      + PopulationCost(p->alpha_, NUM_LITERAL_CODES)
+      + PopulationCost(p->distance_, NUM_DISTANCE_CODES)
+      + VP8LExtraCost(p->literal_ + NUM_LITERAL_CODES, NUM_LENGTH_CODES)
+      + VP8LExtraCost(p->distance_, NUM_DISTANCE_CODES);
 }
 
 double VP8LHistogramEstimateBitsBulk(const VP8LHistogram* const p) {
-  return BitsEntropy(p->literal_, VP8LHistogramNumCodes(p))
-       + BitsEntropy(p->red_, 256)
-       + BitsEntropy(p->blue_, 256)
-       + BitsEntropy(p->alpha_, 256)
-       + BitsEntropy(p->distance_, NUM_DISTANCE_CODES)
-       + ExtraCost(p->literal_ + 256, NUM_LENGTH_CODES)
-       + ExtraCost(p->distance_, NUM_DISTANCE_CODES);
+  return
+      BitsEntropy(p->literal_, VP8LHistogramNumCodes(p->palette_code_bits_))
+      + BitsEntropy(p->red_, NUM_LITERAL_CODES)
+      + BitsEntropy(p->blue_, NUM_LITERAL_CODES)
+      + BitsEntropy(p->alpha_, NUM_LITERAL_CODES)
+      + BitsEntropy(p->distance_, NUM_DISTANCE_CODES)
+      + VP8LExtraCost(p->literal_ + NUM_LITERAL_CODES, NUM_LENGTH_CODES)
+      + VP8LExtraCost(p->distance_, NUM_DISTANCE_CODES);
 }
 
 // -----------------------------------------------------------------------------
 // Various histogram combine/cost-eval functions
 
-// Adds 'in' histogram to 'out'
-static void HistogramAdd(const VP8LHistogram* const in,
-                         VP8LHistogram* const out) {
-  int i;
-  for (i = 0; i < PIX_OR_COPY_CODES_MAX; ++i) {
-    out->literal_[i] += in->literal_[i];
-  }
-  for (i = 0; i < NUM_DISTANCE_CODES; ++i) {
-    out->distance_[i] += in->distance_[i];
-  }
-  for (i = 0; i < 256; ++i) {
-    out->red_[i] += in->red_[i];
-    out->blue_[i] += in->blue_[i];
-    out->alpha_[i] += in->alpha_[i];
-  }
+static int GetCombinedHistogramEntropy(const VP8LHistogram* const a,
+                                       const VP8LHistogram* const b,
+                                       double cost_threshold,
+                                       double* cost) {
+  const int palette_code_bits = a->palette_code_bits_;
+  assert(a->palette_code_bits_ == b->palette_code_bits_);
+  *cost += GetCombinedEntropy(a->literal_, b->literal_,
+                              VP8LHistogramNumCodes(palette_code_bits));
+  *cost += VP8LExtraCostCombined(a->literal_ + NUM_LITERAL_CODES,
+                                 b->literal_ + NUM_LITERAL_CODES,
+                                 NUM_LENGTH_CODES);
+  if (*cost > cost_threshold) return 0;
+
+  *cost += GetCombinedEntropy(a->red_, b->red_, NUM_LITERAL_CODES);
+  if (*cost > cost_threshold) return 0;
+
+  *cost += GetCombinedEntropy(a->blue_, b->blue_, NUM_LITERAL_CODES);
+  if (*cost > cost_threshold) return 0;
+
+  *cost += GetCombinedEntropy(a->alpha_, b->alpha_, NUM_LITERAL_CODES);
+  if (*cost > cost_threshold) return 0;
+
+  *cost += GetCombinedEntropy(a->distance_, b->distance_, NUM_DISTANCE_CODES);
+  *cost += VP8LExtraCostCombined(a->distance_, b->distance_,
+                                 NUM_DISTANCE_CODES);
+  if (*cost > cost_threshold) return 0;
+
+  return 1;
 }
 
 // Performs out = a + b, computing the cost C(a+b) - C(a) - C(b) while comparing
@@ -250,41 +329,14 @@ static double HistogramAddEval(const VP8LHistogram* const a,
                                double cost_threshold) {
   double cost = 0;
   const double sum_cost = a->bit_cost_ + b->bit_cost_;
-  int i;
-
   cost_threshold += sum_cost;
 
-  // palette_code_bits_ is part of the cost evaluation for literal_.
-  // TODO(skal): remove/simplify this palette_code_bits_?
-  out->palette_code_bits_ =
-      (a->palette_code_bits_ > b->palette_code_bits_) ? a->palette_code_bits_ :
-                                                        b->palette_code_bits_;
-  for (i = 0; i < PIX_OR_COPY_CODES_MAX; ++i) {
-    out->literal_[i] = a->literal_[i] + b->literal_[i];
-  }
-  cost += PopulationCost(out->literal_, VP8LHistogramNumCodes(out));
-  cost += ExtraCost(out->literal_ + 256, NUM_LENGTH_CODES);
-  if (cost > cost_threshold) return cost;
-
-  for (i = 0; i < 256; ++i) out->red_[i] = a->red_[i] + b->red_[i];
-  cost += PopulationCost(out->red_, 256);
-  if (cost > cost_threshold) return cost;
-
-  for (i = 0; i < 256; ++i) out->blue_[i] = a->blue_[i] + b->blue_[i];
-  cost += PopulationCost(out->blue_, 256);
-  if (cost > cost_threshold) return cost;
-
-  for (i = 0; i < NUM_DISTANCE_CODES; ++i) {
-    out->distance_[i] = a->distance_[i] + b->distance_[i];
+  if (GetCombinedHistogramEntropy(a, b, cost_threshold, &cost)) {
+    VP8LHistogramAdd(a, b, out);
+    out->bit_cost_ = cost;
+    out->palette_code_bits_ = a->palette_code_bits_;
   }
-  cost += PopulationCost(out->distance_, NUM_DISTANCE_CODES);
-  cost += ExtraCost(out->distance_, NUM_DISTANCE_CODES);
-  if (cost > cost_threshold) return cost;
 
-  for (i = 0; i < 256; ++i) out->alpha_[i] = a->alpha_[i] + b->alpha_[i];
-  cost += PopulationCost(out->alpha_, 256);
-
-  out->bit_cost_ = cost;
   return cost - sum_cost;
 }
 
@@ -294,52 +346,92 @@ static double HistogramAddEval(const VP8LHistogram* const a,
 static double HistogramAddThresh(const VP8LHistogram* const a,
                                  const VP8LHistogram* const b,
                                  double cost_threshold) {
-  int tmp[PIX_OR_COPY_CODES_MAX];  // <= max storage we'll need
-  int i;
   double cost = -a->bit_cost_;
+  GetCombinedHistogramEntropy(a, b, cost_threshold, &cost);
+  return cost;
+}
 
-  for (i = 0; i < PIX_OR_COPY_CODES_MAX; ++i) {
-    tmp[i] = a->literal_[i] + b->literal_[i];
-  }
-  // note that the tests are ordered so that the usually largest
-  // cost shares come first.
-  cost += PopulationCost(tmp, VP8LHistogramNumCodes(a));
-  cost += ExtraCost(tmp + 256, NUM_LENGTH_CODES);
-  if (cost > cost_threshold) return cost;
-
-  for (i = 0; i < 256; ++i) tmp[i] = a->red_[i] + b->red_[i];
-  cost += PopulationCost(tmp, 256);
-  if (cost > cost_threshold) return cost;
-
-  for (i = 0; i < 256; ++i) tmp[i] = a->blue_[i] + b->blue_[i];
-  cost += PopulationCost(tmp, 256);
-  if (cost > cost_threshold) return cost;
-
-  for (i = 0; i < NUM_DISTANCE_CODES; ++i) {
-    tmp[i] = a->distance_[i] + b->distance_[i];
-  }
-  cost += PopulationCost(tmp, NUM_DISTANCE_CODES);
-  cost += ExtraCost(tmp, NUM_DISTANCE_CODES);
-  if (cost > cost_threshold) return cost;
+// -----------------------------------------------------------------------------
 
-  for (i = 0; i < 256; ++i) tmp[i] = a->alpha_[i] + b->alpha_[i];
-  cost += PopulationCost(tmp, 256);
+// The structure to keep track of cost range for the three dominant entropy
+// symbols.
+// TODO(skal): Evaluate if float can be used here instead of double for
+// representing the entropy costs.
+typedef struct {
+  double literal_max_;
+  double literal_min_;
+  double red_max_;
+  double red_min_;
+  double blue_max_;
+  double blue_min_;
+} DominantCostRange;
+
+static void DominantCostRangeInit(DominantCostRange* const c) {
+  c->literal_max_ = 0.;
+  c->literal_min_ = MAX_COST;
+  c->red_max_ = 0.;
+  c->red_min_ = MAX_COST;
+  c->blue_max_ = 0.;
+  c->blue_min_ = MAX_COST;
+}
 
-  return cost;
+static void UpdateDominantCostRange(
+    const VP8LHistogram* const h, DominantCostRange* const c) {
+  if (c->literal_max_ < h->literal_cost_) c->literal_max_ = h->literal_cost_;
+  if (c->literal_min_ > h->literal_cost_) c->literal_min_ = h->literal_cost_;
+  if (c->red_max_ < h->red_cost_) c->red_max_ = h->red_cost_;
+  if (c->red_min_ > h->red_cost_) c->red_min_ = h->red_cost_;
+  if (c->blue_max_ < h->blue_cost_) c->blue_max_ = h->blue_cost_;
+  if (c->blue_min_ > h->blue_cost_) c->blue_min_ = h->blue_cost_;
 }
 
-// -----------------------------------------------------------------------------
+static void UpdateHistogramCost(VP8LHistogram* const h) {
+  const double alpha_cost = PopulationCost(h->alpha_, NUM_LITERAL_CODES);
+  const double distance_cost =
+      PopulationCost(h->distance_, NUM_DISTANCE_CODES) +
+      VP8LExtraCost(h->distance_, NUM_DISTANCE_CODES);
+  const int num_codes = VP8LHistogramNumCodes(h->palette_code_bits_);
+  h->literal_cost_ = PopulationCost(h->literal_, num_codes) +
+                     VP8LExtraCost(h->literal_ + NUM_LITERAL_CODES,
+                                   NUM_LENGTH_CODES);
+  h->red_cost_ = PopulationCost(h->red_, NUM_LITERAL_CODES);
+  h->blue_cost_ = PopulationCost(h->blue_, NUM_LITERAL_CODES);
+  h->bit_cost_ = h->literal_cost_ + h->red_cost_ + h->blue_cost_ +
+                 alpha_cost + distance_cost;
+}
 
-static void HistogramBuildImage(int xsize, int histo_bits,
-                                const VP8LBackwardRefs* const backward_refs,
-                                VP8LHistogramSet* const image) {
-  int i;
+static int GetBinIdForEntropy(double min, double max, double val) {
+  const double range = max - min + 1e-6;
+  const double delta = val - min;
+  return (int)(NUM_PARTITIONS * delta / range);
+}
+
+// TODO(vikasa): Evaluate, if there's any correlation between red & blue.
+static int GetHistoBinIndex(
+    const VP8LHistogram* const h, const DominantCostRange* const c) {
+  const int bin_id =
+      GetBinIdForEntropy(c->blue_min_, c->blue_max_, h->blue_cost_) +
+      NUM_PARTITIONS * GetBinIdForEntropy(c->red_min_, c->red_max_,
+                                          h->red_cost_) +
+      NUM_PARTITIONS * NUM_PARTITIONS * GetBinIdForEntropy(c->literal_min_,
+                                                           c->literal_max_,
+                                                           h->literal_cost_);
+  assert(bin_id < BIN_SIZE);
+  return bin_id;
+}
+
+// Construct the histograms from backward references.
+static void HistogramBuild(
+    int xsize, int histo_bits, const VP8LBackwardRefs* const backward_refs,
+    VP8LHistogramSet* const image_histo) {
   int x = 0, y = 0;
   const int histo_xsize = VP8LSubSampleSize(xsize, histo_bits);
-  VP8LHistogram** const histograms = image->histograms;
+  VP8LHistogram** const histograms = image_histo->histograms;
+  VP8LRefsCursor c = VP8LRefsCursorInit(backward_refs);
   assert(histo_bits > 0);
-  for (i = 0; i < backward_refs->size; ++i) {
-    const PixOrCopy* const v = &backward_refs->refs[i];
+  // Construct the histograms from the given backward references.
+  while (VP8LRefsCursorOk(&c)) {
+    const PixOrCopy* const v = c.cur_pos;
     const int ix = (y >> histo_bits) * histo_xsize + (x >> histo_bits);
     VP8LHistogramAddSinglePixOrCopy(histograms[ix], v);
     x += PixOrCopyLength(v);
@@ -347,9 +439,119 @@ static void HistogramBuildImage(int xsize, int histo_bits,
       x -= xsize;
       ++y;
     }
+    VP8LRefsCursorNext(&c);
   }
 }
 
+// Copies the histograms and computes their bit_cost.
+static void HistogramCopyAndAnalyze(
+    VP8LHistogramSet* const orig_histo, VP8LHistogramSet* const image_histo) {
+  int i;
+  const int histo_size = orig_histo->size;
+  VP8LHistogram** const orig_histograms = orig_histo->histograms;
+  VP8LHistogram** const histograms = image_histo->histograms;
+  for (i = 0; i < histo_size; ++i) {
+    VP8LHistogram* const histo = orig_histograms[i];
+    UpdateHistogramCost(histo);
+    // Copy histograms from orig_histo[] to image_histo[].
+    HistogramCopy(histo, histograms[i]);
+  }
+}
+
+// Partition histograms into different entropy bins based on their three
+// dominant (literal, red and blue) symbol costs, recording them in bin_map.
+static void HistogramAnalyzeEntropyBin(
+    VP8LHistogramSet* const image_histo, int16_t* const bin_map) {
+  int i;
+  VP8LHistogram** const histograms = image_histo->histograms;
+  const int histo_size = image_histo->size;
+  const int bin_depth = histo_size + 1;
+  DominantCostRange cost_range;
+  DominantCostRangeInit(&cost_range);
+
+  // Analyze the dominant (literal, red and blue) entropy costs.
+  for (i = 0; i < histo_size; ++i) {
+    VP8LHistogram* const histo = histograms[i];
+    UpdateDominantCostRange(histo, &cost_range);
+  }
+
+  // bin-hash histograms on three of the dominant (literal, red and blue)
+  // symbol costs.
+  for (i = 0; i < histo_size; ++i) {
+    int num_histos;
+    VP8LHistogram* const histo = histograms[i];
+    const int16_t bin_id = (int16_t)GetHistoBinIndex(histo, &cost_range);
+    const int bin_offset = bin_id * bin_depth;
+    // bin_map[n][0] for every bin 'n' maintains the counter for the number of
+    // histograms in that bin.
+    // Get and increment the num_histos in that bin.
+    num_histos = ++bin_map[bin_offset];
+    assert(bin_offset + num_histos < bin_depth * BIN_SIZE);
+    // Store histogram index i at the num_histos (last) position of this bin.
+    bin_map[bin_offset + num_histos] = i;
+  }
+}
+
+// Compact the histogram set by moving the valid ones left in the set to the
+// head and moving the ones that have been merged into other histograms towards
+// the end.
+// TODO(vikasa): Evaluate if this method can be avoided by altering the code
+// logic of HistogramCombineEntropyBin main loop.
+static void HistogramCompactBins(VP8LHistogramSet* const image_histo) {
+  int start = 0;
+  int end = image_histo->size - 1;
+  VP8LHistogram** const histograms = image_histo->histograms;
+  while (start < end) {
+    while (start <= end && histograms[start] != NULL &&
+           histograms[start]->bit_cost_ != 0.) {
+      ++start;
+    }
+    while (start <= end && histograms[end]->bit_cost_ == 0.) {
+      histograms[end] = NULL;
+      --end;
+    }
+    if (start < end) {
+      assert(histograms[start] != NULL);
+      assert(histograms[end] != NULL);
+      HistogramCopy(histograms[end], histograms[start]);
+      histograms[end] = NULL;
+      --end;
+    }
+  }
+  image_histo->size = end + 1;
+}
+
+static void HistogramCombineEntropyBin(VP8LHistogramSet* const image_histo,
+                                       VP8LHistogram* const histos,
+                                       int16_t* const bin_map, int bin_depth,
+                                       double combine_cost_factor) {
+  int bin_id;
+  VP8LHistogram* cur_combo = histos;
+  VP8LHistogram** const histograms = image_histo->histograms;
+
+  for (bin_id = 0; bin_id < BIN_SIZE; ++bin_id) {
+    const int bin_offset = bin_id * bin_depth;
+    const int num_histos = bin_map[bin_offset];
+    const int idx1 = bin_map[bin_offset + 1];
+    int n;
+    for (n = 2; n <= num_histos; ++n) {
+      const int idx2 = bin_map[bin_offset + n];
+      const double bit_cost_idx2 = histograms[idx2]->bit_cost_;
+      if (bit_cost_idx2 > 0.) {
+        const double bit_cost_thresh = -bit_cost_idx2 * combine_cost_factor;
+        const double curr_cost_diff =
+            HistogramAddEval(histograms[idx1], histograms[idx2],
+                             cur_combo, bit_cost_thresh);
+        if (curr_cost_diff < bit_cost_thresh) {
+          HistogramCopy(cur_combo, histograms[idx1]);
+          histograms[idx2]->bit_cost_ = 0.;
+        }
+      }
+    }
+  }
+  HistogramCompactBins(image_histo);
+}
+
 static uint32_t MyRand(uint32_t *seed) {
   *seed *= 16807U;
   if (*seed == 0) {
@@ -358,48 +560,45 @@ static uint32_t MyRand(uint32_t *seed) {
   return *seed;
 }
 
-static int HistogramCombine(const VP8LHistogramSet* const in,
-                            VP8LHistogramSet* const out, int iter_mult,
-                            int num_pairs, int num_tries_no_success) {
-  int ok = 0;
-  int i, iter;
+static void HistogramCombine(VP8LHistogramSet* const image_histo,
+                             VP8LHistogramSet* const histos, int quality) {
+  int iter;
   uint32_t seed = 0;
   int tries_with_no_success = 0;
-  int out_size = in->size;
-  const int outer_iters = in->size * iter_mult;
+  int image_histo_size = image_histo->size;
+  const int iter_mult = (quality < 25) ? 2 : 2 + (quality - 25) / 8;
+  const int outer_iters = image_histo_size * iter_mult;
+  const int num_pairs = image_histo_size / 2;
+  const int num_tries_no_success = outer_iters / 2;
   const int min_cluster_size = 2;
-  VP8LHistogram* const histos = (VP8LHistogram*)malloc(2 * sizeof(*histos));
-  VP8LHistogram* cur_combo = histos + 0;    // trial merged histogram
-  VP8LHistogram* best_combo = histos + 1;   // best merged histogram so far
-  if (histos == NULL) goto End;
-
-  // Copy histograms from in[] to out[].
-  assert(in->size <= out->size);
-  for (i = 0; i < in->size; ++i) {
-    in->histograms[i]->bit_cost_ = VP8LHistogramEstimateBits(in->histograms[i]);
-    *out->histograms[i] = *in->histograms[i];
-  }
-
-  // Collapse similar histograms in 'out'.
-  for (iter = 0; iter < outer_iters && out_size >= min_cluster_size; ++iter) {
+  VP8LHistogram** const histograms = image_histo->histograms;
+  VP8LHistogram* cur_combo = histos->histograms[0];   // trial histogram
+  VP8LHistogram* best_combo = histos->histograms[1];  // best histogram so far
+
+  // Collapse similar histograms in 'image_histo'.
+  for (iter = 0;
+       iter < outer_iters && image_histo_size >= min_cluster_size;
+       ++iter) {
     double best_cost_diff = 0.;
     int best_idx1 = -1, best_idx2 = 1;
     int j;
-    const int num_tries = (num_pairs < out_size) ? num_pairs : out_size;
+    const int num_tries =
+        (num_pairs < image_histo_size) ? num_pairs : image_histo_size;
     seed += iter;
     for (j = 0; j < num_tries; ++j) {
       double curr_cost_diff;
       // Choose two histograms at random and try to combine them.
-      const uint32_t idx1 = MyRand(&seed) % out_size;
+      const uint32_t idx1 = MyRand(&seed) % image_histo_size;
       const uint32_t tmp = (j & 7) + 1;
-      const uint32_t diff = (tmp < 3) ? tmp : MyRand(&seed) % (out_size - 1);
-      const uint32_t idx2 = (idx1 + diff + 1) % out_size;
+      const uint32_t diff =
+          (tmp < 3) ? tmp : MyRand(&seed) % (image_histo_size - 1);
+      const uint32_t idx2 = (idx1 + diff + 1) % image_histo_size;
       if (idx1 == idx2) {
         continue;
       }
+
       // Calculate cost reduction on combining.
-      curr_cost_diff = HistogramAddEval(out->histograms[idx1],
-                                        out->histograms[idx2],
+      curr_cost_diff = HistogramAddEval(histograms[idx1], histograms[idx2],
                                         cur_combo, best_cost_diff);
       if (curr_cost_diff < best_cost_diff) {    // found a better pair?
         {     // swap cur/best combo histograms
@@ -414,12 +613,12 @@ static int HistogramCombine(const VP8LHistogramSet* const in,
     }
 
     if (best_idx1 >= 0) {
-      *out->histograms[best_idx1] = *best_combo;
+      HistogramCopy(best_combo, histograms[best_idx1]);
       // swap best_idx2 slot with last one (which is now unused)
-      --out_size;
-      if (best_idx2 != out_size) {
-        out->histograms[best_idx2] = out->histograms[out_size];
-        out->histograms[out_size] = NULL;   // just for sanity check.
+      --image_histo_size;
+      if (best_idx2 != image_histo_size) {
+        HistogramCopy(histograms[image_histo_size], histograms[best_idx2]);
+        histograms[image_histo_size] = NULL;
       }
       tries_with_no_success = 0;
     }
@@ -427,38 +626,28 @@ static int HistogramCombine(const VP8LHistogramSet* const in,
       break;
     }
   }
-  out->size = out_size;
-  ok = 1;
-
- End:
-  free(histos);
-  return ok;
+  image_histo->size = image_histo_size;
 }
 
 // -----------------------------------------------------------------------------
 // Histogram refinement
 
-// What is the bit cost of moving square_histogram from cur_symbol to candidate.
-static double HistogramDistance(const VP8LHistogram* const square_histogram,
-                                const VP8LHistogram* const candidate,
-                                double cost_threshold) {
-  return HistogramAddThresh(candidate, square_histogram, cost_threshold);
-}
-
 // Find the best 'out' histogram for each of the 'in' histograms.
 // Note: we assume that out[]->bit_cost_ is already up-to-date.
-static void HistogramRemap(const VP8LHistogramSet* const in,
-                           const VP8LHistogramSet* const out,
+static void HistogramRemap(const VP8LHistogramSet* const orig_histo,
+                           const VP8LHistogramSet* const image_histo,
                            uint16_t* const symbols) {
   int i;
-  for (i = 0; i < in->size; ++i) {
+  VP8LHistogram** const orig_histograms = orig_histo->histograms;
+  VP8LHistogram** const histograms = image_histo->histograms;
+  for (i = 0; i < orig_histo->size; ++i) {
     int best_out = 0;
     double best_bits =
-        HistogramDistance(in->histograms[i], out->histograms[0], 1.e38);
+        HistogramAddThresh(histograms[0], orig_histograms[i], MAX_COST);
     int k;
-    for (k = 1; k < out->size; ++k) {
+    for (k = 1; k < image_histo->size; ++k) {
       const double cur_bits =
-          HistogramDistance(in->histograms[i], out->histograms[k], best_bits);
+          HistogramAddThresh(histograms[k], orig_histograms[i], best_bits);
       if (cur_bits < best_bits) {
         best_bits = cur_bits;
         best_out = k;
@@ -468,45 +657,85 @@ static void HistogramRemap(const VP8LHistogramSet* const in,
   }
 
   // Recompute each out based on raw and symbols.
-  for (i = 0; i < out->size; ++i) {
-    HistogramClear(out->histograms[i]);
+  for (i = 0; i < image_histo->size; ++i) {
+    HistogramClear(histograms[i]);
   }
-  for (i = 0; i < in->size; ++i) {
-    HistogramAdd(in->histograms[i], out->histograms[symbols[i]]);
+
+  for (i = 0; i < orig_histo->size; ++i) {
+    const int idx = symbols[i];
+    VP8LHistogramAdd(orig_histograms[i], histograms[idx], histograms[idx]);
   }
 }
 
+static double GetCombineCostFactor(int histo_size, int quality) {
+  double combine_cost_factor = 0.16;
+  if (histo_size > 256) combine_cost_factor /= 2.;
+  if (histo_size > 512) combine_cost_factor /= 2.;
+  if (histo_size > 1024) combine_cost_factor /= 2.;
+  if (quality <= 50) combine_cost_factor /= 2.;
+  return combine_cost_factor;
+}
+
 int VP8LGetHistoImageSymbols(int xsize, int ysize,
                              const VP8LBackwardRefs* const refs,
                              int quality, int histo_bits, int cache_bits,
-                             VP8LHistogramSet* const image_in,
+                             VP8LHistogramSet* const image_histo,
                              uint16_t* const histogram_symbols) {
   int ok = 0;
   const int histo_xsize = histo_bits ? VP8LSubSampleSize(xsize, histo_bits) : 1;
   const int histo_ysize = histo_bits ? VP8LSubSampleSize(ysize, histo_bits) : 1;
-  const int histo_image_raw_size = histo_xsize * histo_ysize;
-
-  // Heuristic params for HistogramCombine().
-  const int num_tries_no_success = 8 + (quality >> 1);
-  const int iter_mult = (quality < 27) ? 1 : 1 + ((quality - 27) >> 4);
-  const int num_pairs = (quality < 25) ? 10 : (5 * quality) >> 3;
-
-  VP8LHistogramSet* const image_out =
-      VP8LAllocateHistogramSet(histo_image_raw_size, cache_bits);
-  if (image_out == NULL) return 0;
-
-  // Build histogram image.
-  HistogramBuildImage(xsize, histo_bits, refs, image_out);
-  // Collapse similar histograms.
-  if (!HistogramCombine(image_out, image_in, iter_mult, num_pairs,
-                        num_tries_no_success)) {
+  const int image_histo_raw_size = histo_xsize * histo_ysize;
+
+  // The bin_map for every bin follows the following semantics:
+  // bin_map[n][0] = num_histo; // The number of histograms in that bin.
+  // bin_map[n][1] = index of first histogram in that bin;
+  // bin_map[n][num_histo] = index of last histogram in that bin;
+  // bin_map[n][num_histo + 1] ... bin_map[n][bin_depth - 1] = un-used indices.
+  const int bin_depth = image_histo_raw_size + 1;
+  int16_t* bin_map = NULL;
+  VP8LHistogramSet* const histos = VP8LAllocateHistogramSet(2, cache_bits);
+  VP8LHistogramSet* const orig_histo =
+      VP8LAllocateHistogramSet(image_histo_raw_size, cache_bits);
+
+  if (orig_histo == NULL || histos == NULL) {
     goto Error;
   }
+
+  // Don't attempt the linear bin-partition heuristic for:
+  // - histogram sets of small size, as bin_map would be very sparse, and
+  // - higher qualities (>= 90), to preserve the compression gains at those
+  //   quality settings.
+  if (orig_histo->size > 2 * BIN_SIZE && quality < 90) {
+    const int bin_map_size = bin_depth * BIN_SIZE;
+    bin_map = (int16_t*)WebPSafeCalloc(bin_map_size, sizeof(*bin_map));
+    if (bin_map == NULL) goto Error;
+  }
+
+  // Construct the histograms from backward references.
+  HistogramBuild(xsize, histo_bits, refs, orig_histo);
+  // Copies the histograms and computes their bit_cost.
+  HistogramCopyAndAnalyze(orig_histo, image_histo);
+
+  if (bin_map != NULL) {
+    const double combine_cost_factor =
+        GetCombineCostFactor(image_histo_raw_size, quality);
+    HistogramAnalyzeEntropyBin(orig_histo, bin_map);
+    // Collapse histograms with similar entropy.
+    HistogramCombineEntropyBin(image_histo, histos->histograms[0],
+                               bin_map, bin_depth, combine_cost_factor);
+  }
+
+  // Collapse similar histograms by random histogram-pair compares.
+  HistogramCombine(image_histo, histos, quality);
+
   // Find the optimal map from original histograms to the final ones.
-  HistogramRemap(image_out, image_in, histogram_symbols);
+  HistogramRemap(orig_histo, image_histo, histogram_symbols);
+
   ok = 1;
 
-Error:
-  free(image_out);
+ Error:
+  WebPSafeFree(bin_map);
+  VP8LFreeHistogramSet(orig_histo);
+  VP8LFreeHistogramSet(histos);
   return ok;
 }
diff --git a/src/3rdparty/libwebp/src/enc/histogram.h b/src/3rdparty/libwebp/src/enc/histogram.h
index 4d346a8..1cf4c54 100644
--- a/src/3rdparty/libwebp/src/enc/histogram.h
+++ b/src/3rdparty/libwebp/src/enc/histogram.h
@@ -32,18 +32,21 @@ extern "C" {
 typedef struct {
   // literal_ contains green literal, palette-code and
   // copy-length-prefix histogram
-  int literal_[PIX_OR_COPY_CODES_MAX];
-  int red_[256];
-  int blue_[256];
-  int alpha_[256];
+  uint32_t* literal_;         // Pointer to the allocated buffer for literal.
+  uint32_t red_[NUM_LITERAL_CODES];
+  uint32_t blue_[NUM_LITERAL_CODES];
+  uint32_t alpha_[NUM_LITERAL_CODES];
   // Backward reference prefix-code histogram.
-  int distance_[NUM_DISTANCE_CODES];
+  uint32_t distance_[NUM_DISTANCE_CODES];
   int palette_code_bits_;
-  double bit_cost_;   // cached value of VP8LHistogramEstimateBits(this)
+  double bit_cost_;      // cached value of VP8LHistogramEstimateBits(this)
+  double literal_cost_;  // Cached values of dominant entropy costs:
+  double red_cost_;      //   literal, red & blue.
+  double blue_cost_;
 } VP8LHistogram;
 
 // Collection of histograms with fixed capacity, allocated as one
-// big memory chunk. Can be destroyed by simply calling 'free()'.
+// big memory chunk. Can be destroyed by calling WebPSafeFree().
 typedef struct {
   int size;         // number of slots currently in use
   int max_size;     // maximum capacity
@@ -59,6 +62,9 @@ void VP8LHistogramCreate(VP8LHistogram* const p,
                          const VP8LBackwardRefs* const refs,
                          int palette_code_bits);
 
+// Return the size of the histogram for a given palette_code_bits.
+int VP8LGetHistogramSize(int palette_code_bits);
+
 // Set the palette_code_bits and reset the stats.
 void VP8LHistogramInit(VP8LHistogram* const p, int palette_code_bits);
 
@@ -66,10 +72,21 @@ void VP8LHistogramInit(VP8LHistogram* const p, int palette_code_bits);
 void VP8LHistogramStoreRefs(const VP8LBackwardRefs* const refs,
                             VP8LHistogram* const histo);
 
+// Free the memory allocated for the histogram.
+void VP8LFreeHistogram(VP8LHistogram* const histo);
+
+// Free the memory allocated for the histogram set.
+void VP8LFreeHistogramSet(VP8LHistogramSet* const histo);
+
 // Allocate an array of pointer to histograms, allocated and initialized
 // using 'cache_bits'. Return NULL in case of memory error.
 VP8LHistogramSet* VP8LAllocateHistogramSet(int size, int cache_bits);
 
+// Allocate and initialize histogram object with specified 'cache_bits'.
+// Returns NULL in case of memory error.
+// Special case of VP8LAllocateHistogramSet, with size equal to 1.
+VP8LHistogram* VP8LAllocateHistogram(int cache_bits);
+
 // Accumulate a token 'v' into a histogram.
 void VP8LHistogramAddSinglePixOrCopy(VP8LHistogram* const histo,
                                      const PixOrCopy* const v);
@@ -82,9 +99,9 @@ double VP8LHistogramEstimateBits(const VP8LHistogram* const p);
 // represent the entropy code itself.
 double VP8LHistogramEstimateBitsBulk(const VP8LHistogram* const p);
 
-static WEBP_INLINE int VP8LHistogramNumCodes(const VP8LHistogram* const p) {
-  return 256 + NUM_LENGTH_CODES +
-      ((p->palette_code_bits_ > 0) ? (1 << p->palette_code_bits_) : 0);
+static WEBP_INLINE int VP8LHistogramNumCodes(int palette_code_bits) {
+  return NUM_LITERAL_CODES + NUM_LENGTH_CODES +
+      ((palette_code_bits > 0) ? (1 << palette_code_bits) : 0);
 }
 
 // Builds the histogram image.
diff --git a/src/3rdparty/libwebp/src/enc/layer.c b/src/3rdparty/libwebp/src/enc/layer.c
deleted file mode 100644
index 2402362..0000000
--- a/src/3rdparty/libwebp/src/enc/layer.c
+++ /dev/null
@@ -1,44 +0,0 @@
-// Copyright 2011 Google Inc. All Rights Reserved.
-//
-// Use of this source code is governed by a BSD-style license
-// that can be found in the COPYING file in the root of the source
-// tree. An additional intellectual property rights grant can be found
-// in the file PATENTS. All contributing project authors may
-// be found in the AUTHORS file in the root of the source tree.
-// -----------------------------------------------------------------------------
-//
-// Enhancement layer (for YUV444/422)
-//
-// Author: Skal (pascal.massimino@gmail.com)
-
-#include <stdlib.h>
-
-#include "./vp8enci.h"
-
-//------------------------------------------------------------------------------
-
-void VP8EncInitLayer(VP8Encoder* const enc) {
-  enc->use_layer_ = (enc->pic_->u0 != NULL);
-  enc->layer_data_size_ = 0;
-  enc->layer_data_ = NULL;
-  if (enc->use_layer_) {
-    VP8BitWriterInit(&enc->layer_bw_, enc->mb_w_ * enc->mb_h_ * 3);
-  }
-}
-
-void VP8EncCodeLayerBlock(VP8EncIterator* it) {
-  (void)it;   // remove a warning
-}
-
-int VP8EncFinishLayer(VP8Encoder* const enc) {
-  if (enc->use_layer_) {
-    enc->layer_data_ = VP8BitWriterFinish(&enc->layer_bw_);
-    enc->layer_data_size_ = VP8BitWriterSize(&enc->layer_bw_);
-  }
-  return 1;
-}
-
-void VP8EncDeleteLayer(VP8Encoder* enc) {
-  free(enc->layer_data_);
-}
-
diff --git a/src/3rdparty/libwebp/src/enc/picture.c b/src/3rdparty/libwebp/src/enc/picture.c
index 011690d..9a66fbe 100644
--- a/src/3rdparty/libwebp/src/enc/picture.c
+++ b/src/3rdparty/libwebp/src/enc/picture.c
@@ -7,506 +7,170 @@
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
-// WebPPicture utils: colorspace conversion, crop, ...
+// WebPPicture class basis
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
 #include <assert.h>
 #include <stdlib.h>
-#include <math.h>
 
 #include "./vp8enci.h"
-#include "../utils/alpha_processing.h"
-#include "../utils/random.h"
-#include "../utils/rescaler.h"
 #include "../utils/utils.h"
-#include "../dsp/dsp.h"
-#include "../dsp/yuv.h"
-
-// Uncomment to disable gamma-compression during RGB->U/V averaging
-#define USE_GAMMA_COMPRESSION
-
-#define HALVE(x) (((x) + 1) >> 1)
-#define IS_YUV_CSP(csp, YUV_CSP) (((csp) & WEBP_CSP_UV_MASK) == (YUV_CSP))
-
-static const union {
-  uint32_t argb;
-  uint8_t  bytes[4];
-} test_endian = { 0xff000000u };
-#define ALPHA_IS_LAST (test_endian.bytes[3] == 0xff)
-
-static WEBP_INLINE uint32_t MakeARGB32(int r, int g, int b) {
-  return (0xff000000u | (r << 16) | (g << 8) | b);
-}
 
 //------------------------------------------------------------------------------
 // WebPPicture
 //------------------------------------------------------------------------------
 
-int WebPPictureAlloc(WebPPicture* picture) {
-  if (picture != NULL) {
-    const WebPEncCSP uv_csp = picture->colorspace & WEBP_CSP_UV_MASK;
-    const int has_alpha = picture->colorspace & WEBP_CSP_ALPHA_BIT;
-    const int width = picture->width;
-    const int height = picture->height;
-
-    if (!picture->use_argb) {
-      const int y_stride = width;
-      const int uv_width = HALVE(width);
-      const int uv_height = HALVE(height);
-      const int uv_stride = uv_width;
-      int uv0_stride = 0;
-      int a_width, a_stride;
-      uint64_t y_size, uv_size, uv0_size, a_size, total_size;
-      uint8_t* mem;
-
-      // U/V
-      switch (uv_csp) {
-        case WEBP_YUV420:
-          break;
-#ifdef WEBP_EXPERIMENTAL_FEATURES
-        case WEBP_YUV400:    // for now, we'll just reset the U/V samples
-          break;
-        case WEBP_YUV422:
-          uv0_stride = uv_width;
-          break;
-        case WEBP_YUV444:
-          uv0_stride = width;
-          break;
-#endif
-        default:
-          return 0;
-      }
-      uv0_size = height * uv0_stride;
-
-      // alpha
-      a_width = has_alpha ? width : 0;
-      a_stride = a_width;
-      y_size = (uint64_t)y_stride * height;
-      uv_size = (uint64_t)uv_stride * uv_height;
-      a_size =  (uint64_t)a_stride * height;
-
-      total_size = y_size + a_size + 2 * uv_size + 2 * uv0_size;
-
-      // Security and validation checks
-      if (width <= 0 || height <= 0 ||         // luma/alpha param error
-          uv_width < 0 || uv_height < 0) {     // u/v param error
-        return 0;
-      }
-      // Clear previous buffer and allocate a new one.
-      WebPPictureFree(picture);   // erase previous buffer
-      mem = (uint8_t*)WebPSafeMalloc(total_size, sizeof(*mem));
-      if (mem == NULL) return 0;
-
-      // From now on, we're in the clear, we can no longer fail...
-      picture->memory_ = (void*)mem;
-      picture->y_stride  = y_stride;
-      picture->uv_stride = uv_stride;
-      picture->a_stride  = a_stride;
-      picture->uv0_stride = uv0_stride;
-      // TODO(skal): we could align the y/u/v planes and adjust stride.
-      picture->y = mem;
-      mem += y_size;
-
-      picture->u = mem;
-      mem += uv_size;
-      picture->v = mem;
-      mem += uv_size;
-
-      if (a_size) {
-        picture->a = mem;
-        mem += a_size;
-      }
-      if (uv0_size) {
-        picture->u0 = mem;
-        mem += uv0_size;
-        picture->v0 = mem;
-        mem += uv0_size;
-      }
-      (void)mem;  // makes the static analyzer happy
-    } else {
-      void* memory;
-      const uint64_t argb_size = (uint64_t)width * height;
-      if (width <= 0 || height <= 0) {
-        return 0;
-      }
-      // Clear previous buffer and allocate a new one.
-      WebPPictureFree(picture);   // erase previous buffer
-      memory = WebPSafeMalloc(argb_size, sizeof(*picture->argb));
-      if (memory == NULL) return 0;
+static int DummyWriter(const uint8_t* data, size_t data_size,
+                       const WebPPicture* const picture) {
+  // The casts below silence 'unused parameter' compiler warnings.
+  (void)data;
+  (void)data_size;
+  (void)picture;
+  return 1;
+}
 
-      // TODO(skal): align plane to cache line?
-      picture->memory_argb_ = memory;
-      picture->argb = (uint32_t*)memory;
-      picture->argb_stride = width;
-    }
+int WebPPictureInitInternal(WebPPicture* picture, int version) {
+  if (WEBP_ABI_IS_INCOMPATIBLE(version, WEBP_ENCODER_ABI_VERSION)) {
+    return 0;   // caller/system version mismatch!
+  }
+  if (picture != NULL) {
+    memset(picture, 0, sizeof(*picture));
+    picture->writer = DummyWriter;
+    WebPEncodingSetError(picture, VP8_ENC_OK);
   }
   return 1;
 }
 
-// Remove reference to the ARGB buffer (doesn't free anything).
-static void PictureResetARGB(WebPPicture* const picture) {
+//------------------------------------------------------------------------------
+
+static void WebPPictureResetBufferARGB(WebPPicture* const picture) {
   picture->memory_argb_ = NULL;
   picture->argb = NULL;
   picture->argb_stride = 0;
 }
 
-// Remove reference to the YUVA buffer (doesn't free anything).
-static void PictureResetYUVA(WebPPicture* const picture) {
+static void WebPPictureResetBufferYUVA(WebPPicture* const picture) {
   picture->memory_ = NULL;
   picture->y = picture->u = picture->v = picture->a = NULL;
-  picture->u0 = picture->v0 = NULL;
   picture->y_stride = picture->uv_stride = 0;
   picture->a_stride = 0;
-  picture->uv0_stride = 0;
 }
 
-// Grab the 'specs' (writer, *opaque, width, height...) from 'src' and copy them
-// into 'dst'. Mark 'dst' as not owning any memory.
-static void WebPPictureGrabSpecs(const WebPPicture* const src,
-                                 WebPPicture* const dst) {
-  assert(src != NULL && dst != NULL);
-  *dst = *src;
-  PictureResetYUVA(dst);
-  PictureResetARGB(dst);
+void WebPPictureResetBuffers(WebPPicture* const picture) {
+  WebPPictureResetBufferARGB(picture);
+  WebPPictureResetBufferYUVA(picture);
 }
 
-// Allocate a new argb buffer, discarding any existing one and preserving
-// the other YUV(A) buffer.
-static int PictureAllocARGB(WebPPicture* const picture) {
-  WebPPicture tmp;
-  free(picture->memory_argb_);
-  PictureResetARGB(picture);
-  picture->use_argb = 1;
-  WebPPictureGrabSpecs(picture, &tmp);
-  if (!WebPPictureAlloc(&tmp)) {
-    return WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
-  }
-  picture->memory_argb_ = tmp.memory_argb_;
-  picture->argb = tmp.argb;
-  picture->argb_stride = tmp.argb_stride;
-  return 1;
-}
+int WebPPictureAllocARGB(WebPPicture* const picture, int width, int height) {
+  void* memory;
+  const uint64_t argb_size = (uint64_t)width * height;
 
-// Release memory owned by 'picture' (both YUV and ARGB buffers).
-void WebPPictureFree(WebPPicture* picture) {
-  if (picture != NULL) {
-    free(picture->memory_);
-    free(picture->memory_argb_);
-    PictureResetYUVA(picture);
-    PictureResetARGB(picture);
-  }
-}
+  assert(picture != NULL);
 
-//------------------------------------------------------------------------------
-// Picture copying
+  WebPSafeFree(picture->memory_argb_);
+  WebPPictureResetBufferARGB(picture);
 
-// Not worth moving to dsp/enc.c (only used here).
-static void CopyPlane(const uint8_t* src, int src_stride,
-                      uint8_t* dst, int dst_stride, int width, int height) {
-  while (height-- > 0) {
-    memcpy(dst, src, width);
-    src += src_stride;
-    dst += dst_stride;
+  if (width <= 0 || height <= 0) {
+    return WebPEncodingSetError(picture, VP8_ENC_ERROR_BAD_DIMENSION);
   }
-}
-
-// Adjust top-left corner to chroma sample position.
-static void SnapTopLeftPosition(const WebPPicture* const pic,
-                                int* const left, int* const top) {
-  if (!pic->use_argb) {
-    const int is_yuv422 = IS_YUV_CSP(pic->colorspace, WEBP_YUV422);
-    if (IS_YUV_CSP(pic->colorspace, WEBP_YUV420) || is_yuv422) {
-      *left &= ~1;
-      if (!is_yuv422) *top &= ~1;
-    }
+  // allocate a new buffer.
+  memory = WebPSafeMalloc(argb_size, sizeof(*picture->argb));
+  if (memory == NULL) {
+    return WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
   }
-}
-
-// Adjust top-left corner and verify that the sub-rectangle is valid.
-static int AdjustAndCheckRectangle(const WebPPicture* const pic,
-                                   int* const left, int* const top,
-                                   int width, int height) {
-  SnapTopLeftPosition(pic, left, top);
-  if ((*left) < 0 || (*top) < 0) return 0;
-  if (width <= 0 || height <= 0) return 0;
-  if ((*left) + width > pic->width) return 0;
-  if ((*top) + height > pic->height) return 0;
+  // TODO(skal): align plane to cache line?
+  picture->memory_argb_ = memory;
+  picture->argb = (uint32_t*)memory;
+  picture->argb_stride = width;
   return 1;
 }
 
-int WebPPictureCopy(const WebPPicture* src, WebPPicture* dst) {
-  if (src == NULL || dst == NULL) return 0;
-  if (src == dst) return 1;
+int WebPPictureAllocYUVA(WebPPicture* const picture, int width, int height) {
+  const WebPEncCSP uv_csp = picture->colorspace & WEBP_CSP_UV_MASK;
+  const int has_alpha = picture->colorspace & WEBP_CSP_ALPHA_BIT;
+  const int y_stride = width;
+  const int uv_width = (width + 1) >> 1;
+  const int uv_height = (height + 1) >> 1;
+  const int uv_stride = uv_width;
+  int a_width, a_stride;
+  uint64_t y_size, uv_size, a_size, total_size;
+  uint8_t* mem;
 
-  WebPPictureGrabSpecs(src, dst);
-  if (!WebPPictureAlloc(dst)) return 0;
+  assert(picture != NULL);
 
-  if (!src->use_argb) {
-    CopyPlane(src->y, src->y_stride,
-              dst->y, dst->y_stride, dst->width, dst->height);
-    CopyPlane(src->u, src->uv_stride,
-              dst->u, dst->uv_stride, HALVE(dst->width), HALVE(dst->height));
-    CopyPlane(src->v, src->uv_stride,
-              dst->v, dst->uv_stride, HALVE(dst->width), HALVE(dst->height));
-    if (dst->a != NULL)  {
-      CopyPlane(src->a, src->a_stride,
-                dst->a, dst->a_stride, dst->width, dst->height);
-    }
-#ifdef WEBP_EXPERIMENTAL_FEATURES
-    if (dst->u0 != NULL)  {
-      int uv0_width = src->width;
-      if (IS_YUV_CSP(dst->colorspace, WEBP_YUV422)) {
-        uv0_width = HALVE(uv0_width);
-      }
-      CopyPlane(src->u0, src->uv0_stride,
-                dst->u0, dst->uv0_stride, uv0_width, dst->height);
-      CopyPlane(src->v0, src->uv0_stride,
-                dst->v0, dst->uv0_stride, uv0_width, dst->height);
-    }
-#endif
-  } else {
-    CopyPlane((const uint8_t*)src->argb, 4 * src->argb_stride,
-              (uint8_t*)dst->argb, 4 * dst->argb_stride,
-              4 * dst->width, dst->height);
-  }
-  return 1;
-}
+  WebPSafeFree(picture->memory_);
+  WebPPictureResetBufferYUVA(picture);
 
-int WebPPictureIsView(const WebPPicture* picture) {
-  if (picture == NULL) return 0;
-  if (picture->use_argb) {
-    return (picture->memory_argb_ == NULL);
+  if (uv_csp != WEBP_YUV420) {
+    return WebPEncodingSetError(picture, VP8_ENC_ERROR_INVALID_CONFIGURATION);
   }
-  return (picture->memory_ == NULL);
-}
 
-int WebPPictureView(const WebPPicture* src,
-                    int left, int top, int width, int height,
-                    WebPPicture* dst) {
-  if (src == NULL || dst == NULL) return 0;
+  // alpha
+  a_width = has_alpha ? width : 0;
+  a_stride = a_width;
+  y_size = (uint64_t)y_stride * height;
+  uv_size = (uint64_t)uv_stride * uv_height;
+  a_size =  (uint64_t)a_stride * height;
 
-  // verify rectangle position.
-  if (!AdjustAndCheckRectangle(src, &left, &top, width, height)) return 0;
+  total_size = y_size + a_size + 2 * uv_size;
 
-  if (src != dst) {  // beware of aliasing! We don't want to leak 'memory_'.
-    WebPPictureGrabSpecs(src, dst);
+  // Security and validation checks
+  if (width <= 0 || height <= 0 ||         // luma/alpha param error
+      uv_width < 0 || uv_height < 0) {     // u/v param error
+    return WebPEncodingSetError(picture, VP8_ENC_ERROR_BAD_DIMENSION);
   }
-  dst->width = width;
-  dst->height = height;
-  if (!src->use_argb) {
-    dst->y = src->y + top * src->y_stride + left;
-    dst->u = src->u + (top >> 1) * src->uv_stride + (left >> 1);
-    dst->v = src->v + (top >> 1) * src->uv_stride + (left >> 1);
-    dst->y_stride = src->y_stride;
-    dst->uv_stride = src->uv_stride;
-    if (src->a != NULL) {
-      dst->a = src->a + top * src->a_stride + left;
-      dst->a_stride = src->a_stride;
-    }
-#ifdef WEBP_EXPERIMENTAL_FEATURES
-    if (src->u0 != NULL) {
-      const int left_pos =
-          IS_YUV_CSP(dst->colorspace, WEBP_YUV422) ? (left >> 1) : left;
-      dst->u0 = src->u0 + top * src->uv0_stride + left_pos;
-      dst->v0 = src->v0 + top * src->uv0_stride + left_pos;
-      dst->uv0_stride = src->uv0_stride;
-    }
-#endif
-  } else {
-    dst->argb = src->argb + top * src->argb_stride + left;
-    dst->argb_stride = src->argb_stride;
+  // allocate a new buffer.
+  mem = (uint8_t*)WebPSafeMalloc(total_size, sizeof(*mem));
+  if (mem == NULL) {
+    return WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
   }
-  return 1;
-}
-
-//------------------------------------------------------------------------------
-// Picture cropping
 
-int WebPPictureCrop(WebPPicture* pic,
-                    int left, int top, int width, int height) {
-  WebPPicture tmp;
+  // From now on, we're in the clear, we can no longer fail...
+  picture->memory_ = (void*)mem;
+  picture->y_stride  = y_stride;
+  picture->uv_stride = uv_stride;
+  picture->a_stride  = a_stride;
 
-  if (pic == NULL) return 0;
-  if (!AdjustAndCheckRectangle(pic, &left, &top, width, height)) return 0;
+  // TODO(skal): we could align the y/u/v planes and adjust stride.
+  picture->y = mem;
+  mem += y_size;
 
-  WebPPictureGrabSpecs(pic, &tmp);
-  tmp.width = width;
-  tmp.height = height;
-  if (!WebPPictureAlloc(&tmp)) return 0;
+  picture->u = mem;
+  mem += uv_size;
+  picture->v = mem;
+  mem += uv_size;
 
-  if (!pic->use_argb) {
-    const int y_offset = top * pic->y_stride + left;
-    const int uv_offset = (top / 2) * pic->uv_stride + left / 2;
-    CopyPlane(pic->y + y_offset, pic->y_stride,
-              tmp.y, tmp.y_stride, width, height);
-    CopyPlane(pic->u + uv_offset, pic->uv_stride,
-              tmp.u, tmp.uv_stride, HALVE(width), HALVE(height));
-    CopyPlane(pic->v + uv_offset, pic->uv_stride,
-              tmp.v, tmp.uv_stride, HALVE(width), HALVE(height));
-
-    if (tmp.a != NULL) {
-      const int a_offset = top * pic->a_stride + left;
-      CopyPlane(pic->a + a_offset, pic->a_stride,
-                tmp.a, tmp.a_stride, width, height);
-    }
-#ifdef WEBP_EXPERIMENTAL_FEATURES
-    if (tmp.u0 != NULL) {
-      int w = width;
-      int left_pos = left;
-      if (IS_YUV_CSP(tmp.colorspace, WEBP_YUV422)) {
-        w = HALVE(w);
-        left_pos = HALVE(left_pos);
-      }
-      CopyPlane(pic->u0 + top * pic->uv0_stride + left_pos, pic->uv0_stride,
-                tmp.u0, tmp.uv0_stride, w, height);
-      CopyPlane(pic->v0 + top * pic->uv0_stride + left_pos, pic->uv0_stride,
-                tmp.v0, tmp.uv0_stride, w, height);
-    }
-#endif
-  } else {
-    const uint8_t* const src =
-        (const uint8_t*)(pic->argb + top * pic->argb_stride + left);
-    CopyPlane(src, pic->argb_stride * 4,
-              (uint8_t*)tmp.argb, tmp.argb_stride * 4,
-              width * 4, height);
+  if (a_size > 0) {
+    picture->a = mem;
+    mem += a_size;
   }
-  WebPPictureFree(pic);
-  *pic = tmp;
+  (void)mem;  // makes the static analyzer happy
   return 1;
 }
 
-//------------------------------------------------------------------------------
-// Simple picture rescaler
-
-static void RescalePlane(const uint8_t* src,
-                         int src_width, int src_height, int src_stride,
-                         uint8_t* dst,
-                         int dst_width, int dst_height, int dst_stride,
-                         int32_t* const work,
-                         int num_channels) {
-  WebPRescaler rescaler;
-  int y = 0;
-  WebPRescalerInit(&rescaler, src_width, src_height,
-                   dst, dst_width, dst_height, dst_stride,
-                   num_channels,
-                   src_width, dst_width,
-                   src_height, dst_height,
-                   work);
-  memset(work, 0, 2 * dst_width * num_channels * sizeof(*work));
-  while (y < src_height) {
-    y += WebPRescalerImport(&rescaler, src_height - y,
-                            src + y * src_stride, src_stride);
-    WebPRescalerExport(&rescaler);
-  }
-}
+int WebPPictureAlloc(WebPPicture* picture) {
+  if (picture != NULL) {
+    const int width = picture->width;
+    const int height = picture->height;
 
-static void AlphaMultiplyARGB(WebPPicture* const pic, int inverse) {
-  uint32_t* ptr = pic->argb;
-  int y;
-  for (y = 0; y < pic->height; ++y) {
-    WebPMultARGBRow(ptr, pic->width, inverse);
-    ptr += pic->argb_stride;
-  }
-}
+    WebPPictureFree(picture);   // erase previous buffer
 
-static void AlphaMultiplyY(WebPPicture* const pic, int inverse) {
-  const uint8_t* ptr_a = pic->a;
-  if (ptr_a != NULL) {
-    uint8_t* ptr_y = pic->y;
-    int y;
-    for (y = 0; y < pic->height; ++y) {
-      WebPMultRow(ptr_y, ptr_a, pic->width, inverse);
-      ptr_y += pic->y_stride;
-      ptr_a += pic->a_stride;
+    if (!picture->use_argb) {
+      return WebPPictureAllocYUVA(picture, width, height);
+    } else {
+      return WebPPictureAllocARGB(picture, width, height);
     }
   }
+  return 1;
 }
 
-int WebPPictureRescale(WebPPicture* pic, int width, int height) {
-  WebPPicture tmp;
-  int prev_width, prev_height;
-  int32_t* work;
-
-  if (pic == NULL) return 0;
-  prev_width = pic->width;
-  prev_height = pic->height;
-  // if width is unspecified, scale original proportionally to height ratio.
-  if (width == 0) {
-    width = (prev_width * height + prev_height / 2) / prev_height;
-  }
-  // if height is unspecified, scale original proportionally to width ratio.
-  if (height == 0) {
-    height = (prev_height * width + prev_width / 2) / prev_width;
-  }
-  // Check if the overall dimensions still make sense.
-  if (width <= 0 || height <= 0) return 0;
-
-  WebPPictureGrabSpecs(pic, &tmp);
-  tmp.width = width;
-  tmp.height = height;
-  if (!WebPPictureAlloc(&tmp)) return 0;
-
-  if (!pic->use_argb) {
-    work = (int32_t*)WebPSafeMalloc(2ULL * width, sizeof(*work));
-    if (work == NULL) {
-      WebPPictureFree(&tmp);
-      return 0;
-    }
-    // If present, we need to rescale alpha first (for AlphaMultiplyY).
-    if (pic->a != NULL) {
-      RescalePlane(pic->a, prev_width, prev_height, pic->a_stride,
-                   tmp.a, width, height, tmp.a_stride, work, 1);
-    }
-
-    // We take transparency into account on the luma plane only. That's not
-    // totally exact blending, but still is a good approximation.
-    AlphaMultiplyY(pic, 0);
-    RescalePlane(pic->y, prev_width, prev_height, pic->y_stride,
-                 tmp.y, width, height, tmp.y_stride, work, 1);
-    AlphaMultiplyY(&tmp, 1);
-
-    RescalePlane(pic->u,
-                 HALVE(prev_width), HALVE(prev_height), pic->uv_stride,
-                 tmp.u,
-                 HALVE(width), HALVE(height), tmp.uv_stride, work, 1);
-    RescalePlane(pic->v,
-                 HALVE(prev_width), HALVE(prev_height), pic->uv_stride,
-                 tmp.v,
-                 HALVE(width), HALVE(height), tmp.uv_stride, work, 1);
-
-#ifdef WEBP_EXPERIMENTAL_FEATURES
-    if (tmp.u0 != NULL) {
-      const int s = IS_YUV_CSP(tmp.colorspace, WEBP_YUV422) ? 2 : 1;
-      RescalePlane(
-          pic->u0, (prev_width + s / 2) / s, prev_height, pic->uv0_stride,
-          tmp.u0, (width + s / 2) / s, height, tmp.uv0_stride, work, 1);
-      RescalePlane(
-          pic->v0, (prev_width + s / 2) / s, prev_height, pic->uv0_stride,
-          tmp.v0, (width + s / 2) / s, height, tmp.uv0_stride, work, 1);
-    }
-#endif
-  } else {
-    work = (int32_t*)WebPSafeMalloc(2ULL * width * 4, sizeof(*work));
-    if (work == NULL) {
-      WebPPictureFree(&tmp);
-      return 0;
-    }
-    // In order to correctly interpolate colors, we need to apply the alpha
-    // weighting first (black-matting), scale the RGB values, and remove
-    // the premultiplication afterward (while preserving the alpha channel).
-    AlphaMultiplyARGB(pic, 0);
-    RescalePlane((const uint8_t*)pic->argb, prev_width, prev_height,
-                 pic->argb_stride * 4,
-                 (uint8_t*)tmp.argb, width, height,
-                 tmp.argb_stride * 4,
-                 work, 4);
-    AlphaMultiplyARGB(&tmp, 1);
+void WebPPictureFree(WebPPicture* picture) {
+  if (picture != NULL) {
+    WebPSafeFree(picture->memory_);
+    WebPSafeFree(picture->memory_argb_);
+    WebPPictureResetBuffers(picture);
   }
-  WebPPictureFree(pic);
-  free(work);
-  *pic = tmp;
-  return 1;
 }
 
 //------------------------------------------------------------------------------
@@ -538,7 +202,7 @@ int WebPMemoryWrite(const uint8_t* data, size_t data_size,
     if (w->size > 0) {
       memcpy(new_mem, w->mem, w->size);
     }
-    free(w->mem);
+    WebPSafeFree(w->mem);
     w->mem = new_mem;
     // down-cast is ok, thanks to WebPSafeMalloc
     w->max_size = (size_t)next_max_size;
@@ -550,713 +214,15 @@ int WebPMemoryWrite(const uint8_t* data, size_t data_size,
   return 1;
 }
 
-//------------------------------------------------------------------------------
-// Detection of non-trivial transparency
-
-// Returns true if alpha[] has non-0xff values.
-static int CheckNonOpaque(const uint8_t* alpha, int width, int height,
-                          int x_step, int y_step) {
-  if (alpha == NULL) return 0;
-  while (height-- > 0) {
-    int x;
-    for (x = 0; x < width * x_step; x += x_step) {
-      if (alpha[x] != 0xff) return 1;  // TODO(skal): check 4/8 bytes at a time.
-    }
-    alpha += y_step;
-  }
-  return 0;
-}
-
-// Checking for the presence of non-opaque alpha.
-int WebPPictureHasTransparency(const WebPPicture* picture) {
-  if (picture == NULL) return 0;
-  if (!picture->use_argb) {
-    return CheckNonOpaque(picture->a, picture->width, picture->height,
-                          1, picture->a_stride);
-  } else {
-    int x, y;
-    const uint32_t* argb = picture->argb;
-    if (argb == NULL) return 0;
-    for (y = 0; y < picture->height; ++y) {
-      for (x = 0; x < picture->width; ++x) {
-        if (argb[x] < 0xff000000u) return 1;   // test any alpha values != 0xff
-      }
-      argb += picture->argb_stride;
-    }
-  }
-  return 0;
-}
-
-//------------------------------------------------------------------------------
-// RGB -> YUV conversion
-
-static int RGBToY(int r, int g, int b, VP8Random* const rg) {
-  return VP8RGBToY(r, g, b, VP8RandomBits(rg, YUV_FIX));
-}
-
-static int RGBToU(int r, int g, int b, VP8Random* const rg) {
-  return VP8RGBToU(r, g, b, VP8RandomBits(rg, YUV_FIX + 2));
-}
-
-static int RGBToV(int r, int g, int b, VP8Random* const rg) {
-  return VP8RGBToV(r, g, b, VP8RandomBits(rg, YUV_FIX + 2));
-}
-
-//------------------------------------------------------------------------------
-
-#if defined(USE_GAMMA_COMPRESSION)
-
-// gamma-compensates loss of resolution during chroma subsampling
-#define kGamma 0.80
-#define kGammaFix 12     // fixed-point precision for linear values
-#define kGammaScale ((1 << kGammaFix) - 1)
-#define kGammaTabFix 7   // fixed-point fractional bits precision
-#define kGammaTabScale (1 << kGammaTabFix)
-#define kGammaTabRounder (kGammaTabScale >> 1)
-#define kGammaTabSize (1 << (kGammaFix - kGammaTabFix))
-
-static int kLinearToGammaTab[kGammaTabSize + 1];
-static uint16_t kGammaToLinearTab[256];
-static int kGammaTablesOk = 0;
-
-static void InitGammaTables(void) {
-  if (!kGammaTablesOk) {
-    int v;
-    const double scale = 1. / kGammaScale;
-    for (v = 0; v <= 255; ++v) {
-      kGammaToLinearTab[v] =
-          (uint16_t)(pow(v / 255., kGamma) * kGammaScale + .5);
-    }
-    for (v = 0; v <= kGammaTabSize; ++v) {
-      const double x = scale * (v << kGammaTabFix);
-      kLinearToGammaTab[v] = (int)(pow(x, 1. / kGamma) * 255. + .5);
-    }
-    kGammaTablesOk = 1;
+void WebPMemoryWriterClear(WebPMemoryWriter* writer) {
+  if (writer != NULL) {
+    WebPSafeFree(writer->mem);
+    writer->mem = NULL;
+    writer->size = 0;
+    writer->max_size = 0;
   }
 }
 
-static WEBP_INLINE uint32_t GammaToLinear(uint8_t v) {
-  return kGammaToLinearTab[v];
-}
-
-// Convert a linear value 'v' to YUV_FIX+2 fixed-point precision
-// U/V value, suitable for RGBToU/V calls.
-static WEBP_INLINE int LinearToGamma(uint32_t base_value, int shift) {
-  const int v = base_value << shift;              // final uplifted value
-  const int tab_pos = v >> (kGammaTabFix + 2);    // integer part
-  const int x = v & ((kGammaTabScale << 2) - 1);  // fractional part
-  const int v0 = kLinearToGammaTab[tab_pos];
-  const int v1 = kLinearToGammaTab[tab_pos + 1];
-  const int y = v1 * x + v0 * ((kGammaTabScale << 2) - x);   // interpolate
-  return (y + kGammaTabRounder) >> kGammaTabFix;             // descale
-}
-
-#else
-
-static void InitGammaTables(void) {}
-static WEBP_INLINE uint32_t GammaToLinear(uint8_t v) { return v; }
-static WEBP_INLINE int LinearToGamma(uint32_t base_value, int shift) {
-  (void)shift;
-  return v;
-}
-
-#endif    // USE_GAMMA_COMPRESSION
-
-//------------------------------------------------------------------------------
-
-#define SUM4(ptr) LinearToGamma(                         \
-    GammaToLinear((ptr)[0]) +                            \
-    GammaToLinear((ptr)[step]) +                         \
-    GammaToLinear((ptr)[rgb_stride]) +                   \
-    GammaToLinear((ptr)[rgb_stride + step]), 0)          \
-
-#define SUM2H(ptr) \
-    LinearToGamma(GammaToLinear((ptr)[0]) + GammaToLinear((ptr)[step]), 1)
-#define SUM2V(ptr) \
-    LinearToGamma(GammaToLinear((ptr)[0]) + GammaToLinear((ptr)[rgb_stride]), 1)
-#define SUM1(ptr)  \
-    LinearToGamma(GammaToLinear((ptr)[0]), 2)
-
-#define RGB_TO_UV(x, y, SUM) {                           \
-  const int src = (2 * (step * (x) + (y) * rgb_stride)); \
-  const int dst = (x) + (y) * picture->uv_stride;        \
-  const int r = SUM(r_ptr + src);                        \
-  const int g = SUM(g_ptr + src);                        \
-  const int b = SUM(b_ptr + src);                        \
-  picture->u[dst] = RGBToU(r, g, b, &rg);                \
-  picture->v[dst] = RGBToV(r, g, b, &rg);                \
-}
-
-#define RGB_TO_UV0(x_in, x_out, y, SUM) {                \
-  const int src = (step * (x_in) + (y) * rgb_stride);    \
-  const int dst = (x_out) + (y) * picture->uv0_stride;   \
-  const int r = SUM(r_ptr + src);                        \
-  const int g = SUM(g_ptr + src);                        \
-  const int b = SUM(b_ptr + src);                        \
-  picture->u0[dst] = RGBToU(r, g, b, &rg);               \
-  picture->v0[dst] = RGBToV(r, g, b, &rg);               \
-}
-
-static void MakeGray(WebPPicture* const picture) {
-  int y;
-  const int uv_width = HALVE(picture->width);
-  const int uv_height = HALVE(picture->height);
-  for (y = 0; y < uv_height; ++y) {
-    memset(picture->u + y * picture->uv_stride, 128, uv_width);
-    memset(picture->v + y * picture->uv_stride, 128, uv_width);
-  }
-}
-
-static int ImportYUVAFromRGBA(const uint8_t* const r_ptr,
-                              const uint8_t* const g_ptr,
-                              const uint8_t* const b_ptr,
-                              const uint8_t* const a_ptr,
-                              int step,         // bytes per pixel
-                              int rgb_stride,   // bytes per scanline
-                              float dithering,
-                              WebPPicture* const picture) {
-  const WebPEncCSP uv_csp = picture->colorspace & WEBP_CSP_UV_MASK;
-  int x, y;
-  const int width = picture->width;
-  const int height = picture->height;
-  const int has_alpha = CheckNonOpaque(a_ptr, width, height, step, rgb_stride);
-  VP8Random rg;
-
-  picture->colorspace = uv_csp;
-  picture->use_argb = 0;
-  if (has_alpha) {
-    picture->colorspace |= WEBP_CSP_ALPHA_BIT;
-  }
-  if (!WebPPictureAlloc(picture)) return 0;
-
-  VP8InitRandom(&rg, dithering);
-  InitGammaTables();
-
-  // Import luma plane
-  for (y = 0; y < height; ++y) {
-    for (x = 0; x < width; ++x) {
-      const int offset = step * x + y * rgb_stride;
-      picture->y[x + y * picture->y_stride] =
-          RGBToY(r_ptr[offset], g_ptr[offset], b_ptr[offset], &rg);
-    }
-  }
-
-  // Downsample U/V plane
-  if (uv_csp != WEBP_YUV400) {
-    for (y = 0; y < (height >> 1); ++y) {
-      for (x = 0; x < (width >> 1); ++x) {
-        RGB_TO_UV(x, y, SUM4);
-      }
-      if (width & 1) {
-        RGB_TO_UV(x, y, SUM2V);
-      }
-    }
-    if (height & 1) {
-      for (x = 0; x < (width >> 1); ++x) {
-        RGB_TO_UV(x, y, SUM2H);
-      }
-      if (width & 1) {
-        RGB_TO_UV(x, y, SUM1);
-      }
-    }
-
-#ifdef WEBP_EXPERIMENTAL_FEATURES
-    // Store original U/V samples too
-    if (uv_csp == WEBP_YUV422) {
-      for (y = 0; y < height; ++y) {
-        for (x = 0; x < (width >> 1); ++x) {
-          RGB_TO_UV0(2 * x, x, y, SUM2H);
-        }
-        if (width & 1) {
-          RGB_TO_UV0(2 * x, x, y, SUM1);
-        }
-      }
-    } else if (uv_csp == WEBP_YUV444) {
-      for (y = 0; y < height; ++y) {
-        for (x = 0; x < width; ++x) {
-          RGB_TO_UV0(x, x, y, SUM1);
-        }
-      }
-    }
-#endif
-  } else {
-    MakeGray(picture);
-  }
-
-  if (has_alpha) {
-    assert(step >= 4);
-    assert(picture->a != NULL);
-    for (y = 0; y < height; ++y) {
-      for (x = 0; x < width; ++x) {
-        picture->a[x + y * picture->a_stride] =
-            a_ptr[step * x + y * rgb_stride];
-      }
-    }
-  }
-  return 1;
-}
-
-static int Import(WebPPicture* const picture,
-                  const uint8_t* const rgb, int rgb_stride,
-                  int step, int swap_rb, int import_alpha) {
-  const uint8_t* const r_ptr = rgb + (swap_rb ? 2 : 0);
-  const uint8_t* const g_ptr = rgb + 1;
-  const uint8_t* const b_ptr = rgb + (swap_rb ? 0 : 2);
-  const uint8_t* const a_ptr = import_alpha ? rgb + 3 : NULL;
-  const int width = picture->width;
-  const int height = picture->height;
-
-  if (!picture->use_argb) {
-    return ImportYUVAFromRGBA(r_ptr, g_ptr, b_ptr, a_ptr, step, rgb_stride,
-                              0.f /* no dithering */, picture);
-  }
-  if (import_alpha) {
-    picture->colorspace |= WEBP_CSP_ALPHA_BIT;
-  } else {
-    picture->colorspace &= ~WEBP_CSP_ALPHA_BIT;
-  }
-  if (!WebPPictureAlloc(picture)) return 0;
-
-  if (!import_alpha) {
-    int x, y;
-    for (y = 0; y < height; ++y) {
-      for (x = 0; x < width; ++x) {
-        const int offset = step * x + y * rgb_stride;
-        const uint32_t argb =
-            MakeARGB32(r_ptr[offset], g_ptr[offset], b_ptr[offset]);
-        picture->argb[x + y * picture->argb_stride] = argb;
-      }
-    }
-  } else {
-    int x, y;
-    assert(step >= 4);
-    for (y = 0; y < height; ++y) {
-      for (x = 0; x < width; ++x) {
-        const int offset = step * x + y * rgb_stride;
-        const uint32_t argb = ((uint32_t)a_ptr[offset] << 24) |
-                              (r_ptr[offset] << 16) |
-                              (g_ptr[offset] <<  8) |
-                              (b_ptr[offset]);
-        picture->argb[x + y * picture->argb_stride] = argb;
-      }
-    }
-  }
-  return 1;
-}
-#undef SUM4
-#undef SUM2V
-#undef SUM2H
-#undef SUM1
-#undef RGB_TO_UV
-
-int WebPPictureImportRGB(WebPPicture* picture,
-                         const uint8_t* rgb, int rgb_stride) {
-  return Import(picture, rgb, rgb_stride, 3, 0, 0);
-}
-
-int WebPPictureImportBGR(WebPPicture* picture,
-                         const uint8_t* rgb, int rgb_stride) {
-  return Import(picture, rgb, rgb_stride, 3, 1, 0);
-}
-
-int WebPPictureImportRGBA(WebPPicture* picture,
-                          const uint8_t* rgba, int rgba_stride) {
-  return Import(picture, rgba, rgba_stride, 4, 0, 1);
-}
-
-int WebPPictureImportBGRA(WebPPicture* picture,
-                          const uint8_t* rgba, int rgba_stride) {
-  return Import(picture, rgba, rgba_stride, 4, 1, 1);
-}
-
-int WebPPictureImportRGBX(WebPPicture* picture,
-                          const uint8_t* rgba, int rgba_stride) {
-  return Import(picture, rgba, rgba_stride, 4, 0, 0);
-}
-
-int WebPPictureImportBGRX(WebPPicture* picture,
-                          const uint8_t* rgba, int rgba_stride) {
-  return Import(picture, rgba, rgba_stride, 4, 1, 0);
-}
-
-//------------------------------------------------------------------------------
-// Automatic YUV <-> ARGB conversions.
-
-int WebPPictureYUVAToARGB(WebPPicture* picture) {
-  if (picture == NULL) return 0;
-  if (picture->y == NULL || picture->u == NULL || picture->v == NULL) {
-    return WebPEncodingSetError(picture, VP8_ENC_ERROR_NULL_PARAMETER);
-  }
-  if ((picture->colorspace & WEBP_CSP_ALPHA_BIT) && picture->a == NULL) {
-    return WebPEncodingSetError(picture, VP8_ENC_ERROR_NULL_PARAMETER);
-  }
-  if ((picture->colorspace & WEBP_CSP_UV_MASK) != WEBP_YUV420) {
-    return WebPEncodingSetError(picture, VP8_ENC_ERROR_INVALID_CONFIGURATION);
-  }
-  // Allocate a new argb buffer (discarding the previous one).
-  if (!PictureAllocARGB(picture)) return 0;
-
-  // Convert
-  {
-    int y;
-    const int width = picture->width;
-    const int height = picture->height;
-    const int argb_stride = 4 * picture->argb_stride;
-    uint8_t* dst = (uint8_t*)picture->argb;
-    const uint8_t *cur_u = picture->u, *cur_v = picture->v, *cur_y = picture->y;
-    WebPUpsampleLinePairFunc upsample = WebPGetLinePairConverter(ALPHA_IS_LAST);
-
-    // First row, with replicated top samples.
-    upsample(cur_y, NULL, cur_u, cur_v, cur_u, cur_v, dst, NULL, width);
-    cur_y += picture->y_stride;
-    dst += argb_stride;
-    // Center rows.
-    for (y = 1; y + 1 < height; y += 2) {
-      const uint8_t* const top_u = cur_u;
-      const uint8_t* const top_v = cur_v;
-      cur_u += picture->uv_stride;
-      cur_v += picture->uv_stride;
-      upsample(cur_y, cur_y + picture->y_stride, top_u, top_v, cur_u, cur_v,
-               dst, dst + argb_stride, width);
-      cur_y += 2 * picture->y_stride;
-      dst += 2 * argb_stride;
-    }
-    // Last row (if needed), with replicated bottom samples.
-    if (height > 1 && !(height & 1)) {
-      upsample(cur_y, NULL, cur_u, cur_v, cur_u, cur_v, dst, NULL, width);
-    }
-    // Insert alpha values if needed, in replacement for the default 0xff ones.
-    if (picture->colorspace & WEBP_CSP_ALPHA_BIT) {
-      for (y = 0; y < height; ++y) {
-        uint32_t* const argb_dst = picture->argb + y * picture->argb_stride;
-        const uint8_t* const src = picture->a + y * picture->a_stride;
-        int x;
-        for (x = 0; x < width; ++x) {
-          argb_dst[x] = (argb_dst[x] & 0x00ffffffu) | ((uint32_t)src[x] << 24);
-        }
-      }
-    }
-  }
-  return 1;
-}
-
-int WebPPictureARGBToYUVADithered(WebPPicture* picture, WebPEncCSP colorspace,
-                                  float dithering) {
-  if (picture == NULL) return 0;
-  if (picture->argb == NULL) {
-    return WebPEncodingSetError(picture, VP8_ENC_ERROR_NULL_PARAMETER);
-  } else {
-    const uint8_t* const argb = (const uint8_t*)picture->argb;
-    const uint8_t* const r = ALPHA_IS_LAST ? argb + 2 : argb + 1;
-    const uint8_t* const g = ALPHA_IS_LAST ? argb + 1 : argb + 2;
-    const uint8_t* const b = ALPHA_IS_LAST ? argb + 0 : argb + 3;
-    const uint8_t* const a = ALPHA_IS_LAST ? argb + 3 : argb + 0;
-    // We work on a tmp copy of 'picture', because ImportYUVAFromRGBA()
-    // would be calling WebPPictureFree(picture) otherwise.
-    WebPPicture tmp = *picture;
-    PictureResetARGB(&tmp);  // reset ARGB buffer so that it's not free()'d.
-    tmp.use_argb = 0;
-    tmp.colorspace = colorspace & WEBP_CSP_UV_MASK;
-    if (!ImportYUVAFromRGBA(r, g, b, a, 4, 4 * picture->argb_stride, dithering,
-                            &tmp)) {
-      return WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
-    }
-    // Copy back the YUV specs into 'picture'.
-    tmp.argb = picture->argb;
-    tmp.argb_stride = picture->argb_stride;
-    tmp.memory_argb_ = picture->memory_argb_;
-    *picture = tmp;
-  }
-  return 1;
-}
-
-int WebPPictureARGBToYUVA(WebPPicture* picture, WebPEncCSP colorspace) {
-  return WebPPictureARGBToYUVADithered(picture, colorspace, 0.f);
-}
-
-//------------------------------------------------------------------------------
-// Helper: clean up fully transparent area to help compressibility.
-
-#define SIZE 8
-#define SIZE2 (SIZE / 2)
-static int is_transparent_area(const uint8_t* ptr, int stride, int size) {
-  int y, x;
-  for (y = 0; y < size; ++y) {
-    for (x = 0; x < size; ++x) {
-      if (ptr[x]) {
-        return 0;
-      }
-    }
-    ptr += stride;
-  }
-  return 1;
-}
-
-static WEBP_INLINE void flatten(uint8_t* ptr, int v, int stride, int size) {
-  int y;
-  for (y = 0; y < size; ++y) {
-    memset(ptr, v, size);
-    ptr += stride;
-  }
-}
-
-void WebPCleanupTransparentArea(WebPPicture* pic) {
-  int x, y, w, h;
-  const uint8_t* a_ptr;
-  int values[3] = { 0 };
-
-  if (pic == NULL) return;
-
-  a_ptr = pic->a;
-  if (a_ptr == NULL) return;    // nothing to do
-
-  w = pic->width / SIZE;
-  h = pic->height / SIZE;
-  for (y = 0; y < h; ++y) {
-    int need_reset = 1;
-    for (x = 0; x < w; ++x) {
-      const int off_a = (y * pic->a_stride + x) * SIZE;
-      const int off_y = (y * pic->y_stride + x) * SIZE;
-      const int off_uv = (y * pic->uv_stride + x) * SIZE2;
-      if (is_transparent_area(a_ptr + off_a, pic->a_stride, SIZE)) {
-        if (need_reset) {
-          values[0] = pic->y[off_y];
-          values[1] = pic->u[off_uv];
-          values[2] = pic->v[off_uv];
-          need_reset = 0;
-        }
-        flatten(pic->y + off_y, values[0], pic->y_stride, SIZE);
-        flatten(pic->u + off_uv, values[1], pic->uv_stride, SIZE2);
-        flatten(pic->v + off_uv, values[2], pic->uv_stride, SIZE2);
-      } else {
-        need_reset = 1;
-      }
-    }
-    // ignore the left-overs on right/bottom
-  }
-}
-
-#undef SIZE
-#undef SIZE2
-
-//------------------------------------------------------------------------------
-// Blend color and remove transparency info
-
-#define BLEND(V0, V1, ALPHA) \
-    ((((V0) * (255 - (ALPHA)) + (V1) * (ALPHA)) * 0x101) >> 16)
-#define BLEND_10BIT(V0, V1, ALPHA) \
-    ((((V0) * (1020 - (ALPHA)) + (V1) * (ALPHA)) * 0x101) >> 18)
-
-void WebPBlendAlpha(WebPPicture* pic, uint32_t background_rgb) {
-  const int red = (background_rgb >> 16) & 0xff;
-  const int green = (background_rgb >> 8) & 0xff;
-  const int blue = (background_rgb >> 0) & 0xff;
-  VP8Random rg;
-  int x, y;
-  if (pic == NULL) return;
-  VP8InitRandom(&rg, 0.f);
-  if (!pic->use_argb) {
-    const int uv_width = (pic->width >> 1);  // omit last pixel during u/v loop
-    const int Y0 = RGBToY(red, green, blue, &rg);
-    // VP8RGBToU/V expects the u/v values summed over four pixels
-    const int U0 = RGBToU(4 * red, 4 * green, 4 * blue, &rg);
-    const int V0 = RGBToV(4 * red, 4 * green, 4 * blue, &rg);
-    const int has_alpha = pic->colorspace & WEBP_CSP_ALPHA_BIT;
-    if (!has_alpha || pic->a == NULL) return;    // nothing to do
-    for (y = 0; y < pic->height; ++y) {
-      // Luma blending
-      uint8_t* const y_ptr = pic->y + y * pic->y_stride;
-      uint8_t* const a_ptr = pic->a + y * pic->a_stride;
-      for (x = 0; x < pic->width; ++x) {
-        const int alpha = a_ptr[x];
-        if (alpha < 0xff) {
-          y_ptr[x] = BLEND(Y0, y_ptr[x], a_ptr[x]);
-        }
-      }
-      // Chroma blending every even line
-      if ((y & 1) == 0) {
-        uint8_t* const u = pic->u + (y >> 1) * pic->uv_stride;
-        uint8_t* const v = pic->v + (y >> 1) * pic->uv_stride;
-        uint8_t* const a_ptr2 =
-            (y + 1 == pic->height) ? a_ptr : a_ptr + pic->a_stride;
-        for (x = 0; x < uv_width; ++x) {
-          // Average four alpha values into a single blending weight.
-          // TODO(skal): might lead to visible contouring. Can we do better?
-          const int alpha =
-              a_ptr[2 * x + 0] + a_ptr[2 * x + 1] +
-              a_ptr2[2 * x + 0] + a_ptr2[2 * x + 1];
-          u[x] = BLEND_10BIT(U0, u[x], alpha);
-          v[x] = BLEND_10BIT(V0, v[x], alpha);
-        }
-        if (pic->width & 1) {   // rightmost pixel
-          const int alpha = 2 * (a_ptr[2 * x + 0] + a_ptr2[2 * x + 0]);
-          u[x] = BLEND_10BIT(U0, u[x], alpha);
-          v[x] = BLEND_10BIT(V0, v[x], alpha);
-        }
-      }
-      memset(a_ptr, 0xff, pic->width);
-    }
-  } else {
-    uint32_t* argb = pic->argb;
-    const uint32_t background = MakeARGB32(red, green, blue);
-    for (y = 0; y < pic->height; ++y) {
-      for (x = 0; x < pic->width; ++x) {
-        const int alpha = (argb[x] >> 24) & 0xff;
-        if (alpha != 0xff) {
-          if (alpha > 0) {
-            int r = (argb[x] >> 16) & 0xff;
-            int g = (argb[x] >>  8) & 0xff;
-            int b = (argb[x] >>  0) & 0xff;
-            r = BLEND(red, r, alpha);
-            g = BLEND(green, g, alpha);
-            b = BLEND(blue, b, alpha);
-            argb[x] = MakeARGB32(r, g, b);
-          } else {
-            argb[x] = background;
-          }
-        }
-      }
-      argb += pic->argb_stride;
-    }
-  }
-}
-
-#undef BLEND
-#undef BLEND_10BIT
-
-//------------------------------------------------------------------------------
-// local-min distortion
-//
-// For every pixel in the *reference* picture, we search for the local best
-// match in the compressed image. This is not a symmetrical measure.
-
-// search radius. Shouldn't be too large.
-#define RADIUS 2
-
-static float AccumulateLSIM(const uint8_t* src, int src_stride,
-                            const uint8_t* ref, int ref_stride,
-                            int w, int h) {
-  int x, y;
-  double total_sse = 0.;
-  for (y = 0; y < h; ++y) {
-    const int y_0 = (y - RADIUS < 0) ? 0 : y - RADIUS;
-    const int y_1 = (y + RADIUS + 1 >= h) ? h : y + RADIUS + 1;
-    for (x = 0; x < w; ++x) {
-      const int x_0 = (x - RADIUS < 0) ? 0 : x - RADIUS;
-      const int x_1 = (x + RADIUS + 1 >= w) ? w : x + RADIUS + 1;
-      double best_sse = 255. * 255.;
-      const double value = (double)ref[y * ref_stride + x];
-      int i, j;
-      for (j = y_0; j < y_1; ++j) {
-        const uint8_t* s = src + j * src_stride;
-        for (i = x_0; i < x_1; ++i) {
-          const double sse = (double)(s[i] - value) * (s[i] - value);
-          if (sse < best_sse) best_sse = sse;
-        }
-      }
-      total_sse += best_sse;
-    }
-  }
-  return (float)total_sse;
-}
-#undef RADIUS
-
-//------------------------------------------------------------------------------
-// Distortion
-
-// Max value returned in case of exact similarity.
-static const double kMinDistortion_dB = 99.;
-static float GetPSNR(const double v) {
-  return (float)((v > 0.) ? -4.3429448 * log(v / (255 * 255.))
-                          : kMinDistortion_dB);
-}
-
-int WebPPictureDistortion(const WebPPicture* src, const WebPPicture* ref,
-                          int type, float result[5]) {
-  DistoStats stats[5];
-  int has_alpha;
-  int uv_w, uv_h;
-
-  if (src == NULL || ref == NULL ||
-      src->width != ref->width || src->height != ref->height ||
-      src->y == NULL || ref->y == NULL ||
-      src->u == NULL || ref->u == NULL ||
-      src->v == NULL || ref->v == NULL ||
-      result == NULL) {
-    return 0;
-  }
-  // TODO(skal): provide distortion for ARGB too.
-  if (src->use_argb == 1 || src->use_argb != ref->use_argb) {
-    return 0;
-  }
-
-  has_alpha = !!(src->colorspace & WEBP_CSP_ALPHA_BIT);
-  if (has_alpha != !!(ref->colorspace & WEBP_CSP_ALPHA_BIT) ||
-      (has_alpha && (src->a == NULL || ref->a == NULL))) {
-    return 0;
-  }
-
-  memset(stats, 0, sizeof(stats));
-
-  uv_w = HALVE(src->width);
-  uv_h = HALVE(src->height);
-  if (type >= 2) {
-    float sse[4];
-    sse[0] = AccumulateLSIM(src->y, src->y_stride,
-                            ref->y, ref->y_stride, src->width, src->height);
-    sse[1] = AccumulateLSIM(src->u, src->uv_stride,
-                            ref->u, ref->uv_stride, uv_w, uv_h);
-    sse[2] = AccumulateLSIM(src->v, src->uv_stride,
-                            ref->v, ref->uv_stride, uv_w, uv_h);
-    sse[3] = has_alpha ? AccumulateLSIM(src->a, src->a_stride,
-                                        ref->a, ref->a_stride,
-                                        src->width, src->height)
-                       : 0.f;
-    result[0] = GetPSNR(sse[0] / (src->width * src->height));
-    result[1] = GetPSNR(sse[1] / (uv_w * uv_h));
-    result[2] = GetPSNR(sse[2] / (uv_w * uv_h));
-    result[3] = GetPSNR(sse[3] / (src->width * src->height));
-    {
-      double total_sse = sse[0] + sse[1] + sse[2];
-      int total_pixels = src->width * src->height + 2 * uv_w * uv_h;
-      if (has_alpha) {
-        total_pixels += src->width * src->height;
-        total_sse += sse[3];
-      }
-      result[4] = GetPSNR(total_sse / total_pixels);
-    }
-  } else {
-    int c;
-    VP8SSIMAccumulatePlane(src->y, src->y_stride,
-                           ref->y, ref->y_stride,
-                           src->width, src->height, &stats[0]);
-    VP8SSIMAccumulatePlane(src->u, src->uv_stride,
-                           ref->u, ref->uv_stride,
-                           uv_w, uv_h, &stats[1]);
-    VP8SSIMAccumulatePlane(src->v, src->uv_stride,
-                           ref->v, ref->uv_stride,
-                           uv_w, uv_h, &stats[2]);
-    if (has_alpha) {
-      VP8SSIMAccumulatePlane(src->a, src->a_stride,
-                             ref->a, ref->a_stride,
-                             src->width, src->height, &stats[3]);
-    }
-    for (c = 0; c <= 4; ++c) {
-      if (type == 1) {
-        const double v = VP8SSIMGet(&stats[c]);
-        result[c] = (float)((v < 1.) ? -10.0 * log10(1. - v)
-                                     : kMinDistortion_dB);
-      } else {
-        const double v = VP8SSIMGetSquaredError(&stats[c]);
-        result[c] = GetPSNR(v);
-      }
-      // Accumulate forward
-      if (c < 4) VP8SSIMAddStats(&stats[c], &stats[4]);
-    }
-  }
-  return 1;
-}
-
 //------------------------------------------------------------------------------
 // Simplest high-level calls:
 
@@ -1286,7 +252,7 @@ static size_t Encode(const uint8_t* rgba, int width, int height, int stride,
   ok = import(&pic, rgba, stride) && WebPEncode(&config, &pic);
   WebPPictureFree(&pic);
   if (!ok) {
-    free(wrt.mem);
+    WebPMemoryWriterClear(&wrt);
     *output = NULL;
     return 0;
   }
@@ -1321,4 +287,3 @@ LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessBGRA, WebPPictureImportBGRA)
 #undef LOSSLESS_ENCODE_FUNC
 
 //------------------------------------------------------------------------------
-
diff --git a/src/3rdparty/libwebp/src/enc/picture_csp.c b/src/3rdparty/libwebp/src/enc/picture_csp.c
new file mode 100644
index 0000000..7875f62
--- /dev/null
+++ b/src/3rdparty/libwebp/src/enc/picture_csp.c
@@ -0,0 +1,1114 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// WebPPicture utils for colorspace conversion
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <assert.h>
+#include <stdlib.h>
+#include <math.h>
+
+#include "./vp8enci.h"
+#include "../utils/random.h"
+#include "../utils/utils.h"
+#include "../dsp/yuv.h"
+
+// Comment out to disable gamma-compression during RGB->U/V averaging
+#define USE_GAMMA_COMPRESSION
+
+// If defined, use table to compute x / alpha.
+#define USE_INVERSE_ALPHA_TABLE
+
+static const union {
+  uint32_t argb;
+  uint8_t  bytes[4];
+} test_endian = { 0xff000000u };
+#define ALPHA_IS_LAST (test_endian.bytes[3] == 0xff)
+
+static WEBP_INLINE uint32_t MakeARGB32(int a, int r, int g, int b) {
+  return (((uint32_t)a << 24) | (r << 16) | (g << 8) | b);
+}
+
+//------------------------------------------------------------------------------
+// Detection of non-trivial transparency
+
+// Returns true if alpha[] has non-0xff values.
+static int CheckNonOpaque(const uint8_t* alpha, int width, int height,
+                          int x_step, int y_step) {
+  if (alpha == NULL) return 0;
+  while (height-- > 0) {
+    int x;
+    for (x = 0; x < width * x_step; x += x_step) {
+      if (alpha[x] != 0xff) return 1;  // TODO(skal): check 4/8 bytes at a time.
+    }
+    alpha += y_step;
+  }
+  return 0;
+}
+
+// Checking for the presence of non-opaque alpha.
+int WebPPictureHasTransparency(const WebPPicture* picture) {
+  if (picture == NULL) return 0;
+  if (!picture->use_argb) {
+    return CheckNonOpaque(picture->a, picture->width, picture->height,
+                          1, picture->a_stride);
+  } else {
+    int x, y;
+    const uint32_t* argb = picture->argb;
+    if (argb == NULL) return 0;
+    for (y = 0; y < picture->height; ++y) {
+      for (x = 0; x < picture->width; ++x) {
+        if (argb[x] < 0xff000000u) return 1;   // test any alpha values != 0xff
+      }
+      argb += picture->argb_stride;
+    }
+  }
+  return 0;
+}
+
+//------------------------------------------------------------------------------
+// Code for gamma correction
+
+#if defined(USE_GAMMA_COMPRESSION)
+
+// gamma-compensates loss of resolution during chroma subsampling
+#define kGamma 0.80      // for now we use a different gamma value than kGammaF
+#define kGammaFix 12     // fixed-point precision for linear values
+#define kGammaScale ((1 << kGammaFix) - 1)
+#define kGammaTabFix 7   // fixed-point fractional bits precision
+#define kGammaTabScale (1 << kGammaTabFix)
+#define kGammaTabRounder (kGammaTabScale >> 1)
+#define kGammaTabSize (1 << (kGammaFix - kGammaTabFix))
+
+static int kLinearToGammaTab[kGammaTabSize + 1];
+static uint16_t kGammaToLinearTab[256];
+static int kGammaTablesOk = 0;
+
+static void InitGammaTables(void) {
+  if (!kGammaTablesOk) {
+    int v;
+    const double scale = (double)(1 << kGammaTabFix) / kGammaScale;
+    const double norm = 1. / 255.;
+    for (v = 0; v <= 255; ++v) {
+      kGammaToLinearTab[v] =
+          (uint16_t)(pow(norm * v, kGamma) * kGammaScale + .5);
+    }
+    for (v = 0; v <= kGammaTabSize; ++v) {
+      kLinearToGammaTab[v] = (int)(255. * pow(scale * v, 1. / kGamma) + .5);
+    }
+    kGammaTablesOk = 1;
+  }
+}
+
+static WEBP_INLINE uint32_t GammaToLinear(uint8_t v) {
+  return kGammaToLinearTab[v];
+}
+
+static WEBP_INLINE int Interpolate(int v) {
+  const int tab_pos = v >> (kGammaTabFix + 2);    // integer part
+  const int x = v & ((kGammaTabScale << 2) - 1);  // fractional part
+  const int v0 = kLinearToGammaTab[tab_pos];
+  const int v1 = kLinearToGammaTab[tab_pos + 1];
+  const int y = v1 * x + v0 * ((kGammaTabScale << 2) - x);   // interpolate
+  assert(tab_pos + 1 < kGammaTabSize + 1);
+  return y;
+}
+
+// Convert the linear value 'base_value << shift' to YUV_FIX+2 fixed-point
+// precision U/V value, suitable for RGBToU/V calls.
+static WEBP_INLINE int LinearToGamma(uint32_t base_value, int shift) {
+  const int y = Interpolate(base_value << shift);   // final uplifted value
+  return (y + kGammaTabRounder) >> kGammaTabFix;    // descale
+}
+
+#else
+
+static void InitGammaTables(void) {}
+static WEBP_INLINE uint32_t GammaToLinear(uint8_t v) { return v; }
+static WEBP_INLINE int LinearToGamma(uint32_t base_value, int shift) {
+  return (int)(base_value << shift);
+}
+
+#endif    // USE_GAMMA_COMPRESSION
+
+//------------------------------------------------------------------------------
+// RGB -> YUV conversion
+
+static int RGBToY(int r, int g, int b, VP8Random* const rg) {
+  return (rg == NULL) ? VP8RGBToY(r, g, b, YUV_HALF)
+                      : VP8RGBToY(r, g, b, VP8RandomBits(rg, YUV_FIX));
+}
+
+static int RGBToU(int r, int g, int b, VP8Random* const rg) {
+  return (rg == NULL) ? VP8RGBToU(r, g, b, YUV_HALF << 2)
+                      : VP8RGBToU(r, g, b, VP8RandomBits(rg, YUV_FIX + 2));
+}
+
+static int RGBToV(int r, int g, int b, VP8Random* const rg) {
+  return (rg == NULL) ? VP8RGBToV(r, g, b, YUV_HALF << 2)
+                      : VP8RGBToV(r, g, b, VP8RandomBits(rg, YUV_FIX + 2));
+}
+
+//------------------------------------------------------------------------------
+// Smart RGB->YUV conversion
+
+static const int kNumIterations = 6;
+static const int kMinDimensionIterativeConversion = 4;
+
+// We use a-priori a different precision for storing RGB and Y/W components
+// We could use YFIX=0 and only uint8_t for fixed_y_t, but it produces some
+// banding sometimes. Better use extra precision.
+// TODO(skal): cleanup once TFIX/YFIX values are fixed.
+
+typedef int16_t fixed_t;      // signed type with extra TFIX precision for UV
+typedef uint16_t fixed_y_t;   // unsigned type with extra YFIX precision for W
+#define TFIX 6   // fixed-point precision of RGB
+#define YFIX 2   // fixed point precision for Y/W
+
+#define THALF ((1 << TFIX) >> 1)
+#define MAX_Y_T ((256 << YFIX) - 1)
+#define TROUNDER (1 << (YUV_FIX + TFIX - 1))
+
+#if defined(USE_GAMMA_COMPRESSION)
+
+// float variant of gamma-correction
+// We use tables of different size and precision, along with a 'real-world'
+// Gamma value close to ~2.
+#define kGammaF 2.2
+static float kGammaToLinearTabF[MAX_Y_T + 1];   // size scales with YFIX
+static float kLinearToGammaTabF[kGammaTabSize + 2];
+static int kGammaTablesFOk = 0;
+
+static void InitGammaTablesF(void) {
+  if (!kGammaTablesFOk) {
+    int v;
+    const double norm = 1. / MAX_Y_T;
+    const double scale = 1. / kGammaTabSize;
+    for (v = 0; v <= MAX_Y_T; ++v) {
+      kGammaToLinearTabF[v] = (float)pow(norm * v, kGammaF);
+    }
+    for (v = 0; v <= kGammaTabSize; ++v) {
+      kLinearToGammaTabF[v] = (float)(MAX_Y_T * pow(scale * v, 1. / kGammaF));
+    }
+    // to prevent small rounding errors from causing a read-overflow:
+    kLinearToGammaTabF[kGammaTabSize + 1] = kLinearToGammaTabF[kGammaTabSize];
+    kGammaTablesFOk = 1;
+  }
+}
+
+static WEBP_INLINE float GammaToLinearF(int v) {
+  return kGammaToLinearTabF[v];
+}
+
+static WEBP_INLINE float LinearToGammaF(float value) {
+  const float v = value * kGammaTabSize;
+  const int tab_pos = (int)v;
+  const float x = v - (float)tab_pos;      // fractional part
+  const float v0 = kLinearToGammaTabF[tab_pos + 0];
+  const float v1 = kLinearToGammaTabF[tab_pos + 1];
+  const float y = v1 * x + v0 * (1.f - x);  // interpolate
+  return y;
+}
+
+#else
+
+static void InitGammaTablesF(void) {}
+static WEBP_INLINE float GammaToLinearF(int v) {
+  const float norm = 1.f / MAX_Y_T;
+  return norm * v;
+}
+static WEBP_INLINE float LinearToGammaF(float value) {
+  return MAX_Y_T * value;
+}
+
+#endif    // USE_GAMMA_COMPRESSION
+
+//------------------------------------------------------------------------------
+
+// precision: YFIX -> TFIX
+static WEBP_INLINE int FixedYToW(int v) {
+#if TFIX == YFIX
+  return v;
+#elif TFIX >= YFIX
+  return v << (TFIX - YFIX);
+#else
+  return v >> (YFIX - TFIX);
+#endif
+}
+
+static WEBP_INLINE int FixedWToY(int v) {
+#if TFIX == YFIX
+  return v;
+#elif YFIX >= TFIX
+  return v << (YFIX - TFIX);
+#else
+  return v >> (TFIX - YFIX);
+#endif
+}
+
+static uint8_t clip_8b(fixed_t v) {
+  return (!(v & ~0xff)) ? (uint8_t)v : (v < 0) ? 0u : 255u;
+}
+
+static fixed_y_t clip_y(int y) {
+  return (!(y & ~MAX_Y_T)) ? (fixed_y_t)y : (y < 0) ? 0 : MAX_Y_T;
+}
+
+// precision: TFIX -> YFIX
+static fixed_y_t clip_fixed_t(fixed_t v) {
+  const int y = FixedWToY(v);
+  const fixed_y_t w = clip_y(y);
+  return w;
+}
+
+//------------------------------------------------------------------------------
+
+static int RGBToGray(int r, int g, int b) {
+  const int luma = 19595 * r + 38470 * g + 7471 * b + YUV_HALF;
+  return (luma >> YUV_FIX);
+}
+
+static float RGBToGrayF(float r, float g, float b) {
+  return 0.299f * r + 0.587f * g + 0.114f * b;
+}
+
+static float ScaleDown(int a, int b, int c, int d) {
+  const float A = GammaToLinearF(a);
+  const float B = GammaToLinearF(b);
+  const float C = GammaToLinearF(c);
+  const float D = GammaToLinearF(d);
+  return LinearToGammaF(0.25f * (A + B + C + D));
+}
+
+static WEBP_INLINE void UpdateW(const fixed_y_t* src, fixed_y_t* dst, int len) {
+  while (len-- > 0) {
+    const float R = GammaToLinearF(src[0]);
+    const float G = GammaToLinearF(src[1]);
+    const float B = GammaToLinearF(src[2]);
+    const float Y = RGBToGrayF(R, G, B);
+    *dst++ = (fixed_y_t)(LinearToGammaF(Y) + .5);
+    src += 3;
+  }
+}
+
+static WEBP_INLINE void UpdateChroma(const fixed_y_t* src1,
+                                     const fixed_y_t* src2,
+                                     fixed_t* dst, fixed_y_t* tmp, int len) {
+  while (len--> 0) {
+    const float r = ScaleDown(src1[0], src1[3], src2[0], src2[3]);
+    const float g = ScaleDown(src1[1], src1[4], src2[1], src2[4]);
+    const float b = ScaleDown(src1[2], src1[5], src2[2], src2[5]);
+    const float W = RGBToGrayF(r, g, b);
+    dst[0] = (fixed_t)FixedYToW((int)(r - W));
+    dst[1] = (fixed_t)FixedYToW((int)(g - W));
+    dst[2] = (fixed_t)FixedYToW((int)(b - W));
+    dst += 3;
+    src1 += 6;
+    src2 += 6;
+    if (tmp != NULL) {
+      tmp[0] = tmp[1] = clip_y((int)(W + .5));
+      tmp += 2;
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
+
+static WEBP_INLINE int Filter(const fixed_t* const A, const fixed_t* const B,
+                              int rightwise) {
+  int v;
+  if (!rightwise) {
+    v = (A[0] * 9 + A[-3] * 3 + B[0] * 3 + B[-3]);
+  } else {
+    v = (A[0] * 9 + A[+3] * 3 + B[0] * 3 + B[+3]);
+  }
+  return (v + 8) >> 4;
+}
+
+static WEBP_INLINE int Filter2(int A, int B) { return (A * 3 + B + 2) >> 2; }
+
+//------------------------------------------------------------------------------
+
+// 8bit -> YFIX
+static WEBP_INLINE fixed_y_t UpLift(uint8_t a) {
+  return ((fixed_y_t)a << YFIX) | (1 << (YFIX - 1));
+}
+
+static void ImportOneRow(const uint8_t* const r_ptr,
+                         const uint8_t* const g_ptr,
+                         const uint8_t* const b_ptr,
+                         int step,
+                         int pic_width,
+                         fixed_y_t* const dst) {
+  int i;
+  for (i = 0; i < pic_width; ++i) {
+    const int off = i * step;
+    dst[3 * i + 0] = UpLift(r_ptr[off]);
+    dst[3 * i + 1] = UpLift(g_ptr[off]);
+    dst[3 * i + 2] = UpLift(b_ptr[off]);
+  }
+  if (pic_width & 1) {  // replicate rightmost pixel
+    memcpy(dst + 3 * pic_width, dst + 3 * (pic_width - 1), 3 * sizeof(*dst));
+  }
+}
+
+static void InterpolateTwoRows(const fixed_y_t* const best_y,  // Rebuilds two rows of per-pixel triplets (out1, out2) from two rows of best_y and three rows of half-width triplets.
+                               const fixed_t* const prev_uv,  // triplet row blended with cur_uv to produce out1
+                               const fixed_t* const cur_uv,  // triplet row contributing to both output rows
+                               const fixed_t* const next_uv,  // triplet row blended with cur_uv to produce out2
+                               int w,  // pixels per output row; the second best_y row starts at best_y + w
+                               fixed_y_t* const out1,  // out: first row, 3 values per pixel
+                               fixed_y_t* const out2) {  // out: second row, 3 values per pixel
+  int i, k;
+  {  // special boundary case for i==0
+    const int W0 = FixedYToW(best_y[0]);
+    const int W1 = FixedYToW(best_y[w]);
+    for (k = 0; k <= 2; ++k) {  // k walks the three components of a triplet
+      out1[k] = clip_fixed_t(Filter2(cur_uv[k], prev_uv[k]) + W0);  // no left neighbour here: vertical 2-tap blend only
+      out2[k] = clip_fixed_t(Filter2(cur_uv[k], next_uv[k]) + W1);
+    }
+  }
+  for (i = 1; i < w - 1; ++i) {
+    const int W0 = FixedYToW(best_y[i + 0]);
+    const int W1 = FixedYToW(best_y[i + w]);
+    const int off = 3 * (i >> 1);  // two horizontally adjacent pixels share one half-width triplet
+    for (k = 0; k <= 2; ++k) {
+      const int tmp0 = Filter(cur_uv + off + k, prev_uv + off + k, i & 1);  // (i & 1) is the 'rightwise' flag: odd pixels use the triplet at +3, even pixels the one at -3
+      const int tmp1 = Filter(cur_uv + off + k, next_uv + off + k, i & 1);
+      out1[3 * i + k] = clip_fixed_t(tmp0 + W0);
+      out2[3 * i + k] = clip_fixed_t(tmp1 + W1);
+    }
+  }
+  {  // special boundary case for i == w - 1
+    const int W0 = FixedYToW(best_y[i + 0]);  // 'i' still holds the value left by the loop above
+    const int W1 = FixedYToW(best_y[i + w]);
+    const int off = 3 * (i >> 1);
+    for (k = 0; k <= 2; ++k) {
+      out1[3 * i + k] =
+          clip_fixed_t(Filter2(cur_uv[off + k], prev_uv[off + k]) + W0);  // vertical 2-tap blend only, as for i == 0
+      out2[3 * i + k] =
+          clip_fixed_t(Filter2(cur_uv[off + k], next_uv[off + k]) + W1);
+    }
+  }
+}
+
+static WEBP_INLINE uint8_t ConvertRGBToY(int r, int g, int b) {  // Fixed-point weighted sum of r, g, b mapped to an 8-bit Y sample.
+  const int luma = 16839 * r + 33059 * g + 6420 * b + TROUNDER;  // TROUNDER is presumably the rounding term for the shift below - confirm at its definition
+  return clip_8b(16 + (luma >> (YUV_FIX + TFIX)));  // +16 offset applied after the down-shift, then clipped by clip_8b()
+}
+
+static WEBP_INLINE uint8_t ConvertRGBToU(int r, int g, int b) {  // Fixed-point weighted sum of r, g, b mapped to an 8-bit U sample.
+  const int u =  -9719 * r - 19081 * g + 28800 * b + TROUNDER;  // the three weights sum to zero, so r == g == b leaves only TROUNDER
+  return clip_8b(128 + (u >> (YUV_FIX + TFIX)));  // +128 offset applied after the down-shift, then clipped by clip_8b()
+}
+
+static WEBP_INLINE uint8_t ConvertRGBToV(int r, int g, int b) {  // Fixed-point weighted sum of r, g, b mapped to an 8-bit V sample.
+  const int v = +28800 * r - 24116 * g -  4684 * b + TROUNDER;  // the three weights sum to zero, so r == g == b leaves only TROUNDER
+  return clip_8b(128 + (v >> (YUV_FIX + TFIX)));  // +128 offset applied after the down-shift, then clipped by clip_8b()
+}
+
+static int ConvertWRGBToYUV(const fixed_y_t* const best_y,  // Writes picture->y/u/v from the per-pixel best_y plane and the half-resolution best_uv triplets. Always returns 1.
+                            const fixed_t* const best_uv,  // 3 values per position, uv_w positions per row
+                            WebPPicture* const picture) {  // in: width, height, strides; out: y, u and v planes
+  int i, j;
+  const int w = (picture->width + 1) & ~1;  // width rounded up to even: row pitch of best_y
+  const int h = (picture->height + 1) & ~1;  // height rounded up to even
+  const int uv_w = w >> 1;
+  const int uv_h = h >> 1;
+  for (j = 0; j < picture->height; ++j) {  // Y plane: only the real (unpadded) picture area is written
+    for (i = 0; i < picture->width; ++i) {
+      const int off = 3 * ((i >> 1) + (j >> 1) * uv_w);  // triplet shared by the 2x2 block containing (i, j)
+      const int off2 = i + j * picture->y_stride;
+      const int W = FixedYToW(best_y[i + j * w]);
+      const int r = best_uv[off + 0] + W;  // the stored triplet is added to the pixel's W before conversion
+      const int g = best_uv[off + 1] + W;
+      const int b = best_uv[off + 2] + W;
+      picture->y[off2] = ConvertRGBToY(r, g, b);
+    }
+  }
+  for (j = 0; j < uv_h; ++j) {  // U/V planes: one sample per triplet
+    uint8_t* const dst_u = picture->u + j * picture->uv_stride;
+    uint8_t* const dst_v = picture->v + j * picture->uv_stride;
+    for (i = 0; i < uv_w; ++i) {
+      const int off = 3 * (i + j * uv_w);
+      const int r = best_uv[off + 0];  // unlike the Y loop above, no W term is added here
+      const int g = best_uv[off + 1];
+      const int b = best_uv[off + 2];
+      dst_u[i] = ConvertRGBToU(r, g, b);
+      dst_v[i] = ConvertRGBToV(r, g, b);
+    }
+  }
+  return 1;
+}
+
+//------------------------------------------------------------------------------
+// Main function
+
+#define SAFE_ALLOC(W, H, T) ((T*)WebPSafeMalloc((W) * (H), sizeof(T)))  // NOTE(review): (W) * (H) is evaluated with the operands' own type (int at every call site below) - confirm the picture dimension limits keep it in range
+
+static int PreprocessARGB(const uint8_t* const r_ptr,  // Iterative RGB -> YUV420 conversion: imports the samples, refines best_y / best_uv for kNumIterations passes, then writes picture->y/u/v.
+                          const uint8_t* const g_ptr,  // r_ptr / g_ptr / b_ptr: first R, G and B sample of the source image
+                          const uint8_t* const b_ptr,
+                          int step, int rgb_stride,  // step: bytes between pixels; rgb_stride: offset added per source row
+                          WebPPicture* const picture) {  // in: width / height; out: y, u, v planes (and the error code on allocation failure)
+  // we expand the right/bottom border if needed
+  const int w = (picture->width + 1) & ~1;
+  const int h = (picture->height + 1) & ~1;
+  const int uv_w = w >> 1;
+  const int uv_h = h >> 1;
+  int i, j, iter;
+
+  // TODO(skal): allocate one big memory chunk. But for now, it's easier
+  // for valgrind debugging to have several chunks.
+  fixed_y_t* const tmp_buffer = SAFE_ALLOC(w * 3, 2, fixed_y_t);   // scratch: two rows of w triplets
+  fixed_y_t* const best_y = SAFE_ALLOC(w, h, fixed_y_t);  // current per-pixel estimate, refined by the iterations
+  fixed_y_t* const target_y = SAFE_ALLOC(w, h, fixed_y_t);  // per-pixel values computed from the source by UpdateW()
+  fixed_y_t* const best_rgb_y = SAFE_ALLOC(w, 2, fixed_y_t);  // UpdateW() output for the two reconstructed rows
+  fixed_t* const best_uv = SAFE_ALLOC(uv_w * 3, uv_h, fixed_t);  // current half-resolution triplets, refined by the iterations
+  fixed_t* const target_uv = SAFE_ALLOC(uv_w * 3, uv_h, fixed_t);  // half-resolution triplets computed from the source by UpdateChroma()
+  fixed_t* const best_rgb_uv = SAFE_ALLOC(uv_w * 3, 1, fixed_t);  // UpdateChroma() output for the reconstructed row pair
+  int ok;
+
+  if (best_y == NULL || best_uv == NULL ||
+      target_y == NULL || target_uv == NULL ||
+      best_rgb_y == NULL || best_rgb_uv == NULL ||
+      tmp_buffer == NULL) {
+    ok = WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);  // presumably returns 0 - confirm at its definition
+    goto End;  // the cleanup below frees whichever buffers were successfully allocated
+  }
+  assert(picture->width >= kMinDimensionIterativeConversion);
+  assert(picture->height >= kMinDimensionIterativeConversion);
+
+  // Import RGB samples to W/RGB representation.
+  for (j = 0; j < picture->height; j += 2) {  // two source rows per pass
+    const int is_last_row = (j == picture->height - 1);  // true only for the unpaired final row of an odd-height picture
+    fixed_y_t* const src1 = tmp_buffer;
+    fixed_y_t* const src2 = tmp_buffer + 3 * w;
+    const int off1 = j * rgb_stride;
+    const int off2 = off1 + rgb_stride;
+    const int uv_off = (j >> 1) * 3 * uv_w;
+    fixed_y_t* const dst_y = best_y + j * w;
+
+    // prepare two rows of input
+    ImportOneRow(r_ptr + off1, g_ptr + off1, b_ptr + off1,
+                 step, picture->width, src1);
+    if (!is_last_row) {
+      ImportOneRow(r_ptr + off2, g_ptr + off2, b_ptr + off2,
+                   step, picture->width, src2);
+    } else {
+      memcpy(src2, src1, 3 * w * sizeof(*src2));  // no second source row: duplicate the first one
+    }
+    UpdateW(src1, target_y + (j + 0) * w, w);
+    UpdateW(src2, target_y + (j + 1) * w, w);
+    UpdateChroma(src1, src2, target_uv + uv_off, dst_y, uv_w);  // presumably also fills one row of initial values at dst_y - confirm in UpdateChroma()
+    memcpy(best_uv + uv_off, target_uv + uv_off, 3 * uv_w * sizeof(*best_uv));  // initial best_uv = target_uv
+    memcpy(dst_y + w, dst_y, w * sizeof(*dst_y));  // second best_y row of the pair starts as a copy of the first
+  }
+
+  // Iterate and resolve clipping conflicts.
+  for (iter = 0; iter < kNumIterations; ++iter) {
+    int k;
+    const fixed_t* cur_uv = best_uv;
+    const fixed_t* prev_uv = best_uv;  // for the top row pair, the 'previous' row is the current row itself
+    for (j = 0; j < h; j += 2) {
+      fixed_y_t* const src1 = tmp_buffer;
+      fixed_y_t* const src2 = tmp_buffer + 3 * w;
+
+      {
+        const fixed_t* const next_uv = cur_uv + ((j < h - 2) ? 3 * uv_w : 0);  // for the bottom row pair, the 'next' row is the current row itself
+        InterpolateTwoRows(best_y + j * w, prev_uv, cur_uv, next_uv,
+                           w, src1, src2);
+        prev_uv = cur_uv;
+        cur_uv = next_uv;
+      }
+
+      UpdateW(src1, best_rgb_y + 0 * w, w);
+      UpdateW(src2, best_rgb_y + 1 * w, w);
+      UpdateChroma(src1, src2, best_rgb_uv, NULL, uv_w);
+
+      // update two rows of Y and one row of RGB
+      for (i = 0; i < 2 * w; ++i) {  // both rows of the pair are contiguous in best_y, target_y and best_rgb_y
+        const int off = i + j * w;
+        const int diff_y = target_y[off] - best_rgb_y[i];  // error between the target and the current reconstruction
+        const int new_y = (int)best_y[off] + diff_y;
+        best_y[off] = clip_y(new_y);
+      }
+      for (i = 0; i < uv_w; ++i) {
+        const int off = 3 * (i + (j >> 1) * uv_w);
+        int W;
+        for (k = 0; k <= 2; ++k) {
+          const int diff_uv = (int)target_uv[off + k] - best_rgb_uv[3 * i + k];
+          best_uv[off + k] += diff_uv;
+        }
+        W = RGBToGray(best_uv[off + 0], best_uv[off + 1], best_uv[off + 2]);
+        for (k = 0; k <= 2; ++k) {
+          best_uv[off + k] -= W;  // subtract the RGBToGray() value of the updated triplet from each component
+        }
+      }
+    }
+    // TODO(skal): add early-termination criterion
+  }
+
+  // final reconstruction
+  ok = ConvertWRGBToYUV(best_y, best_uv, picture);
+
+ End:
+  WebPSafeFree(best_y);  // reached on both success and allocation failure; assumes WebPSafeFree() accepts NULL - confirm
+  WebPSafeFree(best_uv);
+  WebPSafeFree(target_y);
+  WebPSafeFree(target_uv);
+  WebPSafeFree(best_rgb_y);
+  WebPSafeFree(best_rgb_uv);
+  WebPSafeFree(tmp_buffer);
+  return ok;
+}
+#undef SAFE_ALLOC
+
+//------------------------------------------------------------------------------
+// "Fast" regular RGB->YUV
+
+#define SUM4(ptr, step) LinearToGamma(                     \
+    GammaToLinear((ptr)[0]) +                              \
+    GammaToLinear((ptr)[(step)]) +                         \
+    GammaToLinear((ptr)[rgb_stride]) +                     \
+    GammaToLinear((ptr)[rgb_stride + (step)]), 0)          \
+
+#define SUM2(ptr) \
+    LinearToGamma(GammaToLinear((ptr)[0]) + GammaToLinear((ptr)[rgb_stride]), 1)  // SUM4 sums the 2x2 block at ptr, SUM2 only the vertical pair; both read 'rgb_stride' from the expanding scope. The last LinearToGamma() argument (0 vs 1) presumably compensates for the sample count - confirm at LinearToGamma().
+
+#define SUM2ALPHA(ptr) ((ptr)[0] + (ptr)[rgb_stride])  // plain sum of two vertically adjacent samples; also reads 'rgb_stride' from the expanding scope
+#define SUM4ALPHA(ptr) (SUM2ALPHA(ptr) + SUM2ALPHA((ptr) + 4))  // sum over a 2x2 block; the hard-coded + 4 assumes 4 bytes per pixel
+
+#if defined(USE_INVERSE_ALPHA_TABLE)
+
+static const int kAlphaFix = 19;  // fixed-point precision (in bits) of the kInvAlpha[] entries
+// Following table is (1 << kAlphaFix) / a. The (v * kInvAlpha[a]) >> kAlphaFix
+// formula is then equal to v / a in most (99.6%) cases. Note that this table
+// and constant are adjusted very tightly to fit 32b arithmetic.
+// In particular, they use the fact that the operands for 'v / a' are actually
+// derived as v = (a0.p0 + a1.p1 + a2.p2 + a3.p3) and a = a0 + a1 + a2 + a3
+// with ai in [0..255] and pi in [0..1<<kGammaFix). The constraint to avoid
+// overflow is: kGammaFix + kAlphaFix <= 31.
+static const uint32_t kInvAlpha[4 * 0xff + 1] = {  // indexed by the summed alpha a, in [0 .. 4 * 255]
+  0,  /* alpha = 0 */
+  524288, 262144, 174762, 131072, 104857, 87381, 74898, 65536,
+  58254, 52428, 47662, 43690, 40329, 37449, 34952, 32768,
+  30840, 29127, 27594, 26214, 24966, 23831, 22795, 21845,
+  20971, 20164, 19418, 18724, 18078, 17476, 16912, 16384,
+  15887, 15420, 14979, 14563, 14169, 13797, 13443, 13107,
+  12787, 12483, 12192, 11915, 11650, 11397, 11155, 10922,
+  10699, 10485, 10280, 10082, 9892, 9709, 9532, 9362,
+  9198, 9039, 8886, 8738, 8594, 8456, 8322, 8192,
+  8065, 7943, 7825, 7710, 7598, 7489, 7384, 7281,
+  7182, 7084, 6990, 6898, 6808, 6721, 6636, 6553,
+  6472, 6393, 6316, 6241, 6168, 6096, 6026, 5957,
+  5890, 5825, 5761, 5698, 5637, 5577, 5518, 5461,
+  5405, 5349, 5295, 5242, 5190, 5140, 5090, 5041,
+  4993, 4946, 4899, 4854, 4809, 4766, 4723, 4681,
+  4639, 4599, 4559, 4519, 4481, 4443, 4405, 4369,
+  4332, 4297, 4262, 4228, 4194, 4161, 4128, 4096,
+  4064, 4032, 4002, 3971, 3942, 3912, 3883, 3855,
+  3826, 3799, 3771, 3744, 3718, 3692, 3666, 3640,
+  3615, 3591, 3566, 3542, 3518, 3495, 3472, 3449,
+  3426, 3404, 3382, 3360, 3339, 3318, 3297, 3276,
+  3256, 3236, 3216, 3196, 3177, 3158, 3139, 3120,
+  3102, 3084, 3066, 3048, 3030, 3013, 2995, 2978,
+  2962, 2945, 2928, 2912, 2896, 2880, 2864, 2849,
+  2833, 2818, 2803, 2788, 2774, 2759, 2744, 2730,
+  2716, 2702, 2688, 2674, 2661, 2647, 2634, 2621,
+  2608, 2595, 2582, 2570, 2557, 2545, 2532, 2520,
+  2508, 2496, 2484, 2473, 2461, 2449, 2438, 2427,
+  2416, 2404, 2394, 2383, 2372, 2361, 2351, 2340,
+  2330, 2319, 2309, 2299, 2289, 2279, 2269, 2259,
+  2250, 2240, 2231, 2221, 2212, 2202, 2193, 2184,
+  2175, 2166, 2157, 2148, 2139, 2131, 2122, 2114,
+  2105, 2097, 2088, 2080, 2072, 2064, 2056, 2048,
+  2040, 2032, 2024, 2016, 2008, 2001, 1993, 1985,
+  1978, 1971, 1963, 1956, 1949, 1941, 1934, 1927,
+  1920, 1913, 1906, 1899, 1892, 1885, 1879, 1872,
+  1865, 1859, 1852, 1846, 1839, 1833, 1826, 1820,
+  1814, 1807, 1801, 1795, 1789, 1783, 1777, 1771,
+  1765, 1759, 1753, 1747, 1741, 1736, 1730, 1724,
+  1718, 1713, 1707, 1702, 1696, 1691, 1685, 1680,
+  1675, 1669, 1664, 1659, 1653, 1648, 1643, 1638,
+  1633, 1628, 1623, 1618, 1613, 1608, 1603, 1598,
+  1593, 1588, 1583, 1579, 1574, 1569, 1565, 1560,
+  1555, 1551, 1546, 1542, 1537, 1533, 1528, 1524,
+  1519, 1515, 1510, 1506, 1502, 1497, 1493, 1489,
+  1485, 1481, 1476, 1472, 1468, 1464, 1460, 1456,
+  1452, 1448, 1444, 1440, 1436, 1432, 1428, 1424,
+  1420, 1416, 1413, 1409, 1405, 1401, 1398, 1394,
+  1390, 1387, 1383, 1379, 1376, 1372, 1368, 1365,
+  1361, 1358, 1354, 1351, 1347, 1344, 1340, 1337,
+  1334, 1330, 1327, 1323, 1320, 1317, 1314, 1310,
+  1307, 1304, 1300, 1297, 1294, 1291, 1288, 1285,
+  1281, 1278, 1275, 1272, 1269, 1266, 1263, 1260,
+  1257, 1254, 1251, 1248, 1245, 1242, 1239, 1236,
+  1233, 1230, 1227, 1224, 1222, 1219, 1216, 1213,
+  1210, 1208, 1205, 1202, 1199, 1197, 1194, 1191,
+  1188, 1186, 1183, 1180, 1178, 1175, 1172, 1170,
+  1167, 1165, 1162, 1159, 1157, 1154, 1152, 1149,
+  1147, 1144, 1142, 1139, 1137, 1134, 1132, 1129,
+  1127, 1125, 1122, 1120, 1117, 1115, 1113, 1110,
+  1108, 1106, 1103, 1101, 1099, 1096, 1094, 1092,
+  1089, 1087, 1085, 1083, 1081, 1078, 1076, 1074,
+  1072, 1069, 1067, 1065, 1063, 1061, 1059, 1057,
+  1054, 1052, 1050, 1048, 1046, 1044, 1042, 1040,
+  1038, 1036, 1034, 1032, 1030, 1028, 1026, 1024,
+  1022, 1020, 1018, 1016, 1014, 1012, 1010, 1008,
+  1006, 1004, 1002, 1000, 998, 996, 994, 992,
+  991, 989, 987, 985, 983, 981, 979, 978,
+  976, 974, 972, 970, 969, 967, 965, 963,
+  961, 960, 958, 956, 954, 953, 951, 949,
+  948, 946, 944, 942, 941, 939, 937, 936,
+  934, 932, 931, 929, 927, 926, 924, 923,
+  921, 919, 918, 916, 914, 913, 911, 910,
+  908, 907, 905, 903, 902, 900, 899, 897,
+  896, 894, 893, 891, 890, 888, 887, 885,
+  884, 882, 881, 879, 878, 876, 875, 873,
+  872, 870, 869, 868, 866, 865, 863, 862,
+  860, 859, 858, 856, 855, 853, 852, 851,
+  849, 848, 846, 845, 844, 842, 841, 840,
+  838, 837, 836, 834, 833, 832, 830, 829,
+  828, 826, 825, 824, 823, 821, 820, 819,
+  817, 816, 815, 814, 812, 811, 810, 809,
+  807, 806, 805, 804, 802, 801, 800, 799,
+  798, 796, 795, 794, 793, 791, 790, 789,
+  788, 787, 786, 784, 783, 782, 781, 780,
+  779, 777, 776, 775, 774, 773, 772, 771,
+  769, 768, 767, 766, 765, 764, 763, 762,
+  760, 759, 758, 757, 756, 755, 754, 753,
+  752, 751, 750, 748, 747, 746, 745, 744,
+  743, 742, 741, 740, 739, 738, 737, 736,
+  735, 734, 733, 732, 731, 730, 729, 728,
+  727, 726, 725, 724, 723, 722, 721, 720,
+  719, 718, 717, 716, 715, 714, 713, 712,
+  711, 710, 709, 708, 707, 706, 705, 704,
+  703, 702, 701, 700, 699, 699, 698, 697,
+  696, 695, 694, 693, 692, 691, 690, 689,
+  688, 688, 687, 686, 685, 684, 683, 682,
+  681, 680, 680, 679, 678, 677, 676, 675,
+  674, 673, 673, 672, 671, 670, 669, 668,
+  667, 667, 666, 665, 664, 663, 662, 661,
+  661, 660, 659, 658, 657, 657, 656, 655,
+  654, 653, 652, 652, 651, 650, 649, 648,
+  648, 647, 646, 645, 644, 644, 643, 642,
+  641, 640, 640, 639, 638, 637, 637, 636,
+  635, 634, 633, 633, 632, 631, 630, 630,
+  629, 628, 627, 627, 626, 625, 624, 624,
+  623, 622, 621, 621, 620, 619, 618, 618,
+  617, 616, 616, 615, 614, 613, 613, 612,
+  611, 611, 610, 609, 608, 608, 607, 606,
+  606, 605, 604, 604, 603, 602, 601, 601,
+  600, 599, 599, 598, 597, 597, 596, 595,
+  595, 594, 593, 593, 592, 591, 591, 590,
+  589, 589, 588, 587, 587, 586, 585, 585,
+  584, 583, 583, 582, 581, 581, 580, 579,
+  579, 578, 578, 577, 576, 576, 575, 574,
+  574, 573, 572, 572, 571, 571, 570, 569,
+  569, 568, 568, 567, 566, 566, 565, 564,
+  564, 563, 563, 562, 561, 561, 560, 560,
+  559, 558, 558, 557, 557, 556, 555, 555,
+  554, 554, 553, 553, 552, 551, 551, 550,
+  550, 549, 548, 548, 547, 547, 546, 546,
+  545, 544, 544, 543, 543, 542, 542, 541,
+  541, 540, 539, 539, 538, 538, 537, 537,
+  536, 536, 535, 534, 534, 533, 533, 532,
+  532, 531, 531, 530, 530, 529, 529, 528,
+  527, 527, 526, 526, 525, 525, 524, 524,
+  523, 523, 522, 522, 521, 521, 520, 520,
+  519, 519, 518, 518, 517, 517, 516, 516,
+  515, 515, 514, 514
+};
+
+// Note that LinearToGamma() expects the values to be premultiplied by 4,
+// so we incorporate this factor 4 inside the DIVIDE_BY_ALPHA macro directly.
+#define DIVIDE_BY_ALPHA(sum, a)  (((sum) * kInvAlpha[(a)]) >> (kAlphaFix - 2))  // table-based approximation of 4 * sum / a: shifting by kAlphaFix - 2 instead of kAlphaFix supplies the factor 4
+
+#else
+
+#define DIVIDE_BY_ALPHA(sum, a) (4 * (sum) / (a))  // fallback without the table: plain integer division; 'a' must be non-zero
+
+#endif  // USE_INVERSE_ALPHA_TABLE
+
+static WEBP_INLINE int LinearToGammaWeighted(const uint8_t* src,  // Alpha-weighted average of the four samples of a 2x2 block, each passed through GammaToLinear() first and the result through LinearToGamma().
+                                             const uint8_t* a_ptr,  // alpha samples, laid out like src
+                                             uint32_t total_a, int step,  // total_a: divisor, asserted to be in [1 .. 4 * 255]; step: offset to the horizontal neighbour (callers pass 0 to reuse the same column)
+                                             int rgb_stride) {  // offset to the vertical neighbour
+  const uint32_t sum =
+      a_ptr[0] * GammaToLinear(src[0]) +  // each sample is weighted by the alpha value at the same offset
+      a_ptr[step] * GammaToLinear(src[step]) +
+      a_ptr[rgb_stride] * GammaToLinear(src[rgb_stride]) +
+      a_ptr[rgb_stride + step] * GammaToLinear(src[rgb_stride + step]);
+  assert(total_a > 0 && total_a <= 4 * 0xff);  // also guards the division / table lookup in DIVIDE_BY_ALPHA
+#if defined(USE_INVERSE_ALPHA_TABLE)
+  assert((uint64_t)sum * kInvAlpha[total_a] < ((uint64_t)1 << 32));  // the product formed inside DIVIDE_BY_ALPHA must fit in 32 bits
+#endif
+  return LinearToGamma(DIVIDE_BY_ALPHA(sum, total_a), 0);
+}
+
+static WEBP_INLINE void ConvertRowToY(const uint8_t* const r_ptr,  // Fills one row of dst_y by calling RGBToY() on every source pixel.
+                                      const uint8_t* const g_ptr,  // r_ptr / g_ptr / b_ptr: first R, G and B sample of the row
+                                      const uint8_t* const b_ptr,
+                                      int step,  // bytes between consecutive source pixels
+                                      uint8_t* const dst_y,  // out: 'width' samples
+                                      int width,
+                                      VP8Random* const rg) {  // forwarded unchanged to RGBToY(); callers pass NULL when dithering is off
+  int i, j;
+  for (i = 0, j = 0; i < width; ++i, j += step) {  // i indexes dst_y, j is the matching source byte offset
+    dst_y[i] = RGBToY(r_ptr[j], g_ptr[j], b_ptr[j], rg);
+  }
+}
+
+static WEBP_INLINE void ConvertRowsToUVWithAlpha(const uint8_t* const r_ptr,  // Produces one row of U/V samples from two source rows, weighting each pixel by its alpha. Assumes 4 bytes per pixel.
+                                                 const uint8_t* const g_ptr,  // r_ptr / g_ptr / b_ptr / a_ptr: first R, G, B and alpha sample of the upper row
+                                                 const uint8_t* const b_ptr,
+                                                 const uint8_t* const a_ptr,
+                                                 int rgb_stride,  // offset from the upper to the lower row (callers pass 0 when there is only one row)
+                                                 uint8_t* const dst_u,  // out: (width + 1) / 2 samples
+                                                 uint8_t* const dst_v,  // out: (width + 1) / 2 samples
+                                                 int width,  // source width in pixels
+                                                 VP8Random* const rg) {  // forwarded unchanged to RGBToU() / RGBToV()
+  int i, j;
+  // we loop over 2x2 blocks and produce one U/V value for each.
+  for (i = 0, j = 0; i < (width >> 1); ++i, j += 2 * sizeof(uint32_t)) {  // j advances by two 4-byte pixels per output sample
+    const uint32_t a = SUM4ALPHA(a_ptr + j);
+    int r, g, b;
+    if (a == 4 * 0xff || a == 0) {  // all four alphas are 0xff, or all are 0: plain unweighted average (also avoids dividing by a == 0)
+      r = SUM4(r_ptr + j, 4);
+      g = SUM4(g_ptr + j, 4);
+      b = SUM4(b_ptr + j, 4);
+    } else {
+      r = LinearToGammaWeighted(r_ptr + j, a_ptr + j, a, 4, rgb_stride);
+      g = LinearToGammaWeighted(g_ptr + j, a_ptr + j, a, 4, rgb_stride);
+      b = LinearToGammaWeighted(b_ptr + j, a_ptr + j, a, 4, rgb_stride);
+    }
+    dst_u[i] = RGBToU(r, g, b, rg);
+    dst_v[i] = RGBToV(r, g, b, rg);
+  }
+  if (width & 1) {  // odd width: the last column has no right-hand neighbour
+    const uint32_t a = 2u * SUM2ALPHA(a_ptr + j);  // doubled so it matches the four terms summed with step == 0 below
+    int r, g, b;
+    if (a == 4 * 0xff || a == 0) {
+      r = SUM2(r_ptr + j);
+      g = SUM2(g_ptr + j);
+      b = SUM2(b_ptr + j);
+    } else {
+      r = LinearToGammaWeighted(r_ptr + j, a_ptr + j, a, 0, rgb_stride);  // step == 0: each of the two pixels is counted twice
+      g = LinearToGammaWeighted(g_ptr + j, a_ptr + j, a, 0, rgb_stride);
+      b = LinearToGammaWeighted(b_ptr + j, a_ptr + j, a, 0, rgb_stride);
+    }
+    dst_u[i] = RGBToU(r, g, b, rg);  // 'i' still holds width >> 1 from the loop above
+    dst_v[i] = RGBToV(r, g, b, rg);
+  }
+}
+
+static WEBP_INLINE void ConvertRowsToUV(const uint8_t* const r_ptr,  // Produces one row of U/V samples from two source rows, ignoring alpha.
+                                        const uint8_t* const g_ptr,  // r_ptr / g_ptr / b_ptr: first R, G and B sample of the upper row
+                                        const uint8_t* const b_ptr,
+                                        int step, int rgb_stride,  // step: bytes between pixels; rgb_stride: offset to the lower row (callers pass 0 when there is only one row)
+                                        uint8_t* const dst_u,  // out: (width + 1) / 2 samples
+                                        uint8_t* const dst_v,  // out: (width + 1) / 2 samples
+                                        int width,  // source width in pixels
+                                        VP8Random* const rg) {  // forwarded unchanged to RGBToU() / RGBToV()
+  int i, j;
+  for (i = 0, j = 0; i < (width >> 1); ++i, j += 2 * step) {  // one output sample per 2x2 source block
+    const int r = SUM4(r_ptr + j, step);
+    const int g = SUM4(g_ptr + j, step);
+    const int b = SUM4(b_ptr + j, step);
+    dst_u[i] = RGBToU(r, g, b, rg);
+    dst_v[i] = RGBToV(r, g, b, rg);
+  }
+  if (width & 1) {  // odd width: the last column only has a vertical pair
+    const int r = SUM2(r_ptr + j);
+    const int g = SUM2(g_ptr + j);
+    const int b = SUM2(b_ptr + j);
+    dst_u[i] = RGBToU(r, g, b, rg);  // 'i' still holds width >> 1 from the loop above
+    dst_v[i] = RGBToV(r, g, b, rg);
+  }
+}
+
+static int ImportYUVAFromRGBA(const uint8_t* const r_ptr,  // Allocates the picture's YUV(A) planes and fills them from separate R/G/B/A sample pointers. Returns 1 on success, 0 on failure.
+                              const uint8_t* const g_ptr,
+                              const uint8_t* const b_ptr,
+                              const uint8_t* const a_ptr,  // Import() passes NULL here when there is no alpha channel
+                              int step,         // bytes per pixel
+                              int rgb_stride,   // bytes per scanline
+                              float dithering,  // values > 0 enable the VP8Random-based dithering of the non-iterative path
+                              int use_iterative_conversion,  // non-zero selects PreprocessARGB(), unless the picture is too small
+                              WebPPicture* const picture) {  // in: width / height; out: colorspace, use_argb and the y/u/v/a planes
+  int y;
+  const int width = picture->width;
+  const int height = picture->height;
+  const int has_alpha = CheckNonOpaque(a_ptr, width, height, step, rgb_stride);
+
+  picture->colorspace = has_alpha ? WEBP_YUV420A : WEBP_YUV420;
+  picture->use_argb = 0;
+
+  // disable smart conversion if source is too small (overkill).
+  if (width < kMinDimensionIterativeConversion ||
+      height < kMinDimensionIterativeConversion) {
+    use_iterative_conversion = 0;
+  }
+
+  if (!WebPPictureAllocYUVA(picture, width, height)) {
+    return 0;
+  }
+  if (has_alpha) {
+    WebPInitAlphaProcessing();
+    assert(step == 4);  // the alpha-aware helpers hard-code 4 bytes per pixel
+#if defined(USE_INVERSE_ALPHA_TABLE)
+    assert(kAlphaFix + kGammaFix <= 31);  // overflow constraint stated in the kInvAlpha[] comment
+#endif
+  }
+
+  if (use_iterative_conversion) {
+    InitGammaTablesF();
+    if (!PreprocessARGB(r_ptr, g_ptr, b_ptr, step, rgb_stride, picture)) {
+      return 0;
+    }
+    if (has_alpha) {
+      WebPExtractAlpha(a_ptr, rgb_stride, width, height,  // alpha plane copied for the whole picture in one call
+                       picture->a, picture->a_stride);
+    }
+  } else {
+    uint8_t* dst_y = picture->y;
+    uint8_t* dst_u = picture->u;
+    uint8_t* dst_v = picture->v;
+    uint8_t* dst_a = picture->a;
+
+    VP8Random base_rg;
+    VP8Random* rg = NULL;  // stays NULL when dithering is disabled
+    if (dithering > 0.) {
+      VP8InitRandom(&base_rg, dithering);
+      rg = &base_rg;
+    }
+
+    InitGammaTables();
+
+    // Downsample Y/U/V planes, two rows at a time
+    for (y = 0; y < (height >> 1); ++y) {
+      int rows_have_alpha = has_alpha;
+      const int off1 = (2 * y + 0) * rgb_stride;
+      const int off2 = (2 * y + 1) * rgb_stride;
+      ConvertRowToY(r_ptr + off1, g_ptr + off1, b_ptr + off1, step,
+                    dst_y, width, rg);
+      ConvertRowToY(r_ptr + off2, g_ptr + off2, b_ptr + off2, step,
+                    dst_y + picture->y_stride, width, rg);
+      dst_y += 2 * picture->y_stride;
+      if (has_alpha) {
+        rows_have_alpha &= !WebPExtractAlpha(a_ptr + off1, rgb_stride,  // presumably WebPExtractAlpha() returns non-zero when every extracted value is opaque - confirm
+                                             width, 2,
+                                             dst_a, picture->a_stride);
+        dst_a += 2 * picture->a_stride;
+      }
+      if (!rows_have_alpha) {
+        ConvertRowsToUV(r_ptr + off1, g_ptr + off1, b_ptr + off1,
+                        step, rgb_stride, dst_u, dst_v, width, rg);
+      } else {
+        ConvertRowsToUVWithAlpha(r_ptr + off1, g_ptr + off1, b_ptr + off1,
+                                 a_ptr + off1, rgb_stride,
+                                 dst_u, dst_v, width, rg);
+      }
+      dst_u += picture->uv_stride;
+      dst_v += picture->uv_stride;
+    }
+    if (height & 1) {    // extra last row
+      const int off = 2 * y * rgb_stride;  // y == height >> 1 here, so this is the offset of the final row
+      int row_has_alpha = has_alpha;
+      ConvertRowToY(r_ptr + off, g_ptr + off, b_ptr + off, step,
+                    dst_y, width, rg);
+      if (row_has_alpha) {
+        row_has_alpha &= !WebPExtractAlpha(a_ptr + off, 0, width, 1, dst_a, 0);
+      }
+      if (!row_has_alpha) {
+        ConvertRowsToUV(r_ptr + off, g_ptr + off, b_ptr + off,
+                        step, 0, dst_u, dst_v, width, rg);  // rgb_stride == 0: the 'lower' row aliases the single remaining row
+      } else {
+        ConvertRowsToUVWithAlpha(r_ptr + off, g_ptr + off, b_ptr + off,
+                                 a_ptr + off, 0,
+                                 dst_u, dst_v, width, rg);
+      }
+    }
+  }
+  return 1;
+}
+
+#undef SUM4
+#undef SUM2
+#undef SUM4ALPHA
+#undef SUM2ALPHA
+
+//------------------------------------------------------------------------------
+// call for ARGB->YUVA conversion
+
+static int PictureARGBToYUVA(WebPPicture* picture, WebPEncCSP colorspace,  // Shared implementation of the ARGB -> YUVA entry points: validates the inputs, then converts picture->argb via ImportYUVAFromRGBA().
+                             float dithering, int use_iterative_conversion) {  // both forwarded unchanged to ImportYUVAFromRGBA()
+  if (picture == NULL) return 0;  // no picture to record an error code in
+  if (picture->argb == NULL) {
+    return WebPEncodingSetError(picture, VP8_ENC_ERROR_NULL_PARAMETER);
+  } else if ((colorspace & WEBP_CSP_UV_MASK) != WEBP_YUV420) {  // 'colorspace' is only validated here, not used afterwards
+    return WebPEncodingSetError(picture, VP8_ENC_ERROR_INVALID_CONFIGURATION);
+  } else {
+    const uint8_t* const argb = (const uint8_t*)picture->argb;
+    const uint8_t* const r = ALPHA_IS_LAST ? argb + 2 : argb + 1;  // byte position of each channel inside a 32-bit pixel, selected by ALPHA_IS_LAST
+    const uint8_t* const g = ALPHA_IS_LAST ? argb + 1 : argb + 2;
+    const uint8_t* const b = ALPHA_IS_LAST ? argb + 0 : argb + 3;
+    const uint8_t* const a = ALPHA_IS_LAST ? argb + 3 : argb + 0;
+
+    picture->colorspace = WEBP_YUV420;  // ImportYUVAFromRGBA() assigns colorspace again, based on whether alpha is present
+    return ImportYUVAFromRGBA(r, g, b, a, 4, 4 * picture->argb_stride,  // step = 4 bytes per pixel; argb_stride is multiplied by 4 to get bytes
+                              dithering, use_iterative_conversion, picture);
+  }
+}
+
+int WebPPictureARGBToYUVADithered(WebPPicture* picture, WebPEncCSP colorspace,  // ARGB -> YUVA conversion with caller-supplied dithering strength.
+                                  float dithering) {
+  return PictureARGBToYUVA(picture, colorspace, dithering, 0);  // last argument 0: iterative conversion disabled
+}
+
+int WebPPictureARGBToYUVA(WebPPicture* picture, WebPEncCSP colorspace) {  // ARGB -> YUVA conversion without dithering.
+  return PictureARGBToYUVA(picture, colorspace, 0.f, 0);  // dithering 0.f, iterative conversion disabled
+}
+
+#if WEBP_ENCODER_ABI_VERSION > 0x0204
+int WebPPictureSmartARGBToYUVA(WebPPicture* picture) {  // ARGB -> YUV420(A) conversion that requests the iterative path; only compiled for encoder ABI versions above 0x0204.
+  return PictureARGBToYUVA(picture, WEBP_YUV420, 0.f, 1);  // no dithering, iterative conversion requested
+}
+#endif
+
+//------------------------------------------------------------------------------
+// call for YUVA -> ARGB conversion
+
+int WebPPictureYUVAToARGB(WebPPicture* picture) {  // Converts the picture's Y/U/V(/A) planes into a newly allocated ARGB buffer and sets use_argb. Returns 1 on success, 0 on failure.
+  if (picture == NULL) return 0;
+  if (picture->y == NULL || picture->u == NULL || picture->v == NULL) {
+    return WebPEncodingSetError(picture, VP8_ENC_ERROR_NULL_PARAMETER);
+  }
+  if ((picture->colorspace & WEBP_CSP_ALPHA_BIT) && picture->a == NULL) {  // alpha is declared by the colorspace but the plane is missing
+    return WebPEncodingSetError(picture, VP8_ENC_ERROR_NULL_PARAMETER);
+  }
+  if ((picture->colorspace & WEBP_CSP_UV_MASK) != WEBP_YUV420) {
+    return WebPEncodingSetError(picture, VP8_ENC_ERROR_INVALID_CONFIGURATION);
+  }
+  // Allocate a new argb buffer (discarding the previous one).
+  if (!WebPPictureAllocARGB(picture, picture->width, picture->height)) return 0;
+  picture->use_argb = 1;
+
+  // Convert
+  {
+    int y;
+    const int width = picture->width;
+    const int height = picture->height;
+    const int argb_stride = 4 * picture->argb_stride;  // multiplied by 4 because dst below is a byte pointer
+    uint8_t* dst = (uint8_t*)picture->argb;
+    const uint8_t *cur_u = picture->u, *cur_v = picture->v, *cur_y = picture->y;
+    WebPUpsampleLinePairFunc upsample = WebPGetLinePairConverter(ALPHA_IS_LAST);
+
+    // First row, with replicated top samples.
+    upsample(cur_y, NULL, cur_u, cur_v, cur_u, cur_v, dst, NULL, width);  // single row: the second luma and output pointers are NULL, and the same U/V row is passed twice
+    cur_y += picture->y_stride;
+    dst += argb_stride;
+    // Center rows.
+    for (y = 1; y + 1 < height; y += 2) {  // rows (1,2), (3,4), ... handled in pairs
+      const uint8_t* const top_u = cur_u;
+      const uint8_t* const top_v = cur_v;
+      cur_u += picture->uv_stride;
+      cur_v += picture->uv_stride;
+      upsample(cur_y, cur_y + picture->y_stride, top_u, top_v, cur_u, cur_v,
+               dst, dst + argb_stride, width);
+      cur_y += 2 * picture->y_stride;
+      dst += 2 * argb_stride;
+    }
+    // Last row (if needed), with replicated bottom samples.
+    if (height > 1 && !(height & 1)) {  // only an even height leaves one unpaired row at the bottom
+      upsample(cur_y, NULL, cur_u, cur_v, cur_u, cur_v, dst, NULL, width);
+    }
+    // Insert alpha values if needed, in replacement for the default 0xff ones.
+    if (picture->colorspace & WEBP_CSP_ALPHA_BIT) {
+      for (y = 0; y < height; ++y) {
+        uint32_t* const argb_dst = picture->argb + y * picture->argb_stride;
+        const uint8_t* const src = picture->a + y * picture->a_stride;
+        int x;
+        for (x = 0; x < width; ++x) {
+          argb_dst[x] = (argb_dst[x] & 0x00ffffffu) | ((uint32_t)src[x] << 24);  // keep the low 24 bits, replace the top byte with the alpha sample
+        }
+      }
+    }
+  }
+  return 1;
+}
+
+//------------------------------------------------------------------------------
+// automatic import / conversion
+
+static int Import(WebPPicture* const picture,  // Imports packed RGB(A/X) samples into the picture: YUV(A) planes if !picture->use_argb, ARGB otherwise. Returns 1 on success, 0 on failure.
+                  const uint8_t* const rgb, int rgb_stride,  // NOTE(review): 'rgb' is not checked for NULL here - confirm callers guarantee it
+                  int step, int swap_rb, int import_alpha) {  // step: bytes per pixel; swap_rb: source is in B,G,R order; import_alpha: read alpha from byte 3 of each pixel
+  int y;
+  const uint8_t* const r_ptr = rgb + (swap_rb ? 2 : 0);
+  const uint8_t* const g_ptr = rgb + 1;
+  const uint8_t* const b_ptr = rgb + (swap_rb ? 0 : 2);
+  const uint8_t* const a_ptr = import_alpha ? rgb + 3 : NULL;
+  const int width = picture->width;
+  const int height = picture->height;
+
+  if (!picture->use_argb) {
+    return ImportYUVAFromRGBA(r_ptr, g_ptr, b_ptr, a_ptr, step, rgb_stride,
+                              0.f /* no dithering */, 0, picture);
+  }
+  if (!WebPPictureAlloc(picture)) return 0;
+
+  assert(step >= (import_alpha ? 4 : 3));  // NOTE(review): only reached on the ARGB path; the YUV path returns above
+  for (y = 0; y < height; ++y) {
+    uint32_t* const dst = &picture->argb[y * picture->argb_stride];
+    int x;
+    for (x = 0; x < width; ++x) {
+      const int offset = step * x + y * rgb_stride;  // byte offset of pixel (x, y) in the source buffer
+      dst[x] = MakeARGB32(import_alpha ? a_ptr[offset] : 0xff,  // alpha is 0xff when not imported
+                          r_ptr[offset], g_ptr[offset], b_ptr[offset]);
+    }
+  }
+  return 1;
+}
+
+// Public API
+
+int WebPPictureImportRGB(WebPPicture* picture,  // Imports 3-byte R,G,B pixels; returns 0 when picture is NULL.
+                         const uint8_t* rgb, int rgb_stride) {
+  return (picture != NULL) ? Import(picture, rgb, rgb_stride, 3, 0, 0) : 0;  // step 3, no R/B swap, no alpha
+}
+
+int WebPPictureImportBGR(WebPPicture* picture,  // Imports 3-byte B,G,R pixels; returns 0 when picture is NULL.
+                         const uint8_t* rgb, int rgb_stride) {
+  return (picture != NULL) ? Import(picture, rgb, rgb_stride, 3, 1, 0) : 0;  // step 3, R/B swapped, no alpha
+}
+
+int WebPPictureImportRGBA(WebPPicture* picture,  // Imports 4-byte R,G,B,A pixels including alpha; returns 0 when picture is NULL.
+                          const uint8_t* rgba, int rgba_stride) {
+  return (picture != NULL) ? Import(picture, rgba, rgba_stride, 4, 0, 1) : 0;  // step 4, no R/B swap, alpha imported
+}
+
+int WebPPictureImportBGRA(WebPPicture* picture,  // Imports 4-byte B,G,R,A pixels including alpha; returns 0 when picture is NULL.
+                          const uint8_t* rgba, int rgba_stride) {
+  return (picture != NULL) ? Import(picture, rgba, rgba_stride, 4, 1, 1) : 0;  // step 4, R/B swapped, alpha imported
+}
+
+int WebPPictureImportRGBX(WebPPicture* picture,  // Imports 4-byte R,G,B,X pixels, ignoring the fourth byte; returns 0 when picture is NULL.
+                          const uint8_t* rgba, int rgba_stride) {
+  return (picture != NULL) ? Import(picture, rgba, rgba_stride, 4, 0, 0) : 0;  // step 4, no R/B swap, alpha not imported
+}
+
+int WebPPictureImportBGRX(WebPPicture* picture,  // Imports 4-byte B,G,R,X pixels, ignoring the fourth byte; returns 0 when picture is NULL.
+                          const uint8_t* rgba, int rgba_stride) {
+  return (picture != NULL) ? Import(picture, rgba, rgba_stride, 4, 1, 0) : 0;  // step 4, R/B swapped, alpha not imported
+}
+
+//------------------------------------------------------------------------------
diff --git a/src/3rdparty/libwebp/src/enc/picture_psnr.c b/src/3rdparty/libwebp/src/enc/picture_psnr.c
new file mode 100644
index 0000000..2254b7e
--- /dev/null
+++ b/src/3rdparty/libwebp/src/enc/picture_psnr.c
@@ -0,0 +1,150 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// WebPPicture tools for measuring distortion
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <math.h>
+
+#include "./vp8enci.h"
+
+//------------------------------------------------------------------------------
+// local-min distortion
+//
+// For every pixel in the *reference* picture, we search for the local best
+// match in the compressed image. This is not a symmetrical measure.
+
+#define RADIUS 2  // search radius. Shouldn't be too large.
+
+static float AccumulateLSIM(const uint8_t* src, int src_stride,
+                            const uint8_t* ref, int ref_stride,
+                            int w, int h) {
+  int x, y;
+  double total_sse = 0.;
+  for (y = 0; y < h; ++y) {
+    const int y_0 = (y - RADIUS < 0) ? 0 : y - RADIUS;   // window top, clamped
+    const int y_1 = (y + RADIUS + 1 >= h) ? h : y + RADIUS + 1;  // exclusive
+    for (x = 0; x < w; ++x) {
+      const int x_0 = (x - RADIUS < 0) ? 0 : x - RADIUS;   // window left
+      const int x_1 = (x + RADIUS + 1 >= w) ? w : x + RADIUS + 1;  // exclusive
+      double best_sse = 255. * 255.;   // largest possible squared difference
+      const double value = (double)ref[y * ref_stride + x];
+      int i, j;
+      for (j = y_0; j < y_1; ++j) {
+        const uint8_t* s = src + j * src_stride;
+        for (i = x_0; i < x_1; ++i) {
+          const double sse = (double)(s[i] - value) * (s[i] - value);
+          if (sse < best_sse) best_sse = sse;   // keep the closest 'src' sample
+        }
+      }
+      total_sse += best_sse;
+    }
+  }
+  return (float)total_sse;   // sum over all w x h positions (not normalized)
+}
+#undef RADIUS
+
+//------------------------------------------------------------------------------
+// Distortion
+
+// Value returned by GetPSNR() for v <= 0 (i.e. no measurable distortion).
+static const double kMinDistortion_dB = 99.;
+static float GetPSNR(const double v) {
+  return (float)((v > 0.) ? -4.3429448 * log(v / (255 * 255.))  // 10 / ln(10)
+                          : kMinDistortion_dB);
+}
+
+int WebPPictureDistortion(const WebPPicture* src, const WebPPicture* ref,
+                          int type, float result[5]) {   // returns 0 on error
+  DistoStats stats[5];   // [0..3]: Y, U, V, A planes. [4]: accumulated total.
+  int has_alpha;
+  int uv_w, uv_h;
+
+  if (src == NULL || ref == NULL ||
+      src->width != ref->width || src->height != ref->height ||
+      src->y == NULL || ref->y == NULL ||
+      src->u == NULL || ref->u == NULL ||
+      src->v == NULL || ref->v == NULL ||
+      result == NULL) {
+    return 0;
+  }
+  // TODO(skal): provide distortion for ARGB too.
+  if (src->use_argb == 1 || src->use_argb != ref->use_argb) {
+    return 0;
+  }
+
+  has_alpha = !!(src->colorspace & WEBP_CSP_ALPHA_BIT);
+  if (has_alpha != !!(ref->colorspace & WEBP_CSP_ALPHA_BIT) ||
+      (has_alpha && (src->a == NULL || ref->a == NULL))) {
+    return 0;
+  }
+
+  memset(stats, 0, sizeof(stats));
+
+  uv_w = (src->width + 1) >> 1;    // chroma planes are half-size, rounded up
+  uv_h = (src->height + 1) >> 1;
+  if (type >= 2) {                 // local-min distortion (AccumulateLSIM)
+    float sse[4];
+    sse[0] = AccumulateLSIM(src->y, src->y_stride,
+                            ref->y, ref->y_stride, src->width, src->height);
+    sse[1] = AccumulateLSIM(src->u, src->uv_stride,
+                            ref->u, ref->uv_stride, uv_w, uv_h);
+    sse[2] = AccumulateLSIM(src->v, src->uv_stride,
+                            ref->v, ref->uv_stride, uv_w, uv_h);
+    sse[3] = has_alpha ? AccumulateLSIM(src->a, src->a_stride,
+                                        ref->a, ref->a_stride,
+                                        src->width, src->height)
+                       : 0.f;
+    result[0] = GetPSNR(sse[0] / (src->width * src->height));
+    result[1] = GetPSNR(sse[1] / (uv_w * uv_h));
+    result[2] = GetPSNR(sse[2] / (uv_w * uv_h));
+    result[3] = GetPSNR(sse[3] / (src->width * src->height));
+    {
+      double total_sse = sse[0] + sse[1] + sse[2];
+      int total_pixels = src->width * src->height + 2 * uv_w * uv_h;
+      if (has_alpha) {             // alpha only counts when it is present
+        total_pixels += src->width * src->height;
+        total_sse += sse[3];
+      }
+      result[4] = GetPSNR(total_sse / total_pixels);
+    }
+  } else {                         // type == 1: SSIM, otherwise: PSNR
+    int c;
+    VP8SSIMAccumulatePlane(src->y, src->y_stride,
+                           ref->y, ref->y_stride,
+                           src->width, src->height, &stats[0]);
+    VP8SSIMAccumulatePlane(src->u, src->uv_stride,
+                           ref->u, ref->uv_stride,
+                           uv_w, uv_h, &stats[1]);
+    VP8SSIMAccumulatePlane(src->v, src->uv_stride,
+                           ref->v, ref->uv_stride,
+                           uv_w, uv_h, &stats[2]);
+    if (has_alpha) {
+      VP8SSIMAccumulatePlane(src->a, src->a_stride,
+                             ref->a, ref->a_stride,
+                             src->width, src->height, &stats[3]);
+    }
+    for (c = 0; c <= 4; ++c) {     // c == 4 reports the accumulated stats[4]
+      if (type == 1) {
+        const double v = VP8SSIMGet(&stats[c]);
+        result[c] = (float)((v < 1.) ? -10.0 * log10(1. - v)
+                                     : kMinDistortion_dB);
+      } else {
+        const double v = VP8SSIMGetSquaredError(&stats[c]);
+        result[c] = GetPSNR(v);
+      }
+      // Accumulate forward
+      if (c < 4) VP8SSIMAddStats(&stats[c], &stats[4]);
+    }
+  }
+  return 1;
+}
+
+//------------------------------------------------------------------------------
diff --git a/src/3rdparty/libwebp/src/enc/picture_rescale.c b/src/3rdparty/libwebp/src/enc/picture_rescale.c
new file mode 100644
index 0000000..de52848
--- /dev/null
+++ b/src/3rdparty/libwebp/src/enc/picture_rescale.c
@@ -0,0 +1,285 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// WebPPicture tools: copy, crop, rescaling and view.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <assert.h>
+#include <stdlib.h>
+
+#include "./vp8enci.h"
+#include "../utils/rescaler.h"
+#include "../utils/utils.h"
+
+#define HALVE(x) (((x) + 1) >> 1)
+
+// Copy the 'specs' (writer, *opaque, width, height...) of 'src' into 'dst',
+// then reset dst's buffers so that 'dst' is marked as not owning any memory.
+static void PictureGrabSpecs(const WebPPicture* const src,
+                             WebPPicture* const dst) {
+  assert(src != NULL && dst != NULL);
+  *dst = *src;                     // plain struct copy, pointers included
+  WebPPictureResetBuffers(dst);
+}
+
+//------------------------------------------------------------------------------
+// Picture copying
+
+static void CopyPlane(const uint8_t* src, int src_stride,   // row by row copy
+                      uint8_t* dst, int dst_stride, int width, int height) {
+  while (height-- > 0) {
+    memcpy(dst, src, width);       // 'width' is the number of bytes per row
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+// Adjust top-left corner to chroma sample position (no-op for ARGB pictures).
+static void SnapTopLeftPosition(const WebPPicture* const pic,
+                                int* const left, int* const top) {
+  if (!pic->use_argb) {
+    *left &= ~1;   // clear the lowest bit: round down to an even coordinate
+    *top &= ~1;
+  }
+}
+
+// Snap the top-left corner, then return 1 if the rectangle lies within 'pic'.
+static int AdjustAndCheckRectangle(const WebPPicture* const pic,
+                                   int* const left, int* const top,
+                                   int width, int height) {
+  SnapTopLeftPosition(pic, left, top);
+  if ((*left) < 0 || (*top) < 0) return 0;
+  if (width <= 0 || height <= 0) return 0;        // empty rectangle
+  if ((*left) + width > pic->width) return 0;     // crosses the right edge
+  if ((*top) + height > pic->height) return 0;    // crosses the bottom edge
+  return 1;
+}
+
+int WebPPictureCopy(const WebPPicture* src, WebPPicture* dst) {
+  if (src == NULL || dst == NULL) return 0;
+  if (src == dst) return 1;        // nothing to copy
+
+  PictureGrabSpecs(src, dst);      // previous content of '*dst' is overwritten
+  if (!WebPPictureAlloc(dst)) return 0;
+
+  if (!src->use_argb) {
+    CopyPlane(src->y, src->y_stride,
+              dst->y, dst->y_stride, dst->width, dst->height);
+    CopyPlane(src->u, src->uv_stride,
+              dst->u, dst->uv_stride, HALVE(dst->width), HALVE(dst->height));
+    CopyPlane(src->v, src->uv_stride,
+              dst->v, dst->uv_stride, HALVE(dst->width), HALVE(dst->height));
+    if (dst->a != NULL)  {         // copy alpha only if 'dst' got an alpha plane
+      CopyPlane(src->a, src->a_stride,
+                dst->a, dst->a_stride, dst->width, dst->height);
+    }
+  } else {                         // argb strides/width are in uint32_t units
+    CopyPlane((const uint8_t*)src->argb, 4 * src->argb_stride,
+              (uint8_t*)dst->argb, 4 * dst->argb_stride,
+              4 * dst->width, dst->height);
+  }
+  return 1;
+}
+
+int WebPPictureIsView(const WebPPicture* picture) {
+  if (picture == NULL) return 0;
+  if (picture->use_argb) {
+    return (picture->memory_argb_ == NULL);   // no ARGB memory attached
+  }
+  return (picture->memory_ == NULL);          // no YUV(A) memory attached
+}
+
+int WebPPictureView(const WebPPicture* src,
+                    int left, int top, int width, int height,
+                    WebPPicture* dst) {
+  if (src == NULL || dst == NULL) return 0;
+
+  // verify rectangle position ('left' and 'top' may get snapped to even).
+  if (!AdjustAndCheckRectangle(src, &left, &top, width, height)) return 0;
+
+  if (src != dst) {  // beware of aliasing! We don't want to leak 'memory_'.
+    PictureGrabSpecs(src, dst);
+  }
+  dst->width = width;
+  dst->height = height;
+  if (!src->use_argb) {            // planes point inside src's buffers
+    dst->y = src->y + top * src->y_stride + left;
+    dst->u = src->u + (top >> 1) * src->uv_stride + (left >> 1);
+    dst->v = src->v + (top >> 1) * src->uv_stride + (left >> 1);
+    dst->y_stride = src->y_stride;
+    dst->uv_stride = src->uv_stride;
+    if (src->a != NULL) {          // alpha plane is optional
+      dst->a = src->a + top * src->a_stride + left;
+      dst->a_stride = src->a_stride;
+    }
+  } else {
+    dst->argb = src->argb + top * src->argb_stride + left;
+    dst->argb_stride = src->argb_stride;
+  }
+  return 1;
+}
+
+//------------------------------------------------------------------------------
+// Picture cropping
+
+int WebPPictureCrop(WebPPicture* pic,
+                    int left, int top, int width, int height) {
+  WebPPicture tmp;                 // cropped copy, replaces '*pic' on success
+
+  if (pic == NULL) return 0;
+  if (!AdjustAndCheckRectangle(pic, &left, &top, width, height)) return 0;
+
+  PictureGrabSpecs(pic, &tmp);
+  tmp.width = width;
+  tmp.height = height;
+  if (!WebPPictureAlloc(&tmp)) return 0;   // '*pic' is left untouched
+
+  if (!pic->use_argb) {
+    const int y_offset = top * pic->y_stride + left;
+    const int uv_offset = (top / 2) * pic->uv_stride + left / 2;
+    CopyPlane(pic->y + y_offset, pic->y_stride,
+              tmp.y, tmp.y_stride, width, height);
+    CopyPlane(pic->u + uv_offset, pic->uv_stride,
+              tmp.u, tmp.uv_stride, HALVE(width), HALVE(height));
+    CopyPlane(pic->v + uv_offset, pic->uv_stride,
+              tmp.v, tmp.uv_stride, HALVE(width), HALVE(height));
+
+    if (tmp.a != NULL) {           // copy alpha only if 'tmp' got an alpha plane
+      const int a_offset = top * pic->a_stride + left;
+      CopyPlane(pic->a + a_offset, pic->a_stride,
+                tmp.a, tmp.a_stride, width, height);
+    }
+  } else {                         // argb offsets/strides: uint32_t units
+    const uint8_t* const src =
+        (const uint8_t*)(pic->argb + top * pic->argb_stride + left);
+    CopyPlane(src, pic->argb_stride * 4,
+              (uint8_t*)tmp.argb, tmp.argb_stride * 4,
+              width * 4, height);
+  }
+  WebPPictureFree(pic);            // release the old picture before the swap
+  *pic = tmp;
+  return 1;
+}
+
+//------------------------------------------------------------------------------
+// Simple picture rescaler
+
+static void RescalePlane(const uint8_t* src,
+                         int src_width, int src_height, int src_stride,
+                         uint8_t* dst,
+                         int dst_width, int dst_height, int dst_stride,
+                         int32_t* const work,   // >= 2*dst_width*num_channels
+                         int num_channels) {
+  WebPRescaler rescaler;
+  int y = 0;                       // number of source rows consumed so far
+  WebPRescalerInit(&rescaler, src_width, src_height,
+                   dst, dst_width, dst_height, dst_stride,
+                   num_channels,
+                   src_width, dst_width,
+                   src_height, dst_height,
+                   work);
+  memset(work, 0, 2 * dst_width * num_channels * sizeof(*work));
+  while (y < src_height) {
+    y += WebPRescalerImport(&rescaler, src_height - y,   // advance by result
+                            src + y * src_stride, src_stride);
+    WebPRescalerExport(&rescaler);
+  }
+}
+
+static void AlphaMultiplyARGB(WebPPicture* const pic, int inverse) {
+  assert(pic->argb != NULL);       // stride below is converted to bytes
+  WebPMultARGBRows((uint8_t*)pic->argb, pic->argb_stride * sizeof(*pic->argb),
+                   pic->width, pic->height, inverse);
+}
+
+static void AlphaMultiplyY(WebPPicture* const pic, int inverse) {
+  if (pic->a != NULL) {            // no-op when there is no alpha plane
+    WebPMultRows(pic->y, pic->y_stride, pic->a, pic->a_stride,
+                 pic->width, pic->height, inverse);
+  }
+}
+
+int WebPPictureRescale(WebPPicture* pic, int width, int height) {
+  WebPPicture tmp;                 // rescaled result, replaces '*pic' on success
+  int prev_width, prev_height;
+  int32_t* work;                   // scratch buffer handed to RescalePlane()
+
+  if (pic == NULL) return 0;
+  prev_width = pic->width;
+  prev_height = pic->height;
+  // if width is unspecified, scale original proportionally to height ratio.
+  if (width == 0 && prev_height > 0) {    // guard against division by zero
+    width = (prev_width * height + prev_height / 2) / prev_height;
+  }
+  // if height is unspecified, scale original proportionally to width ratio.
+  if (height == 0 && prev_width > 0) {    // guard against division by zero
+    height = (prev_height * width + prev_width / 2) / prev_width;
+  }
+  // Check if the overall dimensions still make sense.
+  if (width <= 0 || height <= 0) return 0;
+
+  PictureGrabSpecs(pic, &tmp);
+  tmp.width = width;
+  tmp.height = height;
+  if (!WebPPictureAlloc(&tmp)) return 0;   // '*pic' is left untouched
+
+  if (!pic->use_argb) {
+    work = (int32_t*)WebPSafeMalloc(2ULL * width, sizeof(*work));
+    if (work == NULL) {
+      WebPPictureFree(&tmp);
+      return 0;
+    }
+    // If present, we need to rescale alpha first (for AlphaMultiplyY).
+    if (pic->a != NULL) {
+      WebPInitAlphaProcessing();
+      RescalePlane(pic->a, prev_width, prev_height, pic->a_stride,
+                   tmp.a, width, height, tmp.a_stride, work, 1);
+    }
+
+    // We take transparency into account on the luma plane only. That's not
+    // totally exact blending, but still is a good approximation.
+    AlphaMultiplyY(pic, 0);        // modifies the source luma in place
+    RescalePlane(pic->y, prev_width, prev_height, pic->y_stride,
+                 tmp.y, width, height, tmp.y_stride, work, 1);
+    AlphaMultiplyY(&tmp, 1);       // inverse operation, on the rescaled luma
+
+    RescalePlane(pic->u,
+                 HALVE(prev_width), HALVE(prev_height), pic->uv_stride,
+                 tmp.u,
+                 HALVE(width), HALVE(height), tmp.uv_stride, work, 1);
+    RescalePlane(pic->v,
+                 HALVE(prev_width), HALVE(prev_height), pic->uv_stride,
+                 tmp.v,
+                 HALVE(width), HALVE(height), tmp.uv_stride, work, 1);
+  } else {
+    work = (int32_t*)WebPSafeMalloc(2ULL * width * 4, sizeof(*work));
+    if (work == NULL) {
+      WebPPictureFree(&tmp);
+      return 0;
+    }
+    // In order to correctly interpolate colors, we need to apply the alpha
+    // weighting first (black-matting), scale the RGB values, and remove
+    // the premultiplication afterward (while preserving the alpha channel).
+    WebPInitAlphaProcessing();
+    AlphaMultiplyARGB(pic, 0);
+    RescalePlane((const uint8_t*)pic->argb, prev_width, prev_height,
+                 pic->argb_stride * 4,
+                 (uint8_t*)tmp.argb, width, height,
+                 tmp.argb_stride * 4,
+                 work, 4);
+    AlphaMultiplyARGB(&tmp, 1);
+  }
+  WebPPictureFree(pic);            // release the old picture before the swap
+  WebPSafeFree(work);
+  *pic = tmp;
+  return 1;
+}
+
+//------------------------------------------------------------------------------
diff --git a/src/3rdparty/libwebp/src/enc/picture_tools.c b/src/3rdparty/libwebp/src/enc/picture_tools.c
new file mode 100644
index 0000000..7c73646
--- /dev/null
+++ b/src/3rdparty/libwebp/src/enc/picture_tools.c
@@ -0,0 +1,206 @@
+// Copyright 2014 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// WebPPicture tools: alpha handling, etc.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include "./vp8enci.h"
+#include "../dsp/yuv.h"
+
+static WEBP_INLINE uint32_t MakeARGB32(int r, int g, int b) {
+  return (0xff000000u | (r << 16) | (g << 8) | b);   // alpha forced to 0xff
+}
+
+//------------------------------------------------------------------------------
+// Helper: clean up fully transparent area to help compressibility.
+
+#define SIZE 8
+#define SIZE2 (SIZE / 2)
+static int is_transparent_area(const uint8_t* ptr, int stride, int size) {
+  int y, x;
+  for (y = 0; y < size; ++y) {     // scan a size x size square
+    for (x = 0; x < size; ++x) {
+      if (ptr[x]) {                // any non-zero sample: not transparent
+        return 0;
+      }
+    }
+    ptr += stride;
+  }
+  return 1;
+}
+
+static int is_transparent_argb_area(const uint32_t* ptr, int stride, int size) {
+  int y, x;
+  for (y = 0; y < size; ++y) {     // scan a size x size square
+    for (x = 0; x < size; ++x) {
+      if (ptr[x] & 0xff000000u) {  // top byte (alpha) is non-zero
+        return 0;
+      }
+    }
+    ptr += stride;
+  }
+  return 1;
+}
+
+static void flatten(uint8_t* ptr, int v, int stride, int size) {
+  int y;
+  for (y = 0; y < size; ++y) {
+    memset(ptr, v, size);          // fill 'size' bytes of this row with 'v'
+    ptr += stride;
+  }
+}
+
+static void flatten_argb(uint32_t* ptr, uint32_t v, int stride, int size) {
+  int x, y;
+  for (y = 0; y < size; ++y) {
+    for (x = 0; x < size; ++x) ptr[x] = v;   // fill one row of the square
+    ptr += stride;
+  }
+}
+
+void WebPCleanupTransparentArea(WebPPicture* pic) {
+  int x, y, w, h;
+  if (pic == NULL) return;
+  w = pic->width / SIZE;           // number of whole blocks per row
+  h = pic->height / SIZE;          // number of whole block rows
+
+  // note: we ignore the left-overs on right/bottom
+  if (pic->use_argb) {
+    uint32_t argb_value = 0;
+    for (y = 0; y < h; ++y) {
+      int need_reset = 1;          // pick a new fill value for the next run
+      for (x = 0; x < w; ++x) {
+        const int off = (y * pic->argb_stride + x) * SIZE;
+        if (is_transparent_argb_area(pic->argb + off, pic->argb_stride, SIZE)) {
+          if (need_reset) {
+            argb_value = pic->argb[off];   // top-left pixel of the block
+            need_reset = 0;
+          }
+          flatten_argb(pic->argb + off, argb_value, pic->argb_stride, SIZE);
+        } else {
+          need_reset = 1;
+        }
+      }
+    }
+  } else {
+    const uint8_t* const a_ptr = pic->a;
+    int values[3] = { 0 };         // fill values for the Y, U and V planes
+    if (a_ptr == NULL) return;    // nothing to do
+    for (y = 0; y < h; ++y) {
+      int need_reset = 1;          // pick new fill values for the next run
+      for (x = 0; x < w; ++x) {
+        const int off_a = (y * pic->a_stride + x) * SIZE;
+        const int off_y = (y * pic->y_stride + x) * SIZE;
+        const int off_uv = (y * pic->uv_stride + x) * SIZE2;
+        if (is_transparent_area(a_ptr + off_a, pic->a_stride, SIZE)) {
+          if (need_reset) {
+            values[0] = pic->y[off_y];     // top-left samples of the block
+            values[1] = pic->u[off_uv];
+            values[2] = pic->v[off_uv];
+            need_reset = 0;
+          }
+          flatten(pic->y + off_y, values[0], pic->y_stride, SIZE);
+          flatten(pic->u + off_uv, values[1], pic->uv_stride, SIZE2);
+          flatten(pic->v + off_uv, values[2], pic->uv_stride, SIZE2);
+        } else {
+          need_reset = 1;
+        }
+      }
+    }
+  }
+}
+
+#undef SIZE
+#undef SIZE2
+
+//------------------------------------------------------------------------------
+// Blend color and remove transparency info (BLEND* bias keeps opaque exact).
+
+#define BLEND(V0, V1, ALPHA) \
+    ((((V0) * (255 - (ALPHA)) + (V1) * (ALPHA)) * 0x101 + 256) >> 16)
+#define BLEND_10BIT(V0, V1, ALPHA) \
+    ((((V0) * (1020 - (ALPHA)) + (V1) * (ALPHA)) * 0x101 + 1024) >> 18)
+
+void WebPBlendAlpha(WebPPicture* pic, uint32_t background_rgb) {
+  const int red = (background_rgb >> 16) & 0xff;
+  const int green = (background_rgb >> 8) & 0xff;
+  const int blue = (background_rgb >> 0) & 0xff;
+  int x, y;
+  if (pic == NULL) return;
+  if (!pic->use_argb) {
+    const int uv_width = (pic->width >> 1);  // omit last pixel during u/v loop
+    const int Y0 = VP8RGBToY(red, green, blue, YUV_HALF);
+    // VP8RGBToU/V expects the u/v values summed over four pixels
+    const int U0 = VP8RGBToU(4 * red, 4 * green, 4 * blue, 4 * YUV_HALF);
+    const int V0 = VP8RGBToV(4 * red, 4 * green, 4 * blue, 4 * YUV_HALF);
+    const int has_alpha = pic->colorspace & WEBP_CSP_ALPHA_BIT;
+    if (!has_alpha || pic->a == NULL) return;    // nothing to do
+    for (y = 0; y < pic->height; ++y) {
+      // Luma blending
+      uint8_t* const y_ptr = pic->y + y * pic->y_stride;
+      uint8_t* const a_ptr = pic->a + y * pic->a_stride;
+      for (x = 0; x < pic->width; ++x) {
+        const int alpha = a_ptr[x];
+        if (alpha < 0xff) {
+          y_ptr[x] = BLEND(Y0, y_ptr[x], a_ptr[x]);
+        }
+      }
+      // Chroma blending every even line
+      if ((y & 1) == 0) {
+        uint8_t* const u = pic->u + (y >> 1) * pic->uv_stride;
+        uint8_t* const v = pic->v + (y >> 1) * pic->uv_stride;
+        uint8_t* const a_ptr2 =    // next alpha row, or this one on last row
+            (y + 1 == pic->height) ? a_ptr : a_ptr + pic->a_stride;
+        for (x = 0; x < uv_width; ++x) {
+          // Average four alpha values into a single blending weight.
+          // TODO(skal): might lead to visible contouring. Can we do better?
+          const int alpha =
+              a_ptr[2 * x + 0] + a_ptr[2 * x + 1] +
+              a_ptr2[2 * x + 0] + a_ptr2[2 * x + 1];
+          u[x] = BLEND_10BIT(U0, u[x], alpha);
+          v[x] = BLEND_10BIT(V0, v[x], alpha);
+        }
+        if (pic->width & 1) {   // rightmost pixel
+          const int alpha = 2 * (a_ptr[2 * x + 0] + a_ptr2[2 * x + 0]);
+          u[x] = BLEND_10BIT(U0, u[x], alpha);
+          v[x] = BLEND_10BIT(V0, v[x], alpha);
+        }
+      }
+      memset(a_ptr, 0xff, pic->width);   // this row is now fully opaque
+    }
+  } else {
+    uint32_t* argb = pic->argb;
+    const uint32_t background = MakeARGB32(red, green, blue);
+    for (y = 0; y < pic->height; ++y) {
+      for (x = 0; x < pic->width; ++x) {
+        const int alpha = (argb[x] >> 24) & 0xff;
+        if (alpha != 0xff) {       // opaque pixels are left untouched
+          if (alpha > 0) {
+            int r = (argb[x] >> 16) & 0xff;
+            int g = (argb[x] >>  8) & 0xff;
+            int b = (argb[x] >>  0) & 0xff;
+            r = BLEND(red, r, alpha);
+            g = BLEND(green, g, alpha);
+            b = BLEND(blue, b, alpha);
+            argb[x] = MakeARGB32(r, g, b);
+          } else {                 // fully transparent: plain background
+            argb[x] = background;
+          }
+        }
+      }
+      argb += pic->argb_stride;
+    }
+  }
+}
+
+#undef BLEND
+#undef BLEND_10BIT
+
+//------------------------------------------------------------------------------
diff --git a/src/3rdparty/libwebp/src/enc/quant.c b/src/3rdparty/libwebp/src/enc/quant.c
index e1d202b..9130a41 100644
--- a/src/3rdparty/libwebp/src/enc/quant.c
+++ b/src/3rdparty/libwebp/src/enc/quant.c
@@ -395,7 +395,7 @@ void VP8SetSegmentParams(VP8Encoder* const enc, float quality) {
   dq_uv_ac = clip(dq_uv_ac, MIN_DQ_UV, MAX_DQ_UV);
   // We also boost the dc-uv-quant a little, based on sns-strength, since
   // U/V channels are quite more reactive to high quants (flat DC-blocks
-  // tend to appear, and are displeasant).
+  // tend to appear, and are unpleasant).
   dq_uv_dc = -4 * enc->config_->sns_strength / 100;
   dq_uv_dc = clip(dq_uv_dc, -15, 15);   // 4bit-signed max allowed
 
@@ -454,13 +454,14 @@ void VP8MakeIntra4Preds(const VP8EncIterator* const it) {
 // |UUVV| 20
 // +----+
 
-const int VP8Scan[16 + 4 + 4] = {
-  // Luma
+const int VP8Scan[16] = {  // Luma
   0 +  0 * BPS,  4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS,
   0 +  4 * BPS,  4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS,
   0 +  8 * BPS,  4 +  8 * BPS, 8 +  8 * BPS, 12 +  8 * BPS,
   0 + 12 * BPS,  4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS,
+};
 
+static const int VP8ScanUV[4 + 4] = {
   0 + 0 * BPS,   4 + 0 * BPS, 0 + 4 * BPS,  4 + 4 * BPS,    // U
   8 + 0 * BPS,  12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS     // V
 };
@@ -514,24 +515,27 @@ static void AddScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
 //------------------------------------------------------------------------------
 // Performs trellis-optimized quantization.
 
-// Trellis
-
+// Trellis node
 typedef struct {
-  int prev;        // best previous
-  int level;       // level
-  int sign;        // sign of coeff_i
-  score_t cost;    // bit cost
-  score_t error;   // distortion = sum of (|coeff_i| - level_i * Q_i)^2
-  int ctx;         // context (only depends on 'level'. Could be spared.)
+  int8_t prev;            // best previous node
+  int8_t sign;            // sign of coeff_i
+  int16_t level;          // level
 } Node;
 
+// Score state
+typedef struct {
+  score_t score;          // partial RD score
+  const uint16_t* costs;  // shortcut to cost tables
+} ScoreState;
+
 // If a coefficient was quantized to a value Q (using a neutral bias),
 // we test all alternate possibilities between [Q-MIN_DELTA, Q+MAX_DELTA]
 // We don't test negative values though.
 #define MIN_DELTA 0   // how much lower level to try
 #define MAX_DELTA 1   // how much higher
 #define NUM_NODES (MIN_DELTA + 1 + MAX_DELTA)
-#define NODE(n, l) (nodes[(n) + 1][(l) + MIN_DELTA])
+#define NODE(n, l) (nodes[(n)][(l) + MIN_DELTA])
+#define SCORE_STATE(n, l) (score_states[n][(l) + MIN_DELTA])
 
 static WEBP_INLINE void SetRDScore(int lambda, VP8ModeScore* const rd) {
   // TODO: incorporate the "* 256" in the tables?
@@ -543,34 +547,36 @@ static WEBP_INLINE score_t RDScoreTrellis(int lambda, score_t rate,
   return rate * lambda + 256 * distortion;
 }
 
-static int TrellisQuantizeBlock(const VP8EncIterator* const it,
+static int TrellisQuantizeBlock(const VP8Encoder* const enc,
                                 int16_t in[16], int16_t out[16],
                                 int ctx0, int coeff_type,
                                 const VP8Matrix* const mtx,
                                 int lambda) {
-  ProbaArray* const last_costs = it->enc_->proba_.coeffs_[coeff_type];
-  CostArray* const costs = it->enc_->proba_.level_cost_[coeff_type];
+  const ProbaArray* const probas = enc->proba_.coeffs_[coeff_type];
+  const CostArray* const costs = enc->proba_.level_cost_[coeff_type];
   const int first = (coeff_type == 0) ? 1 : 0;
-  Node nodes[17][NUM_NODES];
+  Node nodes[16][NUM_NODES];
+  ScoreState score_states[2][NUM_NODES];
+  ScoreState* ss_cur = &SCORE_STATE(0, MIN_DELTA);
+  ScoreState* ss_prev = &SCORE_STATE(1, MIN_DELTA);
   int best_path[3] = {-1, -1, -1};   // store best-last/best-level/best-previous
   score_t best_score;
-  int best_node;
-  int last = first - 1;
-  int n, m, p, nz;
+  int n, m, p, last;
 
   {
     score_t cost;
-    score_t max_error;
     const int thresh = mtx->q_[1] * mtx->q_[1] / 4;
-    const int last_proba = last_costs[VP8EncBands[first]][ctx0][0];
+    const int last_proba = probas[VP8EncBands[first]][ctx0][0];
 
-    // compute maximal distortion.
-    max_error = 0;
-    for (n = first; n < 16; ++n) {
-      const int j  = kZigzag[n];
+    // compute the position of the last interesting coefficient
+    last = first - 1;
+    for (n = 15; n >= first; --n) {
+      const int j = kZigzag[n];
       const int err = in[j] * in[j];
-      max_error += kWeightTrellis[j] * err;
-      if (err > thresh) last = n;
+      if (err > thresh) {
+        last = n;
+        break;
+      }
     }
     // we don't need to go inspect up to n = 16 coeffs. We can just go up
     // to last + 1 (inclusive) without losing much.
@@ -578,92 +584,95 @@ static int TrellisQuantizeBlock(const VP8EncIterator* const it,
 
     // compute 'skip' score. This is the max score one can do.
     cost = VP8BitCost(0, last_proba);
-    best_score = RDScoreTrellis(lambda, cost, max_error);
+    best_score = RDScoreTrellis(lambda, cost, 0);
 
     // initialize source node.
-    n = first - 1;
     for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
-      NODE(n, m).cost = 0;
-      NODE(n, m).error = max_error;
-      NODE(n, m).ctx = ctx0;
+      const score_t rate = (ctx0 == 0) ? VP8BitCost(1, last_proba) : 0;
+      ss_cur[m].score = RDScoreTrellis(lambda, rate, 0);
+      ss_cur[m].costs = costs[VP8EncBands[first]][ctx0];
     }
   }
 
   // traverse trellis.
   for (n = first; n <= last; ++n) {
-    const int j  = kZigzag[n];
-    const int Q  = mtx->q_[j];
-    const int iQ = mtx->iq_[j];
-    const int B = BIAS(0x00);     // neutral bias
+    const int j = kZigzag[n];
+    const uint32_t Q  = mtx->q_[j];
+    const uint32_t iQ = mtx->iq_[j];
+    const uint32_t B = BIAS(0x00);     // neutral bias
     // note: it's important to take sign of the _original_ coeff,
     // so we don't have to consider level < 0 afterward.
     const int sign = (in[j] < 0);
-    const int coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
+    const uint32_t coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
     int level0 = QUANTDIV(coeff0, iQ, B);
     if (level0 > MAX_LEVEL) level0 = MAX_LEVEL;
 
+    {   // Swap current and previous score states
+      ScoreState* const tmp = ss_cur;
+      ss_cur = ss_prev;
+      ss_prev = tmp;
+    }
+
     // test all alternate level values around level0.
     for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
       Node* const cur = &NODE(n, m);
-      int delta_error, new_error;
-      score_t cur_score = MAX_COST;
       int level = level0 + m;
-      int last_proba;
-
-      cur->sign = sign;
-      cur->level = level;
-      cur->ctx = (level == 0) ? 0 : (level == 1) ? 1 : 2;
+      const int ctx = (level > 2) ? 2 : level;
+      const int band = VP8EncBands[n + 1];
+      score_t base_score, last_pos_score;
+      score_t best_cur_score = MAX_COST;
+      int best_prev = 0;   // default, in case
+
+      ss_cur[m].score = MAX_COST;
+      ss_cur[m].costs = costs[band][ctx];
       if (level > MAX_LEVEL || level < 0) {   // node is dead?
-        cur->cost = MAX_COST;
         continue;
       }
-      last_proba = last_costs[VP8EncBands[n + 1]][cur->ctx][0];
 
-      // Compute delta_error = how much coding this level will
-      // subtract as distortion to max_error
-      new_error = coeff0 - level * Q;
-      delta_error =
-        kWeightTrellis[j] * (coeff0 * coeff0 - new_error * new_error);
+      // Compute extra rate cost if last coeff's position is < 15
+      {
+        const score_t last_pos_cost =
+            (n < 15) ? VP8BitCost(0, probas[band][ctx][0]) : 0;
+        last_pos_score = RDScoreTrellis(lambda, last_pos_cost, 0);
+      }
+
+      {
+        // Compute delta_error = how much coding this level will
+        // subtract to max_error as distortion.
+        // Here, distortion = sum of (|coeff_i| - level_i * Q_i)^2
+        const int new_error = coeff0 - level * Q;
+        const int delta_error =
+            kWeightTrellis[j] * (new_error * new_error - coeff0 * coeff0);
+        base_score = RDScoreTrellis(lambda, 0, delta_error);
+      }
 
       // Inspect all possible non-dead predecessors. Retain only the best one.
       for (p = -MIN_DELTA; p <= MAX_DELTA; ++p) {
-        const Node* const prev = &NODE(n - 1, p);
-        const int prev_ctx = prev->ctx;
-        const uint16_t* const tcost = costs[VP8EncBands[n]][prev_ctx];
-        const score_t total_error = prev->error - delta_error;
-        score_t cost, base_cost, score;
-
-        if (prev->cost >= MAX_COST) {   // dead node?
-          continue;
-        }
-
-        // Base cost of both terminal/non-terminal
-        base_cost = prev->cost + VP8LevelCost(tcost, level);
-
+        // Dead nodes (with ss_prev[p].score >= MAX_COST) are automatically
+        // eliminated since their score can't be better than the current best.
+        const score_t cost = VP8LevelCost(ss_prev[p].costs, level);
         // Examine node assuming it's a non-terminal one.
-        cost = base_cost;
-        if (level && n < 15) {
-          cost += VP8BitCost(1, last_proba);
+        const score_t score =
+            base_score + ss_prev[p].score + RDScoreTrellis(lambda, cost, 0);
+        if (score < best_cur_score) {
+          best_cur_score = score;
+          best_prev = p;
         }
-        score = RDScoreTrellis(lambda, cost, total_error);
-        if (score < cur_score) {
-          cur_score = score;
-          cur->cost  = cost;
-          cur->error = total_error;
-          cur->prev  = p;
-        }
-
-        // Now, record best terminal node (and thus best entry in the graph).
-        if (level) {
-          cost = base_cost;
-          if (n < 15) cost += VP8BitCost(0, last_proba);
-          score = RDScoreTrellis(lambda, cost, total_error);
-          if (score < best_score) {
-            best_score = score;
-            best_path[0] = n;   // best eob position
-            best_path[1] = m;   // best level
-            best_path[2] = p;   // best predecessor
-          }
+      }
+      // Store best finding in current node.
+      cur->sign = sign;
+      cur->level = level;
+      cur->prev = best_prev;
+      ss_cur[m].score = best_cur_score;
+
+      // Now, record best terminal node (and thus best entry in the graph).
+      if (level != 0) {
+        const score_t score = best_cur_score + last_pos_score;
+        if (score < best_score) {
+          best_score = score;
+          best_path[0] = n;                     // best eob position
+          best_path[1] = m;                     // best node index
+          best_path[2] = best_prev;             // best predecessor
         }
       }
     }
@@ -676,23 +685,25 @@ static int TrellisQuantizeBlock(const VP8EncIterator* const it,
     return 0;   // skip!
   }
 
-  // Unwind the best path.
-  // Note: best-prev on terminal node is not necessarily equal to the
-  // best_prev for non-terminal. So we patch best_path[2] in.
-  n = best_path[0];
-  best_node = best_path[1];
-  NODE(n, best_node).prev = best_path[2];   // force best-prev for terminal
-  nz = 0;
-
-  for (; n >= first; --n) {
-    const Node* const node = &NODE(n, best_node);
-    const int j = kZigzag[n];
-    out[n] = node->sign ? -node->level : node->level;
-    nz |= (node->level != 0);
-    in[j] = out[n] * mtx->q_[j];
-    best_node = node->prev;
+  {
+    // Unwind the best path.
+    // Note: best-prev on terminal node is not necessarily equal to the
+    // best_prev for non-terminal. So we patch best_path[2] in.
+    int nz = 0;
+    int best_node = best_path[1];
+    n = best_path[0];
+    NODE(n, best_node).prev = best_path[2];   // force best-prev for terminal
+
+    for (; n >= first; --n) {
+      const Node* const node = &NODE(n, best_node);
+      const int j = kZigzag[n];
+      out[n] = node->sign ? -node->level : node->level;
+      nz |= node->level;
+      in[j] = out[n] * mtx->q_[j];
+      best_node = node->prev;
+    }
+    return (nz != 0);
   }
-  return nz;
 }
 
 #undef NODE
@@ -706,10 +717,10 @@ static int ReconstructIntra16(VP8EncIterator* const it,
                               VP8ModeScore* const rd,
                               uint8_t* const yuv_out,
                               int mode) {
-  VP8Encoder* const enc = it->enc_;
+  const VP8Encoder* const enc = it->enc_;
   const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
   const uint8_t* const src = it->yuv_in_ + Y_OFF;
-  VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
+  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
   int nz = 0;
   int n;
   int16_t tmp[16][16], dc_tmp[16];
@@ -727,20 +738,25 @@ static int ReconstructIntra16(VP8EncIterator* const it,
       for (x = 0; x < 4; ++x, ++n) {
         const int ctx = it->top_nz_[x] + it->left_nz_[y];
         const int non_zero =
-           TrellisQuantizeBlock(it, tmp[n], rd->y_ac_levels[n], ctx, 0,
-                                &dqm->y1_, dqm->lambda_trellis_i16_);
+            TrellisQuantizeBlock(enc, tmp[n], rd->y_ac_levels[n], ctx, 0,
+                                 &dqm->y1_, dqm->lambda_trellis_i16_);
         it->top_nz_[x] = it->left_nz_[y] = non_zero;
+        rd->y_ac_levels[n][0] = 0;
         nz |= non_zero << n;
       }
     }
   } else {
     for (n = 0; n < 16; ++n) {
-      nz |= VP8EncQuantizeBlock(tmp[n], rd->y_ac_levels[n], 1, &dqm->y1_) << n;
+      // Zero-out the first coeff, so that: a) nz is correct below, and
+      // b) finding 'last' non-zero coeffs in SetResidualCoeffs() is simplified.
+      tmp[n][0] = 0;
+      nz |= VP8EncQuantizeBlock(tmp[n], rd->y_ac_levels[n], &dqm->y1_) << n;
+      assert(rd->y_ac_levels[n][0] == 0);
     }
   }
 
   // Transform back
-  VP8ITransformWHT(dc_tmp, tmp[0]);
+  VP8TransformWHT(dc_tmp, tmp[0]);
   for (n = 0; n < 16; n += 2) {
     VP8ITransform(ref + VP8Scan[n], tmp[n], yuv_out + VP8Scan[n], 1);
   }
@@ -763,10 +779,10 @@ static int ReconstructIntra4(VP8EncIterator* const it,
   if (DO_TRELLIS_I4 && it->do_trellis_) {
     const int x = it->i4_ & 3, y = it->i4_ >> 2;
     const int ctx = it->top_nz_[x] + it->left_nz_[y];
-    nz = TrellisQuantizeBlock(it, tmp, levels, ctx, 3, &dqm->y1_,
+    nz = TrellisQuantizeBlock(enc, tmp, levels, ctx, 3, &dqm->y1_,
                               dqm->lambda_trellis_i4_);
   } else {
-    nz = VP8EncQuantizeBlock(tmp, levels, 0, &dqm->y1_);
+    nz = VP8EncQuantizeBlock(tmp, levels, &dqm->y1_);
   }
   VP8ITransform(ref, tmp, yuv_out, 0);
   return nz;
@@ -783,7 +799,7 @@ static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
   int16_t tmp[8][16];
 
   for (n = 0; n < 8; ++n) {
-    VP8FTransform(src + VP8Scan[16 + n], ref + VP8Scan[16 + n], tmp[n]);
+    VP8FTransform(src + VP8ScanUV[n], ref + VP8ScanUV[n], tmp[n]);
   }
   if (DO_TRELLIS_UV && it->do_trellis_) {
     int ch, x, y;
@@ -792,8 +808,8 @@ static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
         for (x = 0; x < 2; ++x, ++n) {
           const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
           const int non_zero =
-            TrellisQuantizeBlock(it, tmp[n], rd->uv_levels[n], ctx, 2,
-                                 &dqm->uv_, dqm->lambda_trellis_uv_);
+              TrellisQuantizeBlock(enc, tmp[n], rd->uv_levels[n], ctx, 2,
+                                   &dqm->uv_, dqm->lambda_trellis_uv_);
           it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] = non_zero;
           nz |= non_zero << n;
         }
@@ -801,12 +817,12 @@ static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
     }
   } else {
     for (n = 0; n < 8; ++n) {
-      nz |= VP8EncQuantizeBlock(tmp[n], rd->uv_levels[n], 0, &dqm->uv_) << n;
+      nz |= VP8EncQuantizeBlock(tmp[n], rd->uv_levels[n], &dqm->uv_) << n;
     }
   }
 
   for (n = 0; n < 8; n += 2) {
-    VP8ITransform(ref + VP8Scan[16 + n], tmp[n], yuv_out + VP8Scan[16 + n], 1);
+    VP8ITransform(ref + VP8ScanUV[n], tmp[n], yuv_out + VP8ScanUV[n], 1);
   }
   return (nz << 16);
 }
@@ -851,8 +867,7 @@ static score_t IsFlat(const int16_t* levels, int num_blocks, score_t thresh) {
 
 static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* const rd) {
   const int kNumBlocks = 16;
-  VP8Encoder* const enc = it->enc_;
-  VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
+  VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
   const int lambda = dqm->lambda_i16_;
   const int tlambda = dqm->tlambda_;
   const uint8_t* const src = it->yuv_in_ + Y_OFF;
@@ -999,8 +1014,7 @@ static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
 
 static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
   const int kNumBlocks = 8;
-  const VP8Encoder* const enc = it->enc_;
-  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
+  const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
   const int lambda = dqm->lambda_uv_;
   const uint8_t* const src = it->yuv_in_ + U_OFF;
   uint8_t* const tmp_dst = it->yuv_out2_ + U_OFF;  // scratch buffer
diff --git a/src/3rdparty/libwebp/src/enc/syntax.c b/src/3rdparty/libwebp/src/enc/syntax.c
index 08cfe79..d1ff0a5 100644
--- a/src/3rdparty/libwebp/src/enc/syntax.c
+++ b/src/3rdparty/libwebp/src/enc/syntax.c
@@ -263,53 +263,16 @@ static int EmitPartitionsSize(const VP8Encoder* const enc,
 
 //------------------------------------------------------------------------------
 
-#ifdef WEBP_EXPERIMENTAL_FEATURES
-
-#define KTRAILER_SIZE 8
-
-static int WriteExtensions(VP8Encoder* const enc) {
-  uint8_t buffer[KTRAILER_SIZE];
-  VP8BitWriter* const bw = &enc->bw_;
-  WebPPicture* const pic = enc->pic_;
-
-  // Layer (bytes 0..3)
-  PutLE24(buffer + 0, enc->layer_data_size_);
-  buffer[3] = enc->pic_->colorspace & WEBP_CSP_UV_MASK;
-  if (enc->layer_data_size_ > 0) {
-    assert(enc->use_layer_);
-    // append layer data to last partition
-    if (!VP8BitWriterAppend(&enc->parts_[enc->num_parts_ - 1],
-                            enc->layer_data_, enc->layer_data_size_)) {
-      return WebPEncodingSetError(pic, VP8_ENC_ERROR_BITSTREAM_OUT_OF_MEMORY);
-    }
-  }
-
-  buffer[KTRAILER_SIZE - 1] = 0x01;  // marker
-  if (!VP8BitWriterAppend(bw, buffer, KTRAILER_SIZE)) {
-    return WebPEncodingSetError(pic, VP8_ENC_ERROR_BITSTREAM_OUT_OF_MEMORY);
-  }
-  return 1;
-}
-
-#endif    /* WEBP_EXPERIMENTAL_FEATURES */
-
-//------------------------------------------------------------------------------
-
-static size_t GeneratePartition0(VP8Encoder* const enc) {
+static int GeneratePartition0(VP8Encoder* const enc) {
   VP8BitWriter* const bw = &enc->bw_;
   const int mb_size = enc->mb_w_ * enc->mb_h_;
   uint64_t pos1, pos2, pos3;
-#ifdef WEBP_EXPERIMENTAL_FEATURES
-  const int need_extensions = enc->use_layer_;
-#endif
 
   pos1 = VP8BitWriterPos(bw);
-  VP8BitWriterInit(bw, mb_size * 7 / 8);        // ~7 bits per macroblock
-#ifdef WEBP_EXPERIMENTAL_FEATURES
-  VP8PutBitUniform(bw, need_extensions);   // extensions
-#else
+  if (!VP8BitWriterInit(bw, mb_size * 7 / 8)) {        // ~7 bits per macroblock
+    return WebPEncodingSetError(enc->pic_, VP8_ENC_ERROR_OUT_OF_MEMORY);
+  }
   VP8PutBitUniform(bw, 0);   // colorspace
-#endif
   VP8PutBitUniform(bw, 0);   // clamp type
 
   PutSegmentHeader(bw, enc);
@@ -324,21 +287,17 @@ static size_t GeneratePartition0(VP8Encoder* const enc) {
   VP8CodeIntraModes(enc);
   VP8BitWriterFinish(bw);
 
-#ifdef WEBP_EXPERIMENTAL_FEATURES
-  if (need_extensions && !WriteExtensions(enc)) {
-    return 0;
-  }
-#endif
-
   pos3 = VP8BitWriterPos(bw);
 
   if (enc->pic_->stats) {
     enc->pic_->stats->header_bytes[0] = (int)((pos2 - pos1 + 7) >> 3);
     enc->pic_->stats->header_bytes[1] = (int)((pos3 - pos2 + 7) >> 3);
     enc->pic_->stats->alpha_data_size = (int)enc->alpha_data_size_;
-    enc->pic_->stats->layer_data_size = (int)enc->layer_data_size_;
   }
-  return !bw->error_;
+  if (bw->error_) {
+    return WebPEncodingSetError(enc->pic_, VP8_ENC_ERROR_OUT_OF_MEMORY);
+  }
+  return 1;
 }
 
 void VP8EncFreeBitWriters(VP8Encoder* const enc) {
@@ -360,7 +319,8 @@ int VP8EncWrite(VP8Encoder* const enc) {
   int p;
 
   // Partition #0 with header and partition sizes
-  ok = !!GeneratePartition0(enc);
+  ok = GeneratePartition0(enc);
+  if (!ok) return 0;
 
   // Compute VP8 size
   vp8_size = VP8_FRAME_HEADER_SIZE +
diff --git a/src/3rdparty/libwebp/src/enc/token.c b/src/3rdparty/libwebp/src/enc/token.c
index e696642..8af13a0 100644
--- a/src/3rdparty/libwebp/src/enc/token.c
+++ b/src/3rdparty/libwebp/src/enc/token.c
@@ -22,27 +22,32 @@
 
 #include "./cost.h"
 #include "./vp8enci.h"
+#include "../utils/utils.h"
 
 #if !defined(DISABLE_TOKEN_BUFFER)
 
 // we use pages to reduce the number of memcpy()
-#define MAX_NUM_TOKEN 8192          // max number of token per page
+#define MIN_PAGE_SIZE 8192          // minimum number of token per page
 #define FIXED_PROBA_BIT (1u << 14)
 
+typedef uint16_t token_t;  // bit#15: bit
+                           // bit #14: constant proba or idx
+                           // bits 0..13: slot or constant proba
 struct VP8Tokens {
-  uint16_t tokens_[MAX_NUM_TOKEN];  // bit#15: bit
-                                    // bit #14: constant proba or idx
-                                    // bits 0..13: slot or constant proba
-  VP8Tokens* next_;
+  VP8Tokens* next_;        // pointer to next page
 };
+// Token data is located in memory just after the next_ field.
+// This macro is used to return their address and hide the trick.
+#define TOKEN_DATA(p) ((token_t*)&(p)[1])
 
 //------------------------------------------------------------------------------
 
-void VP8TBufferInit(VP8TBuffer* const b) {
+void VP8TBufferInit(VP8TBuffer* const b, int page_size) {
   b->tokens_ = NULL;
   b->pages_ = NULL;
   b->last_page_ = &b->pages_;
   b->left_ = 0;
+  b->page_size_ = (page_size < MIN_PAGE_SIZE) ? MIN_PAGE_SIZE : page_size;
   b->error_ = 0;
 }
 
@@ -51,24 +56,29 @@ void VP8TBufferClear(VP8TBuffer* const b) {
     const VP8Tokens* p = b->pages_;
     while (p != NULL) {
       const VP8Tokens* const next = p->next_;
-      free((void*)p);
+      WebPSafeFree((void*)p);
       p = next;
     }
-    VP8TBufferInit(b);
+    VP8TBufferInit(b, b->page_size_);
   }
 }
 
 static int TBufferNewPage(VP8TBuffer* const b) {
-  VP8Tokens* const page = b->error_ ? NULL : (VP8Tokens*)malloc(sizeof(*page));
+  VP8Tokens* page = NULL;
+  const size_t size = sizeof(*page) + b->page_size_ * sizeof(token_t);
+  if (!b->error_) {
+    page = (VP8Tokens*)WebPSafeMalloc(1ULL, size);
+  }
   if (page == NULL) {
     b->error_ = 1;
     return 0;
   }
+  page->next_ = NULL;
+
   *b->last_page_ = page;
   b->last_page_ = &page->next_;
-  b->left_ = MAX_NUM_TOKEN;
-  b->tokens_ = page->tokens_;
-  page->next_ = NULL;
+  b->left_ = b->page_size_;
+  b->tokens_ = TOKEN_DATA(page);
   return 1;
 }
 
@@ -195,8 +205,9 @@ void VP8TokenToStats(const VP8TBuffer* const b, proba_t* const stats) {
   while (p != NULL) {
     const int N = (p->next_ == NULL) ? b->left_ : 0;
     int n = MAX_NUM_TOKEN;
+    const token_t* const tokens = TOKEN_DATA(p);
     while (n-- > N) {
-      const uint16_t token = p->tokens_[n];
+      const token_t token = tokens[n];
       if (!(token & FIXED_PROBA_BIT)) {
         Record((token >> 15) & 1, stats + (token & 0x3fffu));
       }
@@ -214,13 +225,14 @@ int VP8EmitTokens(VP8TBuffer* const b, VP8BitWriter* const bw,
                   const uint8_t* const probas, int final_pass) {
   const VP8Tokens* p = b->pages_;
   (void)final_pass;
-  if (b->error_) return 0;
+  assert(!b->error_);
   while (p != NULL) {
     const VP8Tokens* const next = p->next_;
     const int N = (next == NULL) ? b->left_ : 0;
-    int n = MAX_NUM_TOKEN;
+    int n = b->page_size_;
+    const token_t* const tokens = TOKEN_DATA(p);
     while (n-- > N) {
-      const uint16_t token = p->tokens_[n];
+      const token_t token = tokens[n];
       const int bit = (token >> 15) & 1;
       if (token & FIXED_PROBA_BIT) {
         VP8PutBit(bw, bit, token & 0xffu);  // constant proba
@@ -228,7 +240,7 @@ int VP8EmitTokens(VP8TBuffer* const b, VP8BitWriter* const bw,
         VP8PutBit(bw, bit, probas[token & 0x3fffu]);
       }
     }
-    if (final_pass) free((void*)p);
+    if (final_pass) WebPSafeFree((void*)p);
     p = next;
   }
   if (final_pass) b->pages_ = NULL;
@@ -239,13 +251,14 @@ int VP8EmitTokens(VP8TBuffer* const b, VP8BitWriter* const bw,
 size_t VP8EstimateTokenSize(VP8TBuffer* const b, const uint8_t* const probas) {
   size_t size = 0;
   const VP8Tokens* p = b->pages_;
-  if (b->error_) return 0;
+  assert(!b->error_);
   while (p != NULL) {
     const VP8Tokens* const next = p->next_;
     const int N = (next == NULL) ? b->left_ : 0;
-    int n = MAX_NUM_TOKEN;
+    int n = b->page_size_;
+    const token_t* const tokens = TOKEN_DATA(p);
     while (n-- > N) {
-      const uint16_t token = p->tokens_[n];
+      const token_t token = tokens[n];
       const int bit = token & (1 << 15);
       if (token & FIXED_PROBA_BIT) {
         size += VP8BitCost(bit, token & 0xffu);
diff --git a/src/3rdparty/libwebp/src/enc/vp8enci.h b/src/3rdparty/libwebp/src/enc/vp8enci.h
index 71adf6c..74c8f70 100644
--- a/src/3rdparty/libwebp/src/enc/vp8enci.h
+++ b/src/3rdparty/libwebp/src/enc/vp8enci.h
@@ -30,7 +30,7 @@ extern "C" {
 // version numbers
 #define ENC_MAJ_VERSION 0
 #define ENC_MIN_VERSION 4
-#define ENC_REV_VERSION 0
+#define ENC_REV_VERSION 3
 
 // intra prediction modes
 enum { B_DC_PRED = 0,   // 4x4 modes
@@ -130,8 +130,8 @@ typedef enum {   // Rate-distortion optimization levels
 #define ALIGN_CST 15
 #define DO_ALIGN(PTR) ((uintptr_t)((PTR) + ALIGN_CST) & ~ALIGN_CST)
 
-extern const int VP8Scan[16 + 4 + 4];           // in quant.c
-extern const int VP8UVModeOffsets[4];           // in analyze.c
+extern const int VP8Scan[16];           // in quant.c
+extern const int VP8UVModeOffsets[4];   // in analyze.c
 extern const int VP8I16ModeOffsets[4];
 extern const int VP8I4ModeOffsets[NUM_BMODES];
 
@@ -160,14 +160,16 @@ extern const int VP8I4ModeOffsets[NUM_BMODES];
 #define I4TMP (6 * 16 * BPS + 8 * BPS +  8)
 
 typedef int64_t score_t;     // type used for scores, rate, distortion
+// Note that MAX_COST is not the maximum allowed by sizeof(score_t),
+// in order to allow overflowing computations.
 #define MAX_COST ((score_t)0x7fffffffffffffLL)
 
 #define QFIX 17
 #define BIAS(b)  ((b) << (QFIX - 8))
 // Fun fact: this is the _only_ line where we're actually being lossy and
 // discarding bits.
-static WEBP_INLINE int QUANTDIV(int n, int iQ, int B) {
-  return (n * iQ + B) >> QFIX;
+static WEBP_INLINE int QUANTDIV(uint32_t n, uint32_t iQ, uint32_t B) {
+  return (int)((n * iQ + B) >> QFIX);
 }
 
 // size of histogram used by CollectHistogram.
@@ -204,9 +206,9 @@ typedef struct {
 typedef struct {
   uint8_t segments_[3];     // probabilities for segment tree
   uint8_t skip_proba_;      // final probability of being skipped.
-  ProbaArray coeffs_[NUM_TYPES][NUM_BANDS];      // 924 bytes
+  ProbaArray coeffs_[NUM_TYPES][NUM_BANDS];      // 1056 bytes
   StatsArray stats_[NUM_TYPES][NUM_BANDS];       // 4224 bytes
-  CostArray level_cost_[NUM_TYPES][NUM_BANDS];   // 11.4k
+  CostArray level_cost_[NUM_TYPES][NUM_BANDS];   // 13056 bytes
   int dirty_;               // if true, need to call VP8CalculateLevelCosts()
   int use_skip_proba_;      // Note: we always use skip_proba for now.
   int nb_skip_;             // number of skipped blocks
@@ -236,8 +238,8 @@ typedef struct {
 typedef struct VP8Matrix {
   uint16_t q_[16];        // quantizer steps
   uint16_t iq_[16];       // reciprocals, fixed point.
-  uint16_t bias_[16];     // rounding bias
-  uint16_t zthresh_[16];  // value under which a coefficient is zeroed
+  uint32_t bias_[16];     // rounding bias
+  uint32_t zthresh_[16];  // value below which a coefficient is zeroed
   uint16_t sharpen_[16];  // frequency boosters for slight sharpening
 } VP8Matrix;
 
@@ -361,12 +363,14 @@ typedef struct {
   VP8Tokens* pages_;        // first page
   VP8Tokens** last_page_;   // last page
   uint16_t* tokens_;        // set to (*last_page_)->tokens_
-  int left_;          // how many free tokens left before the page is full.
+  int left_;                // how many free tokens left before the page is full
+  int page_size_;           // number of tokens per page
 #endif
   int error_;         // true in case of malloc error
 } VP8TBuffer;
 
-void VP8TBufferInit(VP8TBuffer* const b);    // initialize an empty buffer
+// initialize an empty buffer
+void VP8TBufferInit(VP8TBuffer* const b, int page_size);
 void VP8TBufferClear(VP8TBuffer* const b);   // de-allocate pages memory
 
 #if !defined(DISABLE_TOKEN_BUFFER)
@@ -422,12 +426,6 @@ struct VP8Encoder {
   uint32_t alpha_data_size_;
   WebPWorker alpha_worker_;
 
-  // enhancement layer
-  int use_layer_;
-  VP8BitWriter layer_bw_;
-  uint8_t* layer_data_;
-  size_t layer_data_size_;
-
   // quantization info (one set of DC/AC dequant factor per segment)
   VP8SegmentInfo dqm_[NUM_MB_SEGMENTS];
   int base_quant_;                 // nominal quantizer value. Only used
@@ -459,10 +457,10 @@ struct VP8Encoder {
   VP8MBInfo* mb_info_;   // contextual macroblock infos (mb_w_ + 1)
   uint8_t*   preds_;     // predictions modes: (4*mb_w+1) * (4*mb_h+1)
   uint32_t*  nz_;        // non-zero bit context: mb_w+1
-  uint8_t   *y_top_;     // top luma samples.
-  uint8_t   *uv_top_;    // top u/v samples.
+  uint8_t*   y_top_;     // top luma samples.
+  uint8_t*   uv_top_;    // top u/v samples.
                          // U and V are packed into 16 bytes (8 U + 8 V)
-  LFStats   *lf_stats_;  // autofilter stats (if NULL, autofilter is off)
+  LFStats*   lf_stats_;  // autofilter stats (if NULL, autofilter is off)
 };
 
 //------------------------------------------------------------------------------
@@ -533,12 +531,6 @@ int VP8EncStartAlpha(VP8Encoder* const enc);    // start alpha coding process
 int VP8EncFinishAlpha(VP8Encoder* const enc);   // finalize compressed data
 int VP8EncDeleteAlpha(VP8Encoder* const enc);   // delete compressed data
 
-  // in layer.c
-void VP8EncInitLayer(VP8Encoder* const enc);     // init everything
-void VP8EncCodeLayerBlock(VP8EncIterator* it);   // code one more macroblock
-int VP8EncFinishLayer(VP8Encoder* const enc);    // finalize coding
-void VP8EncDeleteLayer(VP8Encoder* enc);         // reclaim memory
-
   // in filter.c
 
 // SSIM utils
@@ -561,8 +553,28 @@ void VP8AdjustFilterStrength(VP8EncIterator* const it);
 // step of 'delta', given a sharpness parameter 'sharpness'.
 int VP8FilterStrengthFromDelta(int sharpness, int delta);
 
+  // misc utils for picture_*.c:
+
+// Remove reference to the ARGB/YUVA buffer (doesn't free anything).
+void WebPPictureResetBuffers(WebPPicture* const picture);
+
+// Allocates ARGB buffer of given dimension (previous one is always free'd).
+// Preserves the YUV(A) buffer. Returns false in case of error (invalid param,
+// out-of-memory).
+int WebPPictureAllocARGB(WebPPicture* const picture, int width, int height);
+
+// Allocates YUVA buffer of given dimension (previous one is always free'd).
+// Uses picture->csp to determine whether an alpha buffer is needed.
+// Preserves the ARGB buffer.
+// Returns false in case of error (invalid param, out-of-memory).
+int WebPPictureAllocYUVA(WebPPicture* const picture, int width, int height);
+
 //------------------------------------------------------------------------------
 
+#if WEBP_ENCODER_ABI_VERSION <= 0x0203
+void WebPMemoryWriterClear(WebPMemoryWriter* writer);
+#endif
+
 #ifdef __cplusplus
 }    // extern "C"
 #endif
diff --git a/src/3rdparty/libwebp/src/enc/vp8l.c b/src/3rdparty/libwebp/src/enc/vp8l.c
index 1572631..c2bb13d 100644
--- a/src/3rdparty/libwebp/src/enc/vp8l.c
+++ b/src/3rdparty/libwebp/src/enc/vp8l.c
@@ -106,14 +106,9 @@ static int AnalyzeEntropy(const uint32_t* argb,
   const uint32_t* last_line = NULL;
   uint32_t last_pix = argb[0];    // so we're sure that pix_diff == 0
 
-  VP8LHistogram* nonpredicted = NULL;
-  VP8LHistogram* predicted =
-      (VP8LHistogram*)malloc(2 * sizeof(*predicted));
-  if (predicted == NULL) return 0;
-  nonpredicted = predicted + 1;
-
-  VP8LHistogramInit(predicted, 0);
-  VP8LHistogramInit(nonpredicted, 0);
+  VP8LHistogramSet* const histo_set = VP8LAllocateHistogramSet(2, 0);
+  if (histo_set == NULL) return 0;
+
   for (y = 0; y < height; ++y) {
     for (x = 0; x < width; ++x) {
       const uint32_t pix = argb[x];
@@ -126,21 +121,28 @@ static int AnalyzeEntropy(const uint32_t* argb,
       {
         const PixOrCopy pix_token = PixOrCopyCreateLiteral(pix);
         const PixOrCopy pix_diff_token = PixOrCopyCreateLiteral(pix_diff);
-        VP8LHistogramAddSinglePixOrCopy(nonpredicted, &pix_token);
-        VP8LHistogramAddSinglePixOrCopy(predicted, &pix_diff_token);
+        VP8LHistogramAddSinglePixOrCopy(histo_set->histograms[0], &pix_token);
+        VP8LHistogramAddSinglePixOrCopy(histo_set->histograms[1],
+                                        &pix_diff_token);
       }
     }
     last_line = argb;
     argb += argb_stride;
   }
-  *nonpredicted_bits = VP8LHistogramEstimateBitsBulk(nonpredicted);
-  *predicted_bits = VP8LHistogramEstimateBitsBulk(predicted);
-  free(predicted);
+  *nonpredicted_bits = VP8LHistogramEstimateBitsBulk(histo_set->histograms[0]);
+  *predicted_bits = VP8LHistogramEstimateBitsBulk(histo_set->histograms[1]);
+  VP8LFreeHistogramSet(histo_set);
   return 1;
 }
 
-static int VP8LEncAnalyze(VP8LEncoder* const enc, WebPImageHint image_hint) {
+static int AnalyzeAndInit(VP8LEncoder* const enc, WebPImageHint image_hint) {
   const WebPPicture* const pic = enc->pic_;
+  const int width = pic->width;
+  const int height = pic->height;
+  const int pix_cnt = width * height;
+  // we round the block size up, so we're guaranteed to have
+  // at most MAX_REFS_BLOCK_PER_IMAGE blocks used:
+  int refs_block_size = (pix_cnt - 1) / MAX_REFS_BLOCK_PER_IMAGE + 1;
   assert(pic != NULL && pic->argb != NULL);
 
   enc->use_palette_ =
@@ -158,7 +160,7 @@ static int VP8LEncAnalyze(VP8LEncoder* const enc, WebPImageHint image_hint) {
       enc->use_cross_color_ = 1;
     } else {
       double non_pred_entropy, pred_entropy;
-      if (!AnalyzeEntropy(pic->argb, pic->width, pic->height, pic->argb_stride,
+      if (!AnalyzeEntropy(pic->argb, width, height, pic->argb_stride,
                           &non_pred_entropy, &pred_entropy)) {
         return 0;
       }
@@ -168,27 +170,38 @@ static int VP8LEncAnalyze(VP8LEncoder* const enc, WebPImageHint image_hint) {
       }
     }
   }
+  if (!VP8LHashChainInit(&enc->hash_chain_, pix_cnt)) return 0;
+
+  // palette-friendly input typically uses fewer literals
+  //  -> reduce block size a bit
+  if (enc->use_palette_) refs_block_size /= 2;
+  VP8LBackwardRefsInit(&enc->refs_[0], refs_block_size);
+  VP8LBackwardRefsInit(&enc->refs_[1], refs_block_size);
 
   return 1;
 }
 
+// Returns false in case of memory error.
 static int GetHuffBitLengthsAndCodes(
     const VP8LHistogramSet* const histogram_image,
     HuffmanTreeCode* const huffman_codes) {
   int i, k;
-  int ok = 1;
+  int ok = 0;
   uint64_t total_length_size = 0;
   uint8_t* mem_buf = NULL;
   const int histogram_image_size = histogram_image->size;
+  int max_num_symbols = 0;
+  uint8_t* buf_rle = NULL;
+  HuffmanTree* huff_tree = NULL;
 
   // Iterate over all histograms and get the aggregate number of codes used.
   for (i = 0; i < histogram_image_size; ++i) {
     const VP8LHistogram* const histo = histogram_image->histograms[i];
     HuffmanTreeCode* const codes = &huffman_codes[5 * i];
     for (k = 0; k < 5; ++k) {
-      const int num_symbols = (k == 0) ? VP8LHistogramNumCodes(histo)
-                            : (k == 4) ? NUM_DISTANCE_CODES
-                            : 256;
+      const int num_symbols =
+          (k == 0) ? VP8LHistogramNumCodes(histo->palette_code_bits_) :
+          (k == 4) ? NUM_DISTANCE_CODES : 256;
       codes[k].num_symbols = num_symbols;
       total_length_size += num_symbols;
     }
@@ -200,10 +213,8 @@ static int GetHuffBitLengthsAndCodes(
     uint8_t* lengths;
     mem_buf = (uint8_t*)WebPSafeCalloc(total_length_size,
                                        sizeof(*lengths) + sizeof(*codes));
-    if (mem_buf == NULL) {
-      ok = 0;
-      goto End;
-    }
+    if (mem_buf == NULL) goto End;
+
     codes = (uint16_t*)mem_buf;
     lengths = (uint8_t*)&codes[total_length_size];
     for (i = 0; i < 5 * histogram_image_size; ++i) {
@@ -212,24 +223,33 @@ static int GetHuffBitLengthsAndCodes(
       huffman_codes[i].code_lengths = lengths;
       codes += bit_length;
       lengths += bit_length;
+      if (max_num_symbols < bit_length) {
+        max_num_symbols = bit_length;
+      }
     }
   }
 
+  buf_rle = (uint8_t*)WebPSafeMalloc(1ULL, max_num_symbols);
+  huff_tree = (HuffmanTree*)WebPSafeMalloc(3ULL * max_num_symbols,
+                                           sizeof(*huff_tree));
+  if (buf_rle == NULL || huff_tree == NULL) goto End;
+
   // Create Huffman trees.
-  for (i = 0; ok && (i < histogram_image_size); ++i) {
+  for (i = 0; i < histogram_image_size; ++i) {
     HuffmanTreeCode* const codes = &huffman_codes[5 * i];
     VP8LHistogram* const histo = histogram_image->histograms[i];
-    ok = ok && VP8LCreateHuffmanTree(histo->literal_, 15, codes + 0);
-    ok = ok && VP8LCreateHuffmanTree(histo->red_, 15, codes + 1);
-    ok = ok && VP8LCreateHuffmanTree(histo->blue_, 15, codes + 2);
-    ok = ok && VP8LCreateHuffmanTree(histo->alpha_, 15, codes + 3);
-    ok = ok && VP8LCreateHuffmanTree(histo->distance_, 15, codes + 4);
+    VP8LCreateHuffmanTree(histo->literal_, 15, buf_rle, huff_tree, codes + 0);
+    VP8LCreateHuffmanTree(histo->red_, 15, buf_rle, huff_tree, codes + 1);
+    VP8LCreateHuffmanTree(histo->blue_, 15, buf_rle, huff_tree, codes + 2);
+    VP8LCreateHuffmanTree(histo->alpha_, 15, buf_rle, huff_tree, codes + 3);
+    VP8LCreateHuffmanTree(histo->distance_, 15, buf_rle, huff_tree, codes + 4);
   }
-
+  ok = 1;
  End:
+  WebPSafeFree(huff_tree);
+  WebPSafeFree(buf_rle);
   if (!ok) {
-    free(mem_buf);
-    // If one VP8LCreateHuffmanTree() above fails, we need to clean up behind.
+    WebPSafeFree(mem_buf);
     memset(huffman_codes, 0, 5 * histogram_image_size * sizeof(*huffman_codes));
   }
   return ok;
@@ -296,18 +316,16 @@ static void StoreHuffmanTreeToBitMask(
   }
 }
 
-static int StoreFullHuffmanCode(VP8LBitWriter* const bw,
-                                const HuffmanTreeCode* const tree) {
-  int ok = 0;
+// 'huff_tree' and 'tokens' are pre-allocated buffers.
+static void StoreFullHuffmanCode(VP8LBitWriter* const bw,
+                                 HuffmanTree* const huff_tree,
+                                 HuffmanTreeToken* const tokens,
+                                 const HuffmanTreeCode* const tree) {
   uint8_t code_length_bitdepth[CODE_LENGTH_CODES] = { 0 };
   uint16_t code_length_bitdepth_symbols[CODE_LENGTH_CODES] = { 0 };
   const int max_tokens = tree->num_symbols;
   int num_tokens;
   HuffmanTreeCode huffman_code;
-  HuffmanTreeToken* const tokens =
-      (HuffmanTreeToken*)WebPSafeMalloc((uint64_t)max_tokens, sizeof(*tokens));
-  if (tokens == NULL) return 0;
-
   huffman_code.num_symbols = CODE_LENGTH_CODES;
   huffman_code.code_lengths = code_length_bitdepth;
   huffman_code.codes = code_length_bitdepth_symbols;
@@ -315,15 +333,14 @@ static int StoreFullHuffmanCode(VP8LBitWriter* const bw,
   VP8LWriteBits(bw, 1, 0);
   num_tokens = VP8LCreateCompressedHuffmanTree(tree, tokens, max_tokens);
   {
-    int histogram[CODE_LENGTH_CODES] = { 0 };
+    uint32_t histogram[CODE_LENGTH_CODES] = { 0 };
+    uint8_t buf_rle[CODE_LENGTH_CODES] = { 0 };
     int i;
     for (i = 0; i < num_tokens; ++i) {
       ++histogram[tokens[i].code];
     }
 
-    if (!VP8LCreateHuffmanTree(histogram, 7, &huffman_code)) {
-      goto End;
-    }
+    VP8LCreateHuffmanTree(histogram, 7, buf_rle, huff_tree, &huffman_code);
   }
 
   StoreHuffmanTreeOfHuffmanTreeToBitMask(bw, code_length_bitdepth);
@@ -360,14 +377,13 @@ static int StoreFullHuffmanCode(VP8LBitWriter* const bw,
     }
     StoreHuffmanTreeToBitMask(bw, tokens, length, &huffman_code);
   }
-  ok = 1;
- End:
-  free(tokens);
-  return ok;
 }
 
-static int StoreHuffmanCode(VP8LBitWriter* const bw,
-                            const HuffmanTreeCode* const huffman_code) {
+// 'huff_tree' and 'tokens' are pre-allocated buffers.
+static void StoreHuffmanCode(VP8LBitWriter* const bw,
+                             HuffmanTree* const huff_tree,
+                             HuffmanTreeToken* const tokens,
+                             const HuffmanTreeCode* const huffman_code) {
   int i;
   int count = 0;
   int symbols[2] = { 0, 0 };
@@ -385,7 +401,6 @@ static int StoreHuffmanCode(VP8LBitWriter* const bw,
   if (count == 0) {   // emit minimal tree for empty cases
     // bits: small tree marker: 1, count-1: 0, large 8-bit code: 0, code: 0
     VP8LWriteBits(bw, 4, 0x01);
-    return 1;
   } else if (count <= 2 && symbols[0] < kMaxSymbol && symbols[1] < kMaxSymbol) {
     VP8LWriteBits(bw, 1, 1);  // Small tree marker to encode 1 or 2 symbols.
     VP8LWriteBits(bw, 1, count - 1);
@@ -399,9 +414,8 @@ static int StoreHuffmanCode(VP8LBitWriter* const bw,
     if (count == 2) {
       VP8LWriteBits(bw, 8, symbols[1]);
     }
-    return 1;
   } else {
-    return StoreFullHuffmanCode(bw, huffman_code);
+    StoreFullHuffmanCode(bw, huff_tree, tokens, huffman_code);
   }
 }
 
@@ -413,18 +427,18 @@ static void WriteHuffmanCode(VP8LBitWriter* const bw,
   VP8LWriteBits(bw, depth, symbol);
 }
 
-static void StoreImageToBitMask(
+static WebPEncodingError StoreImageToBitMask(
     VP8LBitWriter* const bw, int width, int histo_bits,
-    const VP8LBackwardRefs* const refs,
+    VP8LBackwardRefs* const refs,
     const uint16_t* histogram_symbols,
     const HuffmanTreeCode* const huffman_codes) {
   // x and y trace the position in the image.
   int x = 0;
   int y = 0;
   const int histo_xsize = histo_bits ? VP8LSubSampleSize(width, histo_bits) : 1;
-  int i;
-  for (i = 0; i < refs->size; ++i) {
-    const PixOrCopy* const v = &refs->refs[i];
+  VP8LRefsCursor c = VP8LRefsCursorInit(refs);
+  while (VP8LRefsCursorOk(&c)) {
+    const PixOrCopy* const v = c.cur_pos;
     const int histogram_ix = histogram_symbols[histo_bits ?
                                                (y >> histo_bits) * histo_xsize +
                                                (x >> histo_bits) : 0];
@@ -458,88 +472,128 @@ static void StoreImageToBitMask(
       x -= width;
       ++y;
     }
+    VP8LRefsCursorNext(&c);
   }
+  return bw->error_ ? VP8_ENC_ERROR_OUT_OF_MEMORY : VP8_ENC_OK;
 }
 
 // Special case of EncodeImageInternal() for cache-bits=0, histo_bits=31
-static int EncodeImageNoHuffman(VP8LBitWriter* const bw,
-                                const uint32_t* const argb,
-                                int width, int height, int quality) {
+static WebPEncodingError EncodeImageNoHuffman(VP8LBitWriter* const bw,
+                                              const uint32_t* const argb,
+                                              VP8LHashChain* const hash_chain,
+                                              VP8LBackwardRefs refs_array[2],
+                                              int width, int height,
+                                              int quality) {
   int i;
-  int ok = 0;
-  VP8LBackwardRefs refs;
+  int max_tokens = 0;
+  WebPEncodingError err = VP8_ENC_OK;
+  VP8LBackwardRefs* refs;
+  HuffmanTreeToken* tokens = NULL;
   HuffmanTreeCode huffman_codes[5] = { { 0, NULL, NULL } };
   const uint16_t histogram_symbols[1] = { 0 };    // only one tree, one symbol
   VP8LHistogramSet* const histogram_image = VP8LAllocateHistogramSet(1, 0);
-  if (histogram_image == NULL) return 0;
+  HuffmanTree* const huff_tree = (HuffmanTree*)WebPSafeMalloc(
+        3ULL * CODE_LENGTH_CODES, sizeof(*huff_tree));
+  if (histogram_image == NULL || huff_tree == NULL) {
+    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+    goto Error;
+  }
 
   // Calculate backward references from ARGB image.
-  if (!VP8LGetBackwardReferences(width, height, argb, quality, 0, 1, &refs)) {
+  refs = VP8LGetBackwardReferences(width, height, argb, quality, 0, 1,
+                                   hash_chain, refs_array);
+  if (refs == NULL) {
+    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
     goto Error;
   }
   // Build histogram image and symbols from backward references.
-  VP8LHistogramStoreRefs(&refs, histogram_image->histograms[0]);
+  VP8LHistogramStoreRefs(refs, histogram_image->histograms[0]);
 
   // Create Huffman bit lengths and codes for each histogram image.
   assert(histogram_image->size == 1);
   if (!GetHuffBitLengthsAndCodes(histogram_image, huffman_codes)) {
+    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
     goto Error;
   }
 
   // No color cache, no Huffman image.
   VP8LWriteBits(bw, 1, 0);
 
-  // Store Huffman codes.
+  // Find maximum number of symbols for the huffman tree-set.
   for (i = 0; i < 5; ++i) {
     HuffmanTreeCode* const codes = &huffman_codes[i];
-    if (!StoreHuffmanCode(bw, codes)) {
-      goto Error;
+    if (max_tokens < codes->num_symbols) {
+      max_tokens = codes->num_symbols;
     }
+  }
+
+  tokens = (HuffmanTreeToken*)WebPSafeMalloc(max_tokens, sizeof(*tokens));
+  if (tokens == NULL) {
+    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+    goto Error;
+  }
+
+  // Store Huffman codes.
+  for (i = 0; i < 5; ++i) {
+    HuffmanTreeCode* const codes = &huffman_codes[i];
+    StoreHuffmanCode(bw, huff_tree, tokens, codes);
     ClearHuffmanTreeIfOnlyOneSymbol(codes);
   }
 
   // Store actual literals.
-  StoreImageToBitMask(bw, width, 0, &refs, histogram_symbols, huffman_codes);
-  ok = 1;
+  err = StoreImageToBitMask(bw, width, 0, refs, histogram_symbols,
+                            huffman_codes);
 
  Error:
-  free(histogram_image);
-  VP8LClearBackwardRefs(&refs);
-  free(huffman_codes[0].codes);
-  return ok;
+  WebPSafeFree(tokens);
+  WebPSafeFree(huff_tree);
+  VP8LFreeHistogramSet(histogram_image);
+  WebPSafeFree(huffman_codes[0].codes);
+  return err;
 }
 
-static int EncodeImageInternal(VP8LBitWriter* const bw,
-                               const uint32_t* const argb,
-                               int width, int height, int quality,
-                               int cache_bits, int histogram_bits) {
-  int ok = 0;
+static WebPEncodingError EncodeImageInternal(VP8LBitWriter* const bw,
+                                             const uint32_t* const argb,
+                                             VP8LHashChain* const hash_chain,
+                                             VP8LBackwardRefs refs_array[2],
+                                             int width, int height, int quality,
+                                             int cache_bits,
+                                             int histogram_bits) {
+  WebPEncodingError err = VP8_ENC_OK;
   const int use_2d_locality = 1;
   const int use_color_cache = (cache_bits > 0);
   const uint32_t histogram_image_xysize =
       VP8LSubSampleSize(width, histogram_bits) *
       VP8LSubSampleSize(height, histogram_bits);
   VP8LHistogramSet* histogram_image =
-      VP8LAllocateHistogramSet(histogram_image_xysize, 0);
+      VP8LAllocateHistogramSet(histogram_image_xysize, cache_bits);
   int histogram_image_size = 0;
   size_t bit_array_size = 0;
+  HuffmanTree* huff_tree = NULL;
+  HuffmanTreeToken* tokens = NULL;
   HuffmanTreeCode* huffman_codes = NULL;
   VP8LBackwardRefs refs;
+  VP8LBackwardRefs* best_refs;
   uint16_t* const histogram_symbols =
-      (uint16_t*)WebPSafeMalloc((uint64_t)histogram_image_xysize,
+      (uint16_t*)WebPSafeMalloc(histogram_image_xysize,
                                 sizeof(*histogram_symbols));
   assert(histogram_bits >= MIN_HUFFMAN_BITS);
   assert(histogram_bits <= MAX_HUFFMAN_BITS);
 
+  VP8LBackwardRefsInit(&refs, refs_array[0].block_size_);
   if (histogram_image == NULL || histogram_symbols == NULL) {
-    free(histogram_image);
-    free(histogram_symbols);
-    return 0;
+    VP8LFreeHistogramSet(histogram_image);
+    WebPSafeFree(histogram_symbols);
+    return VP8_ENC_ERROR_OUT_OF_MEMORY;  // 0 would be read as VP8_ENC_OK.
   }
 
+  // 'best_refs' points to the best backward refs: refs_array[0] or [1].
   // Calculate backward references from ARGB image.
-  if (!VP8LGetBackwardReferences(width, height, argb, quality, cache_bits,
-                                 use_2d_locality, &refs)) {
+  best_refs = VP8LGetBackwardReferences(width, height, argb, quality,
+                                        cache_bits, use_2d_locality,
+                                        hash_chain, refs_array);
+  if (best_refs == NULL || !VP8LBackwardRefsCopy(best_refs, &refs)) {
+    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
     goto Error;
   }
   // Build histogram image and symbols from backward references.
@@ -559,7 +613,7 @@ static int EncodeImageInternal(VP8LBitWriter* const bw,
     goto Error;
   }
   // Free combined histograms.
-  free(histogram_image);
+  VP8LFreeHistogramSet(histogram_image);
   histogram_image = NULL;
 
   // Color Cache parameters.
@@ -574,7 +628,7 @@ static int EncodeImageInternal(VP8LBitWriter* const bw,
     VP8LWriteBits(bw, 1, write_histogram_image);
     if (write_histogram_image) {
       uint32_t* const histogram_argb =
-          (uint32_t*)WebPSafeMalloc((uint64_t)histogram_image_xysize,
+          (uint32_t*)WebPSafeMalloc(histogram_image_xysize,
                                     sizeof(*histogram_argb));
       int max_index = 0;
       uint32_t i;
@@ -589,40 +643,54 @@ static int EncodeImageInternal(VP8LBitWriter* const bw,
       histogram_image_size = max_index;
 
       VP8LWriteBits(bw, 3, histogram_bits - 2);
-      ok = EncodeImageNoHuffman(bw, histogram_argb,
-                                VP8LSubSampleSize(width, histogram_bits),
-                                VP8LSubSampleSize(height, histogram_bits),
-                                quality);
-      free(histogram_argb);
-      if (!ok) goto Error;
+      err = EncodeImageNoHuffman(bw, histogram_argb, hash_chain, refs_array,
+                                 VP8LSubSampleSize(width, histogram_bits),
+                                 VP8LSubSampleSize(height, histogram_bits),
+                                 quality);
+      WebPSafeFree(histogram_argb);
+      if (err != VP8_ENC_OK) goto Error;
     }
   }
 
   // Store Huffman codes.
   {
     int i;
+    int max_tokens = 0;
+    // Find maximum number of symbols for the huffman tree-set.
+    for (i = 0; i < 5 * histogram_image_size; ++i) {
+      HuffmanTreeCode* const codes = &huffman_codes[i];
+      if (max_tokens < codes->num_symbols) max_tokens = codes->num_symbols;
+    }
+    huff_tree = (HuffmanTree*)WebPSafeMalloc(3ULL * CODE_LENGTH_CODES,
+                                             sizeof(*huff_tree));
+    tokens = (HuffmanTreeToken*)WebPSafeMalloc(max_tokens, sizeof(*tokens));
+    // 'err' may still be VP8_ENC_OK here, so set it before bailing out.
+    if (huff_tree == NULL || tokens == NULL) {
+      err = VP8_ENC_ERROR_OUT_OF_MEMORY;
+      goto Error;
+    }
     for (i = 0; i < 5 * histogram_image_size; ++i) {
       HuffmanTreeCode* const codes = &huffman_codes[i];
-      if (!StoreHuffmanCode(bw, codes)) goto Error;
+      StoreHuffmanCode(bw, huff_tree, tokens, codes);
       ClearHuffmanTreeIfOnlyOneSymbol(codes);
     }
   }
 
   // Store actual literals.
-  StoreImageToBitMask(bw, width, histogram_bits, &refs,
-                      histogram_symbols, huffman_codes);
-  ok = 1;
+  err = StoreImageToBitMask(bw, width, histogram_bits, &refs,
+                            histogram_symbols, huffman_codes);
 
  Error:
-  free(histogram_image);
-
-  VP8LClearBackwardRefs(&refs);
+  WebPSafeFree(tokens);
+  WebPSafeFree(huff_tree);
+  VP8LFreeHistogramSet(histogram_image);
+  VP8LBackwardRefsClear(&refs);
   if (huffman_codes != NULL) {
-    free(huffman_codes->codes);
-    free(huffman_codes);
+    WebPSafeFree(huffman_codes->codes);
+    WebPSafeFree(huffman_codes);
   }
-  free(histogram_symbols);
-  return ok;
+  WebPSafeFree(histogram_symbols);
+  return err;
 }
 
 // -----------------------------------------------------------------------------
@@ -630,17 +698,16 @@ static int EncodeImageInternal(VP8LBitWriter* const bw,
 
 // Check if it would be a good idea to subtract green from red and blue. We
 // only impact entropy in red/blue components, don't bother to look at others.
-static int EvalAndApplySubtractGreen(VP8LEncoder* const enc,
-                                     int width, int height,
-                                     VP8LBitWriter* const bw) {
+static WebPEncodingError EvalAndApplySubtractGreen(VP8LEncoder* const enc,
+                                                   int width, int height,
+                                                   VP8LBitWriter* const bw) {
   if (!enc->use_palette_) {
     int i;
     const uint32_t* const argb = enc->argb_;
     double bit_cost_before, bit_cost_after;
-    VP8LHistogram* const histo = (VP8LHistogram*)malloc(sizeof(*histo));
-    if (histo == NULL) return 0;
-
-    VP8LHistogramInit(histo, 1);
+    // Allocate histogram with cache_bits = 1.
+    VP8LHistogram* const histo = VP8LAllocateHistogram(1);
+    if (histo == NULL) return VP8_ENC_ERROR_OUT_OF_MEMORY;
     for (i = 0; i < width * height; ++i) {
       const uint32_t c = argb[i];
       ++histo->red_[(c >> 16) & 0xff];
@@ -656,7 +723,7 @@ static int EvalAndApplySubtractGreen(VP8LEncoder* const enc,
       ++histo->blue_[((c >> 0) - green) & 0xff];
     }
     bit_cost_after = VP8LHistogramEstimateBits(histo);
-    free(histo);
+    VP8LFreeHistogram(histo);
 
     // Check if subtracting green yields low entropy.
     enc->use_subtract_green_ = (bit_cost_after < bit_cost_before);
@@ -666,12 +733,12 @@ static int EvalAndApplySubtractGreen(VP8LEncoder* const enc,
       VP8LSubtractGreenFromBlueAndRed(enc->argb_, width * height);
     }
   }
-  return 1;
+  return VP8_ENC_OK;
 }
 
-static int ApplyPredictFilter(const VP8LEncoder* const enc,
-                              int width, int height, int quality,
-                              VP8LBitWriter* const bw) {
+static WebPEncodingError ApplyPredictFilter(const VP8LEncoder* const enc,
+                                            int width, int height, int quality,
+                                            VP8LBitWriter* const bw) {
   const int pred_bits = enc->transform_bits_;
   const int transform_width = VP8LSubSampleSize(width, pred_bits);
   const int transform_height = VP8LSubSampleSize(height, pred_bits);
@@ -682,32 +749,32 @@ static int ApplyPredictFilter(const VP8LEncoder* const enc,
   VP8LWriteBits(bw, 2, PREDICTOR_TRANSFORM);
   assert(pred_bits >= 2);
   VP8LWriteBits(bw, 3, pred_bits - 2);
-  if (!EncodeImageNoHuffman(bw, enc->transform_data_,
-                            transform_width, transform_height, quality)) {
-    return 0;
-  }
-  return 1;
+  return EncodeImageNoHuffman(bw, enc->transform_data_,
+                              (VP8LHashChain*)&enc->hash_chain_,
+                              (VP8LBackwardRefs*)enc->refs_,  // cast const away
+                              transform_width, transform_height,
+                              quality);
 }
 
-static int ApplyCrossColorFilter(const VP8LEncoder* const enc,
-                                 int width, int height, int quality,
-                                 VP8LBitWriter* const bw) {
+static WebPEncodingError ApplyCrossColorFilter(const VP8LEncoder* const enc,
+                                               int width, int height,
+                                               int quality,
+                                               VP8LBitWriter* const bw) {
   const int ccolor_transform_bits = enc->transform_bits_;
   const int transform_width = VP8LSubSampleSize(width, ccolor_transform_bits);
   const int transform_height = VP8LSubSampleSize(height, ccolor_transform_bits);
-  const int step = (quality < 25) ? 32 : (quality > 50) ? 8 : 16;
 
-  VP8LColorSpaceTransform(width, height, ccolor_transform_bits, step,
+  VP8LColorSpaceTransform(width, height, ccolor_transform_bits, quality,
                           enc->argb_, enc->transform_data_);
   VP8LWriteBits(bw, 1, TRANSFORM_PRESENT);
   VP8LWriteBits(bw, 2, CROSS_COLOR_TRANSFORM);
   assert(ccolor_transform_bits >= 2);
   VP8LWriteBits(bw, 3, ccolor_transform_bits - 2);
-  if (!EncodeImageNoHuffman(bw, enc->transform_data_,
-                            transform_width, transform_height, quality)) {
-    return 0;
-  }
-  return 1;
+  return EncodeImageNoHuffman(bw, enc->transform_data_,
+                              (VP8LHashChain*)&enc->hash_chain_,
+                              (VP8LBackwardRefs*)enc->refs_,  // cast const away
+                              transform_width, transform_height,
+                              quality);
 }
 
 // -----------------------------------------------------------------------------
@@ -785,11 +852,11 @@ static WebPEncodingError AllocateTransformBuffer(VP8LEncoder* const enc,
   const int tile_size = 1 << enc->transform_bits_;
   const uint64_t image_size = width * height;
   const uint64_t argb_scratch_size = tile_size * width + width;
-  const uint64_t transform_data_size =
-      (uint64_t)VP8LSubSampleSize(width, enc->transform_bits_) *
-      (uint64_t)VP8LSubSampleSize(height, enc->transform_bits_);
+  const int transform_data_size =
+      VP8LSubSampleSize(width, enc->transform_bits_) *
+      VP8LSubSampleSize(height, enc->transform_bits_);
   const uint64_t total_size =
-      image_size + argb_scratch_size + transform_data_size;
+      image_size + argb_scratch_size + (uint64_t)transform_data_size;
   uint32_t* mem = (uint32_t*)WebPSafeMalloc(total_size, sizeof(*mem));
   if (mem == NULL) {
     err = VP8_ENC_ERROR_OUT_OF_MEMORY;
@@ -888,7 +955,7 @@ static WebPEncodingError EncodePalette(VP8LBitWriter* const bw,
   if (err != VP8_ENC_OK) goto Error;
   dst = enc->argb_;
 
-  row = (uint8_t*)WebPSafeMalloc((uint64_t)width, sizeof(*row));
+  row = (uint8_t*)WebPSafeMalloc(width, sizeof(*row));
   if (row == NULL) return VP8_ENC_ERROR_OUT_OF_MEMORY;
 
   ApplyPalette(src, dst, pic->argb_stride, enc->current_width_,
@@ -902,42 +969,48 @@ static WebPEncodingError EncodePalette(VP8LBitWriter* const bw,
   for (i = palette_size - 1; i >= 1; --i) {
     palette[i] = VP8LSubPixels(palette[i], palette[i - 1]);
   }
-  if (!EncodeImageNoHuffman(bw, palette, palette_size, 1, quality)) {
-    err = VP8_ENC_ERROR_INVALID_CONFIGURATION;
-    goto Error;
-  }
+  err = EncodeImageNoHuffman(bw, palette, &enc->hash_chain_, enc->refs_,
+                             palette_size, 1, quality);
 
  Error:
-  free(row);
+  WebPSafeFree(row);
   return err;
 }
 
 // -----------------------------------------------------------------------------
 
 static int GetHistoBits(int method, int use_palette, int width, int height) {
-  const uint64_t hist_size = sizeof(VP8LHistogram);
+  const int hist_size = VP8LGetHistogramSize(MAX_COLOR_CACHE_BITS);
   // Make tile size a function of encoding method (Range: 0 to 6).
   int histo_bits = (use_palette ? 9 : 7) - method;
   while (1) {
-    const uint64_t huff_image_size = VP8LSubSampleSize(width, histo_bits) *
-                                     VP8LSubSampleSize(height, histo_bits) *
-                                     hist_size;
-    if (huff_image_size <= MAX_HUFF_IMAGE_SIZE) break;
+    const int huff_image_size = VP8LSubSampleSize(width, histo_bits) *
+                                VP8LSubSampleSize(height, histo_bits);
+    if ((uint64_t)huff_image_size * hist_size <= MAX_HUFF_IMAGE_SIZE) break;
     ++histo_bits;
   }
   return (histo_bits < MIN_HUFFMAN_BITS) ? MIN_HUFFMAN_BITS :
          (histo_bits > MAX_HUFFMAN_BITS) ? MAX_HUFFMAN_BITS : histo_bits;
 }
 
+static int GetTransformBits(int method, int histo_bits) {
+  const int max_transform_bits = (method < 4) ? 6 : (method > 4) ? 4 : 5;
+  return (histo_bits > max_transform_bits) ? max_transform_bits : histo_bits;
+}
+
+static int GetCacheBits(float quality) {
+  return (quality <= 25.f) ? 0 : 7;
+}
+
 static void FinishEncParams(VP8LEncoder* const enc) {
   const WebPConfig* const config = enc->config_;
   const WebPPicture* const pic = enc->pic_;
   const int method = config->method;
   const float quality = config->quality;
   const int use_palette = enc->use_palette_;
-  enc->transform_bits_ = (method < 4) ? 5 : (method > 4) ? 3 : 4;
   enc->histo_bits_ = GetHistoBits(method, use_palette, pic->width, pic->height);
-  enc->cache_bits_ = (quality <= 25.f) ? 0 : 7;
+  enc->transform_bits_ = GetTransformBits(method, enc->histo_bits_);
+  enc->cache_bits_ = GetCacheBits(quality);
 }
 
 // -----------------------------------------------------------------------------
@@ -945,7 +1018,7 @@ static void FinishEncParams(VP8LEncoder* const enc) {
 
 static VP8LEncoder* VP8LEncoderNew(const WebPConfig* const config,
                                    const WebPPicture* const picture) {
-  VP8LEncoder* const enc = (VP8LEncoder*)calloc(1, sizeof(*enc));
+  VP8LEncoder* const enc = (VP8LEncoder*)WebPSafeCalloc(1ULL, sizeof(*enc));
   if (enc == NULL) {
     WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
     return NULL;
@@ -959,8 +1032,13 @@ static VP8LEncoder* VP8LEncoderNew(const WebPConfig* const config,
 }
 
 static void VP8LEncoderDelete(VP8LEncoder* enc) {
-  free(enc->argb_);
-  free(enc);
+  if (enc != NULL) {
+    VP8LHashChainClear(&enc->hash_chain_);
+    VP8LBackwardRefsClear(&enc->refs_[0]);
+    VP8LBackwardRefsClear(&enc->refs_[1]);
+    WebPSafeFree(enc->argb_);
+    WebPSafeFree(enc);
+  }
 }
 
 // -----------------------------------------------------------------------------
@@ -984,7 +1062,7 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
   // ---------------------------------------------------------------------------
   // Analyze image (entropy, num_palettes etc)
 
-  if (!VP8LEncAnalyze(enc, config->image_hint)) {
+  if (!AnalyzeAndInit(enc, config->image_hint)) {
     err = VP8_ENC_ERROR_OUT_OF_MEMORY;
     goto Error;
   }
@@ -1003,6 +1081,7 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
     int y;
     err = AllocateTransformBuffer(enc, width, height);
     if (err != VP8_ENC_OK) goto Error;
+    assert(enc->argb_ != NULL);
     for (y = 0; y < height; ++y) {
       memcpy(enc->argb_ + y * width,
              picture->argb + y * picture->argb_stride,
@@ -1014,23 +1093,17 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
   // ---------------------------------------------------------------------------
   // Apply transforms and write transform data.
 
-  if (!EvalAndApplySubtractGreen(enc, enc->current_width_, height, bw)) {
-    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
-    goto Error;
-  }
+  err = EvalAndApplySubtractGreen(enc, enc->current_width_, height, bw);
+  if (err != VP8_ENC_OK) goto Error;
 
   if (enc->use_predict_) {
-    if (!ApplyPredictFilter(enc, enc->current_width_, height, quality, bw)) {
-      err = VP8_ENC_ERROR_INVALID_CONFIGURATION;
-      goto Error;
-    }
+    err = ApplyPredictFilter(enc, enc->current_width_, height, quality, bw);
+    if (err != VP8_ENC_OK) goto Error;
   }
 
   if (enc->use_cross_color_) {
-    if (!ApplyCrossColorFilter(enc, enc->current_width_, height, quality, bw)) {
-      err = VP8_ENC_ERROR_INVALID_CONFIGURATION;
-      goto Error;
-    }
+    err = ApplyCrossColorFilter(enc, enc->current_width_, height, quality, bw);
+    if (err != VP8_ENC_OK) goto Error;
   }
 
   VP8LWriteBits(bw, 1, !TRANSFORM_PRESENT);  // No more transforms.
@@ -1040,8 +1113,9 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
 
   if (enc->cache_bits_ > 0) {
     if (!VP8LCalculateEstimateForCacheSize(enc->argb_, enc->current_width_,
-                                           height, &enc->cache_bits_)) {
-      err = VP8_ENC_ERROR_INVALID_CONFIGURATION;
+                                           height, quality, &enc->hash_chain_,
+                                           &enc->refs_[0], &enc->cache_bits_)) {
+      err = VP8_ENC_ERROR_OUT_OF_MEMORY;
       goto Error;
     }
   }
@@ -1049,11 +1123,10 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
   // ---------------------------------------------------------------------------
   // Encode and write the transformed image.
 
-  if (!EncodeImageInternal(bw, enc->argb_, enc->current_width_, height,
-                           quality, enc->cache_bits_, enc->histo_bits_)) {
-    err = VP8_ENC_ERROR_OUT_OF_MEMORY;
-    goto Error;
-  }
+  err = EncodeImageInternal(bw, enc->argb_, &enc->hash_chain_, enc->refs_,
+                            enc->current_width_, height, quality,
+                            enc->cache_bits_, enc->histo_bits_);
+  if (err != VP8_ENC_OK) goto Error;
 
   if (picture->stats != NULL) {
     WebPAuxStats* const stats = picture->stats;
@@ -1080,6 +1153,7 @@ int VP8LEncodeImage(const WebPConfig* const config,
   int has_alpha;
   size_t coded_size;
   int percent = 0;
+  int initial_size;
   WebPEncodingError err = VP8_ENC_OK;
   VP8LBitWriter bw;
 
@@ -1093,7 +1167,11 @@ int VP8LEncodeImage(const WebPConfig* const config,
 
   width = picture->width;
   height = picture->height;
-  if (!VP8LBitWriterInit(&bw, (width * height) >> 1)) {
+  // Initialize the BitWriter with a size corresponding to 16 bpp for photo
+  // images and 8 bpp for graphical images.
+  initial_size = (config->image_hint == WEBP_HINT_GRAPH) ?
+                 width * height : width * height * 2;
+  if (!VP8LBitWriterInit(&bw, initial_size)) {
     err = VP8_ENC_ERROR_OUT_OF_MEMORY;
     goto Error;
   }
@@ -1165,4 +1243,3 @@ int VP8LEncodeImage(const WebPConfig* const config,
 }
 
 //------------------------------------------------------------------------------
-
diff --git a/src/3rdparty/libwebp/src/enc/vp8li.h b/src/3rdparty/libwebp/src/enc/vp8li.h
index 96d6fae..6b6db12 100644
--- a/src/3rdparty/libwebp/src/enc/vp8li.h
+++ b/src/3rdparty/libwebp/src/enc/vp8li.h
@@ -14,6 +14,7 @@
 #ifndef WEBP_ENC_VP8LI_H_
 #define WEBP_ENC_VP8LI_H_
 
+#include "./backward_references.h"
 #include "./histogram.h"
 #include "../utils/bit_writer.h"
 #include "../webp/encode.h"
@@ -45,6 +46,12 @@ typedef struct {
   int use_palette_;
   int palette_size_;
   uint32_t palette_[MAX_PALETTE_SIZE];
+
+  // Some 'scratch' (potentially large) objects.
+  struct VP8LBackwardRefs refs_[2];  // Backward Refs array corresponding to
+                                     // LZ77 & RLE coding.
+  VP8LHashChain hash_chain_;         // HashChain data for constructing
+                                     // backward references.
 } VP8LEncoder;
 
 //------------------------------------------------------------------------------
diff --git a/src/3rdparty/libwebp/src/enc/webpenc.c b/src/3rdparty/libwebp/src/enc/webpenc.c
index 207cce6..ca85e0b 100644
--- a/src/3rdparty/libwebp/src/enc/webpenc.c
+++ b/src/3rdparty/libwebp/src/enc/webpenc.c
@@ -18,6 +18,7 @@
 
 #include "./vp8enci.h"
 #include "./vp8li.h"
+#include "./cost.h"
 #include "../utils/utils.h"
 
 // #define PRINT_MEMORY_INFO
@@ -33,31 +34,6 @@ int WebPGetEncoderVersion(void) {
 }
 
 //------------------------------------------------------------------------------
-// WebPPicture
-//------------------------------------------------------------------------------
-
-static int DummyWriter(const uint8_t* data, size_t data_size,
-                       const WebPPicture* const picture) {
-  // The following are to prevent 'unused variable' error message.
-  (void)data;
-  (void)data_size;
-  (void)picture;
-  return 1;
-}
-
-int WebPPictureInitInternal(WebPPicture* picture, int version) {
-  if (WEBP_ABI_IS_INCOMPATIBLE(version, WEBP_ENCODER_ABI_VERSION)) {
-    return 0;   // caller/system version mismatch!
-  }
-  if (picture != NULL) {
-    memset(picture, 0, sizeof(*picture));
-    picture->writer = DummyWriter;
-    WebPEncodingSetError(picture, VP8_ENC_OK);
-  }
-  return 1;
-}
-
-//------------------------------------------------------------------------------
 // VP8Encoder
 //------------------------------------------------------------------------------
 
@@ -143,23 +119,21 @@ static void MapConfigToTools(VP8Encoder* const enc) {
 // Memory scaling with dimensions:
 //  memory (bytes) ~= 2.25 * w + 0.0625 * w * h
 //
-// Typical memory footprint (768x510 picture)
-// Memory used:
-//              encoder: 33919
-//          block cache: 2880
-//                 info: 3072
-//                preds: 24897
-//          top samples: 1623
-//             non-zero: 196
-//             lf-stats: 2048
-//                total: 68635
+// Typical memory footprint (614x440 picture)
+//              encoder: 22111
+//                 info: 4368
+//                preds: 17741
+//          top samples: 1263
+//             non-zero: 175
+//             lf-stats: 0
+//                total: 45658
 // Transient object sizes:
-//       VP8EncIterator: 352
-//         VP8ModeScore: 912
-//       VP8SegmentInfo: 532
-//             VP8Proba: 31032
+//       VP8EncIterator: 3360
+//         VP8ModeScore: 872
+//       VP8SegmentInfo: 732
+//             VP8Proba: 18352
 //              LFStats: 2048
-// Picture size (yuv): 589824
+// Picture size (yuv): 419328
 
 static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
                                   WebPPicture* const picture) {
@@ -251,13 +225,16 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
   ResetSegmentHeader(enc);
   ResetFilterHeader(enc);
   ResetBoundaryPredictions(enc);
-
+  VP8GetResidualCostInit();
+  VP8SetResidualCoeffsInit();
   VP8EncInitAlpha(enc);
-#ifdef WEBP_EXPERIMENTAL_FEATURES
-  VP8EncInitLayer(enc);
-#endif
 
-  VP8TBufferInit(&enc->tokens_);
+  // Lower quality means smaller output, so we slightly modulate the page
+  // size based on quality. This is just a crude first-order prediction.
+  {
+    const float scale = 1.f + config->quality * 5.f / 100.f;  // in [1,6]
+    VP8TBufferInit(&enc->tokens_, (int)(mb_w * mb_h * 4 * scale));
+  }
   return enc;
 }
 
@@ -265,11 +242,8 @@ static int DeleteVP8Encoder(VP8Encoder* enc) {
   int ok = 1;
   if (enc != NULL) {
     ok = VP8EncDeleteAlpha(enc);
-#ifdef WEBP_EXPERIMENTAL_FEATURES
-    VP8EncDeleteLayer(enc);
-#endif
     VP8TBufferClear(&enc->tokens_);
-    free(enc);
+    WebPSafeFree(enc);
   }
   return ok;
 }
@@ -352,18 +326,26 @@ int WebPEncode(const WebPConfig* config, WebPPicture* pic) {
 
   if (!config->lossless) {
     VP8Encoder* enc = NULL;
-    if (pic->y == NULL || pic->u == NULL || pic->v == NULL) {
+    if (pic->use_argb || pic->y == NULL || pic->u == NULL || pic->v == NULL) {
       // Make sure we have YUVA samples.
-      float dithering = 0.f;
-      if (config->preprocessing & 2) {
-        const float x = config->quality / 100.f;
-        const float x2 = x * x;
-        // slowly decreasing from max dithering at low quality (q->0)
-        // to 0.5 dithering amplitude at high quality (q->100)
-        dithering = 1.0f + (0.5f - 1.0f) * x2 * x2;
-      }
-      if (!WebPPictureARGBToYUVADithered(pic, WEBP_YUV420, dithering)) {
-        return 0;
+      if (config->preprocessing & 4) {
+#if WEBP_ENCODER_ABI_VERSION > 0x0204
+        if (!WebPPictureSmartARGBToYUVA(pic)) {
+          return 0;
+        }
+#endif
+      } else {
+        float dithering = 0.f;
+        if (config->preprocessing & 2) {
+          const float x = config->quality / 100.f;
+          const float x2 = x * x;
+          // slowly decreasing from max dithering at low quality (q->0)
+          // to 0.5 dithering amplitude at high quality (q->100)
+          dithering = 1.0f + (0.5f - 1.0f) * x2 * x2;
+        }
+        if (!WebPPictureARGBToYUVADithered(pic, WEBP_YUV420, dithering)) {
+          return 0;
+        }
       }
     }
 
@@ -380,9 +362,6 @@ int WebPEncode(const WebPConfig* config, WebPPicture* pic) {
       ok = ok && VP8EncTokenLoop(enc);
     }
     ok = ok && VP8EncFinishAlpha(enc);
-#ifdef WEBP_EXPERIMENTAL_FEATURES
-    ok = ok && VP8EncFinishLayer(enc);
-#endif
 
     ok = ok && VP8EncWrite(enc);
     StoreStats(enc);
@@ -401,4 +380,3 @@ int WebPEncode(const WebPConfig* config, WebPPicture* pic) {
 
   return ok;
 }
-