26 files changed, 281 insertions, 133 deletions
diff --git a/src/3rdparty/libwebp/src/utils/bit_reader_inl.h b/src/3rdparty/libwebp/src/utils/bit_reader_inl_utils.h
index 3721570..fd7fb04 100644
--- a/src/3rdparty/libwebp/src/utils/bit_reader_inl.h
+++ b/src/3rdparty/libwebp/src/utils/bit_reader_inl_utils.h
@@ -20,13 +20,12 @@
 #include "../webp/config.h"
 #endif
 
-#ifdef WEBP_FORCE_ALIGNED
-#include <string.h>  // memcpy
-#endif
+#include <string.h>  // for memcpy
 
 #include "../dsp/dsp.h"
-#include "./bit_reader.h"
-#include "./endian_inl.h"
+#include "./bit_reader_utils.h"
+#include "./endian_inl_utils.h"
+#include "./utils.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -55,16 +54,14 @@ void VP8LoadFinalBytes(VP8BitReader* const br);
 // Inlined critical functions
 
 // makes sure br->value_ has at least BITS bits worth of data
-static WEBP_INLINE void VP8LoadNewBytes(VP8BitReader* const br) {
+static WEBP_UBSAN_IGNORE_UNDEF WEBP_INLINE
+void VP8LoadNewBytes(VP8BitReader* const br) {
   assert(br != NULL && br->buf_ != NULL);
   // Read 'BITS' bits at a time if possible.
   if (br->buf_ < br->buf_max_) {
     // convert memory type to register type (with some zero'ing!)
     bit_t bits;
-#if defined(WEBP_FORCE_ALIGNED)
-    lbit_t in_bits;
-    memcpy(&in_bits, br->buf_, sizeof(in_bits));
-#elif defined(WEBP_USE_MIPS32)
+#if defined(WEBP_USE_MIPS32)
     // This is needed because of un-aligned read.
     lbit_t in_bits;
     lbit_t* p_buf_ = (lbit_t*)br->buf_;
@@ -79,7 +76,8 @@ static WEBP_INLINE void VP8LoadNewBytes(VP8BitReader* const br) {
       : "memory", "at"
     );
 #else
-    const lbit_t in_bits = *(const lbit_t*)br->buf_;
+    lbit_t in_bits;
+    memcpy(&in_bits, br->buf_, sizeof(in_bits));
 #endif
     br->buf_ += BITS >> 3;
 #if !defined(WORDS_BIGENDIAN)
@@ -118,37 +116,26 @@ static WEBP_INLINE int VP8GetBit(VP8BitReader* const br, int prob) {
     const int pos = br->bits_;
     const range_t split = (range * prob) >> 8;
     const range_t value = (range_t)(br->value_ >> pos);
-#if defined(__arm__) || defined(_M_ARM)      // ARM-specific
-    const int bit = ((int)(split - value) >> 31) & 1;
-    if (value > split) {
-      range -= split + 1;
-      br->value_ -= (bit_t)(split + 1) << pos;
-    } else {
-      range = split;
-    }
-#else  // faster version on x86
-    int bit;  // Don't use 'const int bit = (value > split);", it's slower.
-    if (value > split) {
-      range -= split + 1;
+    const int bit = (value > split);
+    if (bit) {
+      range -= split;
       br->value_ -= (bit_t)(split + 1) << pos;
-      bit = 1;
     } else {
-      range = split;
-      bit = 0;
+      range = split + 1;
     }
-#endif
-    if (range <= (range_t)0x7e) {
-      const int shift = kVP8Log2Range[range];
-      range = kVP8NewRange[range];
+    {
+      const int shift = 7 ^ BitsLog2Floor(range);
+      range <<= shift;
       br->bits_ -= shift;
     }
-    br->range_ = range;
+    br->range_ = range - 1;
     return bit;
   }
 }
 
 // simplified version of VP8GetBit() for prob=0x80 (note shift is always 1 here)
-static WEBP_INLINE int VP8GetSigned(VP8BitReader* const br, int v) {
+static WEBP_UBSAN_IGNORE_UNSIGNED_OVERFLOW WEBP_INLINE
+int VP8GetSigned(VP8BitReader* const br, int v) {
   if (br->bits_ < 0) {
     VP8LoadNewBytes(br);
   }
@@ -165,6 +152,37 @@ static WEBP_INLINE int VP8GetSigned(VP8BitReader* const br, int v) {
   }
 }
 
+static WEBP_INLINE int VP8GetBitAlt(VP8BitReader* const br, int prob) {
+  // Don't move this declaration! It makes a big speed difference to store
+  // 'range' *before* calling VP8LoadNewBytes(), even if this function doesn't
+  // alter br->range_ value.
+  range_t range = br->range_;
+  if (br->bits_ < 0) {
+    VP8LoadNewBytes(br);
+  }
+  {
+    const int pos = br->bits_;
+    const range_t split = (range * prob) >> 8;
+    const range_t value = (range_t)(br->value_ >> pos);
+    int bit;  // Don't use 'const int bit = (value > split);", it's slower.
+    if (value > split) {
+      range -= split + 1;
+      br->value_ -= (bit_t)(split + 1) << pos;
+      bit = 1;
+    } else {
+      range = split;
+      bit = 0;
+    }
+    if (range <= (range_t)0x7e) {
+      const int shift = kVP8Log2Range[range];
+      range = kVP8NewRange[range];
+      br->bits_ -= shift;
+    }
+    br->range_ = range;
+    return bit;
+  }
+}
+
 #ifdef __cplusplus
 }    // extern "C"
 #endif
diff --git a/src/3rdparty/libwebp/src/utils/bit_reader.c b/src/3rdparty/libwebp/src/utils/bit_reader_utils.c
index 45198e1..c3157e8 100644
--- a/src/3rdparty/libwebp/src/utils/bit_reader.c
+++ b/src/3rdparty/libwebp/src/utils/bit_reader_utils.c
@@ -15,7 +15,8 @@
 #include "../webp/config.h"
 #endif
 
-#include "./bit_reader_inl.h"
+#include "./bit_reader_inl_utils.h"
+#include "../utils/utils.h"
 
 //------------------------------------------------------------------------------
 // VP8BitReader
@@ -119,11 +120,10 @@ int32_t VP8GetSignedValue(VP8BitReader* const br, int bits) {
 
 #define VP8L_LOG8_WBITS 4  // Number of bytes needed to store VP8L_WBITS bits.
 
-#if !defined(WEBP_FORCE_ALIGNED) && \
-    (defined(__arm__) || defined(_M_ARM) || defined(__aarch64__) || \
-     defined(__i386__) || defined(_M_IX86) || \
-     defined(__x86_64__) || defined(_M_X64))
-#define VP8L_USE_UNALIGNED_LOAD
+#if defined(__arm__) || defined(_M_ARM) || defined(__aarch64__) || \
+    defined(__i386__) || defined(_M_IX86) || \
+    defined(__x86_64__) || defined(_M_X64)
+#define VP8L_USE_FAST_LOAD
 #endif
 
 static const uint32_t kBitMask[VP8L_MAX_NUM_BIT_READ + 1] = {
@@ -191,15 +191,11 @@ static void ShiftBytes(VP8LBitReader* const br) {
 
 void VP8LDoFillBitWindow(VP8LBitReader* const br) {
   assert(br->bit_pos_ >= VP8L_WBITS);
-  // TODO(jzern): given the fixed read size it may be possible to force
-  //              alignment in this block.
-#if defined(VP8L_USE_UNALIGNED_LOAD)
+#if defined(VP8L_USE_FAST_LOAD)
   if (br->pos_ + sizeof(br->val_) < br->len_) {
     br->val_ >>= VP8L_WBITS;
     br->bit_pos_ -= VP8L_WBITS;
-    // The expression below needs a little-endian arch to work correctly.
-    // This gives a large speedup for decoding speed.
-    br->val_ |= (vp8l_val_t)WebPMemToUint32(br->buf_ + br->pos_) <<
+    br->val_ |= (vp8l_val_t)HToLE32(WebPMemToUint32(br->buf_ + br->pos_)) <<
                 (VP8L_LBITS - VP8L_WBITS);
     br->pos_ += VP8L_LOG8_WBITS;
     return;
diff --git a/src/3rdparty/libwebp/src/utils/bit_reader.h b/src/3rdparty/libwebp/src/utils/bit_reader_utils.h
index ec3426c..ec3426c 100644
--- a/src/3rdparty/libwebp/src/utils/bit_reader.h
+++ b/src/3rdparty/libwebp/src/utils/bit_reader_utils.h
diff --git a/src/3rdparty/libwebp/src/utils/bit_writer.c b/src/3rdparty/libwebp/src/utils/bit_writer_utils.c
index 0644286..ab0c49d 100644
--- a/src/3rdparty/libwebp/src/utils/bit_writer.c
+++ b/src/3rdparty/libwebp/src/utils/bit_writer_utils.c
@@ -16,8 +16,8 @@
 #include <string.h>   // for memcpy()
 #include <stdlib.h>
 
-#include "./bit_writer.h"
-#include "./endian_inl.h"
+#include "./bit_writer_utils.h"
+#include "./endian_inl_utils.h"
 #include "./utils.h"
 
 //------------------------------------------------------------------------------
@@ -143,13 +143,13 @@ int VP8PutBitUniform(VP8BitWriter* const bw, int bit) {
 void VP8PutBits(VP8BitWriter* const bw, uint32_t value, int nb_bits) {
   uint32_t mask;
   assert(nb_bits > 0 && nb_bits < 32);
-  for (mask = 1u << (nb_bits - 1); mask; mask >>= 1)
+  for (mask = 1u << (nb_bits - 1); mask; mask >>= 1) {
     VP8PutBitUniform(bw, value & mask);
+  }
 }
 
 void VP8PutSignedBits(VP8BitWriter* const bw, int value, int nb_bits) {
-  if (!VP8PutBitUniform(bw, value != 0))
-    return;
+  if (!VP8PutBitUniform(bw, value != 0)) return;
   if (value < 0) {
     VP8PutBits(bw, ((-value) << 1) | 1, nb_bits + 1);
   } else {
diff --git a/src/3rdparty/libwebp/src/utils/bit_writer.h b/src/3rdparty/libwebp/src/utils/bit_writer_utils.h
index ef360d1..9c02bbc 100644
--- a/src/3rdparty/libwebp/src/utils/bit_writer.h
+++ b/src/3rdparty/libwebp/src/utils/bit_writer_utils.h
@@ -54,7 +54,8 @@ int VP8BitWriterAppend(VP8BitWriter* const bw,
 
 // return approximate write position (in bits)
 static WEBP_INLINE uint64_t VP8BitWriterPos(const VP8BitWriter* const bw) {
-  return (uint64_t)(bw->pos_ + bw->run_) * 8 + 8 + bw->nb_bits_;
+  const uint64_t nb_bits = 8 + bw->nb_bits_;   // bw->nb_bits_ is <= 0, note
+  return (bw->pos_ + bw->run_) * 8 + nb_bits;
 }
 
 // Returns a pointer to the internal buffer.
diff --git a/src/3rdparty/libwebp/src/utils/color_cache.c b/src/3rdparty/libwebp/src/utils/color_cache_utils.c
index f9ff4b5..0172590 100644
--- a/src/3rdparty/libwebp/src/utils/color_cache.c
+++ b/src/3rdparty/libwebp/src/utils/color_cache_utils.c
@@ -14,8 +14,8 @@
 #include <assert.h>
 #include <stdlib.h>
 #include <string.h>
-#include "./color_cache.h"
-#include "../utils/utils.h"
+#include "./color_cache_utils.h"
+#include "./utils.h"
 
 //------------------------------------------------------------------------------
 // VP8LColorCache.
diff --git a/src/3rdparty/libwebp/src/utils/color_cache.h b/src/3rdparty/libwebp/src/utils/color_cache_utils.h
index a9a9f64..c373e6b 100644
--- a/src/3rdparty/libwebp/src/utils/color_cache.h
+++ b/src/3rdparty/libwebp/src/utils/color_cache_utils.h
@@ -28,7 +28,11 @@ typedef struct {
   int hash_bits_;
 } VP8LColorCache;
 
-static const uint32_t kHashMul = 0x1e35a7bd;
+static const uint64_t kHashMul = 0x1e35a7bdull;
+
+static WEBP_INLINE int HashPix(uint32_t argb, int shift) {
+  return (int)(((argb * kHashMul) & 0xffffffffu) >> shift);
+}
 
 static WEBP_INLINE uint32_t VP8LColorCacheLookup(
     const VP8LColorCache* const cc, uint32_t key) {
@@ -44,19 +48,20 @@ static WEBP_INLINE void VP8LColorCacheSet(const VP8LColorCache* const cc,
 
 static WEBP_INLINE void VP8LColorCacheInsert(const VP8LColorCache* const cc,
                                              uint32_t argb) {
-  const uint32_t key = (kHashMul * argb) >> cc->hash_shift_;
+  const int key = HashPix(argb, cc->hash_shift_);
   cc->colors_[key] = argb;
 }
 
 static WEBP_INLINE int VP8LColorCacheGetIndex(const VP8LColorCache* const cc,
                                               uint32_t argb) {
-  return (kHashMul * argb) >> cc->hash_shift_;
+  return HashPix(argb, cc->hash_shift_);
 }
 
+// Return the key if cc contains argb, and -1 otherwise.
 static WEBP_INLINE int VP8LColorCacheContains(const VP8LColorCache* const cc,
                                               uint32_t argb) {
-  const uint32_t key = (kHashMul * argb) >> cc->hash_shift_;
-  return (cc->colors_[key] == argb);
+  const int key = HashPix(argb, cc->hash_shift_);
+  return (cc->colors_[key] == argb) ? key : -1;
 }
 
 //------------------------------------------------------------------------------
diff --git a/src/3rdparty/libwebp/src/utils/endian_inl.h b/src/3rdparty/libwebp/src/utils/endian_inl_utils.h
index e11260f..e11260f 100644
--- a/src/3rdparty/libwebp/src/utils/endian_inl.h
+++ b/src/3rdparty/libwebp/src/utils/endian_inl_utils.h
diff --git a/src/3rdparty/libwebp/src/utils/filters.c b/src/3rdparty/libwebp/src/utils/filters_utils.c
index 15543b1..49c1d18 100644
--- a/src/3rdparty/libwebp/src/utils/filters.c
+++ b/src/3rdparty/libwebp/src/utils/filters_utils.c
@@ -11,7 +11,7 @@
 //
 // Author: Urvang (urvang@google.com)
 
-#include "./filters.h"
+#include "./filters_utils.h"
 #include <stdlib.h>
 #include <string.h>
 
diff --git a/src/3rdparty/libwebp/src/utils/filters.h b/src/3rdparty/libwebp/src/utils/filters_utils.h
index 088b132..088b132 100644
--- a/src/3rdparty/libwebp/src/utils/filters.h
+++ b/src/3rdparty/libwebp/src/utils/filters_utils.h
diff --git a/src/3rdparty/libwebp/src/utils/huffman_encode.c b/src/3rdparty/libwebp/src/utils/huffman_encode_utils.c
index 6421c2b..f950465 100644
--- a/src/3rdparty/libwebp/src/utils/huffman_encode.c
+++ b/src/3rdparty/libwebp/src/utils/huffman_encode_utils.c
@@ -14,8 +14,8 @@
 #include <assert.h>
 #include <stdlib.h>
 #include <string.h>
-#include "./huffman_encode.h"
-#include "../utils/utils.h"
+#include "./huffman_encode_utils.h"
+#include "./utils.h"
 #include "../webp/format_constants.h"
 
 // -----------------------------------------------------------------------------
diff --git a/src/3rdparty/libwebp/src/utils/huffman_encode.h b/src/3rdparty/libwebp/src/utils/huffman_encode_utils.h
index a157165..a157165 100644
--- a/src/3rdparty/libwebp/src/utils/huffman_encode.h
+++ b/src/3rdparty/libwebp/src/utils/huffman_encode_utils.h
diff --git a/src/3rdparty/libwebp/src/utils/huffman.c b/src/3rdparty/libwebp/src/utils/huffman_utils.c
index d57376a..008b5d7 100644
--- a/src/3rdparty/libwebp/src/utils/huffman.c
+++ b/src/3rdparty/libwebp/src/utils/huffman_utils.c
@@ -14,8 +14,8 @@
 #include <assert.h>
 #include <stdlib.h>
 #include <string.h>
-#include "./huffman.h"
-#include "../utils/utils.h"
+#include "./huffman_utils.h"
+#include "./utils.h"
 #include "../webp/format_constants.h"
 
 // Huffman data read via DecodeImageStream is represented in two (red and green)
@@ -45,7 +45,7 @@ static WEBP_INLINE uint32_t GetNextKey(uint32_t key, int len) {
   while (key & step) {
     step >>= 1;
   }
-  return (key & (step - 1)) + step;
+  return step ? (key & (step - 1)) + step : key;
 }
 
 // Stores code in table[0], table[step], table[2*step], ..., table[end].
@@ -75,11 +75,13 @@ static WEBP_INLINE int NextTableBitSize(const int* const count,
   return len - root_bits;
 }
 
-int VP8LBuildHuffmanTable(HuffmanCode* const root_table, int root_bits,
-                          const int code_lengths[], int code_lengths_size) {
+// sorted[code_lengths_size] is a pre-allocated array for sorting symbols
+// by code length.
+static int BuildHuffmanTable(HuffmanCode* const root_table, int root_bits,
+                             const int code_lengths[], int code_lengths_size,
+                             uint16_t sorted[]) {
   HuffmanCode* table = root_table;  // next available space in table
   int total_size = 1 << root_bits;  // total size root table + 2nd level table
-  int* sorted = NULL;               // symbols sorted by code length
   int len;                          // current code length
   int symbol;                       // symbol index in original or sorted table
   // number of codes of each length:
@@ -114,11 +116,6 @@ int VP8LBuildHuffmanTable(HuffmanCode* const root_table, int root_bits,
     offset[len + 1] = offset[len] + count[len];
   }
 
-  sorted = (int*)WebPSafeMalloc(code_lengths_size, sizeof(*sorted));
-  if (sorted == NULL) {
-    return 0;
-  }
-
   // Sort symbols by length, by symbol order within each length.
   for (symbol = 0; symbol < code_lengths_size; ++symbol) {
     const int symbol_code_length = code_lengths[symbol];
@@ -133,7 +130,6 @@ int VP8LBuildHuffmanTable(HuffmanCode* const root_table, int root_bits,
     code.bits = 0;
     code.value = (uint16_t)sorted[0];
     ReplicateValue(table, 1, total_size, code);
-    WebPSafeFree(sorted);
     return total_size;
   }
 
@@ -153,7 +149,6 @@ int VP8LBuildHuffmanTable(HuffmanCode* const root_table, int root_bits,
       num_nodes += num_open;
       num_open -= count[len];
       if (num_open < 0) {
-        WebPSafeFree(sorted);
         return 0;
       }
       for (; count[len] > 0; --count[len]) {
@@ -172,7 +167,6 @@ int VP8LBuildHuffmanTable(HuffmanCode* const root_table, int root_bits,
       num_nodes += num_open;
       num_open -= count[len];
       if (num_open < 0) {
-        WebPSafeFree(sorted);
         return 0;
       }
       for (; count[len] > 0; --count[len]) {
@@ -195,11 +189,35 @@ int VP8LBuildHuffmanTable(HuffmanCode* const root_table, int root_bits,
 
     // Check if tree is full.
     if (num_nodes != 2 * offset[MAX_ALLOWED_CODE_LENGTH] - 1) {
-      WebPSafeFree(sorted);
       return 0;
     }
   }
 
-  WebPSafeFree(sorted);
+  return total_size;
+}
+
+// Maximum code_lengths_size is 2328 (reached for 11-bit color_cache_bits).
+// More commonly, the value is around ~280.
+#define MAX_CODE_LENGTHS_SIZE \
+  ((1 << MAX_CACHE_BITS) + NUM_LITERAL_CODES + NUM_LENGTH_CODES)
+// Cut-off value for switching between heap and stack allocation.
+#define SORTED_SIZE_CUTOFF 512
+int VP8LBuildHuffmanTable(HuffmanCode* const root_table, int root_bits,
+                          const int code_lengths[], int code_lengths_size) {
+  int total_size;
+  assert(code_lengths_size <= MAX_CODE_LENGTHS_SIZE);
+  if (code_lengths_size <= SORTED_SIZE_CUTOFF) {
+    // use local stack-allocated array.
+    uint16_t sorted[SORTED_SIZE_CUTOFF];
+    total_size = BuildHuffmanTable(root_table, root_bits,
+                                   code_lengths, code_lengths_size, sorted);
+  } else {   // rare case. Use heap allocation.
+    uint16_t* const sorted =
+        (uint16_t*)WebPSafeMalloc(code_lengths_size, sizeof(*sorted));
+    if (sorted == NULL) return 0;
+    total_size = BuildHuffmanTable(root_table, root_bits,
+                                   code_lengths, code_lengths_size, sorted);
+    WebPSafeFree(sorted);
+  }
   return total_size;
 }
diff --git a/src/3rdparty/libwebp/src/utils/huffman.h b/src/3rdparty/libwebp/src/utils/huffman_utils.h
index c6dd6aa..c6dd6aa 100644
--- a/src/3rdparty/libwebp/src/utils/huffman.h
+++ b/src/3rdparty/libwebp/src/utils/huffman_utils.h
diff --git a/src/3rdparty/libwebp/src/utils/quant_levels_dec.c b/src/3rdparty/libwebp/src/utils/quant_levels_dec_utils.c
index 5b8b8b4..d4d23d3 100644
--- a/src/3rdparty/libwebp/src/utils/quant_levels_dec.c
+++ b/src/3rdparty/libwebp/src/utils/quant_levels_dec_utils.c
@@ -14,7 +14,7 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
-#include "./quant_levels_dec.h"
+#include "./quant_levels_dec_utils.h"
 
 #include <string.h>   // for memset
 
@@ -44,6 +44,7 @@ static const uint8_t kOrderedDither[DSIZE][DSIZE] = {
 
 typedef struct {
   int width_, height_;  // dimension
+  int stride_;          // stride in bytes
   int row_;             // current input row being processed
   uint8_t* src_;        // input pointer
   uint8_t* dst_;        // output pointer
@@ -99,7 +100,7 @@ static void VFilter(SmoothParams* const p) {
   // We replicate edges, as it's somewhat easier as a boundary condition.
   // That's why we don't update the 'src' pointer on top/bottom area:
   if (p->row_ >= 0 && p->row_ < p->height_ - 1) {
-    p->src_ += p->width_;
+    p->src_ += p->stride_;
   }
 }
 
@@ -149,7 +150,7 @@ static void ApplyFilter(SmoothParams* const p) {
 #endif
     }
   }
-  p->dst_ += w;  // advance output pointer
+  p->dst_ += p->stride_;  // advance output pointer
 }
 
 //------------------------------------------------------------------------------
@@ -178,17 +179,20 @@ static void InitCorrectionLUT(int16_t* const lut, int min_dist) {
   lut[0] = 0;
 }
 
-static void CountLevels(const uint8_t* const data, int size,
-                        SmoothParams* const p) {
-  int i, last_level;
+static void CountLevels(SmoothParams* const p) {
+  int i, j, last_level;
   uint8_t used_levels[256] = { 0 };
+  const uint8_t* data = p->src_;
   p->min_ = 255;
   p->max_ = 0;
-  for (i = 0; i < size; ++i) {
-    const int v = data[i];
-    if (v < p->min_) p->min_ = v;
-    if (v > p->max_) p->max_ = v;
-    used_levels[v] = 1;
+  for (j = 0; j < p->height_; ++j) {
+    for (i = 0; i < p->width_; ++i) {
+      const int v = data[i];
+      if (v < p->min_) p->min_ = v;
+      if (v > p->max_) p->max_ = v;
+      used_levels[v] = 1;
+    }
+    data += p->stride_;
   }
   // Compute the mininum distance between two non-zero levels.
   p->min_level_dist_ = p->max_ - p->min_;
@@ -208,7 +212,7 @@ static void CountLevels(const uint8_t* const data, int size,
 }
 
 // Initialize all params.
-static int InitParams(uint8_t* const data, int width, int height,
+static int InitParams(uint8_t* const data, int width, int height, int stride,
                       int radius, SmoothParams* const p) {
   const int R = 2 * radius + 1;  // total size of the kernel
 
@@ -233,6 +237,7 @@ static int InitParams(uint8_t* const data, int width, int height,
 
   p->width_ = width;
   p->height_ = height;
+  p->stride_ = stride;
   p->src_ = data;
   p->dst_ = data;
   p->radius_ = radius;
@@ -240,7 +245,7 @@ static int InitParams(uint8_t* const data, int width, int height,
   p->row_ = -radius;
 
   // analyze the input distribution so we can best-fit the threshold
-  CountLevels(data, width * height, p);
+  CountLevels(p);
 
   // correction table
   p->correction_ = ((int16_t*)mem) + LUT_SIZE;
@@ -253,7 +258,7 @@ static void CleanupParams(SmoothParams* const p) {
   WebPSafeFree(p->mem_);
 }
 
-int WebPDequantizeLevels(uint8_t* const data, int width, int height,
+int WebPDequantizeLevels(uint8_t* const data, int width, int height, int stride,
                          int strength) {
   const int radius = 4 * strength / 100;
   if (strength < 0 || strength > 100) return 0;
@@ -261,7 +266,7 @@ int WebPDequantizeLevels(uint8_t* const data, int width, int height,
   if (radius > 0) {
     SmoothParams p;
     memset(&p, 0, sizeof(p));
-    if (!InitParams(data, width, height, radius, &p)) return 0;
+    if (!InitParams(data, width, height, stride, radius, &p)) return 0;
     if (p.num_levels_ > 2) {
       for (; p.row_ < p.height_; ++p.row_) {
         VFilter(&p);  // accumulate average of input
diff --git a/src/3rdparty/libwebp/src/utils/quant_levels_dec.h b/src/3rdparty/libwebp/src/utils/quant_levels_dec_utils.h
index 9aab068..59a1349 100644
--- a/src/3rdparty/libwebp/src/utils/quant_levels_dec.h
+++ b/src/3rdparty/libwebp/src/utils/quant_levels_dec_utils.h
@@ -21,11 +21,11 @@ extern "C" {
 #endif
 
 // Apply post-processing to input 'data' of size 'width'x'height' assuming that
-// the source was quantized to a reduced number of levels.
+// the source was quantized to a reduced number of levels. 'stride' is in bytes.
 // Strength is in [0..100] and controls the amount of dithering applied.
 // Returns false in case of error (data is NULL, invalid parameters,
 // malloc failure, ...).
-int WebPDequantizeLevels(uint8_t* const data, int width, int height,
+int WebPDequantizeLevels(uint8_t* const data, int width, int height, int stride,
                          int strength);
 
 #ifdef __cplusplus
diff --git a/src/3rdparty/libwebp/src/utils/quant_levels.c b/src/3rdparty/libwebp/src/utils/quant_levels_utils.c
index d7c8aab..73174e8 100644
--- a/src/3rdparty/libwebp/src/utils/quant_levels.c
+++ b/src/3rdparty/libwebp/src/utils/quant_levels_utils.c
@@ -14,7 +14,7 @@
 
 #include <assert.h>
 
-#include "./quant_levels.h"
+#include "./quant_levels_utils.h"
 
 #define NUM_SYMBOLS     256
 
diff --git a/src/3rdparty/libwebp/src/utils/quant_levels.h b/src/3rdparty/libwebp/src/utils/quant_levels_utils.h
index 1cb5a32..1cb5a32 100644
--- a/src/3rdparty/libwebp/src/utils/quant_levels.h
+++ b/src/3rdparty/libwebp/src/utils/quant_levels_utils.h
diff --git a/src/3rdparty/libwebp/src/utils/random.c b/src/3rdparty/libwebp/src/utils/random_utils.c
index 24e96ad..9f1e415 100644
--- a/src/3rdparty/libwebp/src/utils/random.c
+++ b/src/3rdparty/libwebp/src/utils/random_utils.c
@@ -12,7 +12,7 @@
 // Author: Skal (pascal.massimino@gmail.com)
 
 #include <string.h>
-#include "./random.h"
+#include "./random_utils.h"
 
 //------------------------------------------------------------------------------
 
diff --git a/src/3rdparty/libwebp/src/utils/random.h b/src/3rdparty/libwebp/src/utils/random_utils.h
index c392a61..c392a61 100644
--- a/src/3rdparty/libwebp/src/utils/random.h
+++ b/src/3rdparty/libwebp/src/utils/random_utils.h
diff --git a/src/3rdparty/libwebp/src/utils/rescaler.c b/src/3rdparty/libwebp/src/utils/rescaler_utils.c
index 00c9300..0d1f80d 100644
--- a/src/3rdparty/libwebp/src/utils/rescaler.c
+++ b/src/3rdparty/libwebp/src/utils/rescaler_utils.c
@@ -15,7 +15,7 @@
 #include <stdlib.h>
 #include <string.h>
 #include "../dsp/dsp.h"
-#include "./rescaler.h"
+#include "./rescaler_utils.h"
 
 //------------------------------------------------------------------------------
 
@@ -48,11 +48,15 @@ void WebPRescalerInit(WebPRescaler* const wrk, int src_width, int src_height,
   wrk->y_sub = wrk->y_expand ? y_sub - 1 : y_sub;
   wrk->y_accum = wrk->y_expand ? wrk->y_sub : wrk->y_add;
   if (!wrk->y_expand) {
-    // this is WEBP_RESCALER_FRAC(dst_height, x_add * y_add) without the cast.
+    // This is WEBP_RESCALER_FRAC(dst_height, x_add * y_add) without the cast.
+    // Its value is <= WEBP_RESCALER_ONE, because dst_height <= wrk->y_add, and
+    // wrk->x_add >= 1;
     const uint64_t ratio =
         (uint64_t)dst_height * WEBP_RESCALER_ONE / (wrk->x_add * wrk->y_add);
     if (ratio != (uint32_t)ratio) {
-      // We can't represent the ratio with the current fixed-point precision.
+      // When ratio == WEBP_RESCALER_ONE, we can't represent the ratio with the
+      // current fixed-point precision. This happens when src_height ==
+      // wrk->y_add (which == src_height), and wrk->x_add == 1.
       // => We special-case fxy_scale = 0, in WebPRescalerExportRow().
       wrk->fxy_scale = 0;
     } else {
diff --git a/src/3rdparty/libwebp/src/utils/rescaler.h b/src/3rdparty/libwebp/src/utils/rescaler_utils.h
index 98b01a7..98b01a7 100644
--- a/src/3rdparty/libwebp/src/utils/rescaler.h
+++ b/src/3rdparty/libwebp/src/utils/rescaler_utils.h
diff --git a/src/3rdparty/libwebp/src/utils/thread.c b/src/3rdparty/libwebp/src/utils/thread_utils.c
index 93f7622..1729060 100644
--- a/src/3rdparty/libwebp/src/utils/thread.c
+++ b/src/3rdparty/libwebp/src/utils/thread_utils.c
@@ -13,7 +13,7 @@
 
 #include <assert.h>
 #include <string.h>   // for memset()
-#include "./thread.h"
+#include "./thread_utils.h"
 #include "./utils.h"
 
 #ifdef WEBP_USE_THREAD
@@ -183,8 +183,7 @@ static int pthread_cond_wait(pthread_cond_t* const condition,
 #else
   // note that there is a consumer available so the signal isn't dropped in
   // pthread_cond_signal
-  if (!ReleaseSemaphore(condition->waiting_sem_, 1, NULL))
-    return 1;
+  if (!ReleaseSemaphore(condition->waiting_sem_, 1, NULL)) return 1;
   // now unlock the mutex so pthread_cond_signal may be issued
   pthread_mutex_unlock(mutex);
   ok = (WaitForSingleObject(condition->signal_event_, INFINITE) ==
@@ -226,8 +225,7 @@ static THREADFN ThreadLoop(void* ptr) {
 }
 
 // main thread state control
-static void ChangeState(WebPWorker* const worker,
-                        WebPWorkerStatus new_status) {
+static void ChangeState(WebPWorker* const worker, WebPWorkerStatus new_status) {
   // No-op when attempting to change state on a thread that didn't come up.
   // Checking status_ without acquiring the lock first would result in a data
   // race.
diff --git a/src/3rdparty/libwebp/src/utils/thread.h b/src/3rdparty/libwebp/src/utils/thread_utils.h
index 8408311..8408311 100644
--- a/src/3rdparty/libwebp/src/utils/thread.h
+++ b/src/3rdparty/libwebp/src/utils/thread_utils.h
diff --git a/src/3rdparty/libwebp/src/utils/utils.c b/src/3rdparty/libwebp/src/utils/utils.c
index d8e3093..504d924 100644
--- a/src/3rdparty/libwebp/src/utils/utils.c
+++ b/src/3rdparty/libwebp/src/utils/utils.c
@@ -15,6 +15,7 @@
 #include <string.h>  // for memcpy()
 #include "../webp/decode.h"
 #include "../webp/encode.h"
+#include "../webp/format_constants.h"  // for MAX_PALETTE_SIZE
 #include "./utils.h"
 
 // If PRINT_MEM_INFO is defined, extra info (like total memory used, number of
@@ -24,7 +25,7 @@
 //    http://valgrind.org/docs/manual/ms-manual.html
 // Here is an example command line:
 /*    valgrind --tool=massif --massif-out-file=massif.out \
-               --stacks=yes --alloc-fn=WebPSafeAlloc --alloc-fn=WebPSafeCalloc
+               --stacks=yes --alloc-fn=WebPSafeMalloc --alloc-fn=WebPSafeCalloc
       ms_print massif.out
 */
 // In addition:
@@ -174,8 +175,12 @@ static int CheckSizeArgumentsOverflow(uint64_t nmemb, size_t size) {
   }
 #endif
 #if defined(MALLOC_LIMIT)
-  if (mem_limit > 0 && total_mem + total_size >= mem_limit) {
-    return 0;   // fake fail!
+  if (mem_limit > 0) {
+    const uint64_t new_total_mem = (uint64_t)total_mem + total_size;
+    if (new_total_mem != (size_t)new_total_mem ||
+        new_total_mem > mem_limit) {
+      return 0;   // fake fail!
+    }
   }
 #endif
 
@@ -237,3 +242,89 @@ void WebPCopyPixels(const WebPPicture* const src, WebPPicture* const dst) {
 }
 
 //------------------------------------------------------------------------------
+
+#define COLOR_HASH_SIZE         (MAX_PALETTE_SIZE * 4)
+#define COLOR_HASH_RIGHT_SHIFT  22  // 32 - log2(COLOR_HASH_SIZE).
+
+int WebPGetColorPalette(const WebPPicture* const pic, uint32_t* const palette) {
+  int i;
+  int x, y;
+  int num_colors = 0;
+  uint8_t in_use[COLOR_HASH_SIZE] = { 0 };
+  uint32_t colors[COLOR_HASH_SIZE];
+  static const uint64_t kHashMul = 0x1e35a7bdull;
+  const uint32_t* argb = pic->argb;
+  const int width = pic->width;
+  const int height = pic->height;
+  uint32_t last_pix = ~argb[0];   // so we're sure that last_pix != argb[0]
+  assert(pic != NULL);
+  assert(pic->use_argb);
+
+  for (y = 0; y < height; ++y) {
+    for (x = 0; x < width; ++x) {
+      int key;
+      if (argb[x] == last_pix) {
+        continue;
+      }
+      last_pix = argb[x];
+      key = ((last_pix * kHashMul) & 0xffffffffu) >> COLOR_HASH_RIGHT_SHIFT;
+      while (1) {
+        if (!in_use[key]) {
+          colors[key] = last_pix;
+          in_use[key] = 1;
+          ++num_colors;
+          if (num_colors > MAX_PALETTE_SIZE) {
+            return MAX_PALETTE_SIZE + 1;  // Exact count not needed.
+          }
+          break;
+        } else if (colors[key] == last_pix) {
+          break;  // The color is already there.
+        } else {
+          // Some other color sits here, so do linear conflict resolution.
+          ++key;
+          key &= (COLOR_HASH_SIZE - 1);  // Key mask.
+        }
+      }
+    }
+    argb += pic->argb_stride;
+  }
+
+  if (palette != NULL) {  // Fill the colors into palette.
+    num_colors = 0;
+    for (i = 0; i < COLOR_HASH_SIZE; ++i) {
+      if (in_use[i]) {
+        palette[num_colors] = colors[i];
+        ++num_colors;
+      }
+    }
+  }
+  return num_colors;
+}
+
+#undef COLOR_HASH_SIZE
+#undef COLOR_HASH_RIGHT_SHIFT
+
+//------------------------------------------------------------------------------
+
+#if defined(WEBP_NEED_LOG_TABLE_8BIT)
+const uint8_t WebPLogTable8bit[256] = {   // 31 ^ clz(i)
+  0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
+  4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+  6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7
+};
+#endif
+
+//------------------------------------------------------------------------------
diff --git a/src/3rdparty/libwebp/src/utils/utils.h b/src/3rdparty/libwebp/src/utils/utils.h
index f506d66..3ab4590 100644
--- a/src/3rdparty/libwebp/src/utils/utils.h
+++ b/src/3rdparty/libwebp/src/utils/utils.h
@@ -20,7 +20,9 @@
 #endif
 
 #include <assert.h>
+#include <limits.h>
 
+#include "../dsp/dsp.h"
 #include "../webp/types.h"
 
 #ifdef __cplusplus
@@ -31,7 +33,14 @@ extern "C" {
 // Memory allocation
 
 // This is the maximum memory amount that libwebp will ever try to allocate.
-#define WEBP_MAX_ALLOCABLE_MEMORY (1ULL << 40)
+#ifndef WEBP_MAX_ALLOCABLE_MEMORY
+#if SIZE_MAX > (1ULL << 34)
+#define WEBP_MAX_ALLOCABLE_MEMORY (1ULL << 34)
+#else
+// For 32-bit targets keep this below INT_MAX to avoid valgrind warnings.
+#define WEBP_MAX_ALLOCABLE_MEMORY ((1ULL << 31) - (1 << 16))
+#endif
+#endif  // WEBP_MAX_ALLOCABLE_MEMORY
 
 // size-checking safe malloc/calloc: verify that the requested size is not too
 // large, or return NULL. You don't need to call these for constructs like
@@ -51,9 +60,8 @@ WEBP_EXTERN(void) WebPSafeFree(void* const ptr);
 // Alignment
 
 #define WEBP_ALIGN_CST 31
-#define WEBP_ALIGN(PTR) ((uintptr_t)((PTR) + WEBP_ALIGN_CST) & ~WEBP_ALIGN_CST)
+#define WEBP_ALIGN(PTR) (((uintptr_t)(PTR) + WEBP_ALIGN_CST) & ~WEBP_ALIGN_CST)
 
-#if defined(WEBP_FORCE_ALIGNED)
 #include <string.h>
 // memcpy() is the safe way of moving potentially unaligned 32b memory.
 static WEBP_INLINE uint32_t WebPMemToUint32(const uint8_t* const ptr) {
@@ -64,14 +72,6 @@ static WEBP_INLINE uint32_t WebPMemToUint32(const uint8_t* const ptr) {
 static WEBP_INLINE void WebPUint32ToMem(uint8_t* const ptr, uint32_t val) {
   memcpy(ptr, &val, sizeof(val));
 }
-#else
-static WEBP_INLINE uint32_t WebPMemToUint32(const uint8_t* const ptr) {
-  return *(const uint32_t*)ptr;
-}
-static WEBP_INLINE void WebPUint32ToMem(uint8_t* const ptr, uint32_t val) {
-  *(uint32_t*)ptr = val;
-}
-#endif
 
 //------------------------------------------------------------------------------
 // Reading/writing data.
@@ -107,6 +107,19 @@ static WEBP_INLINE void PutLE32(uint8_t* const data, uint32_t val) {
   PutLE16(data + 2, (int)(val >> 16));
 }
 
+// Returns 31 ^ clz(n) = log2(n). This is the default C-implementation, either
+// based on table or not. Can be used as fallback if clz() is not available.
+#define WEBP_NEED_LOG_TABLE_8BIT
+extern const uint8_t WebPLogTable8bit[256];
+static WEBP_INLINE int WebPLog2FloorC(uint32_t n) {
+  int log = 0;
+  while (n >= 256) {
+    log += 8;
+    n >>= 8;
+  }
+  return log + WebPLogTable8bit[n];
+}
+
 // Returns (int)floor(log2(n)). n must be > 0.
 // use GNU builtins where available.
 #if defined(__GNUC__) && \
@@ -124,22 +137,8 @@ static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
   _BitScanReverse(&first_set_bit, n);
   return first_set_bit;
 }
-#else
-static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
-  int log = 0;
-  uint32_t value = n;
-  int i;
-
-  for (i = 4; i >= 0; --i) {
-    const int shift = (1 << i);
-    const uint32_t x = value >> shift;
-    if (x != 0) {
-      value = x;
-      log += shift;
-    }
-  }
-  return log;
-}
+#else   // default: use the C-version.
+static WEBP_INLINE int BitsLog2Floor(uint32_t n) { return WebPLog2FloorC(n); }
 #endif
 
 //------------------------------------------------------------------------------
@@ -158,6 +157,19 @@ WEBP_EXTERN(void) WebPCopyPixels(const struct WebPPicture* const src,
                                  struct WebPPicture* const dst);
 
 //------------------------------------------------------------------------------
+// Unique colors.
+
+// Returns count of unique colors in 'pic', assuming pic->use_argb is true.
+// If the unique color count is more than MAX_PALETTE_SIZE, returns
+// MAX_PALETTE_SIZE+1.
+// If 'palette' is not NULL and number of unique colors is less than or equal to
+// MAX_PALETTE_SIZE, also outputs the actual unique colors into 'palette'.
+// Note: 'palette' is assumed to be an array already allocated with at least
+// MAX_PALETTE_SIZE elements.
+WEBP_EXTERN(int) WebPGetColorPalette(const struct WebPPicture* const pic,
+                                     uint32_t* const palette);
+
+//------------------------------------------------------------------------------
 
 #ifdef __cplusplus
 }    // extern "C"